# Licensed AGPL-3.0
#
# $Log$
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use URI;            # URI->new is used in store_url
use Getopt::Long;   # provides GetOptions

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $depth = 0;

# per-run status counters
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;

# 'normal' or 'restricted', set from the profile's allowed_tlds value
my $mode;
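# The crawler reports 'b' when robots.txt blocks a URL; the usual
# WWW::RobotRules pattern for that check is sketched below (fetching
# robots.txt with LWP::Simple::get and the URLs shown are illustrative,
# not a quote of the elided crawl code):
#
#   my $robots_url = "https://example.org/robots.txt";
#   $rules->parse($robots_url, get($robots_url) // "");
#   if (!$rules->allowed($url)) {
#       $robots_txt_denies++;   # reported as 'b' in the status legend
#   }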
|
|
sub store_url {
    my ($url, $parent) = @_;

    # skip URLs containing page fragments
    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;

        # split the hostname on dots; the last element is its TLD
        # (e.g. "org" for www.example.org)
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        # in restricted mode, only URLs from the configured TLDs are stored
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);
$| = 1;    # unbuffered output so status characters appear immediately

print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
|
|
GetOptions("dbhost=s" => \$dbhost, |
my $profile; |
"dbname=s" => \$dbname, |
|
"dbusername=s" => \$dbusername, |
GetOptions("profile=s" => \$profile, |
"dbpw=s" => \$dbpw, |
"seed=s" => \$seed, |
"seed=s" => \$seed, |
"maxdepth=n" =>\$maxdepth) |
"maxdepth=n" =>\$maxdepth) |
|
or die("error in command line arguments"); |
or die("error in command line arguments"); |
|
|
|
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

$dbhost = $cfg->val($profile, 'dbhost');
$dbname = $cfg->val($profile, 'dbname');
$dbusername = $cfg->val($profile, 'dbuser');
$dbpw = $cfg->val($profile, 'dbpass');
$tmp = $cfg->val($profile, 'allowed_tlds');
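# /etc/pandia.ini is expected to contain one section per profile with the
# keys read above; a rough sketch (section name and values are illustrative,
# not from a real deployment):
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=changeme
#   allowed_tlds=*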
|
|
|
if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);

    print "pandia: crawler restricted to these TLDs: ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia: crawler unrestricted\n";
    $mode = 'normal';
}
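# For example, allowed_tlds=edu,gov in the profile puts the crawler in
# restricted mode and stores only .edu and .gov URLs, while allowed_tlds=*
# leaves it unrestricted.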
|
|
print "pandia: connecting to $dbname database at $dbhost..."; |
print "pandia: connecting to $dbname database at $dbhost..."; |
|
|
$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;"; |
$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;"; |
# open the database handle used by store_url
$dbh = DBI->connect($dsn, $dbusername, $dbpw)
    or die "pandia: failed to connect to MySQL";

print "[OK]\n";

if($seed ne "") { |
print "pandia: each character represents the following status for a URL:\n"; |
print "pandia: crawling seed $seed to a maximum depth of $maxdepth"; |
print " . URL added to indexer queue\n"; |
|
print " l crawl exceeded max depth\n"; |
|
print " x URL too long or invalid scheme\n"; |
|
print " d URL was a duplicate\n"; |
|
print " b crawl was blocked by robots.txt\n"; |
|
print " F URL contained a fragment\n"; |
|
print " T URL was from a disallowed top-level domain\n\n"; |
|
|
|
if($seed ne "") { |
|
print "pandia: crawling seed $seed to a maximum depth of $maxdepth..."; |
sleep 1; |
sleep 1; |
crawl_url($seed); |
crawl_url($seed); |
print "[OK]\n"; |
print "[OK]\n"; |