# Licensed AGPL-3.0
#
# $Log$
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
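
# Illustrative invocation (the script filename and all values below are
# placeholders, not taken from the source; the option names come from the
# GetOptions() call further down):
#
#   perl crawler.pl --profile=default --seed=https://example.com/ --maxdepth=3
#
# With no --seed, the crawler instead drains the URLs already stored in the
# crawl_queue table.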
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use Getopt::Long;     # provides GetOptions(), used below
use HTTP::Tiny;       # assumed HTTP client: crawl_url() reads $response->{content}, which matches HTTP::Tiny's interface

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

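# Database objects referenced by this script (column lists inferred from the
# SQL statements below; any constraints described here are assumptions):
#
#   crawl_queue(url, parent_url, url_domain, scheme)
#       - presumably carries a unique key on url, since a failed INSERT is
#         treated as a duplicate and reported with the 'd' status character
#   url_domains(url_domain)  - one row per domain seen by the crawler
#   blacklist(url_domain)    - domains that must never be crawled
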
sub store_url {
    my ($url, $parent) = @_;

    # skip URLs containing page fragments
    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);    # split the host on dots; the last element is the TLD
        my $tld = $parts[-1];

        # in restricted mode, only URLs from the allowed TLD list are enqueued
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }

            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            # insert failed, most likely a duplicate URL
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;

            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

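# crawl_url() fetches a single page and feeds every link it finds back into
# store_url(). A URL is abandoned early if its scheme is not http/https, if its
# domain appears in the blacklist table, or if the site's robots.txt disallows it.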
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    # skip domains that appear in the blacklist
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    # fetch and honor this host's robots.txt before crawling
    my $robots_url = $u->scheme . "://" . $u->host . "/robots.txt";
    my $robots_txt = get($robots_url);
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # fetch the page (HTTP::Tiny client assumed) and parse it into an HTML tree
    my $http = HTTP::Tiny->new(agent => 'pandia-crawler/0.0.1');
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    foreach my $link (@links) {
        my $href = $link->attr('href');
        next if not defined $href;

        # resolve relative links against the current page URL
        my $final = "";
        if(substr($href, 0, 4) ne "http") {
            $final = URI->new_abs($href, $url)->as_string;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

$| = 1;

print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

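# The profile named by --profile is looked up as a section of /etc/pandia.ini.
# An illustrative profile is sketched below; the key names come from the
# $cfg->val() calls that follow, but every value shown is a placeholder:
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*
#
# Setting allowed_tlds to a comma-separated list (e.g. "com,org,net") switches
# the crawler into restricted mode; '*' leaves it unrestricted.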
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

$dbhost = $cfg->val($profile, 'dbhost');
$dbname = $cfg->val($profile, 'dbname');
$dbusername = $cfg->val($profile, 'dbuser');
$dbpw = $cfg->val($profile, 'dbpass');
$tmp = $cfg->val($profile, 'allowed_tlds');

if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);

    print "pandia: crawler restricted to these TLDs: ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia: crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw);
die "pandia: failed to connect to MySQL\n" if not $dbh;

print "[OK]\n";

print "pandia: each character represents the following status for a URL:\n";
print " . URL added to indexer queue\n";
print " l crawl exceeded max depth\n";
print " x URL too long or invalid scheme\n";
print " s URL scheme was not http or https\n";
print " d URL was a duplicate\n";
print " b URL domain was found in the blacklist\n";
print " r crawl was blocked by robots.txt\n";
print " F URL contained a fragment\n";
print " T URL was from a disallowed top-level domain\n\n";

if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    # refuse to run a second queue-draining crawler at the same time
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue of $qlen URL(s)\n";

    # drain the queue, crawling each stored URL in turn
    while (my @row = $sth->fetchrow_array()) {
        crawl_url($row[0]);
    }
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";