--- pandia/crawler	2025/06/25 13:44:37	1.1
+++ pandia/crawler	2025/07/02 15:03:05	1.7
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawler,v 1.1 2025/06/25 13:44:37 snw Exp $
+# $Id: crawler,v 1.7 2025/07/02 15:03:05 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,24 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawler,v $
+# Revision 1.7  2025/07/02 15:03:05  snw
+# Add support for restricted mode
+#
+# Revision 1.6  2025/07/01 19:20:47  snw
+# Stop crawling and indexing URLs containing page fragments
+#
+# Revision 1.5  2025/06/28 00:33:32  snw
+# Update locking
+#
+# Revision 1.4  2025/06/27 16:20:30  snw
+# Add blacklist
+#
+# Revision 1.3  2025/06/27 02:14:47  snw
+# Initial operational capability
+#
+# Revision 1.2  2025/06/25 19:38:48  snw
+# Add indexer
+#
 # Revision 1.1  2025/06/25 13:44:37  snw
 # Renaming
 #
@@ -26,30 +44,78 @@
 use HTML::TreeBuilder;
 use URI;
 use DBI;
 use WWW::RobotRules;
-my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
+use Fcntl qw(:flock);
 use LWP::Simple qw(get);
+use Config::IniFiles;
 
+my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";
 my $dsn = "";
 my $skips = 0;
 my $inserts = 0;
 my $seed = "";
 my $depth = 0;
+my $blacklist_matches = 0;
+my $robots_txt_denies = 0;
+my $invalid_scheme_skips = 0;
+my $mode;
 
 sub store_url {
-    my ($url) = @_;
+    my ($url, $parent) = @_;
 
-    if($url ne "" && length($url) <= 255) {
-        print ".";
-        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
+    if (index($url, '#') != -1) {
+        print "F";
+        return;
+    }
+
+    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
+
+        my $u = URI->new($url);
+        my $domain = $u->host;
+        my $scheme = $u->scheme;
+        my @parts = split(/\./, $domain);
+        my $tld = $parts[-1];
+
+        if ($mode eq 'restricted') {
+            my $tld_ok = 0;
+            foreach (@allowed_tlds) {
+                my $allowed = $_;
+
+                if($tld eq $allowed) {
+                    $tld_ok = 1;
+                    last;
+                }
+            }
+            if($tld_ok == 0) {
+                print "T";
+                return;
+            }
+        }
+
+        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
+        $sth->execute($domain);
+
+        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
 
-        $ins->execute($url) or $skips = $skips + 1;
-        $inserts = $inserts + 1;
-        if($depth < $maxdepth) {
-            $depth = $depth + 1;
-            crawl_url($url);
+        if(not $ins->execute($url, $parent, $domain, $scheme)) {
+            $skips = $skips + 1;
+            print "d";
         }
-    }
+        else {
+            print ".";
+            $inserts = $inserts + 1;
+            if($depth < $maxdepth) {
+                $depth = $depth + 1;
+                crawl_url($url);
+            }
+            else {
+                print "l";
+            }
+        }
+    }
+    else {
+        print "x";
+    }
 }
 
 sub crawl_url {
@@ -58,12 +124,16 @@ sub crawl_url {
     my $u = URI->new($url);
 
     if ($u->scheme ne "http" && $u->scheme ne "https") {
+        $invalid_scheme_skips = $invalid_scheme_skips + 1;
+        print "s";
         return;
     }
 
     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
     $sth->execute($u->host);
     if($sth->rows > 0) {
+        print "b";
+        $blacklist_matches = $blacklist_matches + 1;
         return;
     }
 
@@ -73,6 +143,8 @@ sub crawl_url {
     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
 
     if(!$rules->allowed($url)) {
+        print "r";
+        $robots_txt_denies = $robots_txt_denies + 1;
         return;
     }
 
@@ -83,6 +155,8 @@ sub crawl_url {
     my $tree = HTML::TreeBuilder->new();
     my $response = $http->get($url);
+
+    $tree->parse($response->{content});
 
     my @links = $tree->find_by_tag_name('a');
 
@@ -109,25 +183,45 @@ sub crawl_url {
            $final = $href;
        }
 
-        store_url($final);
+        store_url($final, $url);
    }
 
    $depth = $depth - 1;
 }
 
 $| = 1;
-
 print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
 
-GetOptions("dbhost=s" => \$dbhost,
-           "dbname=s" => \$dbname,
-           "dbusername=s" => \$dbusername,
-           "dbpw=s" => \$dbpw,
-           "seed=s" => \$seed,
-           "maxdepth=n" =>\$maxdepth)
+my $profile;
+
+GetOptions("profile=s" => \$profile,
+           "seed=s" => \$seed,
+           "maxdepth=n" =>\$maxdepth)
     or die("error in command line arguments");
 
+my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
+
+$dbhost = $cfg->val($profile, 'dbhost');
+$dbname = $cfg->val($profile, 'dbname');
+$dbusername = $cfg->val($profile, 'dbuser');
+$dbpw = $cfg->val($profile, 'dbpass');
+$tmp = $cfg->val($profile, 'allowed_tlds');
+
+if($tmp ne '*') {
+    $mode = 'restricted';
+    @allowed_tlds = split(',', $tmp);
+    print "pandia: crawler restricted to these TLDs: ";
+    foreach (@allowed_tlds) {
+        print ".$_ ";
+    }
+    print "\n";
+}
+else {
+    print "pandia: crawler unrestricted\n";
+    $mode = 'normal';
+}
+
 print "pandia: connecting to $dbname database at $dbhost...";
 
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
@@ -136,13 +230,25 @@ die "pandia: failed to connect to MySQL
 print "[OK]\n";
 
-if($seed ne "") {
-    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
+print "pandia: each character represents the following status for a URL:\n";
+print "  .  URL added to indexer queue\n";
+print "  l  crawl exceeded max depth\n";
+print "  x  URL empty, too long, or not an http(s) link\n";
+print "  d  URL was a duplicate\n";
+print "  s  URL scheme was not http or https\n";
+print "  b  URL domain is blacklisted\n";
+print "  r  crawl was blocked by robots.txt\n";
+print "  F  URL contained a fragment\n";
+print "  T  URL was from a disallowed top-level domain\n\n";
+
+if($seed ne "") {
+    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
     sleep 1;
     crawl_url($seed);
     print "[OK]\n";
 }
 else {
+    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
+    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
+
     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
     $sth->execute();
     my $qlen = $sth->rows;
@@ -161,3 +267,8 @@ else {
 
 my $total = $inserts + $skips;
 print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
+print " - $blacklist_matches blacklist matches\n";
+print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
+print " - $robots_txt_denies URLs skipped due to robots.txt\n";
+
+
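For reference, a minimal /etc/pandia.ini profile for the new --profile option might look like the sketch below. The section name and all values are illustrative placeholders; only the key names (dbhost, dbname, dbuser, dbpass, allowed_tlds) come from the $cfg->val() calls in this patch. Setting allowed_tlds to '*' leaves the crawler unrestricted; a comma-separated list of TLDs enables restricted mode.

    [default]
    dbhost=localhost
    dbname=pandia
    dbuser=pandia
    dbpass=changeme
    allowed_tlds=org,net

A seeded run against that hypothetical profile might then be invoked as:

    ./crawler --profile=default --seed=https://example.org/ --maxdepth=3

With no --seed given, the crawler instead takes an exclusive flock on /tmp/pandia_crawler.lock and works through the existing crawl_queue.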