--- pandia/crawler	2025/06/27 02:14:47	1.3
+++ pandia/crawler	2025/07/02 15:03:05	1.7
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawler,v 1.3 2025/06/27 02:14:47 snw Exp $
+# $Id: crawler,v 1.7 2025/07/02 15:03:05 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
 #
@@ -9,6 +9,18 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawler,v $
+# Revision 1.7  2025/07/02 15:03:05  snw
+# Add support for restricted mode
+#
+# Revision 1.6  2025/07/01 19:20:47  snw
+# Stop crawling and indexing URLs containing page fragments
+#
+# Revision 1.5  2025/06/28 00:33:32  snw
+# Update locking
+#
+# Revision 1.4  2025/06/27 16:20:30  snw
+# Add blacklist
+#
 # Revision 1.3  2025/06/27 02:14:47  snw
 # Initial operational capability
 #
@@ -32,9 +44,11 @@
 use HTML::TreeBuilder;
 use URI;
 use DBI;
 use WWW::RobotRules;
-my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
+use Fcntl qw(:flock);
 use LWP::Simple qw(get);
+use Config::IniFiles;
+my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";
 my $dsn = "";
 my $skips = 0;
@@ -44,15 +58,39 @@ my $depth = 0;
 my $blacklist_matches = 0;
 my $robots_txt_denies = 0;
 my $invalid_scheme_skips = 0;
+my $mode;
 
 sub store_url {
     my ($url, $parent) = @_;
 
-    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {
-
+    if (index($url, '#') != -1) {
+        print "F";
+        return;
+    }
+
+    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
+
         my $u = URI->new($url);
-        my $domain = $u->host;
+        my $domain = $u->host;
         my $scheme = $u->scheme;
+        my @parts = split(/\./, $domain);
+        my $tld = $parts[-1];
+
+        if ($mode eq 'restricted') {
+            my $tld_ok = 0;
+            foreach (@allowed_tlds) {
+                my $allowed = $_;
+
+                if($tld eq $allowed) {
+                    $tld_ok = 1;
+                    last;
+                }
+            }
+            if($tld_ok == 0) {
+                print "T";
+                return;
+            }
+        }
 
         my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
         $sth->execute($domain);
@@ -118,15 +156,6 @@ sub crawl_url {
 
     my $response = $http->get($url);
 
-    if(not $response->{success}) {
-        print "pandia: http failure; skipping $url\n";
-        next;
-    }
-
-    if(exists $response->{redirects}) {
-        print "pandia: redirects detected; skipping $url\n";
-        next;
-    }
 
     $tree->parse($response->{content});
@@ -164,14 +193,35 @@
 $| = 1;
 
 print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
 
-GetOptions("dbhost=s" => \$dbhost,
-           "dbname=s" => \$dbname,
-           "dbusername=s" => \$dbusername,
-           "dbpw=s" => \$dbpw,
-           "seed=s" => \$seed,
-           "maxdepth=n" =>\$maxdepth)
+my $profile;
+
+GetOptions("profile=s" => \$profile,
+           "seed=s" => \$seed,
+           "maxdepth=n" =>\$maxdepth)
     or die("error in command line arguments");
 
+my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
+
+$dbhost = $cfg->val($profile, 'dbhost');
+$dbname = $cfg->val($profile, 'dbname');
+$dbusername = $cfg->val($profile, 'dbuser');
+$dbpw = $cfg->val($profile, 'dbpass');
+$tmp = $cfg->val($profile, 'allowed_tlds');
+
+if($tmp ne '*') {
+    $mode = 'restricted';
+    @allowed_tlds = split(',', $tmp);
+    print "pandia: crawler restricted to these TLDs: ";
+    foreach (@allowed_tlds) {
+        print ".$_ ";
+    }
+    print "\n";
+}
+else {
+    print "pandia: crawler unrestricted\n";
+    $mode = 'normal';
+}
+
 print "pandia: connecting to $dbname database at $dbhost...";
 
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
@@ -180,13 +230,25 @@ die "pandia: failed to connect to MySQL
 
 print "[OK]\n";
 
-if($seed ne "") {
-    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
$maxdepth"; +print "pandia: each character represents the following status for a URL:\n"; +print " . URL added to indexer queue\n"; +print " l crawl exceeded max depth\n"; +print " x URL too long or invalid scheme\n"; +print " d URL was a duplicate\n"; +print " b crawl was blocked by robots.txt\n"; +print " F URL contained a fragment\n"; +print " T URL was from a disallowed top-level domain\n\n"; + +if($seed ne "") { + print "pandia: crawling seed $seed to a maximum depth of $maxdepth..."; sleep 1; crawl_url($seed); print "[OK]\n"; } else { + open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; + flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; + my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); $sth->execute(); my $qlen = $sth->rows;