--- pandia/crawler	2025/07/01 19:20:47	1.6
+++ pandia/crawler	2025/07/02 15:03:05	1.7
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $
+# $Id: crawler,v 1.7 2025/07/02 15:03:05 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,9 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawler,v $
+# Revision 1.7 2025/07/02 15:03:05 snw
+# Add support for restricted mode
+#
 # Revision 1.6 2025/07/01 19:20:47 snw
 # Stop crawling and indexing URLs containing page fragments
 #
@@ -43,6 +46,7 @@ use DBI;
 use WWW::RobotRules;
 use Fcntl qw(:flock);
 use LWP::Simple qw(get);
+use Config::IniFiles;
 
 my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";
@@ -54,12 +58,13 @@ my $depth = 0;
 my $blacklist_matches = 0;
 my $robots_txt_denies = 0;
 my $invalid_scheme_skips = 0;
+my $mode;
 
 sub store_url {
     my ($url, $parent) = @_;
 
     if (index($url, '#') != -1) {
-        print "pandia: URL contains a fragment; skipping\n";
+        print "F";
         return;
     }
 
@@ -68,6 +73,24 @@ sub store_url {
     my $u = URI->new($url);
     my $domain = $u->host;
     my $scheme = $u->scheme;
+    my @parts = split(/\./, $domain);
+    my $tld = $parts[-1];
+
+    if ($mode eq 'restricted') {
+        my $tld_ok = 0;
+        foreach (@allowed_tlds) {
+            my $allowed = $_;
+
+            if($tld eq $allowed) {
+                $tld_ok = 1;
+                last;
+            }
+        }
+        if($tld_ok == 0) {
+            print "T";
+            return;
+        }
+    }
 
     my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
     $sth->execute($domain);
@@ -170,14 +193,35 @@ $| = 1;
 print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
 
-GetOptions("dbhost=s" => \$dbhost,
-           "dbname=s" => \$dbname,
-           "dbusername=s" => \$dbusername,
-           "dbpw=s" => \$dbpw,
-           "seed=s" => \$seed,
-           "maxdepth=n" =>\$maxdepth)
+my $profile;
+
+GetOptions("profile=s" => \$profile,
+           "seed=s" => \$seed,
+           "maxdepth=n" =>\$maxdepth)
     or die("error in command line arguments");
 
+my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
+
+$dbhost = $cfg->val($profile, 'dbhost');
+$dbname = $cfg->val($profile, 'dbname');
+$dbusername = $cfg->val($profile, 'dbuser');
+$dbpw = $cfg->val($profile, 'dbpass');
+$tmp = $cfg->val($profile, 'allowed_tlds');
+
+if($tmp ne '*') {
+    $mode = 'restricted';
+    @allowed_tlds = split(',', $tmp);
+    print "pandia: crawler restricted to these TLDs: ";
+    foreach (@allowed_tlds) {
+        print ".$_ ";
+    }
+    print "\n";
+}
+else {
+    print "pandia: crawler unrestricted\n";
+    $mode = 'normal';
+}
+
 print "pandia: connecting to $dbname database at $dbhost...";
 
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
@@ -186,8 +230,17 @@ die "pandia: failed to connect to MySQL
 
 print "[OK]\n";
 
-if($seed ne "") {
-    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
+print "pandia: each character represents the following status for a URL:\n";
+print "  .  URL added to indexer queue\n";
+print "  l  crawl exceeded max depth\n";
+print "  x  URL too long or invalid scheme\n";
+print "  d  URL was a duplicate\n";
+print "  b  crawl was blocked by robots.txt\n";
+print "  F  URL contained a fragment\n";
+print "  T  URL was from a disallowed top-level domain\n\n";
+
+if($seed ne "") {
+    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
     sleep 1;
     crawl_url($seed);
     print "[OK]\n";
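
For reference, a minimal sketch of the /etc/pandia.ini profile file this revision starts reading. Only the key names (dbhost, dbname, dbuser, dbpass, allowed_tlds) and the special '*' value come from the diff; the [default] section name and the concrete values below are assumptions for illustration.

    ; /etc/pandia.ini -- hypothetical example profile
    [default]
    dbhost = localhost
    dbname = pandia
    dbuser = pandia
    dbpass = secret
    ; comma-separated list of permitted top-level domains,
    ; or * to leave the crawler unrestricted
    allowed_tlds = com,org,net

With a profile like this, and assuming the script is invoked from the pandia directory, a run might look roughly like (the seed URL and depth are placeholder values):

    ./crawler --profile=default --seed=https://example.com/ --maxdepth=3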