--- pandia/crawler	2025/06/25 13:44:37	1.1
+++ pandia/crawler	2025/06/27 16:20:30	1.4
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawler,v 1.1 2025/06/25 13:44:37 snw Exp $
+# $Id: crawler,v 1.4 2025/06/27 16:20:30 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
 #
@@ -9,6 +9,15 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawler,v $
+# Revision 1.4  2025/06/27 16:20:30  snw
+# Add blacklist
+#
+# Revision 1.3  2025/06/27 02:14:47  snw
+# Initial operational capability
+#
+# Revision 1.2  2025/06/25 19:38:48  snw
+# Add indexer
+#
 # Revision 1.1  2025/06/25 13:44:37  snw
 # Renaming
 #
@@ -26,30 +35,53 @@
 use HTML::TreeBuilder;
 use URI;
 use DBI;
 use WWW::RobotRules;
-my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
+use Fcntl qw(:flock);
 use LWP::Simple qw(get);
 
+my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";
 my $dsn = "";
 my $skips = 0;
 my $inserts = 0;
 my $seed = "";
 my $depth = 0;
+my $blacklist_matches = 0;
+my $robots_txt_denies = 0;
+my $invalid_scheme_skips = 0;
 
 sub store_url {
-    my ($url) = @_;
+    my ($url, $parent) = @_;
 
-    if($url ne "" && length($url) <= 255) {
-        print ".";
-        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
+    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
+
+        my $u = URI->new($url);
+        my $domain = $u->host;
+        my $scheme = $u->scheme;
+
+        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
+        $sth->execute($domain);
+
+        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
 
-        $ins->execute($url) or $skips = $skips + 1;
-        $inserts = $inserts + 1;
-        if($depth < $maxdepth) {
-            $depth = $depth + 1;
-            crawl_url($url);
+        if(not $ins->execute($url, $parent, $domain, $scheme)) {
+            $skips = $skips + 1;
+            print "d";
+        }
+        else {
+            print ".";
+            $inserts = $inserts + 1;
+            if($depth < $maxdepth) {
+                $depth = $depth + 1;
+                crawl_url($url);
+            }
+            else {
+                print "l";
+            }
         }
-    }
+    }
+    else {
+        print "x";
+    }
 }
 sub crawl_url {
@@ -58,12 +90,16 @@ sub crawl_url {
     my $u = URI->new($url);
 
     if ($u->scheme ne "http" && $u->scheme ne "https") {
+        $invalid_scheme_skips = $invalid_scheme_skips + 1;
+        print "s";
         return;
     }
 
     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
     $sth->execute($u->host);
     if($sth->rows > 0) {
+        print "b";
+        $blacklist_matches = $blacklist_matches + 1;
         return;
     }
 
@@ -73,6 +109,8 @@ sub crawl_url {
     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
 
     if(!$rules->allowed($url)) {
+        print "r";
+        $robots_txt_denies = $robots_txt_denies + 1;
         return;
     }
 
@@ -83,6 +121,8 @@ sub crawl_url {
     my $tree = HTML::TreeBuilder->new();
     my $response = $http->get($url);
+
+    $tree->parse($response->{content});
 
     my @links = $tree->find_by_tag_name('a');
 
 
@@ -109,14 +149,13 @@ sub crawl_url {
             $final = $href;
         }
 
-        store_url($final);
+        store_url($final, $url);
     }
 
     $depth = $depth - 1;
 }
 
 
 $| = 1;
-
 print "pandia crawler v0.0.1\n";
 print "    Copyright (C) 2025 Coherent Logic Development LLC\n\n";
@@ -143,6 +182,9 @@ if($seed ne "") {
     print "[OK]\n";
 }
 else {
+    open my $file, ">", "pandia_crawler.lock" or die $!;
+    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
+
     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
     $sth->execute();
     my $qlen = $sth->rows;
@@ -161,3 +203,8 @@ else {
 my $total = $inserts + $skips;
 
 print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
+print "    - $blacklist_matches blacklist matches\n";
+print "    - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
+print "    - $robots_txt_denies URLs skipped due to robots.txt\n";
+
+
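The new status characters give a one-glyph progress trace: "." URL enqueued, "d" dropped as a duplicate insert, "l" depth limit reached, "x" rejected before parsing (empty, longer than 255 characters, mailto:, or not http), "s" skipped for a non-HTTP(S) scheme, "b" blacklisted domain, "r" denied by robots.txt. The "d" path relies on $ins->execute() returning false instead of dying, which needs RaiseError off on the handle plus a uniqueness constraint on crawl_queue.url so that duplicates fail; neither appears in this diff, so both are assumptions in the sketch below, and the DSN and credentials are placeholders.

use strict;
use warnings;
use DBI;

# Illustrative DSN/credentials; RaiseError/PrintError must be off for a
# failed INSERT to report via the return value rather than die().
my $dbh = DBI->connect('dbi:mysql:database=pandia', 'pandia', 'secret',
                       { RaiseError => 0, PrintError => 0 })
    or die "connect failed: $DBI::errstr";

my ($inserts, $skips) = (0, 0);

# Assumes a UNIQUE index on crawl_queue.url, so re-queueing a URL that
# has already been seen fails the statement instead of duplicating it.
my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

if($ins->execute('http://example.com/a.html', 'http://example.com/', 'example.com', 'http')) {
    $inserts = $inserts + 1;
    print ".";
}
else {
    $skips = $skips + 1;
    print "d";
}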
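The robots.txt gate mirrors the WWW::RobotRules calls in crawl_url(). A standalone sketch with an illustrative host; get() returns undef on a failed fetch, in which case nothing is parsed and allowed() permits the URL:

use strict;
use warnings;
use WWW::RobotRules;
use LWP::Simple qw(get);

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');

my $robots_url = "http://example.com/robots.txt";
my $robots_txt = get($robots_url);
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;

if(!$rules->allowed("http://example.com/private/page.html")) {
    print "r";   # the crawler's robots-denial marker
}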
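The added $tree->parse($response->{content}) line reads the response as a plain hash, which matches HTTP::Tiny's interface; $http is constructed outside this diff, so treating it as HTTP::Tiny here is an assumption. A sketch of the fetch-and-extract step, resolving relative hrefs the way the crawler's foreach loop does before calling store_url():

use strict;
use warnings;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;

my $http = HTTP::Tiny->new(agent => 'pandia-crawler/0.0.1');   # assumed client
my $url = "http://example.com/";
my $response = $http->get($url);
die "fetch failed: $response->{status}" unless $response->{success};

my $tree = HTML::TreeBuilder->new();
$tree->parse($response->{content});
$tree->eof();

# find_by_tag_name returns HTML::Element objects; href attributes may
# be relative or absent, hence the checks before enqueueing.
foreach my $link ($tree->find_by_tag_name('a')) {
    my $href = $link->attr('href');
    next unless defined $href;
    print URI->new_abs($href, $url), "\n";
}

$tree->delete();   # HTML::TreeBuilder trees must be freed explicitly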
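The lock file keeps two queue-draining instances from racing each other over crawl_queue. flock() with LOCK_NB fails immediately instead of blocking, and the advisory lock lasts only as long as the filehandle, so $file has to stay in scope for the whole run, which the else block provides. The same guard in isolation:

use strict;
use warnings;
use Fcntl qw(:flock);

# The lock file is created in the current working directory, as in the
# patch. flock() is advisory: it only excludes processes that also ask.
open my $lock, ">", "pandia_crawler.lock" or die $!;
flock($lock, LOCK_EX | LOCK_NB) or die "Unable to lock file $!";

# ... drain the crawl queue here ...

# The lock is released when $lock is closed or the process exits.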