--- pandia/crawler	2025/06/25 13:44:37	1.1
+++ pandia/crawler	2025/06/27 02:14:47	1.3
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawler,v 1.1 2025/06/25 13:44:37 snw Exp $
+# $Id: crawler,v 1.3 2025/06/27 02:14:47 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,12 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawler,v $
+# Revision 1.3  2025/06/27 02:14:47  snw
+# Initial operational capability
+#
+# Revision 1.2  2025/06/25 19:38:48  snw
+# Add indexer
+#
 # Revision 1.1  2025/06/25 13:44:37  snw
 # Renaming
 #
@@ -35,21 +41,43 @@
 my $skips = 0;
 my $inserts = 0;
 my $seed = "";
 my $depth = 0;
+my $blacklist_matches = 0;
+my $robots_txt_denies = 0;
+my $invalid_scheme_skips = 0;
 
 sub store_url {
-    my ($url) = @_;
+    my ($url, $parent) = @_;
+
+    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {
-    if($url ne "" && length($url) <= 255) {
-        print ".";
-        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
+        my $u = URI->new($url);
+        my $domain = $u->host;
+        my $scheme = $u->scheme;
+
+        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
+        $sth->execute($domain);
+
+        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
-        $ins->execute($url) or $skips = $skips + 1;
-        $inserts = $inserts + 1;
-        if($depth < $maxdepth) {
-            $depth = $depth + 1;
-            crawl_url($url);
+        if(not $ins->execute($url, $parent, $domain, $scheme)) {
+            $skips = $skips + 1;
+            print "d";
         }
-    }
+        else {
+            print ".";
+            $inserts = $inserts + 1;
+            if($depth < $maxdepth) {
+                $depth = $depth + 1;
+                crawl_url($url);
+            }
+            else {
+                print "l";
+            }
+        }
+    }
+    else {
+        print "x";
+    }
 }
 
 sub crawl_url {
@@ -58,12 +86,16 @@ sub crawl_url {
     my $u = URI->new($url);
 
     if ($u->scheme ne "http" && $u->scheme ne "https") {
+        $invalid_scheme_skips = $invalid_scheme_skips + 1;
+        print "s";
         return;
     }
 
     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
     $sth->execute($u->host);
     if($sth->rows > 0) {
+        print "b";
+        $blacklist_matches = $blacklist_matches + 1;
         return;
     }
 
@@ -73,6 +105,8 @@ sub crawl_url {
     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
 
     if(!$rules->allowed($url)) {
+        print "r";
+        $robots_txt_denies = $robots_txt_denies + 1;
         return;
     }
 
@@ -83,6 +117,17 @@ sub crawl_url {
     my $tree = HTML::TreeBuilder->new();
     my $response = $http->get($url);
+
+    if(not $response->{success}) {
+        print "pandia: http failure; skipping $url\n";
+        return;
+    }
+
+    if(exists $response->{redirects}) {
+        print "pandia: redirects detected; skipping $url\n";
+        return;
+    }
+
     $tree->parse($response->{content});
 
     my @links = $tree->find_by_tag_name('a');
@@ -109,14 +154,13 @@ sub crawl_url {
             $final = $href;
         }
 
-        store_url($final);
+        store_url($final, $url);
     }
 
     $depth = $depth - 1;
 }
 
 $| = 1;
-
 print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
@@ -161,3 +205,8 @@ else {
 
 my $total = $inserts + $skips;
 print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
+print " - $blacklist_matches blacklist matches\n";
+print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
+print " - $robots_txt_denies URLs skipped due to robots.txt\n";
+
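The prepared statements in this revision imply a small backing schema, sketched below for reference. Only the table and column names (crawl_queue, url_domains, blacklist) come from the diff itself; the MySQL dialect, column types, key constraints, DSN, and credentials are assumptions for illustration. In particular, the sketch assumes a primary key on crawl_queue.url: store_url counts a failed $ins->execute as a skip ("d"), which only de-duplicates URLs if a duplicate INSERT can fail on a key constraint. The VARCHAR(255) widths mirror the length($url) <= 255 guard in store_url.

    #!/usr/bin/env perl
    # Minimal schema sketch implied by the crawler's queries.
    # Assumptions: MySQL, column types, and connection details; only
    # the table and column names appear in the diff above.
    use strict;
    use warnings;
    use DBI;

    # Hypothetical DSN and credentials, for illustration only.
    my $dbh = DBI->connect("DBI:mysql:database=pandia;host=localhost",
                           "pandia", "secret", { RaiseError => 1 });

    # url_domains: inserted unconditionally by store_url(); a primary
    # key makes repeat inserts fail harmlessly instead of accumulating
    # duplicate rows.
    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS url_domains (
            url_domain VARCHAR(255) PRIMARY KEY
        )
    });

    # crawl_queue: the primary key on url is what makes a duplicate
    # $ins->execute(...) fail, which store_url counts as a skip ("d").
    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS crawl_queue (
            url        VARCHAR(255) PRIMARY KEY,
            parent_url VARCHAR(255),
            url_domain VARCHAR(255),
            scheme     VARCHAR(16)
        )
    });

    # blacklist: consulted per-host before each crawl.
    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS blacklist (
            url_domain VARCHAR(255) PRIMARY KEY
        )
    });

    $dbh->disconnect;

For reference, the single-character progress indicators emitted during a run are: "." URL enqueued, "d" duplicate skipped, "l" depth limit reached, "x" rejected by the URL filter, "s" non-HTTP(S) scheme, "b" blacklisted domain, "r" denied by robots.txt.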