
#!/usr/bin/env perl

# 
# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
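
# This script does not create its own tables.  A rough, illustrative MySQL
# schema inferred from the queries below (the real DDL may differ):
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255) PRIMARY KEY);
#   CREATE TABLE crawl_queue (
#       url         VARCHAR(255) PRIMARY KEY,
#       parent_url  VARCHAR(255),
#       url_domain  VARCHAR(255),
#       scheme      VARCHAR(16)
#   );
#   CREATE TABLE blacklist (url_domain VARCHAR(255) PRIMARY KEY);
#
# Duplicate-key failures when inserting into crawl_queue are expected and
# are simply counted as skips.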

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
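
# One progress character is printed per URL considered:
#   .  enqueued              d  duplicate (INSERT failed)
#   l  depth limit reached   x  rejected (empty, mailto:, non-http, too long)
#   s  unsupported scheme    b  blacklisted domain
#   r  denied by robots.txt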

sub store_url {
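    # Validate a discovered URL and enqueue it for indexing.  The URL's
    # domain is recorded in url_domains and the URL itself in crawl_queue;
    # a failed INSERT (typically a duplicate) is counted as a skip.
    # While $depth is below $maxdepth, the new URL is crawled immediately.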
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "pandia:  URL contains a fragment; skipping\n";
        return;
    }
    
    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
	
	my $u = URI->new($url);
	my $domain = $u->host;	
	my $scheme = $u->scheme;

	my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
	$sth->execute($domain);
	
	my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
	    
	if(not $ins->execute($url, $parent, $domain, $scheme)) {
	    $skips = $skips + 1;
	    print "d";
	}
	else {
	    print ".";
	    $inserts = $inserts + 1;	    
	    if($depth < $maxdepth) {
		$depth = $depth + 1;
		crawl_url($url);
		$depth = $depth - 1;    # keep the depth counter balanced even if crawl_url bails out early
	    }
	    else {
		print "l";
	    }
	}
    }
    else {
	print "x";
    }
}

sub crawl_url {
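    # Fetch a single page, subject to the domain blacklist and the site's
    # robots.txt, then hand every <a href> link it contains to store_url().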
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
	$invalid_scheme_skips = $invalid_scheme_skips + 1;
	print "s";
	return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
	print "b";
	$blacklist_matches = $blacklist_matches + 1;
	return;
    }
    
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
	print "r";
	$robots_txt_denies = $robots_txt_denies + 1;
	return;
    }
    
    # Derive the base URL (scheme://host) used later to resolve relative links.
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
    
    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    # Skip pages that could not be fetched successfully.
    return unless $response->{success};

    $tree->parse($response->{content});
    $tree->eof();

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";
    
    foreach my $link (@links) {
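        # Normalise the href against the page's base URL: paths starting with
        # '/' and bare fragments are resolved relative to $baseurl; anything
        # else is treated as an absolute URL.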
        $href = $link->attr('href');
	next unless defined $href && $href ne '';    # some <a> tags have no usable href
	$firstchar = substr($href, 0, 1);
	$final = "";

	if($firstchar eq '/') {
	    $final = $baseurl . $href;	   
	}
	elsif($href eq '##') {
	    $final = $baseurl;
	}
	elsif($firstchar eq '#') {
	    $final = $baseurl . '/' . $href;
	}
	else {
	    $final = $href;
	}
	    
	store_url($final, $url);
    }

}
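
# Main program: with --seed, crawl outward from a single URL; otherwise take
# an exclusive lock and re-crawl every URL already sitting in crawl_queue.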

$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
	   "seed=s" => \$seed,
	   "maxdepth=n" =>\$maxdepth)
    or die("error in command line arguments");
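
# Example invocation (host, database name, and credentials are placeholders):
#
#   ./crawler --dbhost=localhost --dbname=pandia --dbusername=pandia \
#             --dbpw=secret --seed=https://example.com/ --maxdepth=2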

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;

print "[OK]\n";

if($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; 
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    
    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
	my $url = $row[0];
	crawl_url($url);
    }
    print "[OK]\n";
}


my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
    
    
