#!/usr/bin/env perl

# 
# $Id: crawler,v 1.9 2025/07/05 15:27:53 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.9  2025/07/05 15:27:53  snw
# Update
#
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#
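#
# Usage (options per the GetOptions spec below; the profile names a
# section of /etc/pandia.ini):
#
#   ./crawler --profile=<ini-section> [--seed=<url>] [--maxdepth=<n>]
#
# With --seed, crawling starts from the given URL; without it, the
# crawler drains the existing crawl_queue table instead.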

use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;
my $maxdepth = 0;    # overridden by --maxdepth
my @allowed_tlds;    # populated in restricted mode

sub store_url {
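    # Validate a candidate URL and enqueue it for crawling/indexing.
    # Skips URLs containing fragments, empty/over-long/non-HTTP(S) URLs,
    # and, in restricted mode, URLs from disallowed TLDs; after a
    # successful insert it recurses into crawl_url() until $maxdepth is
    # reached.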
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "F";
        return;
    }
    
    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
	
	my $u = URI->new($url);
	my $domain = $u->host;	
	my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];
        
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach my $allowed (@allowed_tlds) {
                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

	my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
	$sth->execute($domain);
	
	my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
	    
	if(not $ins->execute($url, $parent, $domain, $scheme)) {
	    $skips = $skips + 1;
	    print "d";
	}
	else {
	    print ".";
	    $inserts = $inserts + 1;	    
	    if($depth < $maxdepth) {
		$depth = $depth + 1;
		crawl_url($url);
		$depth = $depth - 1;
	    }
	    else {
		print "l";
	    }
	}
    }
    else {
	print "x";
    }
}

sub crawl_url {
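    # Fetch a page and extract its links. Honors the domain blacklist and
    # the site's robots.txt; every <a href> found is resolved to an
    # absolute URL and handed to store_url().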
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
	$invalid_scheme_skips = $invalid_scheme_skips + 1;
	print "s";
	return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
	print "b";
	$blacklist_matches = $blacklist_matches + 1;
	return;
    }
    
    # Fetch this host's robots.txt (if any) and fold it into the shared
    # rule set before checking whether the URL may be crawled.
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
	print "r";
	$robots_txt_denies = $robots_txt_denies + 1;
	return;
    }
    
    my $origurl = $url;
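    # Capture the base URL (scheme://host): the shortest prefix ending in
    # a word character that is followed by a slash or the end of string.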
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
    
    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);
    return unless $response->{success};

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";
    
    foreach my $link (@links) {
        $href = $link->attr('href');
	$firstchar = substr($href, 0, 1);
	$final = "";

	if($firstchar eq '/') {
	    $final = $baseurl . $href;	   
	}
	elsif($href eq '##') {
	    $final = $baseurl;
	}
	elsif($firstchar eq '#') {
	    $final = $baseurl . '/' . $href;
	}
	else {
	    $final = $href;
	}
	    
	store_url($final, $url);
    }

}

$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=n" =>\$maxdepth)
    or die("error in command line arguments");

die("pandia:  no --profile specified\n") unless defined $profile;

my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini")
    or die("pandia:  could not read /etc/pandia.ini\n");

my $dbhost = $cfg->val($profile, 'dbhost');
my $dbname = $cfg->val($profile, 'dbname');
my $dbusername = $cfg->val($profile, 'dbuser');
my $dbpw = $cfg->val($profile, 'dbpass');
my $tmp = $cfg->val($profile, 'allowed_tlds');

if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia:  crawler restricted to these TLDs:  ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia:  crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;

print "[OK]\n";

print "pandia:  each character represents the following status for a URL:\n";
print "  .    URL added to indexer queue\n";
print "  l    crawl exceeded max depth\n";
print "  x    URL too long or invalid scheme\n";
print "  d    URL was a duplicate\n";
print "  b    crawl was blocked by robots.txt\n";
print "  F    URL contained a fragment\n";
print "  T    URL was from a disallowed top-level domain\n\n";

if($seed ne "") {        
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
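    # Queue mode: take an exclusive, non-blocking lock so only one
    # queue-draining crawler instance runs at a time.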
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; 
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    
    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
	my $url = $row[0];
	crawl_url($url);
    }
    print "[OK]\n";
}


my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
    
    
