File: [Coherent Logic Development] / pandia / crawler
Revision 1.9 (checked in by snw, Sat Jul 5 15:27:53 2025 UTC)
Branches: MAIN
CVS tags: HEAD
Commit message: Update

#!/usr/bin/env perl

# 
# $Id: crawler,v 1.9 2025/07/05 15:27:53 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.9  2025/07/05 15:27:53  snw
# Update
#
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;            # overridden by --maxdepth
my @allowed_tlds;            # populated from allowed_tlds in /etc/pandia.ini
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

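# The SQL statements in store_url() and crawl_url() below assume three MySQL
# tables.  This is only an inferred sketch (column names come from the queries
# in this script; types, sizes and keys are assumptions -- note that duplicate
# detection relies on the crawl_queue INSERT failing, which suggests a unique
# key on crawl_queue.url):
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255));
#   CREATE TABLE crawl_queue (url VARCHAR(255) PRIMARY KEY,
#                             parent_url VARCHAR(255),
#                             url_domain VARCHAR(255),
#                             scheme VARCHAR(16));
#   CREATE TABLE blacklist   (url_domain VARCHAR(255));
#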
sub store_url {
    my ($url, $parent) = @_;

    # skip URLs containing page fragments
    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        # in restricted mode, only URLs from the configured TLDs are enqueued
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach my $allowed (@allowed_tlds) {
                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        # a failed insert (e.g. a duplicate URL) is counted as a skip
        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    # skip domains that appear in the blacklist table
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    # respect robots.txt for this host
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # extract scheme://host as the base for resolving relative links
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    # only parse the body if the fetch succeeded
    $tree->parse($response->{content}) if $response->{success};

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href') // '';
        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

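# Illustrative examples of how crawl_url() resolves an href against the base
# URL it extracts (the URLs shown are hypothetical; store_url() then rejects
# any result that still contains a '#' fragment):
#
#   base https://example.com, href "/about"     ->  https://example.com/about
#   base https://example.com, href "##"         ->  https://example.com
#   base https://example.com, href "#section"   ->  https://example.com/#section
#   base https://example.com, href "https://other.example/x"  ->  unchanged
#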
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

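# Example invocations (illustrative only; the profile name "default" and the
# seed URL are placeholders for whatever sections exist in /etc/pandia.ini):
#
#   # crawl a single seed URL, following links up to two levels deep
#   perl crawler --profile=default --seed=https://example.com/ --maxdepth=2
#
#   # no --seed: crawl every URL currently in the crawl_queue table instead
#   perl crawler --profile=default --maxdepth=1
#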
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

my $dbhost = $cfg->val($profile, 'dbhost');
my $dbname = $cfg->val($profile, 'dbname');
my $dbusername = $cfg->val($profile, 'dbuser');
my $dbpw = $cfg->val($profile, 'dbpass');
my $tmp = $cfg->val($profile, 'allowed_tlds');

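# A minimal sketch of the profile section this script expects in
# /etc/pandia.ini (key names taken from the val() calls above; the section
# name and all values below are hypothetical):
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*          ; or a comma-separated list such as org,edu,gov
#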
if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia:  crawler restricted to these TLDs:  ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia:  crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: " . DBI->errstr unless $dbh;

print "[OK]\n";

print "pandia:  each character represents the following status for a URL:\n";
print "  .    URL added to indexer queue\n";
print "  l    crawl exceeded max depth\n";
print "  x    URL empty, too long, or not an http(s) URL\n";
print "  d    URL was a duplicate\n";
print "  s    URL skipped due to invalid scheme\n";
print "  b    URL domain was blacklisted\n";
print "  r    crawl was blocked by robots.txt\n";
print "  F    URL contained a fragment\n";
print "  T    URL was from a disallowed top-level domain\n\n";

if($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    # take an exclusive, non-blocking lock so only one queue crawl runs at a time
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
