File: [Coherent Logic Development] / pandia / crawler
Revision 1.9 (checked in by snw, Sat Jul 5 15:27:53 2025 UTC)
Branches: MAIN
CVS tags: HEAD
Commit message: Update

#!/usr/bin/env perl

# 
# $Id: crawler,v 1.9 2025/07/05 15:27:53 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.9  2025/07/05 15:27:53  snw
# Update
#
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;            # overridden by --maxdepth
my @allowed_tlds;            # populated from allowed_tlds in /etc/pandia.ini
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

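# The SQL statements in store_url() and crawl_url() below assume three MySQL
# tables.  This is only an inferred sketch (column names come from the queries
# in this script; types, sizes and keys are assumptions -- note that duplicate
# detection relies on the crawl_queue INSERT failing, which suggests a unique
# key on crawl_queue.url):
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255));
#   CREATE TABLE crawl_queue (url VARCHAR(255) PRIMARY KEY,
#                             parent_url VARCHAR(255),
#                             url_domain VARCHAR(255),
#                             scheme VARCHAR(16));
#   CREATE TABLE blacklist   (url_domain VARCHAR(255));
#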
sub store_url {
    my ($url, $parent) = @_;

    # skip URLs containing page fragments
    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        # in restricted mode, only URLs from the configured TLDs are enqueued
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach my $allowed (@allowed_tlds) {
                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        # a failed insert (e.g. a duplicate URL) is counted as a skip
        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    # skip domains that appear in the blacklist table
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    # respect robots.txt for this host
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # extract scheme://host as the base for resolving relative links
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    # only parse the body if the fetch succeeded
    $tree->parse($response->{content}) if $response->{success};

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href') // '';
        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

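# Illustrative examples of how crawl_url() resolves an href against the base
# URL it extracts (the URLs shown are hypothetical; store_url() then rejects
# any result that still contains a '#' fragment):
#
#   base https://example.com, href "/about"     ->  https://example.com/about
#   base https://example.com, href "##"         ->  https://example.com
#   base https://example.com, href "#section"   ->  https://example.com/#section
#   base https://example.com, href "https://other.example/x"  ->  unchanged
#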
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

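# Example invocations (illustrative only; the profile name "default" and the
# seed URL are placeholders for whatever sections exist in /etc/pandia.ini):
#
#   # crawl a single seed URL, following links up to two levels deep
#   perl crawler --profile=default --seed=https://example.com/ --maxdepth=2
#
#   # no --seed: crawl every URL currently in the crawl_queue table instead
#   perl crawler --profile=default --maxdepth=1
#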
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

my $dbhost = $cfg->val($profile, 'dbhost');
my $dbname = $cfg->val($profile, 'dbname');
my $dbusername = $cfg->val($profile, 'dbuser');
my $dbpw = $cfg->val($profile, 'dbpass');
my $tmp = $cfg->val($profile, 'allowed_tlds');

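# A minimal sketch of the profile section this script expects in
# /etc/pandia.ini (key names taken from the val() calls above; the section
# name and all values below are hypothetical):
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*          ; or a comma-separated list such as org,edu,gov
#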
if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia:  crawler restricted to these TLDs:  ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia:  crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: " . DBI->errstr unless $dbh;

print "[OK]\n";

print "pandia:  each character represents the following status for a URL:\n";
print "  .    URL added to indexer queue\n";
print "  l    crawl exceeded max depth\n";
print "  x    URL empty, too long, or not an http(s) URL\n";
print "  d    URL was a duplicate\n";
print "  s    URL skipped due to invalid scheme\n";
print "  b    URL domain was blacklisted\n";
print "  r    crawl was blocked by robots.txt\n";
print "  F    URL contained a fragment\n";
print "  T    URL was from a disallowed top-level domain\n\n";

if($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    # take an exclusive, non-blocking lock so only one queue crawl runs at a time
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
