#!/usr/bin/env perl

#
# $Id: crawler,v 1.8 2025/07/02 15:14:44 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

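# The MySQL schema is assumed to provide at least the tables referenced below:
# url_domains (url_domain), crawl_queue (url, parent_url, url_domain, scheme),
# and blacklist (url_domain). A failed INSERT into crawl_queue is how
# previously-seen URLs are detected and counted as skips.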
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

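# store_url($url, $parent): validate a link discovered during a crawl. URLs
# containing fragments, empty or overlong URLs, mailto links, and non-http(s)
# URLs are skipped; in restricted mode the TLD must appear in @allowed_tlds.
# Accepted URLs are recorded in url_domains and crawl_queue, and the crawler
# recurses into them until $maxdepth is reached.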
sub store_url {
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {

        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

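# crawl_url($url): skip non-HTTP(S) and blacklisted hosts, honor robots.txt,
# fetch the page with HTTP::Tiny, extract every <a href>, resolve relative
# links against the page's base URL, and hand each result to store_url().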
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

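# Main program: parse command-line options, load the selected profile from
# /etc/pandia.ini, connect to MySQL, then either crawl a single seed URL or
# work through every URL already waiting in crawl_queue.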
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

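# Typical invocations (profile name and seed URL are illustrative):
#   ./crawler --profile=default --seed=https://example.com/ --maxdepth=3
#   ./crawler --profile=default --maxdepth=3   # crawl the existing crawl_queue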
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

$dbhost = $cfg->val($profile, 'dbhost');
$dbname = $cfg->val($profile, 'dbname');
$dbusername = $cfg->val($profile, 'dbuser');
$dbpw = $cfg->val($profile, 'dbpass');
$tmp = $cfg->val($profile, 'allowed_tlds');

if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia:  crawler restricted to these TLDs:  ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia:  crawler unrestricted\n";
    $mode = 'normal';
}

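# A profile section in /etc/pandia.ini is expected to supply the keys read
# above; a minimal sketch (section name and values are illustrative):
#
#   [default]
#   dbhost = localhost
#   dbname = pandia
#   dbuser = pandia
#   dbpass = secret
#   allowed_tlds = *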
print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

print "pandia:  each character represents the following status for a URL:\n";
print "  .    URL added to indexer queue\n";
print "  l    crawl exceeded max depth\n";
print "  x    URL empty, too long, or not an http(s) link\n";
print "  d    URL was a duplicate\n";
print "  s    URL skipped due to invalid scheme\n";
print "  b    URL domain was blacklisted\n";
print "  r    crawl was blocked by robots.txt\n";
print "  F    URL contained a fragment\n";
print "  T    URL was from a disallowed top-level domain\n\n";

if($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file: $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
