#!/usr/bin/env perl

#
# $Id: crawler,v 1.5 2025/06/28 00:33:32 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

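# Usage (a sketch based on the options parsed below; the host, database and
# credential values are placeholders, not real settings):
#
#   perl crawler --dbhost=db.example.com --dbname=pandia \
#                --dbusername=pandia --dbpw=secret \
#                --seed=https://example.com/ --maxdepth=3
#
# With --seed, crawling starts from that URL and recurses up to --maxdepth
# levels of links; without it, the script drains the existing crawl_queue
# table instead.
#
# Progress legend (single characters printed while crawling):
#   .  URL enqueued                  d  insert skipped (already queued)
#   l  depth limit reached           x  rejected URL (empty/too long/non-HTTP)
#   s  unsupported scheme            b  blacklisted domain
#   r  denied by robots.txt
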
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh;
my $dsn = "";
my $depth = 0;    # current recursion depth below the starting URL

# command-line options, filled in by GetOptions below
my $dbhost = "";
my $dbname = "";
my $dbusername = "";
my $dbpw = "";
my $seed = "";
my $maxdepth = 0;

# per-run counters reported in the summary at the end
my $skips = 0;
my $inserts = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;

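# The crawler expects three MySQL tables.  The columns below are inferred
# from the statements used in this script; the exact types and keys are an
# assumption, not the authoritative schema:
#
#   CREATE TABLE crawl_queue (url VARCHAR(255), parent_url VARCHAR(255),
#                             url_domain VARCHAR(255), scheme VARCHAR(16),
#                             PRIMARY KEY (url));
#   CREATE TABLE url_domains (url_domain VARCHAR(255) PRIMARY KEY);
#   CREATE TABLE blacklist   (url_domain VARCHAR(255) PRIMARY KEY);
#
# store_url($url, $parent) enqueues a discovered URL and, while the current
# depth is below --maxdepth, immediately crawls it in turn.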
sub store_url {
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "pandia:  URL contains a fragment; skipping\n";
        return;
    }

    if ($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;

        # record the domain; failures (typically duplicates) are ignored
        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if (not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if ($depth < $maxdepth) {
                # pair the increment with a decrement here so the depth
                # counter cannot leak when crawl_url() returns early
                $depth = $depth + 1;
                crawl_url($url);
                $depth = $depth - 1;
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

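# crawl_url($url) fetches a single page and feeds every link it finds back
# into store_url.  A URL is skipped when its scheme is not http/https, when
# its domain appears in the blacklist table, or when the site's robots.txt
# (fetched and parsed with WWW::RobotRules) disallows it.  Relative links
# are resolved against the scheme://host portion of the current URL.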
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    my $scheme = defined $u->scheme ? $u->scheme : '';
    if ($scheme ne "http" && $scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if ($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if (!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # scheme://host[:port] portion of the URL, used to resolve relative links
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);
    return unless $response->{success};    # skip pages that could not be fetched

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;

        $firstchar = substr($href, 0, 1);
        $final = "";

        if ($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif ($href eq '##') {
            $final = $baseurl;
        }
        elsif ($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }
}

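# Main program: parse options, connect to MySQL, then either crawl a single
# --seed URL or (when no seed is given) take an exclusive lock and re-crawl
# everything already sitting in the crawl_queue table.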
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments\n");

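# Optional guard (a sketch, not part of the original behaviour): the script
# assumes all four database options were supplied, so a check like the one
# below could fail fast with a clearer message before the connect attempt.
# die "pandia:  --dbhost, --dbname, --dbusername and --dbpw are required\n"
#     unless $dbhost && $dbname && $dbusername && $dbpw;
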
print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: " . DBI->errstr . "\n" unless $dbh;

print "[OK]\n";

if ($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    # take an exclusive, non-blocking lock so only one queue crawler runs at a time
    open my $file, ">", "/tmp/pandia_crawler.lock" or die "pandia:  cannot open lock file: $!";
    flock $file, LOCK_EX|LOCK_NB or die "pandia:  unable to lock file: $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";