Annotation of pandia/indexer, revision 1.5

1.1       snw         1: #!/usr/bin/env perl
                      2: 
                      3: # 
1.5     ! snw         4: # $Id: indexer,v 1.4 2025/06/28 00:33:32 snw Exp $
1.1       snw         5: #  Copyright (C) 2025 Coherent Logic Development LLC
                      6: #
                      7: # Author: Serena Willis <snw@coherent-logic.com>
                      8: #
                      9: # Licensed AGPL-3.0
                     10: #
1.2       snw        11: # $Log: indexer,v $
1.5     ! snw        12: # Revision 1.4  2025/06/28 00:33:32  snw
        !            13: # Update locking
        !            14: #
1.4       snw        15: # Revision 1.3  2025/06/27 16:20:30  snw
                     16: # Add blacklist
                     17: #
1.3       snw        18: # Revision 1.2  2025/06/27 02:14:47  snw
                     19: # Initial operational capability
                     20: #
1.2       snw        21: # Revision 1.1  2025/06/25 19:38:48  snw
                     22: # Add indexer
                     23: #
1.1       snw        24: #
                     25: 
                     26: use Getopt::Long;
                     27: use HTTP::Tiny;
                     28: use HTML::TreeBuilder;
                     29: use URI;
1.2       snw        30: use Lingua::Stem;
1.1       snw        31: use DBI;
1.2       snw        32: use Data::Dumper;
                     33: use Try::Tiny;
1.3       snw        34: use Fcntl qw(:flock);
1.1       snw        35: 
                     36: my $dbh = "";
                     37: my $dsn = "";
                     38: 
1.2       snw        39: $| = 1;
1.1       snw        40: print "pandia indexer v0.0.1\n";
                     41: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
                     42: 
1.4       snw        43: open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
1.3       snw        44: flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
                     45: 
1.1       snw        46: GetOptions("dbhost=s" => \$dbhost,
                     47:            "dbname=s" => \$dbname,
                     48:            "dbusername=s" => \$dbusername,
                     49:            "dbpw=s" => \$dbpw)
                     50:     or die("error in command line arguments");
                     51: 
                     52: print "pandia:  connecting to $dbname database at $dbhost...";
                     53: 
                     54: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
1.2       snw        55: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
1.1       snw        56: die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
                     57: 
                     58: print "[OK]\n";
1.2       snw        59: 
                     60: print "pandia:  loading queue...";
                     61: 
                     62: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
                     63: $sth->execute() or die "pandia:  error retrieving crawl queue\n";
                     64: 
                     65: my $qlen = $sth->rows;
                     66: print "[OK (queue length $qlen)]\n";
                     67: 
                     68: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
                     69: 
                     70: while (my $hashref = $sth->fetchrow_hashref()) {    
                     71:     my $tree = HTML::TreeBuilder->new();
                     72:     my $url = $hashref->{url};
                     73:     my $url_domain = $hashref->{url_domain};
                     74: 
                     75:     my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
                     76:     $stemmer->stem_caching({ -level => 2 });
                     77: 
                     78:     print "pandia:  retrieving $url...\n";
                     79:     try {
                     80:        my $del_queue = 0;
1.5     ! snw        81:        my $head = $http->head($hashref->{url});
        !            82:        my $headers = $head->{headers};
        !            83:        my $content_type = $headers->{'content-type'};
        !            84: 
        !            85:        if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        !            86:            print "pandia:  content type $content_type not indexable; skipping $url\n";
        !            87:            my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        !            88:            $sth->execute($url);
        !            89:            next;
        !            90:        }
        !            91:        
1.2       snw        92:        my $response = $http->get($hashref->{url});
                     93: 
                     94:        if(not $response->{success}) {
                     95:            print "pandia:  http failure; skipping $url\n";
                     96:            $del_queue = 1;
                     97:        }
                     98: 
                     99:        if($del_queue == 1) {
                    100:            my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
                    101:            $sth->execute($url);
                    102:            next;
                    103:        }
                    104:        
                    105:        my $title = "";
                    106:        
                    107:        my $pagedata = $response->{content};    
                    108:        if($response) {
                    109:            $tree->parse($pagedata);   
                    110:            $title = $tree->look_down('_tag', 'title')->as_text;
1.5     ! snw       111:            $title =~ s/[^\x00-\x7F]//g;
1.2       snw       112:            
                    113:            print "pandia:  processing $url [$title]\n";
                    114: 
                    115:            $fulltext = $tree->as_text;
                    116:            $fulltext =~ s/[^\x00-\x7F]//g;
                    117:            
                    118:            my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
                    119:            $sth->execute($url, $title, $fulltext);
                    120:            
                    121:        }
                    122:     } catch {
                    123:        warn "pandia:  caught failure $_\n";
                    124:     };
                    125: 
                    126:     my @words = split(' ', $fulltext);    
                    127:     $stemmer->stem_in_place(@words);
                    128: 
                    129:     my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
                    130:     $sthd->execute($url);
                    131:     
                    132:     my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
1.3       snw       133:     my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
1.2       snw       134:     my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
                    135:     foreach my $word (@words) {
                    136:        $word =~ s/[^\x00-\x7F]//g;
1.3       snw       137:        $sths->execute($word, $url);
1.2       snw       138: 
                    139:        if($sths->rows > 0) {
                    140:            $sthu->execute($word, $url);
                    141:        }
                    142:        else {
                    143:            $sth->execute($word, $url, $url_domain, 1);
                    144:        }
                    145:     }
                    146: 
                    147:     my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
                    148:     $sthuc->execute($url);
                    149: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>