#!/usr/bin/env perl

#
# $Id: Pandia.pm,v 1.1 2025/06/28 23:54:11 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: Pandia.pm,v $
# Revision 1.1  2025/06/28 23:54:11  snw
# Add new OO module
#
#

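# Example usage (a minimal sketch; assumes an /etc/pandia.ini profile named
# 'default' exists with the keys read by new() below):
#
#   use Pandia;
#
#   my $pandia = Pandia->new({profile => 'default'});
#   $pandia->run_index_batch;     # index URLs not yet analyzed
#   $pandia->run_reindex_batch;   # re-check already-analyzed URLs
#
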
package Pandia;

use strict;
use warnings;

use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use Thread::Pool;
use HTTP::Date;
use POSIX qw(strftime);
use threads::shared;            # load explicitly for the ':shared' attribute and lock() below

my $indices_waiting : shared;

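# index($url, $domain, $dsn, $dbuser, $dbpass, $reindex)
#
# Worker routine run inside the Thread::Pool.  It connects to MySQL, issues a
# HEAD request for $url, and either indexes the page for the first time
# ($reindex == 0) or, when $reindex == 1, compares the Last-Modified header
# against url_fulltext.last_indexed_dt and re-indexes only if the page has
# changed.  Indexable pages (text/plain or text/html) are fetched, parsed with
# HTML::TreeBuilder, and stored in url_fulltext; the URL is then marked
# analyzed in crawl_queue and the shared $indices_waiting counter is
# decremented.
#
# The queries below assume tables roughly like the following sketch (column
# names come from the SQL in this module; the types and the DEFAULT on
# last_indexed_dt are illustrative assumptions, not taken from the real
# schema):
#
#   CREATE TABLE crawl_queue (
#       url        VARCHAR(255) PRIMARY KEY,
#       url_domain VARCHAR(255),
#       analyzed   TINYINT DEFAULT 0
#   );
#
#   CREATE TABLE url_fulltext (
#       url             VARCHAR(255) PRIMARY KEY,
#       url_domain      VARCHAR(255),
#       page_title      TEXT,
#       body            LONGTEXT,
#       body_html       LONGTEXT,
#       last_indexed_dt DATETIME DEFAULT CURRENT_TIMESTAMP
#   );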
sub index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    print "pandia:  thread connecting to MySQL database...";

    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if(not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;

    print "pandia:  HEAD $url\n";
    my $head = $http->head($url);
    if(not $head->{success}) {
        print "pandia:  HEAD fail $url\n";
        goto nodb_cleanup;
    }
    else {
        print "pandia:  HEAD OK $url\n";
    }

    my $headers = $head->{headers};
    my $content_type = $headers->{'content-type'};
    my $last_modified;
    my $last_modified_sys;

    if ($reindex == 1) {
        print "pandia:  REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if($last_modified_sys) {
            print "pandia:  GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia:  GOT_LAST_INDEX_DT $url\n";

            if($sth->rows < 1) {
                print "pandia:  page not indexed\n";
                goto nodb_cleanup;
            }

            my $hashref = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});

            if($last_modified_sys > $last_indexed) {
                print "pandia:  $url has been modified since the last time it was indexed\n";
                my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sth->execute($url);
                print "pandia:  INDEXDELETE $url\n";
            }
            else {
                print "pandia:  $url is still up-to-date in the index\n";
                goto cleanup;
            }
        }
        else {
            print "pandia:  no modify info; skipping $url\n";
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia:  INDEX $url\n";
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title = "";
    my $fulltext = "";
    my $fullhtml = "";

    if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia:  content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $response = $http->get($url);

    if(not $response->{success}) {
        print "pandia:  http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $pagedata = $response->{content};
    if($response) {
        $tree->parse($pagedata);
        # guard against pages that have no <title> element
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = $title_elem ? $title_elem->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;

        print "pandia:  processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if($sth->rows > 0) {
            print "pandia:  we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }

        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
        my $tries = 0;
        while(1) {
            print "pandia:  INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if($DBI::err) {
                if($tries > 5) {
                    print "pandia:  giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia:  error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }

    print "pandia:  $url has been processed\n";

  cleanup:
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while(1) {
        $sthuc->execute($url);
        if($DBI::err) {
            $tries = $tries + 1;
            if($tries > 2) {
                print "pandia:  giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia:  DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    # serialize the shared-counter update; a plain decrement is not atomic
    # across worker threads
    lock($indices_waiting);
    $indices_waiting = $indices_waiting - 1;
}

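# new($class, { profile => NAME })
#
# Constructor.  Reads the named profile from /etc/pandia.ini, builds a MySQL
# DSN from its settings, and creates a Thread::Pool of index_workers threads
# whose 'do' routine is index() above.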
sub new {
    my ($class, $args) = @_;

    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini")
        or die "pandia:  cannot read /etc/pandia.ini\n";

    my $thost = $cfg->val($args->{profile}, 'dbhost');
    my $tname = $cfg->val($args->{profile}, 'dbname');
    my $tuser = $cfg->val($args->{profile}, 'dbuser');
    my $tpass = $cfg->val($args->{profile}, 'dbpass');
    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');

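    # The profile section is expected to carry the keys read above.  An
    # illustrative /etc/pandia.ini entry (section name and values are
    # placeholders, not real settings):
    #
    #   [default]
    #   dbhost=localhost
    #   dbname=pandia
    #   dbuser=pandia
    #   dbpass=secret
    #   index_workers=4
    #   crawl_workers=4
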
    $indices_waiting = $tindex_workers;

    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";

    my $self = bless {
        profile => $args->{profile},
        dbhost => $thost,
        dbname => $tname,
        dbuser => $tuser,
        dbpass => $tpass,
        dsn => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do => \&index
            }
            )
    }, $class;

    return $self;
}

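# run_index_batch($self)
#
# Pulls up to index_workers un-analyzed URLs from crawl_queue, hands each one
# to the indexer thread pool (reindex flag 0), then waits for the shared
# counter to drain, giving up after roughly 60 seconds.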
sub run_index_batch {
    my ($self) = @_;

#    open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
#    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia:  creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia:  nothing to index\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

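# run_reindex_batch($self)
#
# Picks up to index_workers already-analyzed URLs from crawl_queue at random
# and hands them to the indexer thread pool with the reindex flag set, so
# index() re-checks each page's Last-Modified header before re-indexing.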
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia:  nothing to reindex\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

1;