#!/usr/bin/env perl

#
# $Id: Pandia.pm,v 1.2 2025/06/30 02:18:44 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: Pandia.pm,v $
# Revision 1.2  2025/06/30 02:18:44  snw
# Updates
#
# Revision 1.1  2025/06/28 23:54:11  snw
# Add new OO module
#
#

package Pandia;

use strict;
use warnings;

use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use Thread::Pool;
use threads::shared;   # needed for the ':shared' attribute on $indices_waiting
use HTTP::Date;
use POSIX qw(strftime);

my $indices_waiting : shared;

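# The SQL in this module assumes a MySQL schema roughly like the sketch
# below.  This is inferred from the columns referenced in this file
# (crawl_queue, url_fulltext, blacklist) and is not the authoritative DDL:
#
#   CREATE TABLE crawl_queue (
#       url        VARCHAR(255) NOT NULL,
#       url_domain VARCHAR(255),
#       analyzed   TINYINT NOT NULL DEFAULT 0
#   );
#
#   CREATE TABLE url_fulltext (
#       url             VARCHAR(255) NOT NULL,
#       url_domain      VARCHAR(255),
#       page_title      TEXT,
#       body            LONGTEXT,
#       body_html       LONGTEXT,
#       last_indexed_dt DATETIME
#   );
#
#   CREATE TABLE blacklist (
#       url_domain VARCHAR(255) NOT NULL
#   );
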
sub do_index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    print "pandia:  thread connecting to MySQL database...";

    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if(not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;

    my $head;
    print "pandia:  HEAD $url\n";
    $head = $http->head($url);

    if(not $head->{success}) {
        print "pandia:  HEAD fail $url\n";

        # drop the failed URL from the queue and release the DB handle before bailing out
        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sthh->execute($url);
        $sthh->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    else {
        print "pandia:  HEAD OK $url\n";
    }

  proc_head:
    my $headers = $head->{headers};
    my $content_type = $headers->{'content-type'};
    my $last_modified;
    my $last_modified_sys;

    if ($reindex == 1) {
        print "pandia:  REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if($last_modified_sys) {
            print "pandia:  GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia:  GOT_LAST_INDEX_DT $url\n";

            if($sth->rows < 1) {
                print "pandia:  page not indexed\n";
                $sth->finish();
                $dbh->disconnect();
                goto nodb_cleanup;
            }

            my $hashref = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});

            if($last_modified_sys > $last_indexed) {
                print "pandia:  $url has been modified since the last time it was indexed\n";
                my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sth->execute($url);
                print "pandia:  INDEXDELETE $url\n";
            }
            else {
                print "pandia:  $url is still up-to-date in the index\n";
                goto cleanup;
            }

        }
        else {
            print "pandia:  no modify info; skipping $url\n";
            $dbh->disconnect();
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia:  INDEX $url\n";
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title = "";
    my $fulltext = "";
    my $fullhtml = "";

    if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia:  content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $response = $http->get($url);

    if(not $response->{success}) {
        print "pandia:  http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $pagedata = $response->{content};
    if($response) {
        $tree->parse($pagedata);
        # guard against pages without a <title> element (look_down returns undef)
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = defined $title_elem ? $title_elem->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;

        print "pandia:  processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if($sth->rows > 0) {
            print "pandia:  we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }

        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
        my $tries = 0;
        while(1) {
            print "pandia:  INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if($DBI::err) {
                if($tries > 5) {
                    print "pandia:  giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia:  error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }

    print "pandia:  $url has been processed\n";


  cleanup:
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while(1) {
        $sthuc->execute($url);
        if($DBI::err) {
            $tries = $tries + 1;
            if($tries > 2) {
                print "pandia:  giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia:  DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    # decrement the shared in-flight counter; lock() keeps concurrent worker
    # threads from racing on the read-modify-write
    lock($indices_waiting);
    $indices_waiting = $indices_waiting - 1;
}

sub blacklist_add {
    my ($self, $domain) = @_;

    print "pandia:  connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    print "pandia:  blacklisting domain $domain...";
    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
    $sth->execute($domain);
    print "[OK]\n";

    print "pandia:  removing blacklisted items from crawl queue...";
    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    print "pandia:  removing blacklisted items from index...";
    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    $sth->finish();
    $dbh->disconnect();
}

sub blacklist_remove {
    my ($self, $domain) = @_;

    print "pandia:  connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
    $sth->execute($domain);

    $sth->finish();
    $dbh->disconnect();
}

sub index_serial {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
    $sth->execute();

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();
}

sub index_one {
    my ($self, $url) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
    $sth->execute($url);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();
}

sub index_domain {
    my ($self, $domain) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
    $sth->execute($domain);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();

}

sub run_index_batch {
    my ($self) = @_;

#    open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
#    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia:  creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia:  nothing to index\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia:  nothing to reindex\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;

}

sub new {
    my ($class, $args) = @_;

    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

    my $thost = $cfg->val($args->{profile}, 'dbhost');
    my $tname = $cfg->val($args->{profile}, 'dbname');
    my $tuser = $cfg->val($args->{profile}, 'dbuser');
    my $tpass = $cfg->val($args->{profile}, 'dbpass');
    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');

    $indices_waiting = $tindex_workers;

    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";

    my $self = bless {
        profile => $args->{profile},
        dbhost => $thost,
        dbname => $tname,
        dbuser => $tuser,
        dbpass => $tpass,
        dsn => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do => \&do_index
            }
            )
    }, $class;

    return $self;
}

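# Example usage (a sketch, not part of the original file): assumes an
# /etc/pandia.ini containing the profile passed to new(), as described above.
#
#   use Pandia;
#
#   my $pandia = Pandia->new({ profile => 'default' });
#
#   $pandia->blacklist_add('example.com');                # purge a domain from queue and index
#   $pandia->index_one('https://example.com/page.html');  # index a single queued URL
#   $pandia->run_index_batch();                           # dispatch queued URLs to worker threads
#   $pandia->run_reindex_batch();                         # re-check pages already analyzed
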
1;
