#!/usr/bin/env perl

# 
# $Id: Pandia.pm,v 1.3 2025/07/01 06:48:03 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: Pandia.pm,v $
# Revision 1.3  2025/07/01 06:48:03  snw
# Updates
#
# Revision 1.2  2025/06/30 02:18:44  snw
# Updates
#
# Revision 1.1  2025/06/28 23:54:11  snw
# Add new OO module
#
#

package Pandia;

use strict;
use warnings;

use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use Thread::Pool;
use HTTP::Date;
use POSIX qw(strftime);

# Load threads::shared explicitly so the ':shared' attribute below is
# available (Thread::Pool may or may not have pulled it in already).
use threads::shared;

# count of index jobs still outstanding in the current batch, shared across
# the indexer worker threads
my $indices_waiting : shared;

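# do_index: fetch a single URL, extract its title and text with
# HTML::TreeBuilder, and store the result in url_fulltext. Runs as a
# Thread::Pool job (or directly from the index_* helpers), opens its own
# database connection, and decrements $indices_waiting when it finishes.
# With $reindex set, it only refreshes pages whose Last-Modified header is
# newer than the stored last_indexed_dt.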
sub do_index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    if (index($url, '#') != -1) {
        print "pandia:  URL contains a fragment; skipping\n";
        $indices_waiting = $indices_waiting - 1;   # release the job slot before bailing out
        return;
    }
    
    print "pandia:  thread connecting to MySQL database...";
    
    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if(not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";
    
    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;
    
    my $head;
    print "pandia:  HEAD $url\n";
    $head = $http->head($url);

    if(not $head->{success}) {
        print "pandia:  HEAD fail $url\n";

        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sthh->execute($url);
        $sthh->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    else {
        print "pandia:  HEAD OK $url\n";
    }

  proc_head:
    my $headers = $head->{headers};
    my $content_type = $headers->{'content-type'};
    my $last_modified;
    my $last_modified_sys;

    if ($reindex == 1) {
        print "pandia:  REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if($last_modified_sys) {
            print "pandia:  GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia:  GOT_LAST_INDEX_DT $url\n";

            if($sth->rows < 1) {
                print "pandia:  page not indexed\n";
                $sth->finish();
                $dbh->disconnect();
                goto nodb_cleanup;
            }

            my $hashref = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});

            if($last_modified_sys > $last_indexed) {
                print "pandia:  $url has been modified since the last time it was indexed\n";
                my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sth->execute($url);
                print "pandia:  INDEXDELETE $url\n";
            }
            else {
                print "pandia:  $url is still up-to-date in the index\n";
                goto cleanup;
            }

        }
        else {
            print "pandia:  no modify info; skipping $url\n";
            $dbh->disconnect();
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia:  INDEX $url\n";
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title = "";
    my $fulltext = "";
    my $fullhtml = "";
    
    if(substr($content_type, 0, 10) ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia:  content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    
    my $response = $http->get($url);
    
    if(not $response->{success}) {
        print "pandia:  http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    
    my $pagedata = $response->{content};
    if($response) {
        $tree->parse($pagedata);
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = $title_elem->as_text if $title_elem;
        $title =~ s/[^\x00-\x7F]//g;
        
        print "pandia:  processing $url [$title]\n";
        
        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if($sth->rows > 0) {
            print "pandia:  we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }
        
        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
        my $tries = 0;
        while(1) {
            print "pandia:  INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if($DBI::err) {
                if($tries > 5) {
                    print "pandia:  giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia:  error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }
    
    print "pandia:  $url has been processed\n";


  cleanup:
    # mark the URL as analyzed, retrying a couple of times on DBI errors
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while(1) {
        $sthuc->execute($url);
        if($DBI::err) {
            $tries = $tries + 1;
            if($tries > 2) {
                print "pandia:  giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia:  DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    # entry point for early exits that have no open database handle;
    # release this job's slot in the outstanding-work counter
    $indices_waiting = $indices_waiting - 1;
}

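# blacklist_add: add a domain to the blacklist table, then purge any
# already-queued or already-indexed URLs belonging to blacklisted domains.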
sub blacklist_add {
    my ($self, $domain) = @_;

    print "pandia:  connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    print "pandia:  blacklisting domain $domain...";
    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
    $sth->execute($domain);
    print "[OK]\n";
    
    print "pandia:  removing blacklisted items from crawl queue...";
    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";
    
    print "pandia:  removing blacklisted items from index...";
    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    $sth->finish();
    $dbh->disconnect();
}

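# blacklist_remove: drop a domain from the blacklist table.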
sub blacklist_remove {
    my ($self, $domain) = @_;

    print "pandia:  connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
    $sth->execute($domain);

    $sth->finish();
    $dbh->disconnect();
}

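# index_serial: index every un-analyzed URL in the crawl queue, one at a
# time, in the calling thread (no worker pool).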
sub index_serial {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
    $sth->execute();

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();
}

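# index_one: look up a single queued URL and index it immediately in the
# calling thread.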
sub index_one {
    my ($self, $url) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
    $sth->execute($url);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();
}

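# index_domain: index every queued URL belonging to the given domain in the
# calling thread.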
sub index_domain {
    my ($self, $domain) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
    $sth->execute($domain);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index $hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
    }

    $sth->finish();
    $dbh->disconnect();

}

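# run_index_batch: pull up to index_workers un-analyzed URLs from the crawl
# queue, hand them to the Thread::Pool indexer workers, then poll
# $indices_waiting until the batch drains or roughly a minute has passed.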
sub run_index_batch {
    my ($self) = @_;

#    open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
#    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia:  creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});
    
    $indices_waiting = $sth->rows;
    
    if($indices_waiting == 0) {
        print "pandia:  nothing to index\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

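# run_reindex_batch: pick up to index_workers already-analyzed URLs at random
# and dispatch them to the worker pool with the reindex flag set, so do_index
# only refreshes pages modified since they were last indexed.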
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;
    
    if($indices_waiting == 0) {
        print "pandia:  nothing to reindex\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

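# new: constructor. Reads connection settings and worker counts for the
# requested profile from /etc/pandia.ini, builds the DBI DSN, and creates the
# Thread::Pool of indexer workers that run do_index.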
sub new {
    my ($class, $args) = @_;
    
    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

    my $thost = $cfg->val($args->{profile}, 'dbhost');
    my $tname = $cfg->val($args->{profile}, 'dbname');
    my $tuser = $cfg->val($args->{profile}, 'dbuser');
    my $tpass = $cfg->val($args->{profile}, 'dbpass');
    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');

    $indices_waiting = $tindex_workers;
    
    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
    
    my $self = bless {
        profile => $args->{profile},
        dbhost => $thost,
        dbname => $tname,
        dbuser => $tuser,
        dbpass => $tpass,
        dsn => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do => \&do_index
            }
            )
    }, $class;

    return $self;
}

1;