Annotation of pandia/Pandia.pm, revision 1.1

1.1     ! snw         1: #!/usr/bin/env perl
        !             2: 
        !             3: # 
        !             4: # $Id$
        !             5: #  Copyright (C) 2025 Coherent Logic Development LLC
        !             6: #
        !             7: # Author: Serena Willis <snw@coherent-logic.com>
        !             8: #
        !             9: # Licensed AGPL-3.0
        !            10: #
        !            11: # $Log$
        !            12: #
        !            13: 
        !            14: package Pandia;
        !            15: 
        !            16: use strict;
        !            17: #use warnings;
        !            18: 
        !            19: use HTTP::Tiny;
        !            20: use HTML::TreeBuilder;
        !            21: use URI;
        !            22: use DBI;
        !            23: use WWW::RobotRules;
        !            24: use Fcntl qw(:flock);
        !            25: use LWP::Simple qw(get);
        !            26: use Config::IniFiles;
        !            27: use Thread::Pool;
        !            28: 
        !            29: my $indices_waiting : shared;
        !            30: 
        # index($url, $domain, $dsn, $dbuser, $dbpass)
        #
        # Worker routine handed to Thread::Pool as its "do" callback.  Fetches
        # $url, extracts title / plain text / HTML, stores them in url_fulltext
        # and then updates crawl_queue:
        #
        #   * HEAD failure, page already stored, or page stored now
        #         -> the crawl_queue row is marked analyzed=1
        #   * content type not indexable, or GET failure
        #         -> the crawl_queue row is deleted
        #   * database connection failure
        #         -> nothing is changed
        #
        # $indices_waiting is decremented exactly once per call on every path.
        # The work runs inside eval so that an unexpected die (for example a
        # failed prepare returning undef) cannot skip the decrement and leave
        # run_index_batch waiting for a job that will never report back.
        sub index {
            my ($url, $domain, $dsn, $dbuser, $dbpass) = @_;

            my $dbh;
            my $ok = eval {
                $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
                if ($dbh) {
                    if (_index_page($dbh, $url, $domain) eq 'drop') {
                        _index_drop_from_queue($dbh, $url);
                    }
                    else {
                        _index_mark_analyzed($dbh, $url);
                    }
                }
                else {
                    print "pandia:  failed to connect to MySQL database\n";
                }
                1;
            };
            my $error = $@;    # copy immediately; the eval below would clobber it
            if (not $ok) {
                # The queue row is deliberately left untouched here so the URL
                # can be picked up again by a later batch.
                print "pandia:  unexpected error while indexing $url: $error\n";
            }
            if ($dbh) {
                eval { $dbh->disconnect(); };
            }

            lock($indices_waiting);
            $indices_waiting = $indices_waiting - 1;
        }

        # Fetch one page and store its text.  Returns 'drop' when the URL should
        # be removed from crawl_queue and 'analyzed' when it should be marked as
        # analyzed.
        sub _index_page {
            my ($dbh, $url, $domain) = @_;

            my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);

            my $head = $http->head($url);
            if (not $head->{success}) {
                print "pandia: http HEAD failure; skipping $url\n";
                return 'analyzed';
            }

            # HTTP::Tiny hands back an array reference when a header is repeated,
            # and the header may be missing altogether.
            my $content_type = $head->{headers}->{'content-type'};
            if (ref($content_type) eq 'ARRAY') {
                $content_type = $content_type->[0];
            }
            if (not defined $content_type) {
                $content_type = "";
            }

            # Accept text/plain and text/html with or without parameters such as
            # "; charset=utf-8"; media type names are case-insensitive.
            if ($content_type !~ m{\A\s*text/(?:plain|html)\s*(?:;|\z)}i) {
                print "pandia:  content type $content_type not indexable; skipping $url\n";
                return 'drop';
            }

            my $response = $http->get($url);
            if (not $response->{success}) {
                print "pandia:  http failure; skipping $url\n";
                return 'drop';
            }

            # Fetch a row instead of trusting $sth->rows, which DBI does not
            # guarantee for SELECT statements.
            my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            my $existing = $sth->fetchrow_arrayref();
            $sth->finish();
            if ($existing) {
                print "pandia:  we already have the full text of $url recorded\n";
                return 'analyzed';
            }

            my $tree = HTML::TreeBuilder->new();
            $tree->parse($response->{content});
            $tree->eof();    # tell the parser the document is complete

            # Pages without a <title> element get an empty title.
            my $title_elem = $tree->look_down('_tag', 'title');
            my $title = $title_elem ? $title_elem->as_text : "";
            my $fulltext = $tree->as_text;
            my $fullhtml = $tree->as_HTML;
            $tree->delete;   # release the parse tree

            # Only 7-bit ASCII is stored.
            $title    =~ s/[^\x00-\x7F]//g;
            $fulltext =~ s/[^\x00-\x7F]//g;
            $fullhtml =~ s/[^\x00-\x7F]//g;

            $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
            my $max_retries = 6;    # one initial attempt plus six retries
            my $retries = 0;
            while (not $sth->execute($url, $domain, $title, $fulltext, $fullhtml)) {
                if ($retries >= $max_retries) {
                    print "pandia:  giving up inserting fulltext on $url\n";
                    last;
                }
                $retries = $retries + 1;
                print "pandia:  error inserting fulltext on $url; retrying\n";
            }
            $sth->finish();

            print "pandia:  $url has been processed\n";
            return 'analyzed';
        }

        # Flag $url as analyzed in crawl_queue, making at most three attempts.
        sub _index_mark_analyzed {
            my ($dbh, $url) = @_;

            my $sth = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
            my $failures = 0;
            while (not $sth->execute($url)) {
                $failures = $failures + 1;
                if ($failures > 2) {
                    print "pandia:  giving up updating crawl_queue for $url\n";
                    last;
                }
                print "pandia:  DBI deadlock; retrying crawl queue update\n";
            }
            $sth->finish();
        }

        # Remove $url from crawl_queue.
        sub _index_drop_from_queue {
            my ($dbh, $url) = @_;

            my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
            $sth->execute($url);
            $sth->finish();
        }
        !           145: 
        # new($class, $args)
        #
        # Constructor.  $args is a hash reference whose "profile" entry names the
        # section of /etc/pandia.ini to read.  From that section it takes dbhost,
        # dbname, dbuser, dbpass, index_workers and crawl_workers, plus an
        # optional dbport (3306 when absent).
        #
        # Side effects: sets $indices_waiting to the configured index worker
        # count and creates a Thread::Pool of that many workers running index().
        #
        # Dies with the configuration file name and the parser's error list when
        # the file cannot be loaded.
        sub new {
            my ($class, $args) = @_;

            my $config_file = "/etc/pandia.ini";
            my $cfg = Config::IniFiles->new(-file => $config_file);
            if (not $cfg) {
                # Config::IniFiles->new returns undef on failure and records the
                # reasons in @Config::IniFiles::errors.
                my $reason = join("; ", @Config::IniFiles::errors);
                die "pandia:  unable to load $config_file: $reason";
            }

            my $profile = $args->{profile};

            my $thost = $cfg->val($profile, 'dbhost');
            my $tname = $cfg->val($profile, 'dbname');
            my $tuser = $cfg->val($profile, 'dbuser');
            my $tpass = $cfg->val($profile, 'dbpass');
            my $tport = $cfg->val($profile, 'dbport', 3306);
            my $tindex_workers = $cfg->val($profile, 'index_workers');
            my $tcrawl_workers = $cfg->val($profile, 'crawl_workers');

            $indices_waiting = $tindex_workers;

            my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=$tport;mysql_connect_timeout=5;";

            my $self = bless {
                profile => $profile,
                dbhost => $thost,
                dbname => $tname,
                dbport => $tport,
                dbuser => $tuser,
                dbpass => $tpass,
                dsn => $tdsn,
                index_workers => $tindex_workers,
                crawl_workers => $tcrawl_workers,
                index_pool => Thread::Pool->new(
                    {
                        workers => $tindex_workers,
                        do => \&index
                    }
                    )
            }, $class;

            return $self;
        }
        !           181: 
        # run_index_batch($self)
        #
        # Takes up to index_workers * 4 unanalyzed rows from crawl_queue, hands
        # each one to the index thread pool, waits for the workers to report
        # back through $indices_waiting, and finally shuts the pool down.
        #
        # The database handle is opened with RaiseError, so connection or query
        # failures propagate to the caller as exceptions.
        sub run_index_batch {
            my ($self) = @_;

        #    open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
        #    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

            print "pandia:  creating $self->{index_workers} indexer threads\n";

            my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

            my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
            $sth->execute($self->{index_workers} * 4);

            # Pull the whole batch into memory and count it, rather than relying
            # on $sth->rows, which DBI does not guarantee for SELECT statements.
            my $queued = $sth->fetchall_arrayref({});
            $sth->finish();
            $dbh->disconnect();

            my $batch_size = scalar @{$queued};

            # The counter must be in place before the first job is queued,
            # because each worker decrements it when it finishes.
            {
                lock($indices_waiting);
                $indices_waiting = $batch_size;
            }

            if ($batch_size == 0) {
                print "pandia:  nothing to index\n";
            }
            else {
                for my $row (@{$queued}) {
                    print "pandia:  sending $row->{url} to worker thread\n";
                    $self->{index_pool}->job($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass});
                }
                print "pandia:  $batch_size total pages to be processed\n";
            }

            # Allow 20 seconds per queued URL, fixed at the start of the wait.
            # Deriving the budget from the live counter would shrink it as
            # workers finish and cut off the last slow page almost immediately.
            my $seconds_per_url = 20;
            my $poll_interval = 10;
            my $budget = $batch_size * $seconds_per_url;

            my $start_time = time();
            while ($indices_waiting > 0) {
                my $time_diff = time() - $start_time;

                if ($time_diff > $budget) {
                    print "pandia:  timing out\n";
                    last;
                }
                print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
                sleep($poll_interval);
            }
            $self->{index_pool}->shutdown;
        }
        !           231: 
        !           232: 1;
        !           233: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>