File:  [Coherent Logic Development] / pandia / Pandia.pm
Revision 1.4: download - view: text, annotated - select for diffs
Tue Jul 1 19:20:47 2025 UTC (4 weeks, 4 days ago) by snw
Branches: MAIN
CVS tags: HEAD
Stop crawling and indexing URLs containing page fragments

    1: #!/usr/bin/env perl
    2: 
    3: # 
    4: # $Id: Pandia.pm,v 1.4 2025/07/01 19:20:47 snw Exp $
    5: #  Copyright (C) 2025 Coherent Logic Development LLC
    6: #
    7: # Author: Serena Willis <snw@coherent-logic.com>
    8: #
    9: # Licensed AGPL-3.0
   10: #
   11: # $Log: Pandia.pm,v $
   12: # Revision 1.4  2025/07/01 19:20:47  snw
   13: # Stop crawling and indexing URLs containing page fragments
   14: #
   15: # Revision 1.3  2025/07/01 06:48:03  snw
   16: # Updates
   17: #
   18: # Revision 1.2  2025/06/30 02:18:44  snw
   19: # Updates
   20: #
   21: # Revision 1.1  2025/06/28 23:54:11  snw
   22: # Add new OO module
   23: #
   24: #
   25: 
   26: package Pandia;
   27: 
   28: use strict;
   29: use warnings;
   30: 
   31: use HTTP::Tiny;
   32: use HTML::TreeBuilder;
   33: use URI;
   34: use DBI;
   35: use WWW::RobotRules;
   36: use Fcntl qw(:flock);
   37: use LWP::Simple qw(get);
   38: use Config::IniFiles;
   39: use Thread::Pool;
   40: use HTTP::Date;
   41: use POSIX qw(strftime);
   42: 
# Count of index jobs currently queued or in-flight.  Each do_index()
# worker decrements it on exit; run_index_batch()/run_reindex_batch()
# poll it while waiting for the pool to drain.
# NOTE(review): the ':shared' attribute requires threads::shared to be
# loaded; presumably Thread::Pool pulls in threads/threads::shared --
# confirm, otherwise this line dies at compile time.
my $indices_waiting : shared;

   45: sub do_index {
   46:     my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;
   47: 
   48:     if (index($url, '#') != -1) {
   49:         print "pandia:  URL contains a fragment; skipping\n";
   50:         return;
   51:     }
   52:     
   53:     print "pandia:  thread connecting to MySQL database...";
   54:     
   55:     my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
   56:     if(not $dbh) {
   57:         print "[FAIL]\n";
   58:         goto nodb_cleanup;
   59:     }
   60:     print "[OK]\n";
   61:     
   62:     my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
   63:     my $tree = HTML::TreeBuilder->new();
   64:     my $tries;
   65:     
   66:     my $head;
   67:     print "pandia:  HEAD $url\n";
   68:     $head = $http->head($url);
   69: 
   70:     if(not $head->{success}) {              
   71:         print "pandia:  HEAD fail $url\n";
   72: 
   73:         my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
   74:         $sthh->execute($url);
   75:         $sthh->finish();
   76:         goto nodb_cleanup;
   77:     }
   78:     else {
   79:         print "pandia:  HEAD OK $url\n";
   80:     }
   81: 
   82:   proc_head:
   83:     my $headers = $head->{headers};
   84:     my $content_type = $headers->{'content-type'};
   85:     my $last_modified;
   86:     my $last_modified_sys;
   87: 
   88:     if ($reindex == 1) {
   89:         print "pandia:  REINDEX $url\n";
   90:         my $last_modified_t = $headers->{'last-modified'};
   91:         $last_modified_sys = str2time($last_modified_t);
   92: 
   93:         if($last_modified_sys) {
   94:             print "pandia:  GET_LAST_INDEX_DT $url\n";
   95:             my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
   96:             $sth->execute($url);
   97:             print "pandia:  GOT_LAST_INDEX_DT $url\n";
   98: 
   99:             if($sth->rows < 1) {
  100:                 print "pandia:  page not indexed\n";
  101:                 goto nodb_cleanup;
  102:             }
  103: 
  104:             my $hashref = $sth->fetchrow_hashref();
  105:             my $last_indexed = str2time($hashref->{last_indexed_dt});
  106: 
  107:             if($last_modified_sys > $last_indexed) {
  108:                 print "pandia:  $url has been modified since the last time it was indexed\n";
  109:                 my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
  110:                 $sth->execute($url);
  111:                 print "pandia:  INDEXDELETE $url\n";
  112:             }
  113:             else {
  114:                 print "pandia:  $url is still up-to-date in the index\n";
  115:                 goto cleanup;
  116:             }
  117: 
  118:         }
  119:         else {
  120:             print "pandia:  no modify info; skipping $url\n";
  121:             goto nodb_cleanup;
  122:         }
  123:     }
  124:     else {
  125:         print "pandia:  INDEX $url\n";
  126:         $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
  127:     }
  128:     
  129:     my $title = "";
  130:     my $fulltext = "";
  131:     my $fullhtml = "";
  132:     
  133:     if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
  134:         print "pandia:  content type $content_type not indexable; skipping $url\n";
  135:         my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
  136:         $sth->execute($url);
  137:         $sth->finish();
  138:         $dbh->disconnect();
  139:         goto nodb_cleanup;
  140:     }
  141:     
  142:     my $response = $http->get($url);
  143:     
  144:     if(not $response->{success}) {
  145:         print "pandia:  http failure; skipping $url\n";
  146:         my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
  147:         $sth->execute($url);
  148:         $sth->finish();
  149:         $dbh->disconnect();
  150:         goto nodb_cleanup;
  151:     }
  152:     
  153:     my $pagedata = $response->{content};    
  154:     if($response) {
  155:         $tree->parse($pagedata);   
  156:         $title = $tree->look_down('_tag', 'title')->as_text;
  157:         $title =~ s/[^\x00-\x7F]//g;
  158:         
  159:         print "pandia:  processing $url [$title]\n";
  160:         
  161:         $fulltext = $tree->as_text;
  162:         $fulltext =~ s/[^\x00-\x7F]//g;
  163: 
  164:         $fullhtml = $tree->as_HTML;
  165:         $fullhtml =~ s/[^\x00-\x7F]//g;
  166: 
  167:         my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
  168:         $sth->execute($url);
  169: 
  170:         if($sth->rows > 0) {
  171:             print "pandia:  we already have the full text of $url recorded\n";
  172:             $sth->finish();
  173:             goto cleanup;
  174:         }
  175:         
  176:         $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
  177:         my $tries = 0;
  178:         while(1) {
  179:             print "pandia:  INSERTINDEX $url\n";
  180:             $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
  181:             if($DBI::err) {
  182:                 if($tries > 5) {
  183:                     print "pandia:  giving up inserting fulltext on $url\n";
  184:                     last;
  185:                 }
  186:                 $tries = $tries + 1;
  187:                 print "pandia:  error inserting fulltext on $url; retrying\n";
  188:                 next;
  189:             }
  190:             else {
  191:                 last;
  192:             }
  193:         }
  194:         $sth->finish();        
  195:     }
  196:     
  197:     print "pandia:  $url has been processed\n";
  198:     
  199: 
  200:   cleanup:
  201:     my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
  202:     $tries = 0;
  203:     while(1) {
  204:         $sthuc->execute($url);
  205:         if($DBI::err) {
  206:             $tries = $tries + 1;
  207:             if($tries > 2) {
  208:                 print "pandia:  giving up updating crawl_queue for $url\n";
  209:                 last;
  210:             }
  211:             print "pandia:  DBI deadlock; retrying crawl queue update\n";           
  212:             next;
  213:         }
  214:         else {
  215:             last;
  216:         }
  217:     }
  218:     $sthuc->finish();
  219:     $dbh->disconnect();
  220: 
  221:   nodb_cleanup:
  222:     $indices_waiting = $indices_waiting - 1;
  223: }
  224: 
  225: sub blacklist_add {
  226:     my ($self, $domain) = @_;
  227: 
  228:     print "pandia:  connecting to database...";
  229:     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
  230:     die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
  231:     print "[OK]\n";
  232: 
  233:     print "pandia:  blacklisting domain $domain...";
  234:     my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
  235:     $sth->execute($domain);
  236:     print "[OK]\n";
  237:     
  238:     print "pandia:  removing blacklisted items from crawl queue...";
  239:     $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
  240:     $sth->execute();
  241:     print "[OK]\n";
  242:     
  243:     print "pandia:  removing blacklisted items from index...";
  244:     $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
  245:     $sth->execute();
  246:     print "[OK]\n";
  247: 
  248:     $sth->finish();
  249:     $dbh->disconnect();
  250: }
  251: 
  252: sub blacklist_remove {
  253:     my ($self, $domain) = @_;
  254: 
  255:     print "pandia:  connecting to database...";
  256:     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
  257:     die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
  258:     print "[OK]\n";
  259: 
  260:     my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
  261:     $sth->execute($domain);
  262: 
  263:     $sth->finish();
  264:     $dbh->disconnect();        
  265: }
  266: 
  267: sub index_serial {
  268:     my ($self) = @_;
  269: 
  270:     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
  271:     
  272:     my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
  273:     $sth->execute();
  274: 
  275:     while (my $hashref = $sth->fetchrow_hashref()) {
  276:         do_index $hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
  277:     }
  278: 
  279:     $sth->finish();
  280:     $dbh->disconnect();
  281: }
  282: 
  283: sub index_one {
  284:     my ($self, $url) = @_;
  285: 
  286:     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
  287:     
  288:     my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
  289:     $sth->execute($url);
  290: 
  291:     while (my $hashref = $sth->fetchrow_hashref()) {
  292:         do_index $url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
  293:     }
  294: 
  295:     $sth->finish();
  296:     $dbh->disconnect();
  297: }
  298: 
  299: sub index_domain {
  300:     my ($self, $domain) = @_;
  301: 
  302:     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
  303:     
  304:     my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
  305:     $sth->execute($domain);
  306: 
  307:     while (my $hashref = $sth->fetchrow_hashref()) {
  308:         do_index $hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
  309:     }
  310: 
  311:     $sth->finish();
  312:     $dbh->disconnect();
  313: 
  314: }
  315: 
# Dispatch up to index_workers un-analyzed queue entries to the
# Thread::Pool workers (each runs do_index with reindex=0), then wait
# up to ~60 seconds for the shared $indices_waiting counter to drain
# before shutting the pool down.
sub run_index_batch {    
    my ($self) = @_;

    # Lock-file guard against concurrent batch runs; currently disabled.
#    open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
#    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia:  creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});   
    
    # NOTE(review): per the DBI docs, $sth->rows is generally not
    # reliable for SELECT statements before fetching; presumably
    # DBD::mysql's default store-result behavior makes it work here --
    # confirm.
    $indices_waiting = $sth->rows;
    
    if($indices_waiting == 0) {
        print "pandia:  nothing to index\n";
        goto done;   # skip dispatch; still finish/disconnect below
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        # Job arguments mirror do_index's signature; final 0 = not a reindex.
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

done:        
    $sth->finish();    
    $dbh->disconnect();

    # Poll the shared counter (decremented by each worker) with a hard
    # 60-second timeout so a stuck worker can't hang the batch forever.
    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}
  363: 
# Re-index pass: pick up to index_workers random already-analyzed queue
# entries and dispatch them to the Thread::Pool workers with reindex=1,
# so do_index() refreshes pages whose Last-Modified header is newer
# than their last_indexed_dt.  Mirrors run_index_batch().
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    
    # ORDER BY RAND() spreads re-index work across the queue over time.
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});   

    # NOTE(review): per the DBI docs, $sth->rows is generally not
    # reliable for SELECT statements before fetching; presumably
    # DBD::mysql's default store-result behavior makes it work here --
    # confirm.
    $indices_waiting = $sth->rows;
    
    if($indices_waiting == 0) {
        print "pandia:  nothing to reindex\n";
        goto done;   # skip dispatch; still finish/disconnect below
    }

    my $tmpi = 0;    
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia:  sending $hashref->{url} to worker thread\n";
        # Final argument 1 = reindex mode in do_index().
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia:  $indices_waiting total pages to be processed\n";

  done:        
    $sth->finish();    
    $dbh->disconnect();

    # Poll the shared counter (decremented by each worker) with a hard
    # 60-second timeout so a stuck worker can't hang the batch forever.
    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia:  timing out\n";
            last;
        }
        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
        
}
  407: 
  408: sub new {
  409:     my ($class, $args) = @_;
  410:     
  411:     my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
  412: 
  413:     my $thost = $cfg->val($args->{profile}, 'dbhost');
  414:     my $tname = $cfg->val($args->{profile}, 'dbname');
  415:     my $tuser = $cfg->val($args->{profile}, 'dbuser');
  416:     my $tpass = $cfg->val($args->{profile}, 'dbpass');
  417:     my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
  418:     my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');
  419: 
  420:     $indices_waiting = $tindex_workers;
  421:     
  422:     my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
  423:     
  424:     my $self = bless {
  425:         profile => $args->{profile},
  426:         dbhost => $thost,
  427:         dbname => $tname,
  428:         dbuser => $tuser,
  429:         dbpass => $tpass,
  430:         dsn => $tdsn,
  431:         index_workers => $tindex_workers,
  432:         crawl_workers => $tcrawl_workers,
  433:         index_pool => Thread::Pool->new(
  434:             {
  435:                 workers => $tindex_workers,
  436:                 do => \&do_index
  437:             }
  438:             )
  439:     }, $class;
  440: 
  441:     return $self;
  442: }
  443: 
  444: 
  445: 1;
  446: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>