Diff for /pandia/Pandia.pm between versions 1.1 and 1.2

--- /pandia/Pandia.pm   version 1.1, 2025/06/28 23:54:11
+++ /pandia/Pandia.pm   version 1.2, 2025/06/30 02:18:44
@@ -9 +9 @@
 # Licensed AGPL-3.0
 #
 # $Log$
+# Revision 1.2  2025/06/30 02:18:44  snw
+# Updates
+#
 # Revision 1.1  2025/06/28 23:54:11  snw
 # Add new OO module
 #
@@ -17 +20 @@
 package Pandia;

 use strict;
-#use warnings;
+use warnings;

 use HTTP::Tiny;
 use HTML::TreeBuilder;
@@ -28 +31 @@ use Fcntl qw(:flock);
 use LWP::Simple qw(get);
 use Config::IniFiles;
 use Thread::Pool;
+use HTTP::Date;
+use POSIX qw(strftime);

 my $indices_waiting : shared;

 sub index {
-    my ($url, $domain, $dsn, $dbuser, $dbpass) = @_;
+    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

+    print "pandia:  thread connecting to MySQL database...";
+
     my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
     if(not $dbh) {
-        print "pandia:  failed to connect to MySQL database\n";
+        print "[FAIL]\n";
         goto nodb_cleanup;
     }
+    print "[OK]\n";

     my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
     my $tree = HTML::TreeBuilder->new();
+    my $tries;

-    my $head = $http->head($url);
-    if(not $head->{success}) {
-        print "pandia: http HEAD failure; skipping $url\n";
-        goto cleanup;
+    my $head;
+    print "pandia:  HEAD $url\n";
+    $head = $http->head($url);
+    if(not $head->{success}) {
+        print "pandia:  HEAD fail $url\n";
+        goto nodb_cleanup;
+    }
+    else {
+        print "pandia:  HEAD OK $url\n";
     }

+  proc_head:
     my $headers = $head->{headers};
     my $content_type = $headers->{'content-type'};
+    my $last_modified;
+    my $last_modified_sys;
+
+    if ($reindex == 1) {
+        print "pandia:  REINDEX $url\n";
+        my $last_modified_t = $headers->{'last-modified'};
+        $last_modified_sys = str2time($last_modified_t);
+
+        if($last_modified_sys) {
+            print "pandia:  GET_LAST_INDEX_DT $url\n";
+            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
+            $sth->execute($url);
+            print "pandia:  GOT_LAST_INDEX_DT $url\n";
+
+            if($sth->rows < 1) {
+                print "pandia:  page not indexed\n";
+                goto nodb_cleanup;
+            }
+
+            my $hashref = $sth->fetchrow_hashref();
+            my $last_indexed = str2time($hashref->{last_indexed_dt});
+
+            if($last_modified_sys > $last_indexed) {
+                print "pandia:  $url has been modified since the last time it was indexed\n";
+                my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
+                $sth->execute($url);
+                print "pandia:  INDEXDELETE $url\n";
+            }
+            else {
+                print "pandia:  $url is still up-to-date in the index\n";
+                goto cleanup;
+            }
+
+        }
+        else {
+            print "pandia:  no modify info; skipping $url\n";
+            goto nodb_cleanup;
+        }
+    }
+    else {
+        print "pandia:  INDEX $url\n";
+        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
+    }

     my $title = "";
     my $fulltext = "";
     my $fullhtml = "";
@@ -80 +140 @@ sub index {
         $title = $tree->look_down('_tag', 'title')->as_text;
         $title =~ s/[^\x00-\x7F]//g;

-        #print "pandia:  processing $url [$title]\n";
+        print "pandia:  processing $url [$title]\n";

         $fulltext = $tree->as_text;
         $fulltext =~ s/[^\x00-\x7F]//g;
@@ -100 +160 @@ sub index {
         $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
         my $tries = 0;
         while(1) {
+            print "pandia:  INSERTINDEX $url\n";
             $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
             if($DBI::err) {
                 if($tries > 5) {
@@ -122 +183 @@ sub index {

   cleanup:
     my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
-    my $tries = 0;
+    $tries = 0;
     while(1) {
         $sthuc->execute($url);
         if($DBI::err) {
@@ -142 +203 @@ sub index {
     $dbh->disconnect();

   nodb_cleanup:
-    lock($indices_waiting);
     $indices_waiting = $indices_waiting - 1;
 }

@@ -193 +253 @@ sub run_index_batch {
     my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

     my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
-    $sth->execute($self->{index_workers} * 4);
+    $sth->execute($self->{index_workers});

     $indices_waiting = $sth->rows;

@@ -206 +266 @@ sub run_index_batch {
     while (my $hashref = $sth->fetchrow_hashref()) {
         $tmpi = $tmpi + 1;
         print "pandia:  sending $hashref->{url} to worker thread\n";
-        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass});
+        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
     }

+    print "pandia:  $indices_waiting total pages to be processed\n";
+
+  done:
+    $sth->finish();
+    $dbh->disconnect();
+
+    my $start_time = time();
+    while($indices_waiting > 0) {
+        my $end_time = time();
+        my $time_diff = $end_time - $start_time;
+
+        if($time_diff > 60) {
+            print "pandia:  timing out\n";
+            last;
+        }
+        print "pandia:  $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
+        sleep(10);
+    }
+    $self->{index_pool}->shutdown;
+}
+
+sub run_reindex_batch {
+    my ($self) = @_;
+
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
+
+    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
+    $sth->execute($self->{index_workers});
+
+    $indices_waiting = $sth->rows;
+
+    if($indices_waiting == 0) {
+        print "pandia:  nothing to reindex\n";
+        goto done;
+    }
+
+    my $tmpi = 0;
+    while (my $hashref = $sth->fetchrow_hashref()) {
+        $tmpi = $tmpi + 1;
+        print "pandia:  sending $hashref->{url} to worker thread\n";
+        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
+    }
+
     print "pandia:  $indices_waiting total pages to be processed\n";

   done:
     $sth->finish();
     $dbh->disconnect();

@@ -222 +323 @@ done:
         my $end_time = time();
         my $time_diff = $end_time - $start_time;

-        if($time_diff > $indices_waiting * 20) {
+        if($time_diff > 60) {
             print "pandia:  timing out\n";
             last;
         }
@@ -230 +331 @@ done:
         sleep(10);
     }
     $self->{index_pool}->shutdown;
-
 }

 1;
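
Not part of the CVS diff above: a minimal, hypothetical driver sketch showing how the two batch entry points touched by this revision might be called. Pandia->new() and its configuration keys (dsn, dbuser, dbpass, index_workers) are assumptions inferred from the fields the module reads; the actual constructor is not shown in this change.

    #!/usr/bin/env perl
    # Hypothetical usage sketch for Pandia.pm -- not part of revision 1.2.
    use strict;
    use warnings;
    use Pandia;

    # Assumed constructor arguments; adjust to the real Pandia->new() signature.
    my $pandia = Pandia->new(
        dsn           => 'DBI:mysql:database=pandia;host=localhost',
        dbuser        => 'pandia',
        dbpass        => 'secret',
        index_workers => 4,
    );

    # As of 1.2, each index job carries a sixth argument: 0 = first-time index,
    # 1 = reindex (re-check Last-Modified against last_indexed_dt).
    $pandia->run_index_batch();      # process crawl_queue rows with analyzed=0
    $pandia->run_reindex_batch();    # revisit a random sample of analyzed=1 rows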
