Diff for /pandia/indexer between versions 1.1 and 1.4

version 1.1, 2025/06/25 19:38:48 version 1.4, 2025/06/28 00:33:32
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.4  2025/06/28 00:33:32  snw
   # Update locking
   #
   # Revision 1.3  2025/06/27 16:20:30  snw
   # Add blacklist
   #
   # Revision 1.2  2025/06/27 02:14:47  snw
   # Initial operational capability
   #
 # Revision 1.1  2025/06/25 19:38:48  snw  # Revision 1.1  2025/06/25 19:38:48  snw
 # Add indexer  # Add indexer
 #  #
Line 18  use Getopt::Long; Line 27  use Getopt::Long;
 use HTTP::Tiny;  use HTTP::Tiny;
 use HTML::TreeBuilder;  use HTML::TreeBuilder;
 use URI;  use URI;
   use Lingua::Stem;
 use DBI;  use DBI;
   use Data::Dumper;
   use Try::Tiny;
   use Fcntl qw(:flock);
   
 my $dbh = "";  my $dbh = "";
 my $dsn = "";  my $dsn = "";
   
   $| = 1;
 print "pandia indexer v0.0.1\n";  print "pandia indexer v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";  print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
   open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
   flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   
 GetOptions("dbhost=s" => \$dbhost,  GetOptions("dbhost=s" => \$dbhost,
            "dbname=s" => \$dbname,             "dbname=s" => \$dbname,
            "dbusername=s" => \$dbusername,             "dbusername=s" => \$dbusername,
Line 35  GetOptions("dbhost=s" => \$dbhost, Line 52  GetOptions("dbhost=s" => \$dbhost,
 print "pandia:  connecting to $dbname database at $dbhost...";  print "pandia:  connecting to $dbname database at $dbhost...";
   
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";  $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
 $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});  $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
 die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;  die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
   
 print "[OK]\n";  print "[OK]\n";
   
   print "pandia:  loading queue...";
   
   my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
   $sth->execute() or die "pandia:  error retrieving crawl queue\n";
   
   my $qlen = $sth->rows;
   print "[OK (queue length $qlen)]\n";
   
   my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
   
   while (my $hashref = $sth->fetchrow_hashref()) {    
       my $tree = HTML::TreeBuilder->new();
       my $url = $hashref->{url};
       my $url_domain = $hashref->{url_domain};
   
       my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
       $stemmer->stem_caching({ -level => 2 });
   
       print "pandia:  retrieving $url...\n";
       try {
           my $del_queue = 0;
           my $response = $http->get($hashref->{url});
   
           if(not $response->{success}) {
               print "pandia:  http failure; skipping $url\n";
               $del_queue = 1;
           }
           
           #if(exists $response->{redirects}) {
           #    print "pandia:  redirects detected; skipping $url\n";
           #    $del_queue = 1;
           #}
   
           if($del_queue == 1) {
               my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
               $sth->execute($url);
               next;
           }
           
           my $title = "";
           
           my $pagedata = $response->{content};    
           if($response) {
               $tree->parse($pagedata);   
               $title = $tree->look_down('_tag', 'title')->as_text;
               
               print "pandia:  processing $url [$title]\n";
   
               $fulltext = $tree->as_text;
               $fulltext =~ s/[^\x00-\x7F]//g;
               
               my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
               $sth->execute($url, $title, $fulltext);
               
           }
       } catch {
           warn "pandia:  caught failure $_\n";
       };
   
       my @words = split(' ', $fulltext);    
       $stemmer->stem_in_place(@words);
   
       my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
       $sthd->execute($url);
       
       my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
       my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
       my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
       foreach my $word (@words) {
           $word =~ s/[^\x00-\x7F]//g;
           $sths->execute($word, $url);
   
           if($sths->rows > 0) {
               $sthu->execute($word, $url);
           }
           else {
               $sth->execute($word, $url, $url_domain, 1);
           }
       }
   
       my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
       $sthuc->execute($url);
   }

Removed from v.1.1  
changed lines
  Added in v.1.4


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>