Diff for /pandia/indexer between versions 1.2 and 1.5

version 1.2, 2025/06/27 02:14:47 version 1.5, 2025/06/28 05:40:11
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.5  2025/06/28 05:40:11  snw
   # Exclude non-textual MIME types
   #
   # Revision 1.4  2025/06/28 00:33:32  snw
   # Update locking
   #
   # Revision 1.3  2025/06/27 16:20:30  snw
   # Add blacklist
   #
 # Revision 1.2  2025/06/27 02:14:47  snw  # Revision 1.2  2025/06/27 02:14:47  snw
 # Initial operational capability  # Initial operational capability
 #  #
Line 25  use Lingua::Stem; Line 34  use Lingua::Stem;
 use DBI;  use DBI;
 use Data::Dumper;  use Data::Dumper;
 use Try::Tiny;  use Try::Tiny;
   use Fcntl qw(:flock);
   
 my $dbh = "";  my $dbh = "";
 my $dsn = "";  my $dsn = "";
Line 33  $| = 1; Line 43  $| = 1;
 print "pandia indexer v0.0.1\n";  print "pandia indexer v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";  print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
   open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
   flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   
 GetOptions("dbhost=s" => \$dbhost,  GetOptions("dbhost=s" => \$dbhost,
            "dbname=s" => \$dbname,             "dbname=s" => \$dbname,
            "dbusername=s" => \$dbusername,             "dbusername=s" => \$dbusername,
Line 68  while (my $hashref = $sth->fetchrow_hash Line 81  while (my $hashref = $sth->fetchrow_hash
     print "pandia:  retrieving $url...\n";      print "pandia:  retrieving $url...\n";
     try {      try {
         my $del_queue = 0;          my $del_queue = 0;
           my $head = $http->head($hashref->{url});
           my $headers = $head->{headers};
           my $content_type = $headers->{'content-type'};
   
           if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
               print "pandia:  content type $content_type not indexable; skipping $url\n";
               my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
               $sth->execute($url);
               next;
           }
           
         my $response = $http->get($hashref->{url});          my $response = $http->get($hashref->{url});
   
         if(not $response->{success}) {          if(not $response->{success}) {
             print "pandia:  http failure; skipping $url\n";              print "pandia:  http failure; skipping $url\n";
             $del_queue = 1;              $del_queue = 1;
         }          }
           
         #if(exists $response->{redirects}) {  
         #    print "pandia:  redirects detected; skipping $url\n";  
         #    $del_queue = 1;  
         #}  
   
         if($del_queue == 1) {          if($del_queue == 1) {
             my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");              my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
Line 92  while (my $hashref = $sth->fetchrow_hash Line 111  while (my $hashref = $sth->fetchrow_hash
         if($response) {          if($response) {
             $tree->parse($pagedata);                 $tree->parse($pagedata);   
             $title = $tree->look_down('_tag', 'title')->as_text;              $title = $tree->look_down('_tag', 'title')->as_text;
               $title =~ s/[^\x00-\x7F]//g;
                           
             print "pandia:  processing $url [$title]\n";              print "pandia:  processing $url [$title]\n";
   
Line 113  while (my $hashref = $sth->fetchrow_hash Line 133  while (my $hashref = $sth->fetchrow_hash
     $sthd->execute($url);      $sthd->execute($url);
           
     my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");      my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
     my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=?");      my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
     my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");      my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
     foreach my $word (@words) {      foreach my $word (@words) {
         $word =~ s/[^\x00-\x7F]//g;          $word =~ s/[^\x00-\x7F]//g;
         $sths->execute($word);          $sths->execute($word, $url);
   
         if($sths->rows > 0) {          if($sths->rows > 0) {
             $sthu->execute($word, $url);              $sthu->execute($word, $url);

Legend: lines removed from v.1.2 | changed lines | lines added in v.1.5


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>