Diff for /pandia/indexer between versions 1.4 and 1.5

version 1.4, 2025/06/28 00:33:32 | version 1.5, 2025/06/28 05:40:11
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.5  2025/06/28 05:40:11  snw
   # Exclude non-textual MIME types
   #
 # Revision 1.4  2025/06/28 00:33:32  snw  # Revision 1.4  2025/06/28 00:33:32  snw
 # Update locking  # Update locking
 #  #
Line 78  while (my $hashref = $sth->fetchrow_hash Line 81  while (my $hashref = $sth->fetchrow_hash
     print "pandia:  retrieving $url...\n";      print "pandia:  retrieving $url...\n";
     try {      try {
         my $del_queue = 0;          my $del_queue = 0;
           my $head = $http->head($hashref->{url});
           my $headers = $head->{headers};
           my $content_type = $headers->{'content-type'};
   
           if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
               print "pandia:  content type $content_type not indexable; skipping $url\n";
               my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
               $sth->execute($url);
               next;
           }
           
         my $response = $http->get($hashref->{url});          my $response = $http->get($hashref->{url});
   
         if(not $response->{success}) {          if(not $response->{success}) {
             print "pandia:  http failure; skipping $url\n";              print "pandia:  http failure; skipping $url\n";
             $del_queue = 1;              $del_queue = 1;
         }          }
           
         #if(exists $response->{redirects}) {  
         #    print "pandia:  redirects detected; skipping $url\n";  
         #    $del_queue = 1;  
         #}  
   
         if($del_queue == 1) {          if($del_queue == 1) {
             my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");              my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
Line 102  while (my $hashref = $sth->fetchrow_hash Line 111  while (my $hashref = $sth->fetchrow_hash
         if($response) {          if($response) {
             $tree->parse($pagedata);                 $tree->parse($pagedata);   
             $title = $tree->look_down('_tag', 'title')->as_text;              $title = $tree->look_down('_tag', 'title')->as_text;
               $title =~ s/[^\x00-\x7F]//g;
                           
             print "pandia:  processing $url [$title]\n";              print "pandia:  processing $url [$title]\n";
   

Legend: Removed from v.1.4 | changed lines | Added in v.1.5


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>