--- pandia/indexer 2025/06/28 00:33:32 1.4 +++ pandia/indexer 2025/06/28 05:40:11 1.5 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # -# $Id: indexer,v 1.4 2025/06/28 00:33:32 snw Exp $ +# $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $ # Copyright (C) 2025 Coherent Logic Development LLC # # Author: Serena Willis @@ -9,6 +9,9 @@ # Licensed AGPL-3.0 # # $Log: indexer,v $ +# Revision 1.5 2025/06/28 05:40:11 snw +# Exclude non-textual MIME types +# # Revision 1.4 2025/06/28 00:33:32 snw # Update locking # @@ -78,17 +81,23 @@ while (my $hashref = $sth->fetchrow_hash print "pandia: retrieving $url...\n"; try { my $del_queue = 0; + my $head = $http->head($hashref->{url}); + my $headers = $head->{headers}; + my $content_type = $headers->{'content-type'}; + + if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') { + print "pandia: content type $content_type not indexable; skipping $url\n"; + my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); + $sth->execute($url); + next; + } + my $response = $http->get($hashref->{url}); if(not $response->{success}) { print "pandia: http failure; skipping $url\n"; $del_queue = 1; } - - #if(exists $response->{redirects}) { - # print "pandia: redirects detected; skipping $url\n"; - # $del_queue = 1; - #} if($del_queue == 1) { my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); @@ -102,6 +111,7 @@ while (my $hashref = $sth->fetchrow_hash if($response) { $tree->parse($pagedata); $title = $tree->look_down('_tag', 'title')->as_text; + $title =~ s/[^\x00-\x7F]//g; print "pandia: processing $url [$title]\n";