pandia/indexer: CVS revisions 1.3 -> 1.5

What this patch changes:
  * Updates the $Id$ keyword and adds $Log$ entries for 1.4 ("Update
    locking") and 1.5 ("Exclude non-textual MIME types").
  * Moves the lock file from pandia_indexer.lock (relative to the working
    directory) to /tmp/pandia_indexer.lock.
  * Issues a HEAD request before the GET.  A URL whose content-type is not
    exactly 'text/plain' and does not start with 'text/html' is deleted from
    crawl_queue and skipped.
  * Removes the commented-out redirect check.
  * Strips every character outside \x00-\x7F from the page title.

Reconstruction note:
  This patch was recovered from a copy whose line breaks and indentation
  had been collapsed.  Line boundaries follow the +/-/context markers and
  the hunk line counts.  Indentation and the placement of blank context
  lines are a best-effort reconstruction and may not match the repository
  byte for byte.  The last hunk is cut off: it carries fewer lines than its
  header (-99,6 +111,7) declares.  Regenerate the diff from CVS before
  applying it.

Review notes on the added code (left unchanged so the patch still matches
revision 1.5):
  * $content_type is passed to `ne` and substr() without a defined() check.
    If the HEAD response carries no content-type header the value is undef.
  * 'text/plain' is compared for exact equality, so a value such as
    'text/plain; charset=utf-8' is treated as not indexable, while
    'text/html' is matched by prefix.
  * `next` is issued inside the try block.  Which try implementation is in
    use is not visible in these hunks; if it is Try::Tiny, the block is an
    anonymous sub and leaving it via `next` emits an "Exiting subroutine"
    warning -- TODO confirm.

--- pandia/indexer 2025/06/27 16:20:30 1.3
+++ pandia/indexer 2025/06/28 05:40:11 1.5
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 #
-# $Id: indexer,v 1.3 2025/06/27 16:20:30 snw Exp $
+# $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,12 @@
 # Licensed AGPL-3.0
 #
 # $Log: indexer,v $
+# Revision 1.5 2025/06/28 05:40:11 snw
+# Exclude non-textual MIME types
+#
+# Revision 1.4 2025/06/28 00:33:32 snw
+# Update locking
+#
 # Revision 1.3 2025/06/27 16:20:30 snw
 # Add blacklist
 #
@@ -37,7 +43,7 @@ $| = 1;
 print "pandia indexer v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
 
-open my $file, ">", "pandia_indexer.lock" or die $!;
+open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
 flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
 
 GetOptions("dbhost=s" => \$dbhost,
@@ -75,17 +81,23 @@ while (my $hashref = $sth->fetchrow_hash
     print "pandia: retrieving $url...\n";
     try {
         my $del_queue = 0;
+        my $head = $http->head($hashref->{url});
+        my $headers = $head->{headers};
+        my $content_type = $headers->{'content-type'};
+
+        if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
+            print "pandia: content type $content_type not indexable; skipping $url\n";
+            my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
+            $sth->execute($url);
+            next;
+        }
+
         my $response = $http->get($hashref->{url});
 
         if(not $response->{success}) {
             print "pandia: http failure; skipping $url\n";
             $del_queue = 1;
         }
-
-        #if(exists $response->{redirects}) {
-        #    print "pandia: redirects detected; skipping $url\n";
-        #    $del_queue = 1;
-        #}
 
         if($del_queue == 1) {
             my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
@@ -99,6 +111,7 @@ while (my $hashref = $sth->fetchrow_hash
         if($response) {
             $tree->parse($pagedata);
             $title = $tree->look_down('_tag', 'title')->as_text;
+            $title =~ s/[^\x00-\x7F]//g;
             print "pandia: processing $url [$title]\n";