--- pandia/indexer 2025/06/27 02:14:47 1.2 +++ pandia/indexer 2025/06/28 05:40:11 1.5 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # -# $Id: indexer,v 1.2 2025/06/27 02:14:47 snw Exp $ +# $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $ # Copyright (C) 2025 Coherent Logic Development LLC # # Author: Serena Willis @@ -9,6 +9,15 @@ # Licensed AGPL-3.0 # # $Log: indexer,v $ +# Revision 1.5 2025/06/28 05:40:11 snw +# Exclude non-textual MIME types +# +# Revision 1.4 2025/06/28 00:33:32 snw +# Update locking +# +# Revision 1.3 2025/06/27 16:20:30 snw +# Add blacklist +# # Revision 1.2 2025/06/27 02:14:47 snw # Initial operational capability # @@ -25,6 +34,7 @@ use Lingua::Stem; use DBI; use Data::Dumper; use Try::Tiny; +use Fcntl qw(:flock); my $dbh = ""; my $dsn = ""; @@ -33,6 +43,9 @@ $| = 1; print "pandia indexer v0.0.1\n"; print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; +open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; +flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; + GetOptions("dbhost=s" => \$dbhost, "dbname=s" => \$dbname, "dbusername=s" => \$dbusername, @@ -68,17 +81,23 @@ while (my $hashref = $sth->fetchrow_hash print "pandia: retrieving $url...\n"; try { my $del_queue = 0; + my $head = $http->head($hashref->{url}); + my $headers = $head->{headers}; + my $content_type = $headers->{'content-type'}; + + if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') { + print "pandia: content type $content_type not indexable; skipping $url\n"; + my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); + $sth->execute($url); + next; + } + my $response = $http->get($hashref->{url}); if(not $response->{success}) { print "pandia: http failure; skipping $url\n"; $del_queue = 1; } - - #if(exists $response->{redirects}) { - # print "pandia: redirects detected; skipping $url\n"; - # $del_queue = 1; - #} if($del_queue == 1) { my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); @@ -92,6 +111,7 @@ while (my $hashref = $sth->fetchrow_hash if($response) { $tree->parse($pagedata); $title = $tree->look_down('_tag', 'title')->as_text; + $title =~ s/[^\x00-\x7F]//g; print "pandia: processing $url [$title]\n"; @@ -113,11 +133,11 @@ while (my $hashref = $sth->fetchrow_hash $sthd->execute($url); my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)"); - my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=?"); + my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?"); my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?"); foreach my $word (@words) { $word =~ s/[^\x00-\x7F]//g; - $sths->execute($word); + $sths->execute($word, $url); if($sths->rows > 0) { $sthu->execute($word, $url);