version 1.2, 2025/06/27 02:14:47
|
version 1.5, 2025/06/28 05:40:11
|
Line 9
|
Line 9
|
# Licensed AGPL-3.0 |
# Licensed AGPL-3.0 |
# |
# |
# $Log$ |
# $Log$ |
|
# Revision 1.5 2025/06/28 05:40:11 snw |
|
# Exclude non-textual MIME types |
|
# |
|
# Revision 1.4 2025/06/28 00:33:32 snw |
|
# Update locking |
|
# |
|
# Revision 1.3 2025/06/27 16:20:30 snw |
|
# Add blacklist |
|
# |
# Revision 1.2 2025/06/27 02:14:47 snw |
# Revision 1.2 2025/06/27 02:14:47 snw |
# Initial operational capability |
# Initial operational capability |
# |
# |
Line 25 use Lingua::Stem;
|
Line 34 use Lingua::Stem;
|
use DBI; |
use DBI; |
use Data::Dumper; |
use Data::Dumper; |
use Try::Tiny; |
use Try::Tiny; |
|
use Fcntl qw(:flock); |
|
|
my $dbh = ""; |
my $dbh = ""; |
my $dsn = ""; |
my $dsn = ""; |
Line 33 $| = 1;
|
Line 43 $| = 1;
|
print "pandia indexer v0.0.1\n"; |
print "pandia indexer v0.0.1\n"; |
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; |
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; |
|
|
|
open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; |
|
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; |
|
|
GetOptions("dbhost=s" => \$dbhost, |
GetOptions("dbhost=s" => \$dbhost, |
"dbname=s" => \$dbname, |
"dbname=s" => \$dbname, |
"dbusername=s" => \$dbusername, |
"dbusername=s" => \$dbusername, |
Line 68 while (my $hashref = $sth->fetchrow_hash
|
Line 81 while (my $hashref = $sth->fetchrow_hash
|
print "pandia: retrieving $url...\n"; |
print "pandia: retrieving $url...\n"; |
try { |
try { |
my $del_queue = 0; |
my $del_queue = 0; |
|
my $head = $http->head($hashref->{url}); |
|
my $headers = $head->{headers}; |
|
my $content_type = $headers->{'content-type'}; |
|
|
|
if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') { |
|
print "pandia: content type $content_type not indexable; skipping $url\n"; |
|
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
|
$sth->execute($url); |
|
next; |
|
} |
|
|
my $response = $http->get($hashref->{url}); |
my $response = $http->get($hashref->{url}); |
|
|
if(not $response->{success}) { |
if(not $response->{success}) { |
print "pandia: http failure; skipping $url\n"; |
print "pandia: http failure; skipping $url\n"; |
$del_queue = 1; |
$del_queue = 1; |
} |
} |
|
|
#if(exists $response->{redirects}) { |
|
# print "pandia: redirects detected; skipping $url\n"; |
|
# $del_queue = 1; |
|
#} |
|
|
|
if($del_queue == 1) { |
if($del_queue == 1) { |
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
Line 92 while (my $hashref = $sth->fetchrow_hash
|
Line 111 while (my $hashref = $sth->fetchrow_hash
|
if($response) { |
if($response) { |
$tree->parse($pagedata); |
$tree->parse($pagedata); |
$title = $tree->look_down('_tag', 'title')->as_text; |
$title = $tree->look_down('_tag', 'title')->as_text; |
|
$title =~ s/[^\x00-\x7F]//g; |
|
|
print "pandia: processing $url [$title]\n"; |
print "pandia: processing $url [$title]\n"; |
|
|
Line 113 while (my $hashref = $sth->fetchrow_hash
|
Line 133 while (my $hashref = $sth->fetchrow_hash
|
$sthd->execute($url); |
$sthd->execute($url); |
|
|
my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)"); |
my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)"); |
my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=?"); |
my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?"); |
my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?"); |
my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?"); |
foreach my $word (@words) { |
foreach my $word (@words) { |
$word =~ s/[^\x00-\x7F]//g; |
$word =~ s/[^\x00-\x7F]//g; |
$sths->execute($word); |
$sths->execute($word, $url); |
|
|
if($sths->rows > 0) { |
if($sths->rows > 0) { |
$sthu->execute($word, $url); |
$sthu->execute($word, $url); |