version 1.1, 2025/06/25 19:38:48
|
version 1.3, 2025/06/27 16:20:30
|
Line 9
|
Line 9
|
# Licensed AGPL-3.0 |
# Licensed AGPL-3.0 |
# |
# |
# $Log$ |
# $Log$ |
|
# Revision 1.3 2025/06/27 16:20:30 snw |
|
# Add blacklist |
|
# |
|
# Revision 1.2 2025/06/27 02:14:47 snw |
|
# Initial operational capability |
|
# |
# Revision 1.1 2025/06/25 19:38:48 snw |
# Revision 1.1 2025/06/25 19:38:48 snw |
# Add indexer |
# Add indexer |
# |
# |
Line 18 use Getopt::Long;
|
Line 24 use Getopt::Long;
|
use HTTP::Tiny; |
use HTTP::Tiny; |
use HTML::TreeBuilder; |
use HTML::TreeBuilder; |
use URI; |
use URI; |
|
use Lingua::Stem; |
use DBI; |
use DBI; |
|
use Data::Dumper; |
|
use Try::Tiny; |
|
use Fcntl qw(:flock); |
|
|
# Database handle and DSN start out as empty strings; both are assigned
# further down, after the command-line options have been read.
my ($dbh, $dsn) = ("", "");

# Unbuffer STDOUT so progress messages printed without a trailing newline
# show up immediately.
$| = 1;

print "pandia indexer v0.0.1\n",
      " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

# Take an exclusive, non-blocking lock on the lock file and abort if it cannot
# be obtained. The handle lives in a file-scoped lexical, so the lock stays
# held until the handle is closed or the process exits.
open(my $file, ">", "pandia_indexer.lock") or die $!;
flock($file, LOCK_EX | LOCK_NB) or die "Unable to lock file $!";
|
|
GetOptions("dbhost=s" => \$dbhost, |
GetOptions("dbhost=s" => \$dbhost, |
"dbname=s" => \$dbname, |
"dbname=s" => \$dbname, |
"dbusername=s" => \$dbusername, |
"dbusername=s" => \$dbusername, |
Line 35 GetOptions("dbhost=s" => \$dbhost,
|
Line 49 GetOptions("dbhost=s" => \$dbhost,
|
# Connect to the MySQL database selected by the command-line options and
# abort the run if the connection cannot be established.
print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

unless ($dbh) {
    # A method call is not interpolated inside a double-quoted string, so the
    # driver's error text has to be concatenated onto the message explicitly.
    my $reason = defined $DBI::errstr ? $DBI::errstr : "unknown error";
    die "pandia: failed to connect to MySQL database: " . $reason;
}

print "[OK]\n";
|
|
|
# Load the crawl queue: select every crawl_queue row whose analyzed flag is 0
# and report how many rows the statement handle says it returned.
print "pandia: loading queue...";

my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
unless ($sth->execute()) {
    die "pandia: error retrieving crawl queue\n";
}

my $qlen = $sth->rows;
print "[OK (queue length ", $qlen, ")]\n";

# HTTP client used to retrieve the queued pages.
my $http = HTTP::Tiny->new(
    agent   => "pandia-crawler/0.0.1",
    timeout => 5,
);
|
|
|
# Main indexing loop. For every row of the crawl-queue result set: fetch the
# page, store its title and ASCII-only text in url_fulltext, rebuild the
# keyword counts for that URL from the stemmed words, and flag the queue row
# as analyzed. URLs whose HTTP request fails are removed from the queue.

# The stemmer and the statement handles do not depend on the row being
# processed, so they are created once rather than once per URL.
my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

my $sth_drop_queued  = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
my $sth_add_fulltext = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
my $sth_drop_words   = $dbh->prepare("DELETE FROM keywords WHERE url=?");
my $sth_add_word     = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
my $sth_mark_done    = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");

while (my $hashref = $sth->fetchrow_hashref()) {
    my $url        = $hashref->{url};
    my $url_domain = $hashref->{url_domain};

    # Both are scoped to this iteration, so a page that fails part-way can
    # never be indexed with text left over from the previous URL.
    my $fulltext  = "";
    my $del_queue = 0;

    my $tree = HTML::TreeBuilder->new();

    print "pandia: retrieving $url...\n";

    # Try::Tiny runs this block as an anonymous sub, so loop control
    # (next/last) is not used in here; the outcome is recorded in $del_queue
    # and acted on once the block has finished.
    try {
        my $response = $http->get($url);

        if (not $response->{success}) {
            print "pandia: http failure; skipping $url\n";
            $del_queue = 1;
            return;    # leaves the try block only
        }

        # parse() must be followed by eof() so the tree is finalised before
        # it is queried.
        $tree->parse($response->{content});
        $tree->eof();

        # Pages without a <title> element are stored with an empty title
        # instead of dying on an undefined element.
        my $title_node = $tree->look_down('_tag', 'title');
        my $title      = $title_node ? $title_node->as_text : "";

        print "pandia: processing $url [$title]\n";

        # Keep only 7-bit ASCII characters of the page text.
        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $sth_add_fulltext->execute($url, $title, $fulltext);
    }
    catch {
        warn "pandia: caught failure $_\n";
    };

    # Explicitly destroy the parse tree so it is released before the next
    # page is fetched.
    $tree->delete;

    if ($del_queue) {
        $sth_drop_queued->execute($url);
        next;
    }

    # After a caught failure $fulltext is still empty, so the URL ends up
    # with no keywords and is marked as analyzed.
    my @words = split(' ', $fulltext);
    $stemmer->stem_in_place(@words);

    # Tally occurrences in memory and write one row per distinct word,
    # instead of a SELECT plus INSERT/UPDATE for every single occurrence.
    # The words come from text that is already ASCII-only, so no further
    # character stripping is needed here.
    # NOTE(review): distinct words are compared byte-wise here; this assumes
    # the stemmer output is already case-normalised so it agrees with the
    # keywords table's collation - confirm.
    my %count_of;
    $count_of{$_}++ for @words;

    $sth_drop_words->execute($url);
    for my $word (sort keys %count_of) {
        $sth_add_word->execute($word, $url, $url_domain, $count_of{$word});
    }

    $sth_mark_done->execute($url);
}