--- pandia/indexer	2025/06/25 19:38:48	1.1
+++ pandia/indexer	2025/06/27 02:14:47	1.2
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 #
-# $Id: indexer,v 1.1 2025/06/25 19:38:48 snw Exp $
+# $Id: indexer,v 1.2 2025/06/27 02:14:47 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,9 @@
 # Licensed AGPL-3.0
 #
 # $Log: indexer,v $
+# Revision 1.2 2025/06/27 02:14:47 snw
+# Initial operational capability
+#
 # Revision 1.1 2025/06/25 19:38:48 snw
 # Add indexer
 #
@@ -18,11 +21,15 @@
 use Getopt::Long;
 use HTTP::Tiny;
 use HTML::TreeBuilder;
 use URI;
+use Lingua::Stem;
 use DBI;
+use Data::Dumper;
+use Try::Tiny;
 
 my $dbh = "";
 my $dsn = "";
+$| = 1;
 
 print "pandia indexer v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
@@ -35,7 +42,91 @@ GetOptions("dbhost=s" => \$dbhost,
 
 print "pandia: connecting to $dbname database at $dbhost...";
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
-$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
+$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
 
 die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
 print "[OK]\n";
+
+print "pandia: loading queue...";
+
+my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
+$sth->execute() or die "pandia: error retrieving crawl queue\n";
+
+my $qlen = $sth->rows;
+print "[OK (queue length $qlen)]\n";
+
+my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
+
+while (my $hashref = $sth->fetchrow_hashref()) {
+    my $tree = HTML::TreeBuilder->new();
+    my $url = $hashref->{url};
+    my $url_domain = $hashref->{url_domain};
+
+    my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
+    $stemmer->stem_caching({ -level => 2 });
+
+    print "pandia: retrieving $url...\n";
+    try {
+        my $del_queue = 0;
+        my $response = $http->get($hashref->{url});
+
+        if(not $response->{success}) {
+            print "pandia: http failure; skipping $url\n";
+            $del_queue = 1;
+        }
+
+        #if(exists $response->{redirects}) {
+        #    print "pandia: redirects detected; skipping $url\n";
+        #    $del_queue = 1;
+        #}
+
+        if($del_queue == 1) {
+            my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
+            $sth->execute($url);
+            next;
+        }
+
+        my $title = "";
+
+        my $pagedata = $response->{content};
+        if($response) {
+            $tree->parse($pagedata);
+            $title = $tree->look_down('_tag', 'title')->as_text;
+
+            print "pandia: processing $url [$title]\n";
+
+            $fulltext = $tree->as_text;
+            $fulltext =~ s/[^\x00-\x7F]//g;
+
+            my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
+            $sth->execute($url, $title, $fulltext);
+
+        }
+    } catch {
+        warn "pandia: caught failure $_\n";
+    };
+
+    my @words = split(' ', $fulltext);
+    $stemmer->stem_in_place(@words);
+
+    my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
+    $sthd->execute($url);
+
+    my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
+    my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=?");
+    my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
+    foreach my $word (@words) {
+        $word =~ s/[^\x00-\x7F]//g;
+        $sths->execute($word);
+
+        if($sths->rows > 0) {
+            $sthu->execute($word, $url);
+        }
+        else {
+            $sth->execute($word, $url, $url_domain, 1);
+        }
+    }
+
+    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
+    $sthuc->execute($url);
+}