#!/usr/bin/env perl
#
# $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis
#
# Licensed AGPL-3.0
#
# $Log: indexer,v $
# Revision 1.5  2025/06/28 05:40:11  snw
# Exclude non-textual MIME types
#
# Revision 1.4  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.3  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.2  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.1  2025/06/25 19:38:48  snw
# Add indexer
#
#
# Overview
# --------
# Takes an exclusive, non-blocking lock on /tmp/pandia_indexer.lock, connects
# to the MySQL database named on the command line, and walks every row of
# crawl_queue with analyzed=0.  For each URL it:
#
#   1. issues a HEAD request and drops the URL from crawl_queue unless the
#      content type is text/html or text/plain;
#   2. issues a GET request and drops the URL from crawl_queue on failure;
#   3. stores the ASCII-only page title and body text in url_fulltext;
#   4. stems the body's words and rebuilds the per-URL word counts in keywords;
#   5. marks the crawl_queue row as analyzed.
#
# Options: --dbhost, --dbname, --dbusername, --dbpw (all take a string).
#

use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use Lingua::Stem;
use DBI;
use Data::Dumper;
use Try::Tiny;
use Fcntl qw(:flock);

my $dbh = "";
my $dsn = "";

# Unbuffered STDOUT so the "...[OK]" progress lines appear as they happen.
$| = 1;

print "pandia indexer v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

# Single-instance guard: the handle stays open (and the lock held) for the
# lifetime of the process because $lock_fh is a file-scoped lexical.
open my $lock_fh, ">", "/tmp/pandia_indexer.lock" or die $!;
flock $lock_fh, LOCK_EX | LOCK_NB or die "Unable to lock file $!";

# Host and database name are interpolated into the DSN, so default them to
# empty strings.  Username and password stay undef when not supplied so DBI's
# own handling of undefined credentials is left untouched.
my $dbhost = "";
my $dbname = "";
my $dbusername;
my $dbpw;

GetOptions("dbhost=s"     => \$dbhost,
           "dbname=s"     => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s"       => \$dbpw)
    or die("error in command line arguments");

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

# A method call does not interpolate inside a string; use $DBI::errstr so the
# real reason for the failure is reported.
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

print "pandia: loading queue...";

my $queue_sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0")
    or die "pandia: error retrieving crawl queue\n";
$queue_sth->execute() or die "pandia: error retrieving crawl queue\n";

# NOTE(review): DBI only guarantees rows() after non-SELECT statements; the
# value is used for this informational message only.
my $qlen = $queue_sth->rows;

print "[OK (queue length $qlen)]\n";

my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);

# One stemmer for the whole run instead of one per URL.
my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

# Accept text/html and text/plain, with or without parameters such as
# "; charset=utf-8", compared case-insensitively.
my $indexable_type_re = qr{\A \s* text/(?:html|plain) \s* (?: ; | \z )}xi;

# Every statement used inside the loop is prepared once, up front.
my $dequeue_sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?")
    or die "pandia: error preparing statement\n";
my $fulltext_sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)")
    or die "pandia: error preparing statement\n";
my $kw_delete_sth = $dbh->prepare("DELETE FROM keywords WHERE url=?")
    or die "pandia: error preparing statement\n";
my $kw_insert_sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)")
    or die "pandia: error preparing statement\n";
my $kw_select_sth = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?")
    or die "pandia: error preparing statement\n";
my $kw_update_sth = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?")
    or die "pandia: error preparing statement\n";
my $analyzed_sth = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?")
    or die "pandia: error preparing statement\n";

URL:
while (my $entry = $queue_sth->fetchrow_hashref()) {
    my $url        = $entry->{url};
    my $url_domain = $entry->{url_domain};

    # Reset per URL: if retrieval or parsing fails, no text from a previously
    # processed page may be indexed under this URL.
    my $fulltext = '';
    my $skip_url = 0;
    my $tree;

    print "pandia: retrieving $url...\n";

    try {
        # The try block is an anonymous sub, so loop control (next) must not
        # be used in here; set $skip_url and return from the block instead.
        my $head         = $http->head($url);
        my $content_type = $head->{headers}{'content-type'};

        # HTTP::Tiny returns an array ref when a header is repeated.
        $content_type = $content_type->[0] if ref($content_type) eq 'ARRAY';
        $content_type = '' unless defined $content_type;

        if ($content_type !~ $indexable_type_re) {
            print "pandia: content type $content_type not indexable; skipping $url\n";
            $dequeue_sth->execute($url);
            $skip_url = 1;
            return;
        }

        my $response = $http->get($url);
        if (not $response->{success}) {
            print "pandia: http failure; skipping $url\n";
            $dequeue_sth->execute($url);
            $skip_url = 1;
            return;
        }

        $tree = HTML::TreeBuilder->new();
        $tree->parse($response->{content});
        $tree->eof();    # required after parse() to finish building the tree

        # Plain-text pages and some HTML pages have no <title> element.
        my $title_node = $tree->look_down('_tag', 'title');
        my $title      = $title_node ? $title_node->as_text : '';
        $title =~ s/[^\x00-\x7F]//g;    # keep ASCII only

        print "pandia: processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;    # keep ASCII only

        $fulltext_sth->execute($url, $title, $fulltext);
    }
    catch {
        warn "pandia: caught failure $_\n";
    };

    # Release the parse tree explicitly.
    $tree->delete if $tree;

    next URL if $skip_url;

    my @words = split(' ', $fulltext);
    $stemmer->stem_in_place(@words);

    # Rebuild this URL's keyword rows from scratch.
    $kw_delete_sth->execute($url);

    foreach my $word (@words) {
        next unless defined $word;
        $word =~ s/[^\x00-\x7F]//g;
        next unless length $word;

        # Fetch a row rather than trusting rows() on a SELECT.
        $kw_select_sth->execute($word, $url);
        my $existing = $kw_select_sth->fetchrow_arrayref;
        $kw_select_sth->finish;

        if ($existing) {
            $kw_update_sth->execute($word, $url);
        }
        else {
            $kw_insert_sth->execute($word, $url, $url_domain, 1);
        }
    }

    $analyzed_sth->execute($url);
}

$dbh->disconnect;