File:  [Coherent Logic Development] / pandia / indexer
Revision 1.5: download - view: text, annotated - select for diffs
Sat Jun 28 05:40:11 2025 UTC (3 weeks, 1 day ago) by snw
Branches: MAIN
CVS tags: HEAD
Exclude non-textual MIME types

#!/usr/bin/env perl

# 
# $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: indexer,v $
# Revision 1.5  2025/06/28 05:40:11  snw
# Exclude non-textual MIME types
#
# Revision 1.4  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.3  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.2  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.1  2025/06/25 19:38:48  snw
# Add indexer
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use Lingua::Stem;
use DBI;
use Data::Dumper;
use Try::Tiny;
use Fcntl qw(:flock);

# Startup: print the banner, take the single-instance lock, parse the
# database options from the command line and open the MySQL connection.
# On success $dbh holds the connected handle used by the rest of the script.

my $dbh = "";
my $dsn = "";

# Connection options, filled in by GetOptions below.
my ($dbhost, $dbname, $dbusername, $dbpw);

# Unbuffered STDOUT so the "...[OK]" progress output appears as it happens.
$| = 1;
print "pandia indexer v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

# Single-instance guard: take a non-blocking exclusive lock and die at once
# if it cannot be obtained.  $file must stay open (and therefore in scope)
# for the lifetime of the script, otherwise the lock is released.
open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw)
    or die("error in command line arguments");

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

# When connect() fails there is no handle to ask for the error, so the
# message comes from $DBI::errstr.  (A method call such as DBI->errstr()
# is not interpolated inside a double-quoted string.)
die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

# Load the work list: every crawl_queue row whose analyzed column is 0.
# $sth is left executed so the indexing loop can fetch rows from it.
print "pandia:  loading queue...";

my $queue_sql = "SELECT * FROM crawl_queue WHERE analyzed=0";
my $sth       = $dbh->prepare($queue_sql);

unless ($sth->execute()) {
    die "pandia:  error retrieving crawl queue\n";
}

# NOTE(review): the length is read from rows() before any row is fetched;
# DBI only guarantees rows() for non-SELECT statements, so confirm the
# driver in use reports it for a SELECT.
my $qlen = $sth->rows;
print "[OK (queue length " . $qlen . ")]\n";

# Main indexing pass over the rows fetched from $sth.
#
# For each queued URL:
#   1. HEAD the URL and read its Content-Type; anything other than
#      text/plain or text/html is removed from crawl_queue and skipped.
#   2. GET the URL; an unsuccessful response also removes it from
#      crawl_queue and skips it.
#   3. Parse the body, store the ASCII-only title and text in url_fulltext.
#   4. Stem the words of the text, replace the keywords rows for the URL
#      with one row per distinct stem, and set analyzed=1 in crawl_queue.

my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);

# The stemmer and the statement handles do not depend on the URL, so they
# are built once here instead of once per queue entry.
my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

my $sth_dequeue  = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
my $sth_fulltext = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
my $sth_kw_clear = $dbh->prepare("DELETE FROM keywords WHERE url=?");
my $sth_kw_add   = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
my $sth_analyzed = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");

URL: while (my $hashref = $sth->fetchrow_hashref()) {
    my $url        = $hashref->{url};
    my $url_domain = $hashref->{url_domain};

    # Per-URL state.  $fulltext starts empty on every iteration so that a
    # failed fetch can never index the text of the previously processed page.
    my $tree;
    my $fulltext  = "";
    my $del_queue = 0;

    print "pandia:  retrieving $url...\n";
    try {
        my $head         = $http->head($url);
        my $content_type = $head->{headers}{'content-type'};

        # HTTP::Tiny hands back an array reference when a header is repeated.
        $content_type = $content_type->[0] if ref $content_type eq 'ARRAY';
        $content_type = "" unless defined $content_type;

        # Accept both media types with or without parameters such as
        # "; charset=utf-8".
        if ($content_type !~ m{\A\s*text/(?:plain|html)\s*(?:;|\z)}i) {
            print "pandia:  content type $content_type not indexable; skipping $url\n";
            $del_queue = 1;
            return;    # leaves the try block only; the loop is advanced below
        }

        my $response = $http->get($url);

        if (not $response->{success}) {
            print "pandia:  http failure; skipping $url\n";
            $del_queue = 1;
            return;
        }

        $tree = HTML::TreeBuilder->new();
        $tree->parse($response->{content});
        $tree->eof();    # tell the parser the document is complete

        # Pages without a <title> element get an empty title instead of
        # dying on a method call on undef.
        my $title_elem = $tree->look_down('_tag', 'title');
        my $title      = $title_elem ? $title_elem->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;

        print "pandia:  processing $url [$title]\n";

        my $body = $tree->as_text;
        $body =~ s/[^\x00-\x7F]//g;

        $sth_fulltext->execute($url, $title, $body);
        $fulltext = $body;
    } catch {
        warn "pandia:  caught failure $_\n";
    };

    # Release the parse tree explicitly; this also runs after a caught failure.
    $tree->delete if $tree;

    # Loop control happens here rather than inside the try block, because
    # the try block is an anonymous sub and not a loop body.
    if ($del_queue) {
        $sth_dequeue->execute($url);
        next URL;
    }

    # After a caught failure $fulltext is still empty: the URL's keywords
    # are cleared and the entry is marked analyzed, as before, but no words
    # from another page are attached to it.
    my @words = split(' ', $fulltext);
    $stemmer->stem_in_place(@words);

    # Count occurrences in memory and write one row per distinct word,
    # instead of a SELECT plus INSERT/UPDATE per occurrence.
    # NOTE(review): assumes the stemmer returns consistently-cased words, so
    # exact-string counting matches the old word=? lookups -- confirm.
    my %count_of;
    foreach my $word (@words) {
        next unless defined $word;
        $word =~ s/[^\x00-\x7F]//g;
        next if $word eq '';
        $count_of{$word}++;
    }

    $sth_kw_clear->execute($url);

    foreach my $word (sort keys %count_of) {
        $sth_kw_add->execute($word, $url, $url_domain, $count_of{$word});
    }

    $sth_analyzed->execute($url);
}

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>