File:  [Coherent Logic Development] / pandia / indexer
Revision 1.5: download - view: text, annotated - select for diffs
Sat Jun 28 05:40:11 2025 UTC (5 weeks, 1 day ago) by snw
Branches: MAIN
CVS tags: HEAD
Exclude non-textual MIME types

    1: #!/usr/bin/env perl
    2: 
    3: # 
    4: # $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $
    5: #  Copyright (C) 2025 Coherent Logic Development LLC
    6: #
    7: # Author: Serena Willis <snw@coherent-logic.com>
    8: #
    9: # Licensed AGPL-3.0
   10: #
   11: # $Log: indexer,v $
   12: # Revision 1.5  2025/06/28 05:40:11  snw
   13: # Exclude non-textual MIME types
   14: #
   15: # Revision 1.4  2025/06/28 00:33:32  snw
   16: # Update locking
   17: #
   18: # Revision 1.3  2025/06/27 16:20:30  snw
   19: # Add blacklist
   20: #
   21: # Revision 1.2  2025/06/27 02:14:47  snw
   22: # Initial operational capability
   23: #
   24: # Revision 1.1  2025/06/25 19:38:48  snw
   25: # Add indexer
   26: #
   27: #
   28: 
   29: use Getopt::Long;
   30: use HTTP::Tiny;
   31: use HTML::TreeBuilder;
   32: use URI;
   33: use Lingua::Stem;
   34: use DBI;
   35: use Data::Dumper;
   36: use Try::Tiny;
   37: use Fcntl qw(:flock);
   38: 
   39: my $dbh = "";
   40: my $dsn = "";
   41: 
   42: $| = 1;
   43: print "pandia indexer v0.0.1\n";
   44: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   45: 
   46: open my $file, ">", "/tmp/pandia_indexer.lock" or die $!; 
   47: flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   48: 
   49: GetOptions("dbhost=s" => \$dbhost,
   50:            "dbname=s" => \$dbname,
   51:            "dbusername=s" => \$dbusername,
   52:            "dbpw=s" => \$dbpw)
   53:     or die("error in command line arguments");
   54: 
   55: print "pandia:  connecting to $dbname database at $dbhost...";
   56: 
   57: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
   58: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
   59: die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
   60: 
   61: print "[OK]\n";
   62: 
   63: print "pandia:  loading queue...";
   64: 
   65: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
   66: $sth->execute() or die "pandia:  error retrieving crawl queue\n";
   67: 
   68: my $qlen = $sth->rows;
   69: print "[OK (queue length $qlen)]\n";
   70: 
   71: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
   72: 
   73: while (my $hashref = $sth->fetchrow_hashref()) {    
   74:     my $tree = HTML::TreeBuilder->new();
   75:     my $url = $hashref->{url};
   76:     my $url_domain = $hashref->{url_domain};
   77: 
   78:     my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
   79:     $stemmer->stem_caching({ -level => 2 });
   80: 
   81:     print "pandia:  retrieving $url...\n";
   82:     try {
   83: 	my $del_queue = 0;
   84: 	my $head = $http->head($hashref->{url});
   85: 	my $headers = $head->{headers};
   86: 	my $content_type = $headers->{'content-type'};
   87: 
   88: 	if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
   89: 	    print "pandia:  content type $content_type not indexable; skipping $url\n";
   90: 	    my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
   91: 	    $sth->execute($url);
   92: 	    next;
   93: 	}
   94: 	
   95: 	my $response = $http->get($hashref->{url});
   96: 
   97: 	if(not $response->{success}) {
   98: 	    print "pandia:  http failure; skipping $url\n";
   99: 	    $del_queue = 1;
  100: 	}
  101: 
  102: 	if($del_queue == 1) {
  103: 	    my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
  104: 	    $sth->execute($url);
  105: 	    next;
  106: 	}
  107: 	
  108: 	my $title = "";
  109: 	
  110: 	my $pagedata = $response->{content};    
  111: 	if($response) {
  112: 	    $tree->parse($pagedata);   
  113: 	    $title = $tree->look_down('_tag', 'title')->as_text;
  114: 	    $title =~ s/[^\x00-\x7F]//g;
  115: 	    
  116: 	    print "pandia:  processing $url [$title]\n";
  117: 
  118: 	    $fulltext = $tree->as_text;
  119: 	    $fulltext =~ s/[^\x00-\x7F]//g;
  120: 	    
  121: 	    my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
  122: 	    $sth->execute($url, $title, $fulltext);
  123: 	    
  124: 	}
  125:     } catch {
  126: 	warn "pandia:  caught failure $_\n";
  127:     };
  128: 
  129:     my @words = split(' ', $fulltext);    
  130:     $stemmer->stem_in_place(@words);
  131: 
  132:     my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
  133:     $sthd->execute($url);
  134:     
  135:     my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
  136:     my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
  137:     my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
  138:     foreach my $word (@words) {
  139: 	$word =~ s/[^\x00-\x7F]//g;
  140: 	$sths->execute($word, $url);
  141: 
  142: 	if($sths->rows > 0) {
  143: 	    $sthu->execute($word, $url);
  144: 	}
  145: 	else {
  146: 	    $sth->execute($word, $url, $url_domain, 1);
  147: 	}
  148:     }
  149: 
  150:     my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
  151:     $sthuc->execute($url);
  152: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>