Annotation of pandia/indexer, revision 1.2
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.2 ! snw 4: # $Id: indexer,v 1.1 2025/06/25 19:38:48 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 ! snw 11: # $Log: indexer,v $
! 12: # Revision 1.1 2025/06/25 19:38:48 snw
! 13: # Add indexer
! 14: #
1.1 snw 15: #
16:
17: use Getopt::Long;
18: use HTTP::Tiny;
19: use HTML::TreeBuilder;
20: use URI;
1.2 ! snw 21: use Lingua::Stem;
1.1 snw 22: use DBI;
1.2 ! snw 23: use Data::Dumper;
! 24: use Try::Tiny;
1.1 snw 25:
# ---------------------------------------------------------------------------
# Startup: parse options, connect to MySQL, load the crawl queue and build
# the HTTP client.  $dbh, $sth and $http are used by the indexing loop below.
# ---------------------------------------------------------------------------

my $dbh = "";
my $dsn = "";

# Command-line options.  Declared lexically so they are not package globals.
# $dbusername/$dbpw stay undef when not given and are passed to DBI as-is.
my $dbhost = "";
my $dbname = "";
my ($dbusername, $dbpw);

# Unbuffered STDOUT so the "...connecting" / "[OK]" progress output appears
# as it happens rather than when the buffer fills.
$| = 1;
print "pandia indexer v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s"     => \$dbhost,
           "dbname=s"     => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s"       => \$dbpw)
    or die("error in command line arguments");

# Without a database name every later query fails; fail early and clearly.
die "pandia: --dbname is required\n" if $dbname eq "";

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

# A method call does not interpolate inside a string; $DBI::errstr holds the
# error text of the failed connect.
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

print "pandia: loading queue...";

# RaiseError is off, so both prepare and execute must be checked by hand.
my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0")
    or die "pandia: error retrieving crawl queue\n";
$sth->execute() or die "pandia: error retrieving crawl queue\n";

# NOTE(review): DBI only guarantees rows() for non-SELECT statements; the
# value shown here depends on the driver buffering the whole result set.
my $qlen = $sth->rows;
print "[OK (queue length $qlen)]\n";

my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
! 56:
# ---------------------------------------------------------------------------
# Main indexing loop.
#
# For every row of the crawl queue: fetch the page, store its title and
# ASCII-only text in url_fulltext, rebuild the per-URL keyword counts in
# keywords, and flag the queue entry as analyzed.  URLs whose HTTP request
# fails are removed from the queue instead.
# ---------------------------------------------------------------------------

# Statements and the stemmer are identical for every page; build them once.
# RaiseError is off on $dbh, so a failed prepare has to be checked by hand.
my $sth_dequeue = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?")
    or die "pandia: failed to prepare statement: " . $dbh->errstr . "\n";
my $sth_store_page = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)")
    or die "pandia: failed to prepare statement: " . $dbh->errstr . "\n";
my $sth_clear_words = $dbh->prepare("DELETE FROM keywords WHERE url=?")
    or die "pandia: failed to prepare statement: " . $dbh->errstr . "\n";
my $sth_add_word = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)")
    or die "pandia: failed to prepare statement: " . $dbh->errstr . "\n";
my $sth_mark_done = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?")
    or die "pandia: failed to prepare statement: " . $dbh->errstr . "\n";

my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

while (my $row = $sth->fetchrow_hashref()) {
    my $url        = $row->{url};
    my $url_domain = $row->{url_domain};

    # Per-page state.  $fulltext must be lexical and reset on every pass so a
    # failure on this page can never index the previous page's text.
    my $fulltext    = "";
    my $http_failed = 0;
    my $tree;

    print "pandia: retrieving $url...\n";
    try {
        my $response = $http->get($url);

        if (not $response->{success}) {
            # A try block is an anonymous sub: loop control (next/last) must
            # not be used here.  Record the outcome and leave only the block.
            $http_failed = 1;
            return;
        }

        $tree = HTML::TreeBuilder->new();
        $tree->parse($response->{content});
        $tree->eof();    # tell the parser the document is complete

        # Pages without a <title> element are indexed with an empty title.
        my $title_node = $tree->look_down('_tag', 'title');
        my $title      = $title_node ? $title_node->as_text : "";

        print "pandia: processing $url [$title]\n";

        # Keep only 7-bit ASCII in the stored body.
        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $sth_store_page->execute($url, $title, $fulltext);
    }
    catch {
        warn "pandia: caught failure $_\n";
    }
    finally {
        # Release the parse tree explicitly once the page has been handled.
        $tree->delete() if $tree;
    };

    if ($http_failed) {
        print "pandia: http failure; skipping $url\n";
        $sth_dequeue->execute($url);
        next;
    }

    # Stem the words and count occurrences in memory.  Counting here, rather
    # than probing the keywords table per word, keeps the counts scoped to
    # this URL even when another URL already has the same word.
    my @words = split(' ', $fulltext);
    $stemmer->stem_in_place(@words);

    my %count_of;
    foreach my $word (@words) {
        $word =~ s/[^\x00-\x7F]//g;
        next if $word eq "";
        $count_of{$word}++;
    }

    # Replace any keywords previously recorded for this URL.
    $sth_clear_words->execute($url);
    foreach my $word (sort keys %count_of) {
        $sth_add_word->execute($word, $url, $url_domain, $count_of{$word});
    }

    # After a caught failure $fulltext is empty, so the URL ends up with no
    # keywords but is still flagged as analyzed.
    $sth_mark_done->execute($url);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>