Annotation of pandia/indexer, revision 1.4
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.4 ! snw 4: # $Id: indexer,v 1.3 2025/06/27 16:20:30 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 snw 11: # $Log: indexer,v $
1.4 ! snw 12: # Revision 1.3 2025/06/27 16:20:30 snw
! 13: # Add blacklist
! 14: #
1.3 snw 15: # Revision 1.2 2025/06/27 02:14:47 snw
16: # Initial operational capability
17: #
1.2 snw 18: # Revision 1.1 2025/06/25 19:38:48 snw
19: # Add indexer
20: #
1.1 snw 21: #
22:
23: use Getopt::Long;
24: use HTTP::Tiny;
25: use HTML::TreeBuilder;
26: use URI;
1.2 snw 27: use Lingua::Stem;
1.1 snw 28: use DBI;
1.2 snw 29: use Data::Dumper;
30: use Try::Tiny;
1.3 snw 31: use Fcntl qw(:flock);
1.1 snw 32:
33: my $dbh = "";
34: my $dsn = "";
35:
1.2 snw 36: $| = 1;
1.1 snw 37: print "pandia indexer v0.0.1\n";
38: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
39:
1.4 ! snw 40: open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
1.3 snw 41: flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
42:
1.1 snw 43: GetOptions("dbhost=s" => \$dbhost,
44: "dbname=s" => \$dbname,
45: "dbusername=s" => \$dbusername,
46: "dbpw=s" => \$dbpw)
47: or die("error in command line arguments");
48:
49: print "pandia: connecting to $dbname database at $dbhost...";
50:
51: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
1.2 snw 52: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
1.1 snw 53: die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
54:
55: print "[OK]\n";
1.2 snw 56:
57: print "pandia: loading queue...";
58:
59: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
60: $sth->execute() or die "pandia: error retrieving crawl queue\n";
61:
62: my $qlen = $sth->rows;
63: print "[OK (queue length $qlen)]\n";
64:
65: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
66:
67: while (my $hashref = $sth->fetchrow_hashref()) {
68: my $tree = HTML::TreeBuilder->new();
69: my $url = $hashref->{url};
70: my $url_domain = $hashref->{url_domain};
71:
72: my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
73: $stemmer->stem_caching({ -level => 2 });
74:
75: print "pandia: retrieving $url...\n";
76: try {
77: my $del_queue = 0;
78: my $response = $http->get($hashref->{url});
79:
80: if(not $response->{success}) {
81: print "pandia: http failure; skipping $url\n";
82: $del_queue = 1;
83: }
84:
85: #if(exists $response->{redirects}) {
86: # print "pandia: redirects detected; skipping $url\n";
87: # $del_queue = 1;
88: #}
89:
90: if($del_queue == 1) {
91: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
92: $sth->execute($url);
93: next;
94: }
95:
96: my $title = "";
97:
98: my $pagedata = $response->{content};
99: if($response) {
100: $tree->parse($pagedata);
101: $title = $tree->look_down('_tag', 'title')->as_text;
102:
103: print "pandia: processing $url [$title]\n";
104:
105: $fulltext = $tree->as_text;
106: $fulltext =~ s/[^\x00-\x7F]//g;
107:
108: my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
109: $sth->execute($url, $title, $fulltext);
110:
111: }
112: } catch {
113: warn "pandia: caught failure $_\n";
114: };
115:
116: my @words = split(' ', $fulltext);
117: $stemmer->stem_in_place(@words);
118:
119: my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
120: $sthd->execute($url);
121:
122: my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
1.3 snw 123: my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
1.2 snw 124: my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
125: foreach my $word (@words) {
126: $word =~ s/[^\x00-\x7F]//g;
1.3 snw 127: $sths->execute($word, $url);
1.2 snw 128:
129: if($sths->rows > 0) {
130: $sthu->execute($word, $url);
131: }
132: else {
133: $sth->execute($word, $url, $url_domain, 1);
134: }
135: }
136:
137: my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
138: $sthuc->execute($url);
139: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>