1: #!/usr/bin/env perl
2:
3: #
4: # $Id: indexer,v 1.5 2025/06/28 05:40:11 snw Exp $
5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
11: # $Log: indexer,v $
12: # Revision 1.5 2025/06/28 05:40:11 snw
13: # Exclude non-textual MIME types
14: #
15: # Revision 1.4 2025/06/28 00:33:32 snw
16: # Update locking
17: #
18: # Revision 1.3 2025/06/27 16:20:30 snw
19: # Add blacklist
20: #
21: # Revision 1.2 2025/06/27 02:14:47 snw
22: # Initial operational capability
23: #
24: # Revision 1.1 2025/06/25 19:38:48 snw
25: # Add indexer
26: #
27: #
28:
29: use Getopt::Long;
30: use HTTP::Tiny;
31: use HTML::TreeBuilder;
32: use URI;
33: use Lingua::Stem;
34: use DBI;
35: use Data::Dumper;
36: use Try::Tiny;
37: use Fcntl qw(:flock);
38:
39: my $dbh = "";
40: my $dsn = "";
41:
42: $| = 1;
43: print "pandia indexer v0.0.1\n";
44: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
45:
46: open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
47: flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
48:
49: GetOptions("dbhost=s" => \$dbhost,
50: "dbname=s" => \$dbname,
51: "dbusername=s" => \$dbusername,
52: "dbpw=s" => \$dbpw)
53: or die("error in command line arguments");
54:
55: print "pandia: connecting to $dbname database at $dbhost...";
56:
57: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
58: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
59: die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
60:
61: print "[OK]\n";
62:
63: print "pandia: loading queue...";
64:
65: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
66: $sth->execute() or die "pandia: error retrieving crawl queue\n";
67:
68: my $qlen = $sth->rows;
69: print "[OK (queue length $qlen)]\n";
70:
71: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
72:
73: while (my $hashref = $sth->fetchrow_hashref()) {
74: my $tree = HTML::TreeBuilder->new();
75: my $url = $hashref->{url};
76: my $url_domain = $hashref->{url_domain};
77:
78: my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
79: $stemmer->stem_caching({ -level => 2 });
80:
81: print "pandia: retrieving $url...\n";
82: try {
83: my $del_queue = 0;
84: my $head = $http->head($hashref->{url});
85: my $headers = $head->{headers};
86: my $content_type = $headers->{'content-type'};
87:
88: if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
89: print "pandia: content type $content_type not indexable; skipping $url\n";
90: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
91: $sth->execute($url);
92: next;
93: }
94:
95: my $response = $http->get($hashref->{url});
96:
97: if(not $response->{success}) {
98: print "pandia: http failure; skipping $url\n";
99: $del_queue = 1;
100: }
101:
102: if($del_queue == 1) {
103: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
104: $sth->execute($url);
105: next;
106: }
107:
108: my $title = "";
109:
110: my $pagedata = $response->{content};
111: if($response) {
112: $tree->parse($pagedata);
113: $title = $tree->look_down('_tag', 'title')->as_text;
114: $title =~ s/[^\x00-\x7F]//g;
115:
116: print "pandia: processing $url [$title]\n";
117:
118: $fulltext = $tree->as_text;
119: $fulltext =~ s/[^\x00-\x7F]//g;
120:
121: my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
122: $sth->execute($url, $title, $fulltext);
123:
124: }
125: } catch {
126: warn "pandia: caught failure $_\n";
127: };
128:
129: my @words = split(' ', $fulltext);
130: $stemmer->stem_in_place(@words);
131:
132: my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
133: $sthd->execute($url);
134:
135: my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
136: my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
137: my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
138: foreach my $word (@words) {
139: $word =~ s/[^\x00-\x7F]//g;
140: $sths->execute($word, $url);
141:
142: if($sths->rows > 0) {
143: $sthu->execute($word, $url);
144: }
145: else {
146: $sth->execute($word, $url, $url_domain, 1);
147: }
148: }
149:
150: my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
151: $sthuc->execute($url);
152: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>