#!/usr/bin/env perl

#
# $Id: Pandia.pm,v 1.4 2025/07/01 19:20:47 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: Pandia.pm,v $
# Revision 1.4 2025/07/01 19:20:47 snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.3 2025/07/01 06:48:03 snw
# Updates
#
# Revision 1.2 2025/06/30 02:18:44 snw
# Updates
#
# Revision 1.1 2025/06/28 23:54:11 snw
# Add new OO module
#
#

package Pandia;

use strict;
use warnings;

use threads;
use threads::shared;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
use Thread::Pool;
use HTTP::Date;
use POSIX qw(strftime);

# number of URLs handed to worker threads that have not yet completed;
# shared across threads, so updates must be performed under lock()
my $indices_waiting : shared;

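# Fetch one URL, extract its title, text, and HTML, and record them in the
# url_fulltext table. Runs inside a worker thread, so it opens its own
# database connection from the DSN and credentials it is passed. When
# $reindex is 1, the page is re-indexed only if its Last-Modified header is
# newer than the stored last_indexed_dt.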
sub do_index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    if (index($url, '#') != -1) {
        print "pandia: URL contains a fragment; skipping\n";
        # jump to the cleanup label rather than returning, so the shared
        # in-process counter is still decremented for this job
        goto nodb_cleanup;
    }

    print "pandia: thread connecting to MySQL database...";

    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if (not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;

    my $head;
    print "pandia: HEAD $url\n";
    $head = $http->head($url);

    if (not $head->{success}) {
        print "pandia: HEAD fail $url\n";

        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sthh->execute($url);
        $sthh->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    else {
        print "pandia: HEAD OK $url\n";
    }

  proc_head:
    my $headers = $head->{headers};
    my $content_type = $headers->{'content-type'} // '';
    my $last_modified;
    my $last_modified_sys;

    if ($reindex == 1) {
        print "pandia: REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if ($last_modified_sys) {
            print "pandia: GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia: GOT_LAST_INDEX_DT $url\n";

            if ($sth->rows < 1) {
                print "pandia: page not indexed\n";
                $sth->finish();
                $dbh->disconnect();
                goto nodb_cleanup;
            }

            my $hashref = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});
            $sth->finish();

            if ($last_modified_sys > $last_indexed) {
                print "pandia: $url has been modified since the last time it was indexed\n";
                my $sthd = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sthd->execute($url);
                $sthd->finish();
                print "pandia: INDEXDELETE $url\n";
            }
            else {
                print "pandia: $url is still up-to-date in the index\n";
                goto cleanup;
            }
        }
        else {
            print "pandia: no modify info; skipping $url\n";
            $dbh->disconnect();
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia: INDEX $url\n";
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title = "";
    my $fulltext = "";
    my $fullhtml = "";

    if (substr($content_type, 0, 10) ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia: content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $response = $http->get($url);

    if (not $response->{success}) {
        print "pandia: http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $pagedata = $response->{content};
    if ($response) {
        $tree->parse($pagedata);
        $tree->eof();

        # pages without a <title> element would otherwise crash the thread
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = $title_elem ? $title_elem->as_text : '';
        $title =~ s/[^\x00-\x7F]//g;

        print "pandia: processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if ($sth->rows > 0) {
            print "pandia: we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }

        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
        $tries = 0;
        while (1) {
            print "pandia: INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if ($DBI::err) {
                if ($tries > 5) {
                    print "pandia: giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia: error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }

    print "pandia: $url has been processed\n";

  cleanup:
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while (1) {
        $sthuc->execute($url);
        if ($DBI::err) {
            $tries = $tries + 1;
            if ($tries > 2) {
                print "pandia: giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia: DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    {
        # the decrement must be atomic across worker threads
        lock($indices_waiting);
        $indices_waiting = $indices_waiting - 1;
    }
}

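# Add a domain to the blacklist, then purge any of its URLs from both the
# crawl queue and the fulltext index.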
sub blacklist_add {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    print "pandia: blacklisting domain $domain...";
    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
    $sth->execute($domain);
    print "[OK]\n";

    print "pandia: removing blacklisted items from crawl queue...";
    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    print "pandia: removing blacklisted items from index...";
    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    $sth->finish();
    $dbh->disconnect();
}

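# Remove a domain from the blacklist.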
sub blacklist_remove {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
    print "[OK]\n";

    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
    $sth->execute($domain);

    $sth->finish();
    $dbh->disconnect();
}

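# Index every unanalyzed URL in the crawl queue sequentially, without using
# the thread pool.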
sub index_serial {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
    $sth->execute();

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $sth->finish();
    $dbh->disconnect();
}

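# Index a single URL from the crawl queue.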
sub index_one {
    my ($self, $url) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
    $sth->execute($url);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index($url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $sth->finish();
    $dbh->disconnect();
}

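# Sequentially index every queued URL belonging to a single domain.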
sub index_domain {
    my ($self, $domain) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
    $sth->execute($domain);

    while (my $hashref = $sth->fetchrow_hashref()) {
        do_index($hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $sth->finish();
    $dbh->disconnect();
}

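# Dispatch up to index_workers unanalyzed URLs to the indexer thread pool,
# then poll the shared counter until the batch drains or roughly a minute
# has elapsed.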
sub run_index_batch {
    my ($self) = @_;

    # open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
    # flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia: creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if ($indices_waiting == 0) {
        print "pandia: nothing to index\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia: sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia: $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while ($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if ($time_diff > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

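# Like run_index_batch, but selects a random batch of already-analyzed URLs
# and passes reindex=1 so workers only re-index pages whose Last-Modified
# header is newer than the stored index timestamp.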
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});

    $indices_waiting = $sth->rows;

    if ($indices_waiting == 0) {
        print "pandia: nothing to reindex\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia: sending $hashref->{url} to worker thread\n";
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia: $indices_waiting total pages to be processed\n";

  done:
    $sth->finish();
    $dbh->disconnect();

    my $start_time = time();
    while ($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if ($time_diff > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}

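# Construct a Pandia instance from the named profile in /etc/pandia.ini and
# start the indexer thread pool.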
sub new {
    my ($class, $args) = @_;

    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
    die "pandia: could not read /etc/pandia.ini" unless $cfg;

    my $thost = $cfg->val($args->{profile}, 'dbhost');
    my $tname = $cfg->val($args->{profile}, 'dbname');
    my $tuser = $cfg->val($args->{profile}, 'dbuser');
    my $tpass = $cfg->val($args->{profile}, 'dbpass');
    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');

    $indices_waiting = $tindex_workers;

    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";

    my $self = bless {
        profile => $args->{profile},
        dbhost => $thost,
        dbname => $tname,
        dbuser => $tuser,
        dbpass => $tpass,
        dsn => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do => \&do_index
            }
        )
    }, $class;

    return $self;
}

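# A minimal usage sketch (illustrative, not part of the module): it assumes
# /etc/pandia.ini defines a profile section named 'default' supplying the
# dbhost, dbname, dbuser, dbpass, index_workers, and crawl_workers keys.
#
#   use Pandia;
#
#   my $pandia = Pandia->new({profile => 'default'});
#   $pandia->run_index_batch();      # index a batch of queued URLs
#   $pandia->run_reindex_batch();    # re-check pages already in the index
#   $pandia->blacklist_add('spam.example.com');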
1;