Annotation of pandia/Pandia.pm, revision 1.4
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.4 ! snw 4: # $Id: Pandia.pm,v 1.3 2025/07/01 06:48:03 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 snw 11: # $Log: Pandia.pm,v $
1.4 ! snw 12: # Revision 1.3 2025/07/01 06:48:03 snw
! 13: # Updates
! 14: #
1.3 snw 15: # Revision 1.2 2025/06/30 02:18:44 snw
16: # Updates
17: #
1.2 snw 18: # Revision 1.1 2025/06/28 23:54:11 snw
19: # Add new OO module
20: #
1.1 snw 21: #
22:
23: package Pandia;
24:
25: use strict;
1.2 snw 26: use warnings;
1.1 snw 27:
28: use HTTP::Tiny;
29: use HTML::TreeBuilder;
30: use URI;
31: use DBI;
32: use WWW::RobotRules;
33: use Fcntl qw(:flock);
34: use LWP::Simple qw(get);
35: use Config::IniFiles;
36: use Thread::Pool;
1.2 snw 37: use HTTP::Date;
38: use POSIX qw(strftime);
1.1 snw 39:
40: my $indices_waiting : shared;
41:
# do_index - fetch one URL and store its full text in the search index.
#
# Positional parameters (as dispatched by Thread::Pool->job):
#   $url     - fully-qualified URL to index
#   $domain  - domain the URL belongs to (stored alongside the full text)
#   $dsn     - DBI data source name for the MySQL database
#   $dbuser  - database username
#   $dbpass  - database password
#   $reindex - 1 to re-index only if the page changed since last index;
#              0 for first-time indexing
#
# Every exit path must reach nodb_cleanup so the shared $indices_waiting
# counter is decremented exactly once; otherwise run_index_batch() /
# run_reindex_batch() spin until their 60-second timeout.
sub do_index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    if (index($url, '#') != -1) {
        print "pandia: URL contains a fragment; skipping\n";
        # FIX: a plain `return` here skipped the $indices_waiting decrement,
        # stalling the batch driver until its timeout.
        goto nodb_cleanup;
    }

    print "pandia: thread connecting to MySQL database...";
    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if (not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;

    # Probe with HEAD first so we can consult content-type / last-modified
    # without downloading the body.
    print "pandia: HEAD $url\n";
    my $head = $http->head($url);

    if (not $head->{success}) {
        print "pandia: HEAD fail $url\n";

        # Unreachable URL: drop it from the crawl queue entirely.
        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sthh->execute($url);
        $sthh->finish();
        $dbh->disconnect();    # FIX: this path used to leak the DB handle
        goto nodb_cleanup;
    }
    else {
        print "pandia: HEAD OK $url\n";
    }

    my $headers      = $head->{headers};
    my $content_type = $headers->{'content-type'};
    my $last_modified;
    my $last_modified_sys;

    # FIX: guard against a missing Content-Type header (avoids undef
    # warnings in the ne/substr checks below; the page is still skipped).
    $content_type = '' unless defined $content_type;

    if ($reindex == 1) {
        print "pandia: REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if ($last_modified_sys) {
            print "pandia: GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia: GOT_LAST_INDEX_DT $url\n";

            if ($sth->rows < 1) {
                print "pandia: page not indexed\n";
                $sth->finish();
                $dbh->disconnect();    # FIX: handle leak on this path
                goto nodb_cleanup;
            }

            my $hashref      = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});

            if ($last_modified_sys > $last_indexed) {
                # Stale index entry: delete it so the fresh copy is inserted below.
                print "pandia: $url has been modified since the last time it was indexed\n";
                my $sthd = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sthd->execute($url);
                print "pandia: INDEXDELETE $url\n";
            }
            else {
                print "pandia: $url is still up-to-date in the index\n";
                goto cleanup;
            }
        }
        else {
            print "pandia: no modify info; skipping $url\n";
            $dbh->disconnect();    # FIX: handle leak on this path
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia: INDEX $url\n";
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title    = "";
    my $fulltext = "";
    my $fullhtml = "";

    # Only plain text and HTML are indexable.
    if ($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia: content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $response = $http->get($url);

    if (not $response->{success}) {
        print "pandia: http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $pagedata = $response->{content};
    if ($response) {
        $tree->parse($pagedata);

        # FIX: pages with no <title> element made look_down() return undef
        # and crashed on ->as_text; fall back to an empty title instead.
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = $title_elem ? $title_elem->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;    # strip non-ASCII before storing

        print "pandia: processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if ($sth->rows > 0) {
            print "pandia: we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }

        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");

        # Retry the insert a few times (deadlocks are expected under
        # concurrent workers).
        # FIX: was `my $tries = 0`, masking the declaration above.
        $tries = 0;
        while (1) {
            print "pandia: INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if ($DBI::err) {
                if ($tries > 5) {
                    print "pandia: giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia: error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }

    print "pandia: $url has been processed\n";

  cleanup:
    # Mark the URL analyzed so the batch drivers don't pick it up again.
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while (1) {
        $sthuc->execute($url);
        if ($DBI::err) {
            $tries = $tries + 1;
            if ($tries > 2) {
                print "pandia: giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia: DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    # Tell the batch driver this job is done (shared across worker threads).
    $indices_waiting = $indices_waiting - 1;
}
221:
# blacklist_add - blacklist a domain and purge it from the queue and index.
#
# Parameters:
#   $domain - domain name to blacklist (e.g. "example.com")
#
# Side effects: inserts into `blacklist`, then deletes every crawl_queue
# and url_fulltext row whose url_domain matches any blacklisted domain.
sub blacklist_add {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    # FIX: the method call was inside the string literal, so the message
    # printed "DBI->errstr()" verbatim instead of the actual error text.
    die "pandia: failed to connect to MySQL database: " . DBI->errstr unless $dbh;
    print "[OK]\n";

    print "pandia: blacklisting domain $domain...";
    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
    $sth->execute($domain);
    print "[OK]\n";

    print "pandia: removing blacklisted items from crawl queue...";
    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    print "pandia: removing blacklisted items from index...";
    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    $sth->finish();
    $dbh->disconnect();
}
248:
# blacklist_remove - delete a domain from the blacklist table.
#
# Parameters:
#   $domain - domain name to un-blacklist
#
# Note: does not re-queue previously purged URLs; it only allows the
# domain to be crawled again in the future.
sub blacklist_remove {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    # FIX: the method call was inside the string literal, so the message
    # printed "DBI->errstr()" verbatim instead of the actual error text.
    die "pandia: failed to connect to MySQL database: " . DBI->errstr unless $dbh;
    print "[OK]\n";

    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
    $sth->execute($domain);

    $sth->finish();
    $dbh->disconnect();
}
263:
# index_serial - index every un-analyzed URL in the crawl queue, one at a
# time in the calling thread (no worker pool involved).
sub index_serial {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $pending = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
    $pending->execute();

    while (my $row = $pending->fetchrow_hashref()) {
        do_index($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $pending->finish();
    $dbh->disconnect();
}
1.1 snw 279:
# index_one - index a single URL, provided it is present in the crawl queue.
# Does nothing if the URL has never been queued.
sub index_one {
    my ($self, $url) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $lookup = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
    $lookup->execute($url);

    while (my $row = $lookup->fetchrow_hashref()) {
        do_index($url, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $lookup->finish();
    $dbh->disconnect();
}
295:
# index_domain - serially index every queued URL belonging to one domain.
sub index_domain {
    my ($self, $domain) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $by_domain = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
    $by_domain->execute($domain);

    while (my $row = $by_domain->fetchrow_hashref()) {
        do_index($row->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $by_domain->finish();
    $dbh->disconnect();
}
312:
# run_index_batch - dispatch up to index_workers un-analyzed URLs to the
# worker pool, then wait (up to ~60 s) for the shared $indices_waiting
# counter to drain before shutting the pool down.
sub run_index_batch {
    my ($self) = @_;

    # open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
    # flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia: creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $batch = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $batch->execute($self->{index_workers});

    # Workers decrement this shared counter as they finish each URL.
    $indices_waiting = $batch->rows;

    if ($indices_waiting == 0) {
        print "pandia: nothing to index\n";
    }
    else {
        while (my $row = $batch->fetchrow_hashref()) {
            print "pandia: sending $row->{url} to worker thread\n";
            $self->{index_pool}->job($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
        }

        print "pandia: $indices_waiting total pages to be processed\n";
    }

    $batch->finish();
    $dbh->disconnect();

    # Poll until the workers drain the counter, giving up after a minute.
    my $started = time();
    while ($indices_waiting > 0) {
        my $elapsed = time() - $started;

        if ($elapsed > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$elapsed seconds elapsed]\n";
        sleep(10);
    }

    $self->{index_pool}->shutdown;
}
360:
# run_reindex_batch - pick a random sample of already-analyzed URLs and
# dispatch them to the worker pool for freshness re-checks (reindex flag
# set to 1), then wait up to ~60 s for the counter to drain.
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $batch = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $batch->execute($self->{index_workers});

    # Workers decrement this shared counter as they finish each URL.
    $indices_waiting = $batch->rows;

    if ($indices_waiting == 0) {
        print "pandia: nothing to reindex\n";
    }
    else {
        while (my $row = $batch->fetchrow_hashref()) {
            print "pandia: sending $row->{url} to worker thread\n";
            $self->{index_pool}->job($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
        }

        print "pandia: $indices_waiting total pages to be processed\n";
    }

    $batch->finish();
    $dbh->disconnect();

    # Poll until the workers drain the counter, giving up after a minute.
    my $started = time();
    while ($indices_waiting > 0) {
        my $elapsed = time() - $started;

        if ($elapsed > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$elapsed seconds elapsed]\n";
        sleep(10);
    }

    $self->{index_pool}->shutdown;
}
404:
# new - construct a Pandia instance.
#
# Parameters:
#   $args - hashref; $args->{profile} names the section of /etc/pandia.ini
#           to read database credentials and worker counts from.
#
# Returns a blessed hashref carrying the DSN, credentials, worker counts,
# and a Thread::Pool of indexer workers running do_index.
sub new {
    my ($class, $args) = @_;

    my $profile = $args->{profile};

    # All connection settings come from the named profile section.
    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

    my $dbhost        = $cfg->val($profile, 'dbhost');
    my $dbname        = $cfg->val($profile, 'dbname');
    my $dbuser        = $cfg->val($profile, 'dbuser');
    my $dbpass        = $cfg->val($profile, 'dbpass');
    my $index_workers = $cfg->val($profile, 'index_workers');
    my $crawl_workers = $cfg->val($profile, 'crawl_workers');

    # Seed the shared in-flight counter (decremented by do_index workers).
    $indices_waiting = $index_workers;

    my $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";

    my $pool = Thread::Pool->new(
        {
            workers => $index_workers,
            do      => \&do_index
        }
    );

    return bless {
        profile       => $profile,
        dbhost        => $dbhost,
        dbname        => $dbname,
        dbuser        => $dbuser,
        dbpass        => $dbpass,
        dsn           => $dsn,
        index_workers => $index_workers,
        crawl_workers => $crawl_workers,
        index_pool    => $pool
    }, $class;
}
440:
441:
1.1 snw 442: 1;
443:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>