Annotation of pandia/Pandia.pm, revision 1.3
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.3 ! snw 4: # $Id: Pandia.pm,v 1.2 2025/06/30 02:18:44 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 snw 11: # $Log: Pandia.pm,v $
1.3 ! snw 12: # Revision 1.2 2025/06/30 02:18:44 snw
! 13: # Updates
! 14: #
1.2 snw 15: # Revision 1.1 2025/06/28 23:54:11 snw
16: # Add new OO module
17: #
1.1 snw 18: #
19:
20: package Pandia;
21:
22: use strict;
1.2 snw 23: use warnings;
1.1 snw 24:
25: use HTTP::Tiny;
26: use HTML::TreeBuilder;
27: use URI;
28: use DBI;
29: use WWW::RobotRules;
30: use Fcntl qw(:flock);
31: use LWP::Simple qw(get);
32: use Config::IniFiles;
33: use Thread::Pool;
1.2 snw 34: use HTTP::Date;
35: use POSIX qw(strftime);
1.1 snw 36:
37: my $indices_waiting : shared;
38:
#
# do_index($url, $domain, $dsn, $dbuser, $dbpass, $reindex)
#
# Worker-thread entry point (also called synchronously by index_serial /
# index_one / index_domain).  Fetches $url, extracts title/text/HTML with
# HTML::TreeBuilder, and inserts the result into url_fulltext.  When
# $reindex is 1, only re-fetches pages whose Last-Modified header is newer
# than url_fulltext.last_indexed_dt.  Always decrements the shared
# $indices_waiting counter on exit, on every path.
#
# Control flow uses forward gotos to two labels:
#   cleanup:      mark the queue row analyzed, then disconnect.
#   nodb_cleanup: skip all DB work (handle already closed or never opened).
#
sub do_index {
    my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;

    print "pandia: thread connecting to MySQL database...";

    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if (not $dbh) {
        print "[FAIL]\n";
        goto nodb_cleanup;
    }
    print "[OK]\n";

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
    my $tree = HTML::TreeBuilder->new();
    my $tries;     # declared before any goto so the cleanup: label can use it

    print "pandia: HEAD $url\n";
    my $head = $http->head($url);

    if (not $head->{success}) {
        print "pandia: HEAD fail $url\n";

        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sthh->execute($url);
        $sthh->finish();
        # fix: previously jumped to nodb_cleanup with $dbh still connected,
        # leaking a MySQL connection per failed HEAD in a long-lived worker.
        $dbh->disconnect();
        goto nodb_cleanup;
    }
    else {
        print "pandia: HEAD OK $url\n";
    }

  proc_head:
    my $headers = $head->{headers};
    my $content_type = $headers->{'content-type'};
    # fix: guard against servers that send no Content-Type header at all;
    # the string comparisons below would warn (and misbehave) on undef.
    $content_type = '' unless defined $content_type;
    my $last_modified;
    my $last_modified_sys;

    if ($reindex == 1) {
        print "pandia: REINDEX $url\n";
        my $last_modified_t = $headers->{'last-modified'};
        $last_modified_sys = str2time($last_modified_t);

        if ($last_modified_sys) {
            print "pandia: GET_LAST_INDEX_DT $url\n";
            my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
            $sth->execute($url);
            print "pandia: GOT_LAST_INDEX_DT $url\n";

            if ($sth->rows < 1) {
                print "pandia: page not indexed\n";
                $sth->finish();
                $dbh->disconnect();   # fix: handle leaked on this path too
                goto nodb_cleanup;
            }

            my $hashref = $sth->fetchrow_hashref();
            my $last_indexed = str2time($hashref->{last_indexed_dt});
            $sth->finish();

            if ($last_modified_sys > $last_indexed) {
                print "pandia: $url has been modified since the last time it was indexed\n";
                my $sthd = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
                $sthd->execute($url);
                $sthd->finish();
                print "pandia: INDEXDELETE $url\n";
            }
            else {
                print "pandia: $url is still up-to-date in the index\n";
                goto cleanup;
            }
        }
        else {
            print "pandia: no modify info; skipping $url\n";
            $dbh->disconnect();   # fix: handle leaked on this path too
            goto nodb_cleanup;
        }
    }
    else {
        print "pandia: INDEX $url\n";
        # NOTE(review): $last_modified is assigned here but never read later
        # in this sub; presumably intended for the INSERT below — confirm.
        $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
    }

    my $title = "";
    my $fulltext = "";
    my $fullhtml = "";

    if ($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
        print "pandia: content type $content_type not indexable; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $response = $http->get($url);

    if (not $response->{success}) {
        print "pandia: http failure; skipping $url\n";
        my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
        $sth->execute($url);
        $sth->finish();
        $dbh->disconnect();
        goto nodb_cleanup;
    }

    my $pagedata = $response->{content};
    if ($response) {
        $tree->parse($pagedata);

        # fix: pages without a <title> element made look_down() return undef
        # and the ->as_text call died, killing the worker thread.
        my $title_elem = $tree->look_down('_tag', 'title');
        $title = defined $title_elem ? $title_elem->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;    # strip non-ASCII

        print "pandia: processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $fullhtml = $tree->as_HTML;
        $fullhtml =~ s/[^\x00-\x7F]//g;

        my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
        $sth->execute($url);

        if ($sth->rows > 0) {
            print "pandia: we already have the full text of $url recorded\n";
            $sth->finish();
            goto cleanup;
        }

        $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
        # fix: was `my $tries = 0`, shadowing the outer $tries declared above
        # and masking the variable the cleanup: retry loop reuses.
        $tries = 0;
        while (1) {
            print "pandia: INSERTINDEX $url\n";
            $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
            if ($DBI::err) {
                if ($tries > 5) {
                    print "pandia: giving up inserting fulltext on $url\n";
                    last;
                }
                $tries = $tries + 1;
                print "pandia: error inserting fulltext on $url; retrying\n";
                next;
            }
            else {
                last;
            }
        }
        $sth->finish();
    }

    print "pandia: $url has been processed\n";

  cleanup:
    # Mark the queue entry analyzed, retrying a couple of times on deadlock.
    my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    $tries = 0;
    while (1) {
        $sthuc->execute($url);
        if ($DBI::err) {
            $tries = $tries + 1;
            if ($tries > 2) {
                print "pandia: giving up updating crawl_queue for $url\n";
                last;
            }
            print "pandia: DBI deadlock; retrying crawl queue update\n";
            next;
        }
        else {
            last;
        }
    }
    $sthuc->finish();
    $dbh->disconnect();

  nodb_cleanup:
    # fix: HTML::TreeBuilder trees are self-referential and must be
    # explicitly deleted or they leak memory in long-running workers.
    # ($tree is undef here if we bailed out before creating it.)
    $tree->delete() if $tree;
    $indices_waiting = $indices_waiting - 1;
}
213:
#
# blacklist_add($domain)
#
# Adds $domain to the blacklist table, then purges every crawl_queue and
# url_fulltext row whose url_domain matches ANY blacklisted domain (not
# just the one being added).  Dies on connect/prepare/execute failure
# (RaiseError is on).
#
sub blacklist_add {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    # fix: the original message was the literal text "DBI->errstr()" —
    # method calls do not interpolate inside double-quoted strings.
    die "pandia: failed to connect to MySQL database: " . DBI->errstr unless $dbh;
    print "[OK]\n";

    print "pandia: blacklisting domain $domain...";
    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
    $sth->execute($domain);
    $sth->finish();   # fix: handle was reassigned below without finish()
    print "[OK]\n";

    print "pandia: removing blacklisted items from crawl queue...";
    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
    $sth->execute();
    $sth->finish();   # fix: same
    print "[OK]\n";

    print "pandia: removing blacklisted items from index...";
    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
    $sth->execute();
    print "[OK]\n";

    $sth->finish();
    $dbh->disconnect();
}
! 240:
#
# blacklist_remove($domain)
#
# Deletes $domain from the blacklist table.  Does not re-queue or re-index
# anything previously purged by blacklist_add.  Dies on DB errors
# (RaiseError is on).
#
sub blacklist_remove {
    my ($self, $domain) = @_;

    print "pandia: connecting to database...";
    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
    # fix: the original message was the literal text "DBI->errstr()" —
    # method calls do not interpolate inside double-quoted strings.
    die "pandia: failed to connect to MySQL database: " . DBI->errstr unless $dbh;
    print "[OK]\n";

    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
    $sth->execute($domain);

    $sth->finish();
    $dbh->disconnect();
}
! 255:
#
# index_serial()
#
# Indexes every un-analyzed crawl_queue entry synchronously in the calling
# thread, one URL at a time (no worker pool involved).
#
sub index_serial {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $queue = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
    $queue->execute();

    # Drain the pending queue; do_index marks each row analyzed itself.
    while (my $row = $queue->fetchrow_hashref()) {
        do_index($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $queue->finish();
    $dbh->disconnect();
}
1.1 snw 271:
1.3 ! snw 272: sub index_one {
! 273: my ($self, $url) = @_;
1.1 snw 274:
1.3 ! snw 275: my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
1.1 snw 276:
1.3 ! snw 277: my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
! 278: $sth->execute($url);
! 279:
! 280: while (my $hashref = $sth->fetchrow_hashref()) {
! 281: do_index $url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
! 282: }
! 283:
! 284: $sth->finish();
! 285: $dbh->disconnect();
! 286: }
! 287:
#
# index_domain($domain)
#
# Synchronously indexes every queued URL belonging to $domain, in the
# calling thread.
#
sub index_domain {
    my ($self, $domain) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $by_domain = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
    $by_domain->execute($domain);

    while (my $row = $by_domain->fetchrow_hashref()) {
        do_index($row->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    $by_domain->finish();
    $dbh->disconnect();
}
304:
#
# run_index_batch()
#
# Pulls up to index_workers un-analyzed rows from crawl_queue, dispatches
# each to the Thread::Pool created in new() (worker sub is do_index), then
# polls the shared $indices_waiting counter — which each worker decrements
# on exit — until the batch drains or 60 seconds elapse.  Finally shuts
# the pool down, waiting for in-flight jobs.
#
sub run_index_batch {
    my ($self) = @_;

    # Single-instance lock, currently disabled.
    # open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
    # flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia: creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    # Batch size equals the worker count: one job per worker.
    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers});

    # NOTE(review): ->rows on a SELECT is only meaningful because DBD::mysql
    # stores the whole result set client-side by default — confirm
    # mysql_use_result is never enabled on this handle.
    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia: nothing to index\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia: sending $hashref->{url} to worker thread\n";
        # Job args mirror do_index's signature; final 0 = not a reindex.
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
    }

    print "pandia: $indices_waiting total pages to be processed\n";

done:
    $sth->finish();
    $dbh->disconnect();

    # Poll until the workers have decremented the shared counter to zero,
    # giving up after a fixed 60-second window (workers keep running; we
    # just stop waiting and proceed to pool shutdown).
    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}
352:
#
# run_reindex_batch()
#
# Reindex counterpart of run_index_batch: picks up to index_workers rows
# that were ALREADY analyzed (analyzed=1), in random order, and dispatches
# them to the pool with the reindex flag set to 1 so do_index only
# re-fetches pages whose Last-Modified is newer than the stored index
# timestamp.  Same shared-counter polling and 60-second wait window.
#
sub run_reindex_batch {
    my ($self) = @_;

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    # ORDER BY RAND() spreads reindexing across the corpus; fine for small
    # queues, a full-table sort on large ones.
    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
    $sth->execute($self->{index_workers});

    # NOTE(review): ->rows on a SELECT relies on DBD::mysql's client-side
    # result buffering — confirm mysql_use_result is never enabled.
    $indices_waiting = $sth->rows;

    if($indices_waiting == 0) {
        print "pandia: nothing to reindex\n";
        goto done;
    }

    my $tmpi = 0;
    while (my $hashref = $sth->fetchrow_hashref()) {
        $tmpi = $tmpi + 1;
        print "pandia: sending $hashref->{url} to worker thread\n";
        # Final arg 1 = reindex mode in do_index.
        $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
    }

    print "pandia: $indices_waiting total pages to be processed\n";

done:
    $sth->finish();
    $dbh->disconnect();

    # Wait for workers to drain the shared counter, capped at 60 seconds.
    my $start_time = time();
    while($indices_waiting > 0) {
        my $end_time = time();
        my $time_diff = $end_time - $start_time;

        if($time_diff > 60) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;

}
396:
#
# Pandia->new({ profile => $ini_section })
#
# Constructor.  Reads DB credentials and worker counts for the selected
# profile (INI section) from /etc/pandia.ini, builds the MySQL DSN, and
# pre-creates the indexer Thread::Pool whose worker sub is do_index.
#
sub new {
    my ($class, $args) = @_;

    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
    # fix: Config::IniFiles->new returns undef on a missing or unparsable
    # file; the original code then died on ->val with an unhelpful
    # "Can't call method on an undefined value" message.
    die "pandia: cannot read /etc/pandia.ini: @Config::IniFiles::errors"
        unless defined $cfg;

    my $profile        = $args->{profile};
    my $thost          = $cfg->val($profile, 'dbhost');
    my $tname          = $cfg->val($profile, 'dbname');
    my $tuser          = $cfg->val($profile, 'dbuser');
    my $tpass          = $cfg->val($profile, 'dbpass');
    my $tindex_workers = $cfg->val($profile, 'index_workers');
    my $tcrawl_workers = $cfg->val($profile, 'crawl_workers');

    # NOTE(review): seeds the shared in-flight counter with the worker
    # count; run_index_batch() overwrites it with the real queue depth
    # before it is consulted — confirm nothing reads it in between.
    $indices_waiting = $tindex_workers;

    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";

    my $self = bless {
        profile       => $profile,
        dbhost        => $thost,
        dbname        => $tname,
        dbuser        => $tuser,
        dbpass        => $tpass,
        dsn           => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool    => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do      => \&do_index
            }
        )
    }, $class;

    return $self;
}
! 432:
! 433:
1.1 snw 434: 1;
435:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>