Annotation of pandia/Pandia.pm, revision 1.2
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.2 ! snw 4: # $Id: Pandia.pm,v 1.1 2025/06/28 23:54:11 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 ! snw 11: # $Log: Pandia.pm,v $
! 12: # Revision 1.1 2025/06/28 23:54:11 snw
! 13: # Add new OO module
! 14: #
1.1 snw 15: #
16:
17: package Pandia;
18:
use strict;
use warnings;

use threads;
use threads::shared;

use Config::IniFiles;
use DBI;
use Fcntl qw(:flock);
use HTML::TreeBuilder;
use HTTP::Date;
use HTTP::Tiny;
use LWP::Simple qw(get);
use POSIX qw(strftime);
use Thread::Pool;
use URI;
use WWW::RobotRules;
1.1 snw 33:
34: my $indices_waiting : shared;
35:
36: sub index {
1.2 ! snw 37: my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;
1.1 snw 38:
1.2 ! snw 39: print "pandia: thread connecting to MySQL database...";
! 40:
1.1 snw 41: my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
42: if(not $dbh) {
1.2 ! snw 43: print "[FAIL]\n";
1.1 snw 44: goto nodb_cleanup;
45: }
1.2 ! snw 46: print "[OK]\n";
1.1 snw 47:
48: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);
49: my $tree = HTML::TreeBuilder->new();
1.2 ! snw 50: my $tries;
1.1 snw 51:
1.2 ! snw 52: my $head;
! 53: print "pandia: HEAD $url\n";
! 54: $head = $http->head($url);
! 55: if(not $head->{success}) {
! 56: print "pandia: HEAD fail $url\n";
! 57: goto nodb_cleanup;
! 58: }
! 59: else {
! 60: print "pandia: HEAD OK $url\n";
1.1 snw 61: }
1.2 ! snw 62:
! 63: proc_head:
1.1 snw 64: my $headers = $head->{headers};
65: my $content_type = $headers->{'content-type'};
1.2 ! snw 66: my $last_modified;
! 67: my $last_modified_sys;
! 68:
! 69: if ($reindex == 1) {
! 70: print "pandia: REINDEX $url\n";
! 71: my $last_modified_t = $headers->{'last-modified'};
! 72: $last_modified_sys = str2time($last_modified_t);
! 73:
! 74: if($last_modified_sys) {
! 75: print "pandia: GET_LAST_INDEX_DT $url\n";
! 76: my $sth = $dbh->prepare("SELECT last_indexed_dt FROM url_fulltext WHERE url=?");
! 77: $sth->execute($url);
! 78: print "pandia: GOT_LAST_INDEX_DT $url\n";
! 79:
! 80: if($sth->rows < 1) {
! 81: print "pandia: page not indexed\n";
! 82: goto nodb_cleanup;
! 83: }
! 84:
! 85: my $hashref = $sth->fetchrow_hashref();
! 86: my $last_indexed = str2time($hashref->{last_indexed_dt});
! 87:
! 88: if($last_modified_sys > $last_indexed) {
! 89: print "pandia: $url has been modified since the last time it was indexed\n";
! 90: my $sth = $dbh->prepare("DELETE FROM url_fulltext WHERE url=?");
! 91: $sth->execute($url);
! 92: print "pandia: INDEXDELETE $url\n";
! 93: }
! 94: else {
! 95: print "pandia: $url is still up-to-date in the index\n";
! 96: goto cleanup;
! 97: }
! 98:
! 99: }
! 100: else {
! 101: print "pandia: no modify info; skipping $url\n";
! 102: goto nodb_cleanup;
! 103: }
! 104: }
! 105: else {
! 106: print "pandia: INDEX $url\n";
! 107: $last_modified = strftime("%Y-%m-%d %H:%M", localtime);
! 108: }
! 109:
1.1 snw 110: my $title = "";
111: my $fulltext = "";
112: my $fullhtml = "";
113:
114: if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') {
115: print "pandia: content type $content_type not indexable; skipping $url\n";
116: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
117: $sth->execute($url);
118: $sth->finish();
119: $dbh->disconnect();
120: goto nodb_cleanup;
121: }
122:
123: my $response = $http->get($url);
124:
125: if(not $response->{success}) {
126: print "pandia: http failure; skipping $url\n";
127: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
128: $sth->execute($url);
129: $sth->finish();
130: $dbh->disconnect();
131: goto nodb_cleanup;
132: }
133:
134: my $pagedata = $response->{content};
135: if($response) {
136: $tree->parse($pagedata);
137: $title = $tree->look_down('_tag', 'title')->as_text;
138: $title =~ s/[^\x00-\x7F]//g;
139:
1.2 ! snw 140: print "pandia: processing $url [$title]\n";
1.1 snw 141:
142: $fulltext = $tree->as_text;
143: $fulltext =~ s/[^\x00-\x7F]//g;
144:
145: $fullhtml = $tree->as_HTML;
146: $fullhtml =~ s/[^\x00-\x7F]//g;
147:
148: my $sth = $dbh->prepare("SELECT url FROM url_fulltext WHERE url=?");
149: $sth->execute($url);
150:
151: if($sth->rows > 0) {
152: print "pandia: we already have the full text of $url recorded\n";
153: $sth->finish();
154: goto cleanup;
155: }
156:
157: $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
158: my $tries = 0;
159: while(1) {
1.2 ! snw 160: print "pandia: INSERTINDEX $url\n";
1.1 snw 161: $sth->execute($url, $domain, $title, $fulltext, $fullhtml);
162: if($DBI::err) {
163: if($tries > 5) {
164: print "pandia: giving up inserting fulltext on $url\n";
165: last;
166: }
167: $tries = $tries + 1;
168: print "pandia: error inserting fulltext on $url; retrying\n";
169: next;
170: }
171: else {
172: last;
173: }
174: }
175: $sth->finish();
176: }
177:
178: print "pandia: $url has been processed\n";
179:
180:
181: cleanup:
182: my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
1.2 ! snw 183: $tries = 0;
1.1 snw 184: while(1) {
185: $sthuc->execute($url);
186: if($DBI::err) {
187: $tries = $tries + 1;
188: if($tries > 2) {
189: print "pandia: giving up updating crawl_queue for $url\n";
190: last;
191: }
192: print "pandia: DBI deadlock; retrying crawl queue update\n";
193: next;
194: }
195: else {
196: last;
197: }
198: }
199: $sthuc->finish();
200: $dbh->disconnect();
201:
202: nodb_cleanup:
203: $indices_waiting = $indices_waiting - 1;
204: }
205:
206: sub new {
207: my ($class, $args) = @_;
208:
209: my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
210:
211: my $thost = $cfg->val($args->{profile}, 'dbhost');
212: my $tname = $cfg->val($args->{profile}, 'dbname');
213: my $tuser = $cfg->val($args->{profile}, 'dbuser');
214: my $tpass = $cfg->val($args->{profile}, 'dbpass');
215: my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
216: my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');
217:
218: $indices_waiting = $tindex_workers;
219:
220: my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
221:
222: my $self = bless {
223: profile => $args->{profile},
224: dbhost => $thost,
225: dbname => $tname,
226: dbuser => $tuser,
227: dbpass => $tpass,
228: dsn => $tdsn,
229: index_workers => $tindex_workers,
230: crawl_workers => $tcrawl_workers,
231: index_pool => Thread::Pool->new(
232: {
233: workers => $tindex_workers,
234: do => \&index
235: }
236: )
237: }, $class;
238:
239: return $self;
240: }
241:
242: sub run_index_batch {
243: my ($self) = @_;
244:
245: # open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
246: # flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
247:
248: print "pandia: creating $self->{index_workers} indexer threads\n";
249:
250: my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
251:
252: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
1.2 ! snw 253: $sth->execute($self->{index_workers});
1.1 snw 254:
255: $indices_waiting = $sth->rows;
256:
257: if($indices_waiting == 0) {
258: print "pandia: nothing to index\n";
259: goto done;
260: }
261:
262: my $tmpi = 0;
263: while (my $hashref = $sth->fetchrow_hashref()) {
264: $tmpi = $tmpi + 1;
265: print "pandia: sending $hashref->{url} to worker thread\n";
1.2 ! snw 266: $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
1.1 snw 267: }
268:
1.2 ! snw 269: print "pandia: $indices_waiting total pages to be processed\n";
! 270:
! 271: done:
! 272: $sth->finish();
! 273: $dbh->disconnect();
1.1 snw 274:
1.2 ! snw 275: my $start_time = time();
! 276: while($indices_waiting > 0) {
! 277: my $end_time = time();
! 278: my $time_diff = $end_time - $start_time;
! 279:
! 280: if($time_diff > 60) {
! 281: print "pandia: timing out\n";
! 282: last;
! 283: }
! 284: print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
! 285: sleep(10);
! 286: }
! 287: $self->{index_pool}->shutdown;
! 288: }
! 289:
! 290: sub run_reindex_batch {
! 291: my ($self) = @_;
! 292:
! 293: my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
! 294:
! 295: my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=1 ORDER BY RAND() LIMIT ?");
! 296: $sth->execute($self->{index_workers});
! 297:
! 298: $indices_waiting = $sth->rows;
! 299:
! 300: if($indices_waiting == 0) {
! 301: print "pandia: nothing to reindex\n";
! 302: goto done;
! 303: }
! 304:
! 305: my $tmpi = 0;
! 306: while (my $hashref = $sth->fetchrow_hashref()) {
! 307: $tmpi = $tmpi + 1;
! 308: print "pandia: sending $hashref->{url} to worker thread\n";
! 309: $self->{index_pool}->job($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 1);
! 310: }
1.1 snw 311:
312: print "pandia: $indices_waiting total pages to be processed\n";
313:
1.2 ! snw 314: done:
1.1 snw 315: $sth->finish();
316: $dbh->disconnect();
317:
318: my $start_time = time();
319: while($indices_waiting > 0) {
320: my $end_time = time();
321: my $time_diff = $end_time - $start_time;
322:
1.2 ! snw 323: if($time_diff > 60) {
1.1 snw 324: print "pandia: timing out\n";
325: last;
326: }
327: print "pandia: $indices_waiting URLs still in-process [$time_diff seconds elapsed]\n";
328: sleep(10);
329: }
330: $self->{index_pool}->shutdown;
1.2 ! snw 331:
1.1 snw 332: }
333:
334: 1;
335:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>