Annotation of pandia/Pandia.pm, revision 1.1
1.1 ! snw 1: #!/usr/bin/env perl
! 2:
! 3: #
! 4: # $Id$
! 5: # Copyright (C) 2025 Coherent Logic Development LLC
! 6: #
! 7: # Author: Serena Willis <snw@coherent-logic.com>
! 8: #
! 9: # Licensed AGPL-3.0
! 10: #
! 11: # $Log$
! 12: #
! 13:
! 14: package Pandia;
! 15:
! 16: use strict;
! 17: #use warnings;
! 18:
! 19: use HTTP::Tiny;
! 20: use HTML::TreeBuilder;
! 21: use URI;
! 22: use DBI;
! 23: use WWW::RobotRules;
! 24: use Fcntl qw(:flock);
! 25: use LWP::Simple qw(get);
! 26: use Config::IniFiles;
! 27: use Thread::Pool;
! 28:
# Number of index jobs that have been handed to the pool but have not yet
# finished.  run_index_batch() sets it; each index() job decrements it under
# lock() when it is done.
# NOTE(review): the ":shared" attribute only has an effect when
# threads::shared is loaded; presumably Thread::Pool loads it -- confirm.
my $indices_waiting : shared;

# index($url, $domain, $dsn, $dbuser, $dbpass)
#
# Worker-thread entry point (registered as the pool's "do" routine in new()).
# Fetches $url, extracts title / text / HTML, and stores them in url_fulltext.
#
#   $url     - URL to fetch and index
#   $domain  - value stored in url_fulltext.url_domain
#   $dsn, $dbuser, $dbpass - DBI connection parameters (one connection per job)
#
# Outcomes for the crawl_queue row of $url:
#   * HEAD failure, page already stored, or page stored now -> analyzed=1
#   * non-indexable content type or GET failure             -> row deleted
#   * unexpected exception                                  -> row left as is
#
# $indices_waiting is decremented exactly once on every path, including
# failures, so run_index_batch() is never left waiting for a dead job.
# Returns nothing useful.
sub index {
    my ($url, $domain, $dsn, $dbuser, $dbpass) = @_;

    my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 0, PrintError => 1});
    if (not $dbh) {
        print "pandia: failed to connect to MySQL database\n";
        _index_job_done();
        return;
    }

    # eval: an exception inside the fetch/parse/store step must not skip the
    # disconnect and the counter decrement below.
    my $disposition = eval { _index_fetch_and_store($dbh, $url, $domain) };
    if (not defined $disposition) {
        print "pandia: unexpected error while indexing $url: $@\n";
    }
    elsif ($disposition eq 'dequeue') {
        # Never indexable: drop it from the queue instead of marking it.
        $dbh->do("DELETE FROM crawl_queue WHERE url=?", undef, $url);
    }
    else {
        _index_mark_analyzed($dbh, $url);
    }

    $dbh->disconnect();
    _index_job_done();
    return;
}

# Fetch $url and store its text.  Returns 'analyzed' when the queue row should
# be flagged analyzed=1, or 'dequeue' when it should be deleted.
sub _index_fetch_and_store {
    my ($dbh, $url, $domain) = @_;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 60);

    my $head = $http->head($url);
    if (not $head->{success}) {
        print "pandia: http HEAD failure; skipping $url\n";
        return 'analyzed';
    }

    # HTTP::Tiny yields an array ref when a header is repeated, and the
    # header may be absent altogether.
    my $content_type = $head->{headers}{'content-type'};
    $content_type = $content_type->[0] if ref($content_type) eq 'ARRAY';
    $content_type = '' if not defined $content_type;

    # Accept text/plain and text/html with or without parameters such as
    # "; charset=utf-8", case-insensitively.
    if ($content_type !~ m{\A\s*text/(?:plain|html)\b}i) {
        print "pandia: content type $content_type not indexable; skipping $url\n";
        return 'dequeue';
    }

    my $response = $http->get($url);
    if (not $response->{success}) {
        print "pandia: http failure; skipping $url\n";
        return 'dequeue';
    }

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($response->{content});
    $tree->eof();    # required after parse() to finalise the tree

    # Plain-text pages (and sloppy HTML) have no <title> element.
    my $title_node = $tree->look_down('_tag', 'title');
    my $title      = defined $title_node ? $title_node->as_text : "";
    my $fulltext   = $tree->as_text;
    my $fullhtml   = $tree->as_HTML;
    $tree->delete();    # release the element tree explicitly

    # Only 7-bit ASCII is stored.
    for my $field ($title, $fulltext, $fullhtml) {
        $field =~ s/[^\x00-\x7F]//g;
    }

    # Fetch a row rather than trusting rows(), which DBI does not guarantee
    # for SELECT statements.
    my $known = $dbh->selectrow_arrayref(
        "SELECT url FROM url_fulltext WHERE url=?", undef, $url);
    if ($known) {
        print "pandia: we already have the full text of $url recorded\n";
        return 'analyzed';
    }

    my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, url_domain, page_title, body, body_html) VALUES (?, ?, ?, ?, ?)");
    if (not $sth) {
        print "pandia: giving up inserting fulltext on $url\n";
    }
    else {
        # execute() returns undef on failure; retry up to six more times.
        my $tries = 0;
        until (defined $sth->execute($url, $domain, $title, $fulltext, $fullhtml)) {
            if ($tries > 5) {
                print "pandia: giving up inserting fulltext on $url\n";
                last;
            }
            $tries = $tries + 1;
            print "pandia: error inserting fulltext on $url; retrying\n";
        }
        $sth->finish();
    }

    print "pandia: $url has been processed\n";
    return 'analyzed';
}

# Flag $url as analyzed in crawl_queue, attempting the UPDATE at most three
# times before giving up.
sub _index_mark_analyzed {
    my ($dbh, $url) = @_;

    my $sth = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
    if (not $sth) {
        print "pandia: giving up updating crawl_queue for $url\n";
        return;
    }

    my $tries = 0;
    until (defined $sth->execute($url)) {
        $tries = $tries + 1;
        if ($tries > 2) {
            print "pandia: giving up updating crawl_queue for $url\n";
            last;
        }
        print "pandia: DBI deadlock; retrying crawl queue update\n";
    }
    $sth->finish();
    return;
}

# Record that one index job has finished.
sub _index_job_done {
    lock($indices_waiting);
    $indices_waiting = $indices_waiting - 1;
    return;
}
! 145:
# Pandia->new({ profile => $section_name })
#
# Constructor.  Reads the [$section_name] section of /etc/pandia.ini and
# builds the object together with its pool of indexer threads.
#
# Keys read from the profile section:
#   dbhost, dbname, dbuser, dbpass  - MySQL connection settings
#   dbport                          - optional, defaults to 3306
#   index_workers, crawl_workers    - worker counts
#
# Dies with a descriptive message if the configuration file cannot be loaded
# or does not contain the requested profile section.
sub new {
    my ($class, $args) = @_;

    my $config_file = "/etc/pandia.ini";
    my $profile     = $args->{profile};

    # Config::IniFiles->new returns undef on failure and records the reasons
    # in @Config::IniFiles::errors.
    my $cfg = Config::IniFiles->new(-file => $config_file);
    if (not $cfg) {
        die "pandia: cannot load $config_file: "
            . join("; ", @Config::IniFiles::errors) . "\n";
    }
    if (not defined $profile or not $cfg->SectionExists($profile)) {
        my $shown = defined $profile ? $profile : '(undefined)';
        die "pandia: profile '$shown' not found in $config_file\n";
    }

    my $thost          = $cfg->val($profile, 'dbhost');
    my $tname          = $cfg->val($profile, 'dbname');
    my $tuser          = $cfg->val($profile, 'dbuser');
    my $tpass          = $cfg->val($profile, 'dbpass');
    my $tport          = $cfg->val($profile, 'dbport', 3306);
    my $tindex_workers = $cfg->val($profile, 'index_workers');
    my $tcrawl_workers = $cfg->val($profile, 'crawl_workers');

    # Initial value only; run_index_batch() overwrites it with the real
    # number of queued jobs.
    $indices_waiting = $tindex_workers;

    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=$tport;mysql_connect_timeout=5;";

    my $self = bless {
        profile       => $profile,
        dbhost        => $thost,
        dbname        => $tname,
        dbuser        => $tuser,
        dbpass        => $tpass,
        dbport        => $tport,
        dsn           => $tdsn,
        index_workers => $tindex_workers,
        crawl_workers => $tcrawl_workers,
        index_pool    => Thread::Pool->new(
            {
                workers => $tindex_workers,
                do      => \&index
            }
        )
    }, $class;

    return $self;
}
! 181:
# $pandia->run_index_batch()
#
# Takes up to index_workers * 4 unanalyzed rows from crawl_queue, submits one
# job per row to the indexer thread pool, then polls every 10 seconds until
# all jobs have reported completion or the batch deadline passes.  The pool
# is shut down before returning.
#
# The deadline is 20 seconds per URL of the *initial* batch size, so finishing
# jobs quickly never shortens the time allowed for the remaining ones.
#
# Dies if the database connection or query fails (RaiseError is enabled).
sub run_index_batch {
    my ($self) = @_;

    # open my $file, ">", "/tmp/pandia_indexer.lock" or die $!;
    # flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    print "pandia: creating $self->{index_workers} indexer threads\n";

    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});

    my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0 LIMIT ?");
    $sth->execute($self->{index_workers} * 4);

    # Fetch everything up front: rows() is not reliable for SELECT before the
    # rows are fetched, and the connection is not needed while waiting.
    my $queued = $sth->fetchall_arrayref({});
    $sth->finish();
    $dbh->disconnect();

    my $total = scalar @{$queued};
    {
        # Publish the job count before any worker can start decrementing it.
        lock($indices_waiting);
        $indices_waiting = $total;
    }

    if ($total == 0) {
        print "pandia: nothing to index\n";
    }
    else {
        for my $row (@{$queued}) {
            print "pandia: sending $row->{url} to worker thread\n";
            $self->{index_pool}->job($row->{url}, $row->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass});
        }
        print "pandia: $total total pages to be processed\n";
    }

    my $start_time = time();
    my $time_limit = $total * 20;
    while (1) {
        my $remaining = do { lock($indices_waiting); $indices_waiting };
        last if $remaining <= 0;

        my $time_diff = time() - $start_time;
        if ($time_diff > $time_limit) {
            print "pandia: timing out\n";
            last;
        }
        print "pandia: $remaining URLs still in-process [$time_diff seconds elapsed]\n";
        sleep(10);
    }
    $self->{index_pool}->shutdown;
}
! 231:
! 232: 1;
! 233:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>