Annotation of pandia/indexer, revision 1.5
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.5 ! snw 4: # $Id: indexer,v 1.4 2025/06/28 00:33:32 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 snw 11: # $Log: indexer,v $
1.5 ! snw 12: # Revision 1.4 2025/06/28 00:33:32 snw
! 13: # Update locking
! 14: #
1.4 snw 15: # Revision 1.3 2025/06/27 16:20:30 snw
16: # Add blacklist
17: #
1.3 snw 18: # Revision 1.2 2025/06/27 02:14:47 snw
19: # Initial operational capability
20: #
1.2 snw 21: # Revision 1.1 2025/06/25 19:38:48 snw
22: # Add indexer
23: #
1.1 snw 24: #
25:
26: use Getopt::Long;
27: use HTTP::Tiny;
28: use HTML::TreeBuilder;
29: use URI;
1.2 snw 30: use Lingua::Stem;
1.1 snw 31: use DBI;
1.2 snw 32: use Data::Dumper;
33: use Try::Tiny;
1.3 snw 34: use Fcntl qw(:flock);
1.1 snw 35:
# Database handle and DSN; both are filled in once the command line is parsed.
my $dbh = "";
my $dsn = "";

# Connection settings, populated by GetOptions below. Declared lexically so a
# misspelt name cannot silently become a new package global.
my ($dbhost, $dbname, $dbusername, $dbpw);

$| = 1;    # unbuffered STDOUT so the "...[OK]" progress lines complete promptly
print "pandia indexer v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

# Single-instance guard: take a non-blocking exclusive lock on a well-known
# file. $file must stay in scope for the lifetime of the script, because the
# lock is released as soon as the handle is closed.
my $lockfile = "/tmp/pandia_indexer.lock";
open(my $file, ">", $lockfile)
    or die "pandia: unable to open $lockfile: $!\n";
flock($file, LOCK_EX | LOCK_NB) or die "Unable to lock file $!";

GetOptions("dbhost=s"     => \$dbhost,
           "dbname=s"     => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s"       => \$dbpw)
    or die("error in command line arguments");

# Without a database name the connection would succeed but every query would
# fail later with a misleading message, so reject it up front.
die "pandia: --dbname is required\n"
    unless defined $dbname && length $dbname;
$dbhost = "" unless defined $dbhost;    # empty host lets the driver use its default

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

# A method call does not interpolate inside a string; use the package
# variable DBI sets on a failed connect so the real reason is reported.
die "pandia: failed to connect to MySQL database: $DBI::errstr\n" unless $dbh;

print "[OK]\n";

print "pandia: loading queue...";

# RaiseError is off, so prepare/execute report failure through their return
# values and each one has to be checked explicitly.
my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0")
    or die "pandia: error preparing crawl queue query\n";
$sth->execute() or die "pandia: error retrieving crawl queue\n";

# NOTE(review): DBI only guarantees rows() for non-SELECT statements. This
# count relies on the driver buffering the whole result set at execute time
# (believed to be the DBD::mysql default) -- confirm if that setting changes.
my $qlen = $sth->rows;
print "[OK (queue length $qlen)]\n";

# One HTTP client reused for every fetch; 5 second timeout per request.
my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
69:
# ---------------------------------------------------------------------------
# Indexing loop.
#
# For every unanalyzed crawl_queue row: check the content type with a HEAD
# request, fetch the page, store its title and ASCII-only text in
# url_fulltext, rebuild the stemmed keyword counts for the URL, and flag the
# row as analyzed. Rows that are not indexable, or that cannot be fetched,
# are removed from the queue instead.
# ---------------------------------------------------------------------------

# The stemmer configuration never changes between pages, so build it once.
my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

# Prepare every statement once instead of once per page (or per word).
my %stmt = (
    dequeue       => "DELETE FROM crawl_queue WHERE url=?",
    add_fulltext  => "INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)",
    clear_words   => "DELETE FROM keywords WHERE url=?",
    add_word      => "INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)",
    mark_analyzed => "UPDATE crawl_queue SET analyzed=1 WHERE url=?",
);
for my $name (keys %stmt) {
    $stmt{$name} = $dbh->prepare($stmt{$name})
        or die "pandia: unable to prepare $name statement\n";
}

URL: while (my $hashref = $sth->fetchrow_hashref()) {
    my $tree       = HTML::TreeBuilder->new();
    my $url        = $hashref->{url};
    my $url_domain = $hashref->{url_domain};

    # Per-page state. $fulltext is deliberately lexical to this iteration so
    # a failed fetch can never index the previous page's text under this URL.
    my $fulltext;
    my $dequeued = 0;

    print "pandia: retrieving $url...\n";
    try {
        # The try block is an anonymous sub: leave it with return and let the
        # code after the block decide whether to move to the next URL.
        my $head         = $http->head($url);
        my $content_type = $head->{headers}{'content-type'};

        # HTTP::Tiny returns an array ref when a header is repeated.
        $content_type = $content_type->[0] if ref $content_type eq 'ARRAY';
        $content_type = '' unless defined $content_type;

        # Accept text/html and text/plain with or without parameters such as
        # "; charset=utf-8".
        if ($content_type !~ m{\Atext/(?:html|plain)\b}i) {
            print "pandia: content type $content_type not indexable; skipping $url\n";
            $stmt{dequeue}->execute($url);
            $dequeued = 1;
            return;
        }

        my $response = $http->get($url);

        if (not $response->{success}) {
            print "pandia: http failure; skipping $url\n";
            $stmt{dequeue}->execute($url);
            $dequeued = 1;
            return;
        }

        $tree->parse($response->{content});
        $tree->eof();    # required after parse() so the tree is complete

        # Pages without a <title> are indexed with an empty title rather
        # than aborting on a method call against undef.
        my $title_node = $tree->look_down('_tag', 'title');
        my $title      = $title_node ? $title_node->as_text : "";
        $title =~ s/[^\x00-\x7F]//g;

        print "pandia: processing $url [$title]\n";

        $fulltext = $tree->as_text;
        $fulltext =~ s/[^\x00-\x7F]//g;

        $stmt{add_fulltext}->execute($url, $title, $fulltext);
    } catch {
        warn "pandia: caught failure $_\n";
    };

    # Release the parse tree explicitly; older HTML::Element versions hold
    # circular references that are not freed when the variable goes out of scope.
    $tree->delete;

    next URL if $dequeued;

    # Only rebuild keywords when text was actually extracted for this URL;
    # on a failed fetch the existing keyword rows are left untouched.
    if (defined $fulltext) {
        my @words = split(' ', $fulltext);
        $stemmer->stem_in_place(@words);

        # Count occurrences in memory, then write one row per distinct word.
        # NOTE(review): assumes Perl string equality agrees with how the
        # keywords.word column compares values -- confirm against its collation.
        my %count_for;
        foreach my $word (@words) {
            next unless defined $word;
            $word =~ s/[^\x00-\x7F]//g;
            next unless length $word;
            $count_for{$word}++;
        }

        $stmt{clear_words}->execute($url);
        foreach my $word (sort keys %count_for) {
            $stmt{add_word}->execute($word, $url, $url_domain, $count_for{$word});
        }
    }

    # The row is flagged even when processing failed, so one bad page cannot
    # be retried forever on later runs.
    $stmt{mark_analyzed}->execute($url);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>