Annotation of pandia/indexer, revision 1.3
1.1 snw 1: #!/usr/bin/env perl
2:
3: #
1.3 ! snw 4: # $Id: indexer,v 1.2 2025/06/27 02:14:47 snw Exp $
1.1 snw 5: # Copyright (C) 2025 Coherent Logic Development LLC
6: #
7: # Author: Serena Willis <snw@coherent-logic.com>
8: #
9: # Licensed AGPL-3.0
10: #
1.2 snw 11: # $Log: indexer,v $
1.3 ! snw 12: # Revision 1.2 2025/06/27 02:14:47 snw
! 13: # Initial operational capability
! 14: #
1.2 snw 15: # Revision 1.1 2025/06/25 19:38:48 snw
16: # Add indexer
17: #
1.1 snw 18: #
19:
20: use Getopt::Long;
21: use HTTP::Tiny;
22: use HTML::TreeBuilder;
23: use URI;
1.2 snw 24: use Lingua::Stem;
1.1 snw 25: use DBI;
1.2 snw 26: use Data::Dumper;
27: use Try::Tiny;
1.3 ! snw 28: use Fcntl qw(:flock);
1.1 snw 29:
30: my $dbh = "";
31: my $dsn = "";
32:
1.2 snw 33: $| = 1;
1.1 snw 34: print "pandia indexer v0.0.1\n";
35: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
36:
1.3 ! snw 37: open my $file, ">", "pandia_indexer.lock" or die $!;
! 38: flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
! 39:
1.1 snw 40: GetOptions("dbhost=s" => \$dbhost,
41: "dbname=s" => \$dbname,
42: "dbusername=s" => \$dbusername,
43: "dbpw=s" => \$dbpw)
44: or die("error in command line arguments");
45:
46: print "pandia: connecting to $dbname database at $dbhost...";
47:
48: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
1.2 snw 49: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});
1.1 snw 50: die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
51:
52: print "[OK]\n";
1.2 snw 53:
54: print "pandia: loading queue...";
55:
56: my $sth = $dbh->prepare("SELECT * FROM crawl_queue WHERE analyzed=0");
57: $sth->execute() or die "pandia: error retrieving crawl queue\n";
58:
59: my $qlen = $sth->rows;
60: print "[OK (queue length $qlen)]\n";
61:
62: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);
63:
64: while (my $hashref = $sth->fetchrow_hashref()) {
65: my $tree = HTML::TreeBuilder->new();
66: my $url = $hashref->{url};
67: my $url_domain = $hashref->{url_domain};
68:
69: my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
70: $stemmer->stem_caching({ -level => 2 });
71:
72: print "pandia: retrieving $url...\n";
73: try {
74: my $del_queue = 0;
75: my $response = $http->get($hashref->{url});
76:
77: if(not $response->{success}) {
78: print "pandia: http failure; skipping $url\n";
79: $del_queue = 1;
80: }
81:
82: #if(exists $response->{redirects}) {
83: # print "pandia: redirects detected; skipping $url\n";
84: # $del_queue = 1;
85: #}
86:
87: if($del_queue == 1) {
88: my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
89: $sth->execute($url);
90: next;
91: }
92:
93: my $title = "";
94:
95: my $pagedata = $response->{content};
96: if($response) {
97: $tree->parse($pagedata);
98: $title = $tree->look_down('_tag', 'title')->as_text;
99:
100: print "pandia: processing $url [$title]\n";
101:
102: $fulltext = $tree->as_text;
103: $fulltext =~ s/[^\x00-\x7F]//g;
104:
105: my $sth = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
106: $sth->execute($url, $title, $fulltext);
107:
108: }
109: } catch {
110: warn "pandia: caught failure $_\n";
111: };
112:
113: my @words = split(' ', $fulltext);
114: $stemmer->stem_in_place(@words);
115:
116: my $sthd = $dbh->prepare("DELETE FROM keywords WHERE url=?");
117: $sthd->execute($url);
118:
119: my $sth = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
1.3 ! snw 120: my $sths = $dbh->prepare("SELECT word_count FROM keywords WHERE word=? AND url=?");
1.2 snw 121: my $sthu = $dbh->prepare("UPDATE keywords SET word_count=word_count + 1 WHERE word=? AND url=?");
122: foreach my $word (@words) {
123: $word =~ s/[^\x00-\x7F]//g;
1.3 ! snw 124: $sths->execute($word, $url);
1.2 snw 125:
126: if($sths->rows > 0) {
127: $sthu->execute($word, $url);
128: }
129: else {
130: $sth->execute($word, $url, $url_domain, 1);
131: }
132: }
133:
134: my $sthuc = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");
135: $sthuc->execute($url);
136: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>