Annotation of pandia/indexer, revision 1.2

1.1       snw         1: #!/usr/bin/env perl
                      2: 
                      3: # 
1.2     ! snw         4: # $Id: indexer,v 1.1 2025/06/25 19:38:48 snw Exp $
1.1       snw         5: #  Copyright (C) 2025 Coherent Logic Development LLC
                      6: #
                      7: # Author: Serena Willis <snw@coherent-logic.com>
                      8: #
                      9: # Licensed AGPL-3.0
                     10: #
1.2     ! snw        11: # $Log: indexer,v $
        !            12: # Revision 1.1  2025/06/25 19:38:48  snw
        !            13: # Add indexer
        !            14: #
1.1       snw        15: #
                     16: 
                     17: use Getopt::Long;
                     18: use HTTP::Tiny;
                     19: use HTML::TreeBuilder;
                     20: use URI;
1.2     ! snw        21: use Lingua::Stem;
1.1       snw        22: use DBI;
1.2     ! snw        23: use Data::Dumper;
        !            24: use Try::Tiny;
# Database handle and DSN. Both are assigned below, once the command-line
# options have been parsed.
my $dbh = "";
my $dsn = "";

# Connection settings, filled in by GetOptions. Each one stays undef when
# its option is not supplied on the command line.
my ($dbhost, $dbname, $dbusername, $dbpw);

# Autoflush STDOUT so progress text printed without a trailing newline
# (e.g. "connecting to ...") is visible before the matching "[OK]".
$| = 1;
print "pandia indexer v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw)
    or die("error in command line arguments");

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 1});

# With RaiseError off, a failed connect returns a false value instead of
# dying. The reason is read from $DBI::errstr: a method call such as
# DBI->errstr() is not interpolated inside a double-quoted string, so it
# would be printed literally instead of the actual error.
die "pandia:  failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";
# ---------------------------------------------------------------------------
# Index the crawl queue.
#
# For every crawl_queue row with analyzed=0 this section:
#   1. fetches the page over HTTP (rows whose fetch is unsuccessful are
#      deleted from crawl_queue and skipped),
#   2. stores the page title and ASCII-only page text in url_fulltext,
#   3. replaces the page's rows in keywords with one row per stemmed word,
#      carrying the number of times that word occurs on the page,
#   4. marks the crawl_queue row analyzed=1.
#
# Uses the connected $dbh created earlier in the script.
# ---------------------------------------------------------------------------

print "pandia:  loading queue...";

my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
$sth->execute() or die "pandia:  error retrieving crawl queue\n";

# Read the whole result set up front. Counting the fetched rows gives a
# queue length that does not depend on $sth->rows, which DBI does not
# guarantee for SELECT statements, and it means no result set is still
# being read while the loop below deletes from and updates crawl_queue.
my $queue = $sth->fetchall_arrayref({}) || [];
my $qlen  = scalar @{$queue};
print "[OK (queue length $qlen)]\n";

my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1", timeout => 5);

# Loop-invariant helpers: one stemmer and one prepared statement per query,
# created once instead of once per page.
my $stemmer = Lingua::Stem->new(-locale => 'EN-US');
$stemmer->stem_caching({ -level => 2 });

my $sth_dequeue        = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
my $sth_fulltext       = $dbh->prepare("INSERT INTO url_fulltext(url, page_title, body) VALUES (?, ?, ?)");
my $sth_clear_keywords = $dbh->prepare("DELETE FROM keywords WHERE url=?");
my $sth_add_keyword    = $dbh->prepare("INSERT INTO keywords (word, url, url_domain, word_count) VALUES (?, ?, ?, ?)");
my $sth_mark_done      = $dbh->prepare("UPDATE crawl_queue SET analyzed=1 WHERE url=?");

foreach my $hashref (@{$queue}) {
    my $url        = $hashref->{url};
    my $url_domain = $hashref->{url_domain};

    # Per-page state. $fulltext is reset for every page so that a failure
    # while processing this page can never cause the previous page's text
    # to be indexed under this URL.
    my $tree         = HTML::TreeBuilder->new();
    my $fulltext     = "";
    my $fetch_failed = 0;

    print "pandia:  retrieving $url...\n";
    try {
        my $response = $http->get($url);

        if (not $response->{success}) {
            print "pandia:  http failure; skipping $url\n";
            $fetch_failed = 1;
        }
        else {
            $tree->parse($response->{content});
            $tree->eof();    # tell the parser the document is complete

            # A page may have no <title>; look_down then returns nothing,
            # so only call as_text when an element was actually found.
            my $title_element = $tree->look_down('_tag', 'title');
            my $title = $title_element ? $title_element->as_text : "";

            print "pandia:  processing $url [$title]\n";

            $fulltext = $tree->as_text;
            $fulltext =~ s/[^\x00-\x7F]//g;    # keep ASCII characters only

            $sth_fulltext->execute($url, $title, $fulltext);
        }
    } catch {
        warn "pandia:  caught failure $_\n";
    };

    # Release the parse tree explicitly once its text has been extracted.
    $tree->delete;

    # The skip is decided here, not inside the try block: that block is an
    # anonymous sub, so loop control belongs outside it.
    if ($fetch_failed) {
        $sth_dequeue->execute($url);
        next;
    }

    my @words = split(' ', $fulltext);
    $stemmer->stem_in_place(@words);

    # Count occurrences of each stemmed word on this page.
    my %count_of;
    foreach my $word (@words) {
        next unless defined $word;
        $word =~ s/[^\x00-\x7F]//g;
        next if $word eq '';
        $count_of{$word}++;
    }

    # The page's old keyword rows are removed first, so each distinct word
    # needs exactly one INSERT carrying its final count. This also avoids
    # looking a word up without regard to URL, which would treat a word
    # already stored for another page as present for this one.
    $sth_clear_keywords->execute($url);
    foreach my $word (sort keys %count_of) {
        $sth_add_keyword->execute($word, $url, $url_domain, $count_of{$word});
    }

    # Reached for fetched pages even when processing raised an error (the
    # error has already been reported by the catch block above).
    $sth_mark_done->execute($url);
}

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>