Reviewed copy of the pandia/Pandia.pm 1.2 -> 1.3 diff.  Line breaks and
indentation were restored from the hunk counts.  Added lines were amended:
connect error message, explicit do_index() calls, config-load check and sub
header comments; the "+" counts of the last two hunks are adjusted to match.

--- pandia/Pandia.pm 2025/06/30 02:18:44 1.2
+++ pandia/Pandia.pm 2025/07/01 06:48:03 1.3
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 #
-# $Id: Pandia.pm,v 1.2 2025/06/30 02:18:44 snw Exp $
+# $Id: Pandia.pm,v 1.3 2025/07/01 06:48:03 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
@@ -9,6 +9,9 @@
 # Licensed AGPL-3.0
 #
 # $Log: Pandia.pm,v $
+# Revision 1.3 2025/07/01 06:48:03 snw
+# Updates
+#
 # Revision 1.2 2025/06/30 02:18:44 snw
 # Updates
 #
@@ -36,7 +39,7 @@ use POSIX qw(strftime);
 
 my $indices_waiting : shared;
 
-sub index {
+sub do_index {
     my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;
 
     print "pandia: thread connecting to MySQL database...";
@@ -55,8 +58,13 @@ sub index {
     my $head;
     print "pandia: HEAD $url\n";
     $head = $http->head($url);
+
     if(not $head->{success}) {
         print "pandia: HEAD fail $url\n";
+
+        my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
+        $sthh->execute($url);
+        $sthh->finish();
         goto nodb_cleanup;
     }
     else {
@@ -206,40 +214,108 @@ sub index {
     $indices_waiting = $indices_waiting - 1;
 }
 
-sub new {
-    my ($class, $args) = @_;
+# blacklist_add($domain): insert $domain into the blacklist table, then delete
+# every crawl_queue and url_fulltext row whose url_domain is blacklisted.
+# Opens its own DBI connection; RaiseError makes any DBI failure fatal.
+sub blacklist_add {
+    my ($self, $domain) = @_;
+
+    print "pandia: connecting to database...";
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
+    die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
+    print "[OK]\n";
+
+    print "pandia: blacklisting domain $domain...";
+    my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
+    $sth->execute($domain);
+    print "[OK]\n";
 
-    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
+    print "pandia: removing blacklisted items from crawl queue...";
+    $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
+    $sth->execute();
+    print "[OK]\n";
+
+    print "pandia: removing blacklisted items from index...";
+    $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
+    $sth->execute();
+    print "[OK]\n";
 
-    my $thost = $cfg->val($args->{profile}, 'dbhost');
-    my $tname = $cfg->val($args->{profile}, 'dbname');
-    my $tuser = $cfg->val($args->{profile}, 'dbuser');
-    my $tpass = $cfg->val($args->{profile}, 'dbpass');
-    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
-    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');
+    $sth->finish();
+    $dbh->disconnect();
+}
 
-    $indices_waiting = $tindex_workers;
+# blacklist_remove($domain): delete $domain from the blacklist table.
+# Rows already purged by blacklist_add are not restored.
+sub blacklist_remove {
+    my ($self, $domain) = @_;
+
+    print "pandia: connecting to database...";
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
+    die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
+    print "[OK]\n";
+
+    my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
+    $sth->execute($domain);
+
+    $sth->finish();
+    $dbh->disconnect();
+}
+
+# index_serial(): run do_index, one URL at a time in the calling thread, for
+# every crawl_queue row with analyzed=0.
+# NOTE(review): do_index ends by decrementing the shared $indices_waiting;
+# confirm that is intended for these non-pooled calls.
+sub index_serial {
+    my ($self) = @_;
+
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
 
-    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
+    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
+    $sth->execute();
+
+    while (my $hashref = $sth->fetchrow_hashref()) {
+        do_index($hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
+    }
+
+    $sth->finish();
+    $dbh->disconnect();
+}
+
+# index_one($url): run do_index for $url if it is present in crawl_queue;
+# indexes nothing when the URL is not queued.
+sub index_one {
+    my ($self, $url) = @_;
+
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
 
-    my $self = bless {
-        profile => $args->{profile},
-        dbhost => $thost,
-        dbname => $tname,
-        dbuser => $tuser,
-        dbpass => $tpass,
-        dsn => $tdsn,
-        index_workers => $tindex_workers,
-        crawl_workers => $tcrawl_workers,
-        index_pool => Thread::Pool->new(
-            {
-                workers => $tindex_workers,
-                do => \&index
-            }
-        )
-    }, $class;
+    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
+    $sth->execute($url);
+
+    while (my $hashref = $sth->fetchrow_hashref()) {
+        do_index($url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
+    }
+
+    $sth->finish();
+    $dbh->disconnect();
+}
+
+# index_domain($domain): run do_index for every crawl_queue row whose
+# url_domain equals $domain.
+sub index_domain {
+    my ($self, $domain) = @_;
+
+    my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
+
+    my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
+    $sth->execute($domain);
+
+    while (my $hashref = $sth->fetchrow_hashref()) {
+        do_index($hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0);
+    }
+
+    $sth->finish();
+    $dbh->disconnect();
 
-    return $self;
 }
 
 sub run_index_batch {
@@ -334,5 +410,46 @@ sub run_reindex_batch {
 
 }
 
+# new({ profile => NAME }): read section NAME of /etc/pandia.ini, build the
+# MySQL DSN, set $indices_waiting to index_workers and create the index
+# thread pool whose workers run do_index.  Dies if the ini file cannot be read.
+sub new {
+    my ($class, $args) = @_;
+
+    my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini")
+        or die "pandia: cannot read /etc/pandia.ini: @Config::IniFiles::errors\n";
+
+    my $thost = $cfg->val($args->{profile}, 'dbhost');
+    my $tname = $cfg->val($args->{profile}, 'dbname');
+    my $tuser = $cfg->val($args->{profile}, 'dbuser');
+    my $tpass = $cfg->val($args->{profile}, 'dbpass');
+    my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
+    my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');
+
+    $indices_waiting = $tindex_workers;
+
+    my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
+
+    my $self = bless {
+        profile => $args->{profile},
+        dbhost => $thost,
+        dbname => $tname,
+        dbuser => $tuser,
+        dbpass => $tpass,
+        dsn => $tdsn,
+        index_workers => $tindex_workers,
+        crawl_workers => $tcrawl_workers,
+        index_pool => Thread::Pool->new(
+            {
+                workers => $tindex_workers,
+                do => \&do_index
+            }
+        )
+    }, $class;
+
+    return $self;
+}
+
+
 
 1;