Diff for /pandia/Pandia.pm between versions 1.2 and 1.3

version 1.2, 2025/06/30 02:18:44 version 1.3, 2025/07/01 06:48:03
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.3  2025/07/01 06:48:03  snw
   # Updates
   #
 # Revision 1.2  2025/06/30 02:18:44  snw  # Revision 1.2  2025/06/30 02:18:44  snw
 # Updates  # Updates
 #  #
Line 36  use POSIX qw(strftime); Line 39  use POSIX qw(strftime);
   
 my $indices_waiting : shared;  my $indices_waiting : shared;
   
 sub index {  sub do_index {
     my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;      my ($url, $domain, $dsn, $dbuser, $dbpass, $reindex) = @_;
   
     print "pandia:  thread connecting to MySQL database...";      print "pandia:  thread connecting to MySQL database...";
Line 55  sub index { Line 58  sub index {
     my $head;      my $head;
     print "pandia:  HEAD $url\n";      print "pandia:  HEAD $url\n";
     $head = $http->head($url);      $head = $http->head($url);
   
     if(not $head->{success}) {                    if(not $head->{success}) {              
         print "pandia:  HEAD fail $url\n";          print "pandia:  HEAD fail $url\n";
   
           my $sthh = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?");
           $sthh->execute($url);
           $sthh->finish();
         goto nodb_cleanup;          goto nodb_cleanup;
     }      }
     else {      else {
Line 206  sub index { Line 214  sub index {
     $indices_waiting = $indices_waiting - 1;      $indices_waiting = $indices_waiting - 1;
 }  }
   
 sub new {  sub blacklist_add {
     my ($class, $args) = @_;      my ($self, $domain) = @_;
   
       print "pandia:  connecting to database...";
       my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
       die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
       print "[OK]\n";
   
       print "pandia:  blacklisting domain $domain...";
       my $sth = $dbh->prepare("INSERT INTO blacklist (url_domain) VALUES (?)");
       $sth->execute($domain);
       print "[OK]\n";
           
     my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");      print "pandia:  removing blacklisted items from crawl queue...";
       $sth = $dbh->prepare("DELETE crawl_queue FROM crawl_queue JOIN blacklist ON crawl_queue.url_domain=blacklist.url_domain");
       $sth->execute();
       print "[OK]\n";
       
       print "pandia:  removing blacklisted items from index...";
       $sth = $dbh->prepare("DELETE url_fulltext FROM url_fulltext JOIN blacklist ON url_fulltext.url_domain=blacklist.url_domain");
       $sth->execute();
       print "[OK]\n";
   
     my $thost = $cfg->val($args->{profile}, 'dbhost');      $sth->finish();
     my $tname = $cfg->val($args->{profile}, 'dbname');      $dbh->disconnect();
     my $tuser = $cfg->val($args->{profile}, 'dbuser');  }
     my $tpass = $cfg->val($args->{profile}, 'dbpass');  
     my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');  
     my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');  
   
     $indices_waiting = $tindex_workers;  sub blacklist_remove {
       my ($self, $domain) = @_;
   
       print "pandia:  connecting to database...";
       my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
       die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
       print "[OK]\n";
   
       my $sth = $dbh->prepare("DELETE FROM blacklist WHERE url_domain=?");
       $sth->execute($domain);
   
       $sth->finish();
       $dbh->disconnect();        
   }
   
   sub index_serial {
       my ($self) = @_;
   
       my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
           
     my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";      my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE analyzed=0");
       $sth->execute();
   
       while (my $hashref = $sth->fetchrow_hashref()) {
           do_index $hashref->{url}, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
       }
   
       $sth->finish();
       $dbh->disconnect();
   }
   
   sub index_one {
       my ($self, $url) = @_;
   
       my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
           
     my $self = bless {      my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url=? LIMIT 1");
         profile => $args->{profile},      $sth->execute($url);
         dbhost => $thost,  
         dbname => $tname,      while (my $hashref = $sth->fetchrow_hashref()) {
         dbuser => $tuser,          do_index $url, $hashref->{url_domain}, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
         dbpass => $tpass,      }
         dsn => $tdsn,  
         index_workers => $tindex_workers,      $sth->finish();
         crawl_workers => $tcrawl_workers,      $dbh->disconnect();
         index_pool => Thread::Pool->new(  }
             {  
                 workers => $tindex_workers,  sub index_domain {
                 do => \&index      my ($self, $domain) = @_;
             }  
             )      my $dbh = DBI->connect($self->{dsn}, $self->{dbuser}, $self->{dbpass}, {RaiseError => 1, PrintError => 0});
     }, $class;      
       my $sth = $dbh->prepare("SELECT url, url_domain FROM crawl_queue WHERE url_domain=?");
       $sth->execute($domain);
   
       while (my $hashref = $sth->fetchrow_hashref()) {
           do_index $hashref->{url}, $domain, $self->{dsn}, $self->{dbuser}, $self->{dbpass}, 0;
       }
   
       $sth->finish();
       $dbh->disconnect();
   
     return $self;  
 }  }
   
 sub run_index_batch {      sub run_index_batch {    
Line 334  sub run_reindex_batch { Line 397  sub run_reindex_batch {
                   
 }  }
   
   sub new {
       my ($class, $args) = @_;
       
       my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
   
       my $thost = $cfg->val($args->{profile}, 'dbhost');
       my $tname = $cfg->val($args->{profile}, 'dbname');
       my $tuser = $cfg->val($args->{profile}, 'dbuser');
       my $tpass = $cfg->val($args->{profile}, 'dbpass');
       my $tindex_workers = $cfg->val($args->{profile}, 'index_workers');
       my $tcrawl_workers = $cfg->val($args->{profile}, 'crawl_workers');
   
       $indices_waiting = $tindex_workers;
       
       my $tdsn = "DBI:mysql:database=$tname;host=$thost;port=3306;mysql_connect_timeout=5;";
       
       my $self = bless {
           profile => $args->{profile},
           dbhost => $thost,
           dbname => $tname,
           dbuser => $tuser,
           dbpass => $tpass,
           dsn => $tdsn,
           index_workers => $tindex_workers,
           crawl_workers => $tcrawl_workers,
           index_pool => Thread::Pool->new(
               {
                   workers => $tindex_workers,
                   do => \&do_index
               }
               )
       }, $class;
   
       return $self;
   }
   
   
 1;  1;
   

Removed from v.1.2  
changed lines
  Added in v.1.3


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>