Diff for /pandia/crawler between versions 1.3 and 1.4

version 1.3, 2025/06/27 02:14:47 version 1.4, 2025/06/27 16:20:30
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.4  2025/06/27 16:20:30  snw
   # Add blacklist
   #
 # Revision 1.3  2025/06/27 02:14:47  snw  # Revision 1.3  2025/06/27 02:14:47  snw
 # Initial operational capability  # Initial operational capability
 #  #
Line 32  use HTML::TreeBuilder; Line 35  use HTML::TreeBuilder;
 use URI;  use URI;
 use DBI;  use DBI;
 use WWW::RobotRules;  use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');  use Fcntl qw(:flock);
 use LWP::Simple qw(get);  use LWP::Simple qw(get);
   
   my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";  my $dbh = "";
 my $dsn = "";  my $dsn = "";
 my $skips = 0;  my $skips = 0;
Line 48  my $invalid_scheme_skips = 0; Line 52  my $invalid_scheme_skips = 0;
 sub store_url {  sub store_url {
     my ($url, $parent) = @_;      my ($url, $parent) = @_;
   
     if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {      if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
           
         my $u = URI->new($url);          my $u = URI->new($url);
         my $domain = $u->host;          my $domain = $u->host;  
         my $scheme = $u->scheme;          my $scheme = $u->scheme;
   
         my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");          my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
Line 118  sub crawl_url { Line 122  sub crawl_url {
   
     my $response = $http->get($url);      my $response = $http->get($url);
   
     if(not $response->{success}) {  
         print "pandia:  http failure; skipping $url\n";  
         next;  
     }  
           
     if(exists $response->{redirects}) {  
         print "pandia:  redirects detected; skipping $url\n";  
         next;  
     }  
   
     $tree->parse($response->{content});      $tree->parse($response->{content});
   
Line 187  if($seed ne "") { Line 182  if($seed ne "") {
     print "[OK]\n";      print "[OK]\n";
 }  }
 else {  else {
       open my $file, ">", "pandia_crawler.lock" or die $!; 
       flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   
     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");      my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
     $sth->execute();      $sth->execute();
     my $qlen = $sth->rows;      my $qlen = $sth->rows;

Removed from v.1.3  
changed lines
  Added in v.1.4


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>