Diff for /pandia/crawler between versions 1.1 and 1.8

version 1.1, 2025/06/25 13:44:37 version 1.8, 2025/07/02 15:14:44
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.8  2025/07/02 15:14:44  snw
   # Fix bug in restricted mode
   #
   # Revision 1.7  2025/07/02 15:03:05  snw
   # Add support for restricted mode
   #
   # Revision 1.6  2025/07/01 19:20:47  snw
   # Stop crawling and indexing URLs containing page fragments
   #
   # Revision 1.5  2025/06/28 00:33:32  snw
   # Update locking
   #
   # Revision 1.4  2025/06/27 16:20:30  snw
   # Add blacklist
   #
   # Revision 1.3  2025/06/27 02:14:47  snw
   # Initial operational capability
   #
   # Revision 1.2  2025/06/25 19:38:48  snw
   # Add indexer
   #
 # Revision 1.1  2025/06/25 13:44:37  snw  # Revision 1.1  2025/06/25 13:44:37  snw
 # Renaming  # Renaming
 #  #
Line 26  use HTML::TreeBuilder; Line 47  use HTML::TreeBuilder;
 use URI;  use URI;
 use DBI;  use DBI;
 use WWW::RobotRules;  use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');  use Fcntl qw(:flock);
 use LWP::Simple qw(get);  use LWP::Simple qw(get);
   use Config::IniFiles;
   
   my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";  my $dbh = "";
 my $dsn = "";  my $dsn = "";
 my $skips = 0;  my $skips = 0;
 my $inserts = 0;  my $inserts = 0;
 my $seed = "";  my $seed = "";
 my $depth = 0;  my $depth = 0;
   my $blacklist_matches = 0;
   my $robots_txt_denies = 0;
   my $invalid_scheme_skips = 0;
   my $mode;
   
 sub store_url {  sub store_url {
     my ($url) = @_;      my ($url, $parent) = @_;
   
     if($url ne "" && length($url) <= 255) {      if (index($url, '#') != -1) {
         print ".";          print "F";
         my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");          return;
       }
       
       if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
           
           my $u = URI->new($url);
           my $domain = $u->host;  
           my $scheme = $u->scheme;
          my @parts = split(/\./, $domain);   # pattern first, escaped dot: split the host on literal dots
           my $tld = $parts[-1];
           
           if ($mode eq 'restricted') {
               my $tld_ok = 0;
               foreach (@allowed_tlds) {
                   my $allowed = $_;
                   
                   if($tld eq $allowed) {
                       $tld_ok = 1;
                       last;
                   }
               }
               if($tld_ok == 0) {
                   print "T";
                   return;
               }
           }
   
           my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
           $sth->execute($domain);
           
           my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
                           
         $ins->execute($url) or $skips = $skips + 1;          if(not $ins->execute($url, $parent, $domain, $scheme)) {
         $inserts = $inserts + 1;                          $skips = $skips + 1;
         if($depth < $maxdepth) {              print "d";
             $depth = $depth + 1;  
             crawl_url($url);  
         }          }
     }               else {
               print ".";
               $inserts = $inserts + 1;        
               if($depth < $maxdepth) {
                   $depth = $depth + 1;
                   crawl_url($url);
               }
               else {
                   print "l";
               }
           }
       }
       else {
           print "x";
       }
 }  }
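The INSERT and SELECT statements above imply at least three tables: crawl_queue (url, parent_url, url_domain, scheme), url_domains (url_domain) and blacklist (url_domain). The schema itself is not part of this diff, so the bootstrap below is only a sketch under assumed column types and keys; the unique key on crawl_queue.url is what would make a repeated URL fail the INSERT and be counted as a skip, and the 255-character cap mirrors the length check in store_url.

    # Illustrative bootstrap only; column types and keys are assumptions, not taken from the repository.
    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS crawl_queue (
            url        VARCHAR(255) NOT NULL,
            parent_url VARCHAR(255),
            url_domain VARCHAR(255),
            scheme     VARCHAR(16),
            UNIQUE KEY (url)
        )
    });
    $dbh->do(q{ CREATE TABLE IF NOT EXISTS url_domains (url_domain VARCHAR(255) NOT NULL) });
    $dbh->do(q{ CREATE TABLE IF NOT EXISTS blacklist (url_domain VARCHAR(255) PRIMARY KEY) });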
   
 sub crawl_url {  sub crawl_url {
Line 58  sub crawl_url { Line 127  sub crawl_url {
     my $u = URI->new($url);      my $u = URI->new($url);
   
     if ($u->scheme ne "http" && $u->scheme ne "https") {      if ($u->scheme ne "http" && $u->scheme ne "https") {
           $invalid_scheme_skips = $invalid_scheme_skips + 1;
           print "s";
         return;          return;
     }      }
   
     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");      my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
     $sth->execute($u->host);      $sth->execute($u->host);
     if($sth->rows > 0) {      if($sth->rows > 0) {
           print "b";
           $blacklist_matches = $blacklist_matches + 1;
         return;          return;
     }      }
           
Line 73  sub crawl_url { Line 146  sub crawl_url {
     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;      $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
   
     if(!$rules->allowed($url)) {      if(!$rules->allowed($url)) {
           print "r";
           $robots_txt_denies = $robots_txt_denies + 1;
         return;          return;
     }      }
           
Line 83  sub crawl_url { Line 158  sub crawl_url {
     my $tree = HTML::TreeBuilder->new();      my $tree = HTML::TreeBuilder->new();
   
     my $response = $http->get($url);      my $response = $http->get($url);
   
   
     $tree->parse($response->{content});      $tree->parse($response->{content});
   
     my @links = $tree->find_by_tag_name('a');      my @links = $tree->find_by_tag_name('a');
Line 109  sub crawl_url { Line 186  sub crawl_url {
             $final = $href;              $final = $href;
         }          }
                           
         store_url($final);          store_url($final, $url);
     }      }
   
     $depth = $depth - 1;      $depth = $depth - 1;
 }  }
   
 $| = 1;  $| = 1;
   
 print "pandia crawler v0.0.1\n";  print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";  print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
 GetOptions("dbhost=s" => \$dbhost,  my $profile;
            "dbname=s" => \$dbname,  
            "dbusername=s" => \$dbusername,  GetOptions("profile=s" => \$profile,
            "dbpw=s" => \$dbpw,             "seed=s" => \$seed,
            "seed=s" => \$seed,             "maxdepth=n" =>\$maxdepth)
            "maxdepth=n" =>\$maxdepth)  
     or die("error in command line arguments");      or die("error in command line arguments");
   
   my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
   
   $dbhost = $cfg->val($profile, 'dbhost');
   $dbname = $cfg->val($profile, 'dbname');
   $dbusername = $cfg->val($profile, 'dbuser');
   $dbpw = $cfg->val($profile, 'dbpass');
   $tmp = $cfg->val($profile, 'allowed_tlds');
   
   if($tmp ne '*') {
       $mode = 'restricted';
       @allowed_tlds = split(',', $tmp);
       print "pandia:  crawler restricted to these TLDs:  ";
       foreach (@allowed_tlds) {
           print ".$_ ";
       }
       print "\n";
   }
   else {
       print "pandia:  crawler unrestricted\n";
       $mode = 'normal';
   }
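The --profile option names a section of /etc/pandia.ini, and the cfg->val() calls above expect the keys dbhost, dbname, dbuser, dbpass and allowed_tlds in that section. A minimal sketch of such a profile follows; the section name and every value are placeholders, not taken from the repository:

    [default]
    dbhost=localhost
    dbname=pandia
    dbuser=pandia
    dbpass=changeme
    # "*" leaves the crawler unrestricted; a comma-separated TLD list switches it into restricted mode
    allowed_tlds=com,org,net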
   
 print "pandia:  connecting to $dbname database at $dbhost...";  print "pandia:  connecting to $dbname database at $dbhost...";
   
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";  $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
Line 136  die "pandia:  failed to connect to MySQL Line 233  die "pandia:  failed to connect to MySQL
   
 print "[OK]\n";  print "[OK]\n";
   
 if($seed ne "") {  print "pandia:  each character represents the following status for a URL:\n";
     print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";  print "  .    URL added to indexer queue\n";
   print "  l    crawl exceeded max depth\n";
   print "  x    URL too long or invalid scheme\n";
   print "  d    URL was a duplicate\n";
   print "  b    crawl was blocked by robots.txt\n";
   print "  F    URL contained a fragment\n";
   print "  T    URL was from a disallowed top-level domain\n\n";
   
   if($seed ne "") {        
       print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
     sleep 1;      sleep 1;
     crawl_url($seed);      crawl_url($seed);
     print "[OK]\n";      print "[OK]\n";
 }  }
 else {  else {
       open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; 
       flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   
     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");      my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
     $sth->execute();      $sth->execute();
     my $qlen = $sth->rows;      my $qlen = $sth->rows;
Line 161  else { Line 270  else {
 my $total = $inserts + $skips;  my $total = $inserts + $skips;
   
 print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";  print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
   print "          - $blacklist_matches blacklist matches\n";
   print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
   print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
       
       
