Diff for /pandia/crawler between versions 1.1 and 1.8

version 1.1, 2025/06/25 13:44:37 version 1.8, 2025/07/02 15:14:44
Line 9 Line 9
 # Licensed AGPL-3.0  # Licensed AGPL-3.0
 #  #
 # $Log$  # $Log$
   # Revision 1.8  2025/07/02 15:14:44  snw
   # Fix bug in restricted mode
   #
   # Revision 1.7  2025/07/02 15:03:05  snw
   # Add support for restricted mode
   #
   # Revision 1.6  2025/07/01 19:20:47  snw
   # Stop crawling and indexing URLs containing page fragments
   #
   # Revision 1.5  2025/06/28 00:33:32  snw
   # Update locking
   #
   # Revision 1.4  2025/06/27 16:20:30  snw
   # Add blacklist
   #
   # Revision 1.3  2025/06/27 02:14:47  snw
   # Initial operational capability
   #
   # Revision 1.2  2025/06/25 19:38:48  snw
   # Add indexer
   #
 # Revision 1.1  2025/06/25 13:44:37  snw  # Revision 1.1  2025/06/25 13:44:37  snw
 # Renaming  # Renaming
 #  #
Line 26  use HTML::TreeBuilder; Line 47  use HTML::TreeBuilder;
 use URI;  use URI;
 use DBI;  use DBI;
 use WWW::RobotRules;  use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');  use Fcntl qw(:flock);
 use LWP::Simple qw(get);  use LWP::Simple qw(get);
   use Config::IniFiles;
   
   my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
 my $dbh = "";  my $dbh = "";
 my $dsn = "";  my $dsn = "";
 my $skips = 0;  my $skips = 0;
 my $inserts = 0;  my $inserts = 0;
 my $seed = "";  my $seed = "";
 my $depth = 0;  my $depth = 0;
   my $blacklist_matches = 0;
   my $robots_txt_denies = 0;
   my $invalid_scheme_skips = 0;
   my $mode;
   
 sub store_url {  sub store_url {
     my ($url) = @_;      my ($url, $parent) = @_;
   
     if($url ne "" && length($url) <= 255) {      if (index($url, '#') != -1) {
         print ".";          print "F";
         my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");          return;
       }
       
       if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
           
           my $u = URI->new($url);
           my $domain = $u->host;  
           my $scheme = $u->scheme;
          my @parts = split(/\./, $domain);   # pattern first, escaped dot: split the host on literal dots
           my $tld = $parts[-1];
           
           if ($mode eq 'restricted') {
               my $tld_ok = 0;
               foreach (@allowed_tlds) {
                   my $allowed = $_;
                   
                   if($tld eq $allowed) {
                       $tld_ok = 1;
                       last;
                   }
               }
               if($tld_ok == 0) {
                   print "T";
                   return;
               }
           }
   
           my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
           $sth->execute($domain);
           
           my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
                           
         $ins->execute($url) or $skips = $skips + 1;          if(not $ins->execute($url, $parent, $domain, $scheme)) {
         $inserts = $inserts + 1;                          $skips = $skips + 1;
         if($depth < $maxdepth) {              print "d";
             $depth = $depth + 1;  
             crawl_url($url);  
         }          }
     }               else {
               print ".";
               $inserts = $inserts + 1;        
               if($depth < $maxdepth) {
                   $depth = $depth + 1;
                   crawl_url($url);
               }
               else {
                   print "l";
               }
           }
       }
       else {
           print "x";
       }
 }  }
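The INSERT and SELECT statements above imply at least three tables: crawl_queue (url, parent_url, url_domain, scheme), url_domains (url_domain) and blacklist (url_domain). The schema itself is not part of this diff, so the bootstrap below is only a sketch under assumed column types and keys; the unique key on crawl_queue.url is what would make a repeated URL fail the INSERT and be counted as a skip, and the 255-character cap mirrors the length check in store_url.

    # Illustrative bootstrap only; column types and keys are assumptions, not taken from the repository.
    $dbh->do(q{
        CREATE TABLE IF NOT EXISTS crawl_queue (
            url        VARCHAR(255) NOT NULL,
            parent_url VARCHAR(255),
            url_domain VARCHAR(255),
            scheme     VARCHAR(16),
            UNIQUE KEY (url)
        )
    });
    $dbh->do(q{ CREATE TABLE IF NOT EXISTS url_domains (url_domain VARCHAR(255) NOT NULL) });
    $dbh->do(q{ CREATE TABLE IF NOT EXISTS blacklist (url_domain VARCHAR(255) PRIMARY KEY) });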
   
 sub crawl_url {  sub crawl_url {
Line 58  sub crawl_url { Line 127  sub crawl_url {
     my $u = URI->new($url);      my $u = URI->new($url);
   
     if ($u->scheme ne "http" && $u->scheme ne "https") {      if ($u->scheme ne "http" && $u->scheme ne "https") {
           $invalid_scheme_skips = $invalid_scheme_skips + 1;
           print "s";
         return;          return;
     }      }
   
     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");      my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
     $sth->execute($u->host);      $sth->execute($u->host);
     if($sth->rows > 0) {      if($sth->rows > 0) {
           print "b";
           $blacklist_matches = $blacklist_matches + 1;
         return;          return;
     }      }
           
Line 73  sub crawl_url { Line 146  sub crawl_url {
     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;      $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
   
     if(!$rules->allowed($url)) {      if(!$rules->allowed($url)) {
           print "r";
           $robots_txt_denies = $robots_txt_denies + 1;
         return;          return;
     }      }
           
Line 83  sub crawl_url { Line 158  sub crawl_url {
     my $tree = HTML::TreeBuilder->new();      my $tree = HTML::TreeBuilder->new();
   
     my $response = $http->get($url);      my $response = $http->get($url);
   
   
     $tree->parse($response->{content});      $tree->parse($response->{content});
   
     my @links = $tree->find_by_tag_name('a');      my @links = $tree->find_by_tag_name('a');
Line 109  sub crawl_url { Line 186  sub crawl_url {
             $final = $href;              $final = $href;
         }          }
                           
         store_url($final);          store_url($final, $url);
     }      }
   
     $depth = $depth - 1;      $depth = $depth - 1;
 }  }
   
 $| = 1;  $| = 1;
   
 print "pandia crawler v0.0.1\n";  print "pandia crawler v0.0.1\n";
 print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";  print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
 GetOptions("dbhost=s" => \$dbhost,  my $profile;
            "dbname=s" => \$dbname,  
            "dbusername=s" => \$dbusername,  GetOptions("profile=s" => \$profile,
            "dbpw=s" => \$dbpw,             "seed=s" => \$seed,
            "seed=s" => \$seed,             "maxdepth=n" =>\$maxdepth)
            "maxdepth=n" =>\$maxdepth)  
     or die("error in command line arguments");      or die("error in command line arguments");
   
   my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");
   
   $dbhost = $cfg->val($profile, 'dbhost');
   $dbname = $cfg->val($profile, 'dbname');
   $dbusername = $cfg->val($profile, 'dbuser');
   $dbpw = $cfg->val($profile, 'dbpass');
   $tmp = $cfg->val($profile, 'allowed_tlds');
   
   if($tmp ne '*') {
       $mode = 'restricted';
       @allowed_tlds = split(',', $tmp);
       print "pandia:  crawler restricted to these TLDs:  ";
       foreach (@allowed_tlds) {
           print ".$_ ";
       }
       print "\n";
   }
   else {
       print "pandia:  crawler unrestricted\n";
       $mode = 'normal';
   }
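The --profile option names a section of /etc/pandia.ini, and the cfg->val() calls above expect the keys dbhost, dbname, dbuser, dbpass and allowed_tlds in that section. A minimal sketch of such a profile follows; the section name and every value are placeholders, not taken from the repository:

    [default]
    dbhost=localhost
    dbname=pandia
    dbuser=pandia
    dbpass=changeme
    # "*" leaves the crawler unrestricted; a comma-separated TLD list switches it into restricted mode
    allowed_tlds=com,org,net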
   
 print "pandia:  connecting to $dbname database at $dbhost...";  print "pandia:  connecting to $dbname database at $dbhost...";
   
 $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";  $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
Line 136  die "pandia:  failed to connect to MySQL Line 233  die "pandia:  failed to connect to MySQL
   
 print "[OK]\n";  print "[OK]\n";
   
 if($seed ne "") {  print "pandia:  each character represents the following status for a URL:\n";
     print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";  print "  .    URL added to indexer queue\n";
   print "  l    crawl exceeded max depth\n";
   print "  x    URL too long or invalid scheme\n";
   print "  d    URL was a duplicate\n";
   print "  b    crawl was blocked by robots.txt\n";
   print "  F    URL contained a fragment\n";
   print "  T    URL was from a disallowed top-level domain\n\n";
   
   if($seed ne "") {        
       print "pandia:  crawling seed $seed to a maximum depth of $maxdepth...";
     sleep 1;      sleep 1;
     crawl_url($seed);      crawl_url($seed);
     print "[OK]\n";      print "[OK]\n";
 }  }
 else {  else {
       open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; 
       flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
   
     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");      my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
     $sth->execute();      $sth->execute();
     my $qlen = $sth->rows;      my $qlen = $sth->rows;
Line 161  else { Line 270  else {
 my $total = $inserts + $skips;  my $total = $inserts + $skips;
   
 print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";  print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
   print "          - $blacklist_matches blacklist matches\n";
   print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
   print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
       
       
