Diff for /pandia/Attic/crawl.pl between versions 1.1.1.1 and 1.2

version 1.1.1.1, 2025/06/23 23:17:08
version 1.2, 2025/06/25 03:10:01
Line 9
 # Licensed AGPL-3.0
 #
 # $Log$
 # Revision 1.2  2025/06/25 03:10:01  snw
 # Initial working crawler
 #
 # Revision 1.1.1.1  2025/06/23 23:17:08  snw
 # Initial commit
 #
 #
   
    use strict;
    use warnings;
    use Getopt::Long;
    use HTTP::Tiny;
    use HTML::TreeBuilder;
   use URI;
   use DBI;
    use WWW::RobotRules;
    use LWP::Simple qw(get);
   
   my $dbh = "";
   my $dsn = "";
   my $skips = 0;
   my $inserts = 0;
   my $seed = "";
   my $depth = 0;
   
   sub store_url {
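        # Insert one discovered URL into crawl_queue and, while the
        # current depth is still below --maxdepth, crawl it recursively.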
       my ($url) = @_;
   
       if($url ne "" && length($url) <= 255) {
           print ".";
            my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");

            # A failed insert (typically a duplicate key) counts as a
            # skip; only successful inserts count toward $inserts.
            if($ins->execute($url)) {
                $inserts = $inserts + 1;
            }
            else {
                $skips = $skips + 1;
            }
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
       }     
   }
   
   sub crawl_url {
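        # Fetch one page, if its scheme, the domain blacklist, and
        # robots.txt all allow it, then resolve and enqueue every
        # <a href="..."> link found in the document.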
       my ($url) = @_;
   
       my $u = URI->new($url);
   
       if ($u->scheme ne "http" && $u->scheme ne "https") {
           return;
       }
   
       my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
       $sth->execute($u->host);
       if($sth->rows > 0) {
           return;
       }
       
        # Fetch and parse this host's robots.txt before asking $rules
        # whether we may crawl this URL.
        my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
   
       my $robots_txt = get $robots_url;
       $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
   
       if(!$rules->allowed($url)) {
           return;
       }
       
        my $origurl = $url;

        # Capture scheme://host[:port] (everything before the first
        # path slash) as the base for resolving relative links; the
        # lexical keeps recursive calls from clobbering our base.
        my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
       
       my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
       my $tree = HTML::TreeBuilder->new();
   
        my $response = $http->get($url);
        return unless $response->{success};
        $tree->parse($response->{content});
   
       my @links = $tree->find_by_tag_name('a');
   
       my $href = "";
       my $firstchar = "";
       my $final = "";
       
       foreach my $link (@links) {
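            # Normalize each href against the page's base URL: rooted
            # paths and bare fragments get glued onto $baseurl, and
            # anything else is assumed to already be absolute.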
            $href = $link->attr('href');
            next unless defined $href && $href ne '';
            $firstchar = substr($href, 0, 1);
            $final = "";
   
           if($firstchar eq '/') {
               $final = $baseurl . $href;     
           }
           elsif($href eq '##') {
               $final = $baseurl;
           }
           elsif($firstchar eq '#') {
               $final = $baseurl . '/' . $href;
           }
           else {
               $final = $href;
           }
               
           store_url($final);
       }
   
       $depth = $depth - 1;
   }
   
    $| = 1;  # unbuffer STDOUT so the progress dots appear immediately
   
   print "pandia crawler v0.0.1\n";
   print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
   GetOptions("dbhost=s" => \$dbhost,
              "dbname=s" => \$dbname,
              "dbusername=s" => \$dbusername,
              "dbpw=s" => \$dbpw,
              "seed=s" => \$seed,
              "maxdepth=n" =>\$maxdepth)
       or die("error in command line arguments");
   
   print "pandia:  connecting to $dbname database at $dbhost...";
   
   $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
   $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
   die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
   
   print "[OK]\n";
   
   if($seed ne "") {
       print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
       sleep 1;
       crawl_url($seed);
       print "[OK]\n";
   }
   else {
       my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
       $sth->execute();
        my $qlen = $sth->rows;

        print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
       sleep 1;
        while (my @row = $sth->fetchrow_array()) {
            my $url = $row[0];
            crawl_url($url);
       }
       print "[OK]\n";
   }
   
   
   my $total = $inserts + $skips;
   
   print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
