Diff for /pandia/Attic/crawl.pl between versions 1.1.1.1 and 1.2

version 1.1.1.1, 2025/06/23 23:17:08
version 1.2, 2025/06/25 03:10:01
Line 9
 # Licensed AGPL-3.0
 #
 # $Log$
 # Revision 1.2  2025/06/25 03:10:01  snw
 # Initial working crawler
 #
 # Revision 1.1.1.1  2025/06/23 23:17:08  snw
 # Initial commit
 #
 #
   
    use strict;
    use warnings;
    use Getopt::Long;
    use HTTP::Tiny;
    use HTML::TreeBuilder;
   use URI;
   use DBI;
    use WWW::RobotRules;
    use LWP::Simple qw(get);
   
   my $dbh = "";
   my $dsn = "";
   my $skips = 0;
   my $inserts = 0;
   my $seed = "";
   my $depth = 0;
   
   sub store_url {
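        # Insert one discovered URL into crawl_queue and, while the
        # current depth is still below --maxdepth, crawl it recursively.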
       my ($url) = @_;
   
       if($url ne "" && length($url) <= 255) {
           print ".";
            my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");

            # A failed insert (typically a duplicate key) counts as a
            # skip; only successful inserts count toward $inserts.
            if($ins->execute($url)) {
                $inserts = $inserts + 1;
            }
            else {
                $skips = $skips + 1;
            }
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
       }     
   }
   
   sub crawl_url {
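        # Fetch one page, if its scheme, the domain blacklist, and
        # robots.txt all allow it, then resolve and enqueue every
        # <a href="..."> link found in the document.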
       my ($url) = @_;
   
       my $u = URI->new($url);
   
       if ($u->scheme ne "http" && $u->scheme ne "https") {
           return;
       }
   
       my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
       $sth->execute($u->host);
       if($sth->rows > 0) {
           return;
       }
       
        # Fetch and parse this host's robots.txt before asking $rules
        # whether we may crawl this URL.
        my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
   
       my $robots_txt = get $robots_url;
       $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
   
       if(!$rules->allowed($url)) {
           return;
       }
       
        my $origurl = $url;

        # Capture scheme://host[:port] (everything before the first
        # path slash) as the base for resolving relative links; the
        # lexical keeps recursive calls from clobbering our base.
        my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
       
       my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
       my $tree = HTML::TreeBuilder->new();
   
        my $response = $http->get($url);
        return unless $response->{success};
        $tree->parse($response->{content});
   
       my @links = $tree->find_by_tag_name('a');
   
       my $href = "";
       my $firstchar = "";
       my $final = "";
       
       foreach my $link (@links) {
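            # Normalize each href against the page's base URL: rooted
            # paths and bare fragments get glued onto $baseurl, and
            # anything else is assumed to already be absolute.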
            $href = $link->attr('href');
            next unless defined $href && $href ne '';
            $firstchar = substr($href, 0, 1);
            $final = "";
   
           if($firstchar eq '/') {
               $final = $baseurl . $href;     
           }
           elsif($href eq '##') {
               $final = $baseurl;
           }
           elsif($firstchar eq '#') {
               $final = $baseurl . '/' . $href;
           }
           else {
               $final = $href;
           }
               
           store_url($final);
       }
   
       $depth = $depth - 1;
   }
   
    $| = 1;  # unbuffer STDOUT so the progress dots appear immediately
   
   print "pandia crawler v0.0.1\n";
   print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
   
   GetOptions("dbhost=s" => \$dbhost,
              "dbname=s" => \$dbname,
              "dbusername=s" => \$dbusername,
              "dbpw=s" => \$dbpw,
              "seed=s" => \$seed,
              "maxdepth=n" =>\$maxdepth)
       or die("error in command line arguments");
   
   print "pandia:  connecting to $dbname database at $dbhost...";
   
   $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
   $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
   die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
   
   print "[OK]\n";
   
   if($seed ne "") {
       print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
       sleep 1;
       crawl_url($seed);
       print "[OK]\n";
   }
   else {
       my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
       $sth->execute();
        my $qlen = $sth->rows;

        print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
       sleep 1;
        while (my @row = $sth->fetchrow_array()) {
            my $url = $row[0];
            crawl_url($url);
       }
       print "[OK]\n";
   }
   
   
   my $total = $inserts + $skips;
   
   print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
