--- pandia/Attic/crawl.pl	2025/06/23 23:17:08	1.1
+++ pandia/Attic/crawl.pl	2025/06/25 03:10:01	1.2
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawl.pl,v 1.1 2025/06/23 23:17:08 snw Exp $
+# $Id: crawl.pl,v 1.2 2025/06/25 03:10:01 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
 #
@@ -9,9 +9,171 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawl.pl,v $
-# Revision 1.1  2025/06/23 23:17:08  snw
-# Initial revision
+# Revision 1.2  2025/06/25 03:10:01  snw
+# Initial working crawler
+#
+# Revision 1.1.1.1  2025/06/23 23:17:08  snw
+# Initial commit
 #
 #
+use strict;
+use warnings;
+
+use Getopt::Long;
 use HTTP::Tiny;
 use HTML::TreeBuilder;
+use URI;
+use DBI;
+use WWW::RobotRules;
+use LWP::Simple qw(get);
+
+# robots.txt rules, keyed to the crawler's user agent string
+my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
+
+my $dbh = "";
+my $dsn = "";
+my $skips = 0;
+my $inserts = 0;
+my $seed = "";
+my $depth = 0;
+my $maxdepth = 0;
+my ($dbhost, $dbname, $dbusername, $dbpw);
+
+# enqueue a URL for later analysis, recursing into it if the
+# maximum crawl depth has not yet been reached
+sub store_url {
+    my ($url) = @_;
+
+    # crawl_queue.url is assumed to be a VARCHAR(255) column
+    if($url ne "" && length($url) <= 255) {
+        print ".";
+        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
+
+        if($ins->execute($url)) {
+            $inserts = $inserts + 1;
+        }
+        else {
+            # duplicate or otherwise failed insert; count as a skip
+            $skips = $skips + 1;
+        }
+
+        if($depth < $maxdepth) {
+            $depth = $depth + 1;
+            crawl_url($url);
+            $depth = $depth - 1;
+        }
+    }
+}
+
+# fetch a page, honoring robots.txt and the domain blacklist,
+# and enqueue every link found on it
+sub crawl_url {
+    my ($url) = @_;
+
+    my $u = URI->new($url);
+
+    if(!defined $u->scheme || ($u->scheme ne "http" && $u->scheme ne "https")) {
+        return;
+    }
+
+    # skip blacklisted domains
+    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
+    $sth->execute($u->host);
+    if($sth->rows > 0) {
+        return;
+    }
+
+    # fetch and obey robots.txt for this host
+    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
+
+    my $robots_txt = get $robots_url;
+    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
+
+    if(!$rules->allowed($url)) {
+        return;
+    }
+
+    # scheme-and-host prefix against which relative links are resolved
+    my $origurl = $url;
+    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
+
+    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
+    my $tree = HTML::TreeBuilder->new();
+
+    my $response = $http->get($url);
+    return unless $response->{success};
+
+    $tree->parse($response->{content});
+    $tree->eof();
+
+    my @links = $tree->find_by_tag_name('a');
+
+    foreach my $link (@links) {
+        my $href = $link->attr('href');
+        next unless defined $href;
+
+        my $firstchar = substr($href, 0, 1);
+        my $final = "";
+
+        if($firstchar eq '/') {
+            $final = $baseurl . $href;
+        }
+        elsif($href eq '##') {
+            $final = $baseurl;
+        }
+        elsif($firstchar eq '#') {
+            $final = $baseurl . '/' . $href;
+        }
+        else {
+            $final = $href;
+        }
+
+        store_url($final);
+    }
+}
+
+$| = 1;
+
+print "pandia crawler v0.0.1\n";
+print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
+
+GetOptions("dbhost=s" => \$dbhost,
+           "dbname=s" => \$dbname,
+           "dbusername=s" => \$dbusername,
+           "dbpw=s" => \$dbpw,
+           "seed=s" => \$seed,
+           "maxdepth=i" => \$maxdepth)
+    or die("error in command line arguments");
+
+print "pandia: connecting to $dbname database at $dbhost...";
+
+$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
+$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
+die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
+
+print "[OK]\n";
+
+if($seed ne "") {
+    # seed mode: start from a single URL
+    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
+    sleep 1;
+    crawl_url($seed);
+    print "[OK]\n";
+}
+else {
+    # queue mode: crawl every URL already enqueued
+    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
+    $sth->execute();
+    my $qlen = $sth->rows;
+
+    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
+    sleep 1;
+    while (my @row = $sth->fetchrow_array()) {
+        my $url = $row[0];
+        crawl_url($url);
+    }
+    print "[OK]\n";
+}
+
+my $total = $inserts + $skips;
+print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
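
Note on the schema this patch assumes: crawl.pl reads and writes two MySQL tables, crawl_queue and blacklist, which the patch itself never creates. The sketch below is a hypothetical helper (not part of pandia); the column definitions are inferred from the queries above, not taken from the project. url must fit the 255-character cap that store_url enforces, and a UNIQUE key on url makes a re-seen URL fail its INSERT, which store_url counts as a skip rather than an error.

#!/usr/bin/env perl
# setup-tables.pl - hypothetical helper, NOT part of this patch:
# creates the two tables crawl.pl expects. Schema is inferred
# from the queries in crawl.pl, not taken from pandia itself.
use strict;
use warnings;
use DBI;

my ($dbhost, $dbname, $dbusername, $dbpw) = @ARGV;
die "usage: setup-tables.pl dbhost dbname dbusername dbpw\n" unless defined $dbpw;

my $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306";
my $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 1});

# UNIQUE KEY on url means a duplicate enqueue fails its INSERT,
# which store_url() counts as a skip rather than an error
$dbh->do(q{
    CREATE TABLE IF NOT EXISTS crawl_queue (
        url VARCHAR(255) NOT NULL,
        UNIQUE KEY (url)
    )
});

# one row per domain the crawler must never visit
$dbh->do(q{
    CREATE TABLE IF NOT EXISTS blacklist (
        url_domain VARCHAR(255) NOT NULL PRIMARY KEY
    )
});

$dbh->disconnect;
print "pandia: schema ready in $dbname on $dbhost\n";

With the tables in place, a typical seed run looks like the following (option names are from the GetOptions spec above; host, credentials, and seed values are placeholders):

  perl crawl.pl --dbhost=localhost --dbname=pandia --dbusername=pandia --dbpw=secret --seed=https://example.com/ --maxdepth=2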