# Licensed AGPL-3.0
#
# $Log$
# Revision 1.2 2025/06/25 03:10:01 snw
# Initial working crawler
#
# Revision 1.1.1.1 2025/06/23 23:17:08 snw
# Initial commit
#
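#
# Usage (option names as defined by GetOptions below; the script file name
# shown here is only illustrative):
#   perl pandia.pl --dbhost=HOST --dbname=DB --dbusername=USER --dbpw=PASSWORD \
#                  [--seed=URL] [--maxdepth=N]
#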
|
|
|
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use LWP::Simple qw(get);

# robots.txt rule store, identified by this crawler's user-agent string
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
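
# Assumed MySQL schema (inferred from the queries in this script; the actual
# DDL is not part of this file):
#   crawl_queue - a 'url' column of at most 255 characters; duplicate inserts
#                 fail and are counted as skips
#   blacklist   - a 'url_domain' column naming domains that must not be crawled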
|
|
|
# run counters and database connection state
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;

# command-line options (filled in by GetOptions below)
my ($dbhost, $dbname, $dbusername, $dbpw);
my $maxdepth = 0;
|
|
|
# store_url($url)
# Insert a discovered URL into the crawl_queue table and, while the current
# depth is still below --maxdepth, crawl it immediately.
sub store_url {
    my ($url) = @_;

    if($url ne "" && length($url) <= 255) {
        print ".";
        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");

        # a failed insert (typically a URL that is already queued) counts as a skip
        if($ins->execute($url)) {
            $inserts = $inserts + 1;
        }
        else {
            $skips = $skips + 1;
        }

        if($depth < $maxdepth) {
            $depth = $depth + 1;
            crawl_url($url);
            $depth = $depth - 1;
        }
    }
}
|
|
|
# crawl_url($url)
# Fetch a single page (honouring the domain blacklist and robots.txt),
# extract its links, and hand each one to store_url().
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    # only http and https URLs are crawled
    my $scheme = $u->scheme;
    if (!defined $scheme || ($scheme ne "http" && $scheme ne "https")) {
        return;
    }

    # skip domains listed in the blacklist table
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        return;
    }

    # fetch and parse the site's robots.txt, then check that this URL is allowed
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        return;
    }

    # base URL (scheme://host) used to resolve relative links below
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);
    return unless $response->{success};
    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    foreach my $link (@links) {
        my $href = $link->attr('href');
        next unless defined $href;

        my $firstchar = substr($href, 0, 1);
        my $final = "";

        if($firstchar eq '/') {
            # root-relative link
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            # fragment-only link: anchor it to the base URL
            $final = $baseurl . '/' . $href;
        }
        else {
            # assume the link is already absolute
            $final = $href;
        }

        store_url($final);
    }
}
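
#
# main program: parse command-line options, connect to MySQL, then crawl
# either the --seed URL or the existing contents of the crawl_queue table.
#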
|
|
|
$| = 1;    # autoflush STDOUT so progress dots appear immediately

print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";
|
|
|
if($seed ne "") {
    # seed mode: crawl a single URL passed on the command line
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    # queue mode: crawl every URL already stored in crawl_queue
    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}
|
|
|
|
|
my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";