# Licensed AGPL-3.0
#
# $Log$
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
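
# Single-character progress codes printed while crawling (see the
# print statements below):
#   .  URL enqueued            d  duplicate URL (insert failed; skipped)
#   l  depth limit reached     x  empty, oversized, or mailto: URL
#   s  non-http(s) scheme      b  blacklisted domain
#   r  denied by robots.txt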
|
|
sub store_url {
    my ($url, $parent) = @_;

    # Ignore empty URLs, URLs too long for the 255-character column,
    # and mailto: links, which cannot be crawled.
    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;

        # Record the URL's domain (a duplicate domain presumably just
        # fails to insert).
        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            # Insert failed (most likely a duplicate): count it as a skip.
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;

            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}
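
# store_url() relies on the database to reject URLs it has already
# seen: a failed INSERT above is simply counted as a skip. A minimal
# schema that behaves this way (hypothetical; the real DDL is not part
# of this file) might look like:
#
#   CREATE TABLE url_domains (
#       url_domain VARCHAR(255) PRIMARY KEY
#   );
#
#   CREATE TABLE crawl_queue (
#       url        VARCHAR(255) PRIMARY KEY,
#       parent_url VARCHAR(255),
#       url_domain VARCHAR(255),
#       scheme     VARCHAR(16)
#   );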
|
|
sub crawl_url {
    my ($url) = @_;
    # ...

    my $u = URI->new($url);

    # Only http and https URLs can be fetched; anything else
    # (ftp:, javascript:, tel:, ...) is skipped outright.
    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }
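
    # Like the blacklist and robots.txt checks below, this early return
    # only bumps a counter and prints a status code; all three counters
    # are reported in the summary at the end of the run.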
|
|
    # Skip any URL whose domain appears in the blacklist table.
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }
|
|
    # ...
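    # The code that builds $robots_url and $robots_txt is elided from
    # this hunk. Judging from the parse/allowed calls below, $rules is
    # most likely a WWW::RobotRules object; the elided fetch would look
    # roughly like this (agent string hypothetical):
    #
    #   use WWW::RobotRules;
    #   use LWP::Simple qw(get);
    #
    #   my $rules      = WWW::RobotRules->new('pandia');
    #   my $robots_url = $u->scheme . "://" . $u->host . "/robots.txt";
    #   my $robots_txt = get($robots_url);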
|
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    # Honour robots.txt: leave this URL alone if it is disallowed.
    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }
|
|
    # ...
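    # Only the tail of the link-extraction loop survives in this hunk.
    # The assignment below keeps an href as-is; the elided sibling
    # branch presumably absolutizes relative links, along the lines of
    # (hypothetical):
    #
    #   $final = URI->new_abs($href, $url)->as_string;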
|
            $final = $href;
        }

        # Queue the discovered link with the current page as its parent.
        store_url($final, $url);
    }
|
|
    # Finished with this page: back out one level of recursion.
    $depth = $depth - 1;
}

# ...

else {
|
    my $total = $inserts + $skips;

    print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
    print " - $blacklist_matches blacklist matches\n";
    print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
    print " - $robots_txt_denies URLs skipped due to robots.txt\n";
|
|
|
|