version 1.2, 2025/06/25 19:38:48
|
version 1.4, 2025/06/27 16:20:30
|
Line 9
|
Line 9
|
# Licensed AGPL-3.0 |
# Licensed AGPL-3.0 |
# |
# |
# $Log$ |
# $Log$ |
|
# Revision 1.4 2025/06/27 16:20:30 snw |
|
# Add blacklist |
|
# |
|
# Revision 1.3 2025/06/27 02:14:47 snw |
|
# Initial operational capability |
|
# |
# Revision 1.2 2025/06/25 19:38:48 snw |
# Revision 1.2 2025/06/25 19:38:48 snw |
# Add indexer |
# Add indexer |
# |
# |
Line 29 use HTML::TreeBuilder;
|
Line 35 use HTML::TreeBuilder;
|
use URI; |
use URI; |
use DBI; |
use DBI; |
use WWW::RobotRules; |
use WWW::RobotRules; |
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1'); |
use Fcntl qw(:flock); |
use LWP::Simple qw(get); |
use LWP::Simple qw(get); |
|
|
|
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1'); |
my $dbh = ""; |
my $dbh = ""; |
my $dsn = ""; |
my $dsn = ""; |
my $skips = 0; |
my $skips = 0; |
Line 45 my $invalid_scheme_skips = 0;
|
Line 52 my $invalid_scheme_skips = 0;
|
sub store_url { |
sub store_url { |
my ($url, $parent) = @_; |
my ($url, $parent) = @_; |
|
|
if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") { |
if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") { |
|
|
my $u = URI->new($url); |
my $u = URI->new($url); |
my $domain = $u->host; |
my $domain = $u->host; |
my $scheme = $u->scheme; |
my $scheme = $u->scheme; |
|
|
my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)"); |
my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)"); |
Line 114 sub crawl_url {
|
Line 121 sub crawl_url {
|
my $tree = HTML::TreeBuilder->new(); |
my $tree = HTML::TreeBuilder->new(); |
|
|
my $response = $http->get($url); |
my $response = $http->get($url); |
|
|
|
|
$tree->parse($response->{content}); |
$tree->parse($response->{content}); |
|
|
my @links = $tree->find_by_tag_name('a'); |
my @links = $tree->find_by_tag_name('a'); |
Line 147 sub crawl_url {
|
Line 156 sub crawl_url {
|
} |
} |
|
|
$| = 1; |
$| = 1; |
|
|
print "pandia crawler v0.0.1\n"; |
print "pandia crawler v0.0.1\n"; |
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; |
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; |
|
|
Line 174 if($seed ne "") {
|
Line 182 if($seed ne "") {
|
print "[OK]\n"; |
print "[OK]\n"; |
} |
} |
else { |
else { |
|
open my $file, ">", "pandia_crawler.lock" or die $!; |
|
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; |
|
|
my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); |
my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); |
$sth->execute(); |
$sth->execute(); |
my $qlen = $sth->rows; |
my $qlen = $sth->rows; |