version 1.4, 2025/06/27 16:20:30
|
version 1.6, 2025/07/01 19:20:47
|
Line 9
|
Line 9
|
# Licensed AGPL-3.0 |
# Licensed AGPL-3.0 |
# |
# |
# $Log$ |
# $Log$ |
|
# Revision 1.6 2025/07/01 19:20:47 snw |
|
# Stop crawling and indexing URLs containing page fragments |
|
# |
|
# Revision 1.5 2025/06/28 00:33:32 snw |
|
# Update locking |
|
# |
# Revision 1.4 2025/06/27 16:20:30 snw |
# Revision 1.4 2025/06/27 16:20:30 snw |
# Add blacklist |
# Add blacklist |
# |
# |
Line 52 my $invalid_scheme_skips = 0;
|
Line 58 my $invalid_scheme_skips = 0;
|
sub store_url { |
sub store_url { |
my ($url, $parent) = @_; |
my ($url, $parent) = @_; |
|
|
|
if (index($url, '#') != -1) { |
|
print "pandia: URL contains a fragment; skipping\n"; |
|
return; |
|
} |
|
|
if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") { |
if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") { |
|
|
my $u = URI->new($url); |
my $u = URI->new($url); |
Line 182 if($seed ne "") {
|
Line 193 if($seed ne "") {
|
print "[OK]\n"; |
print "[OK]\n"; |
} |
} |
else { |
else { |
open my $file, ">", "pandia_crawler.lock" or die $!; |
open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; |
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; |
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; |
|
|
my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); |
my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); |