--- pandia/crawler 2025/06/27 16:20:30 1.4 +++ pandia/crawler 2025/07/01 19:20:47 1.6 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # -# $Id: crawler,v 1.4 2025/06/27 16:20:30 snw Exp $ +# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $ # Copyright (C) 2025 Coherent Logic Development LLC # # Author: Serena Willis @@ -9,6 +9,12 @@ # Licensed AGPL-3.0 # # $Log: crawler,v $ +# Revision 1.6 2025/07/01 19:20:47 snw +# Stop crawling and indexing URLs containing page fragments +# +# Revision 1.5 2025/06/28 00:33:32 snw +# Update locking +# # Revision 1.4 2025/06/27 16:20:30 snw # Add blacklist # @@ -52,6 +58,11 @@ my $invalid_scheme_skips = 0; sub store_url { my ($url, $parent) = @_; + if (index($url, '#') != -1) { + print "pandia: URL contains a fragment; skipping\n"; + return; + } + if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") { my $u = URI->new($url); @@ -182,7 +193,7 @@ if($seed ne "") { print "[OK]\n"; } else { - open my $file, ">", "pandia_crawler.lock" or die $!; + open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; my $sth = $dbh->prepare("SELECT url FROM crawl_queue");