--- pandia/crawler 2025/06/25 19:38:48 1.2 +++ pandia/crawler 2025/07/01 19:20:47 1.6 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # -# $Id: crawler,v 1.2 2025/06/25 19:38:48 snw Exp $ +# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $ # Copyright (C) 2025 Coherent Logic Development LLC # # Author: Serena Willis @@ -9,6 +9,18 @@ # Licensed AGPL-3.0 # # $Log: crawler,v $ +# Revision 1.6 2025/07/01 19:20:47 snw +# Stop crawling and indexing URLs containing page fragments +# +# Revision 1.5 2025/06/28 00:33:32 snw +# Update locking +# +# Revision 1.4 2025/06/27 16:20:30 snw +# Add blacklist +# +# Revision 1.3 2025/06/27 02:14:47 snw +# Initial operational capability +# # Revision 1.2 2025/06/25 19:38:48 snw # Add indexer # @@ -29,9 +41,10 @@ use HTML::TreeBuilder; use URI; use DBI; use WWW::RobotRules; -my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1'); +use Fcntl qw(:flock); use LWP::Simple qw(get); +my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1'); my $dbh = ""; my $dsn = ""; my $skips = 0; @@ -45,10 +58,15 @@ my $invalid_scheme_skips = 0; sub store_url { my ($url, $parent) = @_; - if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") { - + if (index($url, '#') != -1) { + print "pandia: URL contains a fragment; skipping\n"; + return; + } + + if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") { + my $u = URI->new($url); - my $domain = $u->host; + my $domain = $u->host; my $scheme = $u->scheme; my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)"); @@ -114,6 +132,8 @@ sub crawl_url { my $tree = HTML::TreeBuilder->new(); my $response = $http->get($url); + + $tree->parse($response->{content}); my @links = $tree->find_by_tag_name('a'); @@ -147,7 +167,6 @@ sub crawl_url { } $| = 1; - print "pandia crawler v0.0.1\n"; print " Copyright (C) 2025 Coherent Logic Development LLC\n\n"; @@ -174,6 +193,9 @@ if($seed ne "") { print "[OK]\n"; } else { + open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; + flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!"; + my $sth = $dbh->prepare("SELECT url FROM crawl_queue"); $sth->execute(); my $qlen = $sth->rows;