--- pandia/crawler 2025/06/25 19:38:48 1.2 +++ pandia/crawler 2025/06/27 02:14:47 1.3 @@ -1,7 +1,7 @@ #!/usr/bin/env perl # -# $Id: crawler,v 1.2 2025/06/25 19:38:48 snw Exp $ +# $Id: crawler,v 1.3 2025/06/27 02:14:47 snw Exp $ # Copyright (C) 2025 Coherent Logic Development LLC # # Author: Serena Willis @@ -9,6 +9,9 @@ # Licensed AGPL-3.0 # # $Log: crawler,v $ +# Revision 1.3 2025/06/27 02:14:47 snw +# Initial operational capability +# # Revision 1.2 2025/06/25 19:38:48 snw # Add indexer # @@ -114,6 +117,17 @@ sub crawl_url { my $tree = HTML::TreeBuilder->new(); my $response = $http->get($url); + + if(not $response->{success}) { + print "pandia: http failure; skipping $url\n"; + next; + } + + if(exists $response->{redirects}) { + print "pandia: redirects detected; skipping $url\n"; + next; + } + $tree->parse($response->{content}); my @links = $tree->find_by_tag_name('a'); @@ -147,7 +161,6 @@ sub crawl_url { } $| = 1; - print "pandia crawler v0.0.1\n"; print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";