version 1.4, 2025/06/28 00:33:32
|
version 1.5, 2025/06/28 05:40:11
|
Line 9
|
Line 9
|
# Licensed AGPL-3.0 |
# Licensed AGPL-3.0 |
# |
# |
# $Log$ |
# $Log$ |
|
# Revision 1.5 2025/06/28 05:40:11 snw |
|
# Exclude non-textual MIME types |
|
# |
# Revision 1.4 2025/06/28 00:33:32 snw |
# Revision 1.4 2025/06/28 00:33:32 snw |
# Update locking |
# Update locking |
# |
# |
Line 78 while (my $hashref = $sth->fetchrow_hash
|
Line 81 while (my $hashref = $sth->fetchrow_hash
|
print "pandia: retrieving $url...\n"; |
print "pandia: retrieving $url...\n"; |
try { |
try { |
my $del_queue = 0; |
my $del_queue = 0; |
|
my $head = $http->head($hashref->{url}); |
|
my $headers = $head->{headers}; |
|
my $content_type = $headers->{'content-type'}; |
|
|
|
if($content_type ne 'text/plain' && substr($content_type, 0, 9) ne 'text/html') { |
|
print "pandia: content type $content_type not indexable; skipping $url\n"; |
|
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
|
$sth->execute($url); |
|
next; |
|
} |
|
|
my $response = $http->get($hashref->{url}); |
my $response = $http->get($hashref->{url}); |
|
|
if(not $response->{success}) { |
if(not $response->{success}) { |
print "pandia: http failure; skipping $url\n"; |
print "pandia: http failure; skipping $url\n"; |
$del_queue = 1; |
$del_queue = 1; |
} |
} |
|
|
#if(exists $response->{redirects}) { |
|
# print "pandia: redirects detected; skipping $url\n"; |
|
# $del_queue = 1; |
|
#} |
|
|
|
if($del_queue == 1) { |
if($del_queue == 1) { |
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
my $sth = $dbh->prepare("DELETE FROM crawl_queue WHERE url=?"); |
Line 102 while (my $hashref = $sth->fetchrow_hash
|
Line 111 while (my $hashref = $sth->fetchrow_hash
|
if($response) { |
if($response) { |
$tree->parse($pagedata); |
$tree->parse($pagedata); |
$title = $tree->look_down('_tag', 'title')->as_text; |
$title = $tree->look_down('_tag', 'title')->as_text; |
|
$title =~ s/[^\x00-\x7F]//g; |
|
|
print "pandia: processing $url [$title]\n"; |
print "pandia: processing $url [$title]\n"; |
|
|