#!/usr/bin/env perl
#
# $Id: crawler,v 1.9 2025/07/05 15:27:53 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.9 2025/07/05 15:27:53 snw
# Update
#
# Revision 1.8 2025/07/02 15:14:44 snw
# Fix bug in restricted mode
#
# Revision 1.7 2025/07/02 15:03:05 snw
# Add support for restricted mode
#
# Revision 1.6 2025/07/01 19:20:47 snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5 2025/06/28 00:33:32 snw
# Update locking
#
# Revision 1.4 2025/06/27 16:20:30 snw
# Add blacklist
#
# Revision 1.3 2025/06/27 02:14:47 snw
# Initial operational capability
#
# Revision 1.2 2025/06/25 19:38:48 snw
# Add indexer
#
# Revision 1.1 2025/06/25 13:44:37 snw
# Renaming
#
# Revision 1.2 2025/06/25 03:10:01 snw
# Initial working crawler
#
# Revision 1.1.1.1 2025/06/23 23:17:08 snw
# Initial commit
#
#
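# pandia crawler: fetches pages starting from a --seed URL (or, when run
# without a seed, from the existing crawl_queue table), extracts their links,
# and enqueues new URLs in MySQL for the indexer. Progress is reported as one
# status character per URL; see the legend printed at startup.
#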
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;   # overridden by --maxdepth; 0 disables recursive crawling
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;
my @allowed_tlds;
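# store_url($url, $parent): validate a discovered URL, record its domain, and
# insert it into crawl_queue. Prints a single status character and, when the
# insert succeeds and the depth limit allows, recurses into crawl_url() to
# follow the new URL immediately.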
sub store_url {
my ($url, $parent) = @_;
if (index($url, '#') != -1) {
print "F";
return;
}
    if($url ne ""
       && length($url) <= 255
       && substr($url, 0, 6) ne "mailto"
       && substr($url, 0, 4) eq "http") {
my $u = URI->new($url);
my $domain = $u->host;
my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);   # split the hostname on dots
        my $tld = $parts[-1];
if ($mode eq 'restricted') {
my $tld_ok = 0;
            foreach my $allowed (@allowed_tlds) {
                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
if($tld_ok == 0) {
print "T";
return;
}
}
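        # Record the URL's domain; with RaiseError and PrintError disabled on
        # the handle, a duplicate url_domain row simply fails silently.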
my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
$sth->execute($domain);
my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
if(not $ins->execute($url, $parent, $domain, $scheme)) {
$skips = $skips + 1;
print "d";
}
else {
print ".";
$inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
                $depth = $depth - 1;
            }
else {
print "l";
}
}
}
else {
print "x";
}
}
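# crawl_url($url): fetch a single page, honoring the domain blacklist and the
# host's robots.txt, then hand every link found on the page to store_url().
# Rejected URLs bump the relevant counter and print a status character.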
sub crawl_url {
my ($url) = @_;
my $u = URI->new($url);
if ($u->scheme ne "http" && $u->scheme ne "https") {
$invalid_scheme_skips = $invalid_scheme_skips + 1;
print "s";
return;
}
my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
$sth->execute($u->host);
if($sth->rows > 0) {
print "b";
$blacklist_matches = $blacklist_matches + 1;
return;
}
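    # Fetch and parse this host's robots.txt, then skip the URL if it is disallowed.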
my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
my $robots_txt = get $robots_url;
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;
if(!$rules->allowed($url)) {
print "r";
$robots_txt_denies = $robots_txt_denies + 1;
return;
}
    # Derive the base URL (scheme://host) used to resolve relative links below.
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
my $tree = HTML::TreeBuilder->new();
my $response = $http->get($url);
$tree->parse($response->{content});
my @links = $tree->find_by_tag_name('a');
my $href = "";
my $firstchar = "";
my $final = "";
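    # Resolve each href against the base URL: paths beginning with '/' are
    # appended to the base, '#' fragments are anchored to the base page, and
    # anything else is passed through unchanged.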
foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;   # some <a> tags carry no href attribute
        $firstchar = substr($href, 0, 1);
$final = "";
if($firstchar eq '/') {
$final = $baseurl . $href;
}
elsif($href eq '##') {
$final = $baseurl;
}
elsif($firstchar eq '#') {
$final = $baseurl . '/' . $href;
}
else {
$final = $href;
}
store_url($final, $url);
}
}
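# Main program: enable autoflush so the per-URL status characters appear as
# soon as they are printed.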
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
my $profile;
GetOptions("profile=s" => \$profile,
"seed=s" => \$seed,
"maxdepth=n" =>\$maxdepth)
or die("error in command line arguments");
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini")
    or die "pandia: unable to read /etc/pandia.ini\n";
my $dbhost = $cfg->val($profile, 'dbhost');
my $dbname = $cfg->val($profile, 'dbname');
my $dbusername = $cfg->val($profile, 'dbuser');
my $dbpw = $cfg->val($profile, 'dbpass');
my $tmp = $cfg->val($profile, 'allowed_tlds');
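# The profile section in /etc/pandia.ini is expected to look roughly like the
# following (key names come from the val() calls above; the values shown here
# are only examples):
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*
#
# allowed_tlds may be "*" (unrestricted) or a comma-separated list of
# top-level domains, e.g. "edu,gov".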
if($tmp ne '*') {
$mode = 'restricted';
@allowed_tlds = split(',', $tmp);
print "pandia: crawler restricted to these TLDs: ";
foreach (@allowed_tlds) {
print ".$_ ";
}
print "\n";
}
else {
print "pandia: crawler unrestricted\n";
$mode = 'normal';
}
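# The statements below assume MySQL tables shaped roughly as follows (column
# names are taken from the queries in this script; types and constraints are
# assumptions):
#
#   crawl_queue(url, parent_url, url_domain, scheme)   -- unique key on url,
#                                                          so duplicates fail ('d')
#   url_domains(url_domain)                            -- unique key on url_domain
#   blacklist(url_domain)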
print "pandia: connecting to $dbname database at $dbhost...";
$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
print "[OK]\n";
print "pandia: each character represents the following status for a URL:\n";
print " . URL added to indexer queue\n";
print " l crawl exceeded max depth\n";
print " x URL too long or invalid scheme\n";
print " d URL was a duplicate\n";
print " b crawl was blocked by robots.txt\n";
print " F URL contained a fragment\n";
print " T URL was from a disallowed top-level domain\n\n";
if($seed ne "") {
print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
sleep 1;
crawl_url($seed);
print "[OK]\n";
}
else {
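    # Queue mode: no seed was given, so crawl every URL already sitting in
    # crawl_queue. An exclusive, non-blocking flock on the lock file keeps two
    # unattended crawler runs from stepping on each other.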
open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";
my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
$sth->execute();
my $qlen = $sth->rows;
print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth...";
sleep 1;
while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
crawl_url($url);
}
print "[OK]\n";
}
my $total = $inserts + $skips;
print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";