
#!/usr/bin/env perl

# 
# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $
#  Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
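
# This script does not create its own tables.  A rough, illustrative MySQL
# schema inferred from the queries below (the real DDL may differ):
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255) PRIMARY KEY);
#   CREATE TABLE crawl_queue (
#       url         VARCHAR(255) PRIMARY KEY,
#       parent_url  VARCHAR(255),
#       url_domain  VARCHAR(255),
#       scheme      VARCHAR(16)
#   );
#   CREATE TABLE blacklist (url_domain VARCHAR(255) PRIMARY KEY);
#
# Duplicate-key failures when inserting into crawl_queue are expected and
# are simply counted as skips.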

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
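
# One progress character is printed per URL considered:
#   .  enqueued              d  duplicate (INSERT failed)
#   l  depth limit reached   x  rejected (empty, mailto:, non-http, too long)
#   s  unsupported scheme    b  blacklisted domain
#   r  denied by robots.txt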

sub store_url {
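    # Validate a discovered URL and enqueue it for indexing.  The URL's
    # domain is recorded in url_domains and the URL itself in crawl_queue;
    # a failed INSERT (typically a duplicate) is counted as a skip.
    # While $depth is below $maxdepth, the new URL is crawled immediately.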
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "pandia:  URL contains a fragment; skipping\n";
        return;
    }
    
    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {       
	
	my $u = URI->new($url);
	my $domain = $u->host;	
	my $scheme = $u->scheme;

	my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
	$sth->execute($domain);
	
	my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");
	    
	if(not $ins->execute($url, $parent, $domain, $scheme)) {
	    $skips = $skips + 1;
	    print "d";
	}
	else {
	    print ".";
	    $inserts = $inserts + 1;	    
	    if($depth < $maxdepth) {
		$depth = $depth + 1;
		crawl_url($url);
		$depth = $depth - 1;    # keep the depth counter balanced even if crawl_url bails out early
	    }
	    else {
		print "l";
	    }
	}
    }
    else {
	print "x";
    }
}

sub crawl_url {
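    # Fetch a single page, subject to the domain blacklist and the site's
    # robots.txt, then hand every <a href> link it contains to store_url().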
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
	$invalid_scheme_skips = $invalid_scheme_skips + 1;
	print "s";
	return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
	print "b";
	$blacklist_matches = $blacklist_matches + 1;
	return;
    }
    
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
	print "r";
	$robots_txt_denies = $robots_txt_denies + 1;
	return;
    }
    
    # Derive the base URL (scheme://host) used later to resolve relative links.
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
    
    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    # Skip pages that could not be fetched successfully.
    return unless $response->{success};

    $tree->parse($response->{content});
    $tree->eof();

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";
    
    foreach my $link (@links) {
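        # Normalise the href against the page's base URL: paths starting with
        # '/' and bare fragments are resolved relative to $baseurl; anything
        # else is treated as an absolute URL.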
        $href = $link->attr('href');
	next unless defined $href && $href ne '';    # some <a> tags have no usable href
	$firstchar = substr($href, 0, 1);
	$final = "";

	if($firstchar eq '/') {
	    $final = $baseurl . $href;	   
	}
	elsif($href eq '##') {
	    $final = $baseurl;
	}
	elsif($firstchar eq '#') {
	    $final = $baseurl . '/' . $href;
	}
	else {
	    $final = $href;
	}
	    
	store_url($final, $url);
    }

}
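
# Main program: with --seed, crawl outward from a single URL; otherwise take
# an exclusive lock and re-crawl every URL already sitting in crawl_queue.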

$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
	   "seed=s" => \$seed,
	   "maxdepth=n" =>\$maxdepth)
    or die("error in command line arguments");
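
# Example invocation (host, database name, and credentials are placeholders):
#
#   ./crawler --dbhost=localhost --dbname=pandia --dbusername=pandia \
#             --dbpw=secret --seed=https://example.com/ --maxdepth=2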

print "pandia:  connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;

print "[OK]\n";

if($seed ne "") {
    print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!; 
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    
    print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
	my $url = $row[0];
	crawl_url($url);
    }
    print "[OK]\n";
}


my $total = $inserts + $skips;

print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "          - $blacklist_matches blacklist matches\n";
print "          - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "          - $robots_txt_denies URLs skipped due to robots.txt\n";
    
    
