--- pandia/Attic/crawl.pl	2025/06/23 23:17:08	1.1
+++ pandia/Attic/crawl.pl	2025/06/25 03:10:01	1.2
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
-# $Id: crawl.pl,v 1.1 2025/06/23 23:17:08 snw Exp $
+# $Id: crawl.pl,v 1.2 2025/06/25 03:10:01 snw Exp $
 # Copyright (C) 2025 Coherent Logic Development LLC
 #
 # Author: Serena Willis
 #
@@ -9,9 +9,171 @@
 # Licensed AGPL-3.0
 #
 # $Log: crawl.pl,v $
-# Revision 1.1  2025/06/23 23:17:08  snw
-# Initial revision
+# Revision 1.2  2025/06/25 03:10:01  snw
+# Initial working crawler
+#
+# Revision 1.1.1.1  2025/06/23 23:17:08  snw
+# Initial commit
 #
 #
+use strict;
+use warnings;
+
+use Getopt::Long;
 use HTTP::Tiny;
 use HTML::TreeBuilder;
+use URI;
+use DBI;
+use WWW::RobotRules;
+use LWP::Simple qw(get);
+
+# robots.txt rules, keyed to the crawler's user agent string
+my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
+
+my $dbh = "";
+my $dsn = "";
+my $skips = 0;
+my $inserts = 0;
+my $seed = "";
+my $depth = 0;
+my $maxdepth = 0;
+my ($dbhost, $dbname, $dbusername, $dbpw);
+
+# enqueue a URL for later analysis, recursing into it if the
+# maximum crawl depth has not yet been reached
+sub store_url {
+    my ($url) = @_;
+
+    # crawl_queue.url is assumed to be a VARCHAR(255) column
+    if($url ne "" && length($url) <= 255) {
+        print ".";
+        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
+
+        if($ins->execute($url)) {
+            $inserts = $inserts + 1;
+        }
+        else {
+            # duplicate or otherwise failed insert; count as a skip
+            $skips = $skips + 1;
+        }
+
+        if($depth < $maxdepth) {
+            $depth = $depth + 1;
+            crawl_url($url);
+            $depth = $depth - 1;
+        }
+    }
+}
+
+# fetch a page, honoring robots.txt and the domain blacklist,
+# and enqueue every link found on it
+sub crawl_url {
+    my ($url) = @_;
+
+    my $u = URI->new($url);
+
+    if(!defined $u->scheme || ($u->scheme ne "http" && $u->scheme ne "https")) {
+        return;
+    }
+
+    # skip blacklisted domains
+    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
+    $sth->execute($u->host);
+    if($sth->rows > 0) {
+        return;
+    }
+
+    # fetch and obey robots.txt for this host
+    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
+
+    my $robots_txt = get $robots_url;
+    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
+
+    if(!$rules->allowed($url)) {
+        return;
+    }
+
+    # scheme-and-host prefix against which relative links are resolved
+    my $origurl = $url;
+    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
+
+    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
+    my $tree = HTML::TreeBuilder->new();
+
+    my $response = $http->get($url);
+    return unless $response->{success};
+
+    $tree->parse($response->{content});
+    $tree->eof();
+
+    my @links = $tree->find_by_tag_name('a');
+
+    foreach my $link (@links) {
+        my $href = $link->attr('href');
+        next unless defined $href;
+
+        my $firstchar = substr($href, 0, 1);
+        my $final = "";
+
+        if($firstchar eq '/') {
+            $final = $baseurl . $href;
+        }
+        elsif($href eq '##') {
+            $final = $baseurl;
+        }
+        elsif($firstchar eq '#') {
+            $final = $baseurl . '/' . $href;
+        }
+        else {
+            $final = $href;
+        }
+
+        store_url($final);
+    }
+}
+
+$| = 1;
+
+print "pandia crawler v0.0.1\n";
+print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
+
+GetOptions("dbhost=s" => \$dbhost,
+           "dbname=s" => \$dbname,
+           "dbusername=s" => \$dbusername,
+           "dbpw=s" => \$dbpw,
+           "seed=s" => \$seed,
+           "maxdepth=i" => \$maxdepth)
+    or die("error in command line arguments");
+
+print "pandia: connecting to $dbname database at $dbhost...";
+
+$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
+$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
+die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;
+
+print "[OK]\n";
+
+if($seed ne "") {
+    # seed mode: start from a single URL
+    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
+    sleep 1;
+    crawl_url($seed);
+    print "[OK]\n";
+}
+else {
+    # queue mode: crawl every URL already enqueued
+    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
+    $sth->execute();
+    my $qlen = $sth->rows;
+
+    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
+    sleep 1;
+    while (my @row = $sth->fetchrow_array()) {
+        my $url = $row[0];
+        crawl_url($url);
+    }
+    print "[OK]\n";
+}
+
+my $total = $inserts + $skips;
+print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
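
Note on the schema this patch assumes: crawl.pl reads and writes two MySQL tables, crawl_queue and blacklist, which the patch itself never creates. The sketch below is a hypothetical helper (not part of pandia); the column definitions are inferred from the queries above, not taken from the project. url must fit the 255-character cap that store_url enforces, and a UNIQUE key on url makes a re-seen URL fail its INSERT, which store_url counts as a skip rather than an error.

#!/usr/bin/env perl
# setup-tables.pl - hypothetical helper, NOT part of this patch:
# creates the two tables crawl.pl expects. Schema is inferred
# from the queries in crawl.pl, not taken from pandia itself.
use strict;
use warnings;
use DBI;

my ($dbhost, $dbname, $dbusername, $dbpw) = @ARGV;
die "usage: setup-tables.pl dbhost dbname dbusername dbpw\n" unless defined $dbpw;

my $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306";
my $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 1});

# UNIQUE KEY on url means a duplicate enqueue fails its INSERT,
# which store_url() counts as a skip rather than an error
$dbh->do(q{
    CREATE TABLE IF NOT EXISTS crawl_queue (
        url VARCHAR(255) NOT NULL,
        UNIQUE KEY (url)
    )
});

# one row per domain the crawler must never visit
$dbh->do(q{
    CREATE TABLE IF NOT EXISTS blacklist (
        url_domain VARCHAR(255) NOT NULL PRIMARY KEY
    )
});

$dbh->disconnect;
print "pandia: schema ready in $dbname on $dbhost\n";

With the tables in place, a typical seed run looks like the following (option names are from the GetOptions spec above; host, credentials, and seed values are placeholders):

  perl crawl.pl --dbhost=localhost --dbname=pandia --dbusername=pandia --dbpw=secret --seed=https://example.com/ --maxdepth=2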