Annotation of pandia/crawler, revision 1.1
1.1 ! snw 1: #!/usr/bin/env perl
! 2:
! 3: #
! 4: # $Id: crawl.pl,v 1.2 2025/06/25 03:10:01 snw Exp $
! 5: # Copyright (C) 2025 Coherent Logic Development LLC
! 6: #
! 7: # Author: Serena Willis <snw@coherent-logic.com>
! 8: #
! 9: # Licensed AGPL-3.0
! 10: #
! 11: # $Log: crawl.pl,v $
! 12: # Revision 1.2 2025/06/25 03:10:01 snw
! 13: # Initial working crawler
! 14: #
! 15: # Revision 1.1.1.1 2025/06/23 23:17:08 snw
! 16: # Initial commit
! 17: #
! 18: #
! 19:
! 20: use Getopt::Long;
! 21: use HTTP::Tiny;
! 22: use HTML::TreeBuilder;
! 23: use URI;
! 24: use DBI;
! 25: use WWW::RobotRules;
! 26: my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
! 27: use LWP::Simple qw(get);
! 28:
! 29: my $dbh = "";
! 30: my $dsn = "";
! 31: my $skips = 0;
! 32: my $inserts = 0;
! 33: my $seed = "";
! 34: my $depth = 0;
! 35:
! 36: sub store_url {
! 37: my ($url) = @_;
! 38:
! 39: if($url ne "" && length($url) <= 255) {
! 40: print ".";
! 41: my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
! 42:
! 43: $ins->execute($url) or $skips = $skips + 1;
! 44: $inserts = $inserts + 1;
! 45: if($depth < $maxdepth) {
! 46: $depth = $depth + 1;
! 47: crawl_url($url);
! 48: }
! 49: }
! 50: }
! 51:
! 52: sub crawl_url {
! 53: my ($url) = @_;
! 54:
! 55: my $u = URI->new($url);
! 56:
! 57: if ($u->scheme ne "http" && $u->scheme ne "https") {
! 58: return;
! 59: }
! 60:
! 61: my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
! 62: $sth->execute($u->host);
! 63: if($sth->rows > 0) {
! 64: return;
! 65: }
! 66:
! 67: my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
! 68:
! 69: my $robots_txt = get $robots_url;
! 70: $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
! 71:
! 72: if(!$rules->allowed($url)) {
! 73: return;
! 74: }
! 75:
! 76: my $origurl = $url;
! 77: ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
! 78:
! 79: my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
! 80: my $tree = HTML::TreeBuilder->new();
! 81:
! 82: my $response = $http->get($url);
! 83: $tree->parse($response->{content});
! 84:
! 85: my @links = $tree->find_by_tag_name('a');
! 86:
! 87: my $href = "";
! 88: my $firstchar = "";
! 89: my $final = "";
! 90:
! 91: foreach my $link (@links) {
! 92: $href = $link->attr('href');
! 93: $firstchar = substr($href, 0, 1);
! 94: $final = "";
! 95:
! 96: if($firstchar eq '/') {
! 97: $final = $baseurl . $href;
! 98: }
! 99: elsif($href eq '##') {
! 100: $final = $baseurl;
! 101: }
! 102: elsif($firstchar eq '#') {
! 103: $final = $baseurl . '/' . $href;
! 104: }
! 105: else {
! 106: $final = $href;
! 107: }
! 108:
! 109: store_url($final);
! 110: }
! 111:
! 112: $depth = $depth - 1;
! 113: }
! 114:
! 115: $| = 1;
! 116:
! 117: print "pandia crawler v0.0.1\n";
! 118: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
! 119:
! 120: GetOptions("dbhost=s" => \$dbhost,
! 121: "dbname=s" => \$dbname,
! 122: "dbusername=s" => \$dbusername,
! 123: "dbpw=s" => \$dbpw,
! 124: "seed=s" => \$seed,
! 125: "maxdepth=n" =>\$maxdepth)
! 126: or die("error in command line arguments");
! 127:
! 128: print "pandia: connecting to $dbname database at $dbhost...";
! 129:
! 130: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
! 131: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
! 132: die "pandia: failed to connect to MySQL database: DBI->errstr()" unless $dbh;
! 133:
! 134: print "[OK]\n";
! 135:
! 136: if($seed ne "") {
! 137: print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
! 138: sleep 1;
! 139: crawl_url($seed);
! 140: print "[OK]\n";
! 141: }
! 142: else {
! 143: my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
! 144: $sth->execute();
! 145: my $qlen = $sth->rows;
! 146:
! 147:
! 148: print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
! 149: sleep 1;
! 150: while (my @row = $sth->fetchrow_array()) {
! 151: my $url = @row[0];
! 152: crawl_url($url);
! 153: }
! 154: print "[OK]\n";
! 155: }
! 156:
! 157:
! 158: my $total = $inserts + $skips;
! 159:
! 160: print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>