#!/usr/bin/env perl

#
# $Id: crawler,v 1.6 2025/07/01 19:20:47 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

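# The crawler expects three MySQL tables whose column names appear in the
# queries below (url_domains, crawl_queue, blacklist). The DDL below is only
# a sketch inferred from those queries; the real column types, key lengths,
# and uniqueness constraints may differ.
#
#   CREATE TABLE url_domains (
#       url_domain VARCHAR(255) PRIMARY KEY      -- assumed unique; duplicate inserts fail silently
#   );
#
#   CREATE TABLE crawl_queue (
#       url        VARCHAR(255) PRIMARY KEY,     -- assumed unique; failed inserts are counted as skips ("d")
#       parent_url VARCHAR(255),
#       url_domain VARCHAR(255),
#       scheme     VARCHAR(16)
#   );
#
#   CREATE TABLE blacklist (
#       url_domain VARCHAR(255) PRIMARY KEY
#   );
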
sub store_url {
    my ($url, $parent) = @_;

    # skip URLs containing page fragments
    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {

        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # capture the scheme and host portion of the URL (everything up to the
    # first path separator) for resolving relative links below
    my $origurl = $url;
    ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;    # skip anchors that have no href attribute

        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");
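
# Typical invocations (the profile name "default" and the seed URL are only
# examples, not values shipped with pandia):
#
#   Crawl a single seed URL to a fixed depth:
#     perl crawler --profile=default --seed=https://example.com --maxdepth=3
#
#   Drain the existing crawl_queue (no --seed); this path takes an exclusive
#   lock on /tmp/pandia_crawler.lock so only one queue run happens at a time:
#     perl crawler --profile=default --maxdepth=3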

my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

$dbhost = $cfg->val($profile, 'dbhost');
$dbname = $cfg->val($profile, 'dbname');
$dbusername = $cfg->val($profile, 'dbuser');
$dbpw = $cfg->val($profile, 'dbpass');
$tmp = $cfg->val($profile, 'allowed_tlds');

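# The profile section read above lives in /etc/pandia.ini. A minimal example
# section (the section name and values here are placeholders, not defaults)
# might look like:
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*
#
# Set allowed_tlds to a comma-separated list (e.g. "org,edu,gov") to restrict
# crawling to those top-level domains, or "*" to crawl any TLD.
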
if($tmp ne '*') {
    $mode = 'restricted';    # must match the check in store_url()
    @allowed_tlds = split(',', $tmp);
    print "pandia: crawler restricted to these TLDs: ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia: crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: " . DBI->errstr unless $dbh;

print "[OK]\n";
229:
1.7 ! snw 230: print "pandia: each character represents the following status for a URL:\n";
! 231: print " . URL added to indexer queue\n";
! 232: print " l crawl exceeded max depth\n";
! 233: print " x URL too long or invalid scheme\n";
! 234: print " d URL was a duplicate\n";
! 235: print " b crawl was blocked by robots.txt\n";
! 236: print " F URL contained a fragment\n";
! 237: print " T URL was from a disallowed top-level domain\n\n";
! 238:
if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file: $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "    - $blacklist_matches blacklist matches\n";
print "    - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "    - $robots_txt_denies URLs skipped due to robots.txt\n";