Annotation of pandia/crawler, revision 1.9
#!/usr/bin/env perl

#
# $Id: crawler,v 1.8 2025/07/02 15:14:44 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;
my @allowed_tlds;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

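# store_url($url, $parent): validate a link discovered during a crawl,
# skip fragments and non-HTTP schemes, enforce the allowed-TLD list in
# restricted mode, record the URL's domain, enqueue the URL itself, and
# recurse into it while the crawl is still within $maxdepth.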
sub store_url {
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {

        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

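        # In restricted mode, only URLs whose top-level domain appears in
        # the allowed_tlds list from the config profile are accepted.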
        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

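        # Record the domain and enqueue the URL. With RaiseError and
        # PrintError disabled, a failed INSERT (e.g. an already-queued URL
        # hitting a unique key) is simply counted as a duplicate.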
        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

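# crawl_url($url): fetch a single page and feed every link it contains
# back through store_url. Pages are skipped when the scheme is not
# http(s), the domain is blacklisted, or robots.txt disallows the URL.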
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

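    # Skip any domain that appears in the blacklist table.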
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

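    # Fetch and parse robots.txt for the host (if present) and honour any
    # rule that disallows this URL for our user agent.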
    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

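    # Derive the base URL (scheme and host, without any path) so that
    # relative hrefs found on this page can be made absolute.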
    my $origurl = $url;
    ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

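    # Fetch the page and extract every <a> element from the parsed HTML.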
    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

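    # Resolve each href against the base URL: a leading '/' means
    # host-relative, a bare '##' collapses to the base URL, other '#'
    # fragments are appended to it, and anything else is treated as an
    # absolute URL.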
    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;

        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

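# Main program: parse command line options, load the configuration
# profile, connect to MySQL, and either crawl a single seed URL or work
# through the existing crawl queue.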
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

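# Connection and crawler settings come from the named profile section of
# /etc/pandia.ini.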
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

$dbhost = $cfg->val($profile, 'dbhost');
$dbname = $cfg->val($profile, 'dbname');
$dbusername = $cfg->val($profile, 'dbuser');
$dbpw = $cfg->val($profile, 'dbpass');
$tmp = $cfg->val($profile, 'allowed_tlds');
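# A profile section in /etc/pandia.ini is expected to look roughly like
# the following sketch (section name and values here are illustrative;
# only the key names are taken from the code above):
#
#   [default]
#   dbhost=localhost
#   dbname=pandia
#   dbuser=pandia
#   dbpass=secret
#   allowed_tlds=*
#
# Set allowed_tlds to '*' for an unrestricted crawl, or to a
# comma-separated list of TLDs (e.g. "edu,gov") for restricted mode.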

if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia: crawler restricted to these TLDs: ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia: crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

print "pandia: each character represents the following status for a URL:\n";
print "  .    URL added to indexer queue\n";
print "  l    crawl exceeded max depth\n";
print "  x    URL too long or not a valid http(s) link\n";
print "  d    URL was a duplicate\n";
print "  s    page skipped due to invalid scheme\n";
print "  b    URL domain was blacklisted\n";
print "  r    crawl was blocked by robots.txt\n";
print "  F    URL contained a fragment\n";
print "  T    URL was from a disallowed top-level domain\n\n";

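# With --seed, crawl a single starting URL; otherwise, work through every
# URL already sitting in the crawl_queue table.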
if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
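    # Take an exclusive, non-blocking lock so that only one queue-driven
    # crawler instance runs at a time.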
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";