#!/usr/bin/env perl

#
# $Id: crawler,v 1.9 2025/07/05 15:27:53 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.9  2025/07/05 15:27:53  snw
# Update
#
# Revision 1.8  2025/07/02 15:14:44  snw
# Fix bug in restricted mode
#
# Revision 1.7  2025/07/02 15:03:05  snw
# Add support for restricted mode
#
# Revision 1.6  2025/07/01 19:20:47  snw
# Stop crawling and indexing URLs containing page fragments
#
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

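# Overview: this is the pandia web crawler. It reads its database settings
# from /etc/pandia.ini, then either crawls a single --seed URL or works
# through the existing crawl_queue table, following links up to --maxdepth
# and enqueueing discovered URLs for the indexer. A typical invocation
# might look like the following (the profile name and URL are illustrative,
# not taken from this file):
#
#   ./crawler --profile default --seed https://www.example.com/ --maxdepth 3
#
# With no --seed, the crawler drains whatever is already in crawl_queue.
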
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);
use Config::IniFiles;

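# Module roles: HTTP::Tiny fetches pages, LWP::Simple's get() fetches
# robots.txt, WWW::RobotRules evaluates the robots.txt rules,
# HTML::TreeBuilder extracts <a> tags, DBI talks to the MySQL queue
# database, Config::IniFiles reads /etc/pandia.ini, and Fcntl provides
# the flock() constants used for the queue lock file.
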
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;        # set via --maxdepth; defaults to 0 (no recursion)
my @allowed_tlds;        # populated from allowed_tlds in /etc/pandia.ini
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $mode;

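# store_url($url, $parent)
#
# Records a discovered URL: skips fragments, mailto: links, non-http(s)
# URLs and URLs longer than 255 characters, enforces the allowed-TLD list
# in restricted mode, then inserts the URL's domain into url_domains and
# the URL itself into crawl_queue. A failed insert is counted as a
# duplicate (which assumes a unique key on crawl_queue.url); a successful
# insert is crawled recursively until --maxdepth is reached.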
sub store_url {
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "F";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {

        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;
        my @parts = split(/\./, $domain);
        my $tld = $parts[-1];

        if ($mode eq 'restricted') {
            my $tld_ok = 0;
            foreach (@allowed_tlds) {
                my $allowed = $_;

                if($tld eq $allowed) {
                    $tld_ok = 1;
                    last;
                }
            }
            if($tld_ok == 0) {
                print "T";
                return;
            }
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

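# crawl_url($url)
#
# Fetches a single page: rejects non-http(s) schemes, skips blacklisted
# domains, honours robots.txt (fetched per request and fed to
# WWW::RobotRules), then parses the HTML, resolves relative and
# fragment-only hrefs against the page's scheme://host base, and hands
# each resulting link to store_url() with the current page as its parent.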
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;    # skip <a> tags with no href attribute
        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}

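# Main program: parse command-line options, load the named profile from
# /etc/pandia.ini, connect to MySQL, then either crawl a single --seed URL
# or (under an exclusive lock) work through every URL already sitting in
# the crawl_queue table.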
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

my $profile;

GetOptions("profile=s" => \$profile,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

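# The configuration file is expected to contain one section per profile
# with the keys read below. A minimal illustrative example (values are
# placeholders, not taken from a real deployment):
#
#   [default]
#   dbhost       = localhost
#   dbname       = pandia
#   dbuser       = pandia
#   dbpass       = secret
#   allowed_tlds = *
#
# Setting allowed_tlds to a comma-separated list (e.g. "org,edu") puts the
# crawler into restricted mode; "*" leaves it unrestricted.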
my $cfg = Config::IniFiles->new(-file => "/etc/pandia.ini");

my $dbhost = $cfg->val($profile, 'dbhost');
my $dbname = $cfg->val($profile, 'dbname');
my $dbusername = $cfg->val($profile, 'dbuser');
my $dbpw = $cfg->val($profile, 'dbpass');
my $tmp = $cfg->val($profile, 'allowed_tlds');

if($tmp ne '*') {
    $mode = 'restricted';
    @allowed_tlds = split(',', $tmp);
    print "pandia: crawler restricted to these TLDs: ";
    foreach (@allowed_tlds) {
        print ".$_ ";
    }
    print "\n";
}
else {
    print "pandia: crawler unrestricted\n";
    $mode = 'normal';
}

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

print "pandia: each character represents the following status for a URL:\n";
print " .  URL added to indexer queue\n";
print " l  crawl exceeded max depth\n";
print " x  URL too long, mailto:, or not http(s)\n";
print " s  URL has an invalid scheme\n";
print " d  URL was a duplicate\n";
print " b  URL domain is blacklisted\n";
print " r  crawl was blocked by robots.txt\n";
print " F  URL contained a fragment\n";
print " T  URL was from a disallowed top-level domain\n\n";

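# When no --seed is given, take an exclusive, non-blocking lock on
# /tmp/pandia_crawler.lock so that only one queue-draining crawler runs at
# a time, then walk every URL currently in crawl_queue.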
if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth...";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth...";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";
