#!/usr/bin/env perl

#
# $Id: crawler,v 1.5 2025/06/28 00:33:32 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.5  2025/06/28 00:33:32  snw
# Update locking
#
# Revision 1.4  2025/06/27 16:20:30  snw
# Add blacklist
#
# Revision 1.3  2025/06/27 02:14:47  snw
# Initial operational capability
#
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use Fcntl qw(:flock);
use LWP::Simple qw(get);

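#
# Progress output legend (one character per URL examined):
#   .  URL enqueued in crawl_queue
#   d  duplicate URL skipped
#   l  depth limit reached; URL enqueued but not followed
#   x  URL rejected (empty, too long, mailto, or not http/https)
#   s  URL skipped due to invalid scheme
#   b  URL skipped due to blacklisted domain
#   r  URL skipped due to robots.txt
#
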
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;
my $dbhost = "";
my $dbname = "";
my $dbusername = "";
my $dbpw = "";
my $maxdepth = 0;

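# The crawler keeps its work queue in MySQL. The exact schema is not part of
# this file; the queries below assume something like the following sketch
# (column types and sizes here are guesses), with unique keys providing the
# duplicate detection that store_url() relies on:
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255) PRIMARY KEY);
#   CREATE TABLE crawl_queue (url VARCHAR(255) PRIMARY KEY,
#                             parent_url VARCHAR(255),
#                             url_domain VARCHAR(255),
#                             scheme VARCHAR(16));
#   CREATE TABLE blacklist   (url_domain VARCHAR(255) PRIMARY KEY);
#
# store_url() enqueues a single URL: a failed INSERT (typically a duplicate
# key) is counted as a skip, and a successful INSERT recurses into
# crawl_url() until the --maxdepth limit is reached.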
sub store_url {
    my ($url, $parent) = @_;

    if (index($url, '#') != -1) {
        print "pandia: URL contains a fragment; skipping\n";
        return;
    }

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto" && substr($url, 0, 4) eq "http") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
                $depth = $depth - 1;
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

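# crawl_url() fetches a single page and enqueues the links it finds. It skips
# non-http(s) schemes, blacklisted domains, and URLs disallowed by the host's
# robots.txt (fetched and fed to WWW::RobotRules on every call), then parses
# the page with HTML::TreeBuilder and resolves each <a href> against the base
# URL before handing it to store_url().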
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    if ($u->scheme ne "http" && $u->scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;

        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }
}

$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
           "seed=s" => \$seed,
           "maxdepth=i" => \$maxdepth)
    or die("error in command line arguments");

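# Example invocation (values here are placeholders, not real credentials):
#
#   ./crawler --dbhost=localhost --dbname=pandia --dbusername=pandia \
#             --dbpw=secret --seed=https://example.com/ --maxdepth=2
#
# With no --seed, the crawler instead drains the existing crawl_queue table.
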
print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";

# RaiseError/PrintError are left disabled: store_url() detects duplicate URLs
# by checking execute()'s return value rather than by catching exceptions.
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: " . DBI->errstr . "\n" unless $dbh;

print "[OK]\n";

if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
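    # Take an exclusive, non-blocking lock so that two queue-mode crawler
    # processes cannot drain the same crawl_queue at once; a second instance
    # dies immediately instead of waiting for the lock.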
    open my $file, ">", "/tmp/pandia_crawler.lock" or die $!;
    flock $file, LOCK_EX|LOCK_NB or die "Unable to lock file: $!";

    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print "  - $blacklist_matches blacklist matches\n";
print "  - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print "  - $robots_txt_denies URLs skipped due to robots.txt\n";