#!/usr/bin/env perl

#
# $Id: crawler,v 1.2 2025/06/25 19:38:48 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.2  2025/06/25 19:38:48  snw
# Add indexer
#
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

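# Usage (example values only; adjust for your environment):
#
#   ./crawler --dbhost=db.example.com --dbname=pandia \
#             --dbusername=pandia --dbpw=secret \
#             --seed=https://example.com/ --maxdepth=3
#
# Omitting --seed drains the existing crawl_queue table instead of
# starting from a single URL.
#
# The script assumes a MySQL database containing crawl_queue,
# url_domains, and blacklist tables. A minimal sketch of what the
# code below appears to rely on (the real schema is defined
# elsewhere and may differ):
#
#   CREATE TABLE url_domains (url_domain VARCHAR(255) PRIMARY KEY);
#   CREATE TABLE crawl_queue (url VARCHAR(255) PRIMARY KEY,
#                             parent_url VARCHAR(255),
#                             url_domain VARCHAR(255),
#                             scheme     VARCHAR(16));
#   CREATE TABLE blacklist   (url_domain VARCHAR(255) PRIMARY KEY);
#
# Duplicate-key failures on these INSERTs are what drive the "d"
# (duplicate/skip) markers printed while crawling.
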
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use LWP::Simple qw(get);

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');

my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $maxdepth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;

# Database connection parameters; set via GetOptions below.
my ($dbhost, $dbname, $dbusername, $dbpw);

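# store_url: record a discovered URL (and its domain) in MySQL and,
# while still under --maxdepth, recurse into it immediately. A failed
# INSERT (typically a duplicate key) is counted as a skip.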
sub store_url {
    my ($url, $parent) = @_;

    if($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {
        my $u = URI->new($url);
        my $domain = $u->host;
        my $scheme = $u->scheme;

        # Record the domain; a duplicate-key failure here is harmless.
        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if(not $ins->execute($url, $parent, $domain, $scheme)) {
            # Already queued (or otherwise rejected); count it as a skip.
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
                $depth = $depth - 1;
            }
            else {
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

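# crawl_url: fetch one page and enqueue every link found on it.
# Skips non-HTTP(S) schemes, blacklisted domains, URLs disallowed by
# robots.txt, failed fetches, and redirected responses.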
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    my $scheme = $u->scheme // "";
    if ($scheme ne "http" && $scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if(!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

    # Base URL (scheme://host), used to absolutize relative links below.
    my $origurl = $url;
    my ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);

    if(not $response->{success}) {
        print "pandia: http failure; skipping $url\n";
        return;
    }

    if(exists $response->{redirects}) {
        print "pandia: redirects detected; skipping $url\n";
        return;
    }

    $tree->parse($response->{content});

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;

        $firstchar = substr($href, 0, 1);
        $final = "";

        if($firstchar eq '/') {
            $final = $baseurl . $href;
        }
        elsif($href eq '##') {
            $final = $baseurl;
        }
        elsif($firstchar eq '#') {
            $final = $baseurl . '/' . $href;
        }
        else {
            $final = $href;
        }

        store_url($final, $url);
    }
}

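# Main program: parse options, connect to MySQL, then either crawl a
# single --seed URL or drain the existing crawl_queue, and finally
# print run statistics.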
$| = 1;
print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s" => \$dbhost,
           "dbname=s" => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s" => \$dbpw,
           "seed=s" => \$seed,
           "maxdepth=n" => \$maxdepth)
    or die("error in command line arguments");

print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: $DBI::errstr\n" unless $dbh;

print "[OK]\n";

if($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";