#!/usr/bin/env perl

#
# $Id: crawler,v 1.1 2025/06/25 13:44:37 snw Exp $
# Copyright (C) 2025 Coherent Logic Development LLC
#
# Author: Serena Willis <snw@coherent-logic.com>
#
# Licensed AGPL-3.0
#
# $Log: crawler,v $
# Revision 1.1  2025/06/25 13:44:37  snw
# Renaming
#
# Revision 1.2  2025/06/25 03:10:01  snw
# Initial working crawler
#
# Revision 1.1.1.1  2025/06/23 23:17:08  snw
# Initial commit
#
#

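#
# pandia crawler: fetches pages over HTTP(S), records discovered links in a
# MySQL crawl queue, and follows them recursively up to --maxdepth levels.
#
# Progress is reported as single characters:
#   .  link enqueued            d  duplicate, skipped
#   l  depth limit reached      x  rejected (empty, too long, mailto:, no host)
#   s  non-http(s) scheme       b  blacklisted domain
#   r  disallowed by robots.txt
#
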
use strict;
use warnings;

use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use LWP::Simple qw(get);

my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');

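# The crawler expects three MySQL tables to exist.  Only the columns named in
# the queries below are known from this script; types and keys are up to the
# schema:
#   url_domains  (url_domain)
#   crawl_queue  (url, parent_url, url_domain, scheme)
#   blacklist    (url_domain)
# Duplicate inserts are expected to fail quietly (RaiseError and PrintError
# are disabled on the handle); that is how already-queued URLs become skips.
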
my $dbh = "";
my $dsn = "";
my $skips = 0;
my $inserts = 0;
my $seed = "";
my $depth = 0;
my $blacklist_matches = 0;
my $robots_txt_denies = 0;
my $invalid_scheme_skips = 0;

# command-line options (filled in by GetOptions below)
my ($dbhost, $dbname, $dbusername, $dbpw) = ("", "", "", "");
my $maxdepth = 0;

# scheme-and-host prefix of the page currently being crawled
my $baseurl = "";

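# store_url($url, $parent): record a discovered link in crawl_queue (and its
# domain in url_domains), then follow it with crawl_url() while the current
# depth is below --maxdepth.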
sub store_url {
    my ($url, $parent) = @_;

    if ($url ne "" && length($url) <= 255 && substr($url, 0, 6) ne "mailto") {
        my $u = URI->new($url);
        my $scheme = $u->scheme;

        # some link targets (javascript:, tel:, data:) carry no host part;
        # URI dies if host() is called on them, so guard with eval
        my $domain = eval { $u->host };
        if (!defined $domain || $domain eq "") {
            print "x";
            return;
        }

        my $sth = $dbh->prepare("INSERT INTO url_domains (url_domain) VALUES (?)");
        $sth->execute($domain);   # errors (e.g. duplicate domains) are ignored

        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url, parent_url, url_domain, scheme) VALUES (?, ?, ?, ?)");

        if (not $ins->execute($url, $parent, $domain, $scheme)) {
            # already queued (or otherwise rejected by the database)
            $skips = $skips + 1;
            print "d";
        }
        else {
            print ".";
            $inserts = $inserts + 1;
            if ($depth < $maxdepth) {
                $depth = $depth + 1;
                crawl_url($url);
            }
            else {
                # depth limit reached; enqueue but do not follow
                print "l";
            }
        }
    }
    else {
        print "x";
    }
}

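# crawl_url($url): fetch one page and hand every <a href="..."> it contains
# to store_url().  Non-http(s) schemes, blacklisted domains, and pages denied
# by the site's robots.txt are counted and skipped.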
sub crawl_url {
    my ($url) = @_;

    my $u = URI->new($url);

    # only http and https URLs are fetched
    my $scheme = $u->scheme // "";
    if ($scheme ne "http" && $scheme ne "https") {
        $invalid_scheme_skips = $invalid_scheme_skips + 1;
        print "s";
        return;
    }

    # skip domains that appear in the blacklist table
    my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
    $sth->execute($u->host);
    if ($sth->rows > 0) {
        print "b";
        $blacklist_matches = $blacklist_matches + 1;
        return;
    }

    # fetch the site's robots.txt and honour it before crawling
    my $robots_url = $scheme . '://' . $u->host . "/robots.txt";

    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    if (!$rules->allowed($url)) {
        print "r";
        $robots_txt_denies = $robots_txt_denies + 1;
        return;
    }

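    # the page passed every check; fetch it and queue each link it contains
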
    # derive the scheme-and-host base (e.g. https://example.com from
    # https://example.com/some/page) for resolving relative links
    my $origurl = $url;
    ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;

    my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
    my $tree = HTML::TreeBuilder->new();

    my $response = $http->get($url);
    if ($response->{success}) {
        $tree->parse($response->{content});
        $tree->eof();
    }

    my @links = $tree->find_by_tag_name('a');

    my $href = "";
    my $firstchar = "";
    my $final = "";

    foreach my $link (@links) {
        $href = $link->attr('href');
        next unless defined $href;

        $firstchar = substr($href, 0, 1);
        $final = "";

        if ($firstchar eq '/') {
            # site-relative link
            $final = $baseurl . $href;
        }
        elsif ($href eq '##') {
            $final = $baseurl;
        }
        elsif ($firstchar eq '#') {
            # fragment-only link; points back into the current site
            $final = $baseurl . '/' . $href;
        }
        else {
            # anything else (typically an absolute URL) is used as-is
            $final = $href;
        }

        store_url($final, $url);
    }

    $depth = $depth - 1;
}
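
# Main program: parse the command-line options, connect to MySQL, then crawl
# either the given --seed URL or every URL already in crawl_queue, and finish
# with a summary of what was enqueued and skipped.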

# autoflush STDOUT so the one-character progress output appears immediately
$| = 1;

print "pandia crawler v0.0.1\n";
print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";

GetOptions("dbhost=s"     => \$dbhost,
           "dbname=s"     => \$dbname,
           "dbusername=s" => \$dbusername,
           "dbpw=s"       => \$dbpw,
           "seed=s"       => \$seed,
           "maxdepth=i"   => \$maxdepth)
    or die("error in command line arguments");

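# Example invocation (all values are placeholders):
#   ./crawler --dbhost=localhost --dbname=pandia --dbusername=pandia \
#             --dbpw=secret --seed=https://example.com/ --maxdepth=2
# Omit --seed to re-crawl everything already in crawl_queue.
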
print "pandia: connecting to $dbname database at $dbhost...";

$dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
$dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
die "pandia: failed to connect to MySQL database: $DBI::errstr" unless $dbh;

print "[OK]\n";

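# crawl the single --seed URL if one was given; otherwise work through every
# URL already stored in crawl_queue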
if ($seed ne "") {
    print "pandia: crawling seed $seed to a maximum depth of $maxdepth";
    sleep 1;
    crawl_url($seed);
    print "[OK]\n";
}
else {
    my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
    $sth->execute();
    my $qlen = $sth->rows;

    print "pandia: crawling queue with length of $qlen to a maximum depth of $maxdepth";
    sleep 1;
    while (my @row = $sth->fetchrow_array()) {
        my $url = $row[0];
        crawl_url($url);
    }
    print "[OK]\n";
}

my $total = $inserts + $skips;

print "pandia: $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
print " - $blacklist_matches blacklist matches\n";
print " - $invalid_scheme_skips URLs skipped due to invalid scheme\n";
print " - $robots_txt_denies URLs skipped due to robots.txt\n";