Annotation of pandia/crawler, revision 1.1
1.1     ! snw         1: #!/usr/bin/env perl
        !             2: 
        !             3: # 
        !             4: # $Id: crawl.pl,v 1.2 2025/06/25 03:10:01 snw Exp $
        !             5: #  Copyright (C) 2025 Coherent Logic Development LLC
        !             6: #
        !             7: # Author: Serena Willis <snw@coherent-logic.com>
        !             8: #
        !             9: # Licensed AGPL-3.0
        !            10: #
        !            11: # $Log: crawl.pl,v $
        !            12: # Revision 1.2  2025/06/25 03:10:01  snw
        !            13: # Initial working crawler
        !            14: #
        !            15: # Revision 1.1.1.1  2025/06/23 23:17:08  snw
        !            16: # Initial commit
        !            17: #
        !            18: #
        !            19: 
        !            20: use Getopt::Long;
        !            21: use HTTP::Tiny;
        !            22: use HTML::TreeBuilder;
        !            23: use URI;
        !            24: use DBI;
        !            25: use WWW::RobotRules;
        !            26: my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');
        !            27: use LWP::Simple qw(get);
        !            28: 
        !            29: my $dbh = "";
        !            30: my $dsn = "";
        !            31: my $skips = 0;
        !            32: my $inserts = 0;
        !            33: my $seed = "";
        !            34: my $depth = 0;
        !            35: 
        !            36: sub store_url {
        !            37:     my ($url) = @_;
        !            38: 
        !            39:     if($url ne "" && length($url) <= 255) {
        !            40:        print ".";
        !            41:        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
        !            42:            
        !            43:        $ins->execute($url) or $skips = $skips + 1;
        !            44:        $inserts = $inserts + 1;            
        !            45:        if($depth < $maxdepth) {
        !            46:            $depth = $depth + 1;
        !            47:            crawl_url($url);
        !            48:        }
        !            49:     }     
        !            50: }
        !            51: 
        !            52: sub crawl_url {
        !            53:     my ($url) = @_;
        !            54: 
        !            55:     my $u = URI->new($url);
        !            56: 
        !            57:     if ($u->scheme ne "http" && $u->scheme ne "https") {
        !            58:        return;
        !            59:     }
        !            60: 
        !            61:     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
        !            62:     $sth->execute($u->host);
        !            63:     if($sth->rows > 0) {
        !            64:        return;
        !            65:     }
        !            66:     
        !            67:     my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
        !            68: 
        !            69:     my $robots_txt = get $robots_url;
        !            70:     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
        !            71: 
        !            72:     if(!$rules->allowed($url)) {
        !            73:        return;
        !            74:     }
        !            75:     
        !            76:     my $origurl = $url;
        !            77:     ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
        !            78:     
        !            79:     my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
        !            80:     my $tree = HTML::TreeBuilder->new();
        !            81: 
        !            82:     my $response = $http->get($url);
        !            83:     $tree->parse($response->{content});
        !            84: 
        !            85:     my @links = $tree->find_by_tag_name('a');
        !            86: 
        !            87:     my $href = "";
        !            88:     my $firstchar = "";
        !            89:     my $final = "";
        !            90:     
        !            91:     foreach my $link (@links) {
        !            92:         $href = $link->attr('href');
        !            93:        $firstchar = substr($href, 0, 1);
        !            94:        $final = "";
        !            95: 
        !            96:        if($firstchar eq '/') {
        !            97:            $final = $baseurl . $href;     
        !            98:        }
        !            99:        elsif($href eq '##') {
        !           100:            $final = $baseurl;
        !           101:        }
        !           102:        elsif($firstchar eq '#') {
        !           103:            $final = $baseurl . '/' . $href;
        !           104:        }
        !           105:        else {
        !           106:            $final = $href;
        !           107:        }
        !           108:            
        !           109:        store_url($final);
        !           110:     }
        !           111: 
        !           112:     $depth = $depth - 1;
        !           113: }
        !           114: 
        !           115: $| = 1;
        !           116: 
        !           117: print "pandia crawler v0.0.1\n";
        !           118: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
        !           119: 
        !           120: GetOptions("dbhost=s" => \$dbhost,
        !           121:            "dbname=s" => \$dbname,
        !           122:            "dbusername=s" => \$dbusername,
        !           123:            "dbpw=s" => \$dbpw,
        !           124:           "seed=s" => \$seed,
        !           125:           "maxdepth=n" =>\$maxdepth)
        !           126:     or die("error in command line arguments");
        !           127: 
        !           128: print "pandia:  connecting to $dbname database at $dbhost...";
        !           129: 
        !           130: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
        !           131: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
        !           132: die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
        !           133: 
        !           134: print "[OK]\n";
        !           135: 
        !           136: if($seed ne "") {
        !           137:     print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
        !           138:     sleep 1;
        !           139:     crawl_url($seed);
        !           140:     print "[OK]\n";
        !           141: }
        !           142: else {
        !           143:     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
        !           144:     $sth->execute();
        !           145:     my $qlen = $sth->rows;
        !           146: 
        !           147:     
        !           148:     print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
        !           149:     sleep 1;
        !           150:     while (my @row = $sth->fetchrow_array()) {
        !           151:        my $url = @row[0];
        !           152:        crawl_url($url);
        !           153:     }
        !           154:     print "[OK]\n";
        !           155: }
        !           156: 
        !           157: 
        !           158: my $total = $inserts + $skips;
        !           159: 
        !           160: print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>