Annotation of pandia/crawl.pl, revision 1.2

1.1       snw         1: #!/usr/bin/env perl
                      2: 
                      3: # 
1.2     ! snw         4: # $Id: crawl.pl,v 1.1.1.1 2025/06/23 23:17:08 snw Exp $
1.1       snw         5: #  Copyright (C) 2025 Coherent Logic Development LLC
                      6: #
                      7: # Author: Serena Willis <snw@coherent-logic.com>
                      8: #
                      9: # Licensed AGPL-3.0
                     10: #
1.2     ! snw        11: # $Log: crawl.pl,v $
        !            12: # Revision 1.1.1.1  2025/06/23 23:17:08  snw
        !            13: # Initial commit
        !            14: #
1.1       snw        15: #
                     16: 
use Getopt::Long;
use HTTP::Tiny;
use HTML::TreeBuilder;
use URI;
use DBI;
use WWW::RobotRules;
use LWP::Simple qw(get);

# Robot-exclusion rule cache; WWW::RobotRules remembers every robots.txt
# it has parsed, keyed by host.
my $rules = WWW::RobotRules->new('pandia-crawler/0.0.1');

# Shared crawler state (file-scoped lexicals; the subs below read and
# update these).
my $dbh = "";       # DBI handle, assigned after connect
my $dsn = "";       # DBI data source string
my $skips = 0;      # URLs rejected by the crawl_queue INSERT (e.g. duplicates)
my $inserts = 0;    # URLs successfully enqueued
my $seed = "";      # optional --seed URL
my $depth = 0;      # current recursion depth

# Option targets were previously undeclared package globals (the script
# runs without 'use strict'), so omitting a switch left them undef and
# produced warnings wherever they were interpolated or compared.
# Declare them with behavior-neutral defaults: empty strings interpolate
# the same as undef did, and --maxdepth defaults to 0 (no recursion).
my $maxdepth = 0;
my $dbhost = "";
my $dbname = "";
my $dbusername = "";
my $dbpw = "";
        !            33: sub store_url {
        !            34:     my ($url) = @_;
        !            35: 
        !            36:     if($url ne "" && length($url) <= 255) {
        !            37:        print ".";
        !            38:        my $ins = $dbh->prepare("INSERT INTO crawl_queue (url) VALUES (?)");
        !            39:            
        !            40:        $ins->execute($url) or $skips = $skips + 1;
        !            41:        $inserts = $inserts + 1;            
        !            42:        if($depth < $maxdepth) {
        !            43:            $depth = $depth + 1;
        !            44:            crawl_url($url);
        !            45:        }
        !            46:     }     
        !            47: }
        !            48: 
        !            49: sub crawl_url {
        !            50:     my ($url) = @_;
        !            51: 
        !            52:     my $u = URI->new($url);
        !            53: 
        !            54:     if ($u->scheme ne "http" && $u->scheme ne "https") {
        !            55:        return;
        !            56:     }
        !            57: 
        !            58:     my $sth = $dbh->prepare("SELECT url_domain FROM blacklist WHERE url_domain=?");
        !            59:     $sth->execute($u->host);
        !            60:     if($sth->rows > 0) {
        !            61:        return;
        !            62:     }
        !            63:     
        !            64:     my $robots_url = $u->scheme . '://' . $u->host . "/robots.txt";
        !            65: 
        !            66:     my $robots_txt = get $robots_url;
        !            67:     $rules->parse($robots_url, $robots_txt) if defined $robots_txt;
        !            68: 
        !            69:     if(!$rules->allowed($url)) {
        !            70:        return;
        !            71:     }
        !            72:     
        !            73:     my $origurl = $url;
        !            74:     ($baseurl) = $origurl =~ m! (.+?\w) (?: /|\z) !x;
        !            75:     
        !            76:     my $http = HTTP::Tiny->new(agent => "pandia-crawler/0.0.1");
        !            77:     my $tree = HTML::TreeBuilder->new();
        !            78: 
        !            79:     my $response = $http->get($url);
        !            80:     $tree->parse($response->{content});
        !            81: 
        !            82:     my @links = $tree->find_by_tag_name('a');
        !            83: 
        !            84:     my $href = "";
        !            85:     my $firstchar = "";
        !            86:     my $final = "";
        !            87:     
        !            88:     foreach my $link (@links) {
        !            89:         $href = $link->attr('href');
        !            90:        $firstchar = substr($href, 0, 1);
        !            91:        $final = "";
        !            92: 
        !            93:        if($firstchar eq '/') {
        !            94:            $final = $baseurl . $href;     
        !            95:        }
        !            96:        elsif($href eq '##') {
        !            97:            $final = $baseurl;
        !            98:        }
        !            99:        elsif($firstchar eq '#') {
        !           100:            $final = $baseurl . '/' . $href;
        !           101:        }
        !           102:        else {
        !           103:            $final = $href;
        !           104:        }
        !           105:            
        !           106:        store_url($final);
        !           107:     }
        !           108: 
        !           109:     $depth = $depth - 1;
        !           110: }
        !           111: 
        !           112: $| = 1;
        !           113: 
        !           114: print "pandia crawler v0.0.1\n";
        !           115: print " Copyright (C) 2025 Coherent Logic Development LLC\n\n";
        !           116: 
        !           117: GetOptions("dbhost=s" => \$dbhost,
        !           118:            "dbname=s" => \$dbname,
        !           119:            "dbusername=s" => \$dbusername,
        !           120:            "dbpw=s" => \$dbpw,
        !           121:           "seed=s" => \$seed,
        !           122:           "maxdepth=n" =>\$maxdepth)
        !           123:     or die("error in command line arguments");
        !           124: 
        !           125: print "pandia:  connecting to $dbname database at $dbhost...";
        !           126: 
        !           127: $dsn = "DBI:mysql:database=$dbname;host=$dbhost;port=3306;mysql_connect_timeout=5;";
        !           128: $dbh = DBI->connect($dsn, $dbusername, $dbpw, {RaiseError => 0, PrintError => 0});
        !           129: die "pandia:  failed to connect to MySQL database: DBI->errstr()" unless $dbh;
        !           130: 
        !           131: print "[OK]\n";
        !           132: 
        !           133: if($seed ne "") {
        !           134:     print "pandia:  crawling seed $seed to a maximum depth of $maxdepth";
        !           135:     sleep 1;
        !           136:     crawl_url($seed);
        !           137:     print "[OK]\n";
        !           138: }
        !           139: else {
        !           140:     my $sth = $dbh->prepare("SELECT url FROM crawl_queue");
        !           141:     $sth->execute();
        !           142:     my $qlen = $sth->rows;
        !           143: 
        !           144:     
        !           145:     print "pandia:  crawling queue with length of $qlen to a maximum depth of $maxdepth";
        !           146:     sleep 1;
        !           147:     while (my @row = $sth->fetchrow_array()) {
        !           148:        my $url = @row[0];
        !           149:        crawl_url($url);
        !           150:     }
        !           151:     print "[OK]\n";
        !           152: }
        !           153: 
        !           154: 
        !           155: my $total = $inserts + $skips;
1.1       snw       156: 
1.2     ! snw       157: print "pandia:  $inserts URL(s) enqueued for analysis; $skips skipped [$total URL(s) seen this run]\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>