#!/usr/bin/perl
use strict;
use warnings;

package Perlcrawl;

use Crawler;
our @ISA = qw(Crawler);

use LWP;
use LWP::RobotUA;
use URI;
use URI::Escape;
use HTML::TokeParser;
use HTML::LinkExtor;
use Date::Parse;
use Carp;
use Data::Dumper;
use Readonly;
use sdMimeMap;

use constant bot_name  => "sdbot/0.1";
use constant bot_email => "support\@searchdaimon.com";
use constant timeout   => 4;
use constant delay     => 1;
use constant verbose   => 0;
use constant max_size  => 26214400;    # 26214400 bytes = 25 MB

# Tags whose links we never follow.
Readonly::Hash my %LINK_IGNORE => map { $_ => 1 }
    qw(td script table form head link background);

our $skipDynamic     = 0;
our $hit_count       = 0;
our @starting_urls;
our $download_images = 0;
our $lasterror       = "";
our $last_time_anything_said;
our $last_time_anything_muttered;
our %points_to;
our %notable_url_error;    # URL => error message
our %seen_url_before;
our @exclusionsUrlParts;
our $iisspecial = 0;
our @exclusionQueryParts;
our @schedule;
our $crawler;

# Check that the crawler can reach $opt->{resource} with the supplied
# credentials. Returns 1 on success, 0 otherwise.
sub path_access {
    my (undef, $self, $opt) = @_;

    my $user  = $opt->{'user'};
    my $passw = $opt->{'password'};
    my $url   = $opt->{'resource'};

    my $robot = LWP::RobotUA->new(
        agent      => bot_name,
        from       => bot_email,
        keep_alive => '1',
    );
    $robot->delay(0);                     # RobotUA delays are in minutes; use "$seconds/60" to convert
    $robot->timeout(timeout);
    $robot->requests_redirectable([]);    # empty list: do not follow redirects
    $robot->protocols_allowed(['http', 'https']);    # disable all other schemes

    my $req = HTTP::Request->new(HEAD => $url);

    print "Authenticating : $user at $url\n";
    my $response = $robot->request($req);

    if (($response->code == 401) && ($user)) {
        if ($response->www_authenticate =~ m/basic/i) {
            $req->authorization_basic($user, $passw);
        }
        elsif (($response->www_authenticate =~ m/negotiate/i)
            || ($response->www_authenticate =~ m/ntlm/i)) {
            my $server = URI->new($url)->host_port;    # "host:port" of the target server
            print "Trying to authenticate by NTLM to server $server\n";
            $robot->credentials($server, '', $user, $passw);
        }
        else {
            print "Unknown authentication method \""
                . $response->www_authenticate . "\".\n";
        }

        # Rerun the request with credentials attached.
        $response = $robot->request($req);
    }

    if ($response->is_success) {
        return 1;
    }

    print "Not authenticated : $user at $url \n";
    return 0;
}

# Crawl every URL in $opt->{url} (a ";"-separated list) and feed the
# documents to the framework's Crawler object ($self).
sub crawl_update {
    my (undef, $self, $opt) = @_;
    $crawler = $self;

    my $user  = $opt->{'user'};
    my $passw = $opt->{'password'};
    my $urls  = $opt->{'url'};

    my @urlList = split /;/, $urls;

    my @exclusionsUrlPart  = qw();    # see the SharePoint crawler for how to use this
    my @exclusionQueryPart = qw();    # see the SharePoint crawler for how to use this

    if (!exists $opt->{delay}) {
        $opt->{delay} = delay;
    }
    $download_images = $opt->{download_images};

    print "Name: " . bot_name . " mail: " . bot_email . "\n";

    my $robot = LWP::RobotUA->new(
        agent      => bot_name,
        from       => bot_email,
        keep_alive => '1',
    );
    $robot->delay($opt->{delay} / 60);    # RobotUA delays are in minutes
    $robot->timeout(timeout);
    $robot->max_size(max_size);
    $robot->requests_redirectable([]);    # empty list: do not follow redirects
    $robot->protocols_allowed(['http', 'https']);    # disable all other schemes

    say(bot_name . " (" . bot_email . ") starting at ", scalar(localtime), "\n");

    process_starting_urls(@urlList);

    foreach my $starting_url (@urlList) {

        # Log in to WordPress with an http POST, if a login url is configured.
        if ($opt->{'loginurl'}) {
            $robot->post($opt->{'loginurl'},
                [ 'log' => $opt->{'user'}, 'pwd' => $opt->{'password'} ]);
        }

        # Continue as normal.
        schedule($starting_url);
        main_loop($robot, $user, $passw);
        report() if $hit_count;
    }

    print "################################################\n";
    print "hit_count: " . $hit_count . ", error: " . $lasterror . "\n";

    if ($hit_count == 0) {
        die($lasterror);
    }
}
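# Illustrative invocation (a sketch, not part of the module): the surrounding
# Searchdaimon framework normally supplies the Crawler object and the option
# hash, so the names below ($crawler_obj and the option values) are
# assumptions for demonstration only.
#
#   my %opt = (
#       url             => 'http://example.com/;http://example.org/docs/',
#       user            => '',
#       password        => '',
#       delay           => 1,    # seconds between requests
#       download_images => 0,
#   );
#   Perlcrawl->crawl_update($crawler_obj, \%opt);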
"\n"; if ($hit_count == 0) { die($lasterror); } } sub setExclusionUrlParts { @exclusionsUrlParts = @_; } sub setIISpecial { $iisspecial = 1; } sub setExclusionQueryParts { @exclusionQueryParts = @_; } sub main_loop { my ($robot,$user, $passw) = @_; while( schedule_count( ) && $crawler->continue ) { my $url = next_scheduled_url( ); if( near_url( $url ) ) { process_near_url($url,$robot,$user, $passw); } else { mutter ( "Far url $url" ); } } return; } sub say { # Print timestamps: print "[" . localtime( time() ) . "] ", @_; } sub mutter { # Add timestamps as needed: unless(time( ) == ($last_time_anything_muttered || 0)) { $last_time_anything_muttered = time( ); unshift @_, "[T$last_time_anything_muttered = " . localtime($last_time_anything_muttered) . "]\n"; } print @_ if verbose; print "\n" if verbose; $lasterror = join(" ",@_); } sub near_url { # Is the given URL "near"? #my $url = $_[0]; my $url = URI->new($_[0]); foreach my $starting_uri (@starting_urls) { my $noObjectUrl = $starting_uri->path; if (!(($url->scheme eq 'http') || ($url->scheme eq 'https') )) { next; } if( ( $url->host eq $starting_uri->host ) && ( $url->path =~ /$noObjectUrl/ ) ) { mutter(" So $url is near"); return 1; } } mutter(" So $url is far"); return 0; } sub process_starting_urls { foreach my $url (@_) { my $u = URI->new($url); $u = $u->canonical; push @starting_urls, $u; } #schedule($starting_urls[0]); #return; } sub refer { my $url = $_[0]; my $links_to_it = $points_to{$url}; # the set (hash) of all things that link to $url return( ) unless $links_to_it and keys %$links_to_it; my @urls = keys %$links_to_it; # in no special order! mutter " For $url, Referer => $urls[0]"; return "Referer" => $urls[0]; } sub note_error_response { my $response = $_[0]; return unless $response->is_error; my $code = $response->code; my $url = URI->new( $response->request->uri )->canonical; if( $code == 404 or $code == 410 or $code == 500 ) { mutter(sprintf "Noting {%s} error at %s\n",$response->status_line, $url ); $notable_url_error{$url} = $response->status_line; } else { #skriver feilmelding, og lagrer den i $! mutter(sprintf "Can't get url %s: %s\n",$url, $response->status_line ); } return; } sub consider_response { # Return 1 if it's successful, otherwise return 0 my $response = $_[0]; mutter("Consider ", $response->status_line); return 1 if $response->is_success; if($response->is_redirect) { my $to_url = $response->header('Location'); if(defined $to_url and length $to_url and $to_url !~ m/\s/ ) { my $from_url = $response->request->uri; $to_url = URI->new_abs($to_url, $from_url); mutter("Noting redirection\n from $from_url\n"," to $to_url"); note_link_to( $from_url => $to_url ); } } else { note_error_response($response); } return 0; } sub addOk { my $url; my $restricted; ($url) = @_; my $uri = URI->new($url); foreach $restricted (@exclusionsUrlParts) { $uri->path(); if ($uri->path() =~ /$restricted/) { return 0; } } return 1; } sub process_near_url { my ($url,$robot,$user, $passw) = @_; mutter("process_near_url(url='$url', robot=$robot, user='user')"); my $url_normalized = $crawler->normalize_http_url($url); my $keep_doc = addOk($url_normalized); say("Fetching " . $hit_count . ":" . schedule_count() . " \"$url\"\n"); my $req = HTTP::Request->new(GET => $url); my $response = $robot->request($req); if (($response->code == 401) && ($user)) { print "asked to authenticate by method: " . $response->www_authenticate . 
"\n"; if ($response->www_authenticate =~ m/basic/i) { $req->authorization_basic($user, $passw); } elsif (($response->www_authenticate =~ m/negotiate/i) || ($response->www_authenticate =~ m/ntlm/i)) { $url =~ m/http:\/\/([^\/]+)/i; my $server = $1 . ":80"; print "Trying to authenticate by NTLM to server $server\n"; $robot->credentials($server, '', $user, $passw); } else { die("Unknown authentication method \"" . $response->www_authenticate . "\"."); } #rerun the request $response = $robot->request($req); } return unless consider_response($response); if (!addOk($url_normalized)) { extract_links_from_response($response) if $response->content_type eq 'text/html'; return; } my $ct = mapMimeType($response->content_type); my $title = ""; if($response->content_type ne 'text/html') { $title = substr($url, rindex($url, "/")+1); $title = uri_unescape($title); } if (!$crawler->document_exists($url, str2time($response->header('Last-Modified') || 0), length($response->content))) { # do a basic, but exspensiv regex to see if ther is a robot noindex metatag if (($response->content_type eq 'text/html') && ($response->content =~ //i)) { print "Skiping indexint of $url_normalized because of Robots meta-tag restriction.\n"; } else { $crawler->add_document( url => $url_normalized, title => $title, content => $response->content, last_modified => str2time($response->header('Last-Modified')), type => $ct, acl_allow => "Everyone", ); #print "Length ", length($response->content); } } extract_links_from_response($response) if $response->content_type eq 'text/html' && $response->content !~ //i; $hit_count++; 1; } sub extract_links_from_response { my $response = $_[0]; my $count = 0; my $base = URI->new( $response->base )->canonical; my $page_url = URI->new( $response->request->uri ); my $page_parser = HTML::LinkExtor->new(undef, $base); $page_parser->parse($response->as_string)->eof; my @links = $page_parser->links; for my $l (@links) { next if $LINK_IGNORE{$l->[0]}; if ($l->[1] eq 'href') { note_link_to($page_url, $l->[2]); $count++; } elsif ($l->[0] eq 'img' && $l->[1] eq 'src') { note_link_to($page_url, $l->[2]) if $download_images; } elsif ($l->[0] eq 'iframe') { note_link_to($page_url, $l->[2]); $count++; } else { print "INFO: Unknown link type ignored: ", Dumper($l); } } $count--; print "Exstracted $count from $page_url\n"; } sub note_link_to { my($from_url, $to_url) = @_; $points_to{ $to_url }{ $from_url } = 1; if (($skipDynamic == 1) && ($to_url =~ /\?/)) { printf("skipping dynamic url $to_url\n"); return; } $to_url =~ s/\#.*//; # strip internal referanses if ($from_url eq $to_url) { print "Url points to self\n"; return; } if ( near_url( $to_url ) ) { mutter("Noting link\n from $from_url\n to $to_url\n"); schedule($to_url); } return; } sub next_scheduled_url { my $url = splice @schedule, rand(@schedule), 1; mutter("\nnext_scheduled_url: Pulling from schedule: ", $url || "[nil]","\n with ", scalar(@schedule)," items left in schedule.\n"); return $url; } sub schedule_count { return scalar @schedule } sub modifyQueryPart { my ($u) = @_; my $exclusiontags; my $query; my $qs = $u->query(); if (!$qs) { return; } my @newqs; my @queyTags = split /&/, $qs; if (!@exclusionQueryParts) { return; } foreach $query (@queyTags) { my @tags = split /=/, $query; my $exclude = 0; foreach $exclusiontags (@exclusionQueryParts) { if ($exclusiontags eq $tags[0]) { $exclude = 1; } } if (!$exclude) { push @newqs, $tags[0]; push @newqs, $tags[1]; } } if (@newqs) { $u->query_form(@newqs); } else { $u->query(undef); } return $u } sub 
# Add urls to the schedule, after canonicalizing and deduplicating them.
sub schedule {
    foreach my $url (@_) {
        my $u = ref($url) ? $url : URI->new($url);
        $u = $u->canonical;    # force canonical form

        next
            unless 'http' eq ($u->scheme || '')
            or 'https' eq ($u->scheme || '');

        $u->host(regularize_hostname($u->host()));

        # Treat ".../default.aspx" the same as the directory itself.
        if (lc($u->as_string) =~ /default\.aspx/) {
            $u = URI->new(
                substr($u->as_string, 0, rindex(lc($u->as_string), "default.aspx")));
        }

        modifyQueryPart($u);

        if ($iisspecial) {
            # IIS paths are case insensitive: lowercase everything but the query.
            my $qs = $u->query();
            $u = URI->new(lc($u));
            $u->query($qs);
        }

        if ($seen_url_before{ $u->as_string }++) {
            mutter(" Skipping the already-seen $u\n");
        }
        else {
            mutter(" Scheduling $u\n");
            push @schedule, $u;
        }
    }
    return;
}

sub regularize_hostname {
    my $host = lc $_[0];
    $host =~ s/\.+/\./g;    # foo..com => foo.com
    $host =~ s/^\.//;       # .foo.com => foo.com
    $host =~ s/\.$//;       # foo.com. => foo.com
    return 'localhost' if $host =~ m/^0*127\.0+\.0+\.0*1$/;
    return $host;
}

# Print a summary of the crawl, including all notable link errors.
sub report {
    say(
        "Ending at ", scalar(localtime), " after ", time() - $^T,
        "s of runtime and $hit_count hits.\n",
    );

    unless (keys %notable_url_error) {
        say("No bad links seen!\n");
        return;
    }

    say("BAD LINKS SEEN:\n");
    foreach my $url (sort keys %notable_url_error) {
        say("\n$url\n Error: $notable_url_error{$url}\n");
        foreach my $linker (sort keys %{ $points_to{$url} }) {
            say(" < $linker\n");
        }
    }
    return;
}

1;
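# Illustrative input/output for regularize_hostname, derived from the
# substitutions above:
#
#   regularize_hostname('Foo..Example.COM.');   # => 'foo.example.com'
#   regularize_hostname('127.0.0.1');           # => 'localhost'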