package Perlcrawl;
use Carp;
use Data::Dumper;
use strict;
use warnings;

use Crawler;
our @ISA = qw(Crawler);

# Only the object-oriented JSON::XS interface is used below.  The functional
# name `from_json' was renamed to `decode_json' in JSON::XS 2.0, so importing
# either name ties the module to one side of that rename.
use JSON::XS ();
use Date::Parse;
use HTTP::Request;
use MIME::Base64;
use LWP::UserAgent;

# Percent-encode a value for use inside a URL query string.
# Kept as a file-private lexical so it does not add a method to the class.
my $escape_query_value = sub {
    my ($text) = @_;

    my $bytes = "$text";
    # Escape the UTF-8 bytes, not the code points, of non-ASCII input.
    utf8::encode($bytes) if utf8::is_utf8($bytes);
    $bytes =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord $1)/ge;

    return $bytes;
};

# Send a request and return the decoded JSON body.
# Dies with the HTTP status line when the request is not successful;
# the JSON decoder croaks on a malformed body.
my $fetch_json = sub {
    my ($user_agent, $request) = @_;

    my $response = $user_agent->request($request);
    if (!$response->is_success) {
        die "Unable to connect to Twitter: " . $response->status_line;
    }

    # The body is treated as UTF-8 encoded JSON text.
    return JSON::XS->new->utf8->decode($response->content);
};

##
# Main loop for a crawl update.
# This is where a resource is crawled, and documents added.
#
# Arguments (the first one is ignored):
#   $self - object providing document_exists() and add_document()
#   $opt  - hash ref of options; the keys read here are 'user' and
#           'password' (sent as HTTP basic auth to the token endpoint)
#           and 'screen name' (the timeline to fetch)
#
# Dies when the screen name is missing, when either HTTP request fails,
# or when a response does not have the expected shape.
sub crawl_update {
    my (undef, $self, $opt) = @_;

    # Log the options, but never write the API secret to the log.
    my %loggable = %{ $opt || {} };
    $loggable{'password'} = '********' if exists $loggable{'password'};
    warn "Options received: ", Dumper(\%loggable), "\n";

    my $screen_name = $opt->{'screen name'};
    if (!defined $screen_name || $screen_name eq '') {
        die "No screen name configured for the Twitter crawl";
    }

    # Set up the user agent.
    my $user_agent = LWP::UserAgent->new();
    $user_agent->timeout(30);

    # Authenticate with Twitter to obtain a bearer token.
    my $token_request = HTTP::Request->new(
        'POST', 'https://api.twitter.com/oauth2/token'
    );
    # Set the correct content type for the post.
    $token_request->content_type(
        'application/x-www-form-urlencoded;charset=UTF-8'
    );
    # Supply the keys as HTTP basic auth credentials.
    $token_request->authorization_basic($opt->{'user'}, $opt->{'password'});
    # Set the message to send.
    $token_request->content('grant_type=client_credentials');

    my $auth  = $fetch_json->($user_agent, $token_request);
    my $token = ref $auth eq 'HASH' ? $auth->{access_token} : undef;
    if (!defined $token || $token eq '') {
        die "Twitter did not return an access token";
    }

    # Ask Twitter for the user's timeline, authorised by the bearer token.
    my $timeline_url =
        "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name="
        . $escape_query_value->($screen_name);
    my $timeline_request = HTTP::Request->new('GET', $timeline_url);
    $timeline_request->header('Authorization' => "Bearer " . $token);

    my $tweets = $fetch_json->($user_agent, $timeline_request);
    if (ref $tweets ne 'ARRAY') {
        die "Unexpected timeline response from Twitter (expected a list)";
    }

    for my $tweet (@{$tweets}) {
        # Prefer the string form of the id: the numeric form can lose
        # precision on a Perl built without 64-bit integers.
        my $tweet_id = defined $tweet->{id_str} ? $tweet->{id_str}
                                                : $tweet->{id};

        my $url = "http://twitter.com/" . $tweet->{user}{screen_name}
                . "/statuses/" . $tweet_id;

        # NOTE(review): the meaning of the second argument (0) is defined
        # by the Crawler base class -- confirm there.
        next if $self->document_exists($url, 0);

        my $content = defined $tweet->{text} ? $tweet->{text} : '';

        # The title is the author's name plus the first 50 characters.
        my $excerpt    = substr($content, 0, 50);
        my $title      = "$tweet->{user}{name}: $excerpt ..";
        my $created_at = str2time($tweet->{created_at});

        print "Adding $title\n";

        $self->add_document(
            content       => $content,
            title         => $title,
            url           => $url,
            type          => "tapp",
            acl_allow     => "Everyone",
            last_modified => $created_at,
        );
    }

    return;
}

##
# Access check for a single search result.
#
# During a user search, `path access' is called against the search results
# before they are shown to the user. This is to check if the user still has
# access to the results.
#
# If this is irrelevant to you, just return 1.
# You'll want to return 0 when:
#   * The document doesn't exist anymore
#   * The user has lost privileges to read the document
#   * .. when you want the document to be filtered from a user search
#     in general.
#
# This crawler always grants access.
sub path_access {
    my (undef, $self, $opt) = @_;

    return 1;
}

1;