package Perlcrawl;

use strict;
use warnings;

use Carp;
use Data::Dumper;

use Crawler;
our @ISA = qw(Crawler);

use LWP::Simple qw(get);
use JSON::XS qw(from_json);
use Date::Parse;

##
# Main loop for a crawl update.
# This is where a resource is crawled, and documents added.
#
# Fetches the JSON user timeline for the configured screen name and adds
# one document per timeline entry that the crawler does not already know.
#
# Arguments (unpacked from @_):
#   (first)  - ignored
#   $self    - crawler object; document_exists() and add_document() are
#              called on it
#   $opt     - hash ref of options; the 'screen name' entry selects the
#              timeline to fetch
#
# Croaks when the 'screen name' option is missing or empty, when the
# timeline cannot be fetched, or when the response is not a JSON array.
sub crawl_update {
    my (undef, $self, $opt) = @_;

    warn "Options received: ", Dumper($opt), "\n";

    # Fail early with a clear message instead of requesting ".json" for
    # an undefined/empty screen name.
    my $screen_name = $opt->{'screen name'};
    croak "crawl_update: missing 'screen name' option"
        unless defined($screen_name) && length($screen_name);

    my $jurl = "http://twitter.com/statuses/user_timeline/"
        . $screen_name . ".json";

    # LWP::Simple::get returns undef on failure; report that directly
    # rather than letting the JSON decoder choke on undef.
    my $json = get($jurl);
    croak "crawl_update: could not fetch $jurl" unless defined $json;

    my $t = from_json($json);

    # The loop below dereferences the result as an array; anything else
    # (e.g. a JSON object) is reported explicitly.
    croak "crawl_update: expected a JSON array from $jurl"
        unless ref($t) eq 'ARRAY';

    for my $usr (@{$t}) {
        my $content = $usr->{text};
        my $url     = "http://twitter.com/"
            . "$usr->{user}{screen_name}/statuses/$usr->{id}";

        # Skip entries the crawler already has.
        # NOTE(review): meaning of the second argument (0) is defined by
        # Crawler::document_exists -- confirm there.
        next if $self->document_exists($url, 0);

        # Title is the author's name plus the first 50 characters of the text.
        my $substr     = substr($content, 0, 50);
        my $title      = "$usr->{user}{name}: $substr ..";
        my $created_at = str2time($usr->{created_at});

        print "Adding $title\n";
        $self->add_document((
            content       => $content,
            title         => $title,
            url           => $url,
            type          => "tapp",
            acl_allow     => "Everyone",
            last_modified => $created_at,
        ));
    }

    return;
}

##
# Access check for a single search result.
#
# Arguments (unpacked from @_):
#   (first)  - ignored
#   $self    - crawler object (unused here)
#   $opt     - options (unused here)
#
# Always returns 1.
sub path_access {
    my (undef, $self, $opt) = @_;

    # During a user search, `path access' is called against the search results
    # before they are shown to the user. This is to check if the user still has
    # access to the results.
    #
    # If this is irrelevant to you, just return 1.
    # You'll want to return 0 when:
    #   * The document doesn't exist anymore
    #   * The user has lost privileges to read the document
    #   * .. when you want the document to be filtered from a user search
    #     in general.

    return 1;
}

1;