package Rss2Html::Feed; =head1 Name Rss2Html::Feed - a feed =head1 Description This class represents a single feed. It currently has only one public method, update(), which updates the feed. =cut use strict; use warnings; use v5.14; use Moose; use Try::Tiny; use Cache::Memcached; use Encode qw(decode_utf8); use HTML::Entities; use HTTP::Date; use XML::Atom::Client; use XML::RAI; with 'MooseX::Log::Log4perl'; has 'id' => ( is => 'rw', ); has 'url' => ( is => 'rw', ); has 'active' => ( is => 'rw', ); has 'last_update' => ( is => 'rw', ); has 'title' => ( is => 'rw', ); has 'type' => ( is => 'rw', ); has 'allow_img' => ( is => 'rw', ); has 'expire' => ( is => 'rw', ); has 'dbh' => ( is => 'rw', ); my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']); =head1 Methods =cut =head2 update Update the feed. This is unconditional. The logic to decide which feeds must be updated is in L. =cut sub update { my ($self) = @_; my $dbh = $self->dbh; my $now = int(time()); if ($self->{type} eq 'rss') { local $@ = "unknown error"; # XML::RAI has a parse_uri method, but we explicitely fetch the URI to get better diagnostics. # Probably should do this for atom, too, so we can autodetect the feed type. my $ua = LWP::UserAgent->new; $ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)"); my $response = $ua->get($self->{url}); if ($response->code != 200) { $self->log->warn("GET $self->{url} failed: " . $response->status_line); return; } $self->log->info("$self->{url} has content type " . $response->content_type); my $rai = eval {XML::RAI->parse_string($response->content) }; if ($rai) { $rai->time_format('EPOCH'); if ($rai->channel->title ne $self->{title}) { $self->log->info("set title"); $dbh->do("update feeds set title=? where id=?", {}, $rai->channel->title, $self->{id}); } $self->log->info("checking " . scalar(@{$rai->items}) . " items in db"); for my $item ( @{$rai->items} ) { my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $item->link); if ($item_db) { $self->log->info("item $item_db->{id} seen"); $dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id}); } else { $self->log->info("new"); my $issued = $item->issued // $now; if ($issued < $self->{last_update} || $issued > $now) { # $issued is clearly wrong. Could be any time between $self->{last_update} and $now. # Choose a random time in the interval. my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update})); $self->log->info("fixing up issued time: $issued -> $guess_issued"); $issued = $guess_issued; } my $title = $item->title; $self->log->info("item " . $item->link . " new"); if ($self->{decode_title}) { $self->log->debug("decoding title $title"); $title = decode_entities($title); $self->log->debug("decoded title $title"); } $dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)", {}, $title, $item->link, $item->content, $issued, $now, $self->{id} ); $self->invalidate_item_info; } } } else { $self->log->error("error getting $self->{url}: $@"); } } elsif ($self->{type} eq 'atom') { my $api = XML::Atom::Client->new(); my $atomfeed; try { # This doesn't work. XML::Atom::Client EXITS on error! $atomfeed = $api->getFeed($self->{url}); } catch { $self->log->error("error fetching $self->{url}: $@"); }; if ($atomfeed) { my @items = $atomfeed->entries; for my $item (@items) { my $link = $self->atom_main_link($item); my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link); if ($item_db) { $self->log->info("item $item_db->{id} seen"); $dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id}); } else { $self->log->info("new"); my $title = decode_utf8($item->title); # XXX - Workaround? my $content = $item->content; my $summary = $item->summary; my $body = $content ? $content->body : defined($summary) ? decode_utf8($summary) : undef; $self->log->info("item " . $link . " new"); my $issued = str2time($item->updated) // $now; if ($issued < $self->{last_update} || $issued > $now) { # $issued is clearly wrong. Could be any time between $self->{last_update} and $now. # Choose a random time in the interval. my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update})); $self->log->info("fixing up issued time: $issued -> $guess_issued"); $issued = $guess_issued; } $dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)", {}, $title, $link, $body, $issued, $now, $self->{id} ); $self->invalidate_item_info; } } } } else { $self->log->info("ignoring feed type $self->{type} of feed #$self->{id}: $self->{title}"); return; } $dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id}); } sub atom_main_link { my ($self, $item) = @_; # XXX should probably be a method of $item for my $link ($item->link) { # workaround for https://github.com/miyagawa/xml-atom/issues/16 my $href = $link->href // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'href'); my $rel = $link->rel // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'rel'); my $type = $link->type // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'type'); if (!$href) { $self->log->warn("href undefined, skip"); next; } $self->log->info("checking href ", $href, "\n"); if (defined $rel && $rel ne 'alternate') { $self->log->info("rel ", $rel, " unexpected, skip"); next; } if (defined $type && $type ne 'text/html') { $self->log->info("type ", $type, " unexpected, skip\n"); next; } $self->log->info("match!\n"); return $href; } } sub invalidate_item_info { my ($self) = @_; my $dbh = $self->dbh; my $users = $dbh->selectcol_arrayref("select distinct username from read"); for my $user (@$users) { $mcd->delete($self->item_info_key($user)); } } sub item_info_key { my ($self, $user) = @_; return "rss2html/item_info/" . $user . "/" . $self->id; } #--- __PACKAGE__->meta->make_immutable; 1; # vim: tw=132