rss2html/lib/Rss2Html/Feed.pm

229 lines
6.7 KiB
Perl
Raw Normal View History

package Rss2Html::Feed;
=head1 Name
Rss2Html::Feed - a feed
=head1 Description
This class represents a single feed.
It currently has only one public method, update(), which updates the
feed.
=cut
use strict;
use warnings;
use v5.14;
use Moose;
2021-05-12 23:12:00 +02:00
use Try::Tiny;
use Cache::Memcached;
use Encode qw(decode_utf8);
use HTTP::Date;
use XML::Atom::Client;
use XML::RAI;
with 'MooseX::Log::Log4perl';
has 'id' => (
is => 'rw',
);
has 'url' => (
is => 'rw',
);
2017-10-02 15:26:48 +02:00
has 'active' => (
is => 'rw',
);
has 'last_update' => (
is => 'rw',
);
has 'title' => (
is => 'rw',
);
has 'type' => (
is => 'rw',
);
has 'allow_img' => (
is => 'rw',
);
has 'expire' => (
is => 'rw',
);
has 'dbh' => (
is => 'rw',
);
my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']);
=head1 Methods
=cut
=head2 update
Update the feed. This is unconditional. The logic to decide which feeds
must be updated is in L<Rss2Html::FeedList::update_adaptive>.
=cut
sub update {
my ($self) = @_;
my $dbh = $self->dbh;
my $now = int(time());
if ($self->{type} eq 'rss') {
local $@ = "unknown error";
# XML::RAI has a parse_uri method, but we explicitely fetch the URI to get better diagnostics.
# Probably should do this for atom, too, so we can autodetect the feed type.
my $ua = LWP::UserAgent->new;
$ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)");
my $response = $ua->get($self->{url});
if ($response->code != 200) {
$self->log->warn("GET $self->{url} failed: " . $response->status_line);
return;
}
$self->log->info("$self->{url} has content type " . $response->content_type);
my $rai = eval {XML::RAI->parse_string($response->content) };
if ($rai) {
$rai->time_format('EPOCH');
if ($rai->channel->title ne $self->{title}) {
$self->log->info("set title");
$dbh->do("update feeds set title=? where id=?", {},
$rai->channel->title, $self->{id});
}
$self->log->info("checking " . scalar(@{$rai->items}) . " items in db");
for my $item ( @{$rai->items} ) {
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $item->link);
if ($item_db) {
$self->log->info("item $item_db->{id} seen");
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
} else {
$self->log->info("new");
my $issued = $item->issued // $now;
if ($issued < $self->{last_update} || $issued > $now) {
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
# Choose a random time in the interval.
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
$self->log->info("fixing up issued time: $issued -> $guess_issued");
$issued = $guess_issued;
}
$self->log->info("item " . $item->link . " new");
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
{},
$item->title, $item->link, $item->content, $issued, $now, $self->{id}
);
$self->invalidate_item_info;
}
}
} else {
2017-10-02 15:26:48 +02:00
$self->log->error("error getting $self->{url}: $@");
}
} elsif ($self->{type} eq 'atom') {
my $api = XML::Atom::Client->new();
2021-05-12 23:12:00 +02:00
my $atomfeed;
try {
# This doesn't work. XML::Atom::Client EXITS on error!
$atomfeed = $api->getFeed($self->{url});
} catch {
$self->log->error("error fetching $self->{url}: $@");
}
if ($atomfeed) {
my @items = $atomfeed->entries;
for my $item (@items) {
my $link = $self->atom_main_link($item);
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link);
if ($item_db) {
$self->log->info("item $item_db->{id} seen");
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
} else {
$self->log->info("new");
my $title = decode_utf8($item->title); # XXX - Workaround?
my $content = $item->content;
my $summary = $item->summary;
my $body = $content ? $content->body :
defined($summary) ? decode_utf8($summary) : undef;
$self->log->info("item " . $link . " new");
my $issued = str2time($item->updated) // $now;
if ($issued < $self->{last_update} || $issued > $now) {
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
# Choose a random time in the interval.
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
$self->log->info("fixing up issued time: $issued -> $guess_issued");
$issued = $guess_issued;
}
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
{},
$title, $link, $body, $issued, $now, $self->{id}
);
$self->invalidate_item_info;
}
}
}
} else {
$self->log->info("ignoring feed type $self->{type} of feed #$self->{id}: $self->{title}");
return;
}
$dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id});
}
sub atom_main_link {
my ($self, $item) = @_; # XXX should probably be a method of $item
for my $link ($item->link) {
# workaround for https://github.com/miyagawa/xml-atom/issues/16
my $href = $link->href // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'href');
my $rel = $link->rel // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'rel');
my $type = $link->type // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'type');
if (!$href) {
$self->log->warn("href undefined, skip");
next;
}
$self->log->info("checking href ", $href, "\n");
if (defined $rel && $rel ne 'alternate') {
$self->log->info("rel ", $rel, " unexpected, skip");
next;
}
if (defined $type && $type ne 'text/html') {
$self->log->info("type ", $type, " unexpected, skip\n");
next;
}
$self->log->info("match!\n");
return $href;
}
}
sub invalidate_item_info {
my ($self) = @_;
my $dbh = $self->dbh;
2017-10-02 15:26:48 +02:00
my $users = $dbh->selectcol_arrayref("select distinct username from read");
for my $user (@$users) {
$mcd->delete($self->item_info_key($user));
}
}
sub item_info_key {
my ($self, $user) = @_;
return "rss2html/item_info/" . $user . "/" . $self->id;
}
#---
__PACKAGE__->meta->make_immutable;
1;
# vim: tw=132