2017-06-12 21:29:41 +02:00
|
|
|
package Rss2Html::Feed;
|
|
|
|
|
|
|
|
=head1 Name

Rss2Html::Feed - a feed

=head1 Description

This class represents a single feed.

It currently has only one public method, C<update()>, which updates the
feed.

=cut
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use v5.14;
|
|
|
|
use Moose;
|
2021-05-12 23:12:00 +02:00
|
|
|
use Try::Tiny;
|
2017-06-12 21:29:41 +02:00
|
|
|
|
|
|
|
use Cache::Memcached;
|
|
|
|
use Encode qw(decode_utf8);
|
|
|
|
use HTTP::Date;
|
|
|
|
use XML::Atom::Client;
|
|
|
|
use XML::RAI;
|
|
|
|
|
|
|
|
# Logging role; the methods of this class log through $self->log.
with 'MooseX::Log::Log4perl';

# Plain read/write attributes. The comments describe only how this file
# uses each one; attributes marked "not used here" are presumably consumed
# by other parts of the application.
has id          => (is => 'rw');    # bound as feeds.id / items.feed_id, part of the cache key
has url         => (is => 'rw');    # address the feed is fetched from
has active      => (is => 'rw');    # not used here
has last_update => (is => 'rw');    # compared numerically with item issue times (epoch seconds)
has title       => (is => 'rw');    # compared with the title announced by the feed
has type        => (is => 'rw');    # 'rss' or 'atom'; update() ignores anything else
has allow_img   => (is => 'rw');    # not used here
has expire      => (is => 'rw');    # not used here

has dbh         => (is => 'rw');    # database handle used for all SQL in this class

# One memcached client shared by every object of this class (file-scoped
# lexical). NOTE(review): the server address is hard-coded.
my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']);
|
|
|
|
|
|
|
|
|
|
|
|
=head1 Methods
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
=head2 update
|
|
|
|
|
|
|
|
Update the feed. This is unconditional. The logic to decide which feeds
must be updated is in L<Rss2Html::FeedList/update_adaptive>.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
|
|
|
# Fetch the feed and synchronise its items with the database.
#
# Items already known (matched by link) get their "seen" timestamp bumped;
# unknown items are inserted and the cached per-user item info is
# invalidated. Finally feeds.last_update is set to the time of this run.
#
# Returns the result of the final "update feeds" statement. Returns nothing
# (and leaves last_update untouched) when the feed type is neither 'rss' nor
# 'atom', or when the HTTP request for an RSS feed fails.
sub update {
    my ($self) = @_;
    my $dbh = $self->dbh;

    # One timestamp for the whole run, so every row written agrees on "now".
    my $now = int(time());

    my $type = $self->{type} // '';
    if ($type eq 'rss') {
        $self->_update_rss($dbh, $now) or return;
    } elsif ($type eq 'atom') {
        $self->_update_atom($dbh, $now);
    } else {
        $self->log->info("ignoring feed type $type of feed #$self->{id}: $self->{title}");
        return;
    }

    return $dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id});
}

# Fetch and parse an RSS feed and store its items.
# Returns false only if the HTTP request failed; a parse failure is logged
# but still returns true, so the caller records the attempt as before.
sub _update_rss {
    my ($self, $dbh, $now) = @_;

    # XML::RAI has a parse_uri method, but we explicitly fetch the URI to get better diagnostics.
    # Probably should do this for atom, too, so we can autodetect the feed type.
    require LWP::UserAgent;    # this file has no "use LWP::UserAgent"; do not rely on another module loading it
    my $ua = LWP::UserAgent->new;
    $ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)");
    my $response = $ua->get($self->{url});
    if ($response->code != 200) {
        $self->log->warn("GET $self->{url} failed: " . $response->status_line);
        return;
    }
    $self->log->info("$self->{url} has content type " . $response->content_type);

    # eval clears $@ on entry, so the fallback text must be applied afterwards
    # (covers a parser that returns false without dying).
    local $@;
    my $rai = eval { XML::RAI->parse_string($response->content) };
    if (!$rai) {
        $self->log->error("error getting $self->{url}: " . ($@ || "unknown error"));
        return 1;
    }

    $rai->time_format('EPOCH');
    my $channel_title = $rai->channel->title;
    if (($channel_title // '') ne ($self->{title} // '')) {
        $self->log->info("set title");
        $dbh->do("update feeds set title=? where id=?", {}, $channel_title, $self->{id});
    }

    my $items = $rai->items;
    $self->log->info("checking " . scalar(@$items) . " items in db");
    for my $item (@$items) {
        my $link = $item->link;
        if (!defined $link || $link eq '') {
            # Without a link the item can never be recognised again and
            # would be inserted anew on every update.
            $self->log->warn("item without link in $self->{url}, skip");
            next;
        }
        next if $self->_mark_item_seen($dbh, $now, $link);

        $self->log->info("new");
        # Scalar assignments: each accessor must contribute exactly one bind value.
        my $title   = $item->title;
        my $content = $item->content;
        my $issued  = $item->issued;
        $self->log->info("item " . $link . " new");
        $self->_insert_item($dbh, $now, $title, $link, $content, $issued);
    }
    return 1;
}

# Fetch an Atom feed through XML::Atom::Client and store its entries.
sub _update_atom {
    my ($self, $dbh, $now) = @_;

    my $api = XML::Atom::Client->new();
    my $atomfeed;
    try {
        # This doesn't work. XML::Atom::Client EXITS on error!
        $atomfeed = $api->getFeed($self->{url});
    } catch {
        # Try::Tiny hands the exception to the catch block in $_, not $@.
        $self->log->error("error fetching $self->{url}: $_");
    };    # the semicolon is required: try/catch are ordinary function calls

    if (!$atomfeed) {
        # getFeed can also fail without throwing; report what the client recorded.
        my $reason = $api->errstr;
        $self->log->error("error fetching $self->{url}: $reason") if defined $reason;
        return;
    }

    for my $entry ($atomfeed->entries) {
        my $link = $self->atom_main_link($entry);
        if (!defined $link || $link eq '') {
            $self->log->warn("entry without usable link in $self->{url}, skip");
            next;
        }
        next if $self->_mark_item_seen($dbh, $now, $link);

        $self->log->info("new");
        my $title   = decode_utf8($entry->title); # XXX - Workaround?
        my $content = $entry->content;
        my $summary = $entry->summary;
        my $body    = $content          ? $content->body
                    : defined($summary) ? decode_utf8($summary)
                    :                     undef;
        $self->log->info("item " . $link . " new");
        my $issued = str2time($entry->updated);
        $self->_insert_item($dbh, $now, $title, $link, $body, $issued);
    }
    return;
}

# If an item with this link is already stored, set its "seen" time to $now
# and return true; otherwise return false.
sub _mark_item_seen {
    my ($self, $dbh, $now, $link) = @_;

    my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link)
        or return;
    $self->log->info("item $item_db->{id} seen");
    $dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
    return 1;
}

# Insert a new item for this feed (issue time sanitised first) and drop the
# cached per-user item info.
sub _insert_item {
    my ($self, $dbh, $now, $title, $link, $content, $issued) = @_;

    $issued = $self->_plausible_issued($issued // $now, $now);
    $dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
        {},
        $title, $link, $content, $issued, $now, $self->{id}
    );
    $self->invalidate_item_info;
    return;
}

# Return $issued unchanged if it lies between the previous update and $now;
# otherwise return a random time from that interval.
sub _plausible_issued {
    my ($self, $issued, $now) = @_;

    my $last_update = $self->{last_update} // 0;
    return $issued if $issued >= $last_update && $issued <= $now;

    # $issued is clearly wrong. Could be any time between $last_update and $now.
    # Choose a random time in the interval.
    my $guess_issued = $last_update + int(rand($now - $last_update));
    $self->log->info("fixing up issued time: $issued -> $guess_issued");
    return $guess_issued;
}
|
|
|
|
|
|
|
|
|
|
|
|
# Pick the main (HTML) link of an Atom entry.
#
# Walks the entry's links in order and returns the href of the first one
# that has an href, whose rel is absent or 'alternate', and whose type is
# absent or 'text/html'. Returns nothing (undef in scalar context) if no
# link qualifies.
sub atom_main_link {
    my ($self, $item) = @_; # XXX should probably be a method of $item

    for my $link ($item->link) {
        # workaround for https://github.com/miyagawa/xml-atom/issues/16
        # (fall back to the namespaced attribute when the accessor yields undef)
        my $href = $link->href // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'href');
        my $rel  = $link->rel  // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'rel');
        my $type = $link->type // $link->elem->getAttributeNS('http://www.w3.org/2005/Atom', 'type');

        if (!$href) {
            $self->log->warn("href undefined, skip");
            next;
        }
        $self->log->info("checking href ", $href, "\n");
        if (defined $rel && $rel ne 'alternate') {
            $self->log->info("rel ", $rel, " unexpected, skip");
            next;
        }
        if (defined $type && $type ne 'text/html') {
            $self->log->info("type ", $type, " unexpected, skip\n");
            next;
        }
        $self->log->info("match!\n");
        return $href;
    }

    # Nothing matched. Return explicitly: the value of a sub whose last
    # statement is a loop is unspecified.
    return;
}
|
|
|
|
|
|
|
|
# Drop the cached item info of this feed for every user.
#
# The user names come from the "read" table; for each of them the memcached
# entry named by item_info_key() is deleted.
sub invalidate_item_info {
    my ($self) = @_;

    my $usernames = $self->dbh->selectcol_arrayref("select distinct username from read");
    for my $username (@$usernames) {
        my $cache_key = $self->item_info_key($username);
        $mcd->delete($cache_key);
    }
}
|
|
|
|
|
|
|
|
# Build the memcached key under which the item info of this feed is stored
# for the given user: "rss2html/item_info/<user>/<feed id>".
sub item_info_key {
    my ($self, $user) = @_;

    return join('/', 'rss2html/item_info', $user, $self->id);
}
|
|
|
|
|
|
|
|
#---
|
|
|
|
# All attributes and methods are declared above; finalise the Moose metaclass.
__PACKAGE__->meta->make_immutable;
|
|
|
|
1;
|
|
|
|
|
|
|
|
# vim: tw=132
|