224 lines
6.4 KiB
Perl
224 lines
6.4 KiB
Perl
|
package Rss2Html::Feed;
|
||
|
|
||
|
=head1 Name
|
||
|
|
||
|
Rss2Html::Feed - a feed
|
||
|
|
||
|
=head1 Description
|
||
|
|
||
|
This class represents a single feed.
|
||
|
|
||
|
It currently has only one public method, update(), which updates the
|
||
|
feed.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use v5.14;
|
||
|
use Moose;
|
||
|
|
||
|
use Cache::Memcached;
|
||
|
use Encode qw(decode_utf8);
|
||
|
use HTTP::Date;
|
||
|
use XML::Atom::Client;
|
||
|
use XML::RAI;
|
||
|
|
||
|
with 'MooseX::Log::Log4perl';
|
||
|
|
||
|
has 'id' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'url' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'update_frequency' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'last_update' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'title' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'type' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'allow_img' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
has 'expire' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
|
||
|
has 'dbh' => (
|
||
|
is => 'rw',
|
||
|
);
|
||
|
|
||
|
my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']);
|
||
|
|
||
|
|
||
|
=head1 Methods
|
||
|
|
||
|
=cut
|
||
|
|
||
|
=head2 update
|
||
|
|
||
|
Update the feed. This is unconditional. The logic to decide which feeds
|
||
|
must be updated is in L<Rss2Html::FeedList::update_adaptive>.
|
||
|
|
||
|
=cut
|
||
|
|
||
|
sub update {
|
||
|
my ($self) = @_;
|
||
|
my $dbh = $self->dbh;
|
||
|
|
||
|
my $now = int(time());
|
||
|
|
||
|
if ($self->{type} eq 'rss') {
|
||
|
local $@ = "unknown error";
|
||
|
|
||
|
# XML::RAI has a parse_uri method, but we explicitely fetch the URI to get better diagnostics.
|
||
|
# Probably should do this for atom, too, so we can autodetect the feed type.
|
||
|
my $ua = LWP::UserAgent->new;
|
||
|
$ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)");
|
||
|
my $response = $ua->get($self->{url});
|
||
|
if ($response->code != 200) {
|
||
|
$self->log->warn("GET $self->{url} failed: " . $response->status_line);
|
||
|
return;
|
||
|
}
|
||
|
$self->log->info("$self->{url} has content type " . $response->content_type);
|
||
|
|
||
|
my $rai = eval {XML::RAI->parse_string($response->content) };
|
||
|
if ($rai) {
|
||
|
$rai->time_format('EPOCH');
|
||
|
if ($rai->channel->title ne $self->{title}) {
|
||
|
$self->log->info("set title");
|
||
|
$dbh->do("update feeds set title=? where id=?", {},
|
||
|
$rai->channel->title, $self->{id});
|
||
|
}
|
||
|
$self->log->info("checking " . scalar(@{$rai->items}) . " items in db");
|
||
|
for my $item ( @{$rai->items} ) {
|
||
|
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $item->link);
|
||
|
if ($item_db) {
|
||
|
$self->log->info("item $item_db->{id} seen");
|
||
|
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
|
||
|
} else {
|
||
|
$self->log->info("new");
|
||
|
|
||
|
my $issued = $item->issued // $now;
|
||
|
if ($issued < $self->{last_update} || $issued > $now) {
|
||
|
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
|
||
|
# Choose a random time in the interval.
|
||
|
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
|
||
|
$self->log->info("fixing up issued time: $issued -> $guess_issued");
|
||
|
$issued = $guess_issued;
|
||
|
}
|
||
|
|
||
|
$self->log->info("item " . $item->link . " new");
|
||
|
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
|
||
|
{},
|
||
|
$item->title, $item->link, $item->content, $issued, $now, $self->{id}
|
||
|
);
|
||
|
$self->invalidate_item_info;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
print STDERR "error getting $self->{url}: $@\n";
|
||
|
}
|
||
|
} elsif ($self->{type} eq 'atom') {
|
||
|
my $api = XML::Atom::Client->new();
|
||
|
my $atomfeed = $api->getFeed($self->{url});
|
||
|
if ($atomfeed) {
|
||
|
my @items = $atomfeed->entries;
|
||
|
for my $item (@items) {
|
||
|
my $link = $self->atom_main_link($item);
|
||
|
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link);
|
||
|
if ($item_db) {
|
||
|
$self->log->info("item $item_db->{id} seen");
|
||
|
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
|
||
|
} else {
|
||
|
$self->log->info("new");
|
||
|
my $title = decode_utf8($item->title); # XXX - Workaround?
|
||
|
my $content = $item->content;
|
||
|
my $summary = $item->summary;
|
||
|
my $body = $content ? $content->body :
|
||
|
defined($summary) ? decode_utf8($summary) : undef;
|
||
|
|
||
|
$self->log->info("item " . $link . " new");
|
||
|
|
||
|
my $issued = str2time($item->updated) // $now;
|
||
|
if ($issued < $self->{last_update} || $issued > $now) {
|
||
|
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
|
||
|
# Choose a random time in the interval.
|
||
|
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
|
||
|
$self->log->info("fixing up issued time: $issued -> $guess_issued");
|
||
|
$issued = $guess_issued;
|
||
|
}
|
||
|
|
||
|
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
|
||
|
{},
|
||
|
$title, $link, $body, $issued, $now, $self->{id}
|
||
|
);
|
||
|
$self->invalidate_item_info;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
print "<p>unknown type $self->{type}</p>\n";
|
||
|
$dbh->do("update feeds set title=?, last_update=? where id=?", {},
|
||
|
'unknown', $now, $self->{id});
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
# mark all but the last 100 entries as old, so they aren't shown by default.
|
||
|
$dbh->do("update items set old=1
|
||
|
where id in (select id from items
|
||
|
where feed_id=? and (old is null or old != 1)
|
||
|
order by issued desc limit -1 offset 100)",
|
||
|
{},
|
||
|
$self->{id});
|
||
|
|
||
|
$dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id});
|
||
|
}
|
||
|
|
||
|
|
||
|
sub atom_main_link {
|
||
|
my ($self, $item) = @_; # XXX should probably be a method of $item
|
||
|
for my $link ($item->link) {
|
||
|
$self->log->info("checking href ", $link->href, "\n");
|
||
|
if (defined $link->rel && $link->rel ne 'alternate') {
|
||
|
$self->log->info("rel ", $link->rel, " unexpected, skip");
|
||
|
next;
|
||
|
}
|
||
|
if (defined $link->type && $link->type ne 'text/html') {
|
||
|
$self->log->info("type ", $link->type, " unexpected, skip\n");
|
||
|
next;
|
||
|
}
|
||
|
$self->log->info("match!\n");
|
||
|
return $link->href;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
sub invalidate_item_info {
|
||
|
my ($self) = @_;
|
||
|
|
||
|
my $dbh = $self->dbh;
|
||
|
my $users = $dbh->selectcol_arrayref("select distinct user from read");
|
||
|
for my $user (@$users) {
|
||
|
$mcd->delete($self->item_info_key($user));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub item_info_key {
|
||
|
my ($self, $user) = @_;
|
||
|
return "rss2html/item_info/" . $user . "/" . $self->id;
|
||
|
}
|
||
|
|
||
|
#---
|
||
|
__PACKAGE__->meta->make_immutable;
|
||
|
1;
|
||
|
|
||
|
# vim: tw=132
|