Copy from web site

Start a git repo independent of my web site.
This commit is contained in:
Peter J. Holzer 2017-06-12 21:29:41 +02:00
commit ff784ba351
16 changed files with 1300 additions and 0 deletions

10
.htaccess Normal file
View File

@ -0,0 +1,10 @@
AuthName hjp-work-in-progress
AuthType Basic
AuthUserFile /usr/local/www/offline/hjp/passwd
AuthGroupFile /usr/local/www/offline/hjp/group
<Limit GET>
require valid-user
</Limit>
<Limit POST>
require valid-user
</Limit>

0
Notes Normal file
View File

223
Rss2Html/Feed.pm Normal file
View File

@ -0,0 +1,223 @@
package Rss2Html::Feed;
=head1 Name
Rss2Html::Feed - a feed
=head1 Description
This class represents a single feed.
It currently has only one public method, update(), which updates the
feed.
=cut
use strict;
use warnings;
use v5.14;
use Moose;
use Cache::Memcached;
use Encode qw(decode_utf8);
use HTTP::Date;
use XML::Atom::Client;
use XML::RAI;
with 'MooseX::Log::Log4perl';
has 'id' => (
is => 'rw',
);
has 'url' => (
is => 'rw',
);
has 'update_frequency' => (
is => 'rw',
);
has 'last_update' => (
is => 'rw',
);
has 'title' => (
is => 'rw',
);
has 'type' => (
is => 'rw',
);
has 'allow_img' => (
is => 'rw',
);
has 'expire' => (
is => 'rw',
);
has 'dbh' => (
is => 'rw',
);
my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']);
=head1 Methods
=cut
=head2 update
Update the feed. This is unconditional. The logic to decide which feeds
must be updated is in L<Rss2Html::FeedList::update_adaptive>.
=cut
sub update {
my ($self) = @_;
my $dbh = $self->dbh;
my $now = int(time());
if ($self->{type} eq 'rss') {
local $@ = "unknown error";
# XML::RAI has a parse_uri method, but we explicitely fetch the URI to get better diagnostics.
# Probably should do this for atom, too, so we can autodetect the feed type.
my $ua = LWP::UserAgent->new;
$ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)");
my $response = $ua->get($self->{url});
if ($response->code != 200) {
$self->log->warn("GET $self->{url} failed: " . $response->status_line);
return;
}
$self->log->info("$self->{url} has content type " . $response->content_type);
my $rai = eval {XML::RAI->parse_string($response->content) };
if ($rai) {
$rai->time_format('EPOCH');
if ($rai->channel->title ne $self->{title}) {
$self->log->info("set title");
$dbh->do("update feeds set title=? where id=?", {},
$rai->channel->title, $self->{id});
}
$self->log->info("checking " . scalar(@{$rai->items}) . " items in db");
for my $item ( @{$rai->items} ) {
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $item->link);
if ($item_db) {
$self->log->info("item $item_db->{id} seen");
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
} else {
$self->log->info("new");
my $issued = $item->issued // $now;
if ($issued < $self->{last_update} || $issued > $now) {
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
# Choose a random time in the interval.
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
$self->log->info("fixing up issued time: $issued -> $guess_issued");
$issued = $guess_issued;
}
$self->log->info("item " . $item->link . " new");
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
{},
$item->title, $item->link, $item->content, $issued, $now, $self->{id}
);
$self->invalidate_item_info;
}
}
} else {
print STDERR "error getting $self->{url}: $@\n";
}
} elsif ($self->{type} eq 'atom') {
my $api = XML::Atom::Client->new();
my $atomfeed = $api->getFeed($self->{url});
if ($atomfeed) {
my @items = $atomfeed->entries;
for my $item (@items) {
my $link = $self->atom_main_link($item);
my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link);
if ($item_db) {
$self->log->info("item $item_db->{id} seen");
$dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id});
} else {
$self->log->info("new");
my $title = decode_utf8($item->title); # XXX - Workaround?
my $content = $item->content;
my $summary = $item->summary;
my $body = $content ? $content->body :
defined($summary) ? decode_utf8($summary) : undef;
$self->log->info("item " . $link . " new");
my $issued = str2time($item->updated) // $now;
if ($issued < $self->{last_update} || $issued > $now) {
# $issued is clearly wrong. Could be any time between $self->{last_update} and $now.
# Choose a random time in the interval.
my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update}));
$self->log->info("fixing up issued time: $issued -> $guess_issued");
$issued = $guess_issued;
}
$dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)",
{},
$title, $link, $body, $issued, $now, $self->{id}
);
$self->invalidate_item_info;
}
}
}
} else {
print "<p>unknown type $self->{type}</p>\n";
$dbh->do("update feeds set title=?, last_update=? where id=?", {},
'unknown', $now, $self->{id});
return;
}
# mark all but the last 100 entries as old, so they aren't shown by default.
$dbh->do("update items set old=1
where id in (select id from items
where feed_id=? and (old is null or old != 1)
order by issued desc limit -1 offset 100)",
{},
$self->{id});
$dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id});
}
sub atom_main_link {
my ($self, $item) = @_; # XXX should probably be a method of $item
for my $link ($item->link) {
$self->log->info("checking href ", $link->href, "\n");
if (defined $link->rel && $link->rel ne 'alternate') {
$self->log->info("rel ", $link->rel, " unexpected, skip");
next;
}
if (defined $link->type && $link->type ne 'text/html') {
$self->log->info("type ", $link->type, " unexpected, skip\n");
next;
}
$self->log->info("match!\n");
return $link->href;
}
}
sub invalidate_item_info {
my ($self) = @_;
my $dbh = $self->dbh;
my $users = $dbh->selectcol_arrayref("select distinct user from read");
for my $user (@$users) {
$mcd->delete($self->item_info_key($user));
}
}
sub item_info_key {
my ($self, $user) = @_;
return "rss2html/item_info/" . $user . "/" . $self->id;
}
#---
__PACKAGE__->meta->make_immutable;
1;
# vim: tw=132

131
Rss2Html/FeedList.pm Normal file
View File

@ -0,0 +1,131 @@
package Rss2Html::FeedList;
use warnings;
use strict;
use Moose;
use namespace::autoclean;
use DBI;
use Rss2Html::Feed;
use POSIX qw(strftime);
with 'MooseX::Log::Log4perl';
has 'dbh' => (
is => 'ro',
writer => '_set_dbh',
);
sub BUILD {
my ($self, $args) = @_;
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "",
{
sqlite_use_immediate_transaction => 1,
});
$dbh->{sqlite_unicode} = 1;
$self->_set_dbh($dbh);
}
sub feeds {
my ($self) = @_;
my $dbh = $self->dbh;
no autovivification qw(fetch);
my $feeds_sql = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} });
my @feeds;
for (@$feeds_sql) {
push @feeds, Rss2Html::Feed->new(%$_, dbh => $dbh);
}
return \@feeds;
}
sub update_adaptive {
my ($self) = @_;
my $dbh = $self->dbh;
my $start = time();
no autovivification qw(fetch);
$dbh->begin_work;
my $feeds = $self->feeds;
my $seconds_per_week = 86400 * 7;
for my $feed (@$feeds) {
$feed->{title} //= "-";
#print "<p>feed: $feed->{id} $feed->{title}<br/>\n";
$self->log->info("feed: $feed->{id} $feed->{title}");
my $now = time();
my $run_time = $now - $start;
my $update_time = $now - $feed->{last_update};
$self->log->info(strftime("%Y-%m-%d %a %H:%M:%S", localtime($feed->{last_update})), " .. ",
strftime("%Y-%m-%d %a %H:%M:%S", localtime($now)), ": $update_time seconds");
#print "\t", "update_time: $update_time<br/>\n";
# We are guaranteed to have all items which were issued in the last
# int($feed->{expire} / 7) weeks since we expire only those not seen for
# more than $feed->{expire} days. Some feeds keep some items for a long
# time while removing others quite quickly, so statistics for older
# items may be severely skewed and are not safe to use.
my $interval = int($feed->{expire} / 7) * 7 * 86400;
# We measure those weeks backwards from the last update.
# If we measured from now we would almost always miss the update from
# exactly 4 weeks ago (one cycle we are too early, the next we are too
# late).
my $start = $feed->{last_update} - $interval;
my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued",
{ Slice => {} },
$feed->{id}, $start,
);
my $first_issued = $items->[0]{issued} // $start;
my $total_time = $now - $first_issued;
my $updates_in_interval = 0;
for my $item (@$items) {
my $t = $item->{issued};
my $dt1 = $now - $t;
my $dt2 = $dt1 % $seconds_per_week;
if ($dt2 < $update_time) {
$updates_in_interval++;
$self->log->info("found $item->{id} ", strftime("%Y-%m-%d %a %H:%M:%S", localtime($t)));
}
}
$self->log->info("updates_in_interval: $updates_in_interval");
$updates_in_interval *= $seconds_per_week / $total_time; # normalize to one week
$self->log->info("updates_in_interval: $updates_in_interval");
$updates_in_interval += $update_time / $seconds_per_week; # add one update per week
$self->log->info("updates_in_interval: $updates_in_interval");
#print "\t", "updates_in_interval: $updates_in_interval<br/>\n";
if ($updates_in_interval > 0.5) {
$self->log->info($feed->{url});
$self->log->info("need to update " . $feed->{url});
$feed->update;
}
}
$self->cleanup();
$dbh->commit;
$self->log->info("update done");
}
sub cleanup {
my ($self) = @_;
my $dbh = $self->dbh;
my $now = time();
my $feeds = $dbh->selectall_arrayref("select * from feeds", { Slice => {} });
for my $feed (@$feeds) {
my $expire_days = $feed->{expire} // 30;
my $seen_limit = $now - $expire_days * 86400;
my $rc = $dbh->do("delete from items where feed_id=? and seen < ?", {}, $feed->{id}, $seen_limit);
$self->log->info("cleanup of feed $feed->{id} (limit $seen_limit) returned $rc");
}
$dbh->do("delete from read where not exists (select id from items where id=item_id)");
}
#---
__PACKAGE__->meta->make_immutable;
1;

31
Rss2Html/Scrubber.pm Normal file
View File

@ -0,0 +1,31 @@
package Rss2Html::Scrubber;
use warnings;
use strict;
use parent 'HTML::Scrubber';
sub new {
my ($class, %options) = @_;
my $self = HTML::Scrubber->new();
$self->deny(qw(link));
$self->allow(qw(p a em strong b i ul ol li dl dt dd br));
my %rules = (
a => {
href => qr{^https?://}i,
'*' => 0,
},
);
if ($options{allow_img}) {
$rules{img} = {
src => qr{^https?://}i,
alt => 1,
'*' => 0,
},
}
$self->rules(
%rules
);
bless $self, $class;
return $self;
}
1;

3
do_cron Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
cd $(dirname $0)
./update3

66
feed.cgi Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/perl
use strict;
use warnings;
use v5.14;
use XML::RAI;
use XML::Atom::Client;
use DBI;
use CGI;
use Rss2Html::Scrubber;
use Time::HiRes qw(time);
use POSIX qw(strftime);
$| = 1;
my $q = CGI->new();
binmode STDOUT, ":encoding(UTF-8)";
my $feed_id = $q->param('id');
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
print "Content-Type: text/html; charset=utf-8\n";
print "Refresh: 600\n";
print "\n";
print "<link rel='stylesheet' type='text/css' href='rss2html.css'/>\n";
print "<h1>RSS 2 HTML</h1>\n";
print "<p>", strftime("%Y-%m-%d %H:%M:%S%z", localtime()), "</p>\n";
my $feed = $dbh->selectrow_hashref("select * from feeds where id=?", {}, $feed_id);
print "<span class='feed_info'>\n";
print "<span class='feed_id'>", $q->escapeHTML($feed->{id}), "</span>\n";
print "<span class='feed_title'>", $q->escapeHTML($feed->{title}), "</span>\n";
print "<span class='feed_type'>", $q->escapeHTML($feed->{type}), "</span>\n";
print "</span>\n";
my $now = time();
my $seconds_per_week = 86400 * 7;
my $start = $now - 4 * $seconds_per_week;
my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued",
{ Slice => {} },
$feed->{id}, $start,
);
for my $item (@$items) {
my $t = $item->{issued};
my $dt1 = $now - $t;
my $dt2 = $dt1 % $seconds_per_week;
$item->{dt2} = $dt2;
}
$items = [ sort { $a->{dt2} <=> $b->{dt2} } @$items ];
print "<table>\n";
for my $item (@$items) {
my $t = $now - $item->{dt2};
print"<tr>\n";
print"<td>", $item->{dt2}, "</td>\n";
print"<td>", strftime("%a %H:%M:%S", localtime($t)), "</td>\n";
print"<td>", "<a href='", $q->escapeHTML($item->{link}), "'>", $q->escapeHTML($item->{title}), "</a>", "</td>\n";
print"</tr>\n";
}
print "</table>\n";
# vim: expandtab

426
index.cgi Executable file
View File

@ -0,0 +1,426 @@
#!/usr/bin/perl
# Parameters:
#
# mark=id
# mark item id as read
#
# redir=id
# redirect to url of item id and mark item as read
#
# fls=(all|unread)
# "feed list show": Show all feeds or only feeds with unread items in feed list
#
# fis= comma separated list of feed ids
# show only listed feeds
# BUG: fis is only effective if sr is non-zero!
#
# sr=n
# show last n read items. Note: Currently these are the n read items with the
# highest ids, which may not be the last items which have been read. Need to
# store the read date for that.
#
# so=1
# also show old items.
#
# sl=1
# show items marked as "later".
use strict;
use warnings;
use v5.14;
use utf8;
use experimental 'smartmatch'; # for given/when
use experimental 'autoderef';
use CGI;
use Cache::Memcached;
use DBI;
use Data::Dumper;
use Encode qw(encode_utf8 decode_utf8);
use POSIX qw(strftime);
use Rss2Html::Scrubber;
use Time::HiRes qw(time);
$| = 1;
my $start = time();
my $q = CGI->new();
binmode STDOUT, ":encoding(UTF-8)";
my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']);
if ($q->param('mark')) {
mark();
} elsif ($q->param('redir')) {
redirect();
} else {
list();
}
sub redirect {
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "",
{
sqlite_use_immediate_transaction => 1,
});
$dbh->{sqlite_unicode} = 1;
my $item_id = $q->param('redir');
my $item = $dbh->selectrow_hashref("select * from items where id=?", {}, $item_id);
if ($item) {
print "Status: 302\n";
print "Location: $item->{link}\n";
print "\n";
$dbh->do("insert into read(user, item_id) values(?, ?)", {}, $q->remote_user, $item_id);
$mcd->delete(item_info_key($q->remote_user, $item->{feed_id}));
exit(0);
}
print "Status: 404\n";
print "Content-Type: text/html; charset=utf-8\n";
print "\n";
print "not found\n";
}
sub mark {
print_log("mark start");
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
my $item_id = $q->param('mark');
my $item = $dbh->selectrow_hashref("select * from items where id=?", {}, $item_id);
if ($item) {
my $q1 = CGI->new($q);
$q1->delete('mark');
print "Status: 302\n";
print "Location: ", $q1->self_url, "\n";
print "\n";
$dbh->do("insert into read(user, item_id) values(?, ?)", {}, $q->remote_user, $item_id);
$mcd->delete(item_info_key($q->remote_user, $item->{feed_id}));
print_log("mark done");
exit(0);
}
print "Status: 404\n";
print "Content-Type: text/html; charset=utf-8\n";
print "\n";
print "not found\n";
print_log("mark failed");
}
sub list {
my $feed_list_show = $q->param('fls') // "all";
my $feed_item_show = $q->param('fis') // '';
my $show_read = $q->param('sr');
my $show_old = $q->param('so');
my $show_later = $q->param('sl');
my %feed_item_show = map(($_+0 => 1), split(/,/, $feed_item_show));
print "Content-Type: text/html; charset=utf-8\n";
print "Refresh: 600\n";
print "\n";
print "<meta name='viewport' content='width=device-width, initial-scale=1' />\n";
print "<link rel='stylesheet' type='text/css' href='rss2html.css'/>\n";
print "<h1>RSS 2 HTML</h1>\n";
print "<div class='lastupdate'>", strftime("%Y-%m-%d %H:%M:%S%z", localtime()), "</div>\n";
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
print_log("list_feeds start");
list_feeds($dbh, $feed_list_show, $show_old);
print_log("list_feeds done");
my $items;
if ($show_read) {
my $fields = "feeds.id as feed_id, feeds.title as feed_title, allow_img, link, items.title as item_title, content, items.id as item_id, issued, user, lang";
my $tables = "items
join feeds on items.feed_id=feeds.id
left outer join read on (items.id=read.item_id and user=?)";
my @where;
if (%feed_item_show) {
push @where, "feeds.id in (" . join(',', keys %feed_item_show) . ")";
}
if (!$show_old) {
push @where, "(items.old is null or items.old == 0)"
}
my $where = "where " . join(" and ", @where);
my $cmd = "select $fields from $tables $where order by issued";
print_log("$cmd");
my $items1 = $dbh->selectall_arrayref($cmd, { Slice => {} }, $q->remote_user);
my $nrd = 0;
while (@$items1) {
my $item = pop @$items1;
{
no warnings 'uninitialized';
print_log("id=$item->{item_id}, user=$item->{user}");
}
if (!$item->{user}) {
print_log("id=$item->{item_id}: unread");
unshift @$items, $item;
} elsif ($nrd < $show_read) {
print_log("id=$item->{item_id}: $nrd < $show_read");
unshift @$items, $item;
$nrd++;
}
}
} else {
my @where = ("user is null");
if (!$show_old) {
push @where, "(items.old is null or items.old == 0)"
}
my $where = "where " . join(" and ", @where);
$items = $dbh->selectall_arrayref(
"select feeds.id as feed_id, feeds.title as feed_title, allow_img, link, items.title as item_title, content, items.id as item_id, issued, lang
from items
join feeds on items.feed_id=feeds.id
left outer join read on (items.id=read.item_id and user=?)
$where
order by issued",
{ Slice => {} }, $q->remote_user);
}
my $feeds = $dbh->selectall_arrayref(
"select id, title from feeds where update_frequency is not null order by id",
{ Slice => {} }
);
for my $f (@$feeds) {
$f->{selected} = 1 if $feed_item_show{$f->{id}};
}
print_itemlist($items, $feeds);
print_log("list done");
}
sub print_itemlist {
my ($items, $feeds) = @_;
print "<div class='itemlist'>\n";
print "<form action='./'>\n";
print "<select name='fis'>\n";
print "<option value=''>\n";
for my $f (@$feeds) {
print "<option value='" . $q->escapeHTML($f->{id}) . "' " . ($f->{selected} ? "selected='selected'" : "") .">" . $q->escapeHTML($f->{title}) . "</option>\n";
}
print "</select>\n";
my $fls = $q->param('fls');
print "<input type='hidden' name='fls' value='" . $q->escapeHTML($fls) . "'>\n" if defined $fls;
print "<input type='hidden' name='sr' value='10'>\n";
print "<input type='submit' value='»'>\n";
print "</form>\n";
my $n_items = 0;
my $n_html_mcd = 0;
my $n_scrub_mcd = 0;
my $n_scrub = 0;
print_log("print_itemlist: \$q=" . $q->self_url);
my $q1 = CGI->new($q);
print_log("print_itemlist: \$q1=" . $q1->self_url . " (before loop)");
print_log(scalar @$items . " before remix");
$items = remix($items);
print_log(scalar @$items . " after remix");
for my $item (@$items) {
$n_items++;
my $is_read = defined($item->{user});
my $scrubbed_content = $mcd->get(scrubbed_content_key($item->{item_id}));
if ($scrubbed_content) {
$scrubbed_content = decode_utf8($scrubbed_content);
$n_scrub_mcd++;
} else {
my $scrubber = Rss2Html::Scrubber->new(allow_img => $item->{allow_img});
$scrubbed_content = $scrubber->scrub($item->{content});
$mcd->set(scrubbed_content_key($item->{item_id}), encode_utf8($scrubbed_content), 3600);
$n_scrub++;
}
$q1->param('mark', $item->{item_id});
my $item_class = 'item' . ($is_read ? ' read' : '');
my $langattr = defined $item->{lang} ? "lang='$item->{lang}'" : "";
my $html = "";
$html .= "<div class='$item_class' $langattr>\n";
$html .= "<span class='itemno'>" . $item->{item_id} . "</span>\n";
$html .= "<span class='issued'>" . strftime('%Y-%m-%d %H:%M:%S', localtime($item->{issued})) . "</span>\n";
unless ($is_read) {
print_log("print_itemlist: \$q1=" . $q1->self_url . " (in loop)");
$html .= "<div class='op'><a href='" . $q->escapeHTML($q1->self_url) . "'>Mark read</a></div>\n";
}
$html .= "<div class='feed'>" . $q->escapeHTML($item->{feed_title}) . "</div>\n";
$html .= "<h2><a href='./?redir=" . $q->escapeHTML($item->{item_id}) . "'>" . $q->escapeHTML($item->{item_title}) . "</a></h2>\n";
$html .= "<div class='content'>" . $scrubbed_content . "</div>\n";
unless ($is_read) {
$html .= "<div class='op'><a href='" . $q->escapeHTML($q1->self_url) . "'>Mark read</a></div>\n";
}
$html .= "<div class='end'></div>\n";
$html .= "</div\n>";
print $html;
}
print "</div>\n";
print_log("itemlist: $n_items items ($n_html_mcd html cached, $n_scrub_mcd scrubbed content cached, $n_scrub scrubbed)");
}
sub remix {
my ($items) = @_;
my $queues = {};
for my $item (@$items) {
if (!defined $item->{feed_id}) {
local $Data::Dumper::Indent = 0;
print_log("item w/o feed_id: " . Dumper($item));
}
$queues->{$item->{feed_id}} //= [];
push $queues->{$item->{feed_id}}, $item;
}
my @ordered_feeds = sort { $queues->{$a}[0]{issued} <=> $queues->{$b}[0]{issued} } keys $queues;
my $new_items = [];
my $found;
do {
$found = 0;
for my $feed (@ordered_feeds) {
my $item = shift $queues->{$feed};
if ($item) {
push $new_items, $item;
$found++;
print_log("found an item in feed $feed. " . scalar(@{$queues->{$feed}}) . " items left")
}
}
} while ($found);
return $new_items;
}
sub list_feeds {
my ($dbh, $feed_list_show, $show_old) = @_;
no autovivification qw(fetch);
my $q1 = CGI->new($q);
my $other = $feed_list_show eq 'all' ? 'unread' : 'all';
$q1->param('fls', $other);
print "<div><a href='", $q->escapeHTML($q1->self_url), "'>$other</a></div>\n";
print "<div class='feedlist'>\n";
$dbh->begin_work;
my $feeds = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} });
my $seconds_per_week = 86400 * 7;
for my $feed (@$feeds) {
$feed->{title} //= "-";
print_item_info($dbh, $feed, $feed_list_show, $show_old);
}
$dbh->commit;
print "</div>\n";
}
sub print_item_info {
my ($dbh, $feed, $feed_list_show, $show_old) = @_;
my $item_info_key = item_info_key($q->remote_user, $feed->{id});
my $item_info = $mcd->get($item_info_key);
my $cmd
= "select count(i.id) as nr_items, max(i.issued) as last_issued, max(i.seen) as last_seen, count(r.item_id) as nr_read
from items i left outer join (select item_id from read where user=?) r on i.id=r.item_id
where i.feed_id=?";
unless ($show_old) {
$cmd .= " and (old is null or old = 0)";
}
unless ($item_info) {
$item_info
= $dbh->selectrow_hashref(
$cmd,
{},
$q->remote_user, $feed->{id}
);
$mcd->set($item_info_key, $item_info, 3600);
}
my $now = time();
my $unread = $item_info->{nr_items} - $item_info->{nr_read};
print_log("print_item_info: feed $feed->{id}, nr_items=$item_info->{nr_items}, read=$item_info->{nr_read}, unread=$unread");
return unless ($unread || $feed_list_show eq 'all');
my $q1 = CGI->new($q);
$q1->param('fis', $feed->{id});
$q1->param('sr', 10);
print "<span class='feed_info'>\n";
print "<span class='feed_id'>", $q->escapeHTML($feed->{id}), "</span>\n";
print "<span class='feed_title'>",
"<a href='", $q1->escapeHTML($q1->self_url), "'>",
$q->escapeHTML($feed->{title}),
"</a>",
"</span>\n";
print "<span class='feed_items'>", $item_info->{nr_items}, "</span>\n";
print "<span class='feed_unread'>", $unread, "</span>\n";
print "<span class='feed_lastissued'>", delta_t($item_info->{last_issued}, $now), "</span>\n";
print "<span class='feed_lastseen'>", delta_t($item_info->{last_seen}, $now), "</span>\n";
print "</span>\n";
}
sub print_log {
my $msg = "@_";
my $now = time();
printf STDERR "%s: %s.%06d %f: %s\n", $0, strftime("%H:%M:%S", localtime($now)), ($now - int($now)) * 1E6, $now - $start, $msg;
}
sub delta_t {
my ($t1, $t2) = @_;
return "N/A" unless defined $t1 && defined $t2;
my $dt = $t2 - $t1;
my $s = "";
if ($dt < 0) {
$s .= "-";
$dt = -$dt;
}
given ($dt) {
when ($_ >= 86400 * 10) {
$s .= sprintf("%.0fd", $_ / 86400);
}
when ($_ >= 86400) {
$s .= sprintf("%.1fd", $_ / 86400);
}
when ($_ >= 3600 * 10) {
$s .= sprintf("%.0fh", $_ / 3600);
}
when ($_ >= 3600) {
$s .= sprintf("%.1fh", $_ / 3600);
}
when ($_ >= 60 * 10) {
$s .= sprintf("%.0fm", $_ / 60);
}
when ($_ >= 60) {
$s .= sprintf("%.1fm", $_ / 60);
}
when ($_ >= 10) {
$s .= sprintf("%.0fs", $_);
}
default {
$s .= sprintf("%.1fs", $_);
}
}
}
sub item_info_key {
my ($user, $feed_id) = @_;
return "rss2html/item_info/$user/$feed_id";
}
sub scrubbed_content_key {
my ($item_id) = @_;
return "rss2html/scrubbed_content/$item_id";
}
sub item_html_key {
my ($item_id, $is_read) = @_;
return "rss2html/item_html/$item_id/$is_read";
}
# vim: tw=132 expandtab

12
log.conf Normal file
View File

@ -0,0 +1,12 @@
log4perl.logger = DEBUG, RotFileApp
log4perl.appender.FileApp = Log::Log4perl::Appender::File
log4perl.appender.FileApp.filename = test.log
log4perl.appender.FileApp.layout = PatternLayout
log4perl.appender.FileApp.layout.ConversionPattern = %d [%p] %P %c %m%n
log4perl.appender.RotFileApp = Log::Dispatch::FileRotate
log4perl.appender.RotFileApp.filename = test.log
log4perl.appender.RotFileApp.layout = PatternLayout
log4perl.appender.RotFileApp.layout.ConversionPattern = %d [%p] %P %c %m%n
log4perl.appender.RotFileApp.max = 7
log4perl.appender.RotFileApp.mode = append
log4perl.appender.RotFileApp.binmode = :encoding(UTF-8)

55
plot_intervals Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/perl
use warnings;
use strict;
use TimeSeries;
my $wanted_feed_id = shift;
my $current_feed_id = -1;
my @updates_in_interval;
my $ts = TimeSeries->new(style => 'impulses');
$ts->legend('old', 'new');
$ts->finalresolution(100);
my @files = glob("test.log*");
@files = sort { -M $b <=> -M $a } @files;
my $current;
my @updates;
for my $file (@files) {
open my $fh, '<', $file;
while (<$fh>) {
if (/ Rss2Html.FeedList feed: (\d+) .*/) {
$current_feed_id = $1;
}
next unless $current_feed_id == $wanted_feed_id;
if (/Rss2Html.FeedList [ ]
\d\d\d\d-\d\d-\d\d [ ] \w\w\w [ ] \d\d:\d\d:\d\d
[ ] \.\. [ ]
(\d\d\d\d-\d\d-\d\d) [ ] \w\w\w [ ] (\d\d:\d\d:\d\d)
: [ ]
(\d+) [ ] seconds
/x
) {
$current = { time => "$1T$2", interval => $3 };
}
if (/Rss2Html.FeedList need to update/) {
push @updates, $current;
}
if (/Rss2Html.Feed item .* new/) {
$current->{new}++;
}
}
}
for (@updates) {
$ts->add_timestring($_->{time},
$_->{new} ? (0, $_->{interval})
: ($_->{interval}, 0 )
);
}
print $ts->plot;

47
plot_stats Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/perl
use warnings;
use strict;
use TimeSeries;
my $wanted_feed_id = shift;
my $current_feed_id = -1;
my $current_time;
my @updates_in_interval;
my $update_count;
my $ts = TimeSeries->new(style => 'points');
$ts->legend('raw', 'adjusted');
$ts->finalresolution(100);
my @files = glob("/var/log/roxen/hjp/cgi.????-??-??");
@files = @files[-7..-1];
for my $file (@files) {
open my $fh, '<', $file;
while (<$fh>) {
if (/: \d+\.\d+: feed: (\d+) .*/) {
$current_feed_id = $1;
$update_count = 0;
}
next unless $current_feed_id == $wanted_feed_id;
if (/: [ ]
\d\d\d\d-\d\d-\d\d [ ] \w\w\w [ ] \d\d:\d\d:\d\d
[ ][ ] \.\. [ ][ ]
(\d\d\d\d-\d\d-\d\d) [ ] \w\w\w [ ] (\d\d:\d\d:\d\d)
/x
) {
$current_time = "$1T$2";
}
if (/updates_in_interval: ([-+\d.e]+)/) {
$updates_in_interval[$update_count++] = $1;
if ($update_count == 3) {
#print "$current_time\t$updates_in_interval[1]\t$updates_in_interval[2]\n";
$ts->add_timestring($current_time, @updates_in_interval[1, 2]);
}
}
}
}
print $ts->plot;

162
rss2html.css Normal file
View File

@ -0,0 +1,162 @@
.item {
border-style: none;
border-width: 1px;
background-color: #F6F6FF;
margin: 1em;
hyphens: auto;
-moz-hyphens: auto;
}
.item:after {
display: block;
clear: both;
content: "";
background-color: #00F;
height: 6px;
background: linear-gradient(#F6F6FF, #EEEEFF);
}
.op {
float: right;
padding: 6px;
}
.op a {
border-style: none;
border-radius: 3px;
padding: 3px;
background: linear-gradient(#88F, #008);
color: #FFF;
}
h2 a {
text-decoration: none;
}
h2 {
font-family: "Quercus", sans-serif;
font-variant: small-caps;
}
.itemno {
font-size: 0.5em;
}
.itemno:before {
content: " (";
color: #F00;
}
.itemno:after {
content: ") ";
color: #F00;
}
.feed_info {
display: table-row;
border-style: solid;
border-width: 1px;
margin: .1em;
}
.feed_id {
font-size: 0.7em;
display: table-cell;
text-align: right;
padding-right: 0.5em;
}
.feed_title {
padding-right: 0.2em;
}
.feed_items {
background-color: #CCF;
color: #888;
display: table-cell;
text-align: right;
padding-left: 0.2em;
padding-right: 0.2em;
}
.feed_unread {
background-color: #DDF;
color: #080;
display: table-cell;
text-align: right;
padding-left: 0.2em;
padding-right: 0.2em;
}
.feed_lastissued {
background-color: #CCF;
color: #000;
display: table-cell;
padding-left: 0.2em;
padding-right: 0.2em;
text-align: right;
}
.feed_lastseen {
background-color: #DDF;
color: #000;
display: table-cell;
padding-left: 0.2em;
padding-right: 0.2em;
text-align: right;
}
.feedlist {
background-color: #BBF;
float: left;
display: table;
margin-right: 0.5em;
padding: 0.2em;
}
.feedlist a {
text-decoration: none;
}
.op a {
text-decoration: none;
}
.itemlist {
background-color: #EEF;
max-width: 40em;
float: left;
font-family: "Quercus", sans-serif;
}
@media screen and (min-width: 60em) and (max-width: 80em) {
.itemlist {
max-width: 48%;
}
.feedlist {
max-width: 48%;
}
}
.item.read {
background-color: #EEE;
}
.lastupdate {
position: absolute;
top: 0em;
right: 0em;
}
h1 {
background: linear-gradient(to right, #008, #CCF);
margin: 0em;
color: #FFF;
}
body {
margin: 0em;
}
img {
max-width: 99%;
}

25
scrubber_test Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/perl
# missing: Sanitize html
use strict;
use warnings;
use Rss2Html::Scrubber;
use DBI;
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
my $scrubber = Rss2Html::Scrubber->new();
my $item
= $dbh->selectrow_hashref(
"select link, items.title as item_title, content, items.id as item_id
from items
where item_id=?
",
{ Slice => {} }, $ARGV[0]);
my $content = $scrubber->scrub($item->{content});
open my $fh1, '>:encoding(UTF-8)', "$ARGV[0].raw.html";
print $fh1 $item->{content};
open my $fh2, '>:encoding(UTF-8)', "$ARGV[0].scrubbed.html";
print $fh2 $content;

49
update2 Executable file
View File

@ -0,0 +1,49 @@
#!/usr/bin/perl
use strict;
use warnings;
use v5.14;
use DBI;
update2();
sub update2 {
no autovivification qw(fetch);
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
my $feeds = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} });
my $seconds_per_week = 86400 * 7;
for my $feed (@$feeds) {
print "feed: $feed->{id} $feed->{title}\n";
my $now = time();
my $update_time = $now - $feed->{last_update};
print "\t", "update_time: $update_time\n";
my $start = $now - 4 * $seconds_per_week;
my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued",
{ Slice => {} },
$feed->{id}, $start,
);
my $first_issued = $items->[0]{issued} // $start;
my $total_time = $now - $first_issued;
print "\t", "total_time: $total_time\n";
my $updates_in_interval = 0;
for my $item (@$items) {
my $t = $item->{issued};
my $dt1 = $now - $t;
my $dt2 = $dt1 % $seconds_per_week;
if ($dt2 < $update_time) {
$updates_in_interval++;
}
}
# normalize to one week
$updates_in_interval *= $seconds_per_week / $total_time;
$updates_in_interval += $update_time / $seconds_per_week; # add one update per week
print "\t", "updates_in_interval: $updates_in_interval\n";
}
}
# vim: tw=0

11
update3 Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/perl
use warnings;
use strict;
use Rss2Html::FeedList;
use Log::Log4perl qw(:easy);
BEGIN { Log::Log4perl->init("log.conf") };
my $feedlist = Rss2Html::FeedList->new();
$feedlist->update_adaptive;

49
update_frequency Executable file
View File

@ -0,0 +1,49 @@
#!/usr/bin/perl
use strict;
use warnings;
use v5.14;
use DBI;
use POSIX qw(strftime);
my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "");
$dbh->{sqlite_unicode} = 1;
my $feed_id = $ARGV[0];
my $issued
= $dbh->selectcol_arrayref(
"select issued from items where feed_id=? order by issued", {}, $feed_id
);
my @dt;
for (1 .. $#{$issued}) {
push @dt, $issued->[$_] - $issued->[$_-1];
}
@dt = sort { $a <=> $b } @dt;
my $c = 0;
for (@dt) {
$c++;
printf("%14s %8d %5.1f%%\n", hr($_), $_, $c * 100 / @dt);
}
sub hr {
my ($s) = $_;
my $hr = "";
if ($s > 86400) {
my $d = int ($s / 86400);
$hr .= $d . "d";
$s -= $d * 86400;
}
if ($hr || $s > 3600) {
my $h = int ($s / 3600);
$hr .= sprintf("%02dh", $h);
$s -= $h * 3600;
}
if ($hr || $s > 60) {
my $m = int ($s / 60);
$hr .= sprintf("%02dm", $m);
$s -= $m * 60;
}
$hr .= sprintf("%02ds", $s);
return $hr;
}