commit ff784ba35122a1dc89767f5006999ebf9bea8116 Author: Peter J. Holzer Date: Mon Jun 12 21:29:41 2017 +0200 Copy from web site Start a git repo independent of my web site. diff --git a/.htaccess b/.htaccess new file mode 100644 index 0000000..3f1247a --- /dev/null +++ b/.htaccess @@ -0,0 +1,10 @@ +AuthName hjp-work-in-progress +AuthType Basic +AuthUserFile /usr/local/www/offline/hjp/passwd +AuthGroupFile /usr/local/www/offline/hjp/group + + require valid-user + + + require valid-user + diff --git a/Notes b/Notes new file mode 100644 index 0000000..e69de29 diff --git a/Rss2Html/Feed.pm b/Rss2Html/Feed.pm new file mode 100644 index 0000000..55fc43c --- /dev/null +++ b/Rss2Html/Feed.pm @@ -0,0 +1,223 @@ +package Rss2Html::Feed; + +=head1 Name + +Rss2Html::Feed - a feed + +=head1 Description + +This class represents a single feed. + +It currently has only one public method, update(), which updates the +feed. + +=cut + +use strict; +use warnings; +use v5.14; +use Moose; + +use Cache::Memcached; +use Encode qw(decode_utf8); +use HTTP::Date; +use XML::Atom::Client; +use XML::RAI; + +with 'MooseX::Log::Log4perl'; + +has 'id' => ( + is => 'rw', +); +has 'url' => ( + is => 'rw', +); +has 'update_frequency' => ( + is => 'rw', +); +has 'last_update' => ( + is => 'rw', +); +has 'title' => ( + is => 'rw', +); +has 'type' => ( + is => 'rw', +); +has 'allow_img' => ( + is => 'rw', +); +has 'expire' => ( + is => 'rw', +); + +has 'dbh' => ( + is => 'rw', +); + +my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']); + + +=head1 Methods + +=cut + +=head2 update + +Update the feed. This is unconditional. The logic to decide which feeds +must be updated is in L. + +=cut + +sub update { + my ($self) = @_; + my $dbh = $self->dbh; + + my $now = int(time()); + + if ($self->{type} eq 'rss') { + local $@ = "unknown error"; + + # XML::RAI has a parse_uri method, but we explicitely fetch the URI to get better diagnostics. + # Probably should do this for atom, too, so we can autodetect the feed type. + my $ua = LWP::UserAgent->new; + $ua->agent("rss2html/1.0 (+http://www.hjp.at/projekte/rss2html/)"); + my $response = $ua->get($self->{url}); + if ($response->code != 200) { + $self->log->warn("GET $self->{url} failed: " . $response->status_line); + return; + } + $self->log->info("$self->{url} has content type " . $response->content_type); + + my $rai = eval {XML::RAI->parse_string($response->content) }; + if ($rai) { + $rai->time_format('EPOCH'); + if ($rai->channel->title ne $self->{title}) { + $self->log->info("set title"); + $dbh->do("update feeds set title=? where id=?", {}, + $rai->channel->title, $self->{id}); + } + $self->log->info("checking " . scalar(@{$rai->items}) . " items in db"); + for my $item ( @{$rai->items} ) { + my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $item->link); + if ($item_db) { + $self->log->info("item $item_db->{id} seen"); + $dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id}); + } else { + $self->log->info("new"); + + my $issued = $item->issued // $now; + if ($issued < $self->{last_update} || $issued > $now) { + # $issued is clearly wrong. Could be any time between $self->{last_update} and $now. + # Choose a random time in the interval. + my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update})); + $self->log->info("fixing up issued time: $issued -> $guess_issued"); + $issued = $guess_issued; + } + + $self->log->info("item " . $item->link . " new"); + $dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)", + {}, + $item->title, $item->link, $item->content, $issued, $now, $self->{id} + ); + $self->invalidate_item_info; + } + } + } else { + print STDERR "error getting $self->{url}: $@\n"; + } + } elsif ($self->{type} eq 'atom') { + my $api = XML::Atom::Client->new(); + my $atomfeed = $api->getFeed($self->{url}); + if ($atomfeed) { + my @items = $atomfeed->entries; + for my $item (@items) { + my $link = $self->atom_main_link($item); + my $item_db = $dbh->selectrow_hashref("select * from items where link=?", {}, $link); + if ($item_db) { + $self->log->info("item $item_db->{id} seen"); + $dbh->do("update items set seen=? where id=?", {}, $now, $item_db->{id}); + } else { + $self->log->info("new"); + my $title = decode_utf8($item->title); # XXX - Workaround? + my $content = $item->content; + my $summary = $item->summary; + my $body = $content ? $content->body : + defined($summary) ? decode_utf8($summary) : undef; + + $self->log->info("item " . $link . " new"); + + my $issued = str2time($item->updated) // $now; + if ($issued < $self->{last_update} || $issued > $now) { + # $issued is clearly wrong. Could be any time between $self->{last_update} and $now. + # Choose a random time in the interval. + my $guess_issued = $self->{last_update} + int(rand($now - $self->{last_update})); + $self->log->info("fixing up issued time: $issued -> $guess_issued"); + $issued = $guess_issued; + } + + $dbh->do("insert into items(title, link, content, issued, seen, feed_id) values(?, ?, ?, ?, ?, ?)", + {}, + $title, $link, $body, $issued, $now, $self->{id} + ); + $self->invalidate_item_info; + } + } + } + } else { + print "

unknown type $self->{type}

\n"; + $dbh->do("update feeds set title=?, last_update=? where id=?", {}, + 'unknown', $now, $self->{id}); + return; + } + + # mark all but the last 100 entries as old, so they aren't shown by default. + $dbh->do("update items set old=1 + where id in (select id from items + where feed_id=? and (old is null or old != 1) + order by issued desc limit -1 offset 100)", + {}, + $self->{id}); + + $dbh->do("update feeds set last_update=? where id=?", {}, $now, $self->{id}); +} + + +sub atom_main_link { + my ($self, $item) = @_; # XXX should probably be a method of $item + for my $link ($item->link) { + $self->log->info("checking href ", $link->href, "\n"); + if (defined $link->rel && $link->rel ne 'alternate') { + $self->log->info("rel ", $link->rel, " unexpected, skip"); + next; + } + if (defined $link->type && $link->type ne 'text/html') { + $self->log->info("type ", $link->type, " unexpected, skip\n"); + next; + } + $self->log->info("match!\n"); + return $link->href; + } + +} + +sub invalidate_item_info { + my ($self) = @_; + + my $dbh = $self->dbh; + my $users = $dbh->selectcol_arrayref("select distinct user from read"); + for my $user (@$users) { + $mcd->delete($self->item_info_key($user)); + } +} + +sub item_info_key { + my ($self, $user) = @_; + return "rss2html/item_info/" . $user . "/" . $self->id; +} + +#--- +__PACKAGE__->meta->make_immutable; +1; + +# vim: tw=132 diff --git a/Rss2Html/FeedList.pm b/Rss2Html/FeedList.pm new file mode 100644 index 0000000..7e6acd4 --- /dev/null +++ b/Rss2Html/FeedList.pm @@ -0,0 +1,131 @@ +package Rss2Html::FeedList; +use warnings; +use strict; +use Moose; +use namespace::autoclean; +use DBI; +use Rss2Html::Feed; +use POSIX qw(strftime); + +with 'MooseX::Log::Log4perl'; + + +has 'dbh' => ( + is => 'ro', + writer => '_set_dbh', +); + +sub BUILD { + my ($self, $args) = @_; + my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "", + { + sqlite_use_immediate_transaction => 1, + }); + $dbh->{sqlite_unicode} = 1; + $self->_set_dbh($dbh); +} + +sub feeds { + my ($self) = @_; + my $dbh = $self->dbh; + + no autovivification qw(fetch); + + my $feeds_sql = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} }); + my @feeds; + for (@$feeds_sql) { + push @feeds, Rss2Html::Feed->new(%$_, dbh => $dbh); + } + return \@feeds; +} + +sub update_adaptive { + my ($self) = @_; + my $dbh = $self->dbh; + my $start = time(); + + no autovivification qw(fetch); + + $dbh->begin_work; + + my $feeds = $self->feeds; + my $seconds_per_week = 86400 * 7; + for my $feed (@$feeds) { + $feed->{title} //= "-"; + #print "

feed: $feed->{id} $feed->{title}
\n"; + $self->log->info("feed: $feed->{id} $feed->{title}"); + my $now = time(); + my $run_time = $now - $start; + my $update_time = $now - $feed->{last_update}; + $self->log->info(strftime("%Y-%m-%d %a %H:%M:%S", localtime($feed->{last_update})), " .. ", + strftime("%Y-%m-%d %a %H:%M:%S", localtime($now)), ": $update_time seconds"); + #print "\t", "update_time: $update_time
\n"; + + # We are guaranteed to have all items which were issued in the last + # int($feed->{expire} / 7) weeks since we expire only those not seen for + # more than $feed->{expire} days. Some feeds keep some items for a long + # time while removing others quite quickly, so statistics for older + # items may be severely skewed and are not safe to use. + my $interval = int($feed->{expire} / 7) * 7 * 86400; + + # We measure those weeks backwards from the last update. + # If we measured from now we would almost always miss the update from + # exactly 4 weeks ago (one cycle we are too early, the next we are too + # late). + my $start = $feed->{last_update} - $interval; + my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued", + { Slice => {} }, + $feed->{id}, $start, + ); + my $first_issued = $items->[0]{issued} // $start; + my $total_time = $now - $first_issued; + my $updates_in_interval = 0; + for my $item (@$items) { + my $t = $item->{issued}; + my $dt1 = $now - $t; + my $dt2 = $dt1 % $seconds_per_week; + if ($dt2 < $update_time) { + $updates_in_interval++; + $self->log->info("found $item->{id} ", strftime("%Y-%m-%d %a %H:%M:%S", localtime($t))); + } + } + $self->log->info("updates_in_interval: $updates_in_interval"); + $updates_in_interval *= $seconds_per_week / $total_time; # normalize to one week + $self->log->info("updates_in_interval: $updates_in_interval"); + $updates_in_interval += $update_time / $seconds_per_week; # add one update per week + $self->log->info("updates_in_interval: $updates_in_interval"); + #print "\t", "updates_in_interval: $updates_in_interval
\n"; + if ($updates_in_interval > 0.5) { + $self->log->info($feed->{url}); + $self->log->info("need to update " . $feed->{url}); + $feed->update; + } + } + $self->cleanup(); + + $dbh->commit; + + $self->log->info("update done"); +} + + +sub cleanup { + my ($self) = @_; + my $dbh = $self->dbh; + + my $now = time(); + my $feeds = $dbh->selectall_arrayref("select * from feeds", { Slice => {} }); + + for my $feed (@$feeds) { + my $expire_days = $feed->{expire} // 30; + my $seen_limit = $now - $expire_days * 86400; + my $rc = $dbh->do("delete from items where feed_id=? and seen < ?", {}, $feed->{id}, $seen_limit); + $self->log->info("cleanup of feed $feed->{id} (limit $seen_limit) returned $rc"); + } + $dbh->do("delete from read where not exists (select id from items where id=item_id)"); +} + + +#--- +__PACKAGE__->meta->make_immutable; +1; diff --git a/Rss2Html/Scrubber.pm b/Rss2Html/Scrubber.pm new file mode 100644 index 0000000..c2986ec --- /dev/null +++ b/Rss2Html/Scrubber.pm @@ -0,0 +1,31 @@ +package Rss2Html::Scrubber; +use warnings; +use strict; +use parent 'HTML::Scrubber'; + +sub new { + my ($class, %options) = @_; + my $self = HTML::Scrubber->new(); + $self->deny(qw(link)); + $self->allow(qw(p a em strong b i ul ol li dl dt dd br)); + my %rules = ( + a => { + href => qr{^https?://}i, + '*' => 0, + }, + ); + if ($options{allow_img}) { + $rules{img} = { + src => qr{^https?://}i, + alt => 1, + '*' => 0, + }, + } + $self->rules( + %rules + ); + bless $self, $class; + return $self; +} + +1; diff --git a/do_cron b/do_cron new file mode 100755 index 0000000..29cb812 --- /dev/null +++ b/do_cron @@ -0,0 +1,3 @@ +#!/bin/sh +cd $(dirname $0) +./update3 diff --git a/feed.cgi b/feed.cgi new file mode 100755 index 0000000..a6e674a --- /dev/null +++ b/feed.cgi @@ -0,0 +1,66 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use v5.14; + +use XML::RAI; +use XML::Atom::Client; +use DBI; +use CGI; +use Rss2Html::Scrubber; +use Time::HiRes qw(time); +use POSIX qw(strftime); + +$| = 1; + +my $q = CGI->new(); +binmode STDOUT, ":encoding(UTF-8)"; + +my $feed_id = $q->param('id'); + +my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); +$dbh->{sqlite_unicode} = 1; + +print "Content-Type: text/html; charset=utf-8\n"; +print "Refresh: 600\n"; +print "\n"; + +print "\n"; +print "

RSS 2 HTML

\n"; +print "

", strftime("%Y-%m-%d %H:%M:%S%z", localtime()), "

\n"; + +my $feed = $dbh->selectrow_hashref("select * from feeds where id=?", {}, $feed_id); + +print "\n"; +print "", $q->escapeHTML($feed->{id}), "\n"; +print "", $q->escapeHTML($feed->{title}), "\n"; +print "", $q->escapeHTML($feed->{type}), "\n"; +print "\n"; + +my $now = time(); +my $seconds_per_week = 86400 * 7; +my $start = $now - 4 * $seconds_per_week; +my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued", + { Slice => {} }, + $feed->{id}, $start, + ); +for my $item (@$items) { + my $t = $item->{issued}; + my $dt1 = $now - $t; + my $dt2 = $dt1 % $seconds_per_week; + $item->{dt2} = $dt2; +} +$items = [ sort { $a->{dt2} <=> $b->{dt2} } @$items ]; +print "\n"; +for my $item (@$items) { + my $t = $now - $item->{dt2}; + print"\n"; + print"\n"; + print"\n"; + print"\n"; + print"\n"; + +} +print "
", $item->{dt2}, "", strftime("%a %H:%M:%S", localtime($t)), "", "", $q->escapeHTML($item->{title}), "", "
\n"; +# vim: expandtab diff --git a/index.cgi b/index.cgi new file mode 100755 index 0000000..064b4bf --- /dev/null +++ b/index.cgi @@ -0,0 +1,426 @@ +#!/usr/bin/perl + +# Parameters: +# +# mark=id +# mark item id as read +# +# redir=id +# redirect to url of item id and mark item as read +# +# fls=(all|unread) +# "feed list show": Show all feeds or only feeds with unread items in feed list +# +# fis= comma separated list of feed ids +# show only listed feeds +# BUG: fis is only effective if sr is non-zero! +# +# sr=n +# show last n read items. Note: Currently these are the n read items with the +# highest ids, which may not be the last items which have been read. Need to +# store the read date for that. +# +# so=1 +# also show old items. +# +# sl=1 +# show items marked as "later". + + +use strict; +use warnings; +use v5.14; +use utf8; +use experimental 'smartmatch'; # for given/when +use experimental 'autoderef'; + +use CGI; +use Cache::Memcached; +use DBI; +use Data::Dumper; +use Encode qw(encode_utf8 decode_utf8); +use POSIX qw(strftime); +use Rss2Html::Scrubber; +use Time::HiRes qw(time); + +$| = 1; + +my $start = time(); + +my $q = CGI->new(); +binmode STDOUT, ":encoding(UTF-8)"; +my $mcd = Cache::Memcached->new(servers => ['127.0.0.1:11211']); + +if ($q->param('mark')) { + mark(); +} elsif ($q->param('redir')) { + redirect(); +} else { + list(); +} + +sub redirect { + my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", "", + { + sqlite_use_immediate_transaction => 1, + }); + $dbh->{sqlite_unicode} = 1; + my $item_id = $q->param('redir'); + my $item = $dbh->selectrow_hashref("select * from items where id=?", {}, $item_id); + if ($item) { + print "Status: 302\n"; + print "Location: $item->{link}\n"; + print "\n"; + $dbh->do("insert into read(user, item_id) values(?, ?)", {}, $q->remote_user, $item_id); + $mcd->delete(item_info_key($q->remote_user, $item->{feed_id})); + exit(0); + } + print "Status: 404\n"; + print "Content-Type: text/html; charset=utf-8\n"; + print "\n"; + print "not found\n"; + +} + +sub mark { + print_log("mark start"); + my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); + $dbh->{sqlite_unicode} = 1; + my $item_id = $q->param('mark'); + my $item = $dbh->selectrow_hashref("select * from items where id=?", {}, $item_id); + if ($item) { + my $q1 = CGI->new($q); + $q1->delete('mark'); + print "Status: 302\n"; + print "Location: ", $q1->self_url, "\n"; + print "\n"; + $dbh->do("insert into read(user, item_id) values(?, ?)", {}, $q->remote_user, $item_id); + $mcd->delete(item_info_key($q->remote_user, $item->{feed_id})); + print_log("mark done"); + exit(0); + } + print "Status: 404\n"; + print "Content-Type: text/html; charset=utf-8\n"; + print "\n"; + print "not found\n"; + print_log("mark failed"); + +} + +sub list { + + my $feed_list_show = $q->param('fls') // "all"; + my $feed_item_show = $q->param('fis') // ''; + my $show_read = $q->param('sr'); + my $show_old = $q->param('so'); + my $show_later = $q->param('sl'); + my %feed_item_show = map(($_+0 => 1), split(/,/, $feed_item_show)); + + + print "Content-Type: text/html; charset=utf-8\n"; + print "Refresh: 600\n"; + print "\n"; + + print "\n"; + print "\n"; + print "

RSS 2 HTML

\n"; + print "
", strftime("%Y-%m-%d %H:%M:%S%z", localtime()), "
\n"; + + my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); + $dbh->{sqlite_unicode} = 1; + + print_log("list_feeds start"); + list_feeds($dbh, $feed_list_show, $show_old); + print_log("list_feeds done"); + + my $items; + if ($show_read) { + my $fields = "feeds.id as feed_id, feeds.title as feed_title, allow_img, link, items.title as item_title, content, items.id as item_id, issued, user, lang"; + my $tables = "items + join feeds on items.feed_id=feeds.id + left outer join read on (items.id=read.item_id and user=?)"; + my @where; + if (%feed_item_show) { + push @where, "feeds.id in (" . join(',', keys %feed_item_show) . ")"; + } + if (!$show_old) { + push @where, "(items.old is null or items.old == 0)" + } + + my $where = "where " . join(" and ", @where); + + my $cmd = "select $fields from $tables $where order by issued"; + print_log("$cmd"); + my $items1 = $dbh->selectall_arrayref($cmd, { Slice => {} }, $q->remote_user); + my $nrd = 0; + while (@$items1) { + my $item = pop @$items1; + { + no warnings 'uninitialized'; + print_log("id=$item->{item_id}, user=$item->{user}"); + } + if (!$item->{user}) { + print_log("id=$item->{item_id}: unread"); + unshift @$items, $item; + } elsif ($nrd < $show_read) { + print_log("id=$item->{item_id}: $nrd < $show_read"); + unshift @$items, $item; + $nrd++; + } + } + + } else { + my @where = ("user is null"); + + if (!$show_old) { + push @where, "(items.old is null or items.old == 0)" + } + + my $where = "where " . join(" and ", @where); + + + $items = $dbh->selectall_arrayref( + "select feeds.id as feed_id, feeds.title as feed_title, allow_img, link, items.title as item_title, content, items.id as item_id, issued, lang + from items + join feeds on items.feed_id=feeds.id + left outer join read on (items.id=read.item_id and user=?) + $where + order by issued", + { Slice => {} }, $q->remote_user); + } + my $feeds = $dbh->selectall_arrayref( + "select id, title from feeds where update_frequency is not null order by id", + { Slice => {} } + ); + for my $f (@$feeds) { + $f->{selected} = 1 if $feed_item_show{$f->{id}}; + } + print_itemlist($items, $feeds); + print_log("list done"); +} + +sub print_itemlist { + my ($items, $feeds) = @_; + print "
\n"; + print "
\n"; + print "\n"; + my $fls = $q->param('fls'); + print "\n" if defined $fls; + print "\n"; + print "\n"; + print "
\n"; + my $n_items = 0; + my $n_html_mcd = 0; + my $n_scrub_mcd = 0; + my $n_scrub = 0; + print_log("print_itemlist: \$q=" . $q->self_url); + my $q1 = CGI->new($q); + print_log("print_itemlist: \$q1=" . $q1->self_url . " (before loop)"); + print_log(scalar @$items . " before remix"); + $items = remix($items); + print_log(scalar @$items . " after remix"); + for my $item (@$items) { + $n_items++; + my $is_read = defined($item->{user}); + my $scrubbed_content = $mcd->get(scrubbed_content_key($item->{item_id})); + if ($scrubbed_content) { + $scrubbed_content = decode_utf8($scrubbed_content); + $n_scrub_mcd++; + } else { + my $scrubber = Rss2Html::Scrubber->new(allow_img => $item->{allow_img}); + $scrubbed_content = $scrubber->scrub($item->{content}); + $mcd->set(scrubbed_content_key($item->{item_id}), encode_utf8($scrubbed_content), 3600); + $n_scrub++; + } + $q1->param('mark', $item->{item_id}); + my $item_class = 'item' . ($is_read ? ' read' : ''); + my $langattr = defined $item->{lang} ? "lang='$item->{lang}'" : ""; + my $html = ""; + $html .= "
\n"; + $html .= "" . $item->{item_id} . "\n"; + $html .= "" . strftime('%Y-%m-%d %H:%M:%S', localtime($item->{issued})) . "\n"; + unless ($is_read) { + print_log("print_itemlist: \$q1=" . $q1->self_url . " (in loop)"); + $html .= "\n"; + } + $html .= "
" . $q->escapeHTML($item->{feed_title}) . "
\n"; + $html .= "

" . $q->escapeHTML($item->{item_title}) . "

\n"; + $html .= "
" . $scrubbed_content . "
\n"; + unless ($is_read) { + $html .= "\n"; + } + $html .= "
\n"; + $html .= ""; + print $html; + + } + print "
\n"; + print_log("itemlist: $n_items items ($n_html_mcd html cached, $n_scrub_mcd scrubbed content cached, $n_scrub scrubbed)"); +} + +sub remix { + my ($items) = @_; + + my $queues = {}; + for my $item (@$items) { + if (!defined $item->{feed_id}) { + local $Data::Dumper::Indent = 0; + print_log("item w/o feed_id: " . Dumper($item)); + } + $queues->{$item->{feed_id}} //= []; + push $queues->{$item->{feed_id}}, $item; + } + my @ordered_feeds = sort { $queues->{$a}[0]{issued} <=> $queues->{$b}[0]{issued} } keys $queues; + my $new_items = []; + my $found; + do { + $found = 0; + for my $feed (@ordered_feeds) { + my $item = shift $queues->{$feed}; + if ($item) { + push $new_items, $item; + $found++; + print_log("found an item in feed $feed. " . scalar(@{$queues->{$feed}}) . " items left") + } + } + } while ($found); + + return $new_items; +} + +sub list_feeds { + my ($dbh, $feed_list_show, $show_old) = @_; + + no autovivification qw(fetch); + + my $q1 = CGI->new($q); + my $other = $feed_list_show eq 'all' ? 'unread' : 'all'; + $q1->param('fls', $other); + print "
$other
\n"; + print "
\n"; + $dbh->begin_work; + + my $feeds = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} }); + my $seconds_per_week = 86400 * 7; + for my $feed (@$feeds) { + $feed->{title} //= "-"; + print_item_info($dbh, $feed, $feed_list_show, $show_old); + + } + $dbh->commit; + + print "
\n"; +} + + +sub print_item_info { + my ($dbh, $feed, $feed_list_show, $show_old) = @_; + + my $item_info_key = item_info_key($q->remote_user, $feed->{id}); + my $item_info = $mcd->get($item_info_key); + my $cmd + = "select count(i.id) as nr_items, max(i.issued) as last_issued, max(i.seen) as last_seen, count(r.item_id) as nr_read + from items i left outer join (select item_id from read where user=?) r on i.id=r.item_id + where i.feed_id=?"; + unless ($show_old) { + $cmd .= " and (old is null or old = 0)"; + } + unless ($item_info) { + $item_info + = $dbh->selectrow_hashref( + $cmd, + {}, + $q->remote_user, $feed->{id} + ); + $mcd->set($item_info_key, $item_info, 3600); + } + my $now = time(); + my $unread = $item_info->{nr_items} - $item_info->{nr_read}; + print_log("print_item_info: feed $feed->{id}, nr_items=$item_info->{nr_items}, read=$item_info->{nr_read}, unread=$unread"); + return unless ($unread || $feed_list_show eq 'all'); + + my $q1 = CGI->new($q); + $q1->param('fis', $feed->{id}); + $q1->param('sr', 10); + print "\n"; + print "", $q->escapeHTML($feed->{id}), "\n"; + print "", + "", + $q->escapeHTML($feed->{title}), + "", + "\n"; + print "", $item_info->{nr_items}, "\n"; + print "", $unread, "\n"; + print "", delta_t($item_info->{last_issued}, $now), "\n"; + print "", delta_t($item_info->{last_seen}, $now), "\n"; + print "\n"; +} + + +sub print_log { + my $msg = "@_"; + my $now = time(); + printf STDERR "%s: %s.%06d %f: %s\n", $0, strftime("%H:%M:%S", localtime($now)), ($now - int($now)) * 1E6, $now - $start, $msg; +} + +sub delta_t { + my ($t1, $t2) = @_; + return "N/A" unless defined $t1 && defined $t2; + my $dt = $t2 - $t1; + my $s = ""; + if ($dt < 0) { + $s .= "-"; + $dt = -$dt; + } + given ($dt) { + when ($_ >= 86400 * 10) { + $s .= sprintf("%.0fd", $_ / 86400); + } + when ($_ >= 86400) { + $s .= sprintf("%.1fd", $_ / 86400); + } + when ($_ >= 3600 * 10) { + $s .= sprintf("%.0fh", $_ / 3600); + } + when ($_ >= 3600) { + $s .= sprintf("%.1fh", $_ / 3600); + } + when ($_ >= 60 * 10) { + $s .= sprintf("%.0fm", $_ / 60); + } + when ($_ >= 60) { + $s .= sprintf("%.1fm", $_ / 60); + } + when ($_ >= 10) { + $s .= sprintf("%.0fs", $_); + } + default { + $s .= sprintf("%.1fs", $_); + } + } +} + + +sub item_info_key { + my ($user, $feed_id) = @_; + return "rss2html/item_info/$user/$feed_id"; +} + + +sub scrubbed_content_key { + my ($item_id) = @_; + return "rss2html/scrubbed_content/$item_id"; +} + +sub item_html_key { + my ($item_id, $is_read) = @_; + return "rss2html/item_html/$item_id/$is_read"; +} + + +# vim: tw=132 expandtab diff --git a/log.conf b/log.conf new file mode 100644 index 0000000..31d99a3 --- /dev/null +++ b/log.conf @@ -0,0 +1,12 @@ +log4perl.logger = DEBUG, RotFileApp +log4perl.appender.FileApp = Log::Log4perl::Appender::File +log4perl.appender.FileApp.filename = test.log +log4perl.appender.FileApp.layout = PatternLayout +log4perl.appender.FileApp.layout.ConversionPattern = %d [%p] %P %c %m%n +log4perl.appender.RotFileApp = Log::Dispatch::FileRotate +log4perl.appender.RotFileApp.filename = test.log +log4perl.appender.RotFileApp.layout = PatternLayout +log4perl.appender.RotFileApp.layout.ConversionPattern = %d [%p] %P %c %m%n +log4perl.appender.RotFileApp.max = 7 +log4perl.appender.RotFileApp.mode = append +log4perl.appender.RotFileApp.binmode = :encoding(UTF-8) diff --git a/plot_intervals b/plot_intervals new file mode 100755 index 0000000..cabbe09 --- /dev/null +++ b/plot_intervals @@ -0,0 +1,55 @@ +#!/usr/bin/perl +use warnings; +use strict; +use TimeSeries; + +my $wanted_feed_id = shift; + +my $current_feed_id = -1; +my @updates_in_interval; + +my $ts = TimeSeries->new(style => 'impulses'); +$ts->legend('old', 'new'); +$ts->finalresolution(100); + +my @files = glob("test.log*"); +@files = sort { -M $b <=> -M $a } @files; + +my $current; +my @updates; +for my $file (@files) { + open my $fh, '<', $file; + while (<$fh>) { + + if (/ Rss2Html.FeedList feed: (\d+) .*/) { + $current_feed_id = $1; + } + + next unless $current_feed_id == $wanted_feed_id; + + if (/Rss2Html.FeedList [ ] + \d\d\d\d-\d\d-\d\d [ ] \w\w\w [ ] \d\d:\d\d:\d\d + [ ] \.\. [ ] + (\d\d\d\d-\d\d-\d\d) [ ] \w\w\w [ ] (\d\d:\d\d:\d\d) + : [ ] + (\d+) [ ] seconds + /x + ) { + $current = { time => "$1T$2", interval => $3 }; + } + if (/Rss2Html.FeedList need to update/) { + push @updates, $current; + } + if (/Rss2Html.Feed item .* new/) { + $current->{new}++; + } + } +} + +for (@updates) { + $ts->add_timestring($_->{time}, + $_->{new} ? (0, $_->{interval}) + : ($_->{interval}, 0 ) + ); +} +print $ts->plot; diff --git a/plot_stats b/plot_stats new file mode 100755 index 0000000..40b50f6 --- /dev/null +++ b/plot_stats @@ -0,0 +1,47 @@ +#!/usr/bin/perl +use warnings; +use strict; +use TimeSeries; + +my $wanted_feed_id = shift; + +my $current_feed_id = -1; +my $current_time; +my @updates_in_interval; +my $update_count; + +my $ts = TimeSeries->new(style => 'points'); +$ts->legend('raw', 'adjusted'); +$ts->finalresolution(100); + +my @files = glob("/var/log/roxen/hjp/cgi.????-??-??"); +@files = @files[-7..-1]; +for my $file (@files) { + open my $fh, '<', $file; + while (<$fh>) { + + if (/: \d+\.\d+: feed: (\d+) .*/) { + $current_feed_id = $1; + $update_count = 0; + } + + next unless $current_feed_id == $wanted_feed_id; + + if (/: [ ] + \d\d\d\d-\d\d-\d\d [ ] \w\w\w [ ] \d\d:\d\d:\d\d + [ ][ ] \.\. [ ][ ] + (\d\d\d\d-\d\d-\d\d) [ ] \w\w\w [ ] (\d\d:\d\d:\d\d) + /x + ) { + $current_time = "$1T$2"; + } + if (/updates_in_interval: ([-+\d.e]+)/) { + $updates_in_interval[$update_count++] = $1; + if ($update_count == 3) { + #print "$current_time\t$updates_in_interval[1]\t$updates_in_interval[2]\n"; + $ts->add_timestring($current_time, @updates_in_interval[1, 2]); + } + } + } +} +print $ts->plot; diff --git a/rss2html.css b/rss2html.css new file mode 100644 index 0000000..5c344c7 --- /dev/null +++ b/rss2html.css @@ -0,0 +1,162 @@ +.item { + border-style: none; + border-width: 1px; + background-color: #F6F6FF; + margin: 1em; + hyphens: auto; + -moz-hyphens: auto; +} + +.item:after { + display: block; + clear: both; + content: ""; + background-color: #00F; + height: 6px; + background: linear-gradient(#F6F6FF, #EEEEFF); +} + +.op { + float: right; + padding: 6px; +} + +.op a { + border-style: none; + border-radius: 3px; + padding: 3px; + background: linear-gradient(#88F, #008); + color: #FFF; +} + +h2 a { + text-decoration: none; +} + +h2 { + font-family: "Quercus", sans-serif; + font-variant: small-caps; +} + +.itemno { + font-size: 0.5em; +} + +.itemno:before { + content: " ("; + color: #F00; +} + +.itemno:after { + content: ") "; + color: #F00; +} + +.feed_info { + display: table-row; + border-style: solid; + border-width: 1px; + margin: .1em; +} + +.feed_id { + font-size: 0.7em; + display: table-cell; + text-align: right; + padding-right: 0.5em; +} + +.feed_title { + padding-right: 0.2em; +} + +.feed_items { + background-color: #CCF; + color: #888; + display: table-cell; + text-align: right; + padding-left: 0.2em; + padding-right: 0.2em; +} + +.feed_unread { + background-color: #DDF; + color: #080; + display: table-cell; + text-align: right; + padding-left: 0.2em; + padding-right: 0.2em; +} + +.feed_lastissued { + background-color: #CCF; + color: #000; + display: table-cell; + padding-left: 0.2em; + padding-right: 0.2em; + text-align: right; +} + +.feed_lastseen { + background-color: #DDF; + color: #000; + display: table-cell; + padding-left: 0.2em; + padding-right: 0.2em; + text-align: right; +} + +.feedlist { + background-color: #BBF; + float: left; + display: table; + margin-right: 0.5em; + padding: 0.2em; +} + +.feedlist a { + text-decoration: none; +} + +.op a { + text-decoration: none; +} + +.itemlist { + background-color: #EEF; + max-width: 40em; + float: left; + font-family: "Quercus", sans-serif; +} + +@media screen and (min-width: 60em) and (max-width: 80em) { + .itemlist { + max-width: 48%; + } + .feedlist { + max-width: 48%; + } +} + +.item.read { + background-color: #EEE; +} + +.lastupdate { + position: absolute; + top: 0em; + right: 0em; +} + +h1 { + background: linear-gradient(to right, #008, #CCF); + margin: 0em; + color: #FFF; +} +body { + margin: 0em; +} + +img { + max-width: 99%; +} diff --git a/scrubber_test b/scrubber_test new file mode 100755 index 0000000..8ed9e2a --- /dev/null +++ b/scrubber_test @@ -0,0 +1,25 @@ +#!/usr/bin/perl + +# missing: Sanitize html + +use strict; +use warnings; +use Rss2Html::Scrubber; + +use DBI; + +my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); +$dbh->{sqlite_unicode} = 1; +my $scrubber = Rss2Html::Scrubber->new(); +my $item + = $dbh->selectrow_hashref( + "select link, items.title as item_title, content, items.id as item_id + from items + where item_id=? + ", + { Slice => {} }, $ARGV[0]); +my $content = $scrubber->scrub($item->{content}); +open my $fh1, '>:encoding(UTF-8)', "$ARGV[0].raw.html"; +print $fh1 $item->{content}; +open my $fh2, '>:encoding(UTF-8)', "$ARGV[0].scrubbed.html"; +print $fh2 $content; diff --git a/update2 b/update2 new file mode 100755 index 0000000..7722fa4 --- /dev/null +++ b/update2 @@ -0,0 +1,49 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use v5.14; + +use DBI; + +update2(); + +sub update2 { + no autovivification qw(fetch); + + my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); + $dbh->{sqlite_unicode} = 1; + + my $feeds = $dbh->selectall_arrayref("select * from feeds where update_frequency is not null", { Slice => {} }); + my $seconds_per_week = 86400 * 7; + for my $feed (@$feeds) { + print "feed: $feed->{id} $feed->{title}\n"; + my $now = time(); + my $update_time = $now - $feed->{last_update}; + print "\t", "update_time: $update_time\n"; + my $start = $now - 4 * $seconds_per_week; + my $items = $dbh->selectall_arrayref("select * from items where feed_id=? and issued >= ? order by issued", + { Slice => {} }, + $feed->{id}, $start, + ); + my $first_issued = $items->[0]{issued} // $start; + my $total_time = $now - $first_issued; + print "\t", "total_time: $total_time\n"; + my $updates_in_interval = 0; + for my $item (@$items) { + my $t = $item->{issued}; + my $dt1 = $now - $t; + my $dt2 = $dt1 % $seconds_per_week; + if ($dt2 < $update_time) { + $updates_in_interval++; + } + } + # normalize to one week + $updates_in_interval *= $seconds_per_week / $total_time; + $updates_in_interval += $update_time / $seconds_per_week; # add one update per week + print "\t", "updates_in_interval: $updates_in_interval\n"; + + } +} + +# vim: tw=0 diff --git a/update3 b/update3 new file mode 100755 index 0000000..b53c9c1 --- /dev/null +++ b/update3 @@ -0,0 +1,11 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Rss2Html::FeedList; +use Log::Log4perl qw(:easy); + +BEGIN { Log::Log4perl->init("log.conf") }; + + +my $feedlist = Rss2Html::FeedList->new(); +$feedlist->update_adaptive; diff --git a/update_frequency b/update_frequency new file mode 100755 index 0000000..ae7a68c --- /dev/null +++ b/update_frequency @@ -0,0 +1,49 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use v5.14; + +use DBI; +use POSIX qw(strftime); + +my $dbh = DBI->connect("dbi:SQLite:dbname=rss2html.sqlite", "", ""); +$dbh->{sqlite_unicode} = 1; +my $feed_id = $ARGV[0]; +my $issued + = $dbh->selectcol_arrayref( + "select issued from items where feed_id=? order by issued", {}, $feed_id + ); + +my @dt; +for (1 .. $#{$issued}) { + push @dt, $issued->[$_] - $issued->[$_-1]; +} +@dt = sort { $a <=> $b } @dt; +my $c = 0; +for (@dt) { + $c++; + printf("%14s %8d %5.1f%%\n", hr($_), $_, $c * 100 / @dt); +} + +sub hr { + my ($s) = $_; + my $hr = ""; + if ($s > 86400) { + my $d = int ($s / 86400); + $hr .= $d . "d"; + $s -= $d * 86400; + } + if ($hr || $s > 3600) { + my $h = int ($s / 3600); + $hr .= sprintf("%02dh", $h); + $s -= $h * 3600; + } + if ($hr || $s > 60) { + my $m = int ($s / 60); + $hr .= sprintf("%02dm", $m); + $s -= $m * 60; + } + $hr .= sprintf("%02ds", $s); + return $hr; +}