2006-02-14 10:45:11 +01:00
|
|
|
#!/usr/bin/perl -w
|
|
|
|
use strict;
|
|
|
|
|
2006-07-24 10:43:57 +02:00
|
|
|
=head1 NAME
|
|
|
|
|
|
|
|
top_n - get top n (of many) time series
|
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
|
|
|
|
top_n n f v
|
|
|
|
|
|
|
|
This script expects a file in tab-separated format containing all series mixed together in
|
|
|
|
"normalized" or "vertical" format,
|
|
|
|
e.g.,
|
|
|
|
|
|
|
|
2006-07-22T20:00:00 Wien/Innere Stadt 31.5
|
|
|
|
2006-07-22T20:00:00 Wien/Hohe Warte 29.5
|
|
|
|
2006-07-22T21:00:00 Wien/Innere Stadt 30.9
|
|
|
|
2006-07-22T21:00:00 Wien/Hohe Warte 27.9
|
|
|
|
2006-07-22T22:00:00 Wien/Innere Stadt 30.1
|
|
|
|
2006-07-22T22:00:00 Wien/Hohe Warte 26.2
|
|
|
|
...
|
|
|
|
|
|
|
|
n is the number of time series to extract.
|
|
|
|
|
|
|
|
f is the number of the column with the series names, starting at 0. In the example above, this would be 1.
|
|
|
|
|
|
|
|
v is the number of the column with values to be used for sorting. In the example above, this would be 2
|
|
|
|
(there is only one column with values in this example).
|
|
|
|
|
|
|
|
All values for a given series are added, and the the n series with the highest sum are selected.
|
|
|
|
Finally all records from the selected series and an additional series "OTHER" representing the sum
|
|
|
|
of all other series are printed.
|
|
|
|
|
|
|
|
=cut
|
|
|
|
|
2006-02-14 10:45:11 +01:00
|
|
|
my $n = shift;
|
|
|
|
my $f = shift;
|
|
|
|
my $v = shift;
|
|
|
|
|
|
|
|
my @data;
|
|
|
|
my %s;
|
|
|
|
while (<>) {
|
|
|
|
chomp;
|
|
|
|
my @a = split(/\t/);
|
|
|
|
push @data, [@a];
|
|
|
|
$s{$a[$f]} += $a[$v];
|
|
|
|
}
|
|
|
|
|
|
|
|
my @top_n = (sort { $s{$b} <=> $s{$a} } keys %s )[ 0 .. $n - 1 ];
|
|
|
|
my %top_n = map { $_ => 1 } @top_n;
|
|
|
|
|
|
|
|
my %index;
|
|
|
|
|
|
|
|
for my $i (0 .. $#data) {
|
|
|
|
unless ($top_n{$data[$i][$f]}) {
|
2006-07-24 10:43:57 +02:00
|
|
|
|
|
|
|
# this is a bit tricky, as records for the same point in time for
|
|
|
|
# different series can be scattered all over the place. We assume
|
|
|
|
# that all columns except $f and $v are relevant to the point in
|
|
|
|
# time (or some other distinguishing criterium), so we save the value,
|
|
|
|
# set $v and $f to fixed values and concatenate all columns.
|
|
|
|
# This will get a unique value iff all columns except $f and $v are unique.
|
|
|
|
# If we have seen this value before, we add the current value to
|
|
|
|
# the record with this value and undef the current value. Otherwise we
|
|
|
|
# record the index of the current record.
|
2006-02-14 10:45:11 +01:00
|
|
|
my $val = $data[$i][$v];
|
|
|
|
$data[$i][$v] = 0;
|
2006-02-14 10:46:14 +01:00
|
|
|
$data[$i][$f] = 'OTHER';
|
2006-02-14 10:45:11 +01:00
|
|
|
my $k = join("\t", @{$data[$i]});
|
|
|
|
if (defined($index{$k})) {
|
|
|
|
$data[$index{$k}][$v] += $val;
|
|
|
|
$data[$i][$v] = undef;
|
|
|
|
} else {
|
|
|
|
$data[$i][$v] = $val;
|
|
|
|
$index{$k} = $i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (@data) {
|
|
|
|
if (defined($_->[$v])) {
|
|
|
|
print join("\t", @$_), "\n";
|
|
|
|
}
|
|
|
|
}
|