#!/usr/bin/perl -w use strict; =head1 NAME top_n - get top n (of many) time series =head1 SYNOPSIS top_n n f v This script expects a file in tab-separated format containing all series mixed together in "normalized" or "vertical" format, e.g., 2006-07-22T20:00:00 Wien/Innere Stadt 31.5 2006-07-22T20:00:00 Wien/Hohe Warte 29.5 2006-07-22T21:00:00 Wien/Innere Stadt 30.9 2006-07-22T21:00:00 Wien/Hohe Warte 27.9 2006-07-22T22:00:00 Wien/Innere Stadt 30.1 2006-07-22T22:00:00 Wien/Hohe Warte 26.2 ... n is the number of time series to extract. f is the number of the column with the series names, starting at 0. In the example above, this would be 1. v is the number of the column with values to be used for sorting. In the example above, this would be 2 (there is only one column with values in this example). All values for a given series are added, and the the n series with the highest sum are selected. Finally all records from the selected series and an additional series "OTHER" representing the sum of all other series are printed. =cut my $n = shift; my $f = shift; my $v = shift; my @data; my %s; while (<>) { chomp; my @a = split(/\t/); push @data, [@a]; $s{$a[$f]} += $a[$v]; } my @top_n = (sort { $s{$b} <=> $s{$a} } keys %s )[ 0 .. $n - 1 ]; my %top_n = map { $_ => 1 } @top_n; my %index; for my $i (0 .. $#data) { unless ($top_n{$data[$i][$f]}) { # this is a bit tricky, as records for the same point in time for # different series can be scattered all over the place. We assume # that all columns except $f and $v are relevant to the point in # time (or some other distinguishing criterium), so we save the value, # set $v and $f to fixed values and concatenate all columns. # This will get a unique value iff all columns except $f and $v are unique. # If we have seen this value before, we add the current value to # the record with this value and undef the current value. Otherwise we # record the index of the current record. my $val = $data[$i][$v]; $data[$i][$v] = 0; $data[$i][$f] = 'OTHER'; my $k = join("\t", @{$data[$i]}); if (defined($index{$k})) { $data[$index{$k}][$v] += $val; $data[$i][$v] = undef; } else { $data[$i][$v] = $val; $index{$k} = $i; } } } for (@data) { if (defined($_->[$v])) { print join("\t", @$_), "\n"; } }