# Provenance: this module was introduced by git commit
# f324d34f566cab1ae19261bee304fa882c4a0b23 ("Extract user agent info",
# Peter J. Holzer, 2019-08-11), which also added the two files below.
#
# Build.PL:
#     use Module::Build;
#     my $build
#         = Module::Build->new (
#             module_name => 'Net::Http::Useragent',
#             scripts => [
#             ],
#         );
#
#     $build->create_build_script;
#
# MANIFEST:
#     Build.PL
#     lib/Net/Http/Useragent.pm
#     MANIFEST    This list of files

package Net::Http::Useragent;
use warnings;
use strict;

=head1 NAME

Net::Http::Useragent - extract information from HTTP User-Agent header

=head1 SYNOPSIS

    my $agent = Net::Http::Useragent->new($user_agent_header);
    my $info  = $agent->munged;
    print $info->{useragent}, "\n";

=head1 DESCRIPTION

This class represents the contents of an HTTP User-Agent header, and can
be used to extract information (user agent, version, os, whether it's a
bot, ...) from it.

=cut

our $VERSION = 0.002;

=head1 METHODS AND FUNCTIONS

=head2 new

    my $agent = Net::Http::Useragent->new($useragent);

Creates an object wrapping the raw User-Agent header value C<$useragent>.
The string is stored unmodified under the C<useragent> key of the object.

=cut

sub new {
    my ($class, $useragent) = @_;

    return bless { useragent => $useragent }, $class;
}

=head2 canonical_os

    my $name = canonical_os($os);

Plain function (not a method). Returns a canonical OS name for C<$os>:

=over 4

=item *

a doubled "Windows Windows" is collapsed to a single "Windows",

=item *

anything containing "Linux" becomes just "Linux",

=item *

anything containing "Mac OS X" becomes just "Mac OS X".

=back

Any other string is returned unchanged. An undefined argument is returned
as undef without emitting warnings.

=cut

# No prototype here on purpose: a ($) prototype only changes how calls are
# parsed, it does not validate arguments.
sub canonical_os {
    my ($os) = @_;

    # Always return exactly one value: this function is called inside hash
    # constructors, where an empty list would shift the following keys.
    return $os unless defined $os;

    $os =~ s/Windows Windows/Windows/g;
    $os =~ s/.*Linux.*/Linux/g;
    $os =~ s/.*Mac OS X.*/Mac OS X/g;

    return $os;
}

=head2 munged

Tries to find out the real user agent and returns that information as a
hash ref:

=over 4

=item robot

is this a known robot?

=item useragent

(short) name of the agent, e.g. "Netscape", "MSIE", "Mozilla",
"Opera", "w3m", ...  For an unrecognised header this is the raw
User-Agent string itself, and it is the only key present.

=item major_version

The version number with minor revisions omitted. This is not
necessarily just the number before the first dot, as different
vendors have different policies in regard to version numbers.

=item version

The version string extracted from the header, where one could be
extracted.

=item os

The operating system (only set for some browsers).

=item lang

The language tag (only set by some of the Opera patterns).

=back

The patterns are tried in order and the first match wins, so the order of
the checks below is significant. Results are remembered per User-Agent
string for the lifetime of the process; each call returns its own copy of
the hash, so callers may modify the result freely.

=cut

# Results already computed, keyed by the raw User-Agent string.
# NOTE: entries are never evicted, so this grows with the number of
# distinct User-Agent strings seen by the process.
my %munged_cache;

# Store $info as the result for $ua and hand back a private copy of it.
# All values are plain scalars, so a shallow copy is a complete copy.
sub _remember {
    my ($ua, $info) = @_;

    $munged_cache{$ua} = $info;
    return { %{$info} };
}

sub munged {
    my ($self) = @_;
    my $ua = $self->{useragent};

    # Without a header there is nothing to classify; bail out before the
    # pattern cascade would emit one warning per regex.
    return { useragent => $ua } unless defined $ua;

    # Skip the whole cascade for strings we have already analysed.
    return { %{ $munged_cache{$ua} } } if exists $munged_cache{$ua};

    if ($ua =~ m{^(FAST-WebCrawler)/(\d+)\.(\S+)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => 'FAST Crawler',
            major_version => $2,
            version       => "$2.$3",
        });
    }

    if ($ua =~ m{^(Googlebot)/(\d+)\.(\d+)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => "$2.$3",
        });
    }

    if ($ua =~ m{^Mozilla/5.0 \((Slurp)/(\w+); slurp\@inktomi.com; http://www.inktomi.com/slurp.html\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp) 13155 22.43 3.16
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp); http://help.yahoo.com/help/us/ysearch/slurp\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => "",
            version       => "",
        });
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp) China; http://misc.yahoo.com.cn/help.html\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => "",
            version       => "",
        });
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp)/([.\d]+); http://help.yahoo.com/help/us/ysearch/slurp\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)
    # Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (http://www.voila.com/)
    # Mozilla/4.0 (compatible; MSIE 5.0; Windows 95) VoilaBot BETA 1.2 (http://www.voila.com/)
    # Must come before the MSIE check, which would otherwise claim the
    # third form.
    if ($ua =~ m{Mozilla/[45].0 \(.* Windows.*\) (VoilaBot) (BETA 1.2) \(.*voila.*\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # MSIE
    if ($ua =~ m{^Mozilla/\d+.\d+ \(compatible; (MSIE) ((\d+\.\d)\d*); ([^;\)]*).*\)$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            major_version => $3,
            version       => $2,
            os            => canonical_os($4),
        });
    }

    # Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://sp.ask.com/docs/about/tech_crawling.html) 3609 6.15 0.87
    # Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml) 2600 1.82 0.29
    # Mozilla/5.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml) 18686 21.79 2.20
    if ($ua =~ m{^Mozilla/[25].0 \(compatible; (Ask Jeeves/Teoma)(; \+http://\w+.ask.com/.*docs/about/.*html)?\)}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    if ($ua =~ m{^Mozilla/4.0 \(compatible; (?i:B.l.i.t.z.B.O.T)\)}) {
        return _remember($ua, {
            robot     => 1,
            useragent => "Blitzbot",
        });
    }

    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; .*\) Gecko/(\d+) (Netscape)\d?/((\d)\.\d+)$}) {
        # captures:  1           2         3                          4      5              67
        return _remember($ua, {
            robot         => 0,
            useragent     => $5,
            os            => canonical_os("$2 $3"),
            major_version => $7,
            version       => "$6/$4",
        });
    }

    # Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1 1418 2.42 0.34
    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; rv:(\d.[.\w]*)\) Gecko/(\d+) .*(Firefox|GranParadiso|Minefield|Iceweasel)/((\d+\.\d+)[.\d]*)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => 'Firefox',
            os            => canonical_os("$2 $3"),
            major_version => $8,
            version       => "$7/$5",
        });
    }

    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; rv:((\d.\d+)[.\w]*)\) Gecko/(\d+)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            os            => canonical_os("$2 $3"),
            major_version => $5,
            version       => "$4/$6",
        });
    }

    # honest Opera
    # Opera/9.52 (X11; Linux i686; U; en)
    if ($ua =~ m{^(Opera)/((\d).\d+) \((?:[^;]+; )?([^;]+); U(?:; ([-a-z]*))\)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os($4),
            lang          => $5,
        });
    }

    # Opera masquerading as MSIE
    if ($ua =~ m{^Mozilla/4.0 \(compatible;.* MSIE \d\.\d; ([^\)]+)\) (Opera) ((\d).\d+) \[\w\w\]}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $2,
            version       => $3,
            major_version => $4,
            os            => canonical_os($1),
        });
    }
    if ($ua =~ m{^Mozilla/4.0 \(compatible; MSIE \d\.\d; (?:.*; )*([^;]+); ([-a-z]*)\) (Opera) ((\d).\d+)$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $3,
            version       => $4,
            major_version => $5,
            os            => canonical_os($1),
            lang          => $2,
        });
    }

    # Opera masquerading as Mozilla
    # Mozilla/5.0 (Linux 2.4.2 i386; U) Opera 6.0 [en]
    if ($ua =~ m{^Mozilla/\d.\d+ \(([^;]+); [IU]\) (Opera) ((\d).\d+) \[\w\w\]}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $2,
            version       => $3,
            major_version => $4,
            os            => canonical_os($1),
        });
    }

    if ($ua =~ m{^Mozilla/\d\.\d \(compatible; (Konqueror)/([.\d]+); (\S+)\)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $2,
            os            => canonical_os($3),
        });
    }
    if ($ua =~ m{^Mozilla/\d\.\d \(compatible; (Konqueror)/(([.\d]+)(?:-rc\d+)); ([^;\)]+).*\)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os($4),
        });
    }

    if ($ua =~ m{^(sitecheck.internetseer.com) \(For more info see: http://sitecheck.internetseer.com\)}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # Mozilla/4.77 [en] (Windows NT 5.0; U)/ 37 2.26 0.15
    # Mozilla/4.79 [en] (Windows NT 5.0; U)/ 1288 4.94 0.62
    if ($ua =~ m{^Mozilla/((\d\.\d)\d*) \[\w\w\].* \(([^;]+); [IU](?:;( [^;\)]*))?\)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => "Netscape",
            version       => $1,
            major_version => $2,
            os            => canonical_os($3 . ($4 || "")),
        });
    }

    # Mozilla/4.73 (Macintosh; U; PPC)/ 332 20.31 1.31
    # apparently the Mac version of Netscape omits the language tag
    if ($ua =~ m{^Mozilla/((\d\.\d)\d*) \((Macintosh); [IU](?:;( [^;\)]*))?\)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => "Netscape",
            version       => $1,
            major_version => $2,
            os            => canonical_os($3 . ($4 || "")),
        });
    }

    # Don't really know what that is.
    # The access patterns look human, not robot-like,
    # so it's probably some browser behind a UA-mangling proxy.
    # Mozilla/3.01 (compatible;)
    if ($ua =~ m{^Mozilla/((3).01) \(compatible;\)$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => "masquerades as Mozilla compatible",
            version       => $1,
            major_version => $2,
        });
    }

    # Altavista search bot:
    # Scooter/3.2.SF0
    # Scooter/3.3
    if ($ua =~ m{^(Scooter)/((\d).*)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # http://www.almaden.ibm.com/cs/crawler [c01]
    if ($ua =~ m{^http://www.(almaden.ibm.com)/cs/crawler \[(c01)\]}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $2,
        });
    }

    # Mercator-2.0
    if ($ua =~ m{^(Mercator)-((\d).\d)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # Mozilla/4.0 (compatible; BorderManager 3.0)
    # Bordermanager (http://www.novell.com/products/bordermanager/)
    # seems to be a Novell proxy server.
    # Classifying that as a user agent is a bit wrong.
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (BorderManager) ((3).0)\)$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os("Novell"),
        });
    }

    # contype
    # Adobe Acrobat Reader?
    if ($ua =~ m{^(contype)$}) {
        return _remember($ua, {
            robot     => 0,
            useragent => $1,
        });
    }

    # Firefly/1.0 (compatible; Mozilla 4.0; MSIE 5.5)
    if ($ua =~ m{^(Firefly)/((\d).\d)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # gpv3.1/ 4462 17.13 2.16
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(gpv)((\d).\d)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # ia_archiver/ 62 3.79 0.25
    if ($ua =~ m{^(ia_archiver)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # LinkWalker/ 322 1.24 0.16
    if ($ua =~ m{^(LinkWalker)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # Melvil3.0 http://www.uma.at/ 8291 31.82 4.02
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(Melvil)((\d).\d) http://www.uma.at$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # Mozilla/4.0 (compatible; grub-client-1.0.6; Crawl your own stuff with http://grub.org)/ 307 1.18 0.15
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (grub-client)-((\d)[.\d]+); Crawl your own stuff with http://grub.org\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # Mozilla/4.75 (compatible; PortalBSpider; spider@portalb.com)/ 513 1.97 0.25
    # The agent name is captured explicitly; without a capture group $1
    # would be undefined here.
    if ($ua =~ m{^Mozilla/4.75 \(compatible; (PortalBSpider); spider\@portalb.com\)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # NPBot-1/2.0 (http://www.nameprotect.com/botinfo.html)/ 61 3.73 0.24
    if ($ua =~ m{^(NPBot)-((\d)/[.\d]+) \(http://www.nameprotect.com/botinfo.html\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # oBot/ 75 4.59 0.30
    if ($ua =~ m{^(oBot)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # TeWIS/0.3 TeWIS/0.3 libwww-perl/5.53/ 307 1.18 0.15
    if ($ua =~ m{^(TeWIS)/((\d\.\d)) TeWIS/\d\.\d libwww-perl/\d\.\d+$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # xFIND2000/0.8 RPT-HTTPClient/0.3-2/ 3761 14.43 1.82
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(xFIND2000)/((\d.\d)) RPT-HTTPClient/}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # htdig/3.1.6 (webmaster@luga.at)/ 1241844 97.96 67.33
    if ($ua =~ m{^(htdig)/((\d.\d)\.\d) \(.*\@.*\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # Gigabot/1.0
    # Gigabot/2.0 (http://www.gigablast.com/spider.html) 9165 6.42 1.04
    # Gigabot/2.0; http://www.gigablast.com/spider.html/ 16888 14.41 1.03
    # The pattern is not anchored at the end, so it covers all three forms.
    if ($ua =~ m{^(Gigabot)/(\d.\d)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $2,
        });
    }

    # msnbot/2.0b (+http://search.msn.com/msnbot.htm)
    # msnbot/1.0 (+http://search.msn.com/msnbot.htm)
    if ($ua =~ m{^(msnbot)(?:-media)?/(\d.\d\w?) \(\+http://search.msn.com/msnbot.htm\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $2,
        });
    }

    # check_http/1.24.2.4 (nagios-plugins )
    if ($ua =~ m{^(check_http)/(\d+(.\d+)+) \(nagios-plugins \)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $2,
        });
    }

    # check_http/v2053 (nagios-plugins 1.4.13)
    if ($ua =~ m{^(check_http)/v\d+ \(nagios-plugins (\d+(.\d+)+)\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $2,
        });
    }

    # Intraseek_1
    if ($ua =~ m{^Intraseek_1$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => 'Intraseek',
            version       => 1,
            major_version => 1,
        });
    }

    # WebVac (webmaster@pita.stanford.edu WebVac.org ) 2199 3.75 0.53
    if ($ua =~ m{^(WebVac) \(webmaster\@pita.stanford.edu WebVac.org \)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => "",
            major_version => "",
        });
    }

    # ichiro/1.0 (ichiro@nttr.co.jp) 1425 2.43 0.34
    if ($ua =~ m{^(ichiro)/((\d+)(.\d+)+) \(ichiro\@nttr.co.jp\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            version       => $2,
            major_version => $3,
        });
    }

    # Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) 1180 2.01 0.28
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Googlebot)/(\d+).(\d+); \+http://www.google.com/bot.html\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => "$2.$3",
        });
    }

    # Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_4; de-de) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1
    # Mozilla/5.0 (Macintosh; U; PPC Mac OS X; de-de) AppleWebKit/312.1 (KHTML, like Gecko) Safari/312 849 1.45 0.20
    if ($ua =~ m{^Mozilla/5.0 \(Macintosh; U; (.* Mac OS X)(?: [_0-9]+)?; [-a-z]+\) AppleWebKit/[.0-9]+ \(KHTML, like Gecko\)(?: Version/[.0-9]+)? (Safari)/((\d+)[.\w]+)}) {
        return _remember($ua, {
            robot         => 0,
            os            => canonical_os($1),
            useragent     => $2,
            major_version => $4,
            version       => $3,
        });
    }

    # Seekbot/1.0 (http://www.seekbot.net/bot.html) HTTPFetcher/0.3 1420 2.42 0.34
    if ($ua =~ m{^(Seekbot)/(\d+).(\d+) \(http://www.seekbot.net/bot.html\) HTTPFetcher/\d+.\d+$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => "$2.$3",
        });
    }

    # Mozilla/5.0 (compatible; heritrix/1.6.0 +http://innovationblog.com) 2624 15.88 2.41
    if ($ua =~ m{^(?:Mozilla/5.0|webcrawler) \(compatible; (heritrix)/((\d+)[-.\d]+) \+\+?http://.*\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # psbot/0.1 (+http://www.picsearch.com/bot.html) 2252 13.63 2.07
    if ($ua =~ m{^(psbot)/((\d+).\d+) \(\+http://www.picsearch.com/bot.html\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Ocelli/1.3 (http://www.globalspec.com/Ocelli) 1482 8.97 1.36
    if ($ua =~ m{^(Ocelli)/((\d+).\d+) \(http://www.globalspec.com/Ocelli\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Googlebot-Image/1.0 1282 7.76 1.18
    if ($ua =~ m{^(Googlebot-Image)/((\d+).\d+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Francis/2.0 (francis@neomo.de http://www.neomo.de/pages/crawler.php) 5559 14.57 1.33
    if ($ua =~ m{^(Francis)/((\d+).\d+) \(francis\@neomo.de http://www.neomo.de/pages/crawler.php\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; Herold; +http://www.herold.at) 3637 9.53 0.87
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Herold); \+http://www.herold.at\)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # Mozilla/5.0 (compatible; Interseek/3.1) 12415 67.90 16.66
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Interseek)/((\d+).\d+)\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # mnogo 3313 18.12 4.45
    # mnogo 47921 33.58 5.43
    if ($ua =~ m{^(mnogo)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # Mozilla/5.0 (compatible; TridentSpider/3.1)/ 10556 21.35 1.62
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (TridentSpider)/((\d).\d)\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # FAST Enterprise Crawler 6 / Scirus scirus-crawler@fast.no; http://www.scirus.com/srsapp/contactus// 5955 12.05 0.91
    if ($ua =~ m{^FAST Enterprise Crawler (\d) / Scirus scirus-crawler\@fast.no; http://www.scirus.com/srsapp/contactus/$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => 'FAST Crawler',
            major_version => $1,
            version       => $1,
        });
    }

    # MnoGoSearch/3.2.31
    if ($ua =~ m{^MnoGoSearch/((\d+\.\d+)\.\d+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => "mnogo",
            version       => $1,
            major_version => $2,
        });
    }

    if ($ua =~ m{^(Aport)$}) {
        return _remember($ua, {
            robot     => 1,
            useragent => $1,
        });
    }

    # Mozilla/4.0 (compatible; DepSpid/5.07; +http://about.depspid.net) 12838 45.87 8.04
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (DepSpid)/(\d.\d+); \+http://about.depspid.net\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # libwww-perl/5.803
    if ($ua =~ m{^(libwww-perl)/(\d\.\d+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Java/1.4.2_10
    if ($ua =~ m{^(Java)/(\d\.\d+)(\.[\d_]+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => "$2$3",
        });
    }

    # Xenu Link Sleuth 1.2i
    # $2 is already the complete version; $3 is nested inside it, so it
    # must not be appended again.
    if ($ua =~ m{^(Xenu Link Sleuth) (\d\.(.+))$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Google Chrome
    if ($ua =~ m{^Mozilla/5.0 \([^;]*; .; ([^;]*); [-\w]*\) AppleWebKit/\d+.\d+ \(KHTML, like Gecko\) (Chrome)/((\d+\.\d+)[.\d]*) Safari/\d+.\d+$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $2,
            major_version => $4,
            version       => $3,
            os            => canonical_os($1),
        });
    }

    # Echoping/5.2.0
    if ($ua =~ m{^(Echoping)/(\d+\.[.\d]+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Shelob (shelob@gmx.net) 1858 31.93 2.89
    if ($ua =~ m{^(Shelob) \(shelob\@gmx.net\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => '',
            version       => '',
        });
    }

    # SiteUptime.com 448 11.31 0.70
    if ($ua =~ m{^(SiteUptime.com)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => '',
            version       => '',
        });
    }

    # Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es65
    if ($ua =~ m{^Mozilla/5.0 \(([^;]*); .; [-\w]*\) AppleWebKit/\d+ \(KHTML, like Gecko\) (Safari)/(\d+)}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => $2,
            major_version => $3,
            version       => $3,
            os            => canonical_os($1),
        });
    }

    # Snoopy v1.2.3
    if ($ua =~ m{^(Snoopy) v([.\d]+)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; egothor/8.0g; +http://ego.ms.mff.cuni.cz/)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (egothor)/(\d+\.\w+); \+http://ego.ms.mff.cuni.cz/\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # remi 1.5e using w-3-m-i-r (http://nep.repec.org)
    # remi 1.5c using w3mir (socionet@socionet.ru)
    if ($ua =~ m{^(remi) (\d+\.\d+)([a-z]*) using w-?3-?m-?i-?r }) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => "$2$3",
        });
    }

    # Mozilla/5.0 (compatible; IDBot/1.0; +http://www.id-search.org/bot.html
    if ($ua =~ m{Mozilla/5.0 \(compatible; (IDBot)/(\d+.\d+); \+http://www.id-search.org/bot.html\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (Twiceler-0.9 http://www.cuill.com/twiceler/robot.html)
    # Mozilla/5.0 (Twiceler-0.9 http://www.cuil.com/twiceler/robot.html)
    if ($ua =~ m{^Mozilla/5.0 \((Twiceler)-(0.9) http://www.cuill?.com/twiceler/robot.html\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Yandex/1.01.001 (compatible; Win16; P)
    if ($ua =~ m{^(Yandex)/((\d+)\.[\d.]+) \(compatible; Win16; [A-Z]\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Yanga WorldSearch Bot v1.1/beta (http://www.yanga.co.uk/)
    if ($ua =~ m{^(Yanga) WorldSearch Bot v((1).1/beta) \(http://www.yanga.co.uk/\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Jyxobot/1
    if ($ua =~ m{^(Jyxobot)/(\d+)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)
    # SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)
    if ($ua =~ m{\(compatible; (Googlebot-Mobile)/((\d+).\d+); \+http://www.google.com/bot.html\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $3,
            version       => $2,
        });
    }

    # Mozilla/5.0 (X11; Linux i686; rv:2.0b11) Gecko/20100101 Firefox/4.0b11
    # Mozilla/5.0 (Windows NT 6.0; rv:2.0b10) Gecko/20100101 Firefox/4.0b10
    if ($ua =~ m{^Mozilla/5.0 \(((?:Windows|X11; Linux|\w+) [^;]*); rv:([0-9.b]*)\) Gecko/(\d+) Firefox/((\d+\.\d+)(?:[.b]\d+)?)$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => 'Firefox',
            os            => canonical_os($1),
            major_version => $5,
            version       => $4,
        });
    }

    # Netluchs/Nutch-1.0-dev ( ; http://www.netluchs.de/; _do_not_spam_me___humans_please_use_info_at_netluchs.de_without_the_dash)
    if ($ua =~ m{^(Netluchs/Nutch)-(\d\.\S*) \( ; http://www.netluchs.de/; _do_not_spam_me___humans_please_use_info_at_netluchs.de_without_the_dash\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; DotBot/1.1; http://www.dotnetdotcom.org/, crawler@dotnetdotcom.org) 1643 17.03 2.17
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (DotBot)/(\d.\d); http://www.dotnetdotcom.org/, crawler\@dotnetdotcom.org\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Baiduspider+(+http://www.baidu.com/search/spider.htm) 1100 13.74 1.45
    if ($ua =~ m{^(Baiduspider)\+\(\+http://\w+.baidu.\w+/\S+\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => '',
            version       => '',
        });
    }

    # findlinks/1.1.6-beta1%20(+http://wortschatz.uni-leipzig.de/findlinks/)
    if ($ua =~ m{^findlinks/((\d.\d.\d)-beta\d+) \(\+http://wortschatz.uni-leipzig.de/findlinks/\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => "wortschatz-findlinks",
            major_version => $2,
            version       => $1,
        });
    }

    # Eurobot/1.1%20(http://eurobot.ayell.eu)
    if ($ua =~ m{^(Eurobot)/(\d.\d) \(http://eurobot.ayell.eu\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; Charlotte/1.1; http://www.searchme.com/support/)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Charlotte)/(\d.\d); http://www.searchme.com/support/\)$}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Exabot)(?:-images)?/(\d.\d)(?: \S+)?; \+http://www.exabot.com/go/robot\)}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => $1,
            major_version => $2,
            version       => $2,
        });
    }

    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.11) Gecko GranParadiso/3.0.11
    # looks like AskJeeves in disguise
    if ($ua =~ m{^Mozilla/5.0 \(X11; U; Linux i686; en-US; rv:1.9.0.11\) Gecko GranParadiso/3.0.11}) {
        return _remember($ua, {
            robot         => 1,
            useragent     => "Ask Jeeves/Teoma",
            major_version => "stealth",
            version       => "stealth",
        });
    }

    # Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B367 Safari/531.21.10
    # The \2 backreference requires the Safari build number to equal the
    # AppleWebKit one.
    if ($ua =~ m{^Mozilla/5.0 \(iPad; U; CPU OS (\w+) like Mac OS X; [-a-z]+\) AppleWebKit/((\d+)(?:\.\d+)*) \(KHTML, like Gecko\) Version/[.0-9]+ Mobile/\w+ Safari/\2$}) {
        return _remember($ua, {
            robot         => 0,
            useragent     => 'Safari',
            os            => canonical_os("iOS/iPad"),
            major_version => $3,
            version       => $2,
        });
    }

    # Nothing matched: report the raw header as the agent name.
    return _remember($ua, {
        useragent => $ua,
    });
}

# A module file has to end with a true value; state it explicitly instead
# of relying on whatever statement happens to be executed last.
1;