package Net::Http::Useragent;
use warnings;
use strict;

=head1 NAME

Net::Http::Useragent - extract information from HTTP User-Agent header

=head1 DESCRIPTION

This class represents the contents of a HTTP User-Agent header, and
can be used to extract information (user agent, version, os, whether
it's a bot, ...) from it.

=cut

our $VERSION = 0.002;

=head2 new

    my $ua = Net::Http::Useragent->new($header_value);

Creates a new object wrapping the raw User-Agent header string.

=cut

sub new {
    my ($class, $useragent) = @_;
    my $self = { useragent => $useragent };
    return bless $self, $class;
}

=head2 canonical_os

    my $os = Net::Http::Useragent::canonical_os($os_string);

Plain function (not a method). Returns a canonical OS name: a doubled
"Windows Windows" is collapsed, and any string containing "Linux" or
"Mac OS X" is reduced to exactly that name. Other strings are returned
unchanged.

=cut

sub canonical_os {
    my ($os) = @_;
    return $os unless defined $os;

    $os =~ s/Windows Windows/Windows/g;
    $os =~ s/.*Linux.*/Linux/g;
    $os =~ s/.*Mac OS X.*/Mac OS X/g;
    return $os;
}

=head2 munged

    my $info = $ua->munged;

Tries to find out the real user agent and returns that information as a
hash ref. Depending on which pattern matched, the following keys may be
present:

=over 4

=item robot

is this a known robot? (1 for robots, 0 for browsers; absent if the
header was not recognised)

=item useragent

(short) name of the agent, e.g. "Netscape", "MSIE", "Mozilla", "Opera",
"w3m", ... If the header was not recognised, this is the complete,
unmodified header string.

=item major_version

The version number with minor revisions omitted. This is not
necessarily just the number before the first dot, as different
vendors have different policies in regard to version numbers.

=item version

The full version string.

=item os

The operating system, as returned by C<canonical_os>.

=item lang

The language tag (only set by some of the Opera patterns).

=back

Results are remembered per distinct header string, so repeated calls
with the same header return the same hash reference. Treat the result
as read-only.

=cut

# Cache of already analysed headers: raw header string => result hash ref.
# NOTE(review): the cache is never pruned; with arbitrary client-supplied
# headers it grows without bound for the lifetime of the process.
my $uam = {};

sub munged {
    my ($self) = @_;
    my $ua = $self->{useragent};

    # Without a header there is nothing to match (and nothing worth caching).
    return { useragent => $ua } unless defined $ua;

    # Serve repeated lookups from the cache instead of re-running every regex.
    return $uam->{$ua} if exists $uam->{$ua};

    # The patterns below are tried strictly in order; the first match wins.
    # Numeric columns in the example comments are statistics copied from
    # the logs the examples were taken from.

    if ($ua =~ m{^(FAST-WebCrawler)/(\d+)\.(\S+)}) {
        return $uam->{$ua} = { robot => 1, useragent => 'FAST Crawler', major_version => $2, version => "$2.$3" };
    }

    if ($ua =~ m{^(Googlebot)/(\d+)\.(\d+)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2.$3" };
    }

    if ($ua =~ m{^Mozilla/5.0 \((Slurp)/(\w+); slurp\@inktomi.com; http://www.inktomi.com/slurp.html\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp) 13155 22.43 3.16
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp); http://help.yahoo.com/help/us/ysearch/slurp\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => "", version => "", };
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp) China; http://misc.yahoo.com.cn/help.html\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => "", version => "", };
    }

    # Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; Yahoo! (Slurp)/([.\d]+); http://help.yahoo.com/help/us/ysearch/slurp\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)
    # Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (http://www.voila.com/)
    # Mozilla/4.0 (compatible; MSIE 5.0; Windows 95) VoilaBot BETA 1.2 (http://www.voila.com/)
    if ($ua =~ m{Mozilla/[45].0 \(.* Windows.*\) (VoilaBot) (BETA 1.2) \(.*voila.*\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # MSIE
    if ($ua =~ m{^Mozilla/\d+.\d+ \(compatible; (MSIE) ((\d+\.\d)\d*); ([^;\)]*).*\)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            major_version => $3,
            version       => $2,
            os            => canonical_os($4),
        };
    }

    # Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://sp.ask.com/docs/about/tech_crawling.html) 3609 6.15 0.87
    # Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml) 2600 1.82 0.29
    # Mozilla/5.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml) 18686 21.79 2.20
    if ($ua =~ m{^Mozilla/[25].0 \(compatible; (Ask Jeeves/Teoma)(; \+http://\w+.ask.com/.*docs/about/.*html)?\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    if ($ua =~ m{^Mozilla/4.0 \(compatible; (?i:B.l.i.t.z.B.O.T)\)}) {
        return $uam->{$ua} = { robot => 1, useragent => "Blitzbot", };
    }

    # Netscape 6 and later.
    # Captures: 1 = "Mozilla", 2 = platform, 3 = OS, 4 = Gecko build date,
    #           5 = "Netscape", 6 = version, 7 = major version
    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; .*\) Gecko/(\d+) (Netscape)\d?/((\d)\.\d+)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $5,
            os            => canonical_os("$2 $3"),
            major_version => $7,
            version       => "$6/$4",
        };
    }

    # Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1 1418 2.42 0.34
    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; rv:(\d.[.\w]*)\) Gecko/(\d+) .*(Firefox|GranParadiso|Minefield|Iceweasel)/((\d+\.\d+)[.\d]*)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => 'Firefox',
            os            => canonical_os("$2 $3"),
            major_version => $8,
            version       => "$7/$5",
        };
    }

    # Any other Gecko based browser is reported as plain "Mozilla".
    if ($ua =~ m{^(Mozilla)/5.0 \((\w+); U; ([^;]*); [-\w]+; rv:((\d.\d+)[.\w]*)\) Gecko/(\d+)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            os            => canonical_os("$2 $3"),
            major_version => $5,
            version       => "$4/$6",
        };
    }

    # honest Opera
    # Opera/9.52 (X11; Linux i686; U; en)
    if ($ua =~ m{^(Opera)/((\d).\d+) \((?:[^;]+; )?([^;]+); U(?:; ([-a-z]*))\)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os($4),
            lang          => $5,
        };
    }

    # Opera masquerading as MSIE
    if ($ua =~ m{^Mozilla/4.0 \(compatible;.* MSIE \d\.\d; ([^\)]+)\) (Opera) ((\d).\d+) \[\w\w\]}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $2,
            version       => $3,
            major_version => $4,
            os            => canonical_os($1),
        };
    }
    if ($ua =~ m{^Mozilla/4.0 \(compatible; MSIE \d\.\d; (?:.*; )*([^;]+); ([-a-z]*)\) (Opera) ((\d).\d+)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $3,
            version       => $4,
            major_version => $5,
            os            => canonical_os($1),
            lang          => $2,
        };
    }

    # Opera masquerading as Mozilla
    # Mozilla/5.0 (Linux 2.4.2 i386; U) Opera 6.0 [en]
    if ($ua =~ m{^Mozilla/\d.\d+ \(([^;]+); [IU]\) (Opera) ((\d).\d+) \[\w\w\]}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $2,
            version       => $3,
            major_version => $4,
            os            => canonical_os($1),
        };
    }

    if ($ua =~ m{^Mozilla/\d\.\d \(compatible; (Konqueror)/([.\d]+); (\S+)\)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $2,
            os            => canonical_os($3),
        };
    }
    if ($ua =~ m{^Mozilla/\d\.\d \(compatible; (Konqueror)/(([.\d]+)(?:-rc\d+)); ([^;\)]+).*\)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os($4),
        };
    }

    if ($ua =~ m{^(sitecheck.internetseer.com) \(For more info see: http://sitecheck.internetseer.com\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # Mozilla/4.77 [en] (Windows NT 5.0; U)/ 37 2.26 0.15
    # Mozilla/4.79 [en] (Windows NT 5.0; U)/ 1288 4.94 0.62
    if ($ua =~ m{^Mozilla/((\d\.\d)\d*) \[\w\w\].* \(([^;]+); [IU](?:;( [^;\)]*))?\)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => "Netscape",
            version       => $1,
            major_version => $2,
            os            => canonical_os($3 . ($4 || "")),
        };
    }

    # Mozilla/4.73 (Macintosh; U; PPC)/ 332 20.31 1.31
    # apparently the Mac version of Netscape omits the language tag
    if ($ua =~ m{^Mozilla/((\d\.\d)\d*) \((Macintosh); [IU](?:;( [^;\)]*))?\)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => "Netscape",
            version       => $1,
            major_version => $2,
            os            => canonical_os($3 . ($4 || "")),
        };
    }

    # don't really know what that is.
    # The access patterns look human, not robot-like,
    # so it's probably some browser behind a UA-mangling proxy
    # Mozilla/3.01 (compatible;)
    if ($ua =~ m{^Mozilla/((3).01) \(compatible;\)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => "masquerades as Mozilla compatible",
            version       => $1,
            major_version => $2,
        };
    }

    # Altavista search bot:
    # Scooter/3.2.SF0
    # Scooter/3.3
    if ($ua =~ m{^(Scooter)/((\d).*)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # http://www.almaden.ibm.com/cs/crawler [c01]
    if ($ua =~ m{^http://www.(almaden.ibm.com)/cs/crawler \[(c01)\]}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $2, };
    }

    # Mercator-2.0
    if ($ua =~ m{^(Mercator)-((\d).\d)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # Mozilla/4.0 (compatible; BorderManager 3.0)
    # Bordermanager (http://www.novell.com/products/bordermanager/)
    # seems to be a Novell Proxy server.
    # Classifying that as a user agent is a bit wrong.
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (BorderManager) ((3).0)\)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $1,
            version       => $2,
            major_version => $3,
            os            => canonical_os("Novell"),
        };
    }

    # contype
    # Adobe Acrobat Reader?
    if ($ua =~ m{^(contype)$}) {
        return $uam->{$ua} = { robot => 0, useragent => $1, };
    }

    # Firefly/1.0 (compatible; Mozilla 4.0; MSIE 5.5)
    if ($ua =~ m{^(Firefly)/((\d).\d)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # gpv3.1/ 4462 17.13 2.16
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(gpv)((\d).\d)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # ia_archiver/ 62 3.79 0.25
    if ($ua =~ m{^(ia_archiver)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # LinkWalker/ 322 1.24 0.16
    if ($ua =~ m{^(LinkWalker)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # Melvil3.0 http://www.uma.at/ 8291 31.82 4.02
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(Melvil)((\d).\d) http://www.uma.at$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # Mozilla/4.0 (compatible; grub-client-1.0.6; Crawl your own stuff with http://grub.org)/ 307 1.18 0.15
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (grub-client)-((\d)[.\d]+); Crawl your own stuff with http://grub.org\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # Mozilla/4.75 (compatible; PortalBSpider; spider@portalb.com)/ 513 1.97 0.25
    # The agent name has to be captured, otherwise $1 is undefined here.
    if ($ua =~ m{^Mozilla/4.75 \(compatible; (PortalBSpider); spider\@portalb.com\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # NPBot-1/2.0 (http://www.nameprotect.com/botinfo.html)/ 61 3.73 0.24
    if ($ua =~ m{^(NPBot)-((\d)/[.\d]+) \(http://www.nameprotect.com/botinfo.html\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # oBot/ 75 4.59 0.30
    if ($ua =~ m{^(oBot)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # TeWIS/0.3 TeWIS/0.3 libwww-perl/5.53/ 307 1.18 0.15
    if ($ua =~ m{^(TeWIS)/((\d\.\d)) TeWIS/\d\.\d libwww-perl/\d\.\d+$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # xFIND2000/0.8 RPT-HTTPClient/0.3-2/ 3761 14.43 1.82
    # No idea what that is. Provisionally classifying as bot
    if ($ua =~ m{^(xFIND2000)/((\d.\d)) RPT-HTTPClient/}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # htdig/3.1.6 (webmaster@luga.at)/ 1241844 97.96 67.33
    if ($ua =~ m{^(htdig)/((\d.\d)\.\d) \(.*\@.*\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # Gigabot/1.0
    # Gigabot/2.0 (http://www.gigablast.com/spider.html) 9165 6.42 1.04
    # Gigabot/2.0; http://www.gigablast.com/spider.html/ 16888 14.41 1.03
    # The pattern is not anchored at the end, so it covers all three forms.
    if ($ua =~ m{^(Gigabot)/(\d.\d)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $2, };
    }

    # msnbot/2.0b (+http://search.msn.com/msnbot.htm)
    # msnbot/1.0 (+http://search.msn.com/msnbot.htm)
    if ($ua =~ m{^(msnbot)(?:-media)?/(\d.\d\w?) \(\+http://search.msn.com/msnbot.htm\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $2, };
    }

    # check_http/1.24.2.4 (nagios-plugins )
    if ($ua =~ m{^(check_http)/(\d+(.\d+)+) \(nagios-plugins \)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $2, };
    }

    # check_http/v2053 (nagios-plugins 1.4.13)
    if ($ua =~ m{^(check_http)/v\d+ \(nagios-plugins (\d+(.\d+)+)\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $2, };
    }

    # Intraseek_1
    if ($ua =~ m{^Intraseek_1$}) {
        return $uam->{$ua} = { robot => 1, useragent => 'Intraseek', version => 1, major_version => 1, };
    }

    # WebVac (webmaster@pita.stanford.edu WebVac.org ) 2199 3.75 0.53
    if ($ua =~ m{^(WebVac) \(webmaster\@pita.stanford.edu WebVac.org \)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => "", major_version => "", };
    }

    # ichiro/1.0 (ichiro@nttr.co.jp) 1425 2.43 0.34
    if ($ua =~ m{^(ichiro)/((\d+)(.\d+)+) \(ichiro\@nttr.co.jp\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, version => $2, major_version => $3, };
    }

    # Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) 1180 2.01 0.28
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Googlebot)/(\d+).(\d+); \+http://www.google.com/bot.html\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2.$3" };
    }

    # Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_4; de-de) AppleWebKit/525.18 (KHTML, like Gecko) Version/3.1.2 Safari/525.20.1
    # Mozilla/5.0 (Macintosh; U; PPC Mac OS X; de-de) AppleWebKit/312.1 (KHTML, like Gecko) Safari/312 849 1.45 0.20
    if ($ua =~ m{^Mozilla/5.0 \(Macintosh; U; (.* Mac OS X)(?: [_0-9]+)?; [-a-z]+\) AppleWebKit/[.0-9]+ \(KHTML, like Gecko\)(?: Version/[.0-9]+)? (Safari)/((\d+)[.\w]+)}) {
        return $uam->{$ua} = {
            robot         => 0,
            os            => canonical_os($1),
            useragent     => $2,
            major_version => $4,
            version       => $3,
        };
    }

    # Seekbot/1.0 (http://www.seekbot.net/bot.html) HTTPFetcher/0.3 1420 2.42 0.34
    if ($ua =~ m{^(Seekbot)/(\d+).(\d+) \(http://www.seekbot.net/bot.html\) HTTPFetcher/\d+.\d+$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2.$3" };
    }

    # Mozilla/5.0 (compatible; heritrix/1.6.0 +http://innovationblog.com) 2624 15.88 2.41
    if ($ua =~ m{^(?:Mozilla/5.0|webcrawler) \(compatible; (heritrix)/((\d+)[-.\d]+) \+\+?http://.*\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # psbot/0.1 (+http://www.picsearch.com/bot.html) 2252 13.63 2.07
    if ($ua =~ m{^(psbot)/((\d+).\d+) \(\+http://www.picsearch.com/bot.html\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # Ocelli/1.3 (http://www.globalspec.com/Ocelli) 1482 8.97 1.36
    if ($ua =~ m{^(Ocelli)/((\d+).\d+) \(http://www.globalspec.com/Ocelli\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # Googlebot-Image/1.0 1282 7.76 1.18
    if ($ua =~ m{^(Googlebot-Image)/((\d+).\d+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # Francis/2.0 (francis@neomo.de http://www.neomo.de/pages/crawler.php) 5559 14.57 1.33
    if ($ua =~ m{^(Francis)/((\d+).\d+) \(francis\@neomo.de http://www.neomo.de/pages/crawler.php\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # Mozilla/5.0 (compatible; Herold; +http://www.herold.at) 3637 9.53 0.87
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Herold); \+http://www.herold.at\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # Mozilla/5.0 (compatible; Interseek/3.1) 12415 67.90 16.66
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Interseek)/((\d+).\d+)\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # mnogo 3313 18.12 4.45
    # mnogo 47921 33.58 5.43
    if ($ua =~ m{^(mnogo)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # Mozilla/5.0 (compatible; TridentSpider/3.1)/ 10556 21.35 1.62
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (TridentSpider)/((\d).\d)\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2 };
    }

    # FAST Enterprise Crawler 6 / Scirus scirus-crawler@fast.no; http://www.scirus.com/srsapp/contactus// 5955 12.05 0.91
    if ($ua =~ m{^FAST Enterprise Crawler (\d) / Scirus scirus-crawler\@fast.no; http://www.scirus.com/srsapp/contactus/$}) {
        return $uam->{$ua} = { robot => 1, useragent => 'FAST Crawler', major_version => $1, version => $1 };
    }

    # MnoGoSearch/3.2.31
    if ($ua =~ m{^MnoGoSearch/((\d+\.\d+)\.\d+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => "mnogo", version => $1, major_version => $2, };
    }

    if ($ua =~ m{^(Aport)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, };
    }

    # Mozilla/4.0 (compatible; DepSpid/5.07; +http://about.depspid.net) 12838 45.87 8.04
    if ($ua =~ m{^Mozilla/4.0 \(compatible; (DepSpid)/(\d.\d+); \+http://about.depspid.net\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2 };
    }

    # libwww-perl/5.803
    if ($ua =~ m{^(libwww-perl)/(\d\.\d+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2 };
    }

    # Java/1.4.2_10
    if ($ua =~ m{^(Java)/(\d\.\d+)(\.[\d_]+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2$3" };
    }

    # Xenu Link Sleuth 1.2i
    # $2 is the numeric "major.minor" part, $3 whatever follows it, so
    # that "$2$3" reassembles the complete version without duplication.
    if ($ua =~ m{^(Xenu Link Sleuth) (\d\.\d+)(.*)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2$3" };
    }

    # Google Chrome
    if ($ua =~ m{^Mozilla/5.0 \([^;]*; .; ([^;]*); [-\w]*\) AppleWebKit/\d+.\d+ \(KHTML, like Gecko\) (Chrome)/((\d+\.\d+)[.\d]*) Safari/\d+.\d+$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $2,
            major_version => $4,
            version       => $3,
            os            => canonical_os($1),
        };
    }

    # Echoping/5.2.0
    if ($ua =~ m{^(Echoping)/(\d+\.[.\d]+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Shelob (shelob@gmx.net) 1858 31.93 2.89
    if ($ua =~ m{^(Shelob) \(shelob\@gmx.net\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => '', version => '', };
    }

    # SiteUptime.com 448 11.31 0.70
    if ($ua =~ m{^(SiteUptime.com)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => '', version => '', };
    }

    # Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es65
    if ($ua =~ m{^Mozilla/5.0 \(([^;]*); .; [-\w]*\) AppleWebKit/\d+ \(KHTML, like Gecko\) (Safari)/(\d+)}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => $2,
            major_version => $3,
            version       => $3,
            os            => canonical_os($1),
        };
    }

    # Snoopy v1.2.3
    if ($ua =~ m{^(Snoopy) v([.\d]+)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (compatible; egothor/8.0g; +http://ego.ms.mff.cuni.cz/)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (egothor)/(\d+\.\w+); \+http://ego.ms.mff.cuni.cz/\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # remi 1.5e using w-3-m-i-r (http://nep.repec.org)
    # remi 1.5c using w3mir (socionet@socionet.ru)
    if ($ua =~ m{^(remi) (\d+\.\d+)([a-z]*) using w-?3-?m-?i-?r }) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => "$2$3", };
    }

    # Mozilla/5.0 (compatible; IDBot/1.0; +http://www.id-search.org/bot.html
    if ($ua =~ m{Mozilla/5.0 \(compatible; (IDBot)/(\d+.\d+); \+http://www.id-search.org/bot.html\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (Twiceler-0.9 http://www.cuill.com/twiceler/robot.html)
    # Mozilla/5.0 (Twiceler-0.9 http://www.cuil.com/twiceler/robot.html)
    if ($ua =~ m{^Mozilla/5.0 \((Twiceler)-(0.9) http://www.cuill?.com/twiceler/robot.html\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Yandex/1.01.001 (compatible; Win16; P)
    if ($ua =~ m{^(Yandex)/((\d+)\.[\d.]+) \(compatible; Win16; [A-Z]\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2, };
    }

    # Yanga WorldSearch Bot v1.1/beta (http://www.yanga.co.uk/)
    if ($ua =~ m{^(Yanga) WorldSearch Bot v((1).1/beta) \(http://www.yanga.co.uk/\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2, };
    }

    # Jyxobot/1
    if ($ua =~ m{^(Jyxobot)/(\d+)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)
    # SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)
    if ($ua =~ m{\(compatible; (Googlebot-Mobile)/((\d+).\d+); \+http://www.google.com/bot.html\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $3, version => $2, };
    }

    # Mozilla/5.0 (X11; Linux i686; rv:2.0b11) Gecko/20100101 Firefox/4.0b11
    # Mozilla/5.0 (Windows NT 6.0; rv:2.0b10) Gecko/20100101 Firefox/4.0b10
    if ($ua =~ m{^Mozilla/5.0 \(((?:Windows|X11; Linux|\w+) [^;]*); rv:([0-9.b]*)\) Gecko/(\d+) Firefox/((\d+\.\d+)(?:[.b]\d+)?)$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => 'Firefox',
            os            => canonical_os($1),
            major_version => $5,
            version       => $4,
        };
    }

    # Netluchs/Nutch-1.0-dev ( ; http://www.netluchs.de/; _do_not_spam_me___humans_please_use_info_at_netluchs.de_without_the_dash)
    if ($ua =~ m{^(Netluchs/Nutch)-(\d\.\S*) \( ; http://www.netluchs.de/; _do_not_spam_me___humans_please_use_info_at_netluchs.de_without_the_dash\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (compatible; DotBot/1.1; http://www.dotnetdotcom.org/, crawler@dotnetdotcom.org) 1643 17.03 2.17
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (DotBot)/(\d.\d); http://www.dotnetdotcom.org/, crawler\@dotnetdotcom.org\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Baiduspider+(+http://www.baidu.com/search/spider.htm) 1100 13.74 1.45
    if ($ua =~ m{^(Baiduspider)\+\(\+http://\w+.baidu.\w+/\S+\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => '', version => '', };
    }

    # findlinks/1.1.6-beta1%20(+http://wortschatz.uni-leipzig.de/findlinks/)
    if ($ua =~ m{^findlinks/((\d.\d.\d)-beta\d+) \(\+http://wortschatz.uni-leipzig.de/findlinks/\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => "wortschatz-findlinks", major_version => $2, version => $1, };
    }

    # Eurobot/1.1%20(http://eurobot.ayell.eu)
    if ($ua =~ m{^(Eurobot)/(\d.\d) \(http://eurobot.ayell.eu\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (compatible; Charlotte/1.1; http://www.searchme.com/support/)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Charlotte)/(\d.\d); http://www.searchme.com/support/\)$}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
    if ($ua =~ m{^Mozilla/5.0 \(compatible; (Exabot)(?:-images)?/(\d.\d)(?: \S+)?; \+http://www.exabot.com/go/robot\)}) {
        return $uam->{$ua} = { robot => 1, useragent => $1, major_version => $2, version => $2, };
    }

    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.11) Gecko GranParadiso/3.0.11
    # looks like AskJeeves in disguise
    if ($ua =~ m{^Mozilla/5.0 \(X11; U; Linux i686; en-US; rv:1.9.0.11\) Gecko GranParadiso/3.0.11}) {
        return $uam->{$ua} = {
            robot         => 1,
            useragent     => "Ask Jeeves/Teoma",
            major_version => "stealth",
            version       => "stealth",
        };
    }

    # Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B367 Safari/531.21.10
    # The \2 back-reference requires the Safari build to equal the WebKit build.
    if ($ua =~ m{^Mozilla/5.0 \(iPad; U; CPU OS (\w+) like Mac OS X; [-a-z]+\) AppleWebKit/((\d+)(?:\.\d+)*) \(KHTML, like Gecko\) Version/[.0-9]+ Mobile/\w+ Safari/\2$}) {
        return $uam->{$ua} = {
            robot         => 0,
            useragent     => 'Safari',
            os            => canonical_os("iOS/iPad"),
            major_version => $3,
            version       => $2,
        };
    }

    # Nothing matched: report the raw header as the agent name.
    return $uam->{$ua} = { useragent => $ua };
}

# Make sure loading the module via require/use yields a true value.
1;