Skip to content

Commit c9badeb

Browse files
committed
[extensions] Add new crawlers: Algolia, Baidu, BLEXBot, Botify, Freespoke, Marginalia, MSNBot, OnCrawl, SeekportBot, Siteimprove, TwinAgent, YepBot, ZumBot
1 parent 9003fe5 commit c9badeb

File tree

2 files changed

+155
-8
lines changed

2 files changed

+155
-8
lines changed

src/extensions/ua-parser-extensions.js

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,11 @@ const Crawlers = Object.freeze({
6262
// OpenAI's SearchGPT - https://platform.openai.com/docs/bots
6363
// PerplexityBot - https://perplexity.ai/perplexitybot
6464
// SeznamBot - http://napoveda.seznam.cz/seznambot-intro
65-
/((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam)bot)\/([\w\.-]+)/i,
65+
// YepBot - https://yep.com/yepbot/
66+
/((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam|yep)bot)\/([\w\.-]+)/i,
67+
68+
// Algolia Crawler
69+
/(algolia crawler(?: renderscript)?)\/?([\w\.]*)/i,
6670

6771
// Applebot - http://apple.com/go/applebot
6872
/(applebot(?:-extended)?)\/?([\w\.]*)/i,
@@ -89,6 +93,9 @@ const Crawlers = Object.freeze({
8993
// Internet Archive (archive.org)
9094
/(ia_archiver|archive\.org_bot)\/?([\w\.]*)/i,
9195

96+
// OnCrawl
97+
/(oncrawl) mobile\/([\w\.]+)/i,
98+
9299
// Qwantbot - https://help.qwant.com/bot
93100
/(qwantbot)[-\w]*\/?([\w\.]*)/i,
94101

@@ -107,9 +114,10 @@ const Crawlers = Object.freeze({
107114
// Yeti (Naver)
108115
/(yeti)\/([\w\.]+)/i,
109116

110-
// aiHitBot / Algolia Crawler / Diffbot / FirecrawlAgent / HuggingFace-Bot / Linespider / Magpie-Crawler / Omgilibot / OpenAI Image Downloader / PanguBot / Replicate-Bot / RunPod-Bot / Webzio-Extended / Screaming Frog SEO Spider / Startpage / Timpibot / Together-Bot / VelenPublicWebCrawler / xAI-Bot / YisouSpider / YouBot
117+
// aiHitBot / BLEXBot / Diffbot / FirecrawlAgent / HuggingFace-Bot / Linespider / MSNBot / Magpie-Crawler / Omgilibot / OpenAI Image Downloader / PanguBot / Replicate-Bot / RunPod-Bot / Webzio-Extended / Screaming Frog SEO Spider / Startpage / Timpibot / Together-Bot / TwinAgent / VelenPublicWebCrawler / xAI-Bot / YisouSpider / YouBot / ZumBot
111118
// Cotoyogi - https://ds.rois.ac.jp/en_center8/en_crawler/
112-
/((?:aihit|diff|huggingface-|pangu|replicate-|runpod-|timpi|together-|xai-|you)bot|omgili(?:bot)?|cotoyogi|firecrawlagent|openai image downloader|(?:algolia |magpie-|velenpublicweb)crawler|startpageprivateimageproxy|webzio-extended|(?:chatglm-|line|screaming frog seo |yisou)spider)\/?([\w\.]*)/i
119+
// Freespoke - https://docs.freespoke.com/search/bot/
120+
/((?:aihit|blex|diff|huggingface-|msn|pangu|replicate-|runpod-|timpi|together-|xai-|you|zum)bot|(?:magpie-|velenpublicweb)crawler|(?:chatglm-|line|screaming frog seo |yisou)spider|cotoyogi|firecrawlagent|freespoke|omgili(?:bot)?|openai image downloader|startpageprivateimageproxy|twinagent|webzio-extended)\/?([\w\.]*)/i
113121
],
114122

115123
[NAME, VERSION, [TYPE, CRAWLER]],
@@ -119,16 +127,15 @@ const Crawlers = Object.freeze({
119127
/((?:adsbot|apis|mediapartners)-google(?:-mobile)?|google-?(?:other|cloudvertexbot|extended|safety))/i,
120128

121129
// AI2Bot - https://allenai.org/crawler
122-
// Bytespider
123130
// DataForSeoBot - https://dataforseo.com/dataforseo-bot
124-
// DeepSeekBot
125131
// Huawei AspiegelBot / PetalBot https://aspiegel.com/petalbot
126132
// ImagesiftBot - https://imagesift.com/about
127-
// Qihoo 360Spider
133+
// Siteimprove - https://help.siteimprove.com/support/solutions/articles/80000448553
128134
// TurnitinBot - https://www.turnitin.com/robot/crawlerinfo.html
129135
// v0bot - https://vercel.com/docs/bot-management
130136
// Yahoo! Slurp - http://help.yahoo.com/help/us/ysearch/slurp
131-
/\b(360spider-?(?:image|video)?|bytespider|cohere-training-data-crawler|elastic(?=\/s)|(?:ai2|aspiegel|dataforseo|deepseek|imagesift|petal|turnitin|v0)bot|teoma|yahoo! slurp)/i
137+
// Baidu-ADS / Botify / Bytespider / DeepSeekBot / Marginalia / Qihoo 360Spider / SeekportBot
138+
/\b((?:ai2|aspiegel|dataforseo|deepseek|imagesift|petal|seekport|turnitin|v0)bot|360spider-?(?:image|video)?|baidu-ads|botify|bytespider|cohere-training-data-crawler|elastic(?=\/s)|marginalia|siteimprove(?=bot|\.com)|teoma|yahoo! slurp)/i
132139
],
133140
[NAME, [TYPE, CRAWLER]]
134141
]

test/data/ua/extension/crawler.json

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,16 @@
8989
"type" : "crawler"
9090
}
9191
},
92+
{
93+
"desc" : "Algolia Crawler Renderscript",
94+
"ua" : "Algolia Crawler Renderscript",
95+
"expect" :
96+
{
97+
"name" : "Algolia Crawler Renderscript",
98+
"version" : "undefined",
99+
"type" : "crawler"
100+
}
101+
},
92102
{
93103
"desc" : "Applebot",
94104
"ua" : "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B410 Safari/600.1.4 (Applebot/0.1;+http://www.apple.com/go/applebot)",
@@ -149,6 +159,16 @@
149159
"type" : "crawler"
150160
}
151161
},
162+
{
163+
"desc" : "Baidu ADS",
164+
"ua" : "Baidu-ADS",
165+
"expect" :
166+
{
167+
"name" : "Baidu-ADS",
168+
"version" : "undefined",
169+
"type" : "crawler"
170+
}
171+
},
152172
{
153173
"desc" : "Baiduspider",
154174
"ua" : "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
@@ -239,6 +259,26 @@
239259
"type" : "crawler"
240260
}
241261
},
262+
{
263+
"desc" : "BLEXBot",
264+
"ua" : "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)",
265+
"expect" :
266+
{
267+
"name" : "BLEXBot",
268+
"version" : "1.0",
269+
"type" : "crawler"
270+
}
271+
},
272+
{
273+
"desc" : "botify",
274+
"ua" : "Desktop: Mozilla/5.0 (compatible; botify; http://botify.com)",
275+
"expect" :
276+
{
277+
"name" : "botify",
278+
"version" : "undefined",
279+
"type" : "crawler"
280+
}
281+
},
242282
{
243283
"desc" : "Bravebot",
244284
"ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Bravebot/1.0; +https://search.brave.com/help/brave-search-crawler) Chrome/W.X.Y.Z Safari/537.36",
@@ -519,6 +559,16 @@
519559
"type" : "crawler"
520560
}
521561
},
562+
{
563+
"desc" : "Freespoke",
564+
"ua" : "Mozilla/5.0 (compatible; Freespoke/2.0; +https://docs.freespoke.com/search/bot)",
565+
"expect" :
566+
{
567+
"name" : "Freespoke",
568+
"version" : "2.0",
569+
"type" : "crawler"
570+
}
571+
},
522572
{
523573
"desc" : "Googlebot-Video",
524574
"ua" : "Googlebot-Video/1.0",
@@ -719,6 +769,16 @@
719769
"type" : "crawler"
720770
}
721771
},
772+
{
773+
"desc" : "Marginalia Search",
774+
"ua" : "search.marginalia.nu",
775+
"expect" :
776+
{
777+
"name" : "marginalia",
778+
"version" : "undefined",
779+
"type" : "crawler"
780+
}
781+
},
722782
{
723783
"desc" : "Meta-ExternalAgent",
724784
"ua" : "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
@@ -750,6 +810,16 @@
750810
"type" : "crawler"
751811
}
752812
},
813+
{
814+
"desc" : "msnbot",
815+
"ua" : "msnbot/2.0b (+http://search.msn.com/msnbot.htm)",
816+
"expect" :
817+
{
818+
"name" : "msnbot",
819+
"version" : "2.0b",
820+
"type" : "crawler"
821+
}
822+
},
753823
{
754824
"desc" : "Omgili",
755825
"ua" : "omgili/0.5 +https://omgili.com",
@@ -770,6 +840,16 @@
770840
"type" : "crawler"
771841
}
772842
},
843+
{
844+
"desc" : "OnCrawl",
845+
"ua" : "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; OnCrawl Mobile/1.0; +http://www.oncrawl.com/)",
846+
"expect" :
847+
{
848+
"name" : "OnCrawl",
849+
"version" : "1.0",
850+
"type" : "crawler"
851+
}
852+
},
773853
{
774854
"desc" : "Onespot",
775855
"ua" : "Mozilla/5.0 (compatible; Onespot-ScraperBot/1.0; +https://www.onespot.com/identifying-traffic.html)",
@@ -880,6 +960,16 @@
880960
"type" : "crawler"
881961
}
882962
},
963+
{
964+
"desc" : "SeekportBot",
965+
"ua" : "Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com)",
966+
"expect" :
967+
{
968+
"name" : "SeekportBot",
969+
"version" : "undefined",
970+
"type" : "crawler"
971+
}
972+
},
883973
{
884974
"desc" : "SemrushBot",
885975
"ua" : "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)",
@@ -931,7 +1021,27 @@
9311021
}
9321022
},
9331023
{
934-
"desc" : "Sogou",
1024+
"desc" : "Siteimprove",
1025+
"ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; SiteCheck-sitecrawl by Siteimprove.com; +https://siteimprove.com/bots) Chrome/[VERSION] Safari/537.36",
1026+
"expect" :
1027+
{
1028+
"name" : "Siteimprove",
1029+
"version" : "undefined",
1030+
"type" : "crawler"
1031+
}
1032+
},
1033+
{
1034+
"desc" : "Sogou Pic Spider",
1035+
"ua" : "Sogou Pic Spider/3.0( http://www.sogou.com/docs/help/webmasters.htm#07)",
1036+
"expect" :
1037+
{
1038+
"name" : "Sogou Pic Spider",
1039+
"version" : "3.0",
1040+
"type" : "crawler"
1041+
}
1042+
},
1043+
{
1044+
"desc" : "Sogou web spider",
9351045
"ua" : "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
9361046
"expect" :
9371047
{
@@ -990,6 +1100,16 @@
9901100
"type" : "crawler"
9911101
}
9921102
},
1103+
{
1104+
"desc" : "TwinAgent",
1105+
"ua" : "TwinAgent/1.0",
1106+
"expect" :
1107+
{
1108+
"name" : "TwinAgent",
1109+
"version" : "1.0",
1110+
"type" : "crawler"
1111+
}
1112+
},
9931113
{
9941114
"desc" : "xAI-Bot",
9951115
"ua" : "Mozilla/5.0 (compatible; xAI-Bot/1.0; +https://x.ai/)",
@@ -1050,6 +1170,16 @@
10501170
"type" : "crawler"
10511171
}
10521172
},
1173+
{
1174+
"desc" : "YepBot",
1175+
"ua" : "Mozilla/5.0 (compatible; YepBot/1.0; +http://yep.com/yepbot/)",
1176+
"expect" :
1177+
{
1178+
"name" : "YepBot",
1179+
"version" : "1.0",
1180+
"type" : "crawler"
1181+
}
1182+
},
10531183
{
10541184
"desc" : "Yeti",
10551185
"ua" : "Mozilla/5.0 (compatible; Yeti/1.1; +http://naver.me/spd)",
@@ -1089,5 +1219,15 @@
10891219
"version" : "undefined",
10901220
"type" : "crawler"
10911221
}
1222+
},
1223+
{
1224+
"desc" : "ZumBot",
1225+
"ua" : "Mozilla/5.0 (compatible; ZumBot/1.0; http://help.zum.com/inquiry)",
1226+
"expect" :
1227+
{
1228+
"name" : "ZumBot",
1229+
"version" : "1.0",
1230+
"type" : "crawler"
1231+
}
10921232
}
10931233
]

0 commit comments

Comments
 (0)