-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(plugin) bot-detection #1413
Changes from 8 commits
6548236
8dca77f
f632d12
56bdba3
8a4a777
747bdaf
48f44e7
4722ca0
852cab7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
-- In-process verdict cache for the bot-detection plugin.
-- Stores per-(API, user-agent) verdicts under namespaced keys and keeps an
-- index of every key it has written, so the whole set can be dropped at once
-- when the plugin configuration changes (see hooks.lua).
local cache = require "kong.tools.database_cache"

local _M = {}

-- Cache key under which the list of all bot-detection keys is stored.
local INDEX = "bot_detection_index"

--- Store a verdict and record its key in the index.
-- @param key cache key (API id concatenated with the user-agent)
-- @param value the verdict to cache (handler.lua stores booleans:
--        true = allowed, false = blocked)
function _M.set(key, value)
  cache.set(cache.bot_detection_key(key), value)

  -- Track the key so reset() can delete it later. Skip keys already
  -- recorded, otherwise repeated set() calls for the same key would grow
  -- the index without bound.
  local index_keys = cache.get(INDEX) or {}
  for _, existing in ipairs(index_keys) do
    if existing == key then
      return
    end
  end
  index_keys[#index_keys + 1] = key
  cache.set(INDEX, index_keys)
end

--- Retrieve a previously stored verdict, or nil when not cached.
function _M.get(key)
  return cache.get(cache.bot_detection_key(key))
end

--- Delete every key recorded in the index, then the index itself.
function _M.reset()
  -- Guard against a missing index: ipairs(nil) would raise an error when
  -- reset() runs before any set() has populated the cache.
  local index_keys = cache.get(INDEX) or {}
  for _, key in ipairs(index_keys) do
    cache.delete(cache.bot_detection_key(key))
  end
  cache.delete(INDEX)
end

return _M
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
-- Bot-detection plugin handler: rejects requests whose User-Agent matches
-- known bot patterns (rules.lua) or the per-API blacklist, unless the
-- User-Agent is whitelisted. Verdicts are cached per API + User-Agent so
-- the regexes are evaluated at most once per distinct agent.
local BasePlugin = require "kong.plugins.base_plugin"
local responses = require "kong.tools.responses"
local rules = require "kong.plugins.bot-detection.rules"
local utils = require "kong.tools.utils"
local bot_cache = require "kong.plugins.bot-detection.cache"

local ipairs = ipairs
local get_headers = ngx.req.get_headers
local re_match = ngx.re.match

local BotDetectionHandler = BasePlugin:extend()

BotDetectionHandler.PRIORITY = 2500

-- Returns the single User-Agent header value, or nil plus an error message
-- when the client sent more than one User-Agent header (ngx returns a table
-- for repeated headers).
local function get_user_agent()
  local user_agent = get_headers()["user-agent"]
  if type(user_agent) == "table" then
    return nil, "Only one User-Agent header allowed"
  end
  return user_agent
end

function BotDetectionHandler:new()
  BotDetectionHandler.super.new(self, "bot-detection")
end

function BotDetectionHandler:access(conf)
  BotDetectionHandler.super.access(self)

  local user_agent, err = get_user_agent()
  if err then
    return responses.send_HTTP_BAD_REQUEST(err)
  end

  if user_agent then
    -- Kong's own strip helper instead of the external stringy module,
    -- per review feedback.
    user_agent = utils.strip(user_agent)

    -- Cache key, per API
    local cache_key = ngx.ctx.api.id..":"..user_agent

    -- The cache already holds the verdict for user agents seen before, so
    -- we avoid matching the regexes on every request.
    -- true = allowed, false = blocked, nil = not seen yet.
    local cached_match = bot_cache.get(cache_key)
    if cached_match == true then
      return
    elseif cached_match == false then
      return responses.send_HTTP_FORBIDDEN()
    end

    -- Whitelist wins over both the blacklist and the built-in rules.
    if conf.whitelist then
      for _, rule in ipairs(conf.whitelist) do
        if re_match(user_agent, rule) then
          bot_cache.set(cache_key, true)
          return
        end
      end
    end

    if conf.blacklist then
      for _, rule in ipairs(conf.blacklist) do
        if re_match(user_agent, rule) then
          bot_cache.set(cache_key, false)
          return responses.send_HTTP_FORBIDDEN()
        end
      end
    end

    -- Built-in bot patterns.
    for _, rule in ipairs(rules.bots) do
      if re_match(user_agent, rule) then
        bot_cache.set(cache_key, false)
        return responses.send_HTTP_FORBIDDEN()
      end
    end

    -- No rule matched: remember this agent as allowed.
    bot_cache.set(cache_key, true)
  end
end

return BotDetectionHandler
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
-- Cache-invalidation hook: wipe the bot-detection verdict cache whenever a
-- "bot-detection" plugin entity is updated, so new white/blacklists take
-- effect immediately.
local events = require "kong.core.events"
local bot_cache = require "kong.plugins.bot-detection.cache"

-- Reset the cache only for events that touch this plugin's configuration.
local function invalidate(message_t)
  if message_t.collection ~= "plugins" then
    return
  end
  if message_t.entity.name ~= "bot-detection" then
    return
  end
  bot_cache.reset()
end

return {
  [events.TYPES.ENTITY_UPDATED] = invalidate
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
-- List taken from https://github.com/ua-parser/uap-core/blob/master/regexes.yaml
-- Each entry is a PCRE pattern (evaluated with ngx.re.match in handler.lua),
-- written as a Lua long-bracket literal so backslashes need no escaping.
-- A request is blocked when its stripped User-Agent matches any pattern.

return {
  bots = {
    [[(Pingdom.com_bot_version_)(\d+)\.(\d+)]], -- Pingdom
    [[(facebookexternalhit)/(\d+)\.(\d+)]], -- Facebook
    [[Google.*/\+/web/snippet]], -- Google Plus
    [[(Twitterbot)/(\d+)\.(\d+)]], -- Twitter
    [[/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots Pattern '/name-0.0'
    [[(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], --Bots Pattern 'name/0.0'
    [[(MSIE) (\d+)\.(\d+)([a-z]\d?)?;.* MSIECrawler]], --MSIECrawler
    [[(Google-HTTP-Java-Client|Apache-HttpClient|http%20client|Python-urllib|HttpMonitor|TLSProber|WinHTTP|JNLP)(?:[ /](\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Downloader ...
    [[(1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]+-Agent|AdsBot-Google(?:-[a-z]+)?|altavista|AppEngine-Google|archive.*?\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]+)*|bingbot|BingPreview|blitzbot|BlogBridge|BoardReader(?: [A-Za-z]+)*|boitho.com-dc|BotSeer|\b\w*favicon\w*\b|\bYeti(?:-[a-z]+)?|Catchpoint bot|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher)?|Feed Seeker Bot|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]+-)?Googlebot(?:-[a-zA-Z]+)?|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile)?|IconSurf|IlTrovatore(?:-Setaccio)?|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]+Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masidani_bot|Mediapartners-Google|Microsoft .*? Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media *)?|msrbot|netresearch|Netvibes|NewsGator[^/]*|^NING|Nutch[^/]*|Nymesis|ObjectsSearch|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slurp|snappy|Speedy Spider|Squrl Java|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|TwitterBot|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]+|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s)? Link Sleuth|Xerka [A-z]+Bot|yacy(?:bot)?|Yahoo[a-z]*Seeker|Yahoo! Slurp|Yandex\w+|YodaoBot(?:-[A-z]+)?|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Bots
    [[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 \-_\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*))/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name/0.0'
    [[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 _\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*)) (\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name 0.0'
    [[((?:[A-z0-9]+|[A-z\-]+ ?)?(?: the )?(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[A-Za-z0-9-]*(?:[^C][^Uu])[Bb]ot|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]*)(?:(?:[ /]| v)(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]] -- Bots containing spider|scrape|bot(but not CUBOT)|Crawl
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
-- Configuration schema for the bot-detection plugin: optional whitelist and
-- blacklist arrays of PCRE patterns, validated at configuration time.
local re_match = ngx.re.match

-- Verify that every entry in the given array compiles as a valid PCRE
-- pattern, by attempting a match against a dummy subject string.
-- Returns true on success, or false plus a descriptive message.
local function check_regex(value)
  if not value then
    return true
  end
  for _, rule in ipairs(value) do
    local _, err = re_match("just a string to test", rule)
    if err then
      return false, "value '" .. rule .. "' is not a valid regex"
    end
  end
  return true
end

return {
  no_consumer = true,
  fields = {
    whitelist = { type = "array", func = check_regex },
    blacklist = { type = "array", func = check_regex },
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
-- Integration spec for the bot-detection plugin (busted + Kong spec helpers).
local helpers = require "spec.helpers"

local HELLOWORLD = "HelloWorld" -- just a test value
local FACEBOOK = "facebookexternalhit/1.1" -- matches a known bot in `rules.lua`

describe("Plugin: bot-detection", function()
  local client

  setup(function()
    helpers.kill_all()
    helpers.prepare_prefix()

    -- Three APIs: built-in rules only, custom blacklist, custom whitelist.
    local api1 = assert(helpers.dao.apis:insert {
      request_host = "bot.com",
      upstream_url = "http://mockbin.com"
    })
    local api2 = assert(helpers.dao.apis:insert {
      request_host = "bot2.com",
      upstream_url = "http://mockbin.com"
    })
    local api3 = assert(helpers.dao.apis:insert {
      request_host = "bot3.com",
      upstream_url = "http://mockbin.com"
    })

    -- plugin 1: default configuration (built-in rules only)
    assert(helpers.dao.plugins:insert {
      api_id = api1.id,
      name = "bot-detection",
      config = {},
    })
    -- plugin 2: custom blacklist
    -- NOTE(review): the schema declares blacklist as an array, but a bare
    -- string is passed here — presumably the schema coerces it; confirm.
    assert(helpers.dao.plugins:insert {
      api_id = api2.id,
      name = "bot-detection",
      config = {
        blacklist = HELLOWORLD
      },
    })
    -- plugin 3: custom whitelist
    assert(helpers.dao.plugins:insert {
      api_id = api3.id,
      name = "bot-detection",
      config = {
        whitelist = FACEBOOK
      },
    })

    assert(helpers.start_kong())
  end)

  teardown(function()
    helpers.stop_kong()
  end)

  before_each(function()
    client = assert(helpers.proxy_client())
  end)

  after_each(function()
    if client then client:close() end
  end)

  -- Issue GET /request against the given Host, optionally with a
  -- User-Agent header, and return the response for status assertions.
  local function do_request(host, user_agent)
    return assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = host,
        ["user-agent"] = user_agent,
      }
    })
  end

  it("allows regular requests", function()
    assert.response(do_request("bot.com")).has.status(200)
    assert.response(do_request("bot.com",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    )).has.status(200)
    assert.response(do_request("bot.com", HELLOWORLD)).has.status(200)
    assert.response(do_request("bot.com", "curl/7.43.0")).has.status(200)
  end)

  it("blocks bots", function()
    assert.response(do_request("bot.com",
      "Googlebot/2.1 (+http://www.google.com/bot.html)"
    )).has.status(403)
    assert.response(do_request("bot.com", FACEBOOK)).has.status(403)
  end)

  it("blocks blacklisted user-agents", function()
    assert.response(do_request("bot2.com", HELLOWORLD)).has.status(403)
  end)

  it("allows whitelisted user-agents", function()
    assert.response(do_request("bot3.com", FACEBOOK)).has.status(200)
  end)
end)
Review note: don't depend on the external `stringy` module here — use Kong's own `utils.strip()` from `kong.tools.utils` instead.