Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(plugin): bot-detection #1413

Merged
merged 9 commits into from
Jul 27, 2016
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion kong-0.8.3-0.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,12 @@ build = {

["kong.plugins.statsd.handler"] = "kong/plugins/statsd/handler.lua",
["kong.plugins.statsd.schema"] = "kong/plugins/statsd/schema.lua",
["kong.plugins.statsd.statsd_logger"] = "kong/plugins/statsd/statsd_logger.lua"
["kong.plugins.statsd.statsd_logger"] = "kong/plugins/statsd/statsd_logger.lua",

["kong.plugins.bot-detection.handler"] = "kong/plugins/bot-detection/handler.lua",
["kong.plugins.bot-detection.schema"] = "kong/plugins/bot-detection/schema.lua",
["kong.plugins.bot-detection.rules"] = "kong/plugins/bot-detection/rules.lua",
["kong.plugins.bot-detection.cache"] = "kong/plugins/bot-detection/cache.lua",
["kong.plugins.bot-detection.hooks"] = "kong/plugins/bot-detection/hooks.lua",
}
}
2 changes: 1 addition & 1 deletion kong/constants.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ local plugins = {
"file-log", "http-log", "key-auth", "hmac-auth", "basic-auth", "ip-restriction",
"galileo", "request-transformer", "response-transformer",
"request-size-limiting", "rate-limiting", "response-ratelimiting", "syslog",
"loggly", "datadog", "runscope", "ldap-auth", "statsd"
"loggly", "datadog", "runscope", "ldap-auth", "statsd", "bot-detection"
}

local plugin_map = {}
Expand Down
27 changes: 27 additions & 0 deletions kong/plugins/bot-detection/cache.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- In-memory index of bot-detection verdict cache entries, kept so that all of
-- them can be invalidated at once when the plugin configuration changes
-- (see hooks.lua).
local cache = require "kong.tools.database_cache"

local _M = {}

-- Cache key under which the list of every bot-detection key is stored.
local INDEX = "bot_detection_index"

--- Store a verdict for `key` and record the key in the index.
-- @param key   cache key (api_id .. ":" .. user_agent)
-- @param value boolean verdict (true = allowed, false = blocked)
function _M.set(key, value)
  cache.set(cache.bot_detection_key(key), value)

  -- Track the key so reset() can delete it later.
  local index_keys = cache.get(INDEX) or {}
  index_keys[#index_keys + 1] = key
  cache.set(INDEX, index_keys)
end

--- Retrieve a previously cached verdict (nil when not cached).
function _M.get(key)
  return cache.get(cache.bot_detection_key(key))
end

--- Delete every cached verdict and the index itself.
function _M.reset()
  -- Guard: the index may never have been populated; ipairs(nil) would error.
  local index_keys = cache.get(INDEX)
  if index_keys then
    for _, key in ipairs(index_keys) do
      cache.delete(cache.bot_detection_key(key))
    end
  end
  cache.delete(INDEX)
end

return _M
81 changes: 81 additions & 0 deletions kong/plugins/bot-detection/handler.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
local BasePlugin = require "kong.plugins.base_plugin"
local responses = require "kong.tools.responses"
local rules = require "kong.plugins.bot-detection.rules"
local stringy = require "stringy"
local bot_cache = require "kong.plugins.bot-detection.cache"

-- Localize hot-path globals (Lua performance idiom).
local ipairs = ipairs
local get_headers = ngx.req.get_headers
local re_match = ngx.re.match

local BotDetectionHandler = BasePlugin:extend()

BotDetectionHandler.PRIORITY = 2500

--- Return the request's User-Agent header value, or nil plus an error when
-- the client sent more than one (ngx returns a table for repeated headers).
local function get_user_agent()
  local user_agent = get_headers()["user-agent"]
  if type(user_agent) == "table" then
    return nil, "Only one User-Agent header allowed"
  end
  return user_agent
end

function BotDetectionHandler:new()
  BotDetectionHandler.super.new(self, "bot-detection")
end

--- access phase: decide, per API, whether the request's User-Agent is a bot.
-- Precedence: cached verdict > conf.whitelist > conf.blacklist > built-in rules.
-- Verdicts are cached (true = allowed, false = blocked) so the regexes are
-- not re-evaluated on every request.
function BotDetectionHandler:access(conf)
  BotDetectionHandler.super.access(self)

  local user_agent, err = get_user_agent()
  if err then
    return responses.send_HTTP_BAD_REQUEST(err)
  end

  if not user_agent then
    return -- no User-Agent header: nothing to match against
  end

  -- NOTE(review): reviewer suggested Kong's utils.strip() over stringy;
  -- kept stringy here to avoid an unverified API change.
  user_agent = stringy.strip(user_agent)

  -- Cache key, per API
  local cache_key = ngx.ctx.api.id .. ":" .. user_agent

  -- Compare directly against the boolean verdict; nil means "not cached yet".
  local cached_match = bot_cache.get(cache_key)
  if cached_match == true then
    return
  elseif cached_match == false then
    return responses.send_HTTP_FORBIDDEN()
  end

  -- Whitelist wins over everything else.
  if conf.whitelist then
    for _, rule in ipairs(conf.whitelist) do
      if re_match(user_agent, rule) then
        bot_cache.set(cache_key, true)
        return
      end
    end
  end

  if conf.blacklist then
    for _, rule in ipairs(conf.blacklist) do
      if re_match(user_agent, rule) then
        bot_cache.set(cache_key, false)
        return responses.send_HTTP_FORBIDDEN()
      end
    end
  end

  -- Built-in bot signatures (see rules.lua).
  for _, rule in ipairs(rules.bots) do
    if re_match(user_agent, rule) then
      bot_cache.set(cache_key, false)
      return responses.send_HTTP_FORBIDDEN()
    end
  end

  bot_cache.set(cache_key, true)
end

return BotDetectionHandler
14 changes: 14 additions & 0 deletions kong/plugins/bot-detection/hooks.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
local events = require "kong.core.events"
local bot_cache = require "kong.plugins.bot-detection.cache"

-- Flush the bot-detection verdict cache whenever a bot-detection plugin
-- row is updated, so that new white/blacklists take effect immediately.
local function invalidate(message_t)
  if message_t.collection ~= "plugins" then
    return
  end
  if message_t.entity.name == "bot-detection" then
    bot_cache.reset()
  end
end

return {
  [events.TYPES.ENTITY_UPDATED] = function(message_t)
    invalidate(message_t)
  end
}
18 changes: 18 additions & 0 deletions kong/plugins/bot-detection/rules.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- List taken from https://github.com/ua-parser/uap-core/blob/master/regexes.yaml
-- Built-in bot signatures: PCRE patterns (evaluated with ngx.re.match in
-- handler.lua) matched against the request's stripped User-Agent header.
-- A match on any entry of `bots` blocks the request with 403, unless the
-- user-agent was whitelisted first.

return {
  bots = {
    [[(Pingdom.com_bot_version_)(\d+)\.(\d+)]], -- Pingdom
    [[(facebookexternalhit)/(\d+)\.(\d+)]], -- Facebook
    [[Google.*/\+/web/snippet]], -- Google Plus
    [[(Twitterbot)/(\d+)\.(\d+)]], -- Twitter
    [[/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots Pattern '/name-0.0'
    [[(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], --Bots Pattern 'name/0.0'
    [[(MSIE) (\d+)\.(\d+)([a-z]\d?)?;.* MSIECrawler]], --MSIECrawler
    [[(Google-HTTP-Java-Client|Apache-HttpClient|http%20client|Python-urllib|HttpMonitor|TLSProber|WinHTTP|JNLP)(?:[ /](\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Downloader ...
    [[(1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]+-Agent|AdsBot-Google(?:-[a-z]+)?|altavista|AppEngine-Google|archive.*?\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]+)*|bingbot|BingPreview|blitzbot|BlogBridge|BoardReader(?: [A-Za-z]+)*|boitho.com-dc|BotSeer|\b\w*favicon\w*\b|\bYeti(?:-[a-z]+)?|Catchpoint bot|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher)?|Feed Seeker Bot|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]+-)?Googlebot(?:-[a-zA-Z]+)?|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile)?|IconSurf|IlTrovatore(?:-Setaccio)?|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]+Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masidani_bot|Mediapartners-Google|Microsoft .*? Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media *)?|msrbot|netresearch|Netvibes|NewsGator[^/]*|^NING|Nutch[^/]*|Nymesis|ObjectsSearch|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slurp|snappy|Speedy Spider|Squrl Java|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|TwitterBot|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]+|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s)? Link Sleuth|Xerka [A-z]+Bot|yacy(?:bot)?|Yahoo[a-z]*Seeker|Yahoo! Slurp|Yandex\w+|YodaoBot(?:-[A-z]+)?|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Bots
    [[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 \-_\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*))/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name/0.0'
    [[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 _\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*)) (\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name 0.0'
    [[((?:[A-z0-9]+|[A-z\-]+ ?)?(?: the )?(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[A-Za-z0-9-]*(?:[^C][^Uu])[Bb]ot|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]*)(?:(?:[ /]| v)(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]] -- Bots containing spider|scrape|bot(but not CUBOT)|Crawl
  }
}
21 changes: 21 additions & 0 deletions kong/plugins/bot-detection/schema.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
local re_match = ngx.re.match

-- Validate that every entry of an array field is a PCRE pattern that
-- ngx.re.match can compile. Returns true, or false plus an error message.
local function check_regex(value)
  if not value then
    return true
  end
  for _, rule in ipairs(value) do
    -- ngx.re.match returns a second value (err) when the pattern is invalid.
    local _, err = re_match("just a string to test", rule)
    if err then
      return false, "value '"..rule.."' is not a valid regex"
    end
  end
  return true
end

-- Plugin configuration schema: optional whitelist/blacklist regex arrays;
-- not associable with a consumer.
return {
  no_consumer = true,
  fields = {
    whitelist = { type = "array", func = check_regex },
    blacklist = { type = "array", func = check_regex },
  }
}
7 changes: 6 additions & 1 deletion kong/tools/database_cache.lua
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ local CACHE_KEYS = {
AUTOJOIN_RETRIES = "autojoin_retries",
TIMERS = "timers",
ALL_APIS_BY_DIC = "ALL_APIS_BY_DIC",
LDAP_CREDENTIAL = "ldap_credentials"
LDAP_CREDENTIAL = "ldap_credentials",
BOT_DETECTION = "bot_detection"
}

local _M = {}
Expand Down Expand Up @@ -115,6 +116,10 @@ function _M.ssl_data(api_id)
return CACHE_KEYS.SSL..":"..api_id
end

-- Namespaced cache key for a bot-detection verdict ("bot_detection:<key>").
function _M.bot_detection_key(key)
  return CACHE_KEYS.BOT_DETECTION..":"..key
end

function _M.all_apis_by_dict_key()
return CACHE_KEYS.ALL_APIS_BY_DIC
end
Expand Down
150 changes: 150 additions & 0 deletions spec/03-plugins/bot-detection/01-access_spec.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
-- Integration tests for the bot-detection plugin's access phase (busted).
local helpers = require "spec.helpers"

local HELLOWORLD = "HelloWorld"            -- just a test value
local FACEBOOK = "facebookexternalhit/1.1" -- matches a known bot in `rules.lua`

describe("Plugin: bot-detection (access)", function()
  local client

  setup(function()
    -- NOTE(review): kill_all() in setup() was flagged in review; the suite
    -- should rely on stop_kong() in teardown() instead — confirm helpers API.
    helpers.kill_all()
    helpers.prepare_prefix()

    -- Three APIs: default config, blacklist-only, whitelist-only.
    local api1 = assert(helpers.dao.apis:insert {
      request_host = "bot.com",
      upstream_url = "http://mockbin.com"
    })
    local api2 = assert(helpers.dao.apis:insert {
      request_host = "bot2.com",
      upstream_url = "http://mockbin.com"
    })
    local api3 = assert(helpers.dao.apis:insert {
      request_host = "bot3.com",
      upstream_url = "http://mockbin.com"
    })

    -- plugin 1: built-in rules only
    assert(helpers.dao.plugins:insert {
      api_id = api1.id,
      name = "bot-detection",
      config = {},
    })
    -- plugin 2: blacklist a harmless user-agent
    -- (schema type is "array"; presumably the DAO coerces this string — verify)
    assert(helpers.dao.plugins:insert {
      api_id = api2.id,
      name = "bot-detection",
      config = {
        blacklist = HELLOWORLD
      },
    })
    -- plugin 3: whitelist a known bot
    assert(helpers.dao.plugins:insert {
      api_id = api3.id,
      name = "bot-detection",
      config = {
        whitelist = FACEBOOK
      },
    })

    assert(helpers.start_kong())
  end)

  teardown(function()
    helpers.stop_kong()
  end)

  before_each(function()
    client = assert(helpers.proxy_client())
  end)

  after_each(function()
    if client then client:close() end
  end)

  it("allows regular requests", function()
    -- No User-Agent header at all
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = { host = "bot.com" }
    })
    assert.response(res).has.status(200)

    -- Ordinary browser user-agent
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot.com",
        ["user-agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
      }
    })
    assert.response(res).has.status(200)

    -- Harmless custom value (only blacklisted on bot2.com, not here)
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot.com",
        ["user-agent"] = HELLOWORLD
      }
    })
    assert.response(res).has.status(200)

    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot.com",
        ["user-agent"] = "curl/7.43.0"
      }
    })
    assert.response(res).has.status(200)
  end)

  it("blocks bots", function()
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot.com",
        ["user-agent"] = "Googlebot/2.1 (+http://www.google.com/bot.html)"
      },
    })
    assert.response(res).has.status(403)

    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot.com",
        ["user-agent"] = FACEBOOK,
      }
    })
    assert.response(res).has.status(403)
  end)

  it("blocks blacklisted user-agents", function()
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot2.com",
        ["user-agent"] = HELLOWORLD,
      }
    })
    assert.response(res).has.status(403)
  end)

  it("allows whitelisted user-agents", function()
    local res = assert(client:send {
      method = "GET",
      path = "/request",
      headers = {
        host = "bot3.com",
        ["user-agent"] = FACEBOOK
      }
    })
    assert.response(res).has.status(200)
  end)

end)
Loading