From c1a0d1eba54e6f2916d15bf97d3e8cbe3de80c7b Mon Sep 17 00:00:00 2001 From: Andrey Oskin Date: Fri, 30 Apr 2021 21:30:39 +0300 Subject: [PATCH 1/2] Localization support --- src/GeoIP.jl | 1 + src/data.jl | 107 ++++++++++++++++++++++++++++++++++---------- src/geoip-module.jl | 9 ++-- 3 files changed, 89 insertions(+), 28 deletions(-) diff --git a/src/GeoIP.jl b/src/GeoIP.jl index 6820c22..3821d13 100644 --- a/src/GeoIP.jl +++ b/src/GeoIP.jl @@ -13,6 +13,7 @@ export Location, # methods geolocate, + setlocale, load include("data.jl") diff --git a/src/data.jl b/src/data.jl index 127504c..2bc5894 100644 --- a/src/data.jl +++ b/src/data.jl @@ -20,18 +20,14 @@ struct Location <: Point3D end end -# Currently it's just a tiny wrapper over DataFrames, but at some point it should -# become completely different entity. But user API shouldn't change, that is why we -# introduce this wrapper -struct DB{T1, T2} - index::Vector{IPv4Net} - locindex::Vector{Int} - blocks::Vector{T1} - locs::Vector{T2} +######################################## +# Main structures +######################################## +struct Locale{T} + index::Vector{Int} + locs::T end -Base.broadcastable(db::DB) = Ref(db) - struct BlockRow{T} v4net::T geoname_id::Int @@ -63,6 +59,24 @@ function BlockRow(csvrow) ) end +struct DB{T1, T2 <: Locale} + index::Vector{T1} + blocks::Vector{BlockRow{T1}} + locs::Vector{T2} + localeid::Int + ldict::Dict{Symbol, Int} +end + +Base.broadcastable(db::DB) = Ref(db) +function setlocale(db::DB, localename) + if localename in keys(db.ldict) + return DB(db.index, db.blocks, db.locs, db.ldict[localename], db.ldict) + else + @warn "Unable to find locale $localename" + return db + end +end + # Path to directory with data, can define GEOIP_DATADIR to override # the default (useful for testing with a smaller test set) function getdatadir(datadir) @@ -75,6 +89,27 @@ function getzipfile(zipfile) haskey(ENV, "GEOIP_ZIPFILE") ? 
ENV["GEOIP_ZIPFILE"] : zipfile end +getlocale(x::Pair) = x +function getlocale(x::Symbol) + if x == :en + return :en => r"Locations-en.csv$" + elseif x == :de + return :de => r"Locations-de.csv$" + elseif x == :ru + return :ru => r"Locations-ru.csv$" + elseif x == :ja + return :ja => r"Locations-ja.csv$" + elseif x == :es + return :es => r"Locations-es.csv$" + elseif x == :fr + return :fr => r"Locations-fr.csv$" + elseif x == :pt_br + return :pt_br => r"Locations-pt-BR.csv$" + elseif x == :zh_cn + return :zh_cn => r"Locations-zh_cn.csv$" + end +end + function loadgz(datadir, blockcsvgz, citycsvgz) blockfile = joinpath(datadir, blockcsvgz) locfile = joinpath(datadir, citycsvgz) @@ -96,22 +131,30 @@ function loadgz(datadir, blockcsvgz, citycsvgz) rethrow() end - return blocks, locs + return blocks, locs, Dict(:en => 1) end -function loadzip(datadir, zipfile) +function loadzip(datadir, zipfile, locales) zipfile = joinpath(datadir, zipfile) isfile(zipfile) || throw(ArgumentError("Unable to find data file in $(zipfile)")) r = ZipFile.Reader(zipfile) + ldict = Dict{Symbol, Int}() + locid = 1 local blocks - local locs + locs = [] try for f in r.files - if occursin("City-Locations-en.csv", f.name) - v = Vector{UInt8}(undef, f.uncompressedsize) - locs = read!(f, v) |> CSV.File - elseif occursin("City-Blocks-IPv4.csv", f.name) + for (l, s) in locales + if occursin(s, f.name) + v = Vector{UInt8}(undef, f.uncompressedsize) + ls = read!(f, v) |> CSV.File + push!(locs, ls) + ldict[l] = locid + locid += 1 + end + end + if occursin(r"Blocks-IPv4.csv$", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String)) end @@ -123,7 +166,7 @@ function loadzip(datadir, zipfile) close(r) end - return blocks, locs + return blocks, locs, ldict end """ @@ -133,22 +176,38 @@ Load GeoIP database from compressed CSV file or files. 
If `zipfile` argument is """ function load(; zipfile = "", datadir = "", + locales = [:en], + deflocale = :en, blockcsvgz = "GeoLite2-City-Blocks-IPv4.csv.gz", citycsvgz = "GeoLite2-City-Locations-en.csv.gz") datadir = getdatadir(datadir) zipfile = getzipfile(zipfile) - blocks, locs = if isempty(zipfile) + blocks, locs, ldict = if isempty(zipfile) loadgz(datadir, blockcsvgz, citycsvgz) else - loadzip(datadir, zipfile) + locales = getlocale.(locales) + loadzip(datadir, zipfile, locales) end blockdb = BlockRow.(blocks) sort!(blockdb, by = x -> x.v4net) index = map(x -> x.v4net, blockdb) - locsdb = collect(locs) - sort!(locsdb, by = x -> x.geoname_id) - locindex = map(x -> x.geoname_id, locsdb) + locsdb = map(locs) do loc + ldb = collect(loc) + sort!(ldb, by = x -> x.geoname_id) + lindex = map(x -> x.geoname_id, ldb) + Locale(lindex, ldb) + end + + localeid = if deflocale in keys(ldict) + ldict[deflocale] + else + cd = collect(d) + idx = findfirst(x -> x[2] == 1, cd) + locname = cd[idx][1] + @warn "Default locale $deflocale was not found, using locale $locname" + 1 + end - return DB(index, locindex, blockdb, locsdb) + return DB(index, blockdb, locsdb, localeid, ldict) end diff --git a/src/geoip-module.jl b/src/geoip-module.jl index cbd0681..4eb691e 100644 --- a/src/geoip-module.jl +++ b/src/geoip-module.jl @@ -28,15 +28,16 @@ function geolocate(geodata::DB, ip::IPv4) res["accuracy_radius"] = row.accuracy_radius geoname_id = row.geoname_id - idx2 = searchsortedfirst(geodata.locindex, geoname_id) - if idx2 > length(geodata.locs) || idx2 < 1 + locale = geodata.locs[geodata.localeid] + idx2 = searchsortedfirst(locale.index, geoname_id) + if idx2 > length(locale.locs) || idx2 < 1 return res end - if geodata.locindex[idx2] != geoname_id + if locale.index[idx2] != geoname_id return res end - row2 = geodata.locs[idx2] + row2 = locale.locs[idx2] for k in keys(row2) res[string(k)] = row2[k] end From 82d81b27b1f7d751699ad4b9d83361d90f8d9f0b Mon Sep 17 00:00:00 2001 From: Andrey 
Oskin Date: Wed, 5 May 2021 18:27:23 +0300 Subject: [PATCH 2/2] Docs and tests --- README.md | 67 ++++++++++++++++++++++++++++++++ src/data.jl | 11 +++++- test/data/GeoLite2-City-CSV.zip | Bin 937 -> 1456 bytes test/test02_localization.jl | 34 ++++++++++++++++ 4 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 test/test02_localization.jl diff --git a/README.md b/README.md index 60bfcba..6b54165 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,73 @@ geodata["1.2.3.4"] geolocate.(geodata, [ip"1.2.3.4", ip"8.8.8.8"]) # returns vector of geo data. ``` +## Localization + +It is possible to use localized version of geo files. To load localized data, one can use `locales` argument of the `load` function. To switch between different locales is possible with the help of `setlocale` function. + +```julia +using GeoIP + +geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr]) + +geodata[ip"201.186.185.1"] +# Dict{String, Any} with 21 entries: +# "time_zone" => "America/Santiago" +# "subdivision_2_name" => missing +# "accuracy_radius" => 100 +# "geoname_id" => 3874960 +# "continent_code" => "SA" +# "postal_code" => missing +# "continent_name" => "South America" +# "locale_code" => "en" +# "subdivision_2_iso_code" => missing +# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84") +# "v4net" => IPv4Net("201.186.185.0/24") +# "subdivision_1_name" => "Los Lagos Region" +# "subdivision_1_iso_code" => "LL" +# "city_name" => "Port Montt" +# "metro_code" => missing +# "registered_country_geoname_id" => 3895114 +# "is_in_european_union" => 0 +# "is_satellite_provider" => 0 +# "is_anonymous_proxy" => 0 +# "country_name" => "Chile" +# "country_iso_code" => "CL" + +geodata_fr = setlocale(geodata, :fr) +geodata_fr[ip"201.186.185.1"] +# Dict{String, Any} with 21 entries: +# "time_zone" => "America/Santiago" +# "subdivision_2_name" => missing +# "accuracy_radius" => 100 +# "geoname_id" => 3874960 +# "continent_code" => "SA" +# "postal_code" 
=> missing +# "continent_name" => "Amérique du Sud" +# "locale_code" => "fr" +# "subdivision_2_iso_code" => missing +# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84") +# "v4net" => IPv4Net("201.186.185.0/24") +# "subdivision_1_name" => missing +# "subdivision_1_iso_code" => "LL" +# "city_name" => "Puerto Montt" +# "metro_code" => missing +# "registered_country_geoname_id" => 3895114 +# "is_in_european_union" => 0 +# "is_satellite_provider" => 0 +# "is_anonymous_proxy" => 0 +# "country_name" => "Chili" +# "country_iso_code" => "CL" +``` + +During `load` procedure, it is possible to use either `Symbol` notation, i.e. `locales = [:en, :fr]` or one can pass `Vector` of `Pair`, where first argument is the locale name and second argument is a regular expression, which defines the name of the CSV file, which contains necessary localization. For example `locales = [:en => r"Locations-en.csv$", :fr => r"Locations-fr.csv"]`. By default, following locales are supported `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`. + +Default locale, which is used in `geolocate` response, can be set with the help of `deflocale` argument of the `load` function. For example, to get `:fr` locale by default + +```julia +geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr], deflocale = :fr) +``` + # Acknowledgements This product uses, but not include, GeoLite2 data created by MaxMind, available from [http://www.maxmind.com](http://www.maxmind.com). diff --git a/src/data.jl b/src/data.jl index 2bc5894..8f89b0b 100644 --- a/src/data.jl +++ b/src/data.jl @@ -68,6 +68,11 @@ struct DB{T1, T2 <: Locale} end Base.broadcastable(db::DB) = Ref(db) +""" + setlocale(db, localename) + +Set new locale which should be used in return results. If locale is not found, then current locale is going to be used. 
+""" function setlocale(db::DB, localename) if localename in keys(db.ldict) return DB(db.index, db.blocks, db.locs, db.ldict[localename], db.ldict) @@ -131,7 +136,7 @@ function loadgz(datadir, blockcsvgz, citycsvgz) rethrow() end - return blocks, locs, Dict(:en => 1) + return blocks, [locs], Dict(:en => 1) end function loadzip(datadir, zipfile, locales) @@ -170,9 +175,11 @@ function loadzip(datadir, zipfile, locales) end """ - load(; datadir, zipfile, blockcsvgz, citycsvgz) + load(; datadir, zipfile, locales, deflocale, blockcsvgz, citycsvgz) Load GeoIP database from compressed CSV file or files. If `zipfile` argument is provided then `load` tries to load data from that file, otherwise it will try to load data from `blockcsvgz` and `citycsvgz`. By default `blockcsvgz` equals to `"GeoLite2-City-Blocks-IPv4.csv.gz"` and `citycsvgz` equals to `"GeoLite2-City-Locations-en.csv.gz"`. `datadir` defines where data files are located and can be either set as an argument or read from the `ENV` variable `GEOIP_DATADIR`. In the same way if `ENV` variable `GEOIP_ZIPFILE` is set, then it is used for determining `zipfile` argument. + +Argument `locales` determine locale files which should be loaded. Locales can be given as `Symbol`s or `Pair`s of locale name and filename which contains corresponding locale, e.g. `locales = [:en, :fr]` or `locales = [:en => r"-en.csv"]`. Following locales are supported in `Symbol` version `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`. To set default locale use `deflocale` argument, e.g. `deflocale = :en`. 
""" function load(; zipfile = "", datadir = "", diff --git a/test/data/GeoLite2-City-CSV.zip b/test/data/GeoLite2-City-CSV.zip index bfc119f46cc6531b24632959eed93756fe133dd8..43725bcf2155f4f6079de02f5b57b6eb342dac0a 100644 GIT binary patch delta 531 zcmZ3<&675-WK?AoWo8j!U|`^2STr4>^b&FaO8$Z2ArD+(to36TU2_bG}Qb z7ir#6-D2A4d86|jAD`Mep~ZrK0@c4=|9_%9fc0R@5$Rdq@73;~|NSj%jOY5(u@9Zy zx9;I9k3RMMLXeTzq|+6fuO`)J2}^$ewXWmJ_rFCOcJxjvj*sc^(0I9N>a*_^-kJZ_ zntbk6I`=YG?t4+Jj>ZoM&$xuXL~FC=IHs*^4@~zxFxlPoO!0xNUqM^`m%j(pBj!b^ z?AcrE&$f_@dreY^c3JA>XnVFj3$vE|O^7U*FyUahk-~vRa$7mxROnxrmtMFbpUtE86 hOs->;G=aq#D