diff --git a/README.md b/README.md index 60bfcba..6b54165 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,73 @@ geodata["1.2.3.4"] geolocate.(geodata, [ip"1.2.3.4", ip"8.8.8.8"]) # returns vector of geo data. ``` +## Localization + +It is possible to use localized versions of the geo files. To load localized data, use the `locales` argument of the `load` function. To switch between locales, use the `setlocale` function. + +```julia +using GeoIP + +geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr]) + +geodata[ip"201.186.185.1"] +# Dict{String, Any} with 21 entries: +# "time_zone" => "America/Santiago" +# "subdivision_2_name" => missing +# "accuracy_radius" => 100 +# "geoname_id" => 3874960 +# "continent_code" => "SA" +# "postal_code" => missing +# "continent_name" => "South America" +# "locale_code" => "en" +# "subdivision_2_iso_code" => missing +# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84") +# "v4net" => IPv4Net("201.186.185.0/24") +# "subdivision_1_name" => "Los Lagos Region" +# "subdivision_1_iso_code" => "LL" +# "city_name" => "Port Montt" +# "metro_code" => missing +# "registered_country_geoname_id" => 3895114 +# "is_in_european_union" => 0 +# "is_satellite_provider" => 0 +# "is_anonymous_proxy" => 0 +# "country_name" => "Chile" +# "country_iso_code" => "CL" + +geodata_fr = setlocale(geodata, :fr) +geodata_fr[ip"201.186.185.1"] +# Dict{String, Any} with 21 entries: +# "time_zone" => "America/Santiago" +# "subdivision_2_name" => missing +# "accuracy_radius" => 100 +# "geoname_id" => 3874960 +# "continent_code" => "SA" +# "postal_code" => missing +# "continent_name" => "Amérique du Sud" +# "locale_code" => "fr" +# "subdivision_2_iso_code" => missing +# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84") +# "v4net" => IPv4Net("201.186.185.0/24") +# "subdivision_1_name" => missing +# "subdivision_1_iso_code" => "LL" +# "city_name" => "Puerto Montt" +# "metro_code" => missing +# 
"registered_country_geoname_id" => 3895114 +# "is_in_european_union" => 0 +# "is_satellite_provider" => 0 +# "is_anonymous_proxy" => 0 +# "country_name" => "Chili" +# "country_iso_code" => "CL" +``` + +When calling `load`, locales can be given either in `Symbol` notation, i.e. `locales = [:en, :fr]`, or as a `Vector` of `Pair`s, where the first element is the locale name and the second element is a regular expression matching the name of the CSV file that contains the corresponding localization. For example `locales = [:en => r"Locations-en.csv$", :fr => r"Locations-fr.csv$"]`. By default, the following locales are supported: `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`. + +The default locale, which is used in `geolocate` responses, can be set with the `deflocale` argument of the `load` function. For example, to get the `:fr` locale by default: + +```julia +geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr], deflocale = :fr) +``` + # Acknowledgements This product uses, but not include, GeoLite2 data created by MaxMind, available from [http://www.maxmind.com](http://www.maxmind.com). diff --git a/src/GeoIP.jl b/src/GeoIP.jl index 6820c22..3821d13 100644 --- a/src/GeoIP.jl +++ b/src/GeoIP.jl @@ -13,6 +13,7 @@ export Location, # methods geolocate, + setlocale, load include("data.jl") diff --git a/src/data.jl b/src/data.jl index 127504c..8f89b0b 100644 --- a/src/data.jl +++ b/src/data.jl @@ -20,18 +20,14 @@ struct Location <: Point3D end end -# Currently it's just a tiny wrapper over DataFrames, but at some point it should -# become completely different entity. 
But user API shouldn't change, that is why we -# introduce this wrapper -struct DB{T1, T2} - index::Vector{IPv4Net} - locindex::Vector{Int} - blocks::Vector{T1} - locs::Vector{T2} +######################################## +# Main structures +######################################## +struct Locale{T} + index::Vector{Int} + locs::T end -Base.broadcastable(db::DB) = Ref(db) - struct BlockRow{T} v4net::T geoname_id::Int @@ -63,6 +59,29 @@ function BlockRow(csvrow) ) end +struct DB{T1, T2 <: Locale} + index::Vector{T1} + blocks::Vector{BlockRow{T1}} + locs::Vector{T2} + localeid::Int + ldict::Dict{Symbol, Int} +end + +Base.broadcastable(db::DB) = Ref(db) +""" + setlocale(db, localename) + +Return a database that uses locale `localename` for lookup results; `db` itself is not modified. If `localename` was not loaded, a warning is logged and `db` is returned unchanged. +""" +function setlocale(db::DB, localename) + if haskey(db.ldict, localename) + return DB(db.index, db.blocks, db.locs, db.ldict[localename], db.ldict) + else + @warn "Unable to find locale $localename" + return db + end +end + # Path to directory with data, can define GEOIP_DATADIR to override # the default (useful for testing with a smaller test set) function getdatadir(datadir) @@ -75,6 +94,27 @@ function getzipfile(zipfile) haskey(ENV, "GEOIP_ZIPFILE") ? 
ENV["GEOIP_ZIPFILE"] : zipfile end +getlocale(x::Pair) = x +function getlocale(x::Symbol) + if x == :en + return :en => r"Locations-en.csv$" + elseif x == :de + return :de => r"Locations-de.csv$" + elseif x == :ru + return :ru => r"Locations-ru.csv$" + elseif x == :ja + return :ja => r"Locations-ja.csv$" + elseif x == :es + return :es => r"Locations-es.csv$" + elseif x == :fr + return :fr => r"Locations-fr.csv$" + elseif x == :pt_br + return :pt_br => r"Locations-pt-BR.csv$" + elseif x == :zh_cn + return :zh_cn => r"Locations-zh-CN.csv$" + end +end + function loadgz(datadir, blockcsvgz, citycsvgz) blockfile = joinpath(datadir, blockcsvgz) locfile = joinpath(datadir, citycsvgz) @@ -96,22 +136,30 @@ function loadgz(datadir, blockcsvgz, citycsvgz) rethrow() end - return blocks, locs + return blocks, [locs], Dict(:en => 1) end -function loadzip(datadir, zipfile) +function loadzip(datadir, zipfile, locales) zipfile = joinpath(datadir, zipfile) isfile(zipfile) || throw(ArgumentError("Unable to find data file in $(zipfile)")) r = ZipFile.Reader(zipfile) + ldict = Dict{Symbol, Int}() + locid = 1 local blocks - local locs + locs = [] try for f in r.files - if occursin("City-Locations-en.csv", f.name) - v = Vector{UInt8}(undef, f.uncompressedsize) - locs = read!(f, v) |> CSV.File - elseif occursin("City-Blocks-IPv4.csv", f.name) + for (l, s) in locales + if occursin(s, f.name) + v = Vector{UInt8}(undef, f.uncompressedsize) + ls = read!(f, v) |> CSV.File + push!(locs, ls) + ldict[l] = locid + locid += 1 + end + end + if occursin(r"Blocks-IPv4.csv$", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String)) end @@ -123,32 +171,50 @@ function loadzip(datadir, zipfile) close(r) end - return blocks, locs + return blocks, locs, ldict end """ - load(; datadir, zipfile, blockcsvgz, citycsvgz) + load(; datadir, zipfile, locales, deflocale, blockcsvgz, citycsvgz) Load GeoIP database from compressed CSV file or 
files. If `zipfile` argument is provided then `load` tries to load data from that file, otherwise it will try to load data from `blockcsvgz` and `citycsvgz`. By default `blockcsvgz` equals to `"GeoLite2-City-Blocks-IPv4.csv.gz"` and `citycsvgz` equals to `"GeoLite2-City-Locations-en.csv.gz"`. `datadir` defines where data files are located and can be either set as an argument or read from the `ENV` variable `GEOIP_DATADIR`. In the same way if `ENV` variable `GEOIP_ZIPFILE` is set, then it is used for determining `zipfile` argument. + +Argument `locales` determines locale files which should be loaded. Locales can be given as `Symbol`s or `Pair`s of locale name and filename which contains corresponding locale, e.g. `locales = [:en, :fr]` or `locales = [:en => r"-en.csv"]`. Following locales are supported in `Symbol` version `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`. To set default locale use `deflocale` argument, e.g. `deflocale = :en`. """ function load(; zipfile = "", datadir = "", + locales = [:en], + deflocale = :en, blockcsvgz = "GeoLite2-City-Blocks-IPv4.csv.gz", citycsvgz = "GeoLite2-City-Locations-en.csv.gz") datadir = getdatadir(datadir) zipfile = getzipfile(zipfile) - blocks, locs = if isempty(zipfile) + blocks, locs, ldict = if isempty(zipfile) loadgz(datadir, blockcsvgz, citycsvgz) else - loadzip(datadir, zipfile) + locales = getlocale.(locales) + loadzip(datadir, zipfile, locales) end blockdb = BlockRow.(blocks) sort!(blockdb, by = x -> x.v4net) index = map(x -> x.v4net, blockdb) - locsdb = collect(locs) - sort!(locsdb, by = x -> x.geoname_id) - locindex = map(x -> x.geoname_id, locsdb) + locsdb = map(locs) do loc + ldb = collect(loc) + sort!(ldb, by = x -> x.geoname_id) + lindex = map(x -> x.geoname_id, ldb) + Locale(lindex, ldb) + end + + localeid = if deflocale in keys(ldict) + ldict[deflocale] + else + cd = collect(ldict) + idx = findfirst(x -> x[2] == 1, cd) + locname = cd[idx][1] + @warn "Default locale $deflocale was not found, using locale 
$locname" + 1 + end - return DB(index, locindex, blockdb, locsdb) + return DB(index, blockdb, locsdb, localeid, ldict) end diff --git a/src/geoip-module.jl b/src/geoip-module.jl index cbd0681..4eb691e 100644 --- a/src/geoip-module.jl +++ b/src/geoip-module.jl @@ -28,15 +28,16 @@ function geolocate(geodata::DB, ip::IPv4) res["accuracy_radius"] = row.accuracy_radius geoname_id = row.geoname_id - idx2 = searchsortedfirst(geodata.locindex, geoname_id) - if idx2 > length(geodata.locs) || idx2 < 1 + locale = geodata.locs[geodata.localeid] + idx2 = searchsortedfirst(locale.index, geoname_id) + if idx2 > length(locale.locs) || idx2 < 1 return res end - if geodata.locindex[idx2] != geoname_id + if locale.index[idx2] != geoname_id return res end - row2 = geodata.locs[idx2] + row2 = locale.locs[idx2] for k in keys(row2) res[string(k)] = row2[k] end diff --git a/test/data/GeoLite2-City-CSV.zip b/test/data/GeoLite2-City-CSV.zip index bfc119f..43725bc 100644 Binary files a/test/data/GeoLite2-City-CSV.zip and b/test/data/GeoLite2-City-CSV.zip differ diff --git a/test/test02_localization.jl b/test/test02_localization.jl new file mode 100644 index 0000000..438eab5 --- /dev/null +++ b/test/test02_localization.jl @@ -0,0 +1,34 @@ +module TestLocalization + +using GeoIP +using Sockets: IPv4, @ip_str +using Test + +TEST_DATADIR = joinpath(dirname(@__FILE__), "data") + +ENV["GEOIP_DATADIR"] = TEST_DATADIR +ENV["GEOIP_ZIPFILE"] = "GeoLite2-City-CSV.zip" + +@testset "Locales setup" begin + db = load(locales = [:en, :ru]) + geoip1 = db[ip"1.0.8.1"] + @test geoip1["locale_code"] == "en" + @test geoip1["continent_name"] == "Asia" + + db2 = setlocale(db, :ru) + geoip1 = db2[ip"1.0.8.1"] + @test geoip1["locale_code"] == "ru" + @test geoip1["continent_name"] == "Азия" +end + +@testset "Missed locales" begin + db = load(locales = [:en, :ru], deflocale = :ru) + geoip1 = db[ip"1.0.1.1"] + @test !haskey(geoip1, "locale_code") + + db2 = setlocale(db, :en) + geoip1 = db2[ip"1.0.1.1"] + @test 
geoip1["locale_code"] == "en" +end + +end # module