Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Localization support #54

Merged
merged 2 commits into from
May 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,73 @@ geodata["1.2.3.4"]
geolocate.(geodata, [ip"1.2.3.4", ip"8.8.8.8"]) # returns vector of geo data.
```

## Localization

It is possible to use localized versions of the geo files. To load localized data, use the `locales` argument of the `load` function. To switch between the loaded locales, use the `setlocale` function.

```julia
using GeoIP

geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr])

geodata[ip"201.186.185.1"]
# Dict{String, Any} with 21 entries:
# "time_zone" => "America/Santiago"
# "subdivision_2_name" => missing
# "accuracy_radius" => 100
# "geoname_id" => 3874960
# "continent_code" => "SA"
# "postal_code" => missing
# "continent_name" => "South America"
# "locale_code" => "en"
# "subdivision_2_iso_code" => missing
# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84")
# "v4net" => IPv4Net("201.186.185.0/24")
# "subdivision_1_name" => "Los Lagos Region"
# "subdivision_1_iso_code" => "LL"
# "city_name" => "Port Montt"
# "metro_code" => missing
# "registered_country_geoname_id" => 3895114
# "is_in_european_union" => 0
# "is_satellite_provider" => 0
# "is_anonymous_proxy" => 0
# "country_name" => "Chile"
# "country_iso_code" => "CL"

geodata_fr = setlocale(geodata, :fr)
geodata_fr[ip"201.186.185.1"]
# Dict{String, Any} with 21 entries:
# "time_zone" => "America/Santiago"
# "subdivision_2_name" => missing
# "accuracy_radius" => 100
# "geoname_id" => 3874960
# "continent_code" => "SA"
# "postal_code" => missing
# "continent_name" => "Amérique du Sud"
# "locale_code" => "fr"
# "subdivision_2_iso_code" => missing
# "location" => Location(-72.9436, -41.4709, 0.0, "WGS84")
# "v4net" => IPv4Net("201.186.185.0/24")
# "subdivision_1_name" => missing
# "subdivision_1_iso_code" => "LL"
# "city_name" => "Puerto Montt"
# "metro_code" => missing
# "registered_country_geoname_id" => 3895114
# "is_in_european_union" => 0
# "is_satellite_provider" => 0
# "is_anonymous_proxy" => 0
# "country_name" => "Chili"
# "country_iso_code" => "CL"
```

During the `load` procedure, locales can be given either in `Symbol` notation, i.e. `locales = [:en, :fr]`, or as a `Vector` of `Pair`s, where the first element is the locale name and the second element is a regular expression matching the name of the CSV file which contains the corresponding localization. For example, `locales = [:en => r"Locations-en.csv$", :fr => r"Locations-fr.csv"]`. By default, the following locales are supported: `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`.

The default locale, which is used in `geolocate` responses, can be set with the help of the `deflocale` argument of the `load` function. For example, to get the `:fr` locale by default:

```julia
geodata = load(zipfile = "GeoLite2-City-CSV_20191224.zip", locales = [:en, :fr], deflocale = :fr)
```

# Acknowledgements
This product uses, but does not include, GeoLite2 data created by MaxMind, available from
[http://www.maxmind.com](http://www.maxmind.com).
1 change: 1 addition & 0 deletions src/GeoIP.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export
Location,
# methods
geolocate,
setlocale,
load

include("data.jl")
Expand Down
116 changes: 91 additions & 25 deletions src/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,14 @@ struct Location <: Point3D
end
end

# Currently it's just a tiny wrapper over DataFrames, but at some point it should
# become completely different entity. But user API shouldn't change, that is why we
# introduce this wrapper
struct DB{T1, T2}
index::Vector{IPv4Net}
locindex::Vector{Int}
blocks::Vector{T1}
locs::Vector{T2}
########################################
# Main structures
########################################
# Location table of a single locale.
struct Locale{T}
# `geoname_id` of every entry of `locs`, in the same order. `load` sorts both by
# `geoname_id`, and `geolocate` searches this vector with `searchsortedfirst`.
index::Vector{Int}
# Location rows of this locale; `load` stores the collected rows of the locale's CSV file here.
locs::T
end

Base.broadcastable(db::DB) = Ref(db)

struct BlockRow{T}
v4net::T
geoname_id::Int
Expand Down Expand Up @@ -63,6 +59,29 @@ function BlockRow(csvrow)
)
end

# GeoIP database: network blocks plus one location table per loaded locale.
struct DB{T1, T2 <: Locale}
# `v4net` of every entry of `blocks`, in the same order (`load` sorts the blocks by `v4net`).
index::Vector{T1}
# Block rows built by `load` from the blocks CSV file.
blocks::Vector{BlockRow{T1}}
# One `Locale` per loaded localization file.
locs::Vector{T2}
# Position in `locs` of the locale which `geolocate` reads location data from.
localeid::Int
# Maps a locale name to its position in `locs`.
ldict::Dict{Symbol, Int}
end

# Wrap the database in a `Ref` so that broadcasting treats it as a scalar,
# e.g. `geolocate.(geodata, ips)`.
Base.broadcastable(db::DB) = Ref(db)
"""
    setlocale(db, localename)

Return a database which reports location data in the locale `localename`.

The returned database shares all data with `db`; only the selected locale differs. If
`localename` was not loaded, a warning is logged and `db` itself is returned, so the
current locale stays in use.
"""
function setlocale(db::DB, localename)
    # Guard clause: unknown locale, keep the database as it is.
    if !haskey(db.ldict, localename)
        @warn "Unable to find locale $localename"
        return db
    end

    newid = db.ldict[localename]
    return DB(db.index, db.blocks, db.locs, newid, db.ldict)
end

# Path to directory with data, can define GEOIP_DATADIR to override
# the default (useful for testing with a smaller test set)
function getdatadir(datadir)
Expand All @@ -75,6 +94,27 @@ function getzipfile(zipfile)
haskey(ENV, "GEOIP_ZIPFILE") ? ENV["GEOIP_ZIPFILE"] : zipfile
end

# Regular expressions which match the location CSV file of each built-in locale
# inside a MaxMind archive (e.g. `GeoLite2-City-Locations-pt-BR.csv`).
const LOCALE_FILES = Dict{Symbol, Regex}(
    :en => r"Locations-en.csv$",
    :de => r"Locations-de.csv$",
    :ru => r"Locations-ru.csv$",
    :ja => r"Locations-ja.csv$",
    :es => r"Locations-es.csv$",
    :fr => r"Locations-fr.csv$",
    :pt_br => r"Locations-pt-BR.csv$",
    # MaxMind names this file `...-Locations-zh-CN.csv` (hyphen, upper-case region code).
    :zh_cn => r"Locations-zh-CN.csv$",
)

# Normalize a locale specification to a `name => filename regex` pair.
# A `Pair` is taken as is, so users can supply their own file pattern.
getlocale(x::Pair) = x

# A `Symbol` is resolved through the table of built-in locales. An unknown name throws an
# `ArgumentError` right away instead of returning `nothing`, which would only fail later
# when the result is destructured into `(name, pattern)`.
function getlocale(x::Symbol)
    haskey(LOCALE_FILES, x) || throw(ArgumentError(
        "Unsupported locale $(repr(x)), pass it as a `name => regex` pair or use one of " *
        join(repr.(sort!(collect(keys(LOCALE_FILES)))), ", ")))
    return x => LOCALE_FILES[x]
end

function loadgz(datadir, blockcsvgz, citycsvgz)
blockfile = joinpath(datadir, blockcsvgz)
locfile = joinpath(datadir, citycsvgz)
Expand All @@ -96,22 +136,30 @@ function loadgz(datadir, blockcsvgz, citycsvgz)
rethrow()
end

return blocks, locs
return blocks, [locs], Dict(:en => 1)
end

function loadzip(datadir, zipfile)
function loadzip(datadir, zipfile, locales)
zipfile = joinpath(datadir, zipfile)
isfile(zipfile) || throw(ArgumentError("Unable to find data file in $(zipfile)"))

r = ZipFile.Reader(zipfile)
ldict = Dict{Symbol, Int}()
locid = 1
local blocks
local locs
locs = []
try
for f in r.files
if occursin("City-Locations-en.csv", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
locs = read!(f, v) |> CSV.File
elseif occursin("City-Blocks-IPv4.csv", f.name)
for (l, s) in locales
if occursin(s, f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
ls = read!(f, v) |> CSV.File
push!(locs, ls)
ldict[l] = locid
locid += 1
end
end
if occursin(r"Blocks-IPv4.csv$", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String))
end
Expand All @@ -123,32 +171,50 @@ function loadzip(datadir, zipfile)
close(r)
end

return blocks, locs
return blocks, locs, ldict
end

"""
    load(; datadir, zipfile, locales, deflocale, blockcsvgz, citycsvgz)

Load GeoIP database from compressed CSV file or files. If `zipfile` argument is provided then `load` tries to load data from that file, otherwise it will try to load data from `blockcsvgz` and `citycsvgz`. By default `blockcsvgz` equals to `"GeoLite2-City-Blocks-IPv4.csv.gz"` and `citycsvgz` equals to `"GeoLite2-City-Locations-en.csv.gz"`. `datadir` defines where data files are located and can be either set as an argument or read from the `ENV` variable `GEOIP_DATADIR`. In the same way if `ENV` variable `GEOIP_ZIPFILE` is set, then it is used for determining `zipfile` argument.

Argument `locales` determines the locale files which should be loaded from `zipfile`. Locales can be given as `Symbol`s or as `Pair`s of locale name and regular expression matching the file which contains the corresponding locale, e.g. `locales = [:en, :fr]` or `locales = [:en => r"-en.csv"]`. The following locales are supported in the `Symbol` version: `:en, :de, :ru, :ja, :es, :fr, :pt_br, :zh_cn`. To set the default locale use the `deflocale` argument, e.g. `deflocale = :en`.

If `deflocale` is not among the loaded locales, a warning is logged and the loaded locale with the smallest id (the first one found) is used instead. An `ArgumentError` is thrown if no locale was loaded at all.
"""
function load(; zipfile = "",
                datadir = "",
                locales = [:en],
                deflocale = :en,
                blockcsvgz = "GeoLite2-City-Blocks-IPv4.csv.gz",
                citycsvgz = "GeoLite2-City-Locations-en.csv.gz")
    datadir = getdatadir(datadir)
    zipfile = getzipfile(zipfile)
    blocks, locs, ldict = if isempty(zipfile)
        loadgz(datadir, blockcsvgz, citycsvgz)
    else
        # Normalize every locale to a `name => filename regex` pair.
        loadzip(datadir, zipfile, getlocale.(locales))
    end

    # Blocks are sorted by network so that lookups can use binary search over `index`.
    blockdb = BlockRow.(blocks)
    sort!(blockdb, by = x -> x.v4net)
    index = map(x -> x.v4net, blockdb)

    # Each locale gets its own table, sorted by `geoname_id` for the same reason.
    locsdb = map(locs) do loc
        ldb = collect(loc)
        sort!(ldb, by = x -> x.geoname_id)
        lindex = map(x -> x.geoname_id, ldb)
        Locale(lindex, ldb)
    end

    localeid = if haskey(ldict, deflocale)
        ldict[deflocale]
    else
        isempty(ldict) && throw(ArgumentError(
            "None of the requested locales $(locales) were found in the data files"))
        # Fall back to the loaded locale with the smallest id, i.e. the first one found.
        locname, fallbackid = first(sort!(collect(ldict), by = last))
        @warn "Default locale $deflocale was not found, using locale $locname"
        fallbackid
    end

    return DB(index, blockdb, locsdb, localeid, ldict)
end
9 changes: 5 additions & 4 deletions src/geoip-module.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,16 @@ function geolocate(geodata::DB, ip::IPv4)
res["accuracy_radius"] = row.accuracy_radius

geoname_id = row.geoname_id
idx2 = searchsortedfirst(geodata.locindex, geoname_id)
if idx2 > length(geodata.locs) || idx2 < 1
locale = geodata.locs[geodata.localeid]
idx2 = searchsortedfirst(locale.index, geoname_id)
if idx2 > length(locale.locs) || idx2 < 1
return res
end
if geodata.locindex[idx2] != geoname_id
if locale.index[idx2] != geoname_id
return res
end

row2 = geodata.locs[idx2]
row2 = locale.locs[idx2]
for k in keys(row2)
res[string(k)] = row2[k]
end
Expand Down
Binary file modified test/data/GeoLite2-City-CSV.zip
Binary file not shown.
34 changes: 34 additions & 0 deletions test/test02_localization.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
module TestLocalization

using GeoIP
using Sockets: IPv4, @ip_str
using Test

# Make `load` read the archive stored in the `data` directory next to this file.
TEST_DATADIR = joinpath(dirname(@__FILE__), "data")

ENV["GEOIP_DATADIR"] = TEST_DATADIR
ENV["GEOIP_ZIPFILE"] = "GeoLite2-City-CSV.zip"

@testset "Locales setup" begin
    # Without `deflocale` the English locale is expected to be active.
    geodata = load(locales = [:en, :ru])
    english = geodata[ip"1.0.8.1"]
    @test english["locale_code"] == "en"
    @test english["continent_name"] == "Asia"

    # Switching the locale changes the localized fields of the same address.
    geodata_ru = setlocale(geodata, :ru)
    russian = geodata_ru[ip"1.0.8.1"]
    @test russian["locale_code"] == "ru"
    @test russian["continent_name"] == "Азия"
end

@testset "Missed locales" begin
    # NOTE(review): presumably the Russian location file of the test archive has no
    # entry for this address, so no location fields are expected in the result.
    geodata = load(locales = [:en, :ru], deflocale = :ru)
    russian = geodata[ip"1.0.1.1"]
    @test !haskey(russian, "locale_code")

    # The English locale does provide location data for the same address.
    geodata_en = setlocale(geodata, :en)
    english = geodata_en[ip"1.0.1.1"]
    @test english["locale_code"] == "en"
end

end # module