From e0df0eccf08b6b1ee9893ebb0f18ccfec4e323ee Mon Sep 17 00:00:00 2001 From: Andrey Oskin Date: Thu, 29 Apr 2021 14:23:51 +0300 Subject: [PATCH 1/4] Updated geolocate speed --- Project.toml | 2 +- src/data.jl | 13 +++++++++++-- src/geoip-module.jl | 22 +++++++--------------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index 14cdc38..5ae6764 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "GeoIP" uuid = "abcde121-99a1-4d4f-92c5-2a3c6888d26a" authors = ["Andrey Oskin", "Seth Bromberger", "contributors: https://github.com/JuliaWeb/GeoIP.jl/graphs/contributors"] -version = "0.5.1" +version = "0.5.2" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/src/data.jl b/src/data.jl index a7c057c..c05509f 100644 --- a/src/data.jl +++ b/src/data.jl @@ -2,7 +2,8 @@ # become completely different entity. But user API shouldn't change, that is why we # introduce this wrapper struct DB - db::DataFrame + index::Vector{IPv4Net} + db::Vector{Dict{String, Any}} end Base.broadcastable(db::DB) = Ref(db) @@ -89,6 +90,7 @@ function load(; zipfile = "", loadzip(datadir, zipfile) end + # TODO: All of this is slow and should be improved. No DataFrame is needed actually # Clean up unneeded columns and map others to appropriate data structures select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider])) @@ -101,5 +103,12 @@ function load(; zipfile = "", alldata = leftjoin(blocks, locs, on = :geoname_id) - return DB(sort!(alldata, :v4net)) + df = sort!(alldata, :v4net) + db = Vector{Dict{String, Any}}(undef, size(df, 1)) + for i in axes(df, 1) + row = df[i, :] + db[i] = Dict(collect(zip(names(row), row))) + end + + return DB(df.v4net, db) end diff --git a/src/geoip-module.jl b/src/geoip-module.jl index 4460c09..53baea0 100644 --- a/src/geoip-module.jl +++ b/src/geoip-module.jl @@ -32,24 +32,16 @@ function geolocate(geodata::DB, ip::IPv4) ipnet = IPv4Net(ip, 32) db = geodata.db - # only iterate over rows that actually make sense - this filter is - # less expensive than iteration with in(). - found = 0 - for i in axes(db, 1) # iterate over rows - if db[i, :v4net] > ipnet - found = i - 1 - break - end - end + idx = searchsortedfirst(geodata.index, ipnet) - 1 # TODO: sentinel value should be returned - retdict = Dict{String, Any}() - if (found > 0) && ip in db[found, :v4net] - # Placeholder, should be removed - row = db[found, :] - return Dict(collect(zip(names(row), row))) + res = if idx > 0 && ip in geodata.index[idx] + db[idx] + else + Dict{String, Any}() end - return retdict + + return res end geolocate(geodata::DB, ipstr::AbstractString) = geolocate(geodata, IPv4(ipstr)) From 5bda557d00d8a8c0c289234635cd1f3aadbf140e Mon Sep 17 00:00:00 2001 From: Andrey Oskin Date: Thu, 29 Apr 2021 22:19:13 +0300 Subject: [PATCH 2/4] Improved load time --- src/data.jl | 108 +++++++++++++++++++++++++++++++++++--------- src/geoip-module.jl | 58 ++++++++++++------------ 2 files changed, 115 insertions(+), 51 deletions(-) diff --git a/src/data.jl b/src/data.jl index c05509f..878cdc6 100644 --- a/src/data.jl +++ b/src/data.jl @@ -1,13 +1,68 @@ +######################################## +# Location structure +######################################## +# It would be great to replace this with a real GIS package. +abstract type Point end +abstract type Point3D <: Point end + +struct Location <: Point3D + x::Float64 + y::Float64 + z::Float64 + datum::String + + function Location(x, y, z = 0, datum = "WGS84") + if x === missing || y === missing + return new(0, 0, 0, datum) + else + return new(x, y, z, datum) + end + end +end + # Currently it's just a tiny wrapper over DataFrames, but at some point it should # become completely different entity. But user API shouldn't change, that is why we # introduce this wrapper -struct DB +struct DB{T1, T2} index::Vector{IPv4Net} - db::Vector{Dict{String, Any}} + locindex::Vector{Int} + blocks::Vector{T1} + locs::Vector{T2} end Base.broadcastable(db::DB) = Ref(db) +struct BlockRow{T} + net::T + geoname_id::Int + location::Location + registered_country_geoname_id::Int + is_anonymous_proxy::Int + is_satellite_provider::Int + postal_code::String + accuracy_radius::Int +end + +function BlockRow(csvrow) + net = IPNets.IPv4Net(csvrow.network) + geoname_id = ismissing(csvrow.geoname_id) ? -1 : csvrow.geoname_id + location = Location(csvrow.longitude, csvrow.latitude) + registered_country_geoname_id = ismissing(csvrow.registered_country_geoname_id) ? -1 : csvrow.registered_country_geoname_id + accuracy_radius = ismissing(csvrow.accuracy_radius) ? -1 : csvrow.accuracy_radius + postal_code = ismissing(csvrow.postal_code) ? "" : csvrow.postal_code + + BlockRow( + net, + geoname_id, + location, + registered_country_geoname_id, + csvrow.is_anonymous_proxy, + csvrow.is_satellite_provider, + postal_code, + accuracy_radius + ) +end + # Path to directory with data, can define GEOIP_DATADIR to override # the default (useful for testing with a smaller test set) function getdatadir(datadir) @@ -31,12 +86,10 @@ function loadgz(datadir, blockcsvgz, citycsvgz) local locs try blocks = GZip.open(blockfile, "r") do stream - CSV.File(read(stream)) |> DataFrame - # CSV.File(stream, types=[String, Int, Int, String, Int, Int, String, Float64, Float64, Int]) |> DataFrame + CSV.File(read(stream)) end locs = GZip.open(locfile, "r") do stream - # CSV.File(stream, types=[Int, String, String, String, String, String, String, String, String, String, String, Int, String, Int]) |> DataFrame - CSV.File(read(stream)) |> DataFrame + CSV.File(read(stream)) end catch @error "Geolocation data cannot be read. Data directory may be corrupt..." @@ -57,10 +110,10 @@ function loadzip(datadir, zipfile) for f in r.files if occursin("GeoLite2-City-Locations-en.csv", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) - locs = read!(f, v) |> CSV.File |> DataFrame + locs = read!(f, v) |> CSV.File elseif occursin("GeoLite2-City-Blocks-IPv4.csv", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) - blocks = read!(f, v) |> CSV.File |> DataFrame + blocks = read!(f, v) |> CSV.File end end catch @@ -92,23 +145,34 @@ function load(; zipfile = "", # TODO: All of this is slow and should be improved. No DataFrame is needed actually # Clean up unneeded columns and map others to appropriate data structures - select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider])) + blockdb = BlockRow.(blocks) + sort!(blockdb, by = x -> x.net) + index = map(x -> x.net, blockdb) + locsdb = collect(locs) + sort!(locsdb, by = x -> x.geoname_id) + locindex = map(x -> x.geoname_id, locsdb) - blocks[!, :v4net] = map(x -> IPNets.IPv4Net(x), blocks[!, :network]) - select!(blocks, Not(:network)) + return DB(index, locindex, blockdb, locsdb) - blocks[!, :location] = map(Location, blocks[!, :longitude], blocks[!, :latitude]) - select!(blocks, Not([:longitude, :latitude])) - blocks.geoname_id = map(x -> ismissing(x) ? -1 : Int(x), blocks.geoname_id) + # return blocks[1:100] - alldata = leftjoin(blocks, locs, on = :geoname_id) + # select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider])) - df = sort!(alldata, :v4net) - db = Vector{Dict{String, Any}}(undef, size(df, 1)) - for i in axes(df, 1) - row = df[i, :] - db[i] = Dict(collect(zip(names(row), row))) - end + # blocks[!, :v4net] = map(x -> IPNets.IPv4Net(x), blocks[!, :network]) + # select!(blocks, Not(:network)) + + # blocks[!, :location] = map(Location, blocks[!, :longitude], blocks[!, :latitude]) + # select!(blocks, Not([:longitude, :latitude])) + # blocks.geoname_id = map(x -> ismissing(x) ? -1 : Int(x), blocks.geoname_id) + + # alldata = leftjoin(blocks, locs, on = :geoname_id) + + # df = sort!(alldata, :v4net) + # db = Vector{Dict{String, Any}}(undef, size(df, 1)) + # for i in axes(df, 1) + # row = df[i, :] + # db[i] = Dict(collect(zip(names(row), row))) + # end - return DB(df.v4net, db) + # return DB(df.v4net, db) end diff --git a/src/geoip-module.jl b/src/geoip-module.jl index 53baea0..6957feb 100644 --- a/src/geoip-module.jl +++ b/src/geoip-module.jl @@ -1,25 +1,3 @@ -######################################## -# Location structure -######################################## -# It would be great to replace this with a real GIS package. -abstract type Point end -abstract type Point3D <: Point end - -struct Location <: Point3D - x::Float64 - y::Float64 - z::Float64 - datum::String - - function Location(x, y, z = 0, datum = "WGS84") - if x === missing || y === missing - return missing - else - return new(x, y, z, datum) - end - end -end - ######################################## # Geolocation functions ######################################## @@ -30,15 +8,37 @@ Returns geolocation and other information determined in `geodata` by `ip`. """ function geolocate(geodata::DB, ip::IPv4) ipnet = IPv4Net(ip, 32) - db = geodata.db + # TODO: Dict should be changed to a more suitable structure + res = Dict{String, Any}() + idx = searchsortedfirst(geodata.index, ipnet) - 1 - - # TODO: sentinel value should be returned - res = if idx > 0 && ip in geodata.index[idx] - db[idx] - else - Dict{String, Any}() + if idx == 0 || !(ip in geodata.index[idx]) + return res + end + row = geodata.blocks[idx] + + res["net"] = row.net + res["geoname_id"] = row.geoname_id + res["location"] = row.location + res["registered_country_geoname_id"] = row.registered_country_geoname_id + res["is_anonymous_proxy"] = row.is_anonymous_proxy + res["is_satellite_provider"] = row.is_satellite_provider + res["postal_code"] = row.postal_code + res["accuracy_radius"] = row.accuracy_radius + + geoname_id = row.geoname_id + idx2 = searchsortedfirst(geodata.locindex, geoname_id) + if idx2 > length(geodata.locs) || idx2 < 1 + return res + end + if geodata.locindex[idx2] != geoname_id + return res + end + + row2 = geodata.locs[idx2] + for k in keys(row2) + res[string(k)] = row2[k] end return res From 0a0375b12aa6b4f27924996c223503cb3c0efc1b Mon Sep 17 00:00:00 2001 From: Andrey Oskin Date: Fri, 30 Apr 2021 12:47:50 +0300 Subject: [PATCH 3/4] remove breaking code --- src/data.jl | 50 ++++++++++++--------------------------------- src/geoip-module.jl | 2 +- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/src/data.jl b/src/data.jl index 878cdc6..d1f2b37 100644 --- a/src/data.jl +++ b/src/data.jl @@ -13,7 +13,7 @@ struct Location <: Point3D function Location(x, y, z = 0, datum = "WGS84") if x === missing || y === missing - return new(0, 0, 0, datum) + return missing else return new(x, y, z, datum) end @@ -33,23 +33,23 @@ end Base.broadcastable(db::DB) = Ref(db) struct BlockRow{T} - net::T + v4net::T geoname_id::Int - location::Location - registered_country_geoname_id::Int + location::Union{Location, Missing} + registered_country_geoname_id::Union{Int, Missing} is_anonymous_proxy::Int is_satellite_provider::Int - postal_code::String - accuracy_radius::Int + postal_code::Union{String, Missing} + accuracy_radius::Union{Int, Missing} end function BlockRow(csvrow) net = IPNets.IPv4Net(csvrow.network) geoname_id = ismissing(csvrow.geoname_id) ? -1 : csvrow.geoname_id location = Location(csvrow.longitude, csvrow.latitude) - registered_country_geoname_id = ismissing(csvrow.registered_country_geoname_id) ? -1 : csvrow.registered_country_geoname_id - accuracy_radius = ismissing(csvrow.accuracy_radius) ? -1 : csvrow.accuracy_radius - postal_code = ismissing(csvrow.postal_code) ? "" : csvrow.postal_code + registered_country_geoname_id = csvrow.registered_country_geoname_id + accuracy_radius = get(csvrow, :accuracy_radius, missing) + postal_code = csvrow.postal_code BlockRow( net, @@ -108,10 +108,10 @@ function loadzip(datadir, zipfile) local locs try for f in r.files - if occursin("GeoLite2-City-Locations-en.csv", f.name) + if occursin("City-Locations-en.csv", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) locs = read!(f, v) |> CSV.File - elseif occursin("GeoLite2-City-Blocks-IPv4.csv", f.name) + elseif occursin("City-Blocks-IPv4.csv", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) blocks = read!(f, v) |> CSV.File end @@ -143,36 +143,12 @@ function load(; zipfile = "", loadzip(datadir, zipfile) end - # TODO: All of this is slow and should be improved. No DataFrame is needed actually - # Clean up unneeded columns and map others to appropriate data structures blockdb = BlockRow.(blocks) - sort!(blockdb, by = x -> x.net) - index = map(x -> x.net, blockdb) + sort!(blockdb, by = x -> x.v4net) + index = map(x -> x.v4net, blockdb) locsdb = collect(locs) sort!(locsdb, by = x -> x.geoname_id) locindex = map(x -> x.geoname_id, locsdb) return DB(index, locindex, blockdb, locsdb) - - # return blocks[1:100] - - # select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider])) - - # blocks[!, :v4net] = map(x -> IPNets.IPv4Net(x), blocks[!, :network]) - # select!(blocks, Not(:network)) - - # blocks[!, :location] = map(Location, blocks[!, :longitude], blocks[!, :latitude]) - # select!(blocks, Not([:longitude, :latitude])) - # blocks.geoname_id = map(x -> ismissing(x) ? -1 : Int(x), blocks.geoname_id) - - # alldata = leftjoin(blocks, locs, on = :geoname_id) - - # df = sort!(alldata, :v4net) - # db = Vector{Dict{String, Any}}(undef, size(df, 1)) - # for i in axes(df, 1) - # row = df[i, :] - # db[i] = Dict(collect(zip(names(row), row))) - # end - - # return DB(df.v4net, db) end diff --git a/src/geoip-module.jl b/src/geoip-module.jl index 6957feb..cbd0681 100644 --- a/src/geoip-module.jl +++ b/src/geoip-module.jl @@ -18,7 +18,7 @@ function geolocate(geodata::DB, ip::IPv4) end row = geodata.blocks[idx] - res["net"] = row.net + res["v4net"] = row.v4net res["geoname_id"] = row.geoname_id res["location"] = row.location res["registered_country_geoname_id"] = row.registered_country_geoname_id From 32ebeaa6af1983706f63e4fee408ddce96506ef4 Mon Sep 17 00:00:00 2001 From: Andrey Oskin Date: Fri, 30 Apr 2021 13:08:32 +0300 Subject: [PATCH 4/4] fixed CSV load error --- src/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data.jl b/src/data.jl index d1f2b37..127504c 100644 --- a/src/data.jl +++ b/src/data.jl @@ -86,7 +86,7 @@ function loadgz(datadir, blockcsvgz, citycsvgz) local locs try blocks = GZip.open(blockfile, "r") do stream - CSV.File(read(stream)) + CSV.File(read(stream); types = Dict(:postal_code => String)) end locs = GZip.open(locfile, "r") do stream CSV.File(read(stream)) @@ -113,7 +113,7 @@ function loadzip(datadir, zipfile) locs = read!(f, v) |> CSV.File elseif occursin("City-Blocks-IPv4.csv", f.name) v = Vector{UInt8}(undef, f.uncompressedsize) - blocks = read!(f, v) |> CSV.File + blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String)) end end catch