Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated geolocate speed #53

Merged
merged 4 commits into from
Apr 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "GeoIP"
uuid = "abcde121-99a1-4d4f-92c5-2a3c6888d26a"
authors = ["Andrey Oskin", "Seth Bromberger", "contributors: https://github.com/JuliaWeb/GeoIP.jl/graphs/contributors"]
version = "0.5.1"
version = "0.5.2"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
93 changes: 71 additions & 22 deletions src/data.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,68 @@
########################################
# Location structure
########################################
# It would be great to replace this with a real GIS package.
# Root of the point hierarchy; `Point3D` is the supertype of three-coordinate points.
abstract type Point end
abstract type Point3D <: Point end

"""
    Location(x, y, z = 0, datum = "WGS84")

Three-coordinate point tagged with the name of its datum.

The constructor returns `missing` (not a `Location`) when either `x` or `y` is
`missing`. Otherwise the arguments are stored in the `Float64` coordinate fields
and the `String` datum field.
"""
struct Location <: Point3D
    x::Float64
    y::Float64
    z::Float64
    datum::String

    function Location(x, y, z = 0, datum = "WGS84")
        # Without both of the first two coordinates there is no usable point.
        (ismissing(x) || ismissing(y)) && return missing
        return new(x, y, z, datum)
    end
end

# Currently it's just a thin wrapper over DataFrames, but at some point it should
# become a completely different entity. The user API shouldn't change, though, which
# is why we introduce this wrapper.
struct DB
db::DataFrame
struct DB{T1, T2}
index::Vector{IPv4Net}
locindex::Vector{Int}
blocks::Vector{T1}
locs::Vector{T2}
end

# Wrap the database in a `Ref` so that broadcasting (e.g. `geolocate.(db, ips)`)
# treats it as a single value rather than trying to iterate over it.
function Base.broadcastable(db::DB)
    return Ref(db)
end

"""
    BlockRow{T}

One record of the network blocks table. `T` is the type used to represent the
network stored in `v4net`; the remaining fields carry the per-network attributes.
"""
struct BlockRow{T}
    # Network this record describes.
    v4net::T
    # Identifier used to look up the matching locations record.
    geoname_id::Int
    # Coordinates of the block, or `missing` when unavailable.
    location::Union{Missing, Location}
    registered_country_geoname_id::Union{Missing, Int}
    # Stored as integers, exactly as supplied by the caller.
    is_anonymous_proxy::Int
    is_satellite_provider::Int
    postal_code::Union{Missing, String}
    accuracy_radius::Union{Missing, Int}
end

"""
    BlockRow(csvrow)

Build a `BlockRow` from a single row of the blocks table.

`csvrow` must expose `network`, `geoname_id`, `longitude`, `latitude`,
`registered_country_geoname_id`, `postal_code`, `is_anonymous_proxy` and
`is_satellite_provider` as properties. `accuracy_radius` is looked up with `get`
and falls back to `missing` when the row does not provide it.
"""
function BlockRow(csvrow)
    network = IPNets.IPv4Net(csvrow.network)
    # -1 stands in for a missing geoname id so the field can stay a plain `Int`.
    id = coalesce(csvrow.geoname_id, -1)
    # `Location` yields `missing` when either coordinate is missing.
    place = Location(csvrow.longitude, csvrow.latitude)
    country_id = csvrow.registered_country_geoname_id
    radius = get(csvrow, :accuracy_radius, missing)
    zip_code = csvrow.postal_code

    return BlockRow(
        network,
        id,
        place,
        country_id,
        csvrow.is_anonymous_proxy,
        csvrow.is_satellite_provider,
        zip_code,
        radius,
    )
end

# Path to directory with data, can define GEOIP_DATADIR to override
# the default (useful for testing with a smaller test set)
function getdatadir(datadir)
Expand All @@ -30,12 +86,10 @@ function loadgz(datadir, blockcsvgz, citycsvgz)
local locs
try
blocks = GZip.open(blockfile, "r") do stream
CSV.File(read(stream)) |> DataFrame
# CSV.File(stream, types=[String, Int, Int, String, Int, Int, String, Float64, Float64, Int]) |> DataFrame
CSV.File(read(stream); types = Dict(:postal_code => String))
end
locs = GZip.open(locfile, "r") do stream
# CSV.File(stream, types=[Int, String, String, String, String, String, String, String, String, String, String, Int, String, Int]) |> DataFrame
CSV.File(read(stream)) |> DataFrame
CSV.File(read(stream))
end
catch
@error "Geolocation data cannot be read. Data directory may be corrupt..."
Expand All @@ -54,12 +108,12 @@ function loadzip(datadir, zipfile)
local locs
try
for f in r.files
if occursin("GeoLite2-City-Locations-en.csv", f.name)
if occursin("City-Locations-en.csv", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
locs = read!(f, v) |> CSV.File |> DataFrame
elseif occursin("GeoLite2-City-Blocks-IPv4.csv", f.name)
locs = read!(f, v) |> CSV.File
elseif occursin("City-Blocks-IPv4.csv", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
blocks = read!(f, v) |> CSV.File |> DataFrame
blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String))
end
end
catch
Expand Down Expand Up @@ -89,17 +143,12 @@ function load(; zipfile = "",
loadzip(datadir, zipfile)
end

# Clean up unneeded columns and map others to appropriate data structures
select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider]))

blocks[!, :v4net] = map(x -> IPNets.IPv4Net(x), blocks[!, :network])
select!(blocks, Not(:network))

blocks[!, :location] = map(Location, blocks[!, :longitude], blocks[!, :latitude])
select!(blocks, Not([:longitude, :latitude]))
blocks.geoname_id = map(x -> ismissing(x) ? -1 : Int(x), blocks.geoname_id)

alldata = leftjoin(blocks, locs, on = :geoname_id)
blockdb = BlockRow.(blocks)
sort!(blockdb, by = x -> x.v4net)
index = map(x -> x.v4net, blockdb)
locsdb = collect(locs)
sort!(locsdb, by = x -> x.geoname_id)
locindex = map(x -> x.geoname_id, locsdb)

return DB(sort!(alldata, :v4net))
return DB(index, locindex, blockdb, locsdb)
end
68 changes: 30 additions & 38 deletions src/geoip-module.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,3 @@
########################################
# Location structure
########################################
# It would be great to replace this with a real GIS package.
# Root of the point hierarchy; `Point3D` is the supertype of three-coordinate points.
abstract type Point end
abstract type Point3D <: Point end

"""
    Location(x, y, z = 0, datum = "WGS84")

Three-coordinate point tagged with the name of its datum.

The constructor returns `missing` (not a `Location`) when either `x` or `y` is
`missing`. Otherwise the arguments are stored in the `Float64` coordinate fields
and the `String` datum field.
"""
struct Location <: Point3D
    x::Float64
    y::Float64
    z::Float64
    datum::String

    function Location(x, y, z = 0, datum = "WGS84")
        # Without both of the first two coordinates there is no usable point.
        (ismissing(x) || ismissing(y)) && return missing
        return new(x, y, z, datum)
    end
end

########################################
# Geolocation functions
########################################
Expand All @@ -30,26 +8,40 @@ Returns geolocation and other information determined in `geodata` by `ip`.
"""
function geolocate(geodata::DB, ip::IPv4)
ipnet = IPv4Net(ip, 32)
db = geodata.db

# only iterate over rows that actually make sense - this filter is
# less expensive than iteration with in().
found = 0
for i in axes(db, 1) # iterate over rows
if db[i, :v4net] > ipnet
found = i - 1
break
end
# TODO: Dict should be changed to a more suitable structure
res = Dict{String, Any}()

idx = searchsortedfirst(geodata.index, ipnet) - 1
if idx == 0 || !(ip in geodata.index[idx])
return res
end
row = geodata.blocks[idx]

res["v4net"] = row.v4net
res["geoname_id"] = row.geoname_id
res["location"] = row.location
res["registered_country_geoname_id"] = row.registered_country_geoname_id
res["is_anonymous_proxy"] = row.is_anonymous_proxy
res["is_satellite_provider"] = row.is_satellite_provider
res["postal_code"] = row.postal_code
res["accuracy_radius"] = row.accuracy_radius

# TODO: sentinel value should be returned
retdict = Dict{String, Any}()
if (found > 0) && ip in db[found, :v4net]
# Placeholder, should be removed
row = db[found, :]
return Dict(collect(zip(names(row), row)))
geoname_id = row.geoname_id
idx2 = searchsortedfirst(geodata.locindex, geoname_id)
if idx2 > length(geodata.locs) || idx2 < 1
return res
end
if geodata.locindex[idx2] != geoname_id
return res
end
return retdict

row2 = geodata.locs[idx2]
for k in keys(row2)
res[string(k)] = row2[k]
end

return res
end

geolocate(geodata::DB, ipstr::AbstractString) = geolocate(geodata, IPv4(ipstr))
Expand Down