Skip to content

Commit

Permalink
Merge pull request #53 from Arkoniak/geolocate_improve
Browse files Browse the repository at this point in the history
Updated geolocate speed
  • Loading branch information
Arkoniak authored Apr 30, 2021
2 parents 66cc4a7 + 32ebeaa commit a000677
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 61 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "GeoIP"
uuid = "abcde121-99a1-4d4f-92c5-2a3c6888d26a"
authors = ["Andrey Oskin", "Seth Bromberger", "contributors: https://github.com/JuliaWeb/GeoIP.jl/graphs/contributors"]
version = "0.5.1"
version = "0.5.2"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
93 changes: 71 additions & 22 deletions src/data.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,68 @@
########################################
# Location structure
########################################
# It would be great to replace this with a real GIS package.
abstract type Point end
abstract type Point3D <: Point end

"""
    Location(x, y, z = 0, datum = "WGS84")

Three-coordinate point tagged with a `datum` string.

The constructor yields `missing` instead of a `Location` when either `x` or `y`
is `missing`; otherwise the coordinates are converted to `Float64` by `new`.
"""
struct Location <: Point3D
    x::Float64
    y::Float64
    z::Float64
    datum::String

    function Location(x, y, z = 0, datum = "WGS84")
        # A point without both planar coordinates cannot be represented.
        (ismissing(x) || ismissing(y)) && return missing
        return new(x, y, z, datum)
    end
end

# `DB` used to be a tiny wrapper over a DataFrame. It exists so that the user API
# does not change while the internal representation does; it now stores plain
# vectors that `geolocate` can search with binary search.
struct DB{T1, T2}
    index::Vector{IPv4Net}   # `v4net` of every entry of `blocks`, in the same (sorted) order
    locindex::Vector{Int}    # `geoname_id` of every entry of `locs`, in the same (sorted) order
    blocks::Vector{T1}       # block rows, sorted by `v4net` in `load`
    locs::Vector{T2}         # location rows, sorted by `geoname_id` in `load`
end

# Wrap the database in a `Ref` so that broadcasting (e.g. `geolocate.(db, ips)`)
# treats it as a single scalar argument instead of trying to iterate it.
function Base.broadcastable(db::DB)
    return Ref(db)
end

"""
    BlockRow{T}

One record of the IPv4 blocks table. `T` is the type used to store the network
(`BlockRow(csvrow)` fills it with an `IPNets.IPv4Net`).
"""
struct BlockRow{T}
    # Network this record applies to.
    v4net::T
    # Key into the locations table; `BlockRow(csvrow)` stores -1 when the CSV value is missing.
    geoname_id::Int
    # Built from the longitude/latitude columns; `missing` when either is missing.
    location::Union{Location, Missing}
    registered_country_geoname_id::Union{Int, Missing}
    # Stored as integers, exactly as read from the CSV.
    is_anonymous_proxy::Int
    is_satellite_provider::Int
    postal_code::Union{String, Missing}
    accuracy_radius::Union{Int, Missing}
end

"""
    BlockRow(csvrow)

Build a `BlockRow` from one row of the blocks CSV.

The `network` column is parsed into an `IPNets.IPv4Net`, a missing `geoname_id`
is replaced by `-1`, longitude/latitude are combined into a `Location` (or
`missing`), and `accuracy_radius` falls back to `missing` when the row has no
such column. All other columns are copied as they are.
"""
function BlockRow(csvrow)
    return BlockRow(
        IPNets.IPv4Net(csvrow.network),
        coalesce(csvrow.geoname_id, -1),
        Location(csvrow.longitude, csvrow.latitude),
        csvrow.registered_country_geoname_id,
        csvrow.is_anonymous_proxy,
        csvrow.is_satellite_provider,
        csvrow.postal_code,
        get(csvrow, :accuracy_radius, missing),
    )
end

# Path to directory with data, can define GEOIP_DATADIR to override
# the default (useful for testing with a smaller test set)
function getdatadir(datadir)
Expand All @@ -30,12 +86,10 @@ function loadgz(datadir, blockcsvgz, citycsvgz)
local locs
try
blocks = GZip.open(blockfile, "r") do stream
CSV.File(read(stream)) |> DataFrame
# CSV.File(stream, types=[String, Int, Int, String, Int, Int, String, Float64, Float64, Int]) |> DataFrame
CSV.File(read(stream); types = Dict(:postal_code => String))
end
locs = GZip.open(locfile, "r") do stream
# CSV.File(stream, types=[Int, String, String, String, String, String, String, String, String, String, String, Int, String, Int]) |> DataFrame
CSV.File(read(stream)) |> DataFrame
CSV.File(read(stream))
end
catch
@error "Geolocation data cannot be read. Data directory may be corrupt..."
Expand All @@ -54,12 +108,12 @@ function loadzip(datadir, zipfile)
local locs
try
for f in r.files
if occursin("GeoLite2-City-Locations-en.csv", f.name)
if occursin("City-Locations-en.csv", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
locs = read!(f, v) |> CSV.File |> DataFrame
elseif occursin("GeoLite2-City-Blocks-IPv4.csv", f.name)
locs = read!(f, v) |> CSV.File
elseif occursin("City-Blocks-IPv4.csv", f.name)
v = Vector{UInt8}(undef, f.uncompressedsize)
blocks = read!(f, v) |> CSV.File |> DataFrame
blocks = read!(f, v) |> x -> CSV.File(x; types = Dict(:postal_code => String))
end
end
catch
Expand Down Expand Up @@ -89,17 +143,12 @@ function load(; zipfile = "",
loadzip(datadir, zipfile)
end

# Clean up unneeded columns and map others to appropriate data structures
select!(blocks, Not([:represented_country_geoname_id, :is_anonymous_proxy, :is_satellite_provider]))

blocks[!, :v4net] = map(x -> IPNets.IPv4Net(x), blocks[!, :network])
select!(blocks, Not(:network))

blocks[!, :location] = map(Location, blocks[!, :longitude], blocks[!, :latitude])
select!(blocks, Not([:longitude, :latitude]))
blocks.geoname_id = map(x -> ismissing(x) ? -1 : Int(x), blocks.geoname_id)

alldata = leftjoin(blocks, locs, on = :geoname_id)
blockdb = BlockRow.(blocks)
sort!(blockdb, by = x -> x.v4net)
index = map(x -> x.v4net, blockdb)
locsdb = collect(locs)
sort!(locsdb, by = x -> x.geoname_id)
locindex = map(x -> x.geoname_id, locsdb)

return DB(sort!(alldata, :v4net))
return DB(index, locindex, blockdb, locsdb)
end
68 changes: 30 additions & 38 deletions src/geoip-module.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,3 @@
########################################
# Location structure
########################################
# It would be great to replace this with a real GIS package.
abstract type Point end
abstract type Point3D <: Point end

"""
    Location(x, y, z = 0, datum = "WGS84")

Three-coordinate point tagged with a `datum` string.

The constructor yields `missing` instead of a `Location` when either `x` or `y`
is `missing`; otherwise the coordinates are converted to `Float64` by `new`.
"""
struct Location <: Point3D
    x::Float64
    y::Float64
    z::Float64
    datum::String

    function Location(x, y, z = 0, datum = "WGS84")
        # A point without both planar coordinates cannot be represented.
        (ismissing(x) || ismissing(y)) && return missing
        return new(x, y, z, datum)
    end
end

########################################
# Geolocation functions
########################################
Expand All @@ -30,26 +8,40 @@ Returns geolocation and other information determined in `geodata` by `ip`.
"""
function geolocate(geodata::DB, ip::IPv4)
    # Returns a `Dict{String, Any}` with the block fields and, when the block's
    # `geoname_id` is present in the locations table, every column of that
    # location row. An empty dictionary means no block contains `ip`.
    # TODO: Dict should be changed to a more suitable structure
    res = Dict{String, Any}()

    # `geodata.index` is sorted, so the candidate block is the last network that
    # sorts at or before the /32 network of `ip`. `searchsortedlast` is used
    # instead of `searchsortedfirst(...) - 1` because the latter steps past an
    # entry that compares equal to `ipnet` (an exact /32 block) and misses it.
    ipnet = IPv4Net(ip, 32)
    idx = searchsortedlast(geodata.index, ipnet)
    if idx == 0 || !(ip in geodata.index[idx])
        return res
    end
    row = geodata.blocks[idx]

    res["v4net"] = row.v4net
    res["geoname_id"] = row.geoname_id
    res["location"] = row.location
    res["registered_country_geoname_id"] = row.registered_country_geoname_id
    res["is_anonymous_proxy"] = row.is_anonymous_proxy
    res["is_satellite_provider"] = row.is_satellite_provider
    res["postal_code"] = row.postal_code
    res["accuracy_radius"] = row.accuracy_radius

    # Look the location row up by `geoname_id`. `searchsortedfirst` never returns
    # an index below the first one, so only the upper bound and an exact key
    # match need to be checked.
    geoname_id = row.geoname_id
    idx2 = searchsortedfirst(geodata.locindex, geoname_id)
    if idx2 > length(geodata.locs) || geodata.locindex[idx2] != geoname_id
        return res
    end

    # Copy every column of the location row under its column name.
    row2 = geodata.locs[idx2]
    for k in keys(row2)
        res[string(k)] = row2[k]
    end

    return res
end

geolocate(geodata::DB, ipstr::AbstractString) = geolocate(geodata, IPv4(ipstr))
Expand Down

2 comments on commit a000677

@Arkoniak
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/35711

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.5.2 -m "<description of version>" a0006775c17fbf95c1387e7947291370bb16594d
git push origin v0.5.2

Please sign in to comment.