For small files, don't recompile anything #284

Merged · 2 commits · Sep 20, 2018
20 changes: 12 additions & 8 deletions src/CSV.jl
@@ -23,9 +23,9 @@ function Base.showerror(io::IO, e::Error)
     showerror(io, e.error)
 end
 
-# could just store NT internally and have Row take it as type parameter
-# could separate kwargs out into individual fields and remove type parameter
-struct File{NT, transpose, I, P, KW}
+struct File{transpose, columnaccess, I, P, KW}
+    names::Vector{Symbol}
+    types::Vector{Type}
     name::String
     io::I
     parsinglayers::P
@@ -39,11 +39,13 @@ struct File{NT, transpose, I, P, KW}
     strict::Bool
 end
 
-function Base.show(io::IO, f::File{NT, transpose}) where {NT, transpose}
+function Base.show(io::IO, f::File{transpose}) where {transpose}
     println(io, "CSV.File(\"$(f.name)\", rows=$(transpose ? missing : length(f.positions))):")
     show(io, Tables.schema(f))
 end
 
+const EMPTY_TYPEMAP = Dict{Type, Type}()
+
 """
     CSV.File(source::Union{String, IO}; kwargs...) => CSV.File
 
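The heart of this PR is visible in the hunks above: column names and types move out of `File`'s type parameters (`File{NT, ...}`) and into plain `names`/`types` fields, so a single concrete `File` type now covers every schema. A minimal sketch of why that avoids recompiling per file — illustrative only, not package code:

    # Schema in the type: every distinct schema is a distinct concrete type,
    # so every method touching it gets compiled again for each new file.
    struct SchemaInType{names, types} end

    # Schema in fields: one concrete type covers all schemas, so downstream
    # methods compile exactly once.
    struct SchemaInFields
        names::Vector{Symbol}
        types::Vector{Type}
    end

    f1 = SchemaInFields([:a, :b], [Int, Float64])
    f2 = SchemaInFields([:x], [String])
    typeof(f1) === typeof(f2)  # true: the second file triggers no new compilation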
@@ -126,7 +128,7 @@ function File(source::Union{String, IO};
    falsestrings::Union{Vector{String}, Nothing}=nothing,
    # type options
    types=nothing,
-    typemap::Dict=Dict{Type, Type}(),
+    typemap::Dict=EMPTY_TYPEMAP,
    allowmissing::Symbol=:all,
    categorical::Bool=false,
    strict::Bool=false,
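`EMPTY_TYPEMAP` replaces the `Dict{Type, Type}()` default above, presumably so the no-op default is one shared dictionary rather than a fresh allocation on every call. A sketch of the difference, with hypothetical functions:

    g(; m = Dict{Type, Type}()) = length(m)  # builds a new Dict on every call
    const SHARED = Dict{Type, Type}()
    h(; m = SHARED) = length(m)              # reuses the same Dict each call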
@@ -156,13 +158,15 @@
        rows, names, positions = datalayout_transpose(header, parsinglayers, io, datarow, footerskip, normalizenames)
        originalpositions = copy(positions)
        ref = Ref{Int}(min(something(limit, typemax(Int)), rows))
+        columnaccess = false
    else
        names, datapos = datalayout(header, parsinglayers, io, datarow, normalizenames)
-        eof(io) && return File{NamedTuple{names, Tuple{(Missing for _ in names)...}}, false, typeof(io), typeof(parsinglayers), typeof(kwargs)}(getname(source), io, parsinglayers, Int64[], Int64[], Ref{Int}(0), Ref{Int}(0), Ref(Parsers.SUCCESS), kwargs, CategoricalPool{String, UInt32, CatStr}[], strict)
+        eof(io) && return File{false, true, typeof(io), typeof(parsinglayers), typeof(kwargs)}(names, Type[Missing for _ in names], getname(source), io, parsinglayers, Int64[], Int64[], Ref{Int}(0), Ref{Int}(0), Ref(Parsers.SUCCESS), kwargs, CategoricalPool{String, UInt32, CatStr}[], strict)
        positions = rowpositions(io, quotechar % UInt8, escapechar % UInt8, limit, parsinglayers, comment === nothing ? nothing : Parsers.Trie(comment))
        originalpositions = Int64[]
        footerskip > 0 && resize!(positions, length(positions) - footerskip)
        ref = Ref{Int}(0)
+        columnaccess = length(positions) <= 1024
        debug && @show positions
    end
 
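`columnaccess` is decided purely by row count: files of at most 1024 rows advertise column access, which steers Tables.jl sinks onto the dynamic, non-specializing path added in src/tables.jl below — for that little data, parsing is too cheap to justify compiling schema-specific code. A hedged sketch of how a sink reacts to the flag (simplified; real sinks go through Tables.jl's own dispatch):

    function materialize(file)
        if Tables.columnaccess(typeof(file))
            return Tables.columns(file)  # small file: untyped column materialization
        else
            return Tables.rows(file)     # large file: schema-specialized row iteration
        end
    end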
@@ -179,7 +183,7 @@
    end
 
    !transpose && seek(io, positions[1])
-    return File{NamedTuple{names, Tuple{types...}}, transpose, typeof(io), typeof(parsinglayers), typeof(kwargs)}(getname(source), io, parsinglayers, positions, originalpositions, Ref(1), ref, Ref(Parsers.SUCCESS), kwargs, pools, strict)
+    return File{transpose, columnaccess, typeof(io), typeof(parsinglayers), typeof(kwargs)}(names, types, getname(source), io, parsinglayers, positions, originalpositions, Ref(1), ref, Ref(Parsers.SUCCESS), kwargs, pools, strict)
 end
 
 include("filedetection.jl")
@@ -230,7 +234,7 @@ include("validate.jl")
 function __init__()
    # read a dummy file to "precompile" a bunch of methods; until the `precompile` function
    # can properly store machine code, this is our best option
-    CSV.File(joinpath(@__DIR__, "../test/testfiles/test_utf8.csv"), allowmissing=:auto) |> columntable
+    CSV.File(joinpath(@__DIR__, "../test/testfiles/test_utf8.csv"), allowmissing=:auto) |> DataFrame
    return
 end

16 changes: 8 additions & 8 deletions src/filedetection.jl
@@ -28,7 +28,7 @@ function makeunique(names)
        end
        push!(nms, nm)
    end
-    return Tuple(nms)
+    return nms
 end
 
 function skiptoheader!(parsinglayers, io, row, header)
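Returning a `Vector{Symbol}` instead of a `Tuple` here (and in the `datalayout` methods below) is the same de-specialization move: a tuple's type encodes its length, so each header width used to produce fresh specializations, while every header is now one concrete type. For illustration:

    typeof((:a, :b))      # Tuple{Symbol, Symbol}
    typeof((:a, :b, :c))  # Tuple{Symbol, Symbol, Symbol} — a different type
    typeof([:a, :b]) === typeof([:a, :b, :c])  # true: both are Vector{Symbol}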
@@ -106,7 +106,7 @@ function datalayout_transpose(header, parsinglayers, io, datarow, footerskip, normalizenames)
        seek(io, datapos)
    end
    rows = rows - footerskip # rows now equals the actual number of rows in the dataset
-    return rows, makeunique(Tuple(normalizenames ? normalizename(x) : Symbol(x) for x in columnnames)), columnpositions
+    return rows, makeunique(map(x->normalizenames ? normalizename(x) : Symbol(x), columnnames)), columnpositions
 end
 
 function datalayout(header::Integer, parsinglayers, io, datarow, normalizenames)
@@ -117,10 +117,10 @@ function datalayout(header::Integer, parsinglayers, io, datarow, normalizenames)
        datapos = position(io)
        row_vals = readsplitline(parsinglayers, io)
        seek(io, datapos)
-        columnnames = Tuple(Symbol("Column$i") for i = eachindex(row_vals))
+        columnnames = [Symbol("Column$i") for i = eachindex(row_vals)]
    else
        skipto!(parsinglayers, io, 1, header)
-        columnnames = makeunique(Tuple(ismissing(x) ? Symbol("Column$i") : (normalizenames ? normalizename(x) : Symbol(x)) for (i, x) in enumerate(readsplitline(parsinglayers, io))))
+        columnnames = makeunique([ismissing(x) ? Symbol("Column$i") : (normalizenames ? normalizename(x) : Symbol(x)) for (i, x) in enumerate(readsplitline(parsinglayers, io))])
        datarow != header+1 && skipto!(parsinglayers, io, header+1, datarow)
        datapos = position(io)
    end
@@ -137,22 +137,22 @@ function datalayout(header::AbstractRange, parsinglayers, io, datarow, normalizenames)
    end
    datarow != last(header)+1 && skipto!(parsinglayers, io, last(header)+1, datarow)
    datapos = position(io)
-    return makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in columnnames)), datapos
+    return makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in columnnames]), datapos
 end
 
 function datalayout(header::Vector, parsinglayers, io, datarow, normalizenames)
    skipto!(parsinglayers, io, 1, datarow)
    datapos = position(io)
    if eof(io)
-        columnnames = makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in header))
+        columnnames = makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in header])
    else
        row_vals = readsplitline(parsinglayers, io)
        seek(io, datapos)
        if isempty(header)
-            columnnames = Tuple(Symbol("Column$i") for i in eachindex(row_vals))
+            columnnames = [Symbol("Column$i") for i in eachindex(row_vals)]
        else
            length(header) == length(row_vals) || throw(ArgumentError("The length of provided header ($(length(header))) doesn't match the number of columns at row $datarow ($(length(row_vals)))"))
-            columnnames = makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in header))
+            columnnames = makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in header])
        end
    end
    return columnnames, datapos
98 changes: 84 additions & 14 deletions src/tables.jl
@@ -1,28 +1,90 @@
 Tables.istable(::Type{<:File}) = true
-Tables.schema(f::File{NamedTuple{names, types}}) where {names, types} = Tables.Schema(names, types)
+Tables.schema(f::File) = Tables.Schema(f.names, f.types)
 
-struct Row{F}
+Tables.columnaccess(::Type{File{t, c, I, P, KW}}) where {t, c, I, P, KW} = c
+
+struct Columns
+    names
+    columns
+end
+Base.propertynames(c::Columns) = getfield(c, 1)
+function Base.getproperty(c::Columns, nm::Symbol)
+    getfield(c, 2)[findfirst(x->x==nm, getfield(c, 1))]
+end
+
+function Tables.columns(@nospecialize(f::File{transpose, true})) where {transpose}
+    len = length(f)
+    types = f.types
+    columns = Any[Tables.allocatecolumn(T, len) for T in types]
+    for (i, row) in enumerate(f)
+        for col = 1:length(types)
+            columns[col][i] = getproperty(row, types[col], col, :_)
+        end
+    end
+    return Columns(f.names, columns)
+end
+
+# row interfaces
+abstract type Row end
+
+struct UntypedRow{F} <: Row
     file::F
     row::Int
 end
+Base.propertynames(r::UntypedRow) = getfield(r, 1).names
+
+struct RowIterator{names, types, F}
+    file::F
+end
+Tables.schema(r::RowIterator) = Tables.Schema(r.file.names, r.file.types)
+
+struct TypedRow{F} <: Row
+    rowiterator::F
+    row::Int
+end
+Base.propertynames(r::TypedRow{F}) where {F <: RowIterator{names}} where {names} = names
 
 function Base.show(io::IO, r::Row)
     println(io, "CSV.Row($(getfield(r, 2))) of:")
     show(io, getfield(r, 1))
 end
 
-Base.propertynames(row::Row{F}) where {F <: File{NamedTuple{names, T}}} where {names, T} = names
+getranspose(r::UntypedRow{F}) where {F <: File{transpose}} where {transpose} = transpose
+getranspose(r::RowIterator{N, T, F}) where {N, T, F <: File{transpose}} where {transpose} = transpose
+getranspose(r::TypedRow) = getranspose(getfield(r, 1))
 
-Base.eltype(f::F) where {F <: File} = Row{F}
-Base.length(f::File{NT, transpose}) where {NT, transpose} = transpose ? f.lastparsedcol[] : length(f.positions)
-Base.size(f::File{NamedTuple{names, types}}) where {names, types} = (length(f), length(names))
-Base.size(f::File{NamedTuple{}}) = (0, 0)
+# File iterates UntypedRows
+Base.eltype(f::F) where {F <: File} = UntypedRow{F}
+Base.length(f::File{transpose}) where {transpose} = transpose ? f.lastparsedcol[] : length(f.positions)
+Base.size(f::File) = (length(f), length(f.names))
+
+@inline function Base.iterate(f::File{transpose}, st=1) where {transpose}
+    st > length(f) && return nothing
+    # println("row=$st")
+    if transpose
+        st === 1 && (f.positions .= f.originalpositions)
+    else
+        @inbounds Parsers.fastseek!(f.io, f.positions[st])
+        f.currentrow[] = st
+        f.lastparsedcol[] = 0
+        f.lastparsedcode[] = Parsers.SUCCESS
+    end
+    return UntypedRow(f, st), st + 1
+end
 
+# Tables.rows(f) -> RowIterator
 Tables.rowaccess(::Type{<:File}) = true
-Tables.rows(f::File) = f
+Tables.rows(f::F) where {F <: File} = RowIterator{Tuple(f.names), Tuple{f.types...}, F}(f)
 
-@inline function Base.iterate(f::File{NT, transpose}, st=1) where {NT, transpose}
+Base.eltype(f::R) where {R <: RowIterator} = TypedRow{R}
+Base.length(f::RowIterator) = getranspose(f) ? f.file.lastparsedcol[] : length(f.file.positions)
+Base.size(f::RowIterator) = (length(f.file), length(f.file.names))
+
+# RowIterator iterates TypedRows
+@inline function Base.iterate(r::RowIterator, st=1)
+    f = r.file
     st > length(f) && return nothing
+    transpose = getranspose(r)
     # println("row=$st")
     if transpose
         st === 1 && (f.positions .= f.originalpositions)
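Two complementary paths are set up above. `Columns` is deliberately untyped — no type parameters, and its `getproperty` is a linear `findfirst` over the names — and `Tables.columns` fills vectors from `Tables.allocatecolumn` while iterating untyped rows, so nothing in the small-file path specializes on the schema. `Tables.rows`, by contrast, re-enters the type domain: `RowIterator{names, types, F}` carries the schema as type parameters, so large files keep fully specialized iteration. A hypothetical usage sketch (file and column names made up):

    f = CSV.File("small.csv")  # <= 1024 rows, so columnaccess is true
    cols = Tables.columns(f)   # the untyped Columns above; cheap to compile
    cols.a                     # column lookup happens at runtime

    for row in Tables.rows(f)  # TypedRow via the schema-typed RowIterator
        row.a                  # column index/type resolved at compile time
    end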
@@ -32,7 +94,7 @@ Tables.rows(f::File) = f
        f.lastparsedcol[] = 0
        f.lastparsedcode[] = Parsers.SUCCESS
    end
-    return Row(f, st), st + 1
+    return TypedRow(r, st), st + 1
 end
 
 parsingtype(::Type{Missing}) = Missing
@@ -85,12 +147,21 @@
    return true
 end
 
-@inline Base.getproperty(csvrow::Row{F}, name::Symbol) where {F <: File{NamedTuple{names, types}}} where {names, types} =
+@inline function Base.getproperty(csvrow::UntypedRow, name::Symbol)
+    f = getfield(csvrow, 1)
+    i = findfirst(x->x===name, f.names)
+    return getproperty(csvrow, f.types[i], i, name)
+end
+@inline Base.getproperty(csvrow::TypedRow{F}, name::Symbol) where {F <: RowIterator{names, types}} where {names, types} =
    getproperty(csvrow, Tables.columntype(names, types, name), Tables.columnindex(names, name), name)
 
-function Base.getproperty(csvrow::Row{F}, ::Type{T}, col::Int, name::Symbol) where {T, F <: File{NT, transpose}} where {NT, transpose}
+getfile(f::File) = f
+getfile(r::RowIterator) = r.file
+
+function Base.getproperty(csvrow::Row, ::Type{T}, col::Int, name::Symbol) where {T}
    col === 0 && return missing
-    f = getfield(csvrow, 1)
+    transpose = getranspose(csvrow)
+    f = getfile(getfield(csvrow, 1))
    row = getfield(csvrow, 2)
    if transpose
        @inbounds Parsers.fastseek!(f.io, f.positions[col])
@@ -128,4 +199,3 @@ function Base.getproperty(csvrow::Row{F}, ::Type{T}, col::Int, name::Symbol) where {T, F <: File{NT, transpose}} where {NT, transpose}
    end
    return r
 end
-
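The two `getproperty` methods make the tradeoff explicit: `UntypedRow` resolves the column with a runtime `findfirst`, while `TypedRow` calls `Tables.columnindex`/`Tables.columntype` on type-level names and types, which the compiler can fold to constants during specialization. Roughly, with illustrative names:

    names = [:a, :b, :c]
    i = findfirst(x -> x === :b, names)  # UntypedRow: O(ncols) scan per access
    # TypedRow: with (:a, :b, :c) in the type, Tables.columnindex reduces the
    # same lookup to the constant 2 while the method is being compiled.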
10 changes: 5 additions & 5 deletions src/typedetection.jl
@@ -34,11 +34,11 @@ promote_type2(::Type{String}, ::Type{String}) = String
 
 # providedtypes: Dict{String, Type}, Dict{Int, Type}, Vector{Type}
 initialtype(allowmissing) = (allowmissing === :auto || allowmissing === :none) ? Union{} : Missing
-initialtypes(T, ::Nothing, names) = Any[T for _ = 1:length(names)]
-initialtypes(T, t::Dict{String, V}, names) where {V} = Any[get(t, string(nm), T) for nm in names]
-initialtypes(T, t::Dict{Symbol, V}, names) where {V} = Any[get(t, nm, T) for nm in names]
-initialtypes(T, t::Dict{Int, V}, names) where {V} = Any[get(t, i, T) for i = 1:length(names)]
-initialtypes(T, t::Vector, names) = length(t) == length(names) ? collect(Any, t) : throw(ArgumentError("length of user provided types ($(length(t))) does not match length of header ($(length(names)))"))
+initialtypes(T, ::Nothing, names) = Type[T for _ = 1:length(names)]
+initialtypes(T, t::Dict{String, V}, names) where {V} = Type[get(t, string(nm), T) for nm in names]
+initialtypes(T, t::Dict{Symbol, V}, names) where {V} = Type[get(t, nm, T) for nm in names]
+initialtypes(T, t::Dict{Int, V}, names) where {V} = Type[get(t, i, T) for i = 1:length(names)]
+initialtypes(T, t::Vector, names) = length(t) == length(names) ? collect(Type, t) : throw(ArgumentError("length of user provided types ($(length(t))) does not match length of header ($(length(names)))"))
 
 struct Fib{N}
    len::Int
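Switching the `initialtypes` containers from `Any[...]` to `Type[...]` matches the new `types::Vector{Type}` field on `File`, so detected types can be stored without conversion. Same contents, different eltype:

    t_old = Any[Int, Union{Float64, Missing}]   # Vector{Any}
    t_new = Type[Int, Union{Float64, Missing}]  # Vector{Type}, matches File.types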
2 changes: 1 addition & 1 deletion test/testfiles/testfiles.jl
@@ -1,4 +1,4 @@
-getschema(f::CSV.File{NT}) where NT = NT
+getschema(f::CSV.File) = NamedTuple{Tuple(f.names), Tuple{f.types...}}
 
 function testfile(file, kwargs, sz, sch, testfunc)
     f = CSV.File(file isa IO ? file : joinpath(dir, file); kwargs...)
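The test helper now rebuilds the type-level schema on demand from the runtime fields — it is still derivable whenever a test wants it, it just no longer lives in `File`'s type. For a hypothetical file with columns `a::Int64` and `b::Float64`, `getschema(f)` would return `NamedTuple{(:a, :b), Tuple{Int64, Float64}}`.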