For small files, don't recompile anything #284

Merged · 2 commits · Sep 20, 2018
20 changes: 12 additions & 8 deletions src/CSV.jl
@@ -23,9 +23,9 @@ function Base.showerror(io::IO, e::Error)
     showerror(io, e.error)
 end
 
-# could just store NT internally and have Row take it as type parameter
-# could separate kwargs out into individual fields and remove type parameter
-struct File{NT, transpose, I, P, KW}
+struct File{transpose, columnaccess, I, P, KW}
+    names::Vector{Symbol}
+    types::Vector{Type}
     name::String
     io::I
     parsinglayers::P
@@ -39,11 +39,13 @@ struct File{NT, transpose, I, P, KW}
     strict::Bool
 end
 
-function Base.show(io::IO, f::File{NT, transpose}) where {NT, transpose}
+function Base.show(io::IO, f::File{transpose}) where {transpose}
     println(io, "CSV.File(\"$(f.name)\", rows=$(transpose ? missing : length(f.positions))):")
     show(io, Tables.schema(f))
 end
 
+const EMPTY_TYPEMAP = Dict{Type, Type}()
+
 """
     CSV.File(source::Union{String, IO}; kwargs...) => CSV.File
 
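The heart of this PR is visible in the hunks above: column names and types move out of `File`'s type parameters (`File{NT, ...}`) and into plain `names`/`types` fields, so a single concrete `File` type now covers every schema. A minimal sketch of why that avoids recompiling per file — illustrative only, not package code:

    # Schema in the type: every distinct schema is a distinct concrete type,
    # so every method touching it gets compiled again for each new file.
    struct SchemaInType{names, types} end

    # Schema in fields: one concrete type covers all schemas, so downstream
    # methods compile exactly once.
    struct SchemaInFields
        names::Vector{Symbol}
        types::Vector{Type}
    end

    f1 = SchemaInFields([:a, :b], [Int, Float64])
    f2 = SchemaInFields([:x], [String])
    typeof(f1) === typeof(f2)  # true: the second file triggers no new compilation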
@@ -126,7 +128,7 @@ function File(source::Union{String, IO};
    falsestrings::Union{Vector{String}, Nothing}=nothing,
    # type options
    types=nothing,
-    typemap::Dict=Dict{Type, Type}(),
+    typemap::Dict=EMPTY_TYPEMAP,
    allowmissing::Symbol=:all,
    categorical::Bool=false,
    strict::Bool=false,
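`EMPTY_TYPEMAP` replaces the `Dict{Type, Type}()` default above, presumably so the no-op default is one shared dictionary rather than a fresh allocation on every call. A sketch of the difference, with hypothetical functions:

    g(; m = Dict{Type, Type}()) = length(m)  # builds a new Dict on every call
    const SHARED = Dict{Type, Type}()
    h(; m = SHARED) = length(m)              # reuses the same Dict each call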
@@ -156,13 +158,15 @@
        rows, names, positions = datalayout_transpose(header, parsinglayers, io, datarow, footerskip, normalizenames)
        originalpositions = copy(positions)
        ref = Ref{Int}(min(something(limit, typemax(Int)), rows))
+        columnaccess = false
    else
        names, datapos = datalayout(header, parsinglayers, io, datarow, normalizenames)
-        eof(io) && return File{NamedTuple{names, Tuple{(Missing for _ in names)...}}, false, typeof(io), typeof(parsinglayers), typeof(kwargs)}(getname(source), io, parsinglayers, Int64[], Int64[], Ref{Int}(0), Ref{Int}(0), Ref(Parsers.SUCCESS), kwargs, CategoricalPool{String, UInt32, CatStr}[], strict)
+        eof(io) && return File{false, true, typeof(io), typeof(parsinglayers), typeof(kwargs)}(names, Type[Missing for _ in names], getname(source), io, parsinglayers, Int64[], Int64[], Ref{Int}(0), Ref{Int}(0), Ref(Parsers.SUCCESS), kwargs, CategoricalPool{String, UInt32, CatStr}[], strict)
        positions = rowpositions(io, quotechar % UInt8, escapechar % UInt8, limit, parsinglayers, comment === nothing ? nothing : Parsers.Trie(comment))
        originalpositions = Int64[]
        footerskip > 0 && resize!(positions, length(positions) - footerskip)
        ref = Ref{Int}(0)
+        columnaccess = length(positions) <= 1024
        debug && @show positions
    end
 
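`columnaccess` is decided purely by row count: files of at most 1024 rows advertise column access, which steers Tables.jl sinks onto the dynamic, non-specializing path added in src/tables.jl below — for that little data, parsing is too cheap to justify compiling schema-specific code. A hedged sketch of how a sink reacts to the flag (simplified; real sinks go through Tables.jl's own dispatch):

    function materialize(file)
        if Tables.columnaccess(typeof(file))
            return Tables.columns(file)  # small file: untyped column materialization
        else
            return Tables.rows(file)     # large file: schema-specialized row iteration
        end
    end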
@@ -179,7 +183,7 @@
    end
 
    !transpose && seek(io, positions[1])
-    return File{NamedTuple{names, Tuple{types...}}, transpose, typeof(io), typeof(parsinglayers), typeof(kwargs)}(getname(source), io, parsinglayers, positions, originalpositions, Ref(1), ref, Ref(Parsers.SUCCESS), kwargs, pools, strict)
+    return File{transpose, columnaccess, typeof(io), typeof(parsinglayers), typeof(kwargs)}(names, types, getname(source), io, parsinglayers, positions, originalpositions, Ref(1), ref, Ref(Parsers.SUCCESS), kwargs, pools, strict)
 end
 
 include("filedetection.jl")
@@ -230,7 +234,7 @@ include("validate.jl")
 function __init__()
    # read a dummy file to "precompile" a bunch of methods; until the `precompile` function
    # can properly store machine code, this is our best option
-    CSV.File(joinpath(@__DIR__, "../test/testfiles/test_utf8.csv"), allowmissing=:auto) |> columntable
+    CSV.File(joinpath(@__DIR__, "../test/testfiles/test_utf8.csv"), allowmissing=:auto) |> DataFrame
    return
 end

16 changes: 8 additions & 8 deletions src/filedetection.jl
@@ -28,7 +28,7 @@ function makeunique(names)
        end
        push!(nms, nm)
    end
-    return Tuple(nms)
+    return nms
 end
 
 function skiptoheader!(parsinglayers, io, row, header)
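Returning a `Vector{Symbol}` instead of a `Tuple` here (and in the `datalayout` methods below) is the same de-specialization move: a tuple's type encodes its length, so each header width used to produce fresh specializations, while every header is now one concrete type. For illustration:

    typeof((:a, :b))      # Tuple{Symbol, Symbol}
    typeof((:a, :b, :c))  # Tuple{Symbol, Symbol, Symbol} — a different type
    typeof([:a, :b]) === typeof([:a, :b, :c])  # true: both are Vector{Symbol}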
@@ -106,7 +106,7 @@ function datalayout_transpose(header, parsinglayers, io, datarow, footerskip, normalizenames)
        seek(io, datapos)
    end
    rows = rows - footerskip # rows now equals the actual number of rows in the dataset
-    return rows, makeunique(Tuple(normalizenames ? normalizename(x) : Symbol(x) for x in columnnames)), columnpositions
+    return rows, makeunique(map(x->normalizenames ? normalizename(x) : Symbol(x), columnnames)), columnpositions
 end
 
 function datalayout(header::Integer, parsinglayers, io, datarow, normalizenames)
@@ -117,10 +117,10 @@ function datalayout(header::Integer, parsinglayers, io, datarow, normalizenames)
        datapos = position(io)
        row_vals = readsplitline(parsinglayers, io)
        seek(io, datapos)
-        columnnames = Tuple(Symbol("Column$i") for i = eachindex(row_vals))
+        columnnames = [Symbol("Column$i") for i = eachindex(row_vals)]
    else
        skipto!(parsinglayers, io, 1, header)
-        columnnames = makeunique(Tuple(ismissing(x) ? Symbol("Column$i") : (normalizenames ? normalizename(x) : Symbol(x)) for (i, x) in enumerate(readsplitline(parsinglayers, io))))
+        columnnames = makeunique([ismissing(x) ? Symbol("Column$i") : (normalizenames ? normalizename(x) : Symbol(x)) for (i, x) in enumerate(readsplitline(parsinglayers, io))])
        datarow != header+1 && skipto!(parsinglayers, io, header+1, datarow)
        datapos = position(io)
    end
@@ -137,22 +137,22 @@ function datalayout(header::AbstractRange, parsinglayers, io, datarow, normalizenames)
    end
    datarow != last(header)+1 && skipto!(parsinglayers, io, last(header)+1, datarow)
    datapos = position(io)
-    return makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in columnnames)), datapos
+    return makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in columnnames]), datapos
 end
 
 function datalayout(header::Vector, parsinglayers, io, datarow, normalizenames)
    skipto!(parsinglayers, io, 1, datarow)
    datapos = position(io)
    if eof(io)
-        columnnames = makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in header))
+        columnnames = makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in header])
    else
        row_vals = readsplitline(parsinglayers, io)
        seek(io, datapos)
        if isempty(header)
-            columnnames = Tuple(Symbol("Column$i") for i in eachindex(row_vals))
+            columnnames = [Symbol("Column$i") for i in eachindex(row_vals)]
        else
            length(header) == length(row_vals) || throw(ArgumentError("The length of provided header ($(length(header))) doesn't match the number of columns at row $datarow ($(length(row_vals)))"))
-            columnnames = makeunique(Tuple(normalizenames ? normalizename(nm) : Symbol(nm) for nm in header))
+            columnnames = makeunique([normalizenames ? normalizename(nm) : Symbol(nm) for nm in header])
        end
    end
    return columnnames, datapos
98 changes: 84 additions & 14 deletions src/tables.jl
@@ -1,28 +1,90 @@
 Tables.istable(::Type{<:File}) = true
-Tables.schema(f::File{NamedTuple{names, types}}) where {names, types} = Tables.Schema(names, types)
+Tables.schema(f::File) = Tables.Schema(f.names, f.types)
 
-struct Row{F}
+Tables.columnaccess(::Type{File{t, c, I, P, KW}}) where {t, c, I, P, KW} = c
+
+struct Columns
+    names
+    columns
+end
+Base.propertynames(c::Columns) = getfield(c, 1)
+function Base.getproperty(c::Columns, nm::Symbol)
+    getfield(c, 2)[findfirst(x->x==nm, getfield(c, 1))]
+end
+
+function Tables.columns(@nospecialize(f::File{transpose, true})) where {transpose}
+    len = length(f)
+    types = f.types
+    columns = Any[Tables.allocatecolumn(T, len) for T in types]
+    for (i, row) in enumerate(f)
+        for col = 1:length(types)
+            columns[col][i] = getproperty(row, types[col], col, :_)
+        end
+    end
+    return Columns(f.names, columns)
+end
+
+# row interfaces
+abstract type Row end
+
+struct UntypedRow{F} <: Row
     file::F
     row::Int
 end
+Base.propertynames(r::UntypedRow) = getfield(r, 1).names
+
+struct RowIterator{names, types, F}
+    file::F
+end
+Tables.schema(r::RowIterator) = Tables.Schema(r.file.names, r.file.types)
+
+struct TypedRow{F} <: Row
+    rowiterator::F
+    row::Int
+end
+Base.propertynames(r::TypedRow{F}) where {F <: RowIterator{names}} where {names} = names
 
 function Base.show(io::IO, r::Row)
     println(io, "CSV.Row($(getfield(r, 2))) of:")
     show(io, getfield(r, 1))
 end
 
-Base.propertynames(row::Row{F}) where {F <: File{NamedTuple{names, T}}} where {names, T} = names
+getranspose(r::UntypedRow{F}) where {F <: File{transpose}} where {transpose} = transpose
+getranspose(r::RowIterator{N, T, F}) where {N, T, F <: File{transpose}} where {transpose} = transpose
+getranspose(r::TypedRow) = getranspose(getfield(r, 1))
 
-Base.eltype(f::F) where {F <: File} = Row{F}
-Base.length(f::File{NT, transpose}) where {NT, transpose} = transpose ? f.lastparsedcol[] : length(f.positions)
-Base.size(f::File{NamedTuple{names, types}}) where {names, types} = (length(f), length(names))
-Base.size(f::File{NamedTuple{}}) = (0, 0)
+# File iterates UntypedRows
+Base.eltype(f::F) where {F <: File} = UntypedRow{F}
+Base.length(f::File{transpose}) where {transpose} = transpose ? f.lastparsedcol[] : length(f.positions)
+Base.size(f::File) = (length(f), length(f.names))
+
+@inline function Base.iterate(f::File{transpose}, st=1) where {transpose}
+    st > length(f) && return nothing
+    # println("row=$st")
+    if transpose
+        st === 1 && (f.positions .= f.originalpositions)
+    else
+        @inbounds Parsers.fastseek!(f.io, f.positions[st])
+        f.currentrow[] = st
+        f.lastparsedcol[] = 0
+        f.lastparsedcode[] = Parsers.SUCCESS
+    end
+    return UntypedRow(f, st), st + 1
+end
 
+# Tables.rows(f) -> RowIterator
 Tables.rowaccess(::Type{<:File}) = true
-Tables.rows(f::File) = f
+Tables.rows(f::F) where {F <: File} = RowIterator{Tuple(f.names), Tuple{f.types...}, F}(f)
 
-@inline function Base.iterate(f::File{NT, transpose}, st=1) where {NT, transpose}
+Base.eltype(f::R) where {R <: RowIterator} = TypedRow{R}
+Base.length(f::RowIterator) = getranspose(f) ? f.file.lastparsedcol[] : length(f.file.positions)
+Base.size(f::RowIterator) = (length(f.file), length(f.file.names))
+
+# RowIterator iterates TypedRows
+@inline function Base.iterate(r::RowIterator, st=1)
+    f = r.file
     st > length(f) && return nothing
+    transpose = getranspose(r)
     # println("row=$st")
     if transpose
         st === 1 && (f.positions .= f.originalpositions)
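Two complementary paths are set up above. `Columns` is deliberately untyped — no type parameters, and its `getproperty` is a linear `findfirst` over the names — and `Tables.columns` fills vectors from `Tables.allocatecolumn` while iterating untyped rows, so nothing in the small-file path specializes on the schema. `Tables.rows`, by contrast, re-enters the type domain: `RowIterator{names, types, F}` carries the schema as type parameters, so large files keep fully specialized iteration. A hypothetical usage sketch (file and column names made up):

    f = CSV.File("small.csv")  # <= 1024 rows, so columnaccess is true
    cols = Tables.columns(f)   # the untyped Columns above; cheap to compile
    cols.a                     # column lookup happens at runtime

    for row in Tables.rows(f)  # TypedRow via the schema-typed RowIterator
        row.a                  # column index/type resolved at compile time
    end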
@@ -32,7 +94,7 @@ Tables.rows(f::File) = f
        f.lastparsedcol[] = 0
        f.lastparsedcode[] = Parsers.SUCCESS
    end
-    return Row(f, st), st + 1
+    return TypedRow(r, st), st + 1
 end
 
 parsingtype(::Type{Missing}) = Missing
@@ -85,12 +147,21 @@
    return true
 end
 
-@inline Base.getproperty(csvrow::Row{F}, name::Symbol) where {F <: File{NamedTuple{names, types}}} where {names, types} =
+@inline function Base.getproperty(csvrow::UntypedRow, name::Symbol)
+    f = getfield(csvrow, 1)
+    i = findfirst(x->x===name, f.names)
+    return getproperty(csvrow, f.types[i], i, name)
+end
+@inline Base.getproperty(csvrow::TypedRow{F}, name::Symbol) where {F <: RowIterator{names, types}} where {names, types} =
    getproperty(csvrow, Tables.columntype(names, types, name), Tables.columnindex(names, name), name)
 
-function Base.getproperty(csvrow::Row{F}, ::Type{T}, col::Int, name::Symbol) where {T, F <: File{NT, transpose}} where {NT, transpose}
+getfile(f::File) = f
+getfile(r::RowIterator) = r.file
+
+function Base.getproperty(csvrow::Row, ::Type{T}, col::Int, name::Symbol) where {T}
    col === 0 && return missing
-    f = getfield(csvrow, 1)
+    transpose = getranspose(csvrow)
+    f = getfile(getfield(csvrow, 1))
    row = getfield(csvrow, 2)
    if transpose
        @inbounds Parsers.fastseek!(f.io, f.positions[col])
@@ -128,4 +199,3 @@ function Base.getproperty(csvrow::Row{F}, ::Type{T}, col::Int, name::Symbol) where {T, F <: File{NT, transpose}} where {NT, transpose}
    end
    return r
 end
-
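The two `getproperty` methods make the tradeoff explicit: `UntypedRow` resolves the column with a runtime `findfirst`, while `TypedRow` calls `Tables.columnindex`/`Tables.columntype` on type-level names and types, which the compiler can fold to constants during specialization. Roughly, with illustrative names:

    names = [:a, :b, :c]
    i = findfirst(x -> x === :b, names)  # UntypedRow: O(ncols) scan per access
    # TypedRow: with (:a, :b, :c) in the type, Tables.columnindex reduces the
    # same lookup to the constant 2 while the method is being compiled.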
10 changes: 5 additions & 5 deletions src/typedetection.jl
@@ -34,11 +34,11 @@ promote_type2(::Type{String}, ::Type{String}) = String
 
 # providedtypes: Dict{String, Type}, Dict{Int, Type}, Vector{Type}
 initialtype(allowmissing) = (allowmissing === :auto || allowmissing === :none) ? Union{} : Missing
-initialtypes(T, ::Nothing, names) = Any[T for _ = 1:length(names)]
-initialtypes(T, t::Dict{String, V}, names) where {V} = Any[get(t, string(nm), T) for nm in names]
-initialtypes(T, t::Dict{Symbol, V}, names) where {V} = Any[get(t, nm, T) for nm in names]
-initialtypes(T, t::Dict{Int, V}, names) where {V} = Any[get(t, i, T) for i = 1:length(names)]
-initialtypes(T, t::Vector, names) = length(t) == length(names) ? collect(Any, t) : throw(ArgumentError("length of user provided types ($(length(t))) does not match length of header ($(length(names)))"))
+initialtypes(T, ::Nothing, names) = Type[T for _ = 1:length(names)]
+initialtypes(T, t::Dict{String, V}, names) where {V} = Type[get(t, string(nm), T) for nm in names]
+initialtypes(T, t::Dict{Symbol, V}, names) where {V} = Type[get(t, nm, T) for nm in names]
+initialtypes(T, t::Dict{Int, V}, names) where {V} = Type[get(t, i, T) for i = 1:length(names)]
+initialtypes(T, t::Vector, names) = length(t) == length(names) ? collect(Type, t) : throw(ArgumentError("length of user provided types ($(length(t))) does not match length of header ($(length(names)))"))
 
 struct Fib{N}
    len::Int
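Switching the `initialtypes` containers from `Any[...]` to `Type[...]` matches the new `types::Vector{Type}` field on `File`, so detected types can be stored without conversion. Same contents, different eltype:

    t_old = Any[Int, Union{Float64, Missing}]   # Vector{Any}
    t_new = Type[Int, Union{Float64, Missing}]  # Vector{Type}, matches File.types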
2 changes: 1 addition & 1 deletion test/testfiles/testfiles.jl
@@ -1,4 +1,4 @@
-getschema(f::CSV.File{NT}) where NT = NT
+getschema(f::CSV.File) = NamedTuple{Tuple(f.names), Tuple{f.types...}}
 
 function testfile(file, kwargs, sz, sch, testfunc)
     f = CSV.File(file isa IO ? file : joinpath(dir, file); kwargs...)
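The test helper now rebuilds the type-level schema on demand from the runtime fields — it is still derivable whenever a test wants it, it just no longer lives in `File`'s type. For a hypothetical file with columns `a::Int64` and `b::Float64`, `getschema(f)` would return `NamedTuple{(:a, :b), Tuple{Int64, Float64}}`.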