From c386238f756d9465288c72b882bb68fcdf62ea3a Mon Sep 17 00:00:00 2001
From: Jacob Quinn
Date: Thu, 25 Feb 2021 16:26:30 -0700
Subject: [PATCH] Scrub CSV.File types for Tables.Schema (#811)

Fixes #808. The issue here is that we use an internal `PooledString` type
while parsing to signal a column is currently being pooled. Once we're done
parsing, however, there's no value in keeping `PooledString` around, and
indeed, `Tables.rowtable` even gets confused because it's expecting a
`PooledString` object but we always return `String` objects when indexing
string columns, pooled or not. The fix here is to scrub these PooledString
type columns to correct the `Tables.Schema` on `CSV.File`. Let's see if CI
points out any problems with this approach.
---
 src/file.jl  | 2 ++
 src/utils.jl | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/src/file.jl b/src/file.jl
index 881b175f..ca3f553e 100644
--- a/src/file.jl
+++ b/src/file.jl
@@ -331,6 +331,8 @@ function File(h::Header;
             if types[i] === Union{}
                 types[i] = Missing
                 columns[i] = MissingVector(finalrows)
+            elseif schematype(types[i]) !== types[i]
+                types[i] = schematype(types[i])
             end
         end
     end
diff --git a/src/utils.jl b/src/utils.jl
index bae33e16..486db44a 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -7,6 +7,10 @@ with the output array type being a `PooledArray`.
 """
 struct PooledString <: AbstractString end
 
+schematype(::Type{T}) where {T} = T
+schematype(::Type{PooledString}) = String
+schematype(::Type{Union{Missing, PooledString}}) = Union{Missing, String}
+
 # PointerString is an internal-only type for efficiently tracking string data + length
 # all strings indexed from a column/row will always be a full String
 # specifically, it allows avoiding materializing full Strings for pooled string columns while parsing