From c386238f756d9465288c72b882bb68fcdf62ea3a Mon Sep 17 00:00:00 2001
From: Jacob Quinn <quinn.jacobd@gmail.com>
Date: Thu, 25 Feb 2021 16:26:30 -0700
Subject: [PATCH] Scrub CSV.File types for Tables.Schema (#811)

Fixes #808. The issue here is that we use an internal `PooledString`
type while parsing to signal a column is currently being pooled. Once
we're done parsing, however, there's no value in keeping `PooledString`
around, and indeed, `Tables.rowtable` even gets confused because it's
expecting a `PooledString` object but we always return `String` objects
when indexing string columns, pooled or not. The fix here is to scrub
these `PooledString` column types (mapping them to `String`) so that the
`Tables.Schema` reported for a `CSV.File` is correct. Let's see if CI
points out any problems with this approach.
---
 src/file.jl  | 2 ++
 src/utils.jl | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/src/file.jl b/src/file.jl
index 881b175f..ca3f553e 100644
--- a/src/file.jl
+++ b/src/file.jl
@@ -331,6 +331,8 @@ function File(h::Header;
             if types[i] === Union{}
                 types[i] = Missing
                 columns[i] = MissingVector(finalrows)
+            elseif schematype(types[i]) !== types[i]
+                types[i] = schematype(types[i])
             end
         end
     end
diff --git a/src/utils.jl b/src/utils.jl
index bae33e16..486db44a 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -7,6 +7,10 @@ with the output array type being a `PooledArray`.
 """
 struct PooledString <: AbstractString end
 
+schematype(::Type{T}) where {T} = T  # fallback: any other type is returned unchanged
+schematype(::Type{PooledString}) = String  # internal PooledString marker maps to String
+schematype(::Type{Union{Missing, PooledString}}) = Union{Missing, String}  # same mapping when Missing is allowed
+
 # PointerString is an internal-only type for efficiently tracking string data + length
 # all strings indexed from a column/row will always be a full String
 # specifically, it allows avoiding materializing full Strings for pooled string columns while parsing