Improve performance of reading files with duplicate column names
I need to load a file with 30k columns, 10k of these have the same
name. Currently, this is practically impossible because makeunique(),
which produces unique column names, has cubic complexity.

This commit changes the algorithm to use a Set for O(1) lookups of
existing column names and a Dict to cache the last numeric suffix used
to uniquify each duplicated name.

Care has been taken to ensure that columns are named the same way as
before. To that end, additional tests were added in the previous
commit.
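The de-duplication strategy described above can be sketched as follows. This is a hypothetical Python illustration of the technique, not the actual Julia implementation: a set answers "does this name already exist?" in O(1), and a dict remembers, per duplicated name, the next numeric suffix worth trying, so the scan no longer restarts from `_1` for every occurrence.

```python
def make_unique(names):
    """De-duplicate column names by appending _1, _2, ... suffixes."""
    original = set(names)   # names present anywhere in the input
    seen = set()            # names emitted so far (O(1) membership test)
    next_suffix = {}        # name -> next suffix to try for that name
    result = []
    for nm in names:
        if nm in seen:
            k = next_suffix.get(nm, 1)
            new_nm = f"{nm}_{k}"
            # Skip suffixes that collide with input names or already
            # emitted names.
            while new_nm in original or new_nm in seen:
                k += 1
                new_nm = f"{nm}_{k}"
            next_suffix[nm] = k + 1  # cache where to resume next time
            nm = new_nm
        result.append(nm)
        seen.add(nm)
    return result
```

For example, `make_unique(["a", "a", "a_1"])` renames the second `"a"` to `"a_2"` because `"a_1"` already occurs in the input, while the literal `"a_1"` column keeps its name.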
wentasah committed Dec 21, 2021
1 parent 2fd493b commit d226d42
Showing 1 changed file with 7 additions and 3 deletions.
src/utils.jl: 7 additions & 3 deletions
```diff
@@ -349,17 +349,21 @@ function makeunique(names)
     set = Set(names)
     length(set) == length(names) && return Symbol[Symbol(x) for x in names]
     nms = Symbol[]
+    nmsset = Set{eltype(names)}()
+    suffixes = Dict{eltype(names), UInt}()
     for nm in names
-        if nm in nms
-            k = 1
+        if nm in nmsset
+            k = get(suffixes, nm, 1)
             newnm = Symbol("$(nm)_$k")
-            while newnm in set || newnm in nms
+            while newnm in set || newnm in nmsset
                 k += 1
                 newnm = Symbol("$(nm)_$k")
             end
+            suffixes[nm] = k + 1
             nm = newnm
         end
         push!(nms, nm)
+        push!(nmsset, nm)
     end
     @assert length(names) == length(nms)
     return nms
```
