Introduce delimiter guessing (#100)

* add CSV.guessedDelimiter(string:) helper
* add delimiter-guessing initializer
* add CSV.Delimiter enum
* use CSV.Delimiter for initializers
* re-group initializers
* expand README to mention delimiters and talk a bit about the API
swiftcsv · May 13, 2022 · 7461683 · 7461683
1 parent 38fa397
commit 7461683
Show file tree

Hide file tree

Showing 10 changed files with 332 additions and 83 deletions.
diff --git a/README.md b/README.md
@@ -20,86 +20,72 @@ CSV content can be loaded using the `CSV` class:
 import SwiftCSV
 
 do {
-    // As a string
+    // As a string, guessing the delimiter
     let csv: CSV = try CSV(string: "id,name,age\n1,Alice,18")
 
-    // With a custom delimiter character
+    // Specifying a custom delimiter
     let tsv: CSV = try CSV(string: "id\tname\tage\n1\tAlice\t18", delimiter: "\t")
 
-    // From a file (with errors)
+    // From a file (propagating error during file loading)
     let csvFile: CSV = try CSV(url: URL(fileURLWithPath: "path/to/users.csv"))
 
-    // From a file inside the app bundle, with a custom delimiter, errors, and custom encoding
+    // From a file inside the app bundle, with a custom delimiter, errors, and custom encoding.
+    // Note the result is an optional.
     let resource: CSV? = try CSV(
         name: "users",
         extension: "tsv",
         bundle: .main,
         delimiter: "\t",
         encoding: .utf8)
 } catch let parseError as CSVParseError {
-    // Catch errors from parsing invalid formed CSV
+    // Catch errors from parsing invalid CSV
 } catch {
     // Catch errors from trying to load files
 }
 ```
 
-### API
+### File Loading
 
-If you don't care about accessing named columns, you can set the `loadColumns` argument to `false` and the columns Dictionary will not be populated. This can increase performance in critical cases for lots of data.
+The `CSV` class comes with initializers that are suited for loading files from URLs.
 
 ```swift
-class CSV {
-    /// Load CSV data from a string.
-    ///
-    /// - parameter string: CSV contents to parse.
-    /// - parameter delimiter: Character used to separate  row and header fields (default is ',')
-    /// - parameter loadColumns: Whether to populate the `columns` dictionary (default is `true`)
-    /// - throws: `CSVParseError` when parsing `string` fails.
-    public init(string: String,
-                delimiter: Character = comma,
-                loadColumns: Bool = true) throws
-
-    /// Load a CSV file as a named resource from `bundle`.
+extension CSV {
+    /// Load a CSV file from `url`.
     ///
-    /// - parameter name: Name of the file resource inside `bundle`.
-    /// - parameter ext: File extension of the resource; use `nil` to load the first file matching the name (default is `nil`)
-    /// - parameter bundle: `Bundle` to use for resource lookup (default is `.main`)
-    /// - parameter delimiter: Character used to separate row and header fields (default is ',')
-    /// - parameter encoding: encoding used to read file (default is `.utf8`)
+    /// - parameter url: URL of the file (will be passed to `String(contentsOf:encoding:)` to load)
+    /// - parameter delimiter: Character used to separate cells from one another in rows.
+    /// - parameter encoding: Character encoding to read file (default is `.utf8`)
     /// - parameter loadColumns: Whether to populate the columns dictionary (default is `true`)
-    /// - throws: `CSVParseError` when parsing the contents of the resource fails, or file loading errors.
-    /// - returns: `nil` if the resource could not be found
-    public convenience init?(
-        name: String,
-        extension ext: String? = nil,
-        bundle: Bundle = .main,
-        delimiter: Character = comma,
-        encoding: String.Encoding = .utf8,
-        loadColumns: Bool = true) throws
+    /// - throws: `CSVParseError` when parsing the contents of `url` fails, or file loading errors.
+    public convenience init(url: URL,
+                            delimiter: Delimiter,
+                            encoding: String.Encoding = .utf8,
+                            loadColumns: Bool = true) throws
 
-    /// Load a CSV file from `url`.
+    /// Load a CSV file from `url` and guess its delimiter from `CSV.recognizedDelimiters`, falling back to `.comma`.
     ///
     /// - parameter url: URL of the file (will be passed to `String(contentsOf:encoding:)` to load)
-    /// - parameter delimiter: Character used to separate row and header fields (default is ',')
     /// - parameter encoding: Character encoding to read file (default is `.utf8`)
     /// - parameter loadColumns: Whether to populate the columns dictionary (default is `true`)
     /// - throws: `CSVParseError` when parsing the contents of `url` fails, or file loading errors.
-    public convenience init(
-        url: URL,
-        delimiter: Character = comma,
-        encoding: String.Encoding = .utf8,
-        loadColumns: Bool = true)
-}
-
-public enum CSVParseError: Error {
-    case generic(message: String)
-    case quotation(message: String)
+    public convenience init(url: URL,
+                            encoding: String.Encoding = .utf8,
+                            loadColumns: Bool = true) throws
 }
 ```
 
+### Delimiters
+
+Delimiters are strongly typed. The recognized `CSV.Delimiter` cases are: `.comma`, `.semicolon`, and `.tab`.
+
+You can use convenience initializers that guess the delimiter from the recognized list for you. These initializers are available for loading CSV from URLs and strings.
+
+You can also use any other single-character delimiter when loading CSV data. A character literal like `"x"` will produce `CSV.Delimiter.character("x")`, so you don't have to type the whole `.character(_)` case name. There are initializers for each variant that accept explicit delimiter settings.
+
 ### Reading Data
 
 ```swift
+// Recognizes the comma delimiter automatically:
 let csv = try CSV(string: "id,name,age\n1,Alice,18\n2,Bob,19")
 csv.header         //=> ["id", "name", "age"]
 csv.namedRows      //=> [["id": "1", "name": "Alice", "age": "18"], ["id": "2", "name": "Bob", "age": "19"]]
@@ -119,6 +105,21 @@ csv.enumerateAsDict { dict in
 }
 ```
 
+### Skip Named Column Access for Large Data Sets
+
+By default, the variants of `CSV.init` will populate its `namedColumns` and `enumeratedColumns` to provide access to the CSV data on a column-by-column basis. Think of this like a cross section:
+
+```swift
+let csv = try CSV(string: "id,name,age\n1,Alice,18\n2,Bob,19")
+csv.namedRows[0]["name"]  //=> "Alice"
+csv.namedColumns["name"]  //=> ["Alice", "Bob"]
+```
+
+If you only want to access your data row-by-row, and not by-column, then you can set the `loadColumns` argument in any initializer to `false`. This will prevent the columnar data from being populated.
+
+Skipping this step can increase performance for lots of data.
+
+
 ## Installation
 
 ### CocoaPods

diff --git a/SwiftCSV.xcodeproj/project.pbxproj b/SwiftCSV.xcodeproj/project.pbxproj
@@ -30,6 +30,16 @@
 		508975E11DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508975E01DBF3E51006F3DBE /* EnumeratedViewTests.swift */; };
 		508975E21DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508975E01DBF3E51006F3DBE /* EnumeratedViewTests.swift */; };
 		508975E31DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508975E01DBF3E51006F3DBE /* EnumeratedViewTests.swift */; };
+		508CA0FB2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */; };
+		508CA0FD2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FC2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift */; };
+		508CA0FE2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FC2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift */; };
+		508CA0FF2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FC2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift */; };
+		508CA1002771F32C0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */; };
+		508CA1022771F32D0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */; };
+		508CA1032771F32E0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */; };
+		508CA1052772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA1042772039E0084C8E8 /* CSVDelimiterTests.swift */; };
+		508CA1062772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA1042772039E0084C8E8 /* CSVDelimiterTests.swift */; };
+		508CA1072772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 508CA1042772039E0084C8E8 /* CSVDelimiterTests.swift */; };
 		5FB74B9B1CCB9274009DDBF1 /* SwiftCSV.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5FB74B911CCB9274009DDBF1 /* SwiftCSV.framework */; };
 		5FB74BB71CCB929D009DDBF1 /* SwiftCSV.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5FB74BAD1CCB929D009DDBF1 /* SwiftCSV.framework */; };
 		5FB74BD11CCB92E5009DDBF1 /* CSV.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DAAEE9B1C74C7EC00A933DB /* CSV.swift */; };
@@ -118,6 +128,9 @@
 		508975D61DBF34CF006F3DBE /* ParsingState.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ParsingState.swift; sourceTree = "<group>"; };
 		508975DB1DBF3B70006F3DBE /* EnumeratedView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = EnumeratedView.swift; sourceTree = "<group>"; };
 		508975E01DBF3E51006F3DBE /* EnumeratedViewTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = EnumeratedViewTests.swift; sourceTree = "<group>"; };
+		508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "CSV+DelimiterGuessing.swift"; sourceTree = "<group>"; };
+		508CA0FC2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "CSV+DelimiterGuessingTests.swift"; sourceTree = "<group>"; };
+		508CA1042772039E0084C8E8 /* CSVDelimiterTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CSVDelimiterTests.swift; sourceTree = "<group>"; };
 		50F241A4274BB8DB00520A69 /* CHANGELOG.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = CHANGELOG.md; sourceTree = "<group>"; };
 		5FB74B911CCB9274009DDBF1 /* SwiftCSV.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = SwiftCSV.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		5FB74B9A1CCB9274009DDBF1 /* SwiftCSVTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = SwiftCSVTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -223,6 +236,7 @@
 			isa = PBXGroup;
 			children = (
 				3DAAEE9B1C74C7EC00A933DB /* CSV.swift */,
+				508CA0FA2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift */,
 				508975D11DBB897A006F3DBE /* NamedView.swift */,
 				508975DB1DBF3B70006F3DBE /* EnumeratedView.swift */,
 				3D444BCC1C7D88290001C60C /* String+Lines.swift */,
@@ -238,6 +252,8 @@
 			children = (
 				BE06B67E1CB72680009578CC /* Res */,
 				3D1E59C61945FFAD001CF760 /* CSVTests.swift */,
+				508CA0FC2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift */,
+				508CA1042772039E0084C8E8 /* CSVDelimiterTests.swift */,
 				508975E01DBF3E51006F3DBE /* EnumeratedViewTests.swift */,
 				BE6C86061CB5CE44009A351D /* QuotedTests.swift */,
 				3D3749E2194D6DF7008F262A /* TSVTests.swift */,
@@ -555,6 +571,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				508975D21DBB897A006F3DBE /* NamedView.swift in Sources */,
+				508CA0FB2771F2E70084C8E8 /* CSV+DelimiterGuessing.swift in Sources */,
 				BEE5461E1CBBB15900C0666F /* Description.swift in Sources */,
 				3DAAEE9C1C74C7EC00A933DB /* CSV.swift in Sources */,
 				BE9B02D81CBE57B8009FE424 /* Parser.swift in Sources */,
@@ -570,7 +587,9 @@
 			files = (
 				E46085941CCB1F5C00385286 /* PerformanceTest.swift in Sources */,
 				3D1E59C71945FFAD001CF760 /* CSVTests.swift in Sources */,
+				508CA0FD2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */,
 				F5C19F502283243C00920B06 /* ResourceHelper.swift in Sources */,
+				508CA1052772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */,
 				508975E11DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */,
 				5015AD8A274BA20A0050F975 /* ParserTests.swift in Sources */,
 				3D3749E3194D6DF7008F262A /* TSVTests.swift in Sources */,
@@ -584,6 +603,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				508975D31DBB897A006F3DBE /* NamedView.swift in Sources */,
+				508CA1002771F32C0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */,
 				5FB74BD11CCB92E5009DDBF1 /* CSV.swift in Sources */,
 				5FB74BD21CCB92E5009DDBF1 /* String+Lines.swift in Sources */,
 				508975DD1DBF3B70006F3DBE /* EnumeratedView.swift in Sources */,
@@ -599,7 +619,9 @@
 			files = (
 				5FB74BE01CCB9312009DDBF1 /* CSVTests.swift in Sources */,
 				5FB74BE11CCB9312009DDBF1 /* QuotedTests.swift in Sources */,
+				508CA0FE2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */,
 				F5C19F512283C0C100920B06 /* ResourceHelper.swift in Sources */,
+				508CA1062772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */,
 				508975E21DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */,
 				5015AD8B274BA20A0050F975 /* ParserTests.swift in Sources */,
 				5FB74BE21CCB9312009DDBF1 /* TSVTests.swift in Sources */,
@@ -613,6 +635,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				508975D41DBB897A006F3DBE /* NamedView.swift in Sources */,
+				508CA1022771F32D0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */,
 				5FB74BD61CCB92EB009DDBF1 /* CSV.swift in Sources */,
 				5FB74BD71CCB92EB009DDBF1 /* String+Lines.swift in Sources */,
 				508975DE1DBF3B70006F3DBE /* EnumeratedView.swift in Sources */,
@@ -628,7 +651,9 @@
 			files = (
 				5FB74BE51CCB931F009DDBF1 /* CSVTests.swift in Sources */,
 				5FB74BE61CCB931F009DDBF1 /* QuotedTests.swift in Sources */,
+				508CA0FF2771F3260084C8E8 /* CSV+DelimiterGuessingTests.swift in Sources */,
 				F5C19F522283C0C300920B06 /* ResourceHelper.swift in Sources */,
+				508CA1072772039E0084C8E8 /* CSVDelimiterTests.swift in Sources */,
 				508975E31DBF3E51006F3DBE /* EnumeratedViewTests.swift in Sources */,
 				5015AD8C274BA20A0050F975 /* ParserTests.swift in Sources */,
 				5FB74BE71CCB931F009DDBF1 /* TSVTests.swift in Sources */,
@@ -642,6 +667,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				508975D51DBB897A006F3DBE /* NamedView.swift in Sources */,
+				508CA1032771F32E0084C8E8 /* CSV+DelimiterGuessing.swift in Sources */,
 				5FB74BDB1CCB92F1009DDBF1 /* CSV.swift in Sources */,
 				5FB74BDC1CCB92F1009DDBF1 /* String+Lines.swift in Sources */,
 				508975DF1DBF3B70006F3DBE /* EnumeratedView.swift in Sources */,

diff --git a/SwiftCSV/CSV+DelimiterGuessing.swift b/SwiftCSV/CSV+DelimiterGuessing.swift
@@ -0,0 +1,51 @@
+//
+//  CSV+DelimiterGuessing.swift
+//  SwiftCSV
+//
+//  Created by Christian Tietze on 21.12.21.
+//  Copyright © 2021 SwiftCSV. All rights reserved.
+//
+
+import Foundation
+
+extension CSV {
+    public static let recognizedDelimiters: [Delimiter] = [.comma, .tab, .semicolon] // Delimiters that `guessedDelimiter(string:)` looks for.
+
+    /// - Returns: The first recognized delimiter found outside of double quotes on the first line of `string`. Falls back to `.comma` if there is none.
+    public static func guessedDelimiter(string: String) -> Delimiter {
+        let recognizedDelimiterCharacters = recognizedDelimiters.map(\.rawValue) // Raw values that each character of the line is compared against.
+
+        // Trim surrounding whitespace and newlines, but keep tabs because a tab may be the delimiter
+        var trimmedCharacters = CharacterSet.whitespacesAndNewlines
+        trimmedCharacters.remove("\t")
+        let line = string.trimmingCharacters(in: trimmedCharacters).firstLine // Only the first line of the trimmed text is scanned.
+
+        var index = line.startIndex // Walk the line by index so a quoted section can be skipped in one jump.
+        while index < line.endIndex {
+            let character = line[index]
+            switch character {
+            case "\"":
+                // When encountering an open quote, skip to the closing counterpart.
+                // If none is found, skip to end of line, which ends the loop and yields the fallback.
+
+                // 1) Advance one character to skip the opening quote
+                index = line.index(after: index)
+
+                // 2) Look for the closing quote and move current position after it; characters in between are never inspected
+                if index < line.endIndex, // The opening quote may have been the last character of the line.
+                   let closingQuoteInddex = line[index...].firstIndex(of: character) {
+                    index = line.index(after: closingQuoteInddex)
+                } else {
+                    index = line.endIndex
+                }
+            case _ where recognizedDelimiterCharacters.contains(character): // The first recognized delimiter outside of quotes wins.
+                return Delimiter(rawValue: character)
+            default:
+                index = line.index(after: index)
+            }
+        }
+
+        // Fallback value: no recognized delimiter was found outside of quotes
+        return .comma
+    }
+}