Merge pull request #81 from realratchet/master

Performance improvements for text reader
root-11 · Aug 30, 2023 · 3c79d3f
2 parents 90a181b + 39d3a70
commit 3c79d3f
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 130 deletions.
diff --git a/tablite/base.py b/tablite/base.py
@@ -97,8 +97,6 @@ def __del__(self):
 
         if refcount > 0:
             return
-
-        print(f"{os.getpid()} deleted page '{self.path}")
 
         self.path.unlink(True)
 

diff --git a/tablite/datatypes.py b/tablite/datatypes.py
@@ -498,22 +498,13 @@ def guess(*values):
     def infer(cls, v, dtype):
         if v in DataTypes.nones:
             return None
-        if dtype is int:
-            return DataTypes._infer_int(v)
-        elif dtype is str:
-            return DataTypes._infer_str(v)
-        elif dtype is float:
-            return DataTypes._infer_float(v)
-        elif dtype is bool:
-            return DataTypes._infer_bool(v)
-        elif dtype is date:
-            return DataTypes._infer_date(v)
-        elif dtype is datetime:
-            return DataTypes._infer_datetime(v)
-        elif dtype is time:
-            return DataTypes._infer_time(v)
-        else:
+
+
+
+        if dtype not in matched_types:
             raise TypeError(f"The datatype {str(dtype)} is not supported.")
+
+        return matched_types[dtype](v)
 
     @classmethod
     def _infer_bool(cls, value):
@@ -829,3 +820,13 @@ def multitype_set(arr):
     L = list(set(L))
     L = [v for _, v in L]
     return np.array(L, dtype=object)
+
+matched_types = {
+    int: DataTypes._infer_int,
+    str: DataTypes._infer_str,
+    float: DataTypes._infer_float,
+    bool: DataTypes._infer_bool,
+    date: DataTypes._infer_date,
+    datetime: DataTypes._infer_datetime,
+    time: DataTypes._infer_time,
+}
diff --git a/tablite/file_reader_utils.py b/tablite/file_reader_utils.py
@@ -3,6 +3,8 @@
 import openpyxl
 from pathlib import Path
 from tablite.datatypes import DataTypes
+import csv
+from io import StringIO
 
 ENCODING_GUESS_BYTES = 10000
 
@@ -88,6 +90,7 @@ def __init__(
                 self.c = self._call_4
             except TypeError:
                 self.c = self._call_4_slow
+        # self.c = self._call_3
 
     def __call__(self, s):
         return self.c(s)
@@ -100,24 +103,37 @@ def _call_2(self, s):
 
     def _call_3(self, s):  # looks for qoutes.
         words = []
-        qoute = False
-        ix = 0
-        while ix < len(s):
-            c = s[ix]
-            if c == self.qoute:
-                qoute = not qoute
-            if qoute:
-                ix += 1
-                continue
-            if c == self.delimiter:
-                word, s = s[:ix], s[ix + self._delimiter_length :]
-                word = word.lstrip(self.qoute).rstrip(self.qoute)
-                words.append(word)
-                ix = -1
-            ix += 1
-        if s:
-            s = s.lstrip(self.qoute).rstrip(self.qoute)
-            words.append(s)
+        # qoute = False
+        # ix = 0
+        # while ix < len(s):
+        #     c = s[ix]
+        #     if c == self.qoute:
+        #         qoute = not qoute
+        #     if qoute:
+        #         ix += 1
+        #         continue
+        #     if c == self.delimiter:
+        #         word, s = s[:ix], s[ix + self._delimiter_length :]
+        #         word = word.lstrip(self.qoute).rstrip(self.qoute)
+        #         words.append(word)
+        #         ix = -1
+        #     ix += 1
+        # if s:
+        #     s = s.lstrip(self.qoute).rstrip(self.qoute)
+        #     words.append(s)
+
+        class MyDialect(csv.Dialect):
+            delimiter = self.delimiter
+            quotechar = self.qoute
+            escapechar = '\\'
+            doublequote = True
+            quoting = csv.QUOTE_MINIMAL
+            skipinitialspace = False
+            lineterminator = "\n"
+
+        dia = MyDialect
+        parsed_words = list(csv.reader(StringIO(s), dialect=dia))[0]
+        words.extend(parsed_words)
         return words
 
     def _call_4(self, s):  # looks for qoutes, openings and closures.
Original file line number | Diff line change
@@ -97,8 +97,6 @@ def __del__(self):
             if refcount > 0:
                 return
-            print(f"{os.getpid()} deleted page '{self.path}")
             self.path.unlink(True)
@@ Expand Down @@