Skip to content

Commit

Permalink
Merge pull request #81 from realratchet/master
Browse files Browse the repository at this point in the history
Performance improvements for text reader
  • Loading branch information
realratchet authored Aug 30, 2023
2 parents 90a181b + 39d3a70 commit 3c79d3f
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 130 deletions.
2 changes: 0 additions & 2 deletions tablite/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,6 @@ def __del__(self):

if refcount > 0:
return

print(f"{os.getpid()} deleted page '{self.path}")

self.path.unlink(True)

Expand Down
31 changes: 16 additions & 15 deletions tablite/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,22 +498,13 @@ def guess(*values):
def infer(cls, v, dtype):
    """Convert the raw value `v` to the requested `dtype`.

    Args:
        v: the value to convert.
        dtype: the target type; it must be one of the keys of the
            module-level `matched_types` table.

    Returns:
        None when `v` is one of `DataTypes.nones`; otherwise whatever the
        converter registered for `dtype` returns for `v`.

    Raises:
        TypeError: if no converter is registered for `dtype`.
    """
    if v in DataTypes.nones:
        return None

    # One table lookup replaces the former if/elif chain over every type.
    if dtype not in matched_types:
        raise TypeError(f"The datatype {str(dtype)} is not supported.")

    return matched_types[dtype](v)

@classmethod
def _infer_bool(cls, value):
Expand Down Expand Up @@ -829,3 +820,13 @@ def multitype_set(arr):
L = list(set(L))
L = [v for _, v in L]
return np.array(L, dtype=object)

# Dispatch table: maps each supported target type to the DataTypes converter
# named `_infer_<type name>`. Built after the class body so the converters
# can be referenced.
matched_types = {
    supported: getattr(DataTypes, f"_infer_{supported.__name__}")
    for supported in (int, str, float, bool, date, datetime, time)
}
52 changes: 34 additions & 18 deletions tablite/file_reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import openpyxl
from pathlib import Path
from tablite.datatypes import DataTypes
import csv
from io import StringIO

ENCODING_GUESS_BYTES = 10000

Expand Down Expand Up @@ -88,6 +90,7 @@ def __init__(
self.c = self._call_4
except TypeError:
self.c = self._call_4_slow
# self.c = self._call_3

def __call__(self, s):
    """Apply the callable stored in `self.c` to `s` and return its result."""
    strategy = self.c
    return strategy(s)
Expand All @@ -100,24 +103,37 @@ def _call_2(self, s):

def _call_3(self, s):  # looks for quotes.
    """Split one line of text into fields, honouring quoted fields.

    Parsing is delegated to the standard-library `csv` reader, configured
    with this object's `delimiter` and `qoute` (quote character), backslash
    as the escape character and doubled quotes enabled.

    Args:
        s: the text to split.

    Returns:
        A list with the fields of the first record found in `s`, or an
        empty list when `s` contains no record (e.g. the empty string).
    """
    records = csv.reader(
        StringIO(s),
        delimiter=self.delimiter,
        quotechar=self.qoute,
        escapechar="\\",
        doublequote=True,
        quoting=csv.QUOTE_MINIMAL,
        skipinitialspace=False,
        lineterminator="\n",
    )
    # Only the first record is wanted. `next` with a default also covers an
    # empty `s`, where indexing a materialised list would raise IndexError.
    return next(records, [])

def _call_4(self, s): # looks for qoutes, openings and closures.
Expand Down
Loading

0 comments on commit 3c79d3f

Please sign in to comment.