Skip to content

Commit

Permalink
Refactor out libwriters, fix references to Timestamp, Timedelta (pand…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and harisbal committed Feb 28, 2018
1 parent c5da136 commit 7232932
Show file tree
Hide file tree
Showing 29 changed files with 262 additions and 263 deletions.
196 changes: 2 additions & 194 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,7 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
PyBytes_Check,
PyUnicode_Check,
PyTuple_New,
PyObject_RichCompareBool,
PyBytes_GET_SIZE,
PyUnicode_GET_SIZE)

try:
from cpython cimport PyString_GET_SIZE
except ImportError:
from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
PyObject_RichCompareBool)

cimport cpython

Expand All @@ -38,7 +31,7 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
PyDateTime_IMPORT)
PyDateTime_IMPORT

from tslib import NaT, Timestamp, Timedelta, array_to_datetime
from tslib import NaT, array_to_datetime
from missing cimport checknull


Expand Down Expand Up @@ -127,28 +120,6 @@ def item_from_zerodim(object val):
return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
    """
    Collect the distinct elements of `values` in first-seen order, then
    return them sorted when the elements are mutually orderable.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    list
        Unique elements; sorted if possible, otherwise insertion order.
    """
    cdef:
        Py_ssize_t idx, length = len(values)
        list out = []
        dict seen = {}
        object item, marker = 0

    for idx in range(length):
        item = values[idx]
        if item not in seen:
            seen[item] = marker
            out.append(item)

    # Best effort only: mixed/unorderable element types simply keep
    # their first-seen order instead of raising.
    try:
        out.sort()
    except Exception:
        pass

    return out


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple(list arrays):
Expand Down Expand Up @@ -368,30 +339,6 @@ def has_infs_f8(ndarray[float64_t] arr):
return False


def convert_timestamps(ndarray values):
    """
    Convert an array of POSIX timestamps to `datetime.datetime` objects,
    memoizing so repeated timestamps map to the same datetime instance.

    Parameters
    ----------
    values : ndarray
        1-dim array of numeric POSIX timestamps.

    Returns
    -------
    ndarray[object]
        `datetime.datetime` instances, one per input element.
    """
    cdef:
        object ts, converter
        dict seen = {}
        Py_ssize_t idx, length = len(values)
        ndarray[object] result

    # for HDFStore, a bit temporary but...
    from datetime import datetime
    converter = datetime.fromtimestamp

    result = np.empty(length, dtype='O')

    for idx in range(length):
        ts = util.get_value_1d(values, idx)
        if ts in seen:
            result[idx] = seen[ts]
        else:
            seen[ts] = result[idx] = converter(ts)

    return result


def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down Expand Up @@ -731,145 +678,6 @@ def clean_index_list(list obj):
return np.asarray(obj), 0


# Fused type covering every string flavor that can appear here on
# py2/py3: native str, py2 unicode, and raw bytes. Cython generates a
# specialization of each consuming function per concrete type.
ctypedef fused pandas_string:
    str
    unicode
    bytes


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
    """ return the maximum size of elements in a 1-dim string array """
    cdef:
        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
        pandas_string v

    for i in range(length):
        v = arr[i]
        # Dispatch on the concrete string flavor; each branch uses the
        # matching CPython size macro (O(1), no Python-level len() call).
        if PyString_Check(v):
            l = PyString_GET_SIZE(v)
        elif PyBytes_Check(v):
            l = PyBytes_GET_SIZE(v)
        elif PyUnicode_Check(v):
            l = PyUnicode_GET_SIZE(v)

        # NOTE(review): an element matching none of the checks leaves `l`
        # at its previous value rather than resetting it to 0 -- presumably
        # inputs are all-string here; confirm with callers.
        if l > m:
            m = l

    return m


@cython.boundscheck(False)
@cython.wraparound(False)
def string_array_replace_from_nan_rep(
        ndarray[object, ndim=1] arr, object nan_rep,
        object replace=None):
    """
    In-place substitution: overwrite every element of `arr` equal to
    `nan_rep` with `replace` (np.nan when not supplied), then return
    the same (mutated) array.
    """
    cdef int idx = 0, size = arr.shape[0]

    if replace is None:
        replace = np.nan

    for idx in range(size):
        if arr[idx] == nan_rep:
            arr[idx] = replace

    return arr


@cython.boundscheck(False)
@cython.wraparound(False)
def convert_json_to_lines(object arr):
    """
    replace comma separated json with line feeds, paying special attention
    to quotes & brackets

    Parameters
    ----------
    arr : str
        Comma-separated JSON documents.

    Returns
    -------
    str
        Same text with top-level separating commas replaced by newlines.
    """
    cdef:
        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
        bint in_quotes = 0, is_escaping = 0
        ndarray[uint8_t] narr
        # BUG FIX: the original declared `right_brack` but assigned/used
        # `right_bracket`, leaving it (and `quote`, `backslash`) as
        # untyped Python objects -- every per-character comparison then
        # went through object boxing. Declare all sentinels as C chars.
        unsigned char v, comma, left_bracket, right_bracket, newline
        unsigned char quote, backslash

    newline = ord('\n')
    comma = ord(',')
    left_bracket = ord('{')
    right_bracket = ord('}')
    quote = ord('"')
    backslash = ord('\\')

    # Work on a mutable byte copy so we can rewrite separators in place.
    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
    length = narr.shape[0]
    for i in range(length):
        v = narr[i]
        # Toggle quote state only on unescaped quotes past position 0.
        if v == quote and i > 0 and not is_escaping:
            in_quotes = not in_quotes
        # A backslash starts escaping; any escaped char ends it.
        if v == backslash or is_escaping:
            is_escaping = not is_escaping
        if v == comma:  # commas that should be \n
            if num_open_brackets_seen == 0 and not in_quotes:
                narr[i] = newline
        elif v == left_bracket:
            if not in_quotes:
                num_open_brackets_seen += 1
        elif v == right_bracket:
            if not in_quotes:
                num_open_brackets_seen -= 1

    return narr.tostring().decode('utf-8')


@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, ndarray data_index,
                   int nlevels, ndarray cols, object writer):
    """
    Stream rows assembled from columnar `data` plus `data_index` labels
    to `writer` in fixed-size batches.

    Parameters
    ----------
    data : list
        Per-column sequences; data[i][j] is column i, row j.
    data_index : ndarray
        Row labels; elements are unpacked with list() when nlevels > 1.
    nlevels : int
        Number of leading index columns per output row (0 means none).
    cols : ndarray
        Data columns; only its length is used here.
    writer : object
        Anything exposing writerows(list_of_rows), e.g. csv.writer.
    """
    cdef int N, j, i, ncols
    cdef list rows
    cdef object val

    # In crude testing, N>100 yields little marginal improvement
    N=100

    # pre-allocate rows
    ncols = len(cols)
    rows = [[None] * (nlevels + ncols) for x in range(N)]

    # j tracks the last row filled; -1 means the loop never ran, so the
    # final partial flush below is skipped for empty input.
    j = -1
    if nlevels == 1:
        # Single index level: the label goes in column 0.
        for j in range(len(data_index)):
            row = rows[j % N]
            row[0] = data_index[j]
            for i in range(ncols):
                row[1 + i] = data[i][j]

            # Buffer full (every N-th row) -> flush the whole batch.
            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    elif nlevels > 1:
        # Multi-level index: the first nlevels slots hold the label parts.
        for j in range(len(data_index)):
            row = rows[j % N]
            row[:nlevels] = list(data_index[j])
            for i in range(ncols):
                row[nlevels + i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    else:
        # nlevels == 0: rows carry data columns only, no index labels.
        for j in range(len(data_index)):
            row = rows[j % N]
            for i in range(ncols):
                row[i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)

    # Flush whatever partial batch remains after the last full flush.
    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
        writer.writerows(rows[:((j + 1) % N)])


# ------------------------------------------------------------------------------
# Groupby-related functions

Expand Down
34 changes: 34 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2225,3 +2225,37 @@ def _maybe_encode(values):
if values is None:
return []
return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values,
                     convert_empty=True):
    """
    In-place NA conversion: entries found in the given set `na_values`
    (and empty strings when `convert_empty` is True) become np.nan.
    Other repeated values are interned so duplicates share one object.

    Parameters
    ----------
    values : ndarray[object]
        Mutated in place.
    na_values : set
    convert_empty : bool (default True)

    Returns
    -------
    na_count : int
        Number of elements replaced with NaN.
    """
    cdef:
        Py_ssize_t idx, length
        object item, nan_obj
        Py_ssize_t na_count = 0
        dict interned = {}

    length = len(values)
    nan_obj = np.nan

    for idx in range(length):
        item = values[idx]
        if (convert_empty and item == '') or (item in na_values):
            values[idx] = nan_obj
            na_count += 1
        elif item in interned:
            values[idx] = interned[item]
        else:
            interned[item] = item

    return na_count
26 changes: 1 addition & 25 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from tslibs.nattype import NaT
from tslibs.conversion cimport convert_to_tsobject
from tslibs.timedeltas cimport convert_to_timedelta64
from tslibs.timezones cimport get_timezone, tz_compare
from datetime import datetime, timedelta

iNaT = util.get_nat()

cdef bint PY2 = sys.version_info[0] == 2
Expand Down Expand Up @@ -1405,30 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
return objects


def sanitize_objects(ndarray[object] values, set na_values,
                     convert_empty=True):
    """
    Replace entries of `values` found in `na_values` (and empty strings
    when `convert_empty` is True) with np.nan, in place.

    Parameters
    ----------
    values : ndarray[object]
        Mutated in place.
    na_values : set
    convert_empty : bool (default True)

    Returns
    -------
    na_count : Py_ssize_t
        Number of elements converted to NaN.
    """
    cdef:
        Py_ssize_t i, n
        object val, onan
        Py_ssize_t na_count = 0
        dict memo = {}

    n = len(values)
    # Single shared NaN object, stored for every converted slot.
    onan = np.nan

    for i from 0 <= i < n:
        val = values[i]
        if (convert_empty and val == '') or (val in na_values):
            values[i] = onan
            na_count += 1
        elif val in memo:
            # Reuse the first-seen equal object so duplicates share memory.
            values[i] = memo[val]
        else:
            memo[val] = val

    return na_count


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None):
cdef:
Expand Down
Loading

0 comments on commit 7232932

Please sign in to comment.