Skip to content

Commit

Permalink
Refactor out libwriters, fix references to Timestamp, Timedelta (pand…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and harisbal committed Feb 28, 2018
1 parent c5da136 commit 7232932
Show file tree
Hide file tree
Showing 29 changed files with 262 additions and 263 deletions.
196 changes: 2 additions & 194 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,7 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
PyBytes_Check,
PyUnicode_Check,
PyTuple_New,
PyObject_RichCompareBool,
PyBytes_GET_SIZE,
PyUnicode_GET_SIZE)

try:
from cpython cimport PyString_GET_SIZE
except ImportError:
from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
PyObject_RichCompareBool)

cimport cpython

Expand All @@ -38,7 +31,7 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
PyDateTime_IMPORT)
PyDateTime_IMPORT

from tslib import NaT, Timestamp, Timedelta, array_to_datetime
from tslib import NaT, array_to_datetime
from missing cimport checknull


Expand Down Expand Up @@ -127,28 +120,6 @@ def item_from_zerodim(object val):
return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
    """
    Collect the distinct elements of `values` in first-seen order, then
    return them sorted when the elements are mutually orderable.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    list
        Unique elements; sorted if possible, otherwise insertion order.
    """
    cdef:
        Py_ssize_t idx, length = len(values)
        list out = []
        dict seen = {}
        object item, marker = 0

    for idx in range(length):
        item = values[idx]
        if item not in seen:
            seen[item] = marker
            out.append(item)

    # Best effort only: mixed/unorderable element types simply keep
    # their first-seen order instead of raising.
    try:
        out.sort()
    except Exception:
        pass

    return out


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple(list arrays):
Expand Down Expand Up @@ -368,30 +339,6 @@ def has_infs_f8(ndarray[float64_t] arr):
return False


def convert_timestamps(ndarray values):
    """
    Convert an array of POSIX timestamps to `datetime.datetime` objects,
    memoizing so repeated timestamps map to the same datetime instance.

    Parameters
    ----------
    values : ndarray
        1-dim array of numeric POSIX timestamps.

    Returns
    -------
    ndarray[object]
        `datetime.datetime` instances, one per input element.
    """
    cdef:
        object ts, converter
        dict seen = {}
        Py_ssize_t idx, length = len(values)
        ndarray[object] result

    # for HDFStore, a bit temporary but...
    from datetime import datetime
    converter = datetime.fromtimestamp

    result = np.empty(length, dtype='O')

    for idx in range(length):
        ts = util.get_value_1d(values, idx)
        if ts in seen:
            result[idx] = seen[ts]
        else:
            seen[ts] = result[idx] = converter(ts)

    return result


def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down Expand Up @@ -731,145 +678,6 @@ def clean_index_list(list obj):
return np.asarray(obj), 0


# Fused type covering every string flavor that can appear here on
# py2/py3: native str, py2 unicode, and raw bytes. Cython generates a
# specialization of each consuming function per concrete type.
ctypedef fused pandas_string:
    str
    unicode
    bytes


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
    """ return the maximum size of elements in a 1-dim string array """
    cdef:
        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
        pandas_string v

    for i in range(length):
        v = arr[i]
        # Dispatch on the concrete string flavor; each branch uses the
        # matching CPython size macro (O(1), no Python-level len() call).
        if PyString_Check(v):
            l = PyString_GET_SIZE(v)
        elif PyBytes_Check(v):
            l = PyBytes_GET_SIZE(v)
        elif PyUnicode_Check(v):
            l = PyUnicode_GET_SIZE(v)

        # NOTE(review): an element matching none of the checks leaves `l`
        # at its previous value rather than resetting it to 0 -- presumably
        # inputs are all-string here; confirm with callers.
        if l > m:
            m = l

    return m


@cython.boundscheck(False)
@cython.wraparound(False)
def string_array_replace_from_nan_rep(
        ndarray[object, ndim=1] arr, object nan_rep,
        object replace=None):
    """
    In-place substitution: overwrite every element of `arr` equal to
    `nan_rep` with `replace` (np.nan when not supplied), then return
    the same (mutated) array.
    """
    cdef int idx = 0, size = arr.shape[0]

    if replace is None:
        replace = np.nan

    for idx in range(size):
        if arr[idx] == nan_rep:
            arr[idx] = replace

    return arr


@cython.boundscheck(False)
@cython.wraparound(False)
def convert_json_to_lines(object arr):
    """
    replace comma separated json with line feeds, paying special attention
    to quotes & brackets

    Parameters
    ----------
    arr : str
        Comma-separated JSON documents.

    Returns
    -------
    str
        Same text with top-level separating commas replaced by newlines.
    """
    cdef:
        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
        bint in_quotes = 0, is_escaping = 0
        ndarray[uint8_t] narr
        # BUG FIX: the original declared `right_brack` but assigned/used
        # `right_bracket`, leaving it (and `quote`, `backslash`) as
        # untyped Python objects -- every per-character comparison then
        # went through object boxing. Declare all sentinels as C chars.
        unsigned char v, comma, left_bracket, right_bracket, newline
        unsigned char quote, backslash

    newline = ord('\n')
    comma = ord(',')
    left_bracket = ord('{')
    right_bracket = ord('}')
    quote = ord('"')
    backslash = ord('\\')

    # Work on a mutable byte copy so we can rewrite separators in place.
    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
    length = narr.shape[0]
    for i in range(length):
        v = narr[i]
        # Toggle quote state only on unescaped quotes past position 0.
        if v == quote and i > 0 and not is_escaping:
            in_quotes = not in_quotes
        # A backslash starts escaping; any escaped char ends it.
        if v == backslash or is_escaping:
            is_escaping = not is_escaping
        if v == comma:  # commas that should be \n
            if num_open_brackets_seen == 0 and not in_quotes:
                narr[i] = newline
        elif v == left_bracket:
            if not in_quotes:
                num_open_brackets_seen += 1
        elif v == right_bracket:
            if not in_quotes:
                num_open_brackets_seen -= 1

    return narr.tostring().decode('utf-8')


@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, ndarray data_index,
                   int nlevels, ndarray cols, object writer):
    """
    Stream rows assembled from columnar `data` plus `data_index` labels
    to `writer` in fixed-size batches.

    Parameters
    ----------
    data : list
        Per-column sequences; data[i][j] is column i, row j.
    data_index : ndarray
        Row labels; elements are unpacked with list() when nlevels > 1.
    nlevels : int
        Number of leading index columns per output row (0 means none).
    cols : ndarray
        Data columns; only its length is used here.
    writer : object
        Anything exposing writerows(list_of_rows), e.g. csv.writer.
    """
    cdef int N, j, i, ncols
    cdef list rows
    cdef object val

    # In crude testing, N>100 yields little marginal improvement
    N=100

    # pre-allocate rows
    ncols = len(cols)
    rows = [[None] * (nlevels + ncols) for x in range(N)]

    # j tracks the last row filled; -1 means the loop never ran, so the
    # final partial flush below is skipped for empty input.
    j = -1
    if nlevels == 1:
        # Single index level: the label goes in column 0.
        for j in range(len(data_index)):
            row = rows[j % N]
            row[0] = data_index[j]
            for i in range(ncols):
                row[1 + i] = data[i][j]

            # Buffer full (every N-th row) -> flush the whole batch.
            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    elif nlevels > 1:
        # Multi-level index: the first nlevels slots hold the label parts.
        for j in range(len(data_index)):
            row = rows[j % N]
            row[:nlevels] = list(data_index[j])
            for i in range(ncols):
                row[nlevels + i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    else:
        # nlevels == 0: rows carry data columns only, no index labels.
        for j in range(len(data_index)):
            row = rows[j % N]
            for i in range(ncols):
                row[i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)

    # Flush whatever partial batch remains after the last full flush.
    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
        writer.writerows(rows[:((j + 1) % N)])


# ------------------------------------------------------------------------------
# Groupby-related functions

Expand Down
34 changes: 34 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2225,3 +2225,37 @@ def _maybe_encode(values):
if values is None:
return []
return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values,
                     convert_empty=True):
    """
    In-place NA conversion: entries found in the given set `na_values`
    (and empty strings when `convert_empty` is True) become np.nan.
    Other repeated values are interned so duplicates share one object.

    Parameters
    ----------
    values : ndarray[object]
        Mutated in place.
    na_values : set
    convert_empty : bool (default True)

    Returns
    -------
    na_count : int
        Number of elements replaced with NaN.
    """
    cdef:
        Py_ssize_t idx, length
        object item, nan_obj
        Py_ssize_t na_count = 0
        dict interned = {}

    length = len(values)
    nan_obj = np.nan

    for idx in range(length):
        item = values[idx]
        if (convert_empty and item == '') or (item in na_values):
            values[idx] = nan_obj
            na_count += 1
        elif item in interned:
            values[idx] = interned[item]
        else:
            interned[item] = item

    return na_count
26 changes: 1 addition & 25 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from tslibs.nattype import NaT
from tslibs.conversion cimport convert_to_tsobject
from tslibs.timedeltas cimport convert_to_timedelta64
from tslibs.timezones cimport get_timezone, tz_compare
from datetime import datetime, timedelta

iNaT = util.get_nat()

cdef bint PY2 = sys.version_info[0] == 2
Expand Down Expand Up @@ -1405,30 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
return objects


def sanitize_objects(ndarray[object] values, set na_values,
                     convert_empty=True):
    """
    Replace entries of `values` found in `na_values` (and empty strings
    when `convert_empty` is True) with np.nan, in place.

    Parameters
    ----------
    values : ndarray[object]
        Mutated in place.
    na_values : set
    convert_empty : bool (default True)

    Returns
    -------
    na_count : Py_ssize_t
        Number of elements converted to NaN.
    """
    cdef:
        Py_ssize_t i, n
        object val, onan
        Py_ssize_t na_count = 0
        dict memo = {}

    n = len(values)
    # Single shared NaN object, stored for every converted slot.
    onan = np.nan

    for i from 0 <= i < n:
        val = values[i]
        if (convert_empty and val == '') or (val in na_values):
            values[i] = onan
            na_count += 1
        elif val in memo:
            # Reuse the first-seen equal object so duplicates share memory.
            values[i] = memo[val]
        else:
            memo[val] = val

    return na_count


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None):
cdef:
Expand Down
Loading

0 comments on commit 7232932

Please sign in to comment.