Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor out libwriters, fix references to Timestamp, Timedelta #19413

Merged
merged 11 commits into from
Feb 1, 2018
196 changes: 2 additions & 194 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,7 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
PyBytes_Check,
PyUnicode_Check,
PyTuple_New,
PyObject_RichCompareBool,
PyBytes_GET_SIZE,
PyUnicode_GET_SIZE)

try:
from cpython cimport PyString_GET_SIZE
except ImportError:
from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
PyObject_RichCompareBool)

cimport cpython

Expand All @@ -38,7 +31,7 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
PyDateTime_IMPORT)
PyDateTime_IMPORT

from tslib import NaT, Timestamp, Timedelta, array_to_datetime
from tslib import NaT, array_to_datetime
from missing cimport checknull


Expand Down Expand Up @@ -127,28 +120,6 @@ def item_from_zerodim(object val):
return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
    """
    Collect the distinct elements of ``values`` into a list.

    Elements are gathered in order of first appearance and the list is
    then sorted in place.  Any exception raised while sorting is
    suppressed and the list is returned as ``sort()`` left it.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    uniques : list
    """
    cdef:
        Py_ssize_t idx, count = len(values)
        list found = []
        set seen = set()
        object item

    for idx in range(count):
        item = values[idx]
        if item in seen:
            continue
        seen.add(item)
        found.append(item)

    try:
        found.sort()
    except Exception:
        # deliberate best effort: sorting may fail (e.g. values that
        # cannot be compared with each other)
        pass

    return found


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple(list arrays):
Expand Down Expand Up @@ -368,30 +339,6 @@ def has_infs_f8(ndarray[float64_t] arr):
return False


def convert_timestamps(ndarray values):
    """
    Map each element of ``values`` through ``datetime.fromtimestamp``.

    Results are memoized per distinct input value, so repeated values
    share a single datetime object in the output.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    out : ndarray[object], same length as ``values``
    """
    cdef:
        object val, f, result
        dict cache = {}
        Py_ssize_t i, n = len(values)
        ndarray[object] out

    # for HDFStore, a bit temporary but...

    from datetime import datetime
    f = datetime.fromtimestamp

    out = np.empty(n, dtype='O')

    for i in range(n):
        val = util.get_value_1d(values, i)
        if val not in cache:
            # first occurrence: convert once and remember the result
            cache[val] = f(val)
        result = cache[val]
        out[i] = result

    return out


def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down Expand Up @@ -731,145 +678,6 @@ def clean_index_list(list obj):
return np.asarray(obj), 0


# Fused type covering the string flavours accepted by
# max_len_string_array: str, unicode and bytes.
ctypedef fused pandas_string:
str
unicode
bytes


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
    """
    Return the maximum size of the elements in a 1-dim string array.

    The size of each element is taken from the ``*_GET_SIZE`` macro that
    matches its type.  An empty array yields 0.
    """
    cdef:
        Py_ssize_t pos, longest = 0, size = 0, count = arr.shape[0]
        pandas_string item

    for pos in range(count):
        item = arr[pos]
        # ``size`` keeps its previous value when none of the checks match
        if PyString_Check(item):
            size = PyString_GET_SIZE(item)
        elif PyBytes_Check(item):
            size = PyBytes_GET_SIZE(item)
        elif PyUnicode_Check(item):
            size = PyUnicode_GET_SIZE(item)

        if size > longest:
            longest = size

    return longest


@cython.boundscheck(False)
@cython.wraparound(False)
def string_array_replace_from_nan_rep(
        ndarray[object, ndim=1] arr, object nan_rep,
        object replace=None):
    """
    Replace the values in the array with 'replace' if
    they are 'nan_rep'. Return the same array.

    Parameters
    ----------
    arr : ndarray[object], 1-dim
        modified in place
    nan_rep : object
        value compared (with ``==``) against every element
    replace : object, default None
        replacement value; None means ``np.nan``

    Returns
    -------
    arr : ndarray[object]
        the same array object that was passed in
    """
    cdef:
        # Py_ssize_t rather than int so the index cannot overflow on
        # arrays with more than 2**31 - 1 elements
        Py_ssize_t i, length = arr.shape[0]

    if replace is None:
        replace = np.nan

    for i in range(length):
        if arr[i] == nan_rep:
            arr[i] = replace

    return arr


@cython.boundscheck(False)
@cython.wraparound(False)
def convert_json_to_lines(object arr):
    """
    replace comma separated json with line feeds, paying special attention
    to quotes & brackets

    Parameters
    ----------
    arr : object
        must provide ``encode('utf-8')`` (i.e. a text string)

    Returns
    -------
    str
        the input with every comma that is outside quotes and outside
        any ``{...}`` replaced by a newline
    """
    cdef:
        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
        bint in_quotes = False, is_escaping = False
        ndarray[uint8_t] narr
        # every sentinel byte is declared so comparisons stay at C level
        unsigned char v, comma, left_bracket, right_bracket, newline
        unsigned char quote, backslash

    newline = ord('\n')
    comma = ord(',')
    left_bracket = ord('{')
    right_bracket = ord('}')
    quote = ord('"')
    backslash = ord('\\')

    # work on a writable copy of the UTF-8 bytes
    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
    length = narr.shape[0]
    for i in range(length):
        v = narr[i]
        # a quote in the very first position is not treated as a toggle
        if v == quote and i > 0 and not is_escaping:
            in_quotes = not in_quotes
        # a backslash starts an escape; the following byte ends it
        if v == backslash or is_escaping:
            is_escaping = not is_escaping
        if v == comma:  # commas that should be \n
            if num_open_brackets_seen == 0 and not in_quotes:
                narr[i] = newline
        elif v == left_bracket:
            if not in_quotes:
                num_open_brackets_seen += 1
        elif v == right_bracket:
            if not in_quotes:
                num_open_brackets_seen -= 1

    # tobytes() is the non-deprecated spelling of tostring()
    return narr.tobytes().decode('utf-8')


@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, ndarray data_index,
                   int nlevels, ndarray cols, object writer):
    """
    Write the given data to ``writer`` in batches of rows.

    Parameters
    ----------
    data : list
        one indexable per column; ``data[i][j]`` is column i, row j
    data_index : ndarray
        row labels; each element is itself a sequence when nlevels > 1
    nlevels : int
        number of leading cells per row taken from ``data_index``
        (0 means no index cells are written)
    cols : ndarray
        only its length is used, as the number of data columns
    writer : object
        must provide ``writerows(list_of_rows)``
    """
    cdef:
        # Py_ssize_t rather than int so counters cannot overflow on very
        # large inputs
        Py_ssize_t N, j, i, ncols
        list rows

    # In crude testing, N>100 yields little marginal improvement
    N = 100

    # pre-allocate rows; the same N row lists are reused for every batch
    ncols = len(cols)
    rows = [[None] * (nlevels + ncols) for x in range(N)]

    # j stays -1 when data_index is empty, which skips the final flush
    j = -1
    if nlevels == 1:
        for j in range(len(data_index)):
            row = rows[j % N]
            row[0] = data_index[j]
            for i in range(ncols):
                row[1 + i] = data[i][j]

            # buffer is full: flush a complete batch
            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    elif nlevels > 1:
        for j in range(len(data_index)):
            row = rows[j % N]
            row[:nlevels] = list(data_index[j])
            for i in range(ncols):
                row[nlevels + i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)
    else:
        for j in range(len(data_index)):
            row = rows[j % N]
            for i in range(ncols):
                row[i] = data[i][j]

            if j >= N - 1 and j % N == N - 1:
                writer.writerows(rows)

    # flush the trailing partial batch, if the last row did not end one
    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
        writer.writerows(rows[:((j + 1) % N)])


# ------------------------------------------------------------------------------
# Groupby-related functions

Expand Down
34 changes: 34 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2225,3 +2225,37 @@ def _maybe_encode(values):
if values is None:
return []
return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values,
convert_empty=True):
"""
Convert specified values, including the given set na_values and empty
strings if convert_empty is True, to np.nan.

Parameters
----------
values : ndarray[object]
na_values : set
convert_empty : bool (default True)
"""
cdef:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring?

Py_ssize_t i, n
object val, onan
Py_ssize_t na_count = 0
dict memo = {}

n = len(values)
onan = np.nan

for i from 0 <= i < n:
val = values[i]
if (convert_empty and val == '') or (val in na_values):
values[i] = onan
na_count += 1
elif val in memo:
values[i] = memo[val]
else:
memo[val] = val

return na_count
26 changes: 1 addition & 25 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from tslibs.nattype import NaT
from tslibs.conversion cimport convert_to_tsobject
from tslibs.timedeltas cimport convert_to_timedelta64
from tslibs.timezones cimport get_timezone, tz_compare
from datetime import datetime, timedelta

iNaT = util.get_nat()

cdef bint PY2 = sys.version_info[0] == 2
Expand Down Expand Up @@ -1405,30 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
return objects


def sanitize_objects(ndarray[object] values, set na_values,
                     convert_empty=True):
    """
    Replace NA-like entries of ``values`` with np.nan, in place.

    An entry is replaced when it is contained in ``na_values`` or, if
    ``convert_empty`` is true, when it equals the empty string.  Other
    entries that compare equal to an earlier entry are replaced by that
    first-seen object.

    Parameters
    ----------
    values : ndarray[object]
    na_values : set
    convert_empty : bool (default True)

    Returns
    -------
    na_count : int
        number of entries that were replaced by np.nan
    """
    cdef:
        Py_ssize_t idx, total
        object item, nan_obj
        Py_ssize_t replaced = 0
        dict seen = {}

    total = len(values)
    nan_obj = np.nan

    for idx in range(total):
        item = values[idx]
        if (convert_empty and item == '') or (item in na_values):
            values[idx] = nan_obj
            replaced += 1
        elif item in seen:
            # reuse the first-seen equal object
            values[idx] = seen[item]
        else:
            seen[item] = item

    return replaced


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None):
cdef:
Expand Down
Loading