diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e3a9102fec395..5c6254c6a1ec7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ - +from cpython.unicode cimport PyUnicode_AsUTF8 {{py: @@ -98,7 +98,6 @@ from pandas._libs.khash cimport ( # VectorData # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA @@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = get_c_string(key) + v = PyUnicode_AsUTF8(key) k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): @@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable): raise MemoryError() for i in range(n): val = values[i] - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) vecs[i] = v with nogil: @@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. try: - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index aa01a05d0d932..61095b3f034fd 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -18,6 +18,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from libc.stdint cimport INT64_MAX import_datetime() @@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, npy_unit_to_attrname, ) -from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "pandas/datetime/pd_datetime.h": @@ -341,13 +341,13 @@ cdef int string_to_dts( const char* format_buf FormatRequirement format_requirement - buf = get_c_string_buf_and_size(val, &length) + buf = PyUnicode_AsUTF8AndSize(val, &length) if format is None: format_buf = b"" format_length = 0 format_requirement = INFER_FORMAT else: - format_buf = get_c_string_buf_and_size(format, &format_length) + format_buf = PyUnicode_AsUTF8AndSize(format, &format_length) format_requirement = exact return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 384df1cac95eb..85ef3fd93ff09 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -19,6 +19,7 @@ from cpython.datetime cimport ( from datetime import timezone from cpython.object cimport PyObject_Str +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -74,10 +75,7 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport ( - get_c_string_buf_and_size, - is_array, -) +from pandas._libs.tslibs.util cimport is_array cdef extern from "pandas/portable.h": @@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date( int day = 1, month = 1, year bint can_swap = 0 - buf = get_c_string_buf_and_size(date_string, &length) + buf = PyUnicode_AsUTF8AndSize(date_string, &length) if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]): # parsing MM?DD?YYYY and DD?MM?YYYY dates month = _parse_2digit(buf) @@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string): Py_ssize_t length int hour = -1, minute = -1 - buf = get_c_string_buf_and_size(parse_string, &length) + buf = PyUnicode_AsUTF8AndSize(parse_string, &length) if length >= 4: if buf[1] == b":": # h:MM format @@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string): char first int error = 0 - buf = get_c_string_buf_and_size(py_string, &length) + buf = PyUnicode_AsUTF8AndSize(py_string, &length) if length >= 1: first = buf[0] if first == b"0": @@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default, pass if 4 <= date_len <= 7: - buf = get_c_string_buf_and_size(date_string, &date_len) + buf = PyUnicode_AsUTF8AndSize(date_string, &date_len) try: i = date_string.index("Q", 1, 6) if i == 1: diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index a5822e57d3fa6..f144275e0ee6a 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,6 +1,5 @@ from cpython.object cimport PyTypeObject -from cpython.unicode cimport PyUnicode_AsUTF8AndSize cdef extern from "Python.h": @@ -155,36 +154,6 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline const char* get_c_string_buf_and_size(str py_string, - Py_ssize_t *length) except NULL: - """ - Extract internal char* buffer of unicode or bytes object `py_string` with - getting length of this internal buffer saved in `length`. - - Notes - ----- - Python object owns memory, thus returned char* must not be freed. - `length` can be NULL if getting buffer length is not needed. - - Parameters - ---------- - py_string : str - length : Py_ssize_t* - - Returns - ------- - buf : const char* - """ - # Note PyUnicode_AsUTF8AndSize() can - # potentially allocate memory inside in unlikely case of when underlying - # unicode object was stored as non-utf8 and utf8 wasn't requested before. - return PyUnicode_AsUTF8AndSize(py_string, length) - - -cdef inline const char* get_c_string(str py_string) except NULL: - return get_c_string_buf_and_size(py_string, NULL) - - cdef inline bytes string_encode_locale(str py_string): """As opposed to PyUnicode_Encode, use current system locale to encode.""" return PyUnicode_EncodeLocale(py_string, NULL)