Skip to content

Commit

Permalink
REF: Use PyUnicode_AsUTF8AndSize instead of get_c_string_buf_and_size
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Apr 12, 2024
1 parent b4493b6 commit 4fe49b1
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 55 deletions.
25 changes: 12 additions & 13 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable

WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from cpython.unicode cimport PyUnicode_AsUTF8

{{py:

Expand Down Expand Up @@ -98,7 +98,6 @@ from pandas._libs.khash cimport (
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA


Expand Down Expand Up @@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable):
cdef:
khiter_t k
const char *v
v = get_c_string(val)
v = PyUnicode_AsUTF8(val)

k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
Expand All @@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable):
int ret = 0
const char *v

v = get_c_string(key)
v = PyUnicode_AsUTF8(key)

k = kh_put_str(self.table, v, &ret)
if kh_exist_str(self.table, k):
Expand All @@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable):
raise MemoryError()
for i in range(n):
val = values[i]
v = get_c_string(val)
v = PyUnicode_AsUTF8(val)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
v = PyUnicode_AsUTF8(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
v = PyUnicode_AsUTF8(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable):
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
v = PyUnicode_AsUTF8(<str>repr(val))
vecs[i] = v

# compute
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ from cpython.object cimport (
Py_LT,
Py_NE,
)
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from libc.stdint cimport INT64_MAX

import_datetime()
Expand All @@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport (
npy_unit_to_abbrev,
npy_unit_to_attrname,
)
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size


cdef extern from "pandas/datetime/pd_datetime.h":
Expand Down Expand Up @@ -341,13 +341,13 @@ cdef int string_to_dts(
const char* format_buf
FormatRequirement format_requirement

buf = get_c_string_buf_and_size(val, &length)
buf = PyUnicode_AsUTF8AndSize(val, &length)
if format is None:
format_buf = b""
format_length = 0
format_requirement = INFER_FORMAT
else:
format_buf = get_c_string_buf_and_size(format, &format_length)
format_buf = PyUnicode_AsUTF8AndSize(format, &format_length)
format_requirement = <FormatRequirement>exact
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset,
Expand Down
14 changes: 6 additions & 8 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ from cpython.datetime cimport (
from datetime import timezone

from cpython.object cimport PyObject_Str
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cython cimport Py_ssize_t
from libc.string cimport strchr

Expand Down Expand Up @@ -74,10 +75,7 @@ import_pandas_datetime()

from pandas._libs.tslibs.strptime import array_strptime

from pandas._libs.tslibs.util cimport (
get_c_string_buf_and_size,
is_array,
)
from pandas._libs.tslibs.util cimport is_array


cdef extern from "pandas/portable.h":
Expand Down Expand Up @@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date(
int day = 1, month = 1, year
bint can_swap = 0

buf = get_c_string_buf_and_size(date_string, &length)
buf = PyUnicode_AsUTF8AndSize(date_string, &length)
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
# parsing MM?DD?YYYY and DD?MM?YYYY dates
month = _parse_2digit(buf)
Expand Down Expand Up @@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string):
Py_ssize_t length
int hour = -1, minute = -1

buf = get_c_string_buf_and_size(parse_string, &length)
buf = PyUnicode_AsUTF8AndSize(parse_string, &length)
if length >= 4:
if buf[1] == b":":
# h:MM format
Expand Down Expand Up @@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string):
char first
int error = 0

buf = get_c_string_buf_and_size(py_string, &length)
buf = PyUnicode_AsUTF8AndSize(py_string, &length)
if length >= 1:
first = buf[0]
if first == b"0":
Expand Down Expand Up @@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default,
pass

if 4 <= date_len <= 7:
buf = get_c_string_buf_and_size(date_string, &date_len)
buf = PyUnicode_AsUTF8AndSize(date_string, &date_len)
try:
i = date_string.index("Q", 1, 6)
if i == 1:
Expand Down
31 changes: 0 additions & 31 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

from cpython.object cimport PyTypeObject
from cpython.unicode cimport PyUnicode_AsUTF8AndSize


cdef extern from "Python.h":
Expand Down Expand Up @@ -155,36 +154,6 @@ cdef inline bint is_nan(object val):
return is_complex_object(val) and val != val


cdef inline const char* get_c_string_buf_and_size(str py_string,
Py_ssize_t *length) except NULL:
"""
Return the internal UTF-8 char* buffer of the str object `py_string`,
storing the length of that buffer in `length`.

Parameters
----------
py_string : str
length : Py_ssize_t*
    Receives the buffer length; may be NULL if the length is not needed.

Returns
-------
buf : const char*

Notes
-----
The Python object owns the memory, so the returned char* must not be freed.
"""
# Note: PyUnicode_AsUTF8AndSize() may allocate memory internally in the
# unlikely case that the underlying unicode object is not stored as UTF-8
# and its UTF-8 representation has not been requested before.
return PyUnicode_AsUTF8AndSize(py_string, length)


cdef inline const char* get_c_string(str py_string) except NULL:
"""Return the char* buffer of `py_string` when its length is not needed."""
# Passing NULL for the length pointer skips reporting the buffer size.
return get_c_string_buf_and_size(py_string, NULL)


cdef inline bytes string_encode_locale(str py_string):
"""
Encode `py_string` to bytes using the current system locale.

As opposed to PyUnicode_Encode, the current system locale is used for encoding.
"""
# errors=NULL: per the CPython C API docs the encoder then uses the
# "strict" error handler.
return PyUnicode_EncodeLocale(py_string, NULL)
Expand Down

0 comments on commit 4fe49b1

Please sign in to comment.