Skip to content

Commit

Permalink
BUG: byteorder option in to_stata is not honoured when writing strL d…
Browse files Browse the repository at this point in the history
…ata (pandas-dev#58970)

* BUG: byteorder option in to_stata is not honoured when writing strL data

* Check whether requested byteorder matches the current system once, and store the result
  • Loading branch information
cmjcharlton authored Jun 10, 2024
1 parent b290bf0 commit 629ffeb
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 5 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ I/O
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_stata` where the ``byteorder`` option was not honoured when writing strL data, e.g. with ``byteorder="big"``. (:issue:`58969`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
17 changes: 14 additions & 3 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3037,6 +3037,8 @@ def __init__(
if byteorder is None:
byteorder = sys.byteorder
self._byteorder = _set_endianness(byteorder)
# Flag whether chosen byteorder matches the system on which we're running
self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder)

gso_v_type = "I" # uint32
gso_o_type = "Q" # uint64
Expand All @@ -3049,13 +3051,20 @@ def __init__(
o_size = 6
else: # version == 119
o_size = 5
self._o_offet = 2 ** (8 * (8 - o_size))
if self._native_byteorder:
self._o_offet = 2 ** (8 * (8 - o_size))
else:
self._o_offet = 2 ** (8 * o_size)
self._gso_o_type = gso_o_type
self._gso_v_type = gso_v_type

def _convert_key(self, key: tuple[int, int]) -> int:
v, o = key
return v + self._o_offet * o
if self._native_byteorder:
return v + self._o_offet * o
else:
# v, o will be swapped when applying byteorder
return o + self._o_offet * v

def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
"""
Expand Down Expand Up @@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame:
]

if convert_cols:
ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
ssw = StataStrLWriter(
data, convert_cols, version=self._dta_version, byteorder=self._byteorder
)
tab, new_data = ssw.generate_table()
data = new_data
self._strl_blob = ssw.generate_blob(tab)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1678,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
formatted = df.loc[0, column + "_fmt"]
assert unformatted == formatted

def test_writer_117(self, temp_file):
@pytest.mark.parametrize("byteorder", ["little", "big"])
def test_writer_117(self, byteorder, temp_file):
original = DataFrame(
data=[
[
Expand Down Expand Up @@ -1736,6 +1737,7 @@ def test_writer_117(self, temp_file):
original.to_stata(
path,
convert_dates={"datetime": "tc"},
byteorder=byteorder,
convert_strl=["forced_strl"],
version=117,
)
Expand Down Expand Up @@ -1940,7 +1942,8 @@ def test_stata_119(self, datapath):
assert reader._nvar == 32999

@pytest.mark.parametrize("version", [118, 119, None])
def test_utf8_writer(self, version, temp_file):
@pytest.mark.parametrize("byteorder", ["little", "big"])
def test_utf8_writer(self, version, byteorder, temp_file):
cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
data = DataFrame(
[
Expand Down Expand Up @@ -1968,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file):
convert_strl=["strls"],
variable_labels=variable_labels,
write_index=False,
byteorder=byteorder,
version=version,
value_labels=value_labels,
)
Expand Down

0 comments on commit 629ffeb

Please sign in to comment.