From 3a0fbfd4899b1c9abec4f024d3cccc8970882691 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Fri, 1 Feb 2019 12:56:05 -0800 Subject: [PATCH] PERF: use new to_records() argument in to_stata() (#25045) --- doc/source/whatsnew/v0.25.0.rst | 12 ++---------- pandas/io/stata.py | 20 +++++--------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 55b9957763ff6..09626be713c4f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -23,14 +23,6 @@ Other Enhancements - - -.. _whatsnew_0250.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - - - .. _whatsnew_0250.api_breaking: Backwards incompatible API changes @@ -69,8 +61,8 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- -- +- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) +- `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1b0660171ecac..0bd084f4e5df7 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2385,32 +2385,22 @@ def _prepare_data(self): data = self._convert_strls(data) # 3. Convert bad string data to '' and pad to correct length - dtypes = [] - data_cols = [] - has_strings = False + dtypes = {} native_byteorder = self._byteorder == _set_endianness(sys.byteorder) for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - has_strings = True data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,)) stype = 'S{type}'.format(type=typ) - dtypes.append(('c' + str(i), stype)) - string = data[col].str.encode(self._encoding) - data_cols.append(string.values.astype(stype)) + dtypes[col] = stype + data[col] = data[col].str.encode(self._encoding).astype(stype) else: - values = data[col].values dtype = data[col].dtype if not native_byteorder: dtype = dtype.newbyteorder(self._byteorder) - dtypes.append(('c' + str(i), dtype)) - data_cols.append(values) - dtypes = np.dtype(dtypes) + dtypes[col] = dtype - if has_strings or not native_byteorder: - self.data = np.fromiter(zip(*data_cols), dtype=dtypes) - else: - self.data = data.to_records(index=False) + self.data = data.to_records(index=False, column_dtypes=dtypes) def _write_data(self): data = self.data