From 54ab5be78fac125ae6efcd650d8d3f418e60754d Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 5 Sep 2016 15:37:30 -0400 Subject: [PATCH] BUG: compat with Stata ver 111 closes #11526 closes #14159 --- doc/source/whatsnew/v0.19.0.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 3 --- pandas/io/stata.py | 6 +++--- pandas/io/tests/data/stata7_111.dta | Bin 0 -> 1024 bytes pandas/io/tests/test_stata.py | 16 ++++++++++++++++ 5 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 pandas/io/tests/data/stata7_111.dta diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index ec37b6d6f5f13..f3a6736ff9920 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1327,6 +1327,7 @@ Other API Changes - More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) - ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) +- ``pd.read_stata()`` can now handle some format 111 files, which are produced by SAS when generating Stata dta files (:issue:`11526`) .. _whatsnew_0190.deprecations: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4aee6f72b1d53..0354a8046e873 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -33,8 +33,6 @@ Other enhancements - - .. _whatsnew_0200.api_breaking: Backwards incompatible API changes @@ -81,4 +79,3 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 25f13048a73fd..985ea9c051505 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -34,8 +34,8 @@ from pandas.tslib import NaT, Timestamp _version_error = ("Version of given Stata file is not 104, 105, 108, " - "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), " - "117 (Stata 13), or 118 (Stata 14)") + "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " + "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)") _statafile_processing_params1 = """\ convert_dates : boolean, defaults to True @@ -1183,7 +1183,7 @@ def _get_seek_variable_labels(self): def _read_old_header(self, first_char): self.format_version = struct.unpack('b', first_char)[0] - if self.format_version not in [104, 105, 108, 113, 114, 115]: + if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: raise ValueError(_version_error) self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[ 0] == 0x1 and '>' or '<' diff --git a/pandas/io/tests/data/stata7_111.dta b/pandas/io/tests/data/stata7_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..e87fa3a72ff8e5b1486c8350bc19d6917429f8e7 GIT binary patch literal 1024 zcmd02Vq{=p;9{T)`2YVuQzaprDhMekC#0Z?kOG9fs-cmdK{|wwlWkPY08@j|Im#L) zA@IQ-;t?3l;E)LQ8A1di@Sr{kO$Mey#330?hQk3W;E;kQ^PnCgAmES+m4Q(L4lo&* o2#kiA#^C@>s|+wxM4);;*k`~LLur`X7#uRuWF(+^I2^JV08VNl4*&oF literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 7752fff5247c0..1849b32a4a7c8 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -82,6 +82,8 @@ def setUp(self): self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta') self.dta23 = os.path.join(self.dirpath, 'stata15.dta') + self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta') + def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) @@ -1219,6 +1221,20 @@ def test_repeated_column_labels(self): read_stata(self.dta23, convert_categoricals=True) tm.assertTrue('wolof' in cm.exception) + def test_stata_111(self): + # 111 is an old version but still used by current versions of + # SAS when exporting to Stata format. We do not know of any + # on-line documentation for this version. + df = read_stata(self.dta24_111) + original = pd.DataFrame({'y': [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0], + 'x': [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6], + 'w': [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3], + 'z': ['a', 'b', 'c', 'd', 'e', '', 'g', 'h', + 'i', 'j']}) + original = original[['y', 'x', 'w', 'z']] + tm.assert_frame_equal(original, df) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)