From 6a95c81dc17e027168345a574aa60921ec3c2e69 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 3 Apr 2017 08:23:38 -0400
Subject: [PATCH] update docs as per review

---
 doc/source/io.rst          |  8 +++++++-
 doc/source/options.rst     |  4 +++-
 pandas/core/config_init.py |  2 +-
 pandas/core/frame.py       |  6 +++---
 pandas/io/parquet.py       | 27 ++++++++++++++-------------
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 40ff2d9657b06f..1a8aa90429eede 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4529,7 +4529,13 @@ Several caveats.
 
 - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message on an attempt at serialization.
 
-See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`
+See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__
+
+.. note::
+
+   These engines are very similar and should read/write nearly identical parquet format files.
+   They differ in their underlying dependencies: ``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library.
+   TODO: differing options to write non-standard columns & null treatment
 
 .. ipython:: python
 
diff --git a/doc/source/options.rst b/doc/source/options.rst
index 1b219f640cc87b..bdb7ce526923ba 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -416,6 +416,8 @@
 io.hdf.default_format        None           default format writing format,
                                             'table'
 io.hdf.dropna_table          True           drop ALL nan rows when appending
                                             to a table
+io.parquet.engine            pyarrow        The engine to use as a default for
+                                            parquet reading and writing.
 mode.chained_assignment      warn           Raise an exception, warn, or no action
                                             if trying to use chained assignment, The default is warn
@@ -538,4 +540,4 @@ Only ``'display.max_rows'`` are serialized and published.
 .. ipython:: python
    :suppress:
 
-   pd.reset_option('display.html.table_schema')
\ No newline at end of file
+   pd.reset_option('display.html.table_schema')
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 6822041aab1d40..6d07288c4660b9 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -471,7 +471,7 @@ def _register_xlsx(engine, other):
 parquet_engine_doc = """
 : string
     The default parquet reader/writer engine. Available options:
-    None, 'pyarrow', 'fastparquet'
+    'pyarrow', 'fastparquet'; the default is 'pyarrow'
 """
 
 with cf.config_prefix('io.parquet'):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 834f680db6437a..eee14eaf0904af 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1531,9 +1531,9 @@ def to_parquet(self, fname, engine=None, compression=None,
         ----------
         fname : str
             string file path
-        engine : parquet engine
-            supported are {'pyarrow', 'fastparquet'}
-            if None, will use the option: io.parquet.engine
+        engine : str, optional
+            The parquet engine, one of {'pyarrow', 'fastparquet'};
+            if None, will use the option ``io.parquet.engine``
         compression : str, optional
             compression method, includes {'gzip', 'snappy', 'brotli'}
         kwargs passed to the engine
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index e6e601c8d06301..95b186dd759f17 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -81,16 +81,16 @@ def read(self, path):
 
 def to_parquet(df, path, engine=None, compression=None, **kwargs):
     """
-    Write a DataFrame to the pyarrow
+    Write a DataFrame to the parquet format.
 
     Parameters
     ----------
     df : DataFrame
     path : string
         File path
-    engine : parquet engine
-        supported are {'pyarrow', 'fastparquet'}
-        if None, will use the option: io.parquet.engine
+    engine : str, optional
+        The parquet engine, one of {'pyarrow', 'fastparquet'};
+        if None, will use the option ``io.parquet.engine``
     compression : str, optional
         compression method, includes {'gzip', 'snappy', 'brotli'}
     kwargs are passed to the engine
@@ -110,15 +110,16 @@ def to_parquet(df, path, engine=None, compression=None, **kwargs):
 
     # raise on anything else as we don't serialize the index
     if not isinstance(df.index, Int64Index):
-        raise ValueError("parquet does not serializing {} "
+        raise ValueError("parquet does not support serializing {} "
                          "for the index; you can .reset_index()"
                          "to make the index into column(s)".format(
                              type(df.index)))
 
     if not df.index.equals(RangeIndex.from_range(range(len(df)))):
-        raise ValueError("parquet does not serializing a non-default index "
-                         "for the index; you can .reset_index()"
-                         "to make the index into column(s)")
+        raise ValueError("parquet does not support serializing a "
+                         "non-default index for the index; you "
+                         "can .reset_index() to make the index "
+                         "into column(s)")
 
     if df.index.name is not None:
         raise ValueError("parquet does not serialize index meta-data on a "
@@ -136,7 +137,7 @@ def to_parquet(df, path, engine=None, compression=None, **kwargs):
 
 def read_parquet(path, engine=None, **kwargs):
     """
-    Load a parquet object from the file path
+    Load a parquet object from the file path, returning a DataFrame.
 
     .. versionadded 0.20.0
 
@@ -143,13 +144,13 @@ def read_parquet(path, engine=None, **kwargs):
     Parameters
     ----------
     path : string
         File path
-    engine : parquet engine
-        supported are {'pyarrow', 'fastparquet'}
-        if None, will use the option: io.parquet.engine
+    engine : str, optional
+        The parquet engine, one of {'pyarrow', 'fastparquet'};
+        if None, will use the option ``io.parquet.engine``
     kwargs are passed to the engine
 
     Returns
     -------
-    type of object stored in file
+    DataFrame
     """
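
A minimal usage sketch of the API these docstrings describe (not part of the patch). It assumes a pandas build that includes this branch, with read_parquet exported at the top level and pyarrow or fastparquet installed; the file name "example.parquet" is illustrative.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # engine=None falls back to the 'io.parquet.engine' option ('pyarrow'
    # by default); compression is one of {'gzip', 'snappy', 'brotli'}.
    df.to_parquet("example.parquet", compression="gzip")
    result = pd.read_parquet("example.parquet")

    # An explicit engine argument overrides the option.
    pd.set_option("io.parquet.engine", "fastparquet")
    result = pd.read_parquet("example.parquet", engine="pyarrow")

    # Only a default RangeIndex is serialized; anything else raises
    # ValueError, so reset the index into columns first.
    df.set_index("a").reset_index().to_parquet("example.parquet")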