diff --git a/doc/source/io.rst b/doc/source/io.rst index fd998d32cfbfb..c5b7eff292722 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -20,9 +20,9 @@ plt.close('all') import pandas.util.testing as tm - pd.options.display.max_rows=15 - clipdf = pd.DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, - index=['x','y','z']) + pd.options.display.max_rows = 15 + clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, + index=['x', 'y', 'z']) =============================== IO Tools (Text, CSV, HDF5, ...) @@ -256,7 +256,7 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default column. - If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date column. - - If ``{'foo' : [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. + - If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. A fast-path exists for iso8601-formatted dates. infer_datetime_format : boolean, default ``False`` If ``True`` and parse_dates is enabled for a column, attempt to infer the @@ -391,7 +391,7 @@ of :func:`~pandas.read_csv`: .. ipython:: python data = "col_1\n1\n2\n'A'\n4.22" - df = pd.read_csv(StringIO(data), converters={'col_1':str}) + df = pd.read_csv(StringIO(data), converters={'col_1': str}) df df['col_1'].apply(type).value_counts() @@ -789,7 +789,7 @@ The simplest case is to just pass in ``parse_dates=True``: .. ipython:: python :suppress: - f = open('foo.csv','w') + f = open('foo.csv', 'w') f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') f.close() @@ -863,7 +863,7 @@ data columns: date_spec = {'nominal': [1, 2], 'actual': [1, 3]} df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - index_col=0) #index is the nominal column + index_col=0) # index is the nominal column df .. note:: @@ -1336,7 +1336,7 @@ column specifications to the `read_fwf` function along with the file name: .. ipython:: python - #Column specifications are a list of half-intervals + # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) df @@ -1347,7 +1347,7 @@ column widths for contiguous columns: .. ipython:: python - #Widths are a list of integers + # Widths are a list of integers widths = [6, 14, 13, 10] df = pd.read_fwf('bar.csv', widths=widths, header=None) df @@ -1956,7 +1956,7 @@ Specify dtypes for conversion: .. ipython:: python - pd.read_json('test.json', dtype={'A' : 'float32', 'bools' : 'int8'}).dtypes + pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes Preserve string indices: @@ -2111,8 +2111,8 @@ For line-delimited json files, pandas can also return an iterator which reads in .. ipython:: python jsonl = ''' - {"a":1,"b":2} - {"a":3,"b":4} + {"a": 1, "b": 2} + {"a": 3, "b": 4} ''' df = pd.read_json(jsonl, lines=True) df @@ -2784,14 +2784,14 @@ Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - read_excel('path_to_file.xls',sheet_name=None) + read_excel('path_to_file.xls', sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - read_excel('path_to_file.xls',sheet_name=['Sheet1',3]) + read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. 
@@ -2812,10 +2812,10 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]}, - index=pd.MultiIndex.from_product([['a','b'],['c','d']])) + df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0,1]) + df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df If the index has level names, they will parsed as well, using the same @@ -2834,10 +2834,9 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2']) + df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], names=['c1', 'c2']) df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', - index_col=[0,1], header=[0,1]) + df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) df .. ipython:: python @@ -2868,7 +2867,7 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) -Element order is ignored, so ``usecols=[0,1]`` is the same as ``[1,0]``. +Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. Parsing Dates +++++++++++++ @@ -3095,7 +3094,7 @@ applications (CTRL-V on many operating systems). Here we illustrate writing a .. ipython:: python - df = pd.DataFrame(randn(5,3)) + df = pd.DataFrame(randn(5, 3)) df df.to_clipboard() pd.read_clipboard() @@ -3231,30 +3230,30 @@ both on the writing (serialization), and reading (deserialization). .. ipython:: python - df = pd.DataFrame(np.random.rand(5,2),columns=list('AB')) + df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) df.to_msgpack('foo.msg') pd.read_msgpack('foo.msg') - s = pd.Series(np.random.rand(5),index=pd.date_range('20130101',periods=5)) + s = pd.Series(np.random.rand(5), index=pd.date_range('20130101', periods=5)) You can pass a list of objects and you will receive them back on deserialization. .. ipython:: python - pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s) + pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s) pd.read_msgpack('foo.msg') You can pass ``iterator=True`` to iterate over the unpacked results: .. ipython:: python - for o in pd.read_msgpack('foo.msg',iterator=True): + for o in pd.read_msgpack('foo.msg', iterator=True): print(o) You can pass ``append=True`` to the writer to append to an existing pack: .. ipython:: python - df.to_msgpack('foo.msg',append=True) + df.to_msgpack('foo.msg', append=True) pd.read_msgpack('foo.msg') Unlike other io methods, ``to_msgpack`` is available on both a per-object basis, @@ -3264,7 +3263,8 @@ pandas objects. .. ipython:: python - pd.to_msgpack('foo2.msg', { 'dict' : [ { 'df' : df }, { 'string' : 'foo' }, { 'scalar' : 1. }, { 's' : s } ] }) + pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, + {'scalar': 1.}, {'s': s}]}) pd.read_msgpack('foo2.msg') .. ipython:: python @@ -3392,8 +3392,8 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) - df_tl.to_hdf('store_tl.h5','table',append=True) - pd.read_hdf('store_tl.h5', 'table', where = ['index>2']) + df_tl.to_hdf('store_tl.h5', 'table', append=True) + pd.read_hdf('store_tl.h5', 'table', where=['index>2']) .. ipython:: python :suppress: @@ -3411,17 +3411,17 @@ HDFStore will by default not drop rows that are all missing. This behavior can b ..
ipython:: python - df_with_missing = pd.DataFrame({'col1':[0, np.nan, 2], - 'col2':[1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], + 'col2': [1, np.nan, np.nan]}) df_with_missing df_with_missing.to_hdf('file.h5', 'df_with_missing', - format = 'table', mode='w') + format='table', mode='w') pd.read_hdf('file.h5', 'df_with_missing') df_with_missing.to_hdf('file.h5', 'df_with_missing', - format = 'table', mode='w', dropna=True) + format='table', mode='w', dropna=True) pd.read_hdf('file.h5', 'df_with_missing') @@ -3434,21 +3434,21 @@ This is also true for the major axis of a ``Panel``: .. ipython:: python - matrix = [[[np.nan, np.nan, np.nan],[1,np.nan,np.nan]], - [[np.nan, np.nan, np.nan], [np.nan,5,6]], - [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] + matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_major_axis_all_missing = pd.Panel(matrix, - items=['Item1', 'Item2','Item3'], - major_axis=[1,2], + panel_with_major_axis_all_missing = pd.Panel(matrix, + items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], minor_axis=['A', 'B', 'C']) panel_with_major_axis_all_missing panel_with_major_axis_all_missing.to_hdf('file.h5', 'panel', - dropna = True, - format='table', - mode='w') + dropna=True, + format='table', + mode='w') reloaded = pd.read_hdf('file.h5', 'panel') reloaded @@ -3596,13 +3596,13 @@ defaults to `nan`. .. ipython:: python - df_mixed = pd.DataFrame({ 'A' : randn(8), - 'B' : randn(8), - 'C' : np.array(randn(8),dtype='float32'), - 'string' :'string', - 'int' : 1, - 'bool' : True, - 'datetime64' : pd.Timestamp('20010102')}, + df_mixed = pd.DataFrame({'A': randn(8), + 'B': randn(8), + 'C': np.array(randn(8), dtype='float32'), + 'string': 'string', + 'int': 1, + 'bool': True, + 'datetime64': pd.Timestamp('20010102')}, index=list(range(8))) df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan @@ -3631,7 +3631,7 @@ storing/selecting from homogeneous index ``DataFrames``. columns=['A', 'B', 'C']) df_mi - store.append('df_mi',df_mi) + store.append('df_mi', df_mi) store.select('df_mi') # the levels are automatically included as data columns @@ -3679,15 +3679,15 @@ These rules are similar to how boolean expressions are used in pandas for indexi The following are valid expressions: -- ``'index>=date'`` -- ``"columns=['A', 'D']"`` +- ``'index >= date'`` +- ``"columns = ['A', 'D']"`` - ``"columns in ['A', 'D']"`` -- ``'columns=A'`` -- ``'columns==A'`` -- ``"~(columns=['A','B'])"`` -- ``'index>df.index[3] & string="bar"'`` -- ``'(index>df.index[3] & index<=df.index[6]) | string="bar"'`` -- ``"ts>=Timestamp('2012-02-01')"`` +- ``'columns = A'`` +- ``'columns == A'`` +- ``"~(columns = ['A', 'B'])"`` +- ``'index > df.index[3] & string = "bar"'`` +- ``'(index > df.index[3] & index <= df.index[6]) | string = "bar"'`` +- ``"ts >= Timestamp('2012-02-01')"`` - ``"major_axis>=20130101"`` The ``indexers`` are on the left-hand side of the sub-expression: @@ -3699,7 +3699,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` - strings, e.g. ``"bar"`` - date-like, e.g. ``20130101``, or ``"20130101"`` -- lists, e.g. ``"['A','B']"`` +- lists, e.g. ``"['A', 'B']"`` - variables that are defined in the local names space, e.g. ``date`` .. note:: @@ -3737,26 +3737,27 @@ Here are some examples: ..
ipython:: python - dfq = pd.DataFrame(randn(10,4),columns=list('ABCD'),index=pd.date_range('20130101',periods=10)) - store.append('dfq',dfq,format='table',data_columns=True) + dfq = pd.DataFrame(randn(10, 4), columns=list('ABCD'), + index=pd.date_range('20130101', periods=10)) + store.append('dfq', dfq, format='table', data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq',"index>pd.Timestamp('20130104') & columns=['A', 'B']") + store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") Use and inline column reference .. ipython:: python - store.select('dfq',where="A>0 or C>0") + store.select('dfq', where="A>0 or C>0") Works with a Panel as well. .. ipython:: python - store.append('wp',wp) + store.append('wp', wp) store store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") @@ -3777,7 +3778,7 @@ space. These are in terms of the total number of rows in a table. wp.to_frame() # limiting the search - store.select('wp',"major_axis>20000102 & minor_axis=['A','B']", + store.select('wp', "major_axis>20000102 & minor_axis=['A', 'B']", start=0, stop=10) .. note:: @@ -3801,11 +3802,11 @@ specified in the format: ``<float>(<unit>)``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) - dftd['C'] = dftd['A']-dftd['B'] + dftd = pd.DataFrame(dict(A=pd.Timestamp('20130101'), B=[pd.Timestamp('20130101') + timedelta(days=i, seconds=10) for i in range(10)])) + dftd['C'] = dftd['A'] - dftd['B'] dftd - store.append('dftd',dftd,data_columns=True) - store.select('dftd',"C<'-3.5D'") + store.append('dftd', dftd, data_columns=True) + store.select('dftd', "C<'-3.5D'") Indexing ++++++++ @@ -3837,10 +3838,10 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. ipython:: python - df_1 = pd.DataFrame(randn(10,2),columns=list('AB')) - df_2 = pd.DataFrame(randn(10,2),columns=list('AB')) + df_1 = pd.DataFrame(randn(10, 2), columns=list('AB')) + df_2 = pd.DataFrame(randn(10, 2), columns=list('AB')) - st = pd.HDFStore('appends.h5',mode='w') + st = pd.HDFStore('appends.h5', mode='w') st.append('df', df_1, data_columns=['B'], index=False) st.append('df', df_2, data_columns=['B'], index=False) st.get_storer('df').table @@ -3878,15 +3879,15 @@ be ``data_columns``. df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4:6], 'string'] = np.nan - df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' + df_dc.loc[df_dc.index[4: 6], 'string'] = np.nan + df_dc.loc[df_dc.index[7: 9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1:3], ['B','C']] = 1.0 + df_dc.loc[df_dc.index[1: 3], ['B', 'C']] = 1.0 df_dc # on-disk operations store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) - store.select('df_dc', where='B>0') + store.select('df_dc', where='B > 0') # getting creative store.select('df_dc', 'B > 0 & C > 0 & string == foo') @@ -3941,9 +3942,9 @@ chunks.
store.append('dfeq', dfeq, data_columns=['number']) def chunks(l, n): - return [l[i:i+n] for i in range(0, len(l), n)] + return [l[i: i+n] for i in range(0, len(l), n)] - evens = [2,4,6,8,10] + evens = [2, 4, 6, 8, 10] coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): print(store.select('dfeq', where=c)) @@ -3975,11 +3976,12 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) - store.append('df_coord',df_coord) - c = store.select_as_coordinates('df_coord','index>20020101') - c.summary() - store.select('df_coord',where=c) + df_coord = pd.DataFrame(np.random.randn(1000, 2), + index=pd.date_range('20000101', periods=1000)) + store.append('df_coord', df_coord) + c = store.select_as_coordinates('df_coord', 'index > 20020101') + c + store.select('df_coord', where=c) .. _io.hdf5-where_mask: @@ -3992,11 +3994,12 @@ a datetimeindex which are 5. .. ipython:: python - df_mask = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) - store.append('df_mask',df_mask) - c = store.select_column('df_mask','index') - where = c[pd.DatetimeIndex(c).month==5].index - store.select('df_mask',where=where) + df_mask = pd.DataFrame(np.random.randn(1000, 2), + index=pd.date_range('20000101', periods=1000)) + store.append('df_mask', df_mask) + c = store.select_column('df_mask', 'index') + where = c[pd.DatetimeIndex(c).month == 5].index + store.select('df_mask', where=where) Storer Object ^^^^^^^^^^^^^ @@ -4095,7 +4098,7 @@ the table using a ``where`` that selects all but the missing data. .. ipython:: python # returns the number of rows deleted - store.remove('wp', 'major_axis>20000102' ) + store.remove('wp', 'major_axis > 20000102') store.select('wp') .. warning:: @@ -4171,7 +4174,8 @@ Enable compression for all objects within the file: .. code-block:: python - store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc:blosclz') + store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, + complib='blosc:blosclz') Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled: @@ -4266,13 +4270,13 @@ stored in a more efficient manner. .. ipython:: python - dfcat = pd.DataFrame({ 'A' : pd.Series(list('aabbcdba')).astype('category'), - 'B' : np.random.randn(8) }) + dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), + 'B': np.random.randn(8)}) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') cstore.append('dfcat', dfcat, format='table', data_columns=['A']) - result = cstore.select('dfcat', where="A in ['b','c']") + result = cstore.select('dfcat', where="A in ['b', 'c']") result result.dtypes @@ -4309,16 +4313,16 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * ..
ipython:: python - dfs = pd.DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) + dfs = pd.DataFrame(dict(A='foo', B='bar'), index=list(range(5))) dfs # A and B have a size of 30 - store.append('dfs', dfs, min_itemsize = 30) + store.append('dfs', dfs, min_itemsize=30) store.get_storer('dfs').table # A is created as a data_column with a size of 30 # B is size is calculated - store.append('dfs2', dfs, min_itemsize = { 'A' : 30 }) + store.append('dfs2', dfs, min_itemsize={'A': 30}) store.get_storer('dfs2').table **nan_rep** @@ -4328,7 +4332,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame(dict(A = ['foo','bar','nan'])) + dfss = pd.DataFrame(dict(A=['foo', 'bar', 'nan'])) dfss store.append('dfss', dfss) @@ -4358,7 +4362,7 @@ It is possible to write an ``HDFStore`` object that can easily be imported into np.random.seed(1) df_for_r = pd.DataFrame({"first": np.random.rand(100), "second": np.random.rand(100), - "class": np.random.randint(0, 2, (100,))}, + "class": np.random.randint(0, 2, (100, ))}, index=range(100)) df_for_r.head() @@ -4596,7 +4600,8 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) + result = pd.read_parquet('example_fp.parquet', + engine='fastparquet', columns=['a', 'b']) result.dtypes @@ -4846,7 +4851,8 @@ variant appropriate for your database. from pandas.io import sql sql.execute('SELECT * FROM table_name', engine) - sql.execute('INSERT INTO table_name VALUES(?, ?, ?)', engine, params=[('id', 1, 12.2, True)]) + sql.execute('INSERT INTO table_name VALUES(?, ?, ?)', engine, + params=[('id', 1, 12.2, True)]) Engine connection examples @@ -4888,7 +4894,8 @@ Use :func:`sqlalchemy.text` to specify query parameters in a backend-neutral way .. 
ipython:: python import sqlalchemy as sa - pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'), engine, params={'col1': 'X'}) + pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'), + engine, params={'col1': 'X'}) If you have an SQLAlchemy description of your database you can express where conditions using SQLAlchemy expressions @@ -5306,34 +5313,35 @@ And here's the code: sql_db.close() def test_hdf_fixed_write(df): - df.to_hdf('test_fixed.hdf','test',mode='w') + df.to_hdf('test_fixed.hdf', 'test', mode='w') def test_hdf_fixed_read(): - pd.read_hdf('test_fixed.hdf','test') + pd.read_hdf('test_fixed.hdf', 'test') def test_hdf_fixed_write_compress(df): - df.to_hdf('test_fixed_compress.hdf','test',mode='w',complib='blosc') + df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') def test_hdf_fixed_read_compress(): - pd.read_hdf('test_fixed_compress.hdf','test') + pd.read_hdf('test_fixed_compress.hdf', 'test') def test_hdf_table_write(df): - df.to_hdf('test_table.hdf','test',mode='w',format='table') + df.to_hdf('test_table.hdf', 'test', mode='w', format='table') def test_hdf_table_read(): - pd.read_hdf('test_table.hdf','test') + pd.read_hdf('test_table.hdf', 'test') def test_hdf_table_write_compress(df): - df.to_hdf('test_table_compress.hdf','test',mode='w',complib='blosc',format='table') + df.to_hdf('test_table_compress.hdf', 'test', mode='w', + complib='blosc', format='table') def test_hdf_table_read_compress(): - pd.read_hdf('test_table_compress.hdf','test') + pd.read_hdf('test_table_compress.hdf', 'test') def test_csv_write(df): - df.to_csv('test.csv',mode='w') + df.to_csv('test.csv', mode='w') def test_csv_read(): - pd.read_csv('test.csv',index_col=0) + pd.read_csv('test.csv', index_col=0) def test_feather_write(df): df.to_feather('test.feather')