Df ops #333

Closed
wants to merge 38 commits into from
Changes from 16 commits
Commits
38 commits
fa11831
Add failing test of basic operations
znicholls Oct 24, 2019
983102d
Fill out tests and pass a few
znicholls Oct 24, 2019
a7300b1
Do easy suggestions from @danielhuppmann
znicholls Nov 6, 2019
23f95b4
Fix meta check
znicholls Nov 6, 2019
c13a89d
migrate merging of `meta` in `append()` to own auxiliary function
danielhuppmann Nov 8, 2019
6ca4b6d
remove legacy issue for `pandas<0.23`
danielhuppmann Nov 8, 2019
cc99d67
refactor `merge_meta()` to more generic `left`, `right` args
danielhuppmann Nov 8, 2019
cb7edac
docstring formatting and clean-up
danielhuppmann Nov 8, 2019
fb0bb41
Update meta handling
znicholls Nov 9, 2019
3844fe5
Appease stickler
znicholls Nov 9, 2019
419c418
Appease stickler more
znicholls Nov 9, 2019
bf8074c
Rename join_col to axis
znicholls Nov 9, 2019
5c8933c
Implement new meta table if no scenarios are leftover
znicholls Nov 9, 2019
9aaf360
Add tests of what's left after dropping nan
znicholls Nov 9, 2019
594e6ab
re-add needed test fixtures
gidden Feb 15, 2020
c0f8770
stickler
gidden Feb 15, 2020
bd5262b
update docstring
gidden Feb 15, 2020
b8f67ee
update defaults for subtract function
gidden Feb 15, 2020
8f9d9ff
refactored ops into their own class for better testing and usage
gidden Feb 17, 2020
df56505
and now with the missing tests!
gidden Feb 17, 2020
2a48339
stickler
gidden Feb 17, 2020
1bc6df9
support ops with same df
gidden Feb 17, 2020
9ac97ea
simplify meta selection if two dfs are the same
gidden Feb 17, 2020
f2581fc
Add comments on how to extend
gidden Feb 17, 2020
b47a05e
Add inplace subtraction as well, to support within-dataframe ops
gidden Feb 18, 2020
3f32cd7
and now use original subtract method
gidden Feb 18, 2020
45600f4
extraneous keyword
gidden Feb 18, 2020
8e64360
move to generic op functions
gidden Feb 18, 2020
71cb9e4
now have generic inplace op too
gidden Feb 18, 2020
1e7ad7b
switch around order of kwargs
gidden Feb 18, 2020
e6bff75
better set meta
gidden Feb 18, 2020
5f3f232
just always merge meta
gidden Feb 18, 2020
2195e75
op_inplace -> op_axis
gidden Feb 18, 2020
b23f7e6
revamp tests
gidden Feb 18, 2020
36aaa28
fixed comment in utils
gidden Feb 18, 2020
2ae6116
add comments to tests for clarity
gidden Feb 18, 2020
6b00904
_axis -> _inplace
gidden Feb 19, 2020
a2dcf61
add divide
gidden Feb 26, 2020
112 changes: 77 additions & 35 deletions pyam/core.py
@@ -32,6 +32,7 @@
read_pandas,
format_data,
sort_data,
merge_meta,
to_int,
find_depth,
pattern_match,
@@ -226,10 +227,10 @@ def variables(self, include_units=False):

def append(self, other, ignore_meta_conflict=False, inplace=False,
**kwargs):
"""Append any castable object to this IamDataFrame.
"""Append any castable object to this ``IamDataFrame``

Columns in `other.meta` that are not in `self.meta` are always merged,
duplicate region-variable-unit-year rows raise a ValueError.
Columns in ``other.meta`` that are not in ``self.meta`` are merged.
Conflicting region-variable-unit-year rows raise a ``ValueError``.

Parameters
----------
@@ -253,44 +254,15 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,

ret = copy.deepcopy(self) if not inplace else self

diff = other.meta.index.difference(ret.meta.index)
intersect = other.meta.index.intersection(ret.meta.index)

# merge other.meta columns not in self.meta for existing scenarios
if not intersect.empty:
# if not ignored, check that overlapping meta dataframes are equal
if not ignore_meta_conflict:
cols = [i for i in other.meta.columns if i in ret.meta.columns]
if not ret.meta.loc[intersect, cols].equals(
other.meta.loc[intersect, cols]):
conflict_idx = (
pd.concat([ret.meta.loc[intersect, cols],
other.meta.loc[intersect, cols]]
).drop_duplicates()
.index.drop_duplicates()
)
msg = 'conflict in `meta` for scenarios {}'.format(
[i for i in pd.DataFrame(index=conflict_idx).index])
raise ValueError(msg)

cols = [i for i in other.meta.columns if i not in ret.meta.columns]
_meta = other.meta.loc[intersect, cols]
ret.meta = ret.meta.merge(_meta, how='outer',
left_index=True, right_index=True)

# join other.meta for new scenarios
if not diff.empty:
# sorting not supported by ` pd.append()` prior to version 23
sort_kwarg = {} if int(pd.__version__.split('.')[1]) < 23 \
else dict(sort=False)
ret.meta = ret.meta.append(other.meta.loc[diff, :], **sort_kwarg)
# merge `meta` tables
ret.meta = merge_meta(ret.meta, other.meta, ignore_meta_conflict)

# append other.data (verify integrity for no duplicates)
_data = ret.data.set_index(sorted(ret._LONG_IDX)).append(
other.data.set_index(sorted(other._LONG_IDX)),
verify_integrity=True)

# merge extra columns in `data` and set `LONG_IDX`
# merge extra columns in `data` and set `self._LONG_IDX`
ret.extra_cols += [i for i in other.extra_cols
if i not in ret.extra_cols]
ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols
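
As a usage sketch of the updated ``append()``, here is a minimal example with made-up wide-format data (the model, scenario and variable names are illustrative only):

import pandas as pd
from pyam import IamDataFrame

cols = ['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]

# two frames with disjoint scenarios
df = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 1.0, 2.0]],
    columns=cols))
other = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_b', 'World', 'Primary Energy', 'EJ/y', 3.0, 4.0]],
    columns=cols))

# scenarios from `other` are appended; columns in `other.meta` that are not
# in `df.meta` would be merged, conflicting rows would raise a ValueError
combined = df.append(other)  # returns a new IamDataFrame unless inplace=True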
@@ -1565,6 +1537,76 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
if not inplace:
return ret

def subtract(self, other, axis, new_name, ignore_meta_conflict=False):
"""
Subtract data in ``other`` from ``self``

Parameters
----------
other : pyam.IamDataFrame
Object containing timeseries data to subtract

axis : str
Column to use when subtracting the two sets of data
(e.g. ``variable``)

new_name : str
String to write in the ``axis`` column of the output timeseries
(e.g. ``variable 1 - variable 2``)

ignore_meta_conflict : bool, default False
If False and ``other`` is an IamDataFrame, raise an error if
any meta columns present in ``self`` and ``other`` are not
identical.

Raises
------
ValueError
The metadata columns in ``self`` and ``other`` are not identical

ValueError
``self`` or ``other`` contains more than one value for ``axis``

NotImplementedError
The type of ``other`` is not yet supported
"""
if not isinstance(other, IamDataFrame):
raise NotImplementedError

too_many_vals_error = "`{}` contains more than one `{}`"
if len(self[axis].unique()) > 1:
raise ValueError(too_many_vals_error.format("self", axis))

if len(other[axis].unique()) > 1:
raise ValueError(too_many_vals_error.format("other", axis))

# do this operation here so meta conflicts are raised early
out_meta = merge_meta(self.meta, other.meta, ignore_meta_conflict)

s_data = self.data.copy()
o_data = other.data.copy()

idx = s_data.columns.tolist()
idx_tmp = list(set(idx) - set([axis]) - {"value"})

s_data = s_data.set_index(idx_tmp).drop(axis, axis="columns")
o_data = o_data.set_index(idx_tmp).drop(axis, axis="columns")

res = (s_data - o_data).reset_index()
res[axis] = new_name

res = IamDataFrame(res)

# final meta wrangling
keep_meta_idx = out_meta.index.intersection(_meta_idx(res.data))
if keep_meta_idx.empty:
# nothing common after doing subtraction, stick with empty meta
pass
else:
# just keep the scenarios that survived the nan removal
res.meta = out_meta.loc[keep_meta_idx]

return res
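
As a usage sketch of ``subtract()``, here is a minimal example with made-up single-variable frames (names and values are illustrative only):

import pandas as pd
from pyam import IamDataFrame

cols = ['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]

total = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 5.0, 10.0]],
    columns=cols))
coal = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy|Coal', 'EJ/y', 3.0, 6.0]],
    columns=cols))

# subtract along the `variable` axis; the result holds the new variable name
non_coal = total.subtract(
    coal, axis='variable', new_name='Primary Energy|Non-Coal')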

def _meta_idx(data):
"""Return the `META_IDX` from `data` by index or columns"""
54 changes: 54 additions & 0 deletions pyam/utils.py
@@ -268,6 +268,60 @@ def sort_data(data, cols):
return data.sort_values(cols)[cols + ['value']].reset_index(drop=True)


def merge_meta(left, right, ignore_meta_conflict=False):
"""
Merge two ``meta`` tables

Parameters
----------
left : pd.DataFrame
First meta table

right : pd.DataFrame
Second meta table

ignore_meta_conflict : bool, default False
If False, raise an error if any meta columns present in ``left`` and
``right`` are not identical.

Raises
------
ValueError
Values are in conflict and ``ignore_meta_conflict`` is ``False``

Returns
-------
pd.DataFrame
Joined metadata tables
"""
left = left.copy() # make a copy to not change the original object
diff = right.index.difference(left.index)
sect = right.index.intersection(left.index)

# merge `right` into `left` for overlapping scenarios (`sect`)
if not sect.empty:
# if not ignored, check that overlapping `meta` columns are equal
if not ignore_meta_conflict:
cols = [i for i in right.columns if i in left.columns]
if not left.loc[sect, cols].equals(right.loc[sect, cols]):
conflict_idx = pd.concat(
[right.loc[sect, cols], left.loc[sect, cols]]
).drop_duplicates().index.drop_duplicates()

msg = 'conflict in `meta` for scenarios {}'.format(
[i for i in pd.DataFrame(index=conflict_idx).index])
raise ValueError(msg)
# merge new columns
cols = [i for i in right.columns if i not in left.columns]
left = left.merge(right.loc[sect, cols], how='outer',
left_index=True, right_index=True)

# append rows from `right` for new scenarios (`diff`)
if not diff.empty:
left = left.append(right.loc[diff, :], sort=False)

return left
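
A minimal sketch of how ``merge_meta()`` behaves, using hypothetical ``meta`` tables indexed by model and scenario:

import pandas as pd
from pyam.utils import merge_meta

idx = ['model', 'scenario']
left = pd.DataFrame(
    {'exclude': [False, False]},
    index=pd.MultiIndex.from_tuples(
        [('model_a', 'scen_a'), ('model_a', 'scen_b')], names=idx))
right = pd.DataFrame(
    {'exclude': [False], 'category': ['low']},
    index=pd.MultiIndex.from_tuples([('model_a', 'scen_b')], names=idx))

# the new `category` column is merged for the overlapping scenario `scen_b`;
# a differing `exclude` value there would raise a ValueError instead
merged = merge_meta(left, right)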

def find_depth(data, s='', level=None):
"""
return or assert the depth (number of `|`) of variables
133 changes: 133 additions & 0 deletions tests/conftest.py
@@ -67,6 +67,127 @@
columns=IAMC_IDX + [2005, 2010],
)

mg_ascen = ['MSG-GLB', 'a_scen']
mg_ascen_2 = ['MSG-GLB', 'a_scen_2']
CHECK_AGG_DF = pd.DataFrame([
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy', 'EJ/y', 1, 6],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.75, 5],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.25, 1],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3, 8],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1, 3],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2, 5],
['IMG', 'a_scen', 'R5REF', 'Primary Energy', 'EJ/y', 0.3, 0.6],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.15, 0.4],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.2],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1, 1.4],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.4, 0.6],
['IMG', 'a_scen', 'World', 'Primary Energy', 'EJ/y', 1.3, 6.6],
['IMG', 'a_scen', 'World', 'Primary Energy|Coal', 'EJ/y', 0.9, 5.4],
['IMG', 'a_scen', 'World', 'Primary Energy|Gas', 'EJ/y', 0.4, 1.2],
['IMG', 'a_scen', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4, 9.4],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.6, 3.8],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.4, 5.6],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy', 'EJ/y', 1.4, 6.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.95, 5.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.45, 1.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3.4, 8.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.2, 3.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.2],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy', 'EJ/y', 0.7, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.35, 0.6],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.35, 0.4],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1.4, 1.8],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.8, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy', 'EJ/y', 2.1, 7.4],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Coal', 'EJ/y', 1.3, 5.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Gas', 'EJ/y', 0.8, 1.6],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4.8, 10.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 2.0, 4.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.8, 6.0],
mg_ascen + ['R5ASIA', 'Primary Energy', 'EJ/y', 0.8, 5.8],
mg_ascen + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.65, 4.9],
mg_ascen + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 2.8, 7.8],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.9, 2.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 1.9, 4.9],
mg_ascen + ['R5REF', 'Primary Energy', 'EJ/y', 0.1, 0.4],
mg_ascen + ['R5REF', 'Primary Energy|Coal', 'EJ/y', 0.05, 0.3],
mg_ascen + ['R5REF', 'Primary Energy|Gas', 'EJ/y', 0.05, 0.1],
mg_ascen + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', 0.8, 1.2],
mg_ascen + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.5, 0.7],
mg_ascen + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.3, 0.5],
mg_ascen + ['World', 'Primary Energy', 'EJ/y', 0.9, 6.2],
mg_ascen + ['World', 'Primary Energy|Coal', 'EJ/y', 0.7, 5.2],
mg_ascen + ['World', 'Primary Energy|Gas', 'EJ/y', 0.2, 1.0],
mg_ascen + ['World', 'Emissions|CO2', 'Mt CO2/yr', 3.6, 9.0],
mg_ascen + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.4, 3.6],
mg_ascen + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy', 'EJ/y', -1.4, -6.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', -0.95, -5.2],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', -0.45, -1.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', -3.4, -8.4],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', -1.2, -3.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.2, -5.2],
mg_ascen_2 + ['R5REF', 'Primary Energy', 'EJ/y', -0.7, -1.0],
mg_ascen_2 + ['R5REF', 'Primary Energy|Coal', 'EJ/y', -0.35, -0.6],
mg_ascen_2 + ['R5REF', 'Primary Energy|Gas', 'EJ/y', -0.35, -0.4],
mg_ascen_2 + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', -1.4, -1.8],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', -0.8, -1.0],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', -0.6, -0.8],
mg_ascen_2 + ['World', 'Primary Energy', 'EJ/y', -2.1, -7.4],
mg_ascen_2 + ['World', 'Primary Energy|Coal', 'EJ/y', -1.3, -5.8],
mg_ascen_2 + ['World', 'Primary Energy|Gas', 'EJ/y', -0.8, -1.6],
mg_ascen_2 + ['World', 'Emissions|CO2', 'Mt CO2/yr', -5.0, -10.6],
mg_ascen_2 + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', -2.0, -4.2],
mg_ascen_2 + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.8, -6.0],
mg_ascen_2 + ['World', 'Emissions|CO2|Agg Agg', 'Mt CO2/yr', -0.2, -0.4],
mg_ascen_2 + ['World', 'Emissions|CF4', 'kt CF4/yr', 54, 56],
mg_ascen_2 + ['World', 'Emissions|C2F6', 'kt C2F6/yr', 32, 27],
mg_ascen_2 + ['World', 'Emissions|C2F6|Solvents', 'kt C2F6/yr', 30, 33],
mg_ascen_2 + ['World', 'Emissions|C2F6|Industry', 'kt C2F6/yr', 2, -6],
mg_ascen_2 + ['World', 'Emissions|CH4', 'Mt CH4/yr', 322, 217],
mg_ascen_2 + ['R5REF', 'Emissions|CH4', 'Mt CH4/yr', 30, 201],
mg_ascen_2 + ['R5ASIA', 'Emissions|CH4', 'Mt CH4/yr', 292, 16],
],
columns=IAMC_IDX + [2005, 2010],
)


ms = ['AIM', 'cscen']
CHECK_AGG_REGIONAL_DF = pd.DataFrame([
ms + ['World', 'Emissions|N2O', 'Mt N/yr', 1.9, 15.7],
ms + ['World', 'Emissions|N2O|AFOLU', 'Mt N/yr', 0.1, 0.1],
ms + ['World', 'Emissions|N2O|Ind', 'Mt N/yr', 1.8, 15.6],
ms + ['World', 'Emissions|N2O|Ind|Shipping', 'Mt N/yr', 1, 6],
ms + ['World', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 1.6, 3.8],
ms + ['World', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -0.8, 5.8],
ms + ['RASIA', 'Emissions|N2O', 'Mt N/yr', 0, 5.9],
ms + ['RASIA', 'Emissions|N2O|Ind', 'Mt N/yr', 0, 5.9],
ms + ['RASIA', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 0.8, 2.6],
ms + ['RASIA', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -0.8, 3.3],
ms + ['REUROPE', 'Emissions|N2O', 'Mt N/yr', 0.8, 3.7],
ms + ['REUROPE', 'Emissions|N2O|Ind', 'Mt N/yr', 0.8, 3.7],
ms + ['REUROPE', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 0.8, 1.2],
ms + ['REUROPE', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 0, 2.5],
ms + ['China', 'Emissions|N2O', 'Mt N/yr', 0.2, 1.3],
ms + ['China', 'Emissions|N2O|Ind', 'Mt N/yr', 0.2, 1.3],
ms + ['China', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 0.2, 1.3],
ms + ['Japan', 'Emissions|N2O', 'Mt N/yr', -1, 2],
ms + ['Japan', 'Emissions|N2O|Ind', 'Mt N/yr', -1, 2],
ms + ['Japan', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -1, 2],
ms + ['Germany', 'Emissions|N2O', 'Mt N/yr', 2, 3],
ms + ['Germany', 'Emissions|N2O|Ind', 'Mt N/yr', 2, 3],
ms + ['Germany', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 2, 3],
ms + ['UK', 'Emissions|N2O', 'Mt N/yr', -2, -0.5],
ms + ['UK', 'Emissions|N2O|Ind', 'Mt N/yr', -2, -0.5],
ms + ['UK', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -2, -0.5],

],
columns=IAMC_IDX + [2005, 2010],
)


TEST_STACKPLOT_DF = pd.DataFrame([
['World', 'Emissions|CO2|Energy|Oil', 'Mt CO2/yr', 2, 3.2, 2.0, 1.8],
@@ -137,6 +258,18 @@ def simple_df(request):
yield IamDataFrame(model='model_a', scenario='scen_a', data=_df)


@pytest.fixture(scope="function")
def check_aggregate_df():
df = IamDataFrame(data=CHECK_AGG_DF)
yield df


@pytest.fixture(scope="function")
def check_aggregate_regional_df():
df = IamDataFrame(data=CHECK_AGG_REGIONAL_DF)
yield df


@pytest.fixture(scope="function")
def reg_df():
df = IamDataFrame(data=REG_DF)