Df ops #333

Closed
wants to merge 38 commits into from
Changes from 16 commits
Commits
38 commits
fa11831
Add failing test of basic operations
znicholls Oct 24, 2019
983102d
Fill out tests and pass a few
znicholls Oct 24, 2019
a7300b1
Do easy suggestions from @danielhuppmann
znicholls Nov 6, 2019
23f95b4
Fix meta check
znicholls Nov 6, 2019
c13a89d
migrate merging of `meta` in `append()` to own auxiliary function
danielhuppmann Nov 8, 2019
6ca4b6d
remove legacy issue for `pandas<0.23`
danielhuppmann Nov 8, 2019
cc99d67
refactor `merge_meta()` to more generic `left`, `right` args
danielhuppmann Nov 8, 2019
cb7edac
docstring formatting and clean-up
danielhuppmann Nov 8, 2019
fb0bb41
Update meta handling
znicholls Nov 9, 2019
3844fe5
Appease stickler
znicholls Nov 9, 2019
419c418
Appease stickler more
znicholls Nov 9, 2019
bf8074c
Rename join_col to axis
znicholls Nov 9, 2019
5c8933c
Implement new meta table if no scenarios are leftover
znicholls Nov 9, 2019
9aaf360
Add tests of what's left after dropping nan
znicholls Nov 9, 2019
594e6ab
re-add needed test fixtures
gidden Feb 15, 2020
c0f8770
stickler
gidden Feb 15, 2020
bd5262b
update docstring
gidden Feb 15, 2020
b8f67ee
update defaults for subtract function
gidden Feb 15, 2020
8f9d9ff
refactored ops into their own class for better testing and usage
gidden Feb 17, 2020
df56505
and now with the missing tests!
gidden Feb 17, 2020
2a48339
stickler
gidden Feb 17, 2020
1bc6df9
support ops with same df
gidden Feb 17, 2020
9ac97ea
simplify meta selection if two dfs are the same
gidden Feb 17, 2020
f2581fc
Add comments on how to extend
gidden Feb 17, 2020
b47a05e
Add inplace subtraction as well, to support within-dataframe ops
gidden Feb 18, 2020
3f32cd7
and now use original subtract method
gidden Feb 18, 2020
45600f4
extraneous keyword
gidden Feb 18, 2020
8e64360
move to generic op functions
gidden Feb 18, 2020
71cb9e4
now have generic inplace op too
gidden Feb 18, 2020
1e7ad7b
switch around order of kwargs
gidden Feb 18, 2020
e6bff75
better set meta
gidden Feb 18, 2020
5f3f232
just always merge meta
gidden Feb 18, 2020
2195e75
op_inplace -> op_axis
gidden Feb 18, 2020
b23f7e6
revamp tests
gidden Feb 18, 2020
36aaa28
fixed comment in utils
gidden Feb 18, 2020
2ae6116
add comments to tests for clarity
gidden Feb 18, 2020
6b00904
_axis -> _inplace
gidden Feb 19, 2020
a2dcf61
add divide
gidden Feb 26, 2020
112 changes: 77 additions & 35 deletions pyam/core.py
@@ -32,6 +32,7 @@
read_pandas,
format_data,
sort_data,
merge_meta,
to_int,
find_depth,
pattern_match,
@@ -226,10 +227,10 @@ def variables(self, include_units=False):

def append(self, other, ignore_meta_conflict=False, inplace=False,
**kwargs):
"""Append any castable object to this IamDataFrame.
"""Append any castable object to this ``IamDataFrame``

Columns in `other.meta` that are not in `self.meta` are always merged,
duplicate region-variable-unit-year rows raise a ValueError.
Columns in ``other.meta`` that are not in ``self.meta`` are merged.
Conflicting region-variable-unit-year rows raise a ``ValueError``.

Parameters
----------
@@ -253,44 +254,15 @@ def append(self, other, ignore_meta_conflict=False, inplace=False,

ret = copy.deepcopy(self) if not inplace else self

diff = other.meta.index.difference(ret.meta.index)
intersect = other.meta.index.intersection(ret.meta.index)

# merge other.meta columns not in self.meta for existing scenarios
if not intersect.empty:
# if not ignored, check that overlapping meta dataframes are equal
if not ignore_meta_conflict:
cols = [i for i in other.meta.columns if i in ret.meta.columns]
if not ret.meta.loc[intersect, cols].equals(
other.meta.loc[intersect, cols]):
conflict_idx = (
pd.concat([ret.meta.loc[intersect, cols],
other.meta.loc[intersect, cols]]
).drop_duplicates()
.index.drop_duplicates()
)
msg = 'conflict in `meta` for scenarios {}'.format(
[i for i in pd.DataFrame(index=conflict_idx).index])
raise ValueError(msg)

cols = [i for i in other.meta.columns if i not in ret.meta.columns]
_meta = other.meta.loc[intersect, cols]
ret.meta = ret.meta.merge(_meta, how='outer',
left_index=True, right_index=True)

# join other.meta for new scenarios
if not diff.empty:
# sorting not supported by ` pd.append()` prior to version 23
sort_kwarg = {} if int(pd.__version__.split('.')[1]) < 23 \
else dict(sort=False)
ret.meta = ret.meta.append(other.meta.loc[diff, :], **sort_kwarg)
# merge `meta` tables
ret.meta = merge_meta(ret.meta, other.meta, ignore_meta_conflict)

# append other.data (verify integrity for no duplicates)
_data = ret.data.set_index(sorted(ret._LONG_IDX)).append(
other.data.set_index(sorted(other._LONG_IDX)),
verify_integrity=True)

# merge extra columns in `data` and set `LONG_IDX`
# merge extra columns in `data` and set `self._LONG_IDX`
ret.extra_cols += [i for i in other.extra_cols
if i not in ret.extra_cols]
ret._LONG_IDX = IAMC_IDX + [ret.time_col] + ret.extra_cols
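
As a usage sketch of the updated ``append()``, here is a minimal example with made-up wide-format data (the model, scenario and variable names are illustrative only):

import pandas as pd
from pyam import IamDataFrame

cols = ['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]

# two frames with disjoint scenarios
df = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 1.0, 2.0]],
    columns=cols))
other = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_b', 'World', 'Primary Energy', 'EJ/y', 3.0, 4.0]],
    columns=cols))

# scenarios from `other` are appended; columns in `other.meta` that are not
# in `df.meta` would be merged, conflicting rows would raise a ValueError
combined = df.append(other)  # returns a new IamDataFrame unless inplace=True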
@@ -1565,6 +1537,76 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
if not inplace:
return ret

def subtract(self, other, axis, new_name, ignore_meta_conflict=False):
"""
Subtract data in ``other`` from ``self``

Parameters
----------
other : pyam.IamDataFrame
Object containing timeseries data to subtract

axis : str
Column to use when subtracting the two sets of data
(e.g. ``variable``)

new_name : str
String to write in the ``axis`` column of the output timeseries
(e.g. ``variable 1 - variable 2``)

ignore_meta_conflict : bool, default False
If False and ``other`` is an IamDataFrame, raise an error if
any meta columns present in ``self`` and ``other`` are not
identical.

Raises
------
ValueError
The metadata columns in ``self`` and ``other`` are not identical

ValueError
``self`` or ``other`` contains more than one value for ``axis``

NotImplementedError
The type of ``other`` is not yet supported
"""
if not isinstance(other, IamDataFrame):
raise NotImplementedError

too_many_vals_error = "`{}` contains more than one `{}`"
if len(self[axis].unique()) > 1:
raise ValueError(too_many_vals_error.format("self", axis))

if len(other[axis].unique()) > 1:
raise ValueError(too_many_vals_error.format("other", axis))

# do this operation here so meta conflicts are raised early
out_meta = merge_meta(self.meta, other.meta, ignore_meta_conflict)

s_data = self.data.copy()
o_data = other.data.copy()

idx = s_data.columns.tolist()
idx_tmp = list(set(idx) - set([axis]) - {"value"})

s_data = s_data.set_index(idx_tmp).drop(axis, axis="columns")
o_data = o_data.set_index(idx_tmp).drop(axis, axis="columns")

res = (s_data - o_data).reset_index()
res[axis] = new_name

res = IamDataFrame(res)

# final meta wrangling
keep_meta_idx = out_meta.index.intersection(_meta_idx(res.data))
if keep_meta_idx.empty:
# nothing common after doing subtraction, stick with empty meta
pass
else:
# just keep the scenarios that survived the nan removal
res.meta = out_meta.loc[keep_meta_idx]

return res
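
As a usage sketch of ``subtract()``, here is a minimal example with made-up single-variable frames (names and values are illustrative only):

import pandas as pd
from pyam import IamDataFrame

cols = ['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]

total = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 5.0, 10.0]],
    columns=cols))
coal = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy|Coal', 'EJ/y', 3.0, 6.0]],
    columns=cols))

# subtract along the `variable` axis; the result holds the new variable name
non_coal = total.subtract(
    coal, axis='variable', new_name='Primary Energy|Non-Coal')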

def _meta_idx(data):
"""Return the `META_IDX` from `data` by index or columns"""
54 changes: 54 additions & 0 deletions pyam/utils.py
@@ -268,6 +268,60 @@ def sort_data(data, cols):
return data.sort_values(cols)[cols + ['value']].reset_index(drop=True)


def merge_meta(left, right, ignore_meta_conflict=False):
"""
Merge two ``meta`` tables

Parameters
----------
left : pd.DataFrame
First meta table

right : pd.DataFrame
Second meta table

ignore_meta_conflict : bool, default False
If False, raise an error if any meta columns present in ``left`` and
``right`` are not identical.

Raises
------
ValueError
Values are in conflict and ``ignore_meta_conflict`` is ``False``

Returns
-------
pd.DataFrame
Joined metadata tables
"""
left = left.copy() # make a copy to not change the original object
diff = right.index.difference(left.index)
sect = right.index.intersection(left.index)

# merge `right` into `left` for overlapping scenarios (`sect`)
if not sect.empty:
# if not ignored, check that overlapping `meta` columns are equal
if not ignore_meta_conflict:
cols = [i for i in right.columns if i in left.columns]
if not left.loc[sect, cols].equals(right.loc[sect, cols]):
conflict_idx = pd.concat(
[right.loc[sect, cols], left.loc[sect, cols]]
).drop_duplicates().index.drop_duplicates()

msg = 'conflict in `meta` for scenarios {}'.format(
[i for i in pd.DataFrame(index=conflict_idx).index])
raise ValueError(msg)
# merge new columns
cols = [i for i in right.columns if i not in left.columns]
left = left.merge(right.loc[sect, cols], how='outer',
left_index=True, right_index=True)

# append rows from `right` for new scenarios (`diff`)
if not diff.empty:
left = left.append(right.loc[diff, :], sort=False)

return left
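
A minimal sketch of how ``merge_meta()`` behaves, using hypothetical ``meta`` tables indexed by model and scenario:

import pandas as pd
from pyam.utils import merge_meta

idx = ['model', 'scenario']
left = pd.DataFrame(
    {'exclude': [False, False]},
    index=pd.MultiIndex.from_tuples(
        [('model_a', 'scen_a'), ('model_a', 'scen_b')], names=idx))
right = pd.DataFrame(
    {'exclude': [False], 'category': ['low']},
    index=pd.MultiIndex.from_tuples([('model_a', 'scen_b')], names=idx))

# the new `category` column is merged for the overlapping scenario `scen_b`;
# a differing `exclude` value there would raise a ValueError instead
merged = merge_meta(left, right)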

def find_depth(data, s='', level=None):
"""
return or assert the depth (number of `|`) of variables
133 changes: 133 additions & 0 deletions tests/conftest.py
@@ -67,6 +67,127 @@
columns=IAMC_IDX + [2005, 2010],
)

mg_ascen = ['MSG-GLB', 'a_scen']
mg_ascen_2 = ['MSG-GLB', 'a_scen_2']
CHECK_AGG_DF = pd.DataFrame([
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy', 'EJ/y', 1, 6],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.75, 5],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.25, 1],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3, 8],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1, 3],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2, 5],
['IMG', 'a_scen', 'R5REF', 'Primary Energy', 'EJ/y', 0.3, 0.6],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.15, 0.4],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.2],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1, 1.4],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.4, 0.6],
['IMG', 'a_scen', 'World', 'Primary Energy', 'EJ/y', 1.3, 6.6],
['IMG', 'a_scen', 'World', 'Primary Energy|Coal', 'EJ/y', 0.9, 5.4],
['IMG', 'a_scen', 'World', 'Primary Energy|Gas', 'EJ/y', 0.4, 1.2],
['IMG', 'a_scen', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4, 9.4],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.6, 3.8],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.4, 5.6],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy', 'EJ/y', 1.4, 6.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.95, 5.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.45, 1.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3.4, 8.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.2, 3.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.2],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy', 'EJ/y', 0.7, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.35, 0.6],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.35, 0.4],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1.4, 1.8],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.8, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy', 'EJ/y', 2.1, 7.4],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Coal', 'EJ/y', 1.3, 5.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Gas', 'EJ/y', 0.8, 1.6],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4.8, 10.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 2.0, 4.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.8, 6.0],
mg_ascen + ['R5ASIA', 'Primary Energy', 'EJ/y', 0.8, 5.8],
mg_ascen + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.65, 4.9],
mg_ascen + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 2.8, 7.8],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.9, 2.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 1.9, 4.9],
mg_ascen + ['R5REF', 'Primary Energy', 'EJ/y', 0.1, 0.4],
mg_ascen + ['R5REF', 'Primary Energy|Coal', 'EJ/y', 0.05, 0.3],
mg_ascen + ['R5REF', 'Primary Energy|Gas', 'EJ/y', 0.05, 0.1],
mg_ascen + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', 0.8, 1.2],
mg_ascen + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.5, 0.7],
mg_ascen + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.3, 0.5],
mg_ascen + ['World', 'Primary Energy', 'EJ/y', 0.9, 6.2],
mg_ascen + ['World', 'Primary Energy|Coal', 'EJ/y', 0.7, 5.2],
mg_ascen + ['World', 'Primary Energy|Gas', 'EJ/y', 0.2, 1.0],
mg_ascen + ['World', 'Emissions|CO2', 'Mt CO2/yr', 3.6, 9.0],
mg_ascen + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.4, 3.6],
mg_ascen + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy', 'EJ/y', -1.4, -6.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', -0.95, -5.2],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', -0.45, -1.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', -3.4, -8.4],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', -1.2, -3.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.2, -5.2],
mg_ascen_2 + ['R5REF', 'Primary Energy', 'EJ/y', -0.7, -1.0],
mg_ascen_2 + ['R5REF', 'Primary Energy|Coal', 'EJ/y', -0.35, -0.6],
mg_ascen_2 + ['R5REF', 'Primary Energy|Gas', 'EJ/y', -0.35, -0.4],
mg_ascen_2 + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', -1.4, -1.8],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', -0.8, -1.0],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', -0.6, -0.8],
mg_ascen_2 + ['World', 'Primary Energy', 'EJ/y', -2.1, -7.4],
mg_ascen_2 + ['World', 'Primary Energy|Coal', 'EJ/y', -1.3, -5.8],
mg_ascen_2 + ['World', 'Primary Energy|Gas', 'EJ/y', -0.8, -1.6],
mg_ascen_2 + ['World', 'Emissions|CO2', 'Mt CO2/yr', -5.0, -10.6],
mg_ascen_2 + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', -2.0, -4.2],
mg_ascen_2 + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.8, -6.0],
mg_ascen_2 + ['World', 'Emissions|CO2|Agg Agg', 'Mt CO2/yr', -0.2, -0.4],
mg_ascen_2 + ['World', 'Emissions|CF4', 'kt CF4/yr', 54, 56],
mg_ascen_2 + ['World', 'Emissions|C2F6', 'kt C2F6/yr', 32, 27],
mg_ascen_2 + ['World', 'Emissions|C2F6|Solvents', 'kt C2F6/yr', 30, 33],
mg_ascen_2 + ['World', 'Emissions|C2F6|Industry', 'kt C2F6/yr', 2, -6],
mg_ascen_2 + ['World', 'Emissions|CH4', 'Mt CH4/yr', 322, 217],
mg_ascen_2 + ['R5REF', 'Emissions|CH4', 'Mt CH4/yr', 30, 201],
mg_ascen_2 + ['R5ASIA', 'Emissions|CH4', 'Mt CH4/yr', 292, 16],
],
columns=IAMC_IDX + [2005, 2010],
)


ms = ['AIM', 'cscen']
CHECK_AGG_REGIONAL_DF = pd.DataFrame([
ms + ['World', 'Emissions|N2O', 'Mt N/yr', 1.9, 15.7],
ms + ['World', 'Emissions|N2O|AFOLU', 'Mt N/yr', 0.1, 0.1],
ms + ['World', 'Emissions|N2O|Ind', 'Mt N/yr', 1.8, 15.6],
ms + ['World', 'Emissions|N2O|Ind|Shipping', 'Mt N/yr', 1, 6],
ms + ['World', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 1.6, 3.8],
ms + ['World', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -0.8, 5.8],
ms + ['RASIA', 'Emissions|N2O', 'Mt N/yr', 0, 5.9],
ms + ['RASIA', 'Emissions|N2O|Ind', 'Mt N/yr', 0, 5.9],
ms + ['RASIA', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 0.8, 2.6],
ms + ['RASIA', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -0.8, 3.3],
ms + ['REUROPE', 'Emissions|N2O', 'Mt N/yr', 0.8, 3.7],
ms + ['REUROPE', 'Emissions|N2O|Ind', 'Mt N/yr', 0.8, 3.7],
ms + ['REUROPE', 'Emissions|N2O|Ind|Solvents', 'Mt N/yr', 0.8, 1.2],
ms + ['REUROPE', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 0, 2.5],
ms + ['China', 'Emissions|N2O', 'Mt N/yr', 0.2, 1.3],
ms + ['China', 'Emissions|N2O|Ind', 'Mt N/yr', 0.2, 1.3],
ms + ['China', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 0.2, 1.3],
ms + ['Japan', 'Emissions|N2O', 'Mt N/yr', -1, 2],
ms + ['Japan', 'Emissions|N2O|Ind', 'Mt N/yr', -1, 2],
ms + ['Japan', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -1, 2],
ms + ['Germany', 'Emissions|N2O', 'Mt N/yr', 2, 3],
ms + ['Germany', 'Emissions|N2O|Ind', 'Mt N/yr', 2, 3],
ms + ['Germany', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', 2, 3],
ms + ['UK', 'Emissions|N2O', 'Mt N/yr', -2, -0.5],
ms + ['UK', 'Emissions|N2O|Ind', 'Mt N/yr', -2, -0.5],
ms + ['UK', 'Emissions|N2O|Ind|Transport', 'Mt N/yr', -2, -0.5],

],
columns=IAMC_IDX + [2005, 2010],
)


TEST_STACKPLOT_DF = pd.DataFrame([
['World', 'Emissions|CO2|Energy|Oil', 'Mt CO2/yr', 2, 3.2, 2.0, 1.8],
@@ -137,6 +258,18 @@ def simple_df(request):
yield IamDataFrame(model='model_a', scenario='scen_a', data=_df)


@pytest.fixture(scope="function")
def check_aggregate_df():
df = IamDataFrame(data=CHECK_AGG_DF)
yield df


@pytest.fixture(scope="function")
def check_aggregate_regional_df():
df = IamDataFrame(data=CHECK_AGG_REGIONAL_DF)
yield df


@pytest.fixture(scope="function")
def reg_df():
df = IamDataFrame(data=REG_DF)