diff --git a/pymrio/__init__.py b/pymrio/__init__.py index 31323df1..21999ed2 100644 --- a/pymrio/__init__.py +++ b/pymrio/__init__.py @@ -38,7 +38,7 @@ import sys from pymrio.core.fileio import * -from pymrio.core.mriosystem import Extension, IOSystem, concate_extension +from pymrio.core.mriosystem import Extension, IOSystem, concate_extension, extension_convert from pymrio.tools.ioclass import ClassificationData, get_classification from pymrio.tools.iodownloader import ( download_eora26, diff --git a/pymrio/core/mriosystem.py b/pymrio/core/mriosystem.py index 532eba56..bfcf09f3 100644 --- a/pymrio/core/mriosystem.py +++ b/pymrio/core/mriosystem.py @@ -1935,7 +1935,7 @@ def characterize( def convert( self, df_map, - extension_name, + new_extension_name, agg_func="sum", drop_not_bridged_index=True, unit_column_orig="unit_orig", @@ -2049,7 +2049,7 @@ def convert( f"Unit in extension does not match the unit in mapping for row {row}" ) - new_extension = Extension(name=extension_name) + new_extension = Extension(name=new_extension_name) if unit_column_new: if unit_column_new not in df_map.columns: @@ -3230,6 +3230,7 @@ def remove_extension(self, ext): instance or of Extension.name. instance was found) """ + # TODO: rename to extension_remove if type(ext) is str: ext = [ext] @@ -3245,150 +3246,157 @@ def remove_extension(self, ext): return self - def extension_convert( - self, - df_map, - extension_name, - extension_col_name="extension", - agg_func="sum", - drop_not_bridged_index=True, - unit_column_orig="unit_orig", - unit_column_new="unit_new", - ignore_columns=None, - ): - """Apply the convert function to all extensions - - Internally that calls the Extension.convert function for all extensions. - - See the Extension.convert function for more details. - TODO: put all details here - - - Parameters - ---------- - - df_map : pd.DataFrame - The DataFrame with the mapping of the old to the new classification. 
- This requires a specific structure: - - - Constraining data (e.g. stressors, regions, sectors) can be - either in the index or columns of df_orig. The need to have the same - name as the named index or column in df_orig. The algorithm searches - for matching data in df_orig based on all constraining columns in df_map. - - - Bridge columns are columns with '__' in the name. These are used to - map (bridge) some/all of the constraining columns in df_orig to the new - classification. - - - One column "factor", which gives the multiplication factor for the - conversion. If it is missing, it is set to 1. - - - This is better explained with an example. - Assuming a original dataframe df_orig with - index names 'stressor' and 'compartment' and column name 'region', - the characterizing dataframe could have the following structure (column names): - - stressor ... original index name - compartment ... original index name - region ... original column name - factor ... the factor for multiplication/characterization - If no factor is given, the factor is assumed to be 1. - This can be used, to simplify renaming/aggregation mappings. - impact__stressor ... the new index name, - replacing the previous index name "stressor". - Thus here "stressor" will be renamed to "impact", and the row index - will be renamed by the entries here. - compartment__compartment ... the new compartment, - replacing the original compartment. No rename of column happens here, - still row index will be renamed as given here. - - the columns with __ are called bridge columns, they are used - to match the original index. The new dataframe with have index names - based on the first part of the bridge column, in the order - in which the bridge columns are given in the mapping dataframe. - - "region" is constraining column, these can either be for the index or column - in df_orig. In case both exist, the one in index is preferred. 
- - extension_name: str - The name of the new extension returned - - extension_col_name : str, optional - Name of the column specifying the extension name in df_map. - The entry in df_map here can either be the name returned by Extension.name or the - name of the Extension instance. - Default: 'extension' - - agg_func : str or func - the aggregation function to use for multiple matchings (summation by default) - - drop_not_bridged_index : bool, optional - What to do with index levels in df_orig not appearing in the bridge columns. - If True, drop them after aggregation across these, if False, - pass them through to the result. - - *Note:* Only index levels will be dropped, not columns. - - In case some index levels need to be dropped, and some not - make a bridge column for the ones to be dropped and map all to the same name. - Then drop this index level after the conversion. - - unit_column_orig : str, optional - Name of the column in df_map with the original unit. - This will be used to check if the unit matches the original unit in the extension. - Default is "unit_orig", if None, no check is performed. +def extension_convert( + *extensions, + df_map, + new_extension_name, + extension_col_name="extension", + agg_func="sum", + drop_not_bridged_index=True, + unit_column_orig="unit_orig", + unit_column_new="unit_new", + ignore_columns=None, +): + """Apply the convert function to a list of extensions - unit_column_new : str, optional - Name of the column in df_map with the new unit to be assigned to the new extension. - Default is "unit_new", if None same unit as in df_orig TODO EXPLAIN BETTER, THINK WARNING + Internally that calls the Extension.convert function for all extensions. - ignore_columns : list, optional - List of column names in df_map which should be ignored. - These could be columns with additional information, etc. - The unit columns given in unit_column_orig and unit_column_new - are ignored by default. 
+    See the Extension.convert function for more details.
+    TODO: put all details here
 
+    Parameters
+    ----------
+
+    extensions : list of extensions
+        Extensions to convert. All extensions passed must
+        have an index structure (index names) as described in df_map.
+
+    df_map : pd.DataFrame
+        The DataFrame with the mapping of the old to the new classification.
+        This requires a specific structure:
+
+        - Constraining data (e.g. stressors, regions, sectors) can be
+          either in the index or columns of df_orig. The need to have the same
+          name as the named index or column in df_orig. The algorithm searches
+          for matching data in df_orig based on all constraining columns in df_map.
+
+        - Bridge columns are columns with '__' in the name. These are used to
+          map (bridge) some/all of the constraining columns in df_orig to the new
+          classification.
+
+        - One column "factor", which gives the multiplication factor for the
+          conversion. If it is missing, it is set to 1.
+
+
+        This is better explained with an example.
+        Assuming a original dataframe df_orig with
+        index names 'stressor' and 'compartment' and column name 'region',
+        the characterizing dataframe could have the following structure (column names):
+
+        stressor ... original index name
+        compartment ... original index name
+        region ... original column name
+        factor ... the factor for multiplication/characterization
+            If no factor is given, the factor is assumed to be 1.
+            This can be used, to simplify renaming/aggregation mappings.
+        impact__stressor ... the new index name,
+            replacing the previous index name "stressor".
+            Thus here "stressor" will be renamed to "impact", and the row index
+            will be renamed by the entries here.
+        compartment__compartment ... 
the new compartment,
+            replacing the original compartment. No rename of column happens here,
+            still row index will be renamed as given here.
+
+        the columns with __ are called bridge columns, they are used
+        to match the original index. The new dataframe will have index names
+        based on the first part of the bridge column, in the order
+        in which the bridge columns are given in the mapping dataframe.
+
+        "region" is a constraining column, these can either be for the index or column
+        in df_orig. In case both exist, the one in index is preferred.
+
+    new_extension_name: str
+        The name of the new extension returned
+
+    extension_col_name : str, optional
+        Name of the column specifying the extension name in df_map.
+        The entry in df_map here can either be the name returned by Extension.name or the
+        name of the Extension instance.
+        Default: 'extension'
+
+    agg_func : str or func
+        the aggregation function to use for multiple matchings (summation by default)
+
+    drop_not_bridged_index : bool, optional
+        What to do with index levels in df_orig not appearing in the bridge columns.
+        If True, drop them after aggregation across these, if False,
+        pass them through to the result.
+
+        *Note:* Only index levels will be dropped, not columns.
+
+        In case some index levels need to be dropped, and some not
+        make a bridge column for the ones to be dropped and map all to the same name.
+        Then drop this index level after the conversion.
+
+    unit_column_orig : str, optional
+        Name of the column in df_map with the original unit.
+        This will be used to check if the unit matches the original unit in the extension.
+        Default is "unit_orig", if None, no check is performed.
+
+    unit_column_new : str, optional
+        Name of the column in df_map with the new unit to be assigned to the new extension.
+        Default is "unit_new", if None same unit as in df_orig TODO EXPLAIN BETTER, THINK WARNING
+
+    ignore_columns : list, optional
+        List of column names in df_map which should be ignored. 
+ These could be columns with additional information, etc. + The unit columns given in unit_column_orig and unit_column_new + are ignored by default. + + + TODO: remove after explain + Extension for extensions: + extension ... extension name + unit_orig ... the original unit (optional, for double check with the unit) + unit_new ... the new unit to be set for the extension - """ - if not ignore_columns: - ignore_columns = [] - ignore_columns.append(extension_col_name) + """ - extensions_to_consider = df_map.loc[:, extension_col_name].unique() + if type(extensions) is Extension: + extensions = [extensions] + elif type(extensions) is tuple: + extensions = list(extensions) - gather = dict() + if not ignore_columns: + ignore_columns = [] + ignore_columns.append(extension_col_name) - for ext in extensions_to_consider: - gather.update( - self._apply_extension_method( - extensions=ext, - method="convert", - df_map=df_map[df_map[extension_col_name] == ext], - agg_func=agg_func, - extension_name=extension_name, - drop_not_bridged_index=drop_not_bridged_index, - unit_column_orig=unit_column_orig, - unit_column_new=unit_column_new, - ignore_columns=ignore_columns, - ) + gather = [] + + for ext in extensions: + gather.append( + ext.convert( + df_map=df_map[df_map[extension_col_name] == ext.name], + agg_func=agg_func, + new_extension_name=new_extension_name, + drop_not_bridged_index=drop_not_bridged_index, + unit_column_orig=unit_column_orig, + unit_column_new=unit_column_new, + ignore_columns=ignore_columns, ) + ) - result_ext = concate_extension(list(gather.values()), name=extension_name) + result_ext = concate_extension(*gather, name=new_extension_name) - return result_ext + + for df, df_name in zip(result_ext.get_DataFrame(data=True, with_unit=True), result_ext.get_DataFrame(data=False, with_unit=True)): + if df_name == "unit": + setattr(result_ext, df_name, df.groupby(level=df.index.names).agg(lambda x: ",".join(set(x)))) + else: + setattr(result_ext, df_name, 
df.groupby(level=df.index.names).agg(agg_func))
 
-    # look for extension name in df_map
-    # make unique extension list, and call extension_extract for all
-    # build a new df_map with removing extension_name column
-    # call the extension.convert function for the extension
-    pass
+    return result_ext
 
 
 def concate_extension(*extensions, name):
@@ -3421,6 +3429,9 @@ def concate_extension(*extensions, name):
         Concatenated extension
 
     """
+    # TODO: rename to extension_concatenate and also provide method
+    # TODO: rename name to new_extension_name , make it consistent with convert function
+
     if type(extensions[0]) is tuple or type(extensions[0]) is list:
         extensions = extensions[0]
 
diff --git a/tests/test_core.py b/tests/test_core.py
index 2fc1d373..422fa2f3 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -554,7 +554,7 @@ def test_characterize_extension(fix_testmrio):
 
 
 def test_extension_convert(fix_testmrio):
-    """Testing the convert function within extensions"""
+    """Testing the convert function within extensions object"""
     tt_pre = fix_testmrio.testmrio.copy()
     df_map = pd.DataFrame(
         columns=[
@@ -639,12 +639,56 @@ def test_extension_convert(fix_testmrio):
     assert tt_post.post_calc.unit.loc["water_emissions", "unit"] == "g"
 
 
-def test_extension_convert_full(fix_testmrio):
-    """Testing the convert function called from the full MRIO"""
+def test_extension_convert_function(fix_testmrio):
+    """Testing the convert function for a list of extensions"""
     tt_pre = fix_testmrio.testmrio.copy()
 
-    df_map = pd.DataFrame(
+    df_map_double = pd.DataFrame(
+        columns=[
+            "extension",
+            "stressor",
+            "compartment",
+            "stressor__stressor",
+            "compartment__compartment",
+            "factor",
+            "unit_orig",
+            "unit_new",
+        ],
+        data=[
+            ["Emissions", "emis.*", "air|water", "total_sum_tonnes", "total", 1e-3, "kg", "t"],
+            ["Emissions", "emission_type2", "water", "water_emissions", "water", 1000, "kg", "g"],
+        ],
+    )
+
+    # CONT: Something wrong with setting the index to a multiindex when compartment is 
passed
+    # Next steps: run this in interpreter (with autoreload) and set breakpoint in extension_convert
+    # Seems to be in gather, but after that in the aggregation or Concatenate we get a problem
+
+    # x = tt_pre.extension_convert(df_map, extension_name="emissions_new_pre_calc")
+
+    # Doing two times the same extension
+    ext_double = pymrio.extension_convert(tt_pre.emissions, tt_pre.emissions, df_map=df_map_double, new_extension_name="emissions_new_pre_calc")
+
+    assert ext_double.unit.loc["total_sum_tonnes", "unit"] == "t"
+    assert ext_double.unit.loc["water_emissions", "unit"] == "g"
+
+    pdt.assert_series_equal(
+        ext_double.F.loc["total_sum_tonnes"],
+        tt_pre.emissions.F.sum(axis=0) * 1e-3 * 2,
+        check_names=False,
+    )
+
+    pdt.assert_series_equal(
+        ext_double.F.loc["water_emissions"],
+        tt_pre.emissions.F.loc["emission_type2",:].iloc[0,:] * 1000 * 2,
+        check_names=False,
+    )
+
+
+    tt_pre.emission_new = ext_double
+
+    df_map_add_across = pd.DataFrame(
         columns=[
             "extension",
             "stressor",
@@ -655,16 +699,14 @@
             "unit_new",
         ],
         data=[
-            ["Emissions", "emis.*", "air|water", "total_sum_tonnes", 1e-3, "kg", "t"],
-            ["emissions", "emission_type[1|2]", ".*", "total_sum", 1, "kg", "kg"],
-            ["emissions", "emission_type1", ".*", "air_emissions", 1e-3, "kg", "t"],
-            ["Emissions", "emission_type2", ".*", "water_emissions", 1000, "kg", "g"],
+            ["Emissions", "emission_type2", ".*", "water", 1, "kg", "kg"],
+            ["emission_new_pre_calc", "water_emissions", ".*", "water", 1E-3, "g", "kg"],
         ],
     )
 
-    x = tt_pre.extension_convert(df_map, extension_name="emissions_new_pre_calc")
+    ext_across = pymrio.extension_convert(tt_pre.emissions, ext_double, df_map=df_map_add_across, new_extension_name="add_across")
 
+    # CONT:
-    # write test with units
     # make a second extensions are check running over 2
     # cleanup docstrings and write docs