diff --git a/xagg/aux.py b/xagg/aux.py
index 9b427e6..26c6a18 100644
--- a/xagg/aux.py
+++ b/xagg/aux.py
@@ -42,6 +42,25 @@ def normalize(a,drop_na = False):
     else:
         return a*np.nan
 
+def find_rel_area(df):
+    """
+    Find the relative area of each row in a geodataframe
+    """
+    df['rel_area'] = df.area/df.area.sum()
+    return df
+
+
+def list_or_first(ser):
+    lis = list(ser)
+    # only the columns associated with the pixels should have multiple values;
+    # for all other columns (those associated with the polygons), it should be
+    # safe to return just the first item
+    if all(x == lis[0] for x in lis) and ser.name not in ['pix_idx', 'coords', 'rel_area', 'lat', 'lon']:
+        return lis[0]
+    else:
+        return lis
+
+
 def fix_ds(ds,var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
                             'Latitude':{'Latitude':'lat','Longitude':'lon'},
                             'Lat':{'Lat':'lat','Lon':'lon'},
diff --git a/xagg/classes.py b/xagg/classes.py
index 7221ef6..1086861 100644
--- a/xagg/classes.py
+++ b/xagg/classes.py
@@ -24,11 +24,12 @@ class weightmap(object):
     """ Class for mapping from pixels to polgyons, output from :func:`xagg.wrappers.pixel_overlaps`
 
     """
-    def __init__(self,agg,source_grid,geometry,weights='nowghts'):
+    def __init__(self,agg,source_grid,geometry,overlap_da,weights='nowghts'):
         self.agg = agg
         self.source_grid = source_grid
         self.geometry = geometry
         self.weights = weights
+        self.overlap_da = overlap_da
 
     def diag_fig(self,poly_idx):
         """ (NOT YET IMPLEMENTED) a diagnostic figure of the aggregation
diff --git a/xagg/core.py b/xagg/core.py
index e4c6ce3..94d12d2 100644
--- a/xagg/core.py
+++ b/xagg/core.py
@@ -6,7 +6,7 @@ import warnings
 
 import xesmf as xe
 
-from . aux import (normalize,fix_ds,get_bnds,subset_find)
+from . aux import (find_rel_area,fix_ds,get_bnds,subset_find,list_or_first)
 from . classes import (weightmap,aggregated)
 
@@ -281,7 +281,6 @@ def get_pixel_overlaps(gdf_in,pix_agg):
 
     """
-
     # Add an index for each polygon as a column to make indexing easier
     #if 'poly_idx' not in gdf_in.columns:
     #    gdf_in['poly_idx'] = gdf_in.index.values
 
@@ -309,42 +308,43 @@ def get_pixel_overlaps(gdf_in,pix_agg):
                            pix_agg['gdf_pixels'].to_crs(epsg_set),
                            how='intersection')
 
+    overlaps = overlaps.groupby('poly_idx').apply(find_rel_area)
+    overlaps['lat'] = overlaps['lat'].astype(float)
+    overlaps['lon'] = overlaps['lon'].astype(float)
+
     # Now, group by poly_idx (each polygon in the shapefile)
     ov_groups = overlaps.groupby('poly_idx')
-
-    # Calculate relative area of each overlap (so how much of the total
-    # area of each polygon is taken up by each pixel), the pixels
-    # corresponding to those areas, and the lat/lon coordinates of
-    # those pixels
-    # aggfunc=lambda ds: normalize(ds.area) doesn't quite work for
-    # some reason - would be great to use that though; it would get
-    # rid of the NaN warning that shows up
-    #lambda ds: [ds.area/ds.area.sum()]
-    overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
-                                 pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
-                                 lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
-                                 lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))
-
+
+    overlap_info = ov_groups.agg(list_or_first)
+
+    overlap_info = overlap_info.rename(columns={'pix_idx': 'pix_idxs'})
+
     # Zip lat, lon columns into a list of (lat,lon) coordinates
     # (separate from above because as of 12/20, named aggs with
     # multiple columns is still an open issue in the pandas github)
     overlap_info['coords'] = overlap_info.apply(lambda row: list(zip(row['lat'],row['lon'])),axis=1)
     overlap_info = overlap_info.drop(columns=['lat','lon'])
-
+
     # Reset index to make poly_idx a column for merging with gdf_in
     overlap_info = overlap_info.reset_index()
-
+
     # Merge in pixel overlaps to the input polygon geodataframe
-    gdf_in = pd.merge(gdf_in,overlap_info,'outer')
-
-    # Drop 'geometry' eventually, just for size/clarity
-    wm_out = weightmap(agg=gdf_in.drop('geometry',axis=1),
+    overlap_columns = ['pix_idxs', 'rel_area', 'coords', 'poly_idx']
+    gdf_in = pd.merge(gdf_in, overlap_info[overlap_columns],'outer', on='poly_idx')
+
+    # make the weight grid an xarray dataset for the dot product later
+    idx_cols = ['lat', 'lon', 'poly_idx']
+    overlap_da = overlaps.set_index(idx_cols)['rel_area'].to_xarray()
+    overlap_da = overlap_da.stack(loc=['lat', 'lon'])
+    overlap_da = overlap_da.fillna(0)
+    wm_out = weightmap(agg=gdf_in.drop('geometry', axis=1),
                        source_grid=pix_agg['source_grid'],
-                       geometry=gdf_in.geometry)
+                       geometry=gdf_in.geometry,
+                       overlap_da=overlap_da)
 
     if 'weights' in pix_agg['gdf_pixels'].columns:
         wm_out.weights = pix_agg['gdf_pixels'].weights
-
+
     return wm_out
@@ -423,68 +423,57 @@ def aggregate(ds,wm):
             warnings.warn('wm.weights is: \n '+print(wm.weights)+
                           ', \n which is not a supported weight vector (in a pandas series) '+
                           'or "nowghts" as a string. Assuming no weights are included...')
-            weights = np.ones((len(wm.source_grid['lat'])))
+            weights = np.ones((len(wm.overlap_da['loc'])))
 
+    data_dict = dict()
     for var in ds.var():
         # Process for every variable that has locational information, but isn't a
         # bound variable
         if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
             print('aggregating '+var+'...')
-            # Create the column for the relevant variable
-            wm.agg[var] = None
-            #breakpoint()
-            # Get weighted average of variable based on pixel overlap + other weights
-            for poly_idx in wm.agg.poly_idx:
-                # Get average value of variable over the polygon; weighted by
-                # how much of the pixel area is in the polygon, and by (optionally)
-                # a separate gridded weight. This if statement checks
-                # - if the weights output for this polygon is just "np.nan", which
-                #   indicates there's no overlap between the pixel grid and polygons
-                # - [this has been pushed down a few lines] or, if all the pixels in
-                #   the grid have just nan values for this variable
-                # in both cases; the "aggregated variable" is just a vector of nans.
-                if not np.isnan(wm.agg.iloc[poly_idx,:].pix_idxs).all():
-                    # Get the dimensions of the variable that aren't "loc" (location)
-                    other_dims = [k for k in np.atleast_1d(ds[var].dims) if k != 'loc']
-                    # Check first if any nans are "complete" (meaning that a pixel
-                    # either has values for each step, or nans for each step - if
-                    # there are random nans within a pixel, throw a warning)
-                    if not xr.Dataset.equals(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).any(other_dims),
-                                             np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims)):
-                        warnings.warn('One or more of the pixels in variable '+var+' have nans in them in the dimensions '+
-                                      ', '.join([k for k in ds[var].dims if k != 'loc'])+
-                                      '. The code can currently only deal with pixels for which the '+
-                                      '*entire* pixel has nan values in all dimensions, however there is currently no '+
-                                      ' support for data in which pixels have only some nan values. The aggregation '+
-                                      'calculation is likely incorrect.')
-
-                    if not np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all():
-                        # Get relative areas for the pixels overlapping with this Polygon
-                        tmp_areas = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].rel_area))
-                        # Replace overlapping pixel areas with nans if the corresponding pixel
-                        # is only composed of nans
-                        tmp_areas[np.array(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims).values)] = np.nan
-                        # Calculate the normalized area+weight of each pixel (taking into account
-                        # nans)
-                        normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)
-
-                        # Take the weighted average of all the pixel values to calculate the
-                        # aggregated value for the shapefile
-                        wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
-                                                       normed_areaweights).
-                                                      sum('loc')).values]]
-
-                    else:
-                        wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]
-
-                else:
-                    #breakpoint()
-                    #wm.agg.loc[poly_idx,var] = [[np.array(np.nan)]]
-                    wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]
-
+
+            # select just the data for which we have overlaps
+            var_array = ds[var].sel(loc=wm.overlap_da['loc'])
+
+            # multiply percent-overlaps by user-supplied weights
+            weights_and_overlaps = wm.overlap_da * weights
+
+            # fill any nans in the gridded data with zero and change the corresponding
+            # weights to zero so that they will not affect the aggregated value
+            var_array_filled = var_array.fillna(0)
+            weights_and_overlaps = weights_and_overlaps.where(var_array.notnull(), 0)
+
+            # the weights may now add up to less than one because of nans or
+            # because of the user-supplied weights. here we normalize the
+            # weights so they add up to 1 and fill any nans with 0 so they
+            # won't be counted
+            normed_weights = weights_and_overlaps/weights_and_overlaps.sum(dim="loc", skipna=True)
+            normed_weights = normed_weights.fillna(0)
+
+            # finally we do the dot product to get the weighted averages
+            aggregated_array = normed_weights.dot(var_array_filled)
+
+            # if the original gridded values were all nan, make the final
+            # aggregation nan
+            if np.isnan(var_array.values).all():
+                aggregated_array = aggregated_array * np.nan
+
+            data_dict[var] = aggregated_array
+
+    ds_combined = xr.Dataset(data_dict)
+    df_combined = ds_combined.to_dataframe().reset_index()
+    df_combined = df_combined.groupby('poly_idx').agg(list_or_first)
+
+    wm.agg = pd.merge(wm.agg, df_combined, on='poly_idx')
+    for var in ds.var():
+        if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
+            # convert to a list of arrays - NOT SURE THIS IS THE RIGHT THING TO
+            # DO, JUST TRYING TO MATCH THE ORIGINAL FORMAT
+            wm.agg[var] = wm.agg[var].apply(np.array).apply(lambda x: [x])
+
     # Put in class format
     agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
-                         geometry=wm.geometry,ds_in=ds,weights=wm.weights)
+                         geometry=wm.geometry,ds_in=ds_combined,weights=wm.weights)
 
     # Return
     print('all variables aggregated to polygons!')
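
For reference, the weighting scheme this patch moves to can be reproduced outside the library in a few lines of xarray. Below is a minimal sketch, not part of the diff: the 2x2 grid, the coordinate values, and the overlap fractions are invented for illustration, but the steps (zero the weights over nan pixels, renormalize over the stacked 'loc' dimension, take one dot product per variable) mirror the logic aggregate() now uses.

import numpy as np
import xarray as xr

# hypothetical gridded variable on a 2x2 lat/lon grid, stacked into 'loc';
# one pixel is nan to exercise the masking logic
da = xr.DataArray([[1.0, 2.0], [3.0, np.nan]],
                  coords={'lat': [0.0, 1.0], 'lon': [10.0, 11.0]},
                  dims=['lat', 'lon']).stack(loc=['lat', 'lon'])

# hypothetical relative-area overlaps: polygon 0 covers the two lat=0 pixels
# half-and-half, polygon 1 covers the two lat=1 pixels half-and-half
overlap_da = xr.DataArray([[[0.5, 0.5], [0.0, 0.0]],
                           [[0.0, 0.0], [0.5, 0.5]]],
                          coords={'poly_idx': [0, 1],
                                  'lat': [0.0, 1.0], 'lon': [10.0, 11.0]},
                          dims=['poly_idx', 'lat', 'lon']).stack(loc=['lat', 'lon'])

# zero the weights over nan pixels, then renormalize each polygon's row to 1
weights = overlap_da.where(da.notnull(), 0)
weights = (weights / weights.sum(dim='loc', skipna=True)).fillna(0)

# a single dot product over 'loc' gives the weighted average per polygon
agg = weights.dot(da.fillna(0))
print(agg.values)  # [1.5 3.] - polygon 1's nan pixel is excluded, not averaged in

Because the per-polygon loop is replaced by one vectorized dot product over all polygons at once, the nan handling also moves from per-pixel bookkeeping to a mask-and-renormalize on the weight matrix, which is what the where/fillna pair above demonstrates.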