dot product implementation #4

Merged (3 commits, Feb 17, 2022)
19 changes: 19 additions & 0 deletions xagg/aux.py
@@ -42,6 +42,25 @@ def normalize(a,drop_na = False):
    else:
        return a*np.nan

def find_rel_area(df):
    """
    Find the relative area of each row in a geodataframe
    """
    df['rel_area'] = df.area/df.area.sum()
    return df
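
As a sanity check, here is a minimal sketch of what find_rel_area does when applied per polygon, as get_pixel_overlaps does below; the fragments and values are hypothetical:

    import geopandas as gpd
    from shapely.geometry import box

    # two intersection fragments of equal area, both belonging to polygon 0
    frags = gpd.GeoDataFrame({'poly_idx': [0, 0]},
                             geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)])
    frags = frags.groupby('poly_idx').apply(find_rel_area)
    frags['rel_area'].tolist()  # [0.5, 0.5] -- each fragment is half the polygon's area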


def list_or_first(ser):
    lis = list(ser)
    # only the columns associated with the pixels should have multiple values;
    # for all other columns (those associated with the polygons), it should be
    # safe to return just the first item
    if all(x == lis[0] for x in lis) and ser.name not in ['pix_idx', 'coords', 'rel_area', 'lat', 'lon']:
        return lis[0]
    else:
        return lis
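
And a quick illustration of list_or_first on two hypothetical Series: a polygon-level column collapses to its first value, while a pixel-level column is kept as a list:

    import pandas as pd

    list_or_first(pd.Series(['WA', 'WA'], name='state'))    # 'WA' (constant, polygon-level)
    list_or_first(pd.Series([0.4, 0.6], name='rel_area'))   # [0.4, 0.6] (pixel-level)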


def fix_ds(ds,var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
                            'Latitude':{'Latitude':'lat','Longitude':'lon'},
                            'Lat':{'Lat':'lat','Lon':'lon'},
3 changes: 2 additions & 1 deletion xagg/classes.py
@@ -24,11 +24,12 @@ class weightmap(object):
    """ Class for mapping from pixels to polygons, output from :func:`xagg.wrappers.pixel_overlaps`

    """
    def __init__(self,agg,source_grid,geometry,weights='nowghts'):
    def __init__(self,agg,source_grid,geometry,overlap_da,weights='nowghts'):
        self.agg = agg
        self.source_grid = source_grid
        self.geometry = geometry
        self.weights = weights
        self.overlap_da = overlap_da

    def diag_fig(self,poly_idx):
        """ (NOT YET IMPLEMENTED) a diagnostic figure of the aggregation
145 changes: 67 additions & 78 deletions xagg/core.py
Expand Up @@ -6,7 +6,7 @@
import warnings
import xesmf as xe

from . aux import (normalize,fix_ds,get_bnds,subset_find)
from . aux import (find_rel_area,fix_ds,get_bnds,subset_find,list_or_first)
from . classes import (weightmap,aggregated)


@@ -281,7 +281,6 @@ def get_pixel_overlaps(gdf_in,pix_agg):

"""


# Add an index for each polygon as a column to make indexing easier
#if 'poly_idx' not in gdf_in.columns:
# gdf_in['poly_idx'] = gdf_in.index.values
@@ -309,42 +308,43 @@ def get_pixel_overlaps(gdf_in,pix_agg):
                          pix_agg['gdf_pixels'].to_crs(epsg_set),
                          how='intersection')

    overlaps = overlaps.groupby('poly_idx').apply(find_rel_area)
    overlaps['lat'] = overlaps['lat'].astype(float)
    overlaps['lon'] = overlaps['lon'].astype(float)

    # Now, group by poly_idx (each polygon in the shapefile)
    ov_groups = overlaps.groupby('poly_idx')

    # Calculate relative area of each overlap (so how much of the total
    # area of each polygon is taken up by each pixel), the pixels
    # corresponding to those areas, and the lat/lon coordinates of
    # those pixels
    # aggfunc=lambda ds: normalize(ds.area) doesn't quite work for
    # some reason - would be great to use that though; it would get
    # rid of the NaN warning that shows up
    #lambda ds: [ds.area/ds.area.sum()]
    overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
                                 pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
                                 lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
                                 lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))

    overlap_info = ov_groups.agg(list_or_first)

    overlap_info = overlap_info.rename(columns={'pix_idx': 'pix_idxs'})

    # Zip lat, lon columns into a list of (lat,lon) coordinates
    # (separate from above because as of 12/20, named aggs with
    # multiple columns is still an open issue in the pandas github)
    overlap_info['coords'] = overlap_info.apply(lambda row: list(zip(row['lat'],row['lon'])),axis=1)
    overlap_info = overlap_info.drop(columns=['lat','lon'])
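
For example (hypothetical coordinates), a polygon overlapping two pixels ends up with coords like:

    row = {'lat': [45.5, 45.5], 'lon': [-120.5, -119.5]}
    list(zip(row['lat'], row['lon']))  # [(45.5, -120.5), (45.5, -119.5)]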

    # Reset index to make poly_idx a column for merging with gdf_in
    overlap_info = overlap_info.reset_index()

    # Merge in pixel overlaps to the input polygon geodataframe
    gdf_in = pd.merge(gdf_in,overlap_info,'outer')

    # Drop 'geometry' eventually, just for size/clarity
    wm_out = weightmap(agg=gdf_in.drop('geometry',axis=1),
    overlap_columns = ['pix_idxs', 'rel_area', 'coords', 'poly_idx']
    gdf_in = pd.merge(gdf_in, overlap_info[overlap_columns], 'outer', on='poly_idx')

    # make the weight grid an xarray dataset for later dot product
    idx_cols = ['lat', 'lon', 'poly_idx']
    overlap_da = overlaps.set_index(idx_cols)['rel_area'].to_xarray()
    overlap_da = overlap_da.stack(loc=['lat', 'lon'])
    overlap_da = overlap_da.fillna(0)
    wm_out = weightmap(agg=gdf_in.drop('geometry', axis=1),
                       source_grid=pix_agg['source_grid'],
                       geometry=gdf_in.geometry)
                       geometry=gdf_in.geometry,
                       overlap_da=overlap_da)

    if 'weights' in pix_agg['gdf_pixels'].columns:
        wm_out.weights = pix_agg['gdf_pixels'].weights

    return wm_out
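
To make the new overlap_da concrete, here is a small sketch of the construction above on a hypothetical overlaps table: the result is a (poly_idx, loc) weight matrix, with loc a stacked (lat, lon) index, zeros where a pixel does not touch a polygon, and each polygon's weights summing to 1:

    import pandas as pd

    # two pixels split polygon 0 evenly; one pixel covers all of polygon 1
    overlaps = pd.DataFrame({'lat': [0.0, 0.0, 1.0],
                             'lon': [0.0, 1.0, 0.0],
                             'poly_idx': [0, 0, 1],
                             'rel_area': [0.5, 0.5, 1.0]})
    overlap_da = overlaps.set_index(['lat', 'lon', 'poly_idx'])['rel_area'].to_xarray()
    overlap_da = overlap_da.stack(loc=['lat', 'lon']).fillna(0)
    overlap_da.sum(dim='loc').values  # array([1., 1.]) -- weights sum to 1 per polygon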


@@ -423,68 +423,57 @@ def aggregate(ds,wm):
        warnings.warn('wm.weights is: \n '+str(wm.weights)+
                      ', \n which is not a supported weight vector (in a pandas series) '+
                      'or "nowghts" as a string. Assuming no weights are included...')
        weights = np.ones((len(wm.source_grid['lat'])))
        weights = np.ones(len(wm.overlap_da['loc']))

    data_dict = dict()
    for var in ds.data_vars:
        # Process for every variable that has locational information, but isn't a
        # bound variable
        if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
            print('aggregating '+var+'...')
            # Create the column for the relevant variable
            wm.agg[var] = None
            #breakpoint()
            # Get weighted average of variable based on pixel overlap + other weights
            for poly_idx in wm.agg.poly_idx:
                # Get average value of variable over the polygon; weighted by
                # how much of the pixel area is in the polygon, and by (optionally)
                # a separate gridded weight. This if statement checks:
                # - whether the weights output for this polygon is just "np.nan", which
                #   indicates there's no overlap between the pixel grid and polygons
                # - [this check has been pushed down a few lines] whether all the pixels
                #   in the grid have only nan values for this variable
                # In both cases, the "aggregated variable" is just a vector of nans.
                if not np.isnan(wm.agg.iloc[poly_idx,:].pix_idxs).all():
                    # Get the dimensions of the variable that aren't "loc" (location)
                    other_dims = [k for k in np.atleast_1d(ds[var].dims) if k != 'loc']
                    # Check first if any nans are "complete" (meaning that a pixel
                    # either has values for each step, or nans for each step - if
                    # there are random nans within a pixel, throw a warning)
                    if not xr.Dataset.equals(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).any(other_dims),
                                             np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims)):
                        warnings.warn('One or more of the pixels in variable '+var+' have nans in them in the dimensions '+
                                      ', '.join([k for k in ds[var].dims if k != 'loc'])+
                                      '. The code can currently only deal with pixels that have nan values across '+
                                      '*all* dimensions; there is no support yet for pixels with only some nan '+
                                      'values. The aggregation calculation is likely incorrect.')

                    if not np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all():
                        # Get relative areas for the pixels overlapping with this polygon
                        tmp_areas = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].rel_area))
                        # Replace overlapping pixel areas with nans if the corresponding pixel
                        # is only composed of nans
                        tmp_areas[np.array(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims).values)] = np.nan
                        # Calculate the normalized area+weight of each pixel (taking nans
                        # into account)
                        normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)

                        # Take the weighted average of all the pixel values to calculate the
                        # aggregated value for the shapefile
                        wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
                                                       normed_areaweights).
                                                      sum('loc')).values]]

                    else:
                        wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

                else:
                    #breakpoint()
                    #wm.agg.loc[poly_idx,var] = [[np.array(np.nan)]]
                    wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]


            # select just the data for which we have overlaps
            var_array = ds[var].sel(loc=wm.overlap_da['loc'])

            # multiply percent-overlaps by the user-supplied weights
            weights_and_overlaps = wm.overlap_da * weights

            # set any nans in the gridded data to zero and zero out the
            # corresponding weights so they don't affect the aggregated value
            var_array_filled = var_array.fillna(0)
            weights_and_overlaps = weights_and_overlaps.where(var_array.notnull(), 0)

            # the weights may now add up to less than one, because of nans or
            # because of the user-supplied weights; normalize them so they add
            # up to 1, and fill any nans with 0 so they aren't counted
            normed_weights = weights_and_overlaps/weights_and_overlaps.sum(dim="loc", skipna=True)
            normed_weights = normed_weights.fillna(0)

            # finally, take the dot product to get the weighted averages
            aggregated_array = normed_weights.dot(var_array_filled)

            # if the original gridded values were all nan, make the final
            # aggregation nan as well
            if np.isnan(var_array.values).all():
                aggregated_array = aggregated_array * np.nan

            data_dict[var] = aggregated_array
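
The fill/renormalize/dot sequence above reduces to a short worked example; the numbers here are hypothetical, and a bare loc dimension stands in for the stacked grid:

    import numpy as np
    import xarray as xr

    var_array = xr.DataArray([10.0, np.nan, 30.0], dims='loc')  # middle pixel is all nan
    weights = xr.DataArray([0.5, 0.25, 0.25], dims='loc')       # overlap * user weights

    w = weights.where(var_array.notnull(), 0)  # zero the weight under the nan pixel
    w = (w / w.sum('loc')).fillna(0)           # renormalize to [2/3, 0, 1/3]
    float(w.dot(var_array.fillna(0)))          # 16.67 = (0.5*10 + 0.25*30) / 0.75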

    ds_combined = xr.Dataset(data_dict)
    df_combined = ds_combined.to_dataframe().reset_index()
    df_combined = df_combined.groupby('poly_idx').agg(list_or_first)

    wm.agg = pd.merge(wm.agg, df_combined, on='poly_idx')
    for var in ds.data_vars:
        if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
            # convert to a list of arrays - NOT SURE THIS IS THE RIGHT THING TO
            # DO, JUST TRYING TO MATCH THE ORIGINAL FORMAT
            wm.agg[var] = wm.agg[var].apply(np.array).apply(lambda x: [x])

    # Put in class format
    agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
                         geometry=wm.geometry,ds_in=ds,weights=wm.weights)
                         geometry=wm.geometry,ds_in=ds_combined,weights=wm.weights)

    # Return
    print('all variables aggregated to polygons!')