dot product implementation #4

Merged (3 commits, Feb 17, 2022)
19 changes: 19 additions & 0 deletions xagg/aux.py
@@ -42,6 +42,25 @@ def normalize(a,drop_na = False):
    else:
        return a*np.nan

def find_rel_area(df):
    """
    Find the relative area of each row in a geodataframe
    """
    df['rel_area'] = df.area/df.area.sum()
    return df
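
As a sanity check, here is a minimal sketch of what find_rel_area does when applied per polygon, as get_pixel_overlaps does below; the fragments and values are hypothetical:

    import geopandas as gpd
    from shapely.geometry import box

    # two intersection fragments of equal area, both belonging to polygon 0
    frags = gpd.GeoDataFrame({'poly_idx': [0, 0]},
                             geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)])
    frags = frags.groupby('poly_idx').apply(find_rel_area)
    frags['rel_area'].tolist()  # [0.5, 0.5] -- each fragment is half the polygon's area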


def list_or_first(ser):
    lis = list(ser)
    # only the columns associated with the pixels should have multiple values;
    # for all other columns (those associated with the polygons), it should be
    # safe to return just the first item
    if all(x == lis[0] for x in lis) and ser.name not in ['pix_idx', 'coords', 'rel_area', 'lat', 'lon']:
        return lis[0]
    else:
        return lis
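
And a quick illustration of list_or_first on two hypothetical Series: a polygon-level column collapses to its first value, while a pixel-level column is kept as a list:

    import pandas as pd

    list_or_first(pd.Series(['WA', 'WA'], name='state'))    # 'WA' (constant, polygon-level)
    list_or_first(pd.Series([0.4, 0.6], name='rel_area'))   # [0.4, 0.6] (pixel-level)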


def fix_ds(ds,var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
                            'Latitude':{'Latitude':'lat','Longitude':'lon'},
                            'Lat':{'Lat':'lat','Lon':'lon'},
3 changes: 2 additions & 1 deletion xagg/classes.py
@@ -24,11 +24,12 @@ class weightmap(object):
    """ Class for mapping from pixels to polygons, output from :func:`xagg.wrappers.pixel_overlaps`

    """
    def __init__(self,agg,source_grid,geometry,weights='nowghts'):
    def __init__(self,agg,source_grid,geometry,overlap_da,weights='nowghts'):
        self.agg = agg
        self.source_grid = source_grid
        self.geometry = geometry
        self.weights = weights
        self.overlap_da = overlap_da

    def diag_fig(self,poly_idx):
        """ (NOT YET IMPLEMENTED) a diagnostic figure of the aggregation
145 changes: 67 additions & 78 deletions xagg/core.py
Expand Up @@ -6,7 +6,7 @@
import warnings
import xesmf as xe

from . aux import (normalize,fix_ds,get_bnds,subset_find)
from . aux import (find_rel_area,fix_ds,get_bnds,subset_find,list_or_first)
from . classes import (weightmap,aggregated)


@@ -281,7 +281,6 @@ def get_pixel_overlaps(gdf_in,pix_agg):

"""


# Add an index for each polygon as a column to make indexing easier
#if 'poly_idx' not in gdf_in.columns:
# gdf_in['poly_idx'] = gdf_in.index.values
@@ -309,42 +308,43 @@ def get_pixel_overlaps(gdf_in,pix_agg):
                          pix_agg['gdf_pixels'].to_crs(epsg_set),
                          how='intersection')

    overlaps = overlaps.groupby('poly_idx').apply(find_rel_area)
    overlaps['lat'] = overlaps['lat'].astype(float)
    overlaps['lon'] = overlaps['lon'].astype(float)

    # Now, group by poly_idx (each polygon in the shapefile)
    ov_groups = overlaps.groupby('poly_idx')

    # Calculate relative area of each overlap (so how much of the total
    # area of each polygon is taken up by each pixel), the pixels
    # corresponding to those areas, and the lat/lon coordinates of
    # those pixels
    # aggfunc=lambda ds: normalize(ds.area) doesn't quite work for
    # some reason - would be great to use that though; it would get
    # rid of the NaN warning that shows up
    #lambda ds: [ds.area/ds.area.sum()]
    overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
                                 pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
                                 lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
                                 lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))

    overlap_info = ov_groups.agg(list_or_first)

    overlap_info = overlap_info.rename(columns={'pix_idx': 'pix_idxs'})

    # Zip lat, lon columns into a list of (lat,lon) coordinates
    # (separate from above because as of 12/20, named aggs with
    # multiple columns is still an open issue in the pandas github)
    overlap_info['coords'] = overlap_info.apply(lambda row: list(zip(row['lat'],row['lon'])),axis=1)
    overlap_info = overlap_info.drop(columns=['lat','lon'])
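
For example (hypothetical coordinates), a polygon overlapping two pixels ends up with coords like:

    row = {'lat': [45.5, 45.5], 'lon': [-120.5, -119.5]}
    list(zip(row['lat'], row['lon']))  # [(45.5, -120.5), (45.5, -119.5)]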

    # Reset index to make poly_idx a column for merging with gdf_in
    overlap_info = overlap_info.reset_index()

    # Merge in pixel overlaps to the input polygon geodataframe
    gdf_in = pd.merge(gdf_in,overlap_info,'outer')

    # Drop 'geometry' eventually, just for size/clarity
    wm_out = weightmap(agg=gdf_in.drop('geometry',axis=1),
    overlap_columns = ['pix_idxs', 'rel_area', 'coords', 'poly_idx']
    gdf_in = pd.merge(gdf_in, overlap_info[overlap_columns], 'outer', on='poly_idx')

    # make the weight grid an xarray dataset for later dot product
    idx_cols = ['lat', 'lon', 'poly_idx']
    overlap_da = overlaps.set_index(idx_cols)['rel_area'].to_xarray()
    overlap_da = overlap_da.stack(loc=['lat', 'lon'])
    overlap_da = overlap_da.fillna(0)
    wm_out = weightmap(agg=gdf_in.drop('geometry', axis=1),
                       source_grid=pix_agg['source_grid'],
                       geometry=gdf_in.geometry)
                       geometry=gdf_in.geometry,
                       overlap_da=overlap_da)

    if 'weights' in pix_agg['gdf_pixels'].columns:
        wm_out.weights = pix_agg['gdf_pixels'].weights

    return wm_out
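
To make the new overlap_da concrete, here is a small sketch of the construction above on a hypothetical overlaps table: the result is a (poly_idx, loc) weight matrix, with loc a stacked (lat, lon) index, zeros where a pixel does not touch a polygon, and each polygon's weights summing to 1:

    import pandas as pd

    # two pixels split polygon 0 evenly; one pixel covers all of polygon 1
    overlaps = pd.DataFrame({'lat': [0.0, 0.0, 1.0],
                             'lon': [0.0, 1.0, 0.0],
                             'poly_idx': [0, 0, 1],
                             'rel_area': [0.5, 0.5, 1.0]})
    overlap_da = overlaps.set_index(['lat', 'lon', 'poly_idx'])['rel_area'].to_xarray()
    overlap_da = overlap_da.stack(loc=['lat', 'lon']).fillna(0)
    overlap_da.sum(dim='loc').values  # array([1., 1.]) -- weights sum to 1 per polygon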


@@ -423,68 +423,57 @@ def aggregate(ds,wm):
        warnings.warn('wm.weights is: \n '+str(wm.weights)+
                      ', \n which is not a supported weight vector (in a pandas series) '+
                      'or "nowghts" as a string. Assuming no weights are included...')
        weights = np.ones((len(wm.source_grid['lat'])))
        weights = np.ones(len(wm.overlap_da['loc']))

    data_dict = dict()
    for var in ds.data_vars:
        # Process for every variable that has locational information, but isn't a
        # bound variable
        if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
            print('aggregating '+var+'...')
            # Create the column for the relevant variable
            wm.agg[var] = None
            #breakpoint()
            # Get weighted average of variable based on pixel overlap + other weights
            for poly_idx in wm.agg.poly_idx:
                # Get average value of variable over the polygon; weighted by
                # how much of the pixel area is in the polygon, and by (optionally)
                # a separate gridded weight. This if statement checks:
                # - whether the weights output for this polygon is just "np.nan", which
                #   indicates there's no overlap between the pixel grid and polygons
                # - [this check has been pushed down a few lines] whether all the pixels
                #   in the grid have only nan values for this variable
                # In both cases, the "aggregated variable" is just a vector of nans.
                if not np.isnan(wm.agg.iloc[poly_idx,:].pix_idxs).all():
                    # Get the dimensions of the variable that aren't "loc" (location)
                    other_dims = [k for k in np.atleast_1d(ds[var].dims) if k != 'loc']
                    # Check first if any nans are "complete" (meaning that a pixel
                    # either has values for each step, or nans for each step - if
                    # there are random nans within a pixel, throw a warning)
                    if not xr.Dataset.equals(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).any(other_dims),
                                             np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims)):
                        warnings.warn('One or more of the pixels in variable '+var+' have nans in them in the dimensions '+
                                      ', '.join([k for k in ds[var].dims if k != 'loc'])+
                                      '. The code can currently only deal with pixels that have nan values across '+
                                      '*all* dimensions; there is no support yet for pixels with only some nan '+
                                      'values. The aggregation calculation is likely incorrect.')

                    if not np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all():
                        # Get relative areas for the pixels overlapping with this polygon
                        tmp_areas = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].rel_area))
                        # Replace overlapping pixel areas with nans if the corresponding pixel
                        # is only composed of nans
                        tmp_areas[np.array(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims).values)] = np.nan
                        # Calculate the normalized area+weight of each pixel (taking nans
                        # into account)
                        normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)

                        # Take the weighted average of all the pixel values to calculate the
                        # aggregated value for the shapefile
                        wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
                                                       normed_areaweights).
                                                      sum('loc')).values]]

                    else:
                        wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

                else:
                    #breakpoint()
                    #wm.agg.loc[poly_idx,var] = [[np.array(np.nan)]]
                    wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]


            # select just the data for which we have overlaps
            var_array = ds[var].sel(loc=wm.overlap_da['loc'])

            # multiply percent-overlaps by the user-supplied weights
            weights_and_overlaps = wm.overlap_da * weights

            # set any nans in the gridded data to zero and zero out the
            # corresponding weights so they don't affect the aggregated value
            var_array_filled = var_array.fillna(0)
            weights_and_overlaps = weights_and_overlaps.where(var_array.notnull(), 0)

            # the weights may now add up to less than one, because of nans or
            # because of the user-supplied weights; normalize them so they add
            # up to 1, and fill any nans with 0 so they aren't counted
            normed_weights = weights_and_overlaps/weights_and_overlaps.sum(dim="loc", skipna=True)
            normed_weights = normed_weights.fillna(0)

            # finally, take the dot product to get the weighted averages
            aggregated_array = normed_weights.dot(var_array_filled)

            # if the original gridded values were all nan, make the final
            # aggregation nan as well
            if np.isnan(var_array.values).all():
                aggregated_array = aggregated_array * np.nan

            data_dict[var] = aggregated_array
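
The fill/renormalize/dot sequence above reduces to a short worked example; the numbers here are hypothetical, and a bare loc dimension stands in for the stacked grid:

    import numpy as np
    import xarray as xr

    var_array = xr.DataArray([10.0, np.nan, 30.0], dims='loc')  # middle pixel is all nan
    weights = xr.DataArray([0.5, 0.25, 0.25], dims='loc')       # overlap * user weights

    w = weights.where(var_array.notnull(), 0)  # zero the weight under the nan pixel
    w = (w / w.sum('loc')).fillna(0)           # renormalize to [2/3, 0, 1/3]
    float(w.dot(var_array.fillna(0)))          # 16.67 = (0.5*10 + 0.25*30) / 0.75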

    ds_combined = xr.Dataset(data_dict)
    df_combined = ds_combined.to_dataframe().reset_index()
    df_combined = df_combined.groupby('poly_idx').agg(list_or_first)

    wm.agg = pd.merge(wm.agg, df_combined, on='poly_idx')
    for var in ds.data_vars:
        if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
            # convert to a list of arrays - NOT SURE THIS IS THE RIGHT THING TO
            # DO, JUST TRYING TO MATCH THE ORIGINAL FORMAT
            wm.agg[var] = wm.agg[var].apply(np.array).apply(lambda x: [x])

    # Put in class format
    agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
                         geometry=wm.geometry,ds_in=ds,weights=wm.weights)
                         geometry=wm.geometry,ds_in=ds_combined,weights=wm.weights)

    # Return
    print('all variables aggregated to polygons!')