-
Notifications
You must be signed in to change notification settings - Fork 1
/
matching.py
197 lines (159 loc) · 6.23 KB
/
matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
# categorical (exact) matching
def match_categorical(
    df_pop: pd.DataFrame,
    df_pop_cols: list,
    df_pop_id: str,
    df_sample: pd.DataFrame,
    df_sample_cols: list,
    df_sample_id: str,
    chunk_size: int,
    show_progress=True,
) -> dict:
    """
    Exact-match rows of df_pop against rows of df_sample, chunk by chunk.

    df_pop is sliced into consecutive blocks of chunk_size rows. Each block
    is left-merged with df_sample on the given column pairs, and the
    resulting df_sample ids are collected into one list per df_pop id.
    Working in blocks keeps the size of each intermediate merge bounded.

    Parameters
    ----------
    df_pop: pandas DataFrame
        The DataFrame whose rows are looked up (left side of the merge)
    df_pop_cols: list
        Columns of df_pop used as merge keys
    df_pop_id: str
        Name of the identifier column in df_pop; its values become the
        keys of the returned dictionary
    df_sample: pandas DataFrame
        The DataFrame that is searched for matches (right side of the merge)
    df_sample_cols: list
        Columns of df_sample used as merge keys, paired positionally with
        df_pop_cols
    df_sample_id: str
        Name of the identifier column in df_sample; its values fill the
        lists in the returned dictionary
    chunk_size: int
        Number of df_pop rows merged per iteration
    show_progress: bool
        If True, print the row range of each chunk before it is processed

    Returns
    -------
    results: dict
        {df_pop_id value: [df_sample_id values]}. Because the merge is a
        left merge, a df_pop row with no counterpart in df_sample still
        appears as a key, with a list holding a single missing value.
        If the same df_pop_id value occurs in several chunks, the entry
        from the last chunk replaces the earlier ones.
    """
    results = {}
    n_rows = df_pop.shape[0]

    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        if show_progress:
            print("matching rows ", start, "to", stop, " out of ", n_rows)

        # left merge: every row of the current slice is kept, matched or not
        merged = pd.merge(
            df_pop.iloc[start:stop],
            df_sample,
            how="left",
            left_on=df_pop_cols,
            right_on=df_sample_cols,
        )

        # one list of sample ids per population id, merged into the output
        ids_per_pop_row = merged.groupby(df_pop_id)[df_sample_id].apply(list)
        results.update(ids_per_pop_row.to_dict())

    return results
# propensity score matching
def match_psm(df1: pd.DataFrame, df2: pd.DataFrame, matching_columns: list) -> dict:
    """
    Greedily pair rows of df1 with rows of df2 by nearest-neighbour distance.

    On every iteration a NearestNeighbors model is fitted on the remaining
    rows of df2, the closest df2 row is looked up for each remaining row of
    df1, and the pair with the smallest distance overall is recorded. Both
    rows are then removed, so the matching is without replacement.

    The loop ends as soon as either DataFrame runs out of rows. If df1 has
    more rows than df2, the surplus rows of df1 are left unmatched and do
    not appear in the result.

    Parameters
    ----------
    df1: pandas DataFrame
        The first DataFrame to be matched on
    df2: pandas DataFrame
        The second DataFrame to be matched with
    matching_columns: list
        The columns (present in both DataFrames) that the distances are
        computed on

    Returns
    -------
    matches: dict
        A dictionary with the matched row indices from the two DataFrames
        {df1 index label: df2 index label}. It is empty when df1 or df2 is
        empty.
    """
    # Initialize an empty dict to store the matches
    matches = {}

    # Matching without replacement. Checking df2 as well as df1 prevents
    # fitting the model on an empty DataFrame once df2 has been used up.
    while not df1.empty and not df2.empty:
        # Fit a NearestNeighbors model on the rows still available in df2
        nn = NearestNeighbors(n_neighbors=1, algorithm="ball_tree")
        nn.fit(df2[matching_columns])

        # For each remaining row in df1: distance to, and position of, its
        # closest row in df2 (one column each, because n_neighbors=1)
        distances, indices = nn.kneighbors(df1[matching_columns])

        # Position in df1 of the row that is closest to its nearest neighbour
        best_df1_pos = int(np.argmin(distances))
        # Position in df2 of that nearest neighbour
        best_df2_pos = int(indices.flatten()[best_df1_pos])

        # Translate positions into index labels and store the pair
        row_id_df1 = df1.index[best_df1_pos]
        row_id_df2 = df2.index[best_df2_pos]
        matches[row_id_df1] = row_id_df2

        # Remove the matched rows; drop() returns new frames, so the
        # caller's DataFrames are not modified
        df1 = df1.drop(row_id_df1)
        df2 = df2.drop(row_id_df2)

    return matches
# TODO: parallelize the matching process. See this stackoverflow suggestion
# for iterating over dict keys https://stackoverflow.com/a/30075659
def match_individuals(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    matching_columns: list,
    df1_id: str,
    df2_id: str,
    matches_hh: dict,
    show_progress: bool = False,
) -> dict:
    """
    Match individuals household by household.

    For every household pair in matches_hh, the rows of df1 belonging to
    the first household and the rows of df2 belonging to the second one
    are selected and handed to match_psm. The per-household results are
    collected into a single dictionary.

    Parameters
    ----------
    df1: pandas DataFrame
        The first DataFrame to be matched on
    df2: pandas DataFrame
        The second DataFrame to be matched with
    matching_columns: list
        The columns passed on to match_psm for the matching
    df1_id: str
        Name of the household id column in df1
    df2_id: str
        Name of the household id column in df2
    matches_hh: dict
        Matched household ids {df1 household id: df2 household id}.
        Entries whose value is missing (NaN / None) are skipped. The
        remaining values are cast to int before they are compared with
        the df2_id column.
    show_progress: bool
        If True, print a counter for each household that is processed

    Returns
    -------
    matches: dict
        A dictionary with the matched row indices from the two DataFrames
        {df1: df2}, as returned by match_psm for each household
    """
    # Keep only the households that actually have a counterpart
    matched_households = {
        hh_df1: hh_df2
        for hh_df1, hh_df2 in matches_hh.items()
        if not pd.isna(hh_df2)
    }
    n_households = len(matched_households)

    matches = {}
    for counter, (hh_df1, hh_df2) in enumerate(matched_households.items(), start=1):
        # Members of the two households that were matched to each other
        members_df1 = df1.loc[df1[df1_id] == hh_df1]
        members_df2 = df2.loc[df2[df2_id] == int(hh_df2)]

        if show_progress:
            # Report the iteration number and the number of households
            print(f"Matching for household {counter} out of: {n_households}")

        # Match the members and merge the result into the main dict
        matches.update(match_psm(members_df1, members_df2, matching_columns))

    return matches