-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_match_households_and_individuals.py
1115 lines (937 loc) · 38.9 KB
/
2_match_households_and_individuals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import pickle as pkl
from pathlib import Path
import numpy as np
import pandas as pd
# from joblib import Parallel, delayed
# from tqdm import trange
import acbm
from acbm.cli import acbm_cli
from acbm.config import load_config
from acbm.logger_config import matching_logger as logger
from acbm.matching import match_categorical, match_individuals
from acbm.preprocessing import (
count_per_group,
# nts_filter_by_region,
nts_filter_by_year,
num_adult_child_hh,
transform_by_group,
truncate_values,
)
# Entry point. Registered as a CLI command by the acbm_cli decorator;
# config_file is the path to the run configuration consumed by load_config.
@acbm_cli
def main(config_file):
config = load_config(config_file)
# Seed the random number generators from the config so runs are reproducible.
config.init_rng()
# Show all columns when printing DataFrames (script was converted from a notebook).
pd.set_option("display.max_columns", None)
# Build the path for an interim matching artifact, creating the output
# directory on first use. Returns the joined path as a string.
def get_interim_path(
file_name: str, path: str | Path = acbm.root_path / "data/interim/matching/"
) -> str:
os.makedirs(path, exist_ok=True)
return f"{path}/{file_name}"
# ## Step 1: Load in the datasets
# ### SPC
# useful variables
# NOTE(review): the study region is hard-coded here — presumably this should
# come from the config; confirm before reuse on another region.
region = "leeds"
logger.info("Loading SPC data")
# Read in the spc data (parquet format). The two adjacent string pieces are
# concatenated by Python into one file name.
spc = pd.read_parquet(
acbm.root_path / "data/external/spc_output/" f"{region}_people_hh.parquet"
)
logger.info("Filtering SPC data to specific columns")
# select columns
spc = spc[
[
"id",
"household",
"pid_hs",
"msoa11cd",
"oa11cd",
"members",
"sic1d2007",
"sic2d2007",
"pwkstat",
"salary_yearly",
"salary_hourly",
"hid",
"accommodation_type",
"communal_type",
"num_rooms",
"central_heat",
"tenure",
"num_cars",
"sex",
"age_years",
"ethnicity",
"nssec8",
]
]
logger.info("Sampling SPC data")
# --- temporary reduction of the dataset for quick analysis
# Identify unique households
unique_households = spc["household"].unique()
# Sample a subset of households; RNG seeded above with `init_rng`.
# If number_of_households is not set in the config, keep every household.
sampled_households = pd.Series(unique_households).sample(
n=(
config.parameters.number_of_households
if config.parameters.number_of_households is not None
else unique_households.shape[0]
),
)
# Filter the original DataFrame based on the sampled households
spc = spc[spc["household"].isin(sampled_households)]
logger.info(f"Sampled {spc.shape[0]} individuals from SPC data")
# ### NTS
#
# The NTS is split up into multiple tables. We will load in the following tables:
# - individuals
# - households
# - trips
logger.info("Loading NTS data")
# #### PSU
# The PSU table links records to survey year / region and is used below to
# filter the other tables.
logger.info("Loading NTS data: PSU table")
path_psu = (
acbm.root_path / "data/external/nts/UKDA-5340-tab/tab/psu_eul_2002-2022.tab"
)
psu = pd.read_csv(path_psu, sep="\t")
# #### Individuals
logger.info("Loading NTS data: individuals table")
path_individuals = (
acbm.root_path
/ "data/external/nts/UKDA-5340-tab/tab/individual_eul_2002-2022.tab"
)
# Only load the columns needed downstream (the full table is much wider).
nts_individuals = pd.read_csv(
path_individuals,
sep="\t",
usecols=[
"IndividualID",
"HouseholdID",
"PSUID",
"Age_B01ID",
"Age_B04ID",
"Sex_B01ID",
"OfPenAge_B01ID",
"HRPRelation_B01ID",
"EdAttn1_B01ID",
"EdAttn2_B01ID",
"EdAttn3_B01ID",
"OwnCycle_B01ID", # Owns a cycle
"DrivLic_B02ID", # type of driving license
"CarAccess_B01ID",
"IndIncome2002_B02ID",
"IndWkGOR_B02ID", # Region of usual place of work
"EcoStat_B02ID", # Working status of individual
"EcoStat_B03ID",
"NSSec_B03ID", # NSSEC high level breakdown
"SC_B01ID", # Social class of individual
"Stat_B01ID", # employee or self-employed
"WkMode_B01ID", # Usual means of travel to work
"WkHome_B01ID", # Work from home
"PossHom_B01ID", # Is it possible to work from home?
"OftHome_B01ID", # How often work from home
"TravSh_B01ID", # Usual mode from main food shopping trip
"SchDly_B01ID", # Daily school journey?
"SchTrav_B01ID", # Usual mode of travel to school
"SchAcc_B01ID", # IS school trip accompanied by an adult?
"FdShp_B01ID", # How do you usually carry out main food shop (go to shop, online etc)
],
)
# #### Households
logger.info("Loading NTS data: household table")
path_households = (
acbm.root_path
/ "data/external/nts/UKDA-5340-tab/tab/household_eul_2002-2022.tab"
)
# Only load the columns needed downstream.
nts_households = pd.read_csv(
path_households,
sep="\t",
usecols=[
"HouseholdID",
"PSUID",
"HHIncome2002_B02ID",
"AddressType_B01ID", # type of house
"Ten1_B02ID", # type of tenure
"HHoldNumAdults", # total no. of adults in household
"HHoldNumChildren", # total no. of children in household
"HHoldNumPeople", # total no. of people in household
"NumLicHolders", # total no. of driving license holders in household
"HHoldEmploy_B01ID", # number of employed in household
"NumBike", # no. of bikes
"NumCar", # no. of cars
"NumVanLorry", # no. of vans or lorries
"NumMCycle", # no. of motorcycles
"WalkBus_B01ID", # walk time from house to nearest bus stop
"Getbus_B01ID", # frequency of bus service
"WalkRail_B01ID", # walk time from house to nearest rail station
"JTimeHosp_B01ID", # journey time to nearest hospital
"DVShop_B01ID", # person no. for main food shopper in hh
"Settlement2011EW_B03ID", # ONS Urban/Rural: 2 categories
"Settlement2011EW_B04ID", # ONS Urban/Rural: 3 categories
"HHoldOAClass2011_B03ID", # Census 2011 OA Classification
"HRPWorkStat_B02ID", # HH ref person working status
"HRPSEGWorkStat_B01ID", # HH ref person socio economic group for active workers
"W0", # Unweighted interview sample
"W1", # Unweighted diary sample
"W2", # Weighted diary sample
"W3", # Weighted interview sample
],
)
# #### Trips
logger.info("Loading NTS data: trips table")
path_trips = (
acbm.root_path / "data/external/nts/UKDA-5340-tab/tab/trip_eul_2002-2022.tab"
)
# Only load the columns needed downstream.
nts_trips = pd.read_csv(
path_trips,
sep="\t",
usecols=[
"TripID",
"DayID",
"IndividualID",
"HouseholdID",
"PSUID",
"PersNo",
"TravDay",
"JourSeq",
"ShortWalkTrip_B01ID",
"NumStages",
"MainMode_B03ID",
"MainMode_B04ID",
"TripPurpFrom_B01ID",
"TripPurpTo_B01ID",
"TripPurpose_B04ID",
"TripStart",
"TripEnd",
"TripTotalTime",
"TripTravTime",
"TripDisIncSW",
"TripDisExSW",
"TripOrigGOR_B02ID",
"TripDestGOR_B02ID",
"W5",
"W5xHH",
],
)
# #### Filter by year
#
# We will filter the NTS data to only include data from specific years. We can choose
# only 1 year, or multiple years to increase our sample size and the likelihood of a
# match with the spc.
logger.info("Filtering NTS data by specified year(s)")
# NOTE(review): years are hard-coded — presumably these should come from the
# config; confirm before reuse.
years = [2019, 2021, 2022]
nts_individuals = nts_filter_by_year(nts_individuals, psu, years)
nts_households = nts_filter_by_year(nts_households, psu, years)
nts_trips = nts_filter_by_year(nts_trips, psu, years)
# #### Filter by geography
#
# I will not do this for categorical matching, as it reduces the sample significantly,
# and leads to more spc households not being matched
# regions = ['Yorkshire and the Humber', 'North West']
# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
# nts_households = nts_filter_by_region(nts_households, psu, regions)
# nts_trips = nts_filter_by_region(nts_trips, psu, regions)
# Create dictionaries of key value pairs
"""
guide to the dictionaries:
_nts_hh: from NTS households table
_nts_ind: from NTS individuals table
_spc: from SPC
"""
logger.info("Categorical matching: Data preparation")
logger.info("Categorical matching: Creating dictionaries")
# The dictionaries below map the coded survey values to human-readable labels.
# They are used for reference and (via dict_nts / dict_spc) for later lookups.
# ---------- NTS
# Create a dictionary for the HHIncome2002_B02ID column
income_dict_nts_hh = {
"1": "0-25k",
"2": "25k-50k",
"3": "50k+",
"-8": "NA",
# should be -10, but
# it could be a typo in household_eul_2002-2022_ukda_data_dictionary
"-1": "DEAD",
}
# Create a dictionary for the HHoldEmploy_B01ID column
# (PT: Part time, FT: Full time)
employment_dict_nts_hh = {
"1": "None",
"2": "0 FT, 1 PT",
"3": "1 FT, 0 PT",
"4": "0 FT, 2 PT",
"5": "1 FT, 1 PT",
"6": "2 FT, 0 PT",
"7": "1 FT, 2+ PT",
"8": "2 FT, 1+ PT",
"9": "0 FT, 3+ PT",
"10": "3+ FT, 0 PT",
"11": "3+ FT, 1+ PT",
"-8": "NA",
"-10": "DEAD",
}
# Create a dictionary for the Ten1_B02ID column
tenure_dict_nts_hh = {
"1": "Owns / buying",
"2": "Rents",
"3": "Other (including rent free)",
"-8": "NA",
"-9": "DNA",
"-10": "DEAD",
}
# ---------- SPC
# create a dictionary for the pwkstat column
employment_dict_spc = {
"0": "Not applicable (age < 16)",
"1": "Employee FT",
"2": "Employee PT",
"3": "Employee unspecified",
"4": "Self-employed",
"5": "Unemployed",
"6": "Retired",
"7": "Homemaker/Maternal leave",
"8": "Student",
"9": "Long term sickness/disability",
"10": "Other",
}
# Create a dictionary for the tenure column
tenure_dict_spc = {
"1": "Owned: Owned outright",
"2": "Owned: Owned with a mortgage or loan or shared ownership",
"3": "Rented or living rent free: Total",
"4": "Rented: Social rented",
"5": "Rented: Private rented or living rent free",
"-8": "NA",
"-9": "DNA",
"-10": "DEAD",
}
# Combine the dictionaries into a dictionary of dictionaries, keyed by the
# source column name in each dataset.
dict_nts = {
"HHIncome2002_B02ID": income_dict_nts_hh,
"HHoldEmploy_B01ID": employment_dict_nts_hh,
"Ten1_B02ID": tenure_dict_nts_hh,
}
dict_spc = {"pwkstat": employment_dict_spc, "tenure": tenure_dict_spc}
# ## Step 2: Decide on matching variables
#
# We need to identify the socio-demographic characteristics that we will match on. The
# schema for the synthetic population can be found [here](https://github.com/alan-turing-institute/uatk-spc/blob/main/synthpop.proto).
#
# Matching between the SPC and the NTS will happen in two steps:
#
# 1. Match at the household level
# 2. Match individuals within the household
#
# ### Household level matching
#
# | Variable | Name (NTS) | Name (SPC) | Transformation (NTS) | Transformation (SPC) |
# | ------------------ | -------------------- | --------------- | -------------------- | -------------------- |
# | Household income | `HHIncome2002_BO2ID` | `salary_yearly` | NA | Group by household ID and sum |
# | Number of adults | `HHoldNumAdults` | `age_years` | NA | Group by household ID and count |
# | Number of children | `HHoldNumChildren` | `age_years` | NA | Group by household ID and count |
# | Employment status | `HHoldEmploy_B01ID` | `pwkstat` | NA | a) match to NTS categories. b) group by household ID |
# | Car ownership | `NumCar` | `num_cars` | SPC is capped at 2. We change all entries > 2 to 2 | NA |
#
# Other columns to match in the future
# | Variable | Name (NTS) | Name (SPC) | Transformation (NTS) | Transformation (SPC) |
# | ------------------ | -------------------- | --------------- | -------------------- | -------------------- |
# | Type of tenancy | `Ten1_B02ID` | `tenure` | ?? | ?? |
# | Urban-Rural classification of residence | `Settlement2011EW_B04ID` | NA | NA | Spatial join between [layer](https://www.gov.uk/government/collections/rural-urban-classification) and SPC |
#
#
# ### 2.1 Edit SPC columns
# #### Household Income
logger.info("Categorical matching: Editing SPC columns (HH income)")
#
# Edit the spc so that we have household income as well as individual income.
# add household income column for SPC (sum of salary_yearly over the household)
spc_edited = transform_by_group(
data=spc,
group_col="household",
transform_col="salary_yearly",
new_col="salary_yearly_hh",
transformation_type="sum",
)
# --- Recode column so that it matches the reported NTS values (Use income_dict_nts_hh
# dictionary for reference)
# Define the bins: 0-25k -> 1, 25k-50k -> 2, 50k+ -> 3
bins = [0, 24999, 49999, np.inf]
# Define the labels for the bins
labels = [1, 2, 3]
spc_edited = spc_edited.copy()
# pd.cut returns a Categorical, so we go via str -> float to get a numeric
# column in which unbinned rows become NaN.
spc_edited["salary_yearly_hh_cat"] = (
pd.cut(
spc_edited["salary_yearly_hh"],
bins=bins,
labels=labels,
include_lowest=True,
)
.astype("str")
.astype("float")
)
# replace NA values with -8 (to be consistent with NTS)
spc_edited["salary_yearly_hh_cat"] = spc_edited["salary_yearly_hh_cat"].fillna(-8)
# Convert the column to int (safe now that NaNs have been filled)
spc_edited["salary_yearly_hh_cat"] = spc_edited["salary_yearly_hh_cat"].astype(
"int"
)
# #### Household Composition (No. of Adults / Children)
logger.info(
"Categorical matching: Editing SPC columns (number of adults / children)"
)
# Add num_adults / num_children (and related) columns per household,
# derived from age_years.
spc_edited = num_adult_child_hh(
data=spc_edited, group_col="household", age_col="age_years"
)
# #### Employment Status
logger.info("Categorical matching: Editing SPC columns (employment status)")
# Employment status
# (see dict_spc["pwkstat"] and dict_nts["HHoldEmploy_B01ID"] above for the
# category labels of both datasets)
# The NTS only reports the number of Full time and Part time employees for each
# household. For the SPC we also need to get the number of full time and part-time
# workers for each household.
#
# Step 1: Create a column for Full time and a column for Part time
# We will only use '1' (Employee FT) and '2' (Employee PT) from pwkstat
counts_df = count_per_group(
df=spc_edited,
group_col="household",
count_col="pwkstat",
values=[1, 2],
value_names=["pwkstat_FT_hh", "pwkstat_PT_hh"],
)
# Create a column that matches the NTS categories (m FT, n PT).
# Reference — dict_nts["HHoldEmploy_B01ID"]:
#   '1': 'None',         '2': '0 FT, 1 PT',   '3': '1 FT, 0 PT',
#   '4': '0 FT, 2 PT',   '5': '1 FT, 1 PT',   '6': '2 FT, 0 PT',
#   '7': '1 FT, 2+ PT',  '8': '2 FT, 1+ PT',  '9': '0 FT, 3+ PT',
#   '10': '3+ FT, 0 PT', '11': '3+ FT, 1+ PT',
#   '-8': 'NA',          '-10': 'DEAD'
# 1) Match each row to the NTS
# Define the conditions and outputs.
# We are using the keys in dict_nts['HHoldEmploy_B01ID'] as reference
conditions = [
(counts_df["pwkstat_FT_hh"] == 0) & (counts_df["pwkstat_PT_hh"] == 0),
(counts_df["pwkstat_FT_hh"] == 0) & (counts_df["pwkstat_PT_hh"] == 1),
(counts_df["pwkstat_FT_hh"] == 1) & (counts_df["pwkstat_PT_hh"] == 0),
(counts_df["pwkstat_FT_hh"] == 0) & (counts_df["pwkstat_PT_hh"] == 2),
(counts_df["pwkstat_FT_hh"] == 1) & (counts_df["pwkstat_PT_hh"] == 1),
(counts_df["pwkstat_FT_hh"] == 2) & (counts_df["pwkstat_PT_hh"] == 0),
(counts_df["pwkstat_FT_hh"] == 1) & (counts_df["pwkstat_PT_hh"] >= 2),
(counts_df["pwkstat_FT_hh"] == 2) & (counts_df["pwkstat_PT_hh"] >= 1),
(counts_df["pwkstat_FT_hh"] == 0) & (counts_df["pwkstat_PT_hh"] >= 3),
(counts_df["pwkstat_FT_hh"] >= 3) & (counts_df["pwkstat_PT_hh"] == 0),
(counts_df["pwkstat_FT_hh"] >= 3) & (counts_df["pwkstat_PT_hh"] >= 1),
]
# Define the corresponding outputs based on dict_nts['HHoldEmploy_B01ID']
outputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
# Create a new column using np.select; any combination not covered above
# falls through to -8 ("NA")
counts_df["pwkstat_NTS_match"] = np.select(conditions, outputs, default=-8)
# 2) merge back onto the spc (counts_df is indexed by household id, so we
# join the SPC "household" column against the counts_df index)
spc_edited = spc_edited.merge(counts_df, left_on="household", right_index=True)
# #### Urban Rural Classification
logger.info(
"Categorical matching: Editing SPC columns (urban / rural classification)"
)
# We use the 2011 rural urban classification to match the SPC to the NTS. The NTS has 2 columns that we can use to match to the SPC: `Settlement2011EW_B03ID` and `Settlement2011EW_B04ID`. The `Settlement2011EW_B03ID` column is more general (urban / rural only), while the `Settlement2011EW_B04ID` column is more specific. We stick to the more general column for now.
# read the rural urban classification data
rural_urban = pd.read_csv(
acbm.root_path / "data/external/census_2011_rural_urban.csv", sep=","
)
# merge the rural_urban data with the spc
# NOTE(review): this is an inner merge (pandas default), so SPC rows whose
# oa11cd has no entry in the classification file are silently dropped — verify
# this is intended.
spc_edited = spc_edited.merge(
rural_urban[["OA11CD", "RUC11", "RUC11CD"]], left_on="oa11cd", right_on="OA11CD"
)
# create dictionary from the NTS `Settlement2011EW_B03ID` column
Settlement2011EW_B03ID_nts_hh = {
"1": "Urban",
"2": "Rural",
"3": "Scotland",
"-8": "NA",
"-10": "DEAD",
}
Settlement2011EW_B04ID_nts_hh = {
"1": "Urban Conurbation",
"2": "Urban City and Town",
"3": "Rural Town and Fringe",
"4": "Rural Village, Hamlet and Isolated Dwellings",
"5": "Scotland",
"-8": "NA",
"-10": "DEAD",
}
# Map the detailed census 2011 classes onto the coarser NTS labels
census_2011_to_nts_B03ID = {
"Urban major conurbation": "Urban",
"Urban minor conurbation": "Urban",
"Urban city and town": "Urban",
"Urban city and town in a sparse setting": "Urban",
"Rural town and fringe": "Rural",
"Rural town and fringe in a sparse setting": "Rural",
"Rural village": "Rural",
"Rural village in a sparse setting": "Rural",
"Rural hamlets and isolated dwellings": "Rural",
"Rural hamlets and isolated dwellings in a sparse setting": "Rural",
}
census_2011_to_nts_B04ID = {
"Urban major conurbation": "Urban Conurbation",
"Urban minor conurbation": "Urban Conurbation",
"Urban city and town": "Urban City and Town",
"Urban city and town in a sparse setting": "Urban City and Town",
"Rural town and fringe": "Rural Town and Fringe",
"Rural town and fringe in a sparse setting": "Rural Town and Fringe",
"Rural village": "Rural Village, Hamlet and Isolated Dwellings",
"Rural village in a sparse setting": "Rural Village, Hamlet and Isolated Dwellings",
"Rural hamlets and isolated dwellings": "Rural Village, Hamlet and Isolated Dwellings",
"Rural hamlets and isolated dwellings in a sparse setting": "Rural Village, Hamlet and Isolated Dwellings",
}
# add the nts Settlement2011EW_B03ID and Settlement2011EW_B04ID columns to the spc
spc_edited["Settlement2011EW_B03ID_spc"] = spc_edited["RUC11"].map(
census_2011_to_nts_B03ID
)
spc_edited["Settlement2011EW_B04ID_spc"] = spc_edited["RUC11"].map(
census_2011_to_nts_B04ID
)
spc_edited.head()
# add the keys from nts_Settlement2011EW_B03ID and nts_Settlement2011EW_B04ID to the spc based on above mappings
# reverse the dictionaries (label -> code) so we can map labels back to codes
Settlement2011EW_B03ID_nts_rev = {
v: k for k, v in Settlement2011EW_B03ID_nts_hh.items()
}
# map the values
# NOTE(review): astype("int") raises if any label failed to map (NaN) — this
# relies on every RUC11 value being covered by the mapping dicts above.
spc_edited["Settlement2011EW_B03ID_spc_CD"] = (
spc_edited["Settlement2011EW_B03ID_spc"]
.map(Settlement2011EW_B03ID_nts_rev)
.astype("int")
)
Settlement2011EW_B04ID_nts_rev = {
v: k for k, v in Settlement2011EW_B04ID_nts_hh.items()
}
spc_edited["Settlement2011EW_B04ID_spc_CD"] = (
spc_edited["Settlement2011EW_B04ID_spc"]
.map(Settlement2011EW_B04ID_nts_rev)
.astype("int")
)
spc_edited.head()
# ### 2.2 Edit NTS columns
# (fixed: log messages below had unbalanced parentheses)
logger.info("Categorical matching: Editing NTS columns (number of pensioners)")
# #### Number of people of pension age
# Count, per NTS household, the individuals flagged as being of pension age.
nts_pensioners = count_per_group(
df=nts_individuals,
group_col="HouseholdID",
count_col="OfPenAge_B01ID",
values=[1],
value_names=["num_pension_age_nts"],
)
nts_pensioners.head()
# join onto the nts household df (left join keeps households with no match)
nts_households = nts_households.merge(
nts_pensioners, left_on="HouseholdID", right_index=True, how="left"
)
# #### Number of cars
logger.info("Categorical matching: Editing NTS columns (number of cars)")
# - `SPC.num_cars` only has values [0, 1, 2]. 2 is for all households with 2 or more cars
# - `NTS.NumCar` is more detailed. It has the actual value of the number of cars. We will cap this at 2.
# Create a new column in NTS with the capped value so both datasets agree.
nts_households.loc[:, "NumCar_SPC_match"] = nts_households["NumCar"].apply(
truncate_values, upper=2
)
# #### Type of tenancy
logger.info("Categorical matching: Editing NTS columns (tenure status)")
# Create dictionaries to map tenure onto the spc and nts dfs
# Dictionary showing how we want the final columns to look like
# (kept for reference only — not used programmatically)
_tenure_dict_nts_spc = {
1: "Owned",
2: "Rented or rent free",
-8: "NA",
-9: "DNA",
-10: "DEAD",
}
# Matching NTS to tenure_dict_nts_spc
# Create a new dictionary for matching (NTS code -> common code)
matching_dict_nts_tenure = {1: 1, 2: 2, 3: 2}
# SPC code -> common code
matching_dict_spc_tenure = {
1: 1, #'Owned: Owned outright' : 'Owned'
2: 1, #'Owned: Owned with a mortgage or loan or shared ownership', : 'Owned'
3: 2, #'Rented or living rent free: Total', : 'Rented or rent free'
4: 2, #'Rented: Social rented', : 'Rented or rent free'
5: 2, #'Rented: Private rented or living rent free', : 'Rented or rent free'
}
# map dictionaries to create comparable columns
# Create a new column in nts_households
nts_households["tenure_nts_for_matching"] = (
nts_households["Ten1_B02ID"]
.map(matching_dict_nts_tenure) # map the values to the new dictionary
.fillna(nts_households["Ten1_B02ID"])
) # fill the NaNs (unmapped codes, e.g. -8/-9/-10) with the original values
# Create a new column in spc
spc_edited["tenure_spc_for_matching"] = (
spc_edited["tenure"]
.map(matching_dict_spc_tenure) # map the values to the new dictionary
.fillna(spc_edited["tenure"])
) # fill the NaNs with the original values
# ## Step 3: Matching at Household Level
logger.info("Categorical matching: MATCHING HOUSEHOLDS")
#
# Now that we've prepared all the columns, we can start matching.
# ### 3.1 Categorical matching
#
# We will match on (a subset of) the following columns:
#
# | Matching variable | NTS column | SPC column |
# | ------------------| ---------- | ---------- |
# | Household income | `HHIncome2002_BO2ID` | `salary_yearly_hh_cat` |
# | Number of adults | `HHoldNumAdults` | `num_adults` |
# | Number of children | `HHoldNumChildren` | `num_children` |
# | Employment status | `HHoldEmploy_B01ID` | `pwkstat_NTS_match` |
# | Car ownership | `NumCar_SPC_match` | `num_cars` |
# | Type of tenancy | `tenure_nts_for_matching` | `tenure_spc_for_matching` |
# | Rural/Urban Classification | `Settlement2011EW_B03ID` | `Settlement2011EW_B03ID_spc_CD` |
# Prepare SPC df for matching
# Select multiple columns — order here must mirror nts_matching below, since
# the two column lists are zipped together pairwise.
spc_matching = spc_edited[
[
"hid",
"salary_yearly_hh_cat",
"num_adults",
"num_children",
"num_pension_age",
"pwkstat_NTS_match",
"num_cars",
"tenure_spc_for_matching",
"Settlement2011EW_B03ID_spc_CD",
"Settlement2011EW_B04ID_spc_CD",
]
]
# edit the df so that we have one row per hid (household-level attributes are
# repeated on every member row, so the first row per hid suffices)
spc_matching = spc_matching.drop_duplicates(subset="hid")
spc_matching.head(10)
# Prepare NTS df for matching (same variable order as spc_matching)
nts_matching = nts_households[
[
"HouseholdID",
"HHIncome2002_B02ID",
"HHoldNumAdults",
"HHoldNumChildren",
"num_pension_age_nts",
"HHoldEmploy_B01ID",
"NumCar_SPC_match",
"tenure_nts_for_matching",
"Settlement2011EW_B03ID",
"Settlement2011EW_B04ID",
]
]
# Dictionary of matching columns. We extract column names from this dictionary when matching on a subset of the columns
# column_names (keys) for the dictionary
matching_ids = [
"household_id",
"yearly_income",
"number_adults",
"number_children",
"num_pension_age",
"employment_status",
"number_cars",
"tenure_status",
"rural_urban_2_categories",
"rural_urban_4_categories",
]
# Build {variable_name: [spc_column_name, nts_column_name]}.
# Iterating a DataFrame yields its column names, so zipping the two frames
# pairs up the positionally-corresponding column names.
matching_dfs_dict = {
column_name: [spc_value, nts_value]
for column_name, spc_value, nts_value in zip(
matching_ids, spc_matching, nts_matching
)
}
# #### Match on a subset of columns (exclude salary, tenure, and employment status)
#
# To decide on the subset of columns to match on, we explore the results from different combinations. This is shown in a separate notebook: `2.1_sandbox-match_households.ipynb`.
# columns for matching
keys = [
"number_adults",
"number_children",
"num_pension_age",
"number_cars",
"rural_urban_2_categories",
]
# extract equivalent column names from dictionary ([0] = SPC, [1] = NTS)
spc_cols = [matching_dfs_dict[key][0] for key in keys]
nts_cols = [matching_dfs_dict[key][1] for key in keys]
# Match: returns a dict mapping each SPC hid to the NTS HouseholdIDs with
# identical values on the selected columns (NaN when no NTS household matches)
matches_hh_level = match_categorical(
df_pop=spc_matching,
df_pop_cols=spc_cols,
df_pop_id="hid",
df_sample=nts_matching,
df_sample_cols=nts_cols,
df_sample_id="HouseholdID",
chunk_size=50000,
show_progress=True,
)
# Number of unmatched households: count entries whose value is entirely NaN
# (generator expression — no intermediate list needed)
na_count = sum(1 for v in matches_hh_level.values() if pd.isna(v).all())
logger.info(f"Categorical matching: {na_count} households in the SPC had no match")
logger.info(
f"{round((na_count / len(matches_hh_level)) * 100, 1)}% of households in the SPC had no match"
)
## add matches_hh_level as a column in spc_edited
spc_edited["nts_hh_id"] = spc_edited["hid"].map(matches_hh_level)
# ### Random Sampling from matched households
logger.info("Categorical matching: Randomly choosing one match per household")
#
# In categorical matching, many households in the SPC are matched to more than 1 household in the NTS. Which household to choose? We do random sampling
# for each key in the dictionary, sample 1 of the values associated with it and store it in a new dictionary
"""
- iterate over each key-value pair in the matches_hh_result dictionary.
- For each key-value pair, use np.random.choice(value) to randomly select
one item from the list of values associated with the current key.
- create a new dictionary hid_to_HouseholdID_sample where each key from the
original dictionary is associated with one randomly selected value from the
original list of values.
"""
# NOTE(review): np.random.choice uses numpy's global RNG — presumably seeded
# by config.init_rng() above; confirm for reproducibility.
matches_hh_level_sample = {
key: np.random.choice(value) for key, value in matches_hh_level.items()
}
# remove items in list where value is nan (unmatched households)
matches_hh_level_sample = {
key: value
for key, value in matches_hh_level_sample.items()
if not pd.isna(value)
}
# Multiple matches in case we want to try stochastic runs
# same logic as cell above, but repeat it multiple times and store each result as a separate dictionary in a list
matches_hh_level_sample_list = [
{key: np.random.choice(value) for key, value in matches_hh_level.items()}
for i in range(25)
]
logger.info("Categorical matching: Random sampling complete")
# Save results
logger.info("Categorical matching: Saving results")
# random sample
with open(
get_interim_path("matches_hh_level_categorical_random_sample.pkl"), "wb"
) as f:
pkl.dump(matches_hh_level_sample, f)
# multiple random samples
with open(
get_interim_path("matches_hh_level_categorical_random_sample_multiple.pkl"),
"wb",
) as f:
pkl.dump(matches_hh_level_sample_list, f)
# Do the same at the df level. Add nts_hh_id_sample column to the spc df
# # for each hid in spc_edited, sample a value from the nts_hh_id col.
# spc_edited['nts_hh_id_sample'] = spc_edited['nts_hh_id'].apply(lambda x: np.random.choice(x) if x is not np.nan else np.nan)
# # All rows with the same 'hid' should have the same value for 'nts_hh_id_sample'. Group by hid and assign the first value to all rows in the group
# spc_edited['nts_hh_id_sample'] = spc_edited.groupby('hid')['nts_hh_id_sample'].transform('first')
# spc_edited.head(10)
# ## Step 4: Matching at Individual Level
#
# 1) Prepare columns for matching - they should all be numerical
# a) age_years in the SPC -> Convert from actual age to age brackets from the dictionary
# 2) Filter to specific household
# 3) Nearest neighbor merge without replacement (edit while function below)
#
#
logger.info("Statistical matching: MATCHING INDIVIDUALS")
# Create an 'age' column in the SPC that matches the NTS categories
# Reference: how the labels for "Age_B04ID" map to the actual age brackets:
# -10: DEAD, -8: NA, 1: 0-4, 2: 5-10, 3: 11-16, 4: 17-20,
#  5: 21-29, 6: 30-39, 7: 40-49, 8: 50-59, 9: 60+
# Define the bins and labels based on the brackets above
bins = [0, 4, 10, 16, 20, 29, 39, 49, 59, np.inf]
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Create a new column in spc_edited that maps the age_years to the bracket keys.
# FIX: include_lowest=True so age 0 is binned (bins are otherwise left-open),
# and NaNs are filled with -8 *before* the int cast — casting a column that
# still contains NaN to int raises ValueError. Same str -> float -> fillna ->
# int pattern as salary_yearly_hh_cat above (pd.cut returns a Categorical).
spc_edited["age_group"] = (
pd.cut(spc_edited["age_years"], bins=bins, labels=labels, include_lowest=True)
.astype("str")
.astype("float")
.fillna(-8)
.astype("int")
)
# rename nts columns in preparation for matching
nts_individuals.rename(
columns={"Age_B04ID": "age_group", "Sex_B01ID": "sex"}, inplace=True
)
# PSM matching using internal match_individuals function: for each matched
# household pair, pair up individuals on age_group and sex.
matches_ind = match_individuals(
df1=spc_edited,
df2=nts_individuals,
matching_columns=["age_group", "sex"],
df1_id="hid",
df2_id="HouseholdID",
matches_hh=matches_hh_level_sample,
show_progress=True,
)
# Add matches_ind values to spc_edited using map (keyed on spc_edited's index)
spc_edited["nts_ind_id"] = spc_edited.index.map(matches_ind)
# add the nts_individuals.IndividualID to spc_edit. The current nts_ind_id is the row index of nts_individuals
spc_edited["nts_ind_id"] = spc_edited["nts_ind_id"].map(
nts_individuals["IndividualID"]
)
logger.info("Statistical matching: Matching complete")
# save random sample
with open(
get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb"
) as f:
pkl.dump(matches_ind, f)
# ### Match on multiple samples
# logger.info("Statistical matching: Matching on multiple samples")
# # In household level matching, some households in the SPC are matched to multiple households in the NTS. To have 1:1 match between the SPC and NTS, we randomly sample from the list of matches
# #
# # The random sample produces different results each time. In `matches_hh_level_sample_list` we did many iterations of random sampling to produce multiple results of household matching, and saved the output in a list of dictionaries.
# #
# # Here, we iterate over the list and do individual matching for each item. The output is a list of n dictionaries, each of which could be used as a synthetic population matched to the NTS
# # iterate over all items in the matches_hh_level_sample_list and apply the match_individuals function to each
# parallel = Parallel(n_jobs=-1, return_as="generator")
# matches_list_of_dict = list(
# parallel(
# delayed(match_individuals)(
# df1=spc_edited,
# df2=nts_individuals,
# matching_columns=["age_group", "sex"],
# df1_id="hid",
# df2_id="HouseholdID",
# matches_hh=matches_hh_level_sample_list[i],
# show_progress=False,
# )
# for i in trange(len(matches_hh_level_sample_list))
# )
# )
# # Save the results of individual matching
# logger.info("Statistical matching: Saving results")
# # save multiple random samples
# with open(
# get_interim_path("matches_ind_level_categorical_random_sample_multiple.pkl"), "wb"
# ) as f:
# pkl.dump(matches_list_of_dict, f)
# ### Add trip data
logger.info("Post-processing: Editing column names")
# Rename columns and map actual modes and trip purposes to the trip table.
# (The multi-sample matching block above is intentionally disabled.)
#
# Code taken from: https://github.com/arup-group/pam/blob/main/examples/07_travel_survey_to_matsim.ipynb
nts_trips = nts_trips.rename(
columns={ # rename data
"JourSeq": "seq",
"TripOrigGOR_B02ID": "ozone",
"TripDestGOR_B02ID": "dzone",
"TripPurpFrom_B01ID": "oact",
"TripPurpTo_B01ID": "dact",