-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathitem_matrix_singlemarkers_climate.py
125 lines (99 loc) · 4.75 KB
/
item_matrix_singlemarkers_climate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
'''
Determines the percentage of people with each individual demographic marker who prioritized each item in the CLIMATE UPV game
Inputs: Prepared survey data file, list of markers (i.e., one-hot columns in data file), list of items.
Outputs: Matrix showing the selection of each item across all markers (plot and Excel file).
Author: Alycia Leonard
Contact: [email protected]
Last edited: 2024-03-08
'''
import pandas as pd
import numpy as np
#import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import os
#warnings.simplefilter(action='ignore', category=FutureWarning)
# Locate the data, parameters, and results directories relative to the
# current working directory.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, "data")
parameters_directory = os.path.join(current_directory, "parameters")
results_directory = os.path.join(current_directory, "results")

# Input files: prepared survey data plus the marker and item lists.
survey_filepath = os.path.join(data_directory, "Siaya_UPV_Survey_Prepared.csv")
markers_filepath = os.path.join(parameters_directory, "markers.csv")
items_filepath = os.path.join(parameters_directory, "items.csv")

# Output files. The results directory is created up front so that a missing
# folder cannot make the save steps fail after the analysis has already run.
os.makedirs(results_directory, exist_ok=True)
excel_save_path = os.path.join(results_directory, "item_matrix_ALL_percent_climate.xlsx")
figure_save_path = os.path.join(results_directory, "item_matrix_ALL_percent_climate_top15.png")

# Load the list of markers: every cell of markers.csv flattened into a list.
markers = pd.read_csv(markers_filepath).values.ravel().tolist()
# 'Divorced' is excluded due to size. Filtering (rather than list.remove)
# avoids a ValueError when the marker is not present in markers.csv.
markers = [marker for marker in markers if marker != 'Divorced']

# Load the list of items the same way, then the prepared survey data.
items = pd.read_csv(items_filepath).values.ravel().tolist()
df = pd.read_csv(survey_filepath, low_memory=False)

# Nested dictionary for results, addressed as results[item][marker].
# Every cell starts as None until a value is stored for it.
results = {item: {marker: None for marker in markers} for item in items}
# The three survey columns that hold the items chosen in the Climate UPV game.
upv_columns = ['Climate UPV - Item 1', 'Climate UPV - Item 2', 'Climate UPV - Item 3']

# For each marker, work out what percentage of its subset chose each item.
for m1 in markers:
    # Keep only the rows of the dataset flagged (== 1) with this marker
    subset = df.loc[df[m1] == 1]
    # Pool the three UPV columns into one series so an item is counted
    # regardless of which of the three slots it was placed in
    selections = pd.concat([subset[col] for col in upv_columns], ignore_index=True)
    # Count occurrences, then align to the full item list so that items
    # nobody in this subset chose get an explicit count of 0
    totals = selections.value_counts().reindex(items, fill_value=0)
    # Normalize by the number of rows in the subset to get a percentage.
    # NOTE(review): this assumes one row per respondent; if 'Interview ID'
    # can repeat, the denominator should be the unique ID count - confirm.
    percent = (totals / len(subset)) * 100
    # Store the percentage for each item under this marker
    for i in items:
        results[i][m1] = percent.at[i]
# Build the results matrix: after the transpose, rows are items and columns
# are markers, with every cell cast to float.
r = pd.DataFrame.from_dict(results).astype(float).transpose()
# Sort items from highest to lowest by their value in the 'Full dataset' column
r.sort_values(by='Full dataset', ascending=False, inplace=True)
# Save the full sorted matrix to Excel
r.to_excel(excel_save_path)
# Remove a stray 'Items' label if one is present. The previous code tested
# the columns but dropped from the rows, which raised KeyError whenever the
# label existed only as a column; errors='ignore' makes this a no-op when
# the label is absent from either axis.
r = r.drop(index='Items', columns='Items', errors='ignore')
# Number of rows (items) from the top of the matrix to include in the plot
top_n = 15

# If there is some data to plot, make a matrix heatmap and save it
if not r.empty:
    # Keep only the first top_n rows by position (the matrix was sorted above)
    r = r.iloc[:top_n]

    fig, ax = plt.subplots(figsize=(7, 5))
    # Fixed 0-100 colour scale so the colours read directly as percentages;
    # each cell is annotated with its value rounded to a whole number
    sns.heatmap(
        r,
        ax=ax,
        cmap='Reds',
        xticklabels=True,
        yticklabels=True,
        vmin=0,
        vmax=100,
        annot=True,
        annot_kws={'size': 8},
        fmt='.0f',
        cbar_kws={'label': '% of group who chose the item', 'shrink': 0.75},
    )

    # Outline the last column with a black, unfilled rectangle. Heatmap cells
    # are one unit wide/tall, so the last column spans x in [num_cols - 1, num_cols]
    # and the full height is num_rows.
    num_rows, num_cols = r.shape
    rect = Rectangle(
        (num_cols - 1, 0),
        1,
        num_rows,
        linewidth=2,
        edgecolor='black',
        facecolor='none',
    )
    ax.add_patch(rect)

    # Save, then close this specific figure to release it. (Calling plt.clf()
    # after closing would only create a new, empty figure.)
    fig.savefig(figure_save_path, bbox_inches="tight")
    plt.close(fig)