"""
Data Analysis
"""
import os
from string import ascii_lowercase
import csv
import argparse
import time
from collections import defaultdict
from constants import POPULAR_CATEGORIES, FULL, TOP_THREE, TOP, RESULT_SUBSETS
from data_helpers import get_dataframes, load_coded_as_dicts, prep_data, set_or_concat
from qual_code import TWITTER_DOMAIN, strip_twitter_screename
from plotters import plot_comparison, plot_importance
from profiles_in_kp import queries as queries_to_kp_profiles
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import average_precision_score
from scipy.stats import ttest_ind, fisher_exact
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.contingency_tables import mcnemar
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyxdameraulevenshtein import damerau_levenshtein_distance
UGC_WHITELIST = [
'wikipedia.org',
'UserTweetCarousel',
'SearchTweetCarousel',
'facebook.com',
'twitter.com',
'youtube.com',
'instagram.com',
'linkedin.com',
'yelp.com',
'tripadvisor.com',
]
class Comparison():
"""
A comparison entity
For comparing two groups within a set of results
e.g. urban vs. rural wikipedia incidence rate
urban vs. rural map incidence rate
"""
def __init__(
self, df_a, name_a, df_b, name_b, cols_to_compare,
print_all=False, recurse_on_queries=False
):
self.df_a = df_a
self.name_a = name_a
self.df_b = df_b
self.name_b = name_b
self.cols_to_compare = cols_to_compare
self.print_all = print_all
self.recurse_on_queries = recurse_on_queries
def print_results(self):
"""
Compare columns for the two groups belonging to this Comparison entity
Prints out the results
"""
ret = []
err = []
query_comparison_lists = {key: [] for key in RESULT_SUBSETS}
pval_summary = {key: [] for key in RESULT_SUBSETS}
whitelist_summary = {key: [] for key in RESULT_SUBSETS}
fisher_summary = {key: [] for key in RESULT_SUBSETS}
for col in self.cols_to_compare:
# a = list(self.df_a[col])
# b = list(self.df_b[col])
try:
filtered_df_a = self.df_a[self.df_a[col].notnull()]
a = list(filtered_df_a[col])
except KeyError:
if self.print_all:
print('Column {} missing from df_a, {}'.format(
col, self.name_a))
continue
try:
filtered_df_b = self.df_b[self.df_b[col].notnull()]
b = list(filtered_df_b[col])
except KeyError:
if self.print_all:
                    print('Column {} missing from df_b, {}'.format(
                        col, self.name_b))
continue
if not a and not b:
err.append('Skipping {} b/c two empty lists'.format(col))
continue
assert len(a) == len(b)
mean = np.mean(np.array(a + b), axis=0)
mean_a = np.mean(a)
mean_b = np.mean(b)
n = len(a) + len(b)
df = pd.DataFrame({'a': a, 'b': b}).melt()
_, pval = ttest_ind(a, b, equal_var=False)
#_, ztest_pval = proportions_ztest()
try:
tab = pd.crosstab(df.variable, df.value)
# df = pd.DataFrame({'a': a, 'b': b})
# tab = pd.crosstab(df.a, df.b)
# if 0 not in tab.columns:
# tab[0] = [0, 0]
# if 1 not in tab.columns:
# tab[1] = [0, 0]
# bunch = mcnemar(tab)
# fisher_pval = bunch.pvalue
#print(tab)
_, fisher_pval = fisher_exact(tab)
except Exception as ex:
print('ex', ex)
fisher_pval = 1
if mean_a == mean_b:
larger, smaller = mean_a, mean_b
winner = None
elif mean_a > mean_b:
larger, smaller = mean_a, mean_b
winner = self.name_a
else:
larger, smaller = mean_b, mean_a
winner = self.name_b
if smaller > 0:
mult_increase = round(larger / smaller, 2)
else:
mult_increase = float('nan')
marker = ''
if pval <= 0.001:
marker = '**'
elif pval <= 0.05:
marker = '*'
row_dict = {
'column': marker + col,
'winner': winner,
'mult_inc': mult_increase,
'add_inc': round(larger - smaller, 3),
'mean_a': round(mean_a, 3),
'mean_b': round(mean_b, 3),
'name_a': self.name_a,
'name_b': self.name_b,
'pval': pval,
'fisher_pval': fisher_pval,
'len(a)': len(a),
'len(b)': len(b),
'n': n,
'mean': mean,
}
ret.append(row_dict)
is_in_whitelist = False
for domain in UGC_WHITELIST:
if domain in col:
is_in_whitelist = True
key = None
for result_subset in RESULT_SUBSETS:
if result_subset + '_domain' in col or result_subset + '_code' in col:
key = result_subset
break
if key:
if is_in_whitelist:
whitelist_summary[key].append(row_dict)
if fisher_pval < 0.05:
fisher_summary[key].append(row_dict)
if marker:
pval_summary[key].append(row_dict)
if self.recurse_on_queries:
# now mark all the comparisons
queries = set(
list(self.df_a['query']) + list(self.df_b['query'])
)
# queries = set(
# list(
# filtered_df_a['query'].drop_duplicates()) +
# list(filtered_df_b['query'].drop_duplicates()
# )
# )
for query in queries:
query_a = filtered_df_a[filtered_df_a['query'] == query]
query_b = filtered_df_b[filtered_df_b['query'] == query]
query_comparison = Comparison(
df_a=query_a, name_a=self.name_a,
df_b=query_b, name_b=self.name_b,
cols_to_compare=[col],
print_all=self.print_all,
recurse_on_queries=False,
)
comparison_dicts = query_comparison.print_results()[
0]
comparison_dicts = [
x for x in comparison_dicts if x['mean'] != 0]
for d in comparison_dicts:
d['query'] = query
query_comparison_lists[key] += comparison_dicts
summary = {
'pval': pval_summary,
'whitelist': whitelist_summary,
'fisher': fisher_summary,
}
return ret, summary, err, query_comparison_lists
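# Illustrative usage of Comparison (a minimal sketch with made-up data and a
# hypothetical column name; real columns come from the serp_df built in main()):
#
#     df_rural = pd.DataFrame({'query': ['flu', 'flu'], 'wiki_appears': [1, 1]})
#     df_urban = pd.DataFrame({'query': ['flu', 'flu'], 'wiki_appears': [0, 1]})
#     comparison = Comparison(
#         df_a=df_rural, name_a='rural',
#         df_b=df_urban, name_b='urban',
#         cols_to_compare=['wiki_appears'],
#     )
#     rows, summary, errors, per_query = comparison.print_results()
#     # rows[0]['winner'] == 'rural'; rows[0]['mean_a'] == 1.0; rows[0]['mean_b'] == 0.5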
def get_matching_columns(columns, whitelist):
"""
Takes a list of columns and returns the ones that match whitelist
"""
ret = []
for x in whitelist:
for column in columns:
if x in column and column not in ret:
ret.append(column)
return ret
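# Illustrative example (hypothetical column names):
#
#     get_matching_columns(
#         ['results_full_domain_frac_twitter.com',
#          'results_full_domain_frac_cnn.com'],
#         ['twitter.com'])
#     # -> ['results_full_domain_frac_twitter.com']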
def encode_links_as_strings(links1, links2):
"""
Take two lists of pages and turn them into strings
For the sole purpose of calculating edit distance
"""
set1, set2 = set(links1), set(links2)
union = set1.union(set2)
mapping = {}
# will never have more than 10 results...
for item, letter in zip(list(union), ascii_lowercase):
mapping[item] = letter
string1 = ''.join([mapping[link] for link in links1])
string2 = ''.join([mapping[link] for link in links2])
return string1, string2
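# Illustrative example (the specific letters depend on set iteration order, so
# only the structure of the output is stable: shared links map to the same letter):
#
#     encode_links_as_strings(['a.com', 'b.com'], ['b.com', 'c.com'])
#     # -> e.g. ('ab', 'bc')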
def jaccard_similarity(x, y):
"""
set implementation of jaccard similarity
"""
intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
union_cardinality = len(set.union(*[set(x), set(y)]))
return intersection_cardinality / float(union_cardinality)
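# Illustrative example:
#
#     jaccard_similarity(['a.com', 'b.com', 'c.com'], ['b.com', 'c.com', 'd.com'])
#     # -> 0.5 (an intersection of 2 links over a union of 4)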
def wrap_finder(data, link_type):
"""
Take a df and return a function to get tweets or news in that df corresponding to a serp_id
"""
def finder(sid):
return data[(data.serp_id == sid) & (data.link_type == link_type)]
return finder
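# Illustrative usage (a minimal sketch with a toy DataFrame):
#
#     toy = pd.DataFrame({
#         'serp_id': [1, 1, 2],
#         'link_type': ['tweets', 'news', 'tweets'],
#         'domain': ['twitter.com', 'nytimes.com', 'twitter.com'],
#     })
#     find_tweets = wrap_finder(toy, 'tweets')
#     find_tweets(1)  # -> the single row with serp_id == 1 and link_type == 'tweets'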
class MetricCalculator():
"""
Calculates metrics used in the study
domain_fracs
domain_appears
domain_ranks
domain_counts
"""
def __init__(self, finders, sid):
self.finders = finders
self.sid = sid
def calc_domain_fracs(self, cols, use_codes=False):
"""
This is specific to a given SERP
Figure out how many domains of interest appear in search results
Currently using control queries is deprecated.
return a dict
"""
domains_to_count = defaultdict(int)
domains_to_ranksum = defaultdict(int)
        # `cols` is actually a full dataframe slice, not just columns; alias it
        df = cols
if not df.empty:
for _, row in df.iterrows():
if not use_codes:
domain = row.domain
else:
domain = str(row.domain) + ':' + str(row['code'])
rank = row['rank']
if isinstance(domain, float) and np.isnan(domain):
domains_to_count['none'] += 1
domains_to_ranksum['none'] += rank
elif domain == 'NewsCarousel' or 'TweetCarousel' in domain:
# was re-using "domain" here, looked sketchy
if domain == 'NewsCarousel':
subdf = self.finders['news'](self.sid).iloc[:3]
else: # must be tweets
subdf = self.finders['tweets'](self.sid).iloc[:3]
                    for _, subrow in subdf.iterrows():  # subdf is already limited to the top 3
domains_to_count[subrow.domain] += 1
domains_to_ranksum[subrow.domain] += rank
domains_to_count[domain] += 1
domains_to_ranksum[domain] += rank
else:
domains_to_count[domain] += 1
domains_to_ranksum[domain] += rank
frac_ret = {}
rank_ret = {}
num_counted = sum(domains_to_count.values())
for key, val in domains_to_count.items():
frac_ret[key] = val / num_counted
for key, val in domains_to_ranksum.items():
rank_ret[key] = val / domains_to_count[key]
domains_to_map = {}
for key in domains_to_count.keys():
y_true = [x == key for x in cols.domain]
y_score = [1/(i+1) for i, x in enumerate(cols['rank'])]
# print(y_true)
# print(y_score)
domains_to_map[key] = average_precision_score(y_true, y_score)
return frac_ret, rank_ret, domains_to_count, domains_to_map
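# Illustrative output of calc_domain_fracs (a minimal sketch: a toy slice with
# three ordinary result rows, so the NewsCarousel/TweetCarousel branch is not hit):
#
#     toy_cols = pd.DataFrame({
#         'domain': ['twitter.com', 'twitter.com', 'wikipedia.org'],
#         'rank': [1, 2, 3],
#     })
#     fracs, ranks, counts, maps = MetricCalculator(
#         finders={}, sid=None).calc_domain_fracs(toy_cols)
#     # fracs  -> {'twitter.com': 2/3, 'wikipedia.org': 1/3}
#     # ranks  -> {'twitter.com': 1.5, 'wikipedia.org': 3.0}  (mean rank per domain)
#     # counts -> {'twitter.com': 2, 'wikipedia.org': 1}
#     # maps   -> per-domain average precision with respect to the result ordering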
def compute_serp_features(
links, cols,
control_links, control_cols,
sid, finders,
):
"""
Computes features for a set of results corresponding to one serp
Args:
links - a list of links (as strings)
control_links - a list of links (as strings)
domains_col - a pandas series corresponding to the "domain" column
code_col - a pandas series corresponding to the "code" column
Returns:
A dictionary of computed values
ret: {
jaccard index with control,
edit distance with control,
domain_fracs for results, top3, top1,
}
"""
metric_calculator = MetricCalculator(finders=finders, sid=sid)
string, control_string = encode_links_as_strings(links, control_links)
ret = {}
if control_links and control_cols:
ret['control_jaccard'] = jaccard_similarity(
links, control_links
)
ret['control_edit'] = damerau_levenshtein_distance(
string, control_string
)
if 'knowledge_panel' in list(cols.link_type):
cols = cols.sort_values('link_type')
fracs, ranks, counts, maps = metric_calculator.calc_domain_fracs(cols)
# print(cols.domain)
# print('maps\n', maps)
# print(ranks)
# input()
ret[FULL] = {
'domain_fracs': fracs,
'domain_ranks': ranks,
'domain_counts': counts,
'domain_maps': maps,
}
top3_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:3])
ret[TOP_THREE] = {
'domain_fracs': top3_fracs
}
top_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:1])
ret[TOP] = {
'domain_fracs': top_fracs
}
for subset in RESULT_SUBSETS:
ret[subset]['domain_appears'] = {}
for key, val in ret[subset]['domain_fracs'].items():
if val > 0:
ret[subset]['domain_appears'][key] = 1
else:
ret[subset]['domain_appears'][key] = 0
code_fracs, code_ranks, code_counts, code_maps = metric_calculator.calc_domain_fracs(cols, use_codes=True)
ret[FULL]['code_fracs'] = code_fracs
ret[FULL]['code_ranks'] = code_ranks
ret[FULL]['code_counts'] = code_counts
ret[FULL]['code_maps'] = code_maps
top3_code_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:3], use_codes=True)
ret[TOP_THREE]['code_fracs'] = top3_code_fracs
top_code_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:1], use_codes=True)
ret[TOP]['code_fracs'] = top_code_fracs
for subset in RESULT_SUBSETS:
ret[subset]['code_appears'] = {}
for key, val in ret[subset]['code_fracs'].items():
if val > 0:
ret[subset]['code_appears'][key] = 1
else:
ret[subset]['code_appears'][key] = 0
return ret
def analyze_subset(data, location_set, config, finders):
"""
A subset consists of results of a certain TYPE for a certain QUERY
Args:
data - a dataframe object with rows matching a TYPE and QUERY
location_set - a set of strings corresponding to locations queried
"""
# d holds the results and editdistances
d = {}
for loc in location_set:
results = data[data.reported_location == loc]
if results.empty:
continue
treatment = results[results.is_control == 0]
links = list(treatment.link)
snippets = list(treatment.snippet)
titles = list(treatment.title)
if config.get('check_ranks'):
ranks = list(treatment['rank'])
largest_rank = ranks[-1]
perfect_sequence = set(range(1, largest_rank + 1))
missing_ranks = perfect_sequence.difference(set(ranks))
if missing_ranks and missing_ranks != set([1]):
print(results[['query', 'link', 'rank']], set(ranks), perfect_sequence, missing_ranks)
input()
if config.get('use_control'):
control = results[results.is_control == 1]
control_links = list(control.link)
if not control_links:
# 'Missing expected control links for loc {}'.format(loc))
continue
if not links:
# 'Missing expected links for loc {}'.format(loc))
continue
else:
control = pd.DataFrame(
data={
'domain': [],
'code': [],
'rank': [],
'domains_plus_codes': [],
'link_type': [],
}
)
control_links = []
first_row = results.iloc[0]
sid = first_row.serp_id
d[loc] = {}
d[loc]['links'] = links
d[loc]['has_' + first_row.link_type] = 1 if links else 0
d[loc]['domains'] = list(treatment.domain)
d[loc]['control_links'] = control_links
d[loc]['computed'] = compute_serp_features(
links,
treatment[['domain', 'code', 'rank', 'domains_plus_codes', 'link_type']],
control_links,
control[['domain', 'code', 'rank', 'domains_plus_codes', 'link_type']],
sid, finders
)
d[loc]['serp_id'] = sid
        # use a distinct name so the serp_id stored above is not shadowed
        sentiment_analyzer = SentimentIntensityAnalyzer()
        snippet_polarities = [
            sentiment_analyzer.polarity_scores(x)['compound']
            for x in snippets if x]
        title_polarities = [
            sentiment_analyzer.polarity_scores(x)['compound']
            for x in titles if x]
for polarities, textname in [
(snippet_polarities, 'snippet'),
(title_polarities, 'title')
]:
for prefix, subset in [
(FULL, polarities),
(TOP_THREE, polarities[:3]),
(TOP, polarities[:1]),
]:
if subset:
mean_polarity = sum(subset) / len(subset)
d[loc]['computed'][prefix + '_' + textname +
'_mean_polarity'] = mean_polarity
for loc in location_set:
if loc not in d:
continue
d[loc]['comparisons'] = {}
tmp = d[loc]['comparisons']
for comparison_loc in location_set:
if comparison_loc not in d:
continue
if loc == comparison_loc:
continue
tmp[comparison_loc] = {}
string1, string2 = encode_links_as_strings(
d[loc]['links'], d[comparison_loc]['links'])
tmp[comparison_loc]['edit'] = \
damerau_levenshtein_distance(
string1, string2
)
try:
jac = jaccard_similarity(
d[loc]['links'],
d[comparison_loc]['links']
)
except ZeroDivisionError:
jac = float('nan')
tmp[comparison_loc]['jaccard'] = jac
return d
def prep_paths(db, category):
"""
Creates paths in the filesystem and return the path names
"""
path1 = 'output'
path2 = '{}/{}'.format(path1, db)
if category:
path2 += '__' + category
for path in [path1, path2]:
try:
os.mkdir(path)
except OSError:
pass
return path1, path2
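# Illustrative example (hypothetical db name and category):
#
#     prep_paths('search.db', 'popular')
#     # -> ('output', 'output/search.db__popular'), creating both directories if needed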
def main(args, db, category):
"""Do analysis"""
data, serp_df = get_dataframes(db)
data = prep_data(data)
if args.group_popular:
pop_mask = serp_df['category'].isin(POPULAR_CATEGORIES)
serp_df.loc[pop_mask, 'category'] = 'popular'
pop_mask = data['category'].isin(POPULAR_CATEGORIES)
data.loc[pop_mask, 'category'] = 'popular'
categories = list(data['category'].drop_duplicates()) + ['all']
if category not in categories:
return None
if 'dbs' in db:
shortened_db = db[4:]
else:
shortened_db = db
_, path2 = prep_paths(shortened_db, category)
link_codes_file = 'link_codes.csv'
twitter_user_codes_file = 'twitter_user_codes.csv'
link_codes, twitter_user_codes = load_coded_as_dicts(
link_codes_file, twitter_user_codes_file)
for link, code in link_codes.items():
data.loc[data.link == link, 'code'] = code
twitter_data = data[data.domain == TWITTER_DOMAIN]
twitter_links = twitter_data.link.drop_duplicates()
for link in twitter_links:
screen_name = strip_twitter_screename(link)
code = twitter_user_codes.get(screen_name)
if not code:
# print('Could not get code for screen_name {}'.format(screen_name))
pass
data.loc[data.link == link, 'code'] = code
data.code = data.code.astype('category')
domains_plus_codes = [
str(x) + '_' + str(y) for x, y in zip(
list(data.domain),
list(data.code)
)
]
data = data.assign(domains_plus_codes=domains_plus_codes)
data.domains_plus_codes = data.domains_plus_codes.astype('category')
data.describe(include='all').to_csv(path2 + '/data.describe().csv')
tweets_finder = wrap_finder(data, 'tweets')
news_finder = wrap_finder(data, 'news')
finders = {
'tweets': tweets_finder,
'news': news_finder,
}
serp_df.reported_location.value_counts().to_csv(
path2 + '/values_counts_reported_location.csv')
serp_df['query'].value_counts().to_csv(path2 + '/values_counts_query.csv')
scraper_search_id_set = data.scraper_search_id.drop_duplicates()
link_types = [
'results',
#'knowledge_panel',
#'news'
#['results', 'tweets'],
#['results', 'knowledge_panel']
]
serp_comps = {}
config = {}
config['use_control'] = False
config['check_ranks'] = False
link_type_to_domains = {}
# go through each link type specified above
for i, link_type in enumerate(link_types):
if isinstance(link_type, list):
mask = data.link_type == link_type[0]
for x in link_type:
mask = (mask) | (data.link_type == x)
link_type_specific_data = data[mask]
link_type = '_and_'.join(link_type)
link_types[i] = link_type # carry this beyond the for loop
else:
link_type_specific_data = data[data.link_type == link_type]
# grab data from the given category, if applicable
if category in [
'trending', 'procon_popular', 'popular', 'top_insurance', 'top_loans',
'med_sample_first_20'
]:
link_type_specific_data = link_type_specific_data[
link_type_specific_data['category'] == category]
else:
if category != 'all':
raise ValueError('INVALID CATEGORY')
path3 = '{}/{}'.format(path2, link_type)
try:
os.mkdir(path3)
except OSError:
pass
link_type_specific_data.domain.value_counts().to_csv(
path3 + '/values_counts_domain.csv')
top_domains = list(
link_type_specific_data.domain.value_counts().to_dict().keys())[:30] + UGC_WHITELIST
top_domains = list(set(top_domains))
top_domains = [
domain for domain in top_domains if isinstance(domain, str)]
link_type_to_domains[link_type] = top_domains
for scraper_search_id in scraper_search_id_set:
filtered = link_type_specific_data[link_type_specific_data.scraper_search_id == scraper_search_id]
if filtered.empty:
continue
queries = list(filtered['query'].drop_duplicates())
if len(queries) != 1:
raise ValueError('Multiple queries found in a single serp')
location_set = filtered.reported_location.drop_duplicates()
d = analyze_subset(filtered, location_set, config, finders)
for loc, vals in d.items():
sid = vals['serp_id']
# we're gonna put stuff into a nested dict
tmp = d[loc]['computed']
dist_sum, jacc_sum, count = 0, 0, 0
for _, metrics in vals['comparisons'].items():
dist_sum += metrics['edit']
jacc_sum += metrics['jaccard']
count += 1
if count:
avg_edit = dist_sum / count
avg_jacc = jacc_sum / count
else:
avg_edit = avg_jacc = float('nan')
tmp[link_type + '_avg_edit'] = avg_edit
tmp[link_type + '_avg_jaccard'] = avg_jacc
# make sure we're NOT overwriting an already existent sub-dict!
# (this comment suggests a foolish programmer did this in the past)
if sid not in serp_comps:
serp_comps[sid] = {'id': sid}
serp_comps[sid][link_type + '_avg_edit'] = avg_edit
serp_comps[sid][link_type + '_avg_jacc'] = avg_jacc
has_type_key = 'has_' + link_type
serp_comps[sid][has_type_key] = d[loc].get(has_type_key, 0)
for comp_key in RESULT_SUBSETS:
domain_fracs = tmp[comp_key]['domain_fracs']
for domain_string, frac in domain_fracs.items():
for top_domain in top_domains:
# only do it if domain_string is in the top_domains list
if domain_string == top_domain:
concat_key = '_'.join(
[link_type, comp_key, 'domain_frac',
domain_string]
)
serp_comps[sid][concat_key] = frac
domain_appears_concat_key = concat_key.replace(
'_frac', '_appears')
did_it_appear = tmp[comp_key]['domain_appears'][domain_string]
serp_comps[sid][domain_appears_concat_key] = did_it_appear
# puts ranks, counts, maps into serp_comps
if comp_key == FULL:
domain_ranks_concat_key = concat_key.replace(
'_frac', '_rank')
domain_counts_concat_key = concat_key.replace(
'_frac', '_count')
domain_maps_concat_key = concat_key.replace(
'_frac', '_maps')
serp_comps[
sid][domain_ranks_concat_key
] = tmp[comp_key]['domain_ranks'][domain_string]
serp_comps[
sid][domain_counts_concat_key
] = tmp[comp_key]['domain_counts'][domain_string]
serp_comps[
sid][domain_maps_concat_key
] = tmp[comp_key]['domain_maps'][domain_string]
if args.coded_metrics:
# we will include each code as a unique domain
# so commercial facebook is different from journalist fb, etc.
code_appears = tmp[comp_key]['code_appears']
for code, appears in code_appears.items():
concat_key = '_'.join(
[link_type, comp_key, 'code_appears', str(code)]
)
serp_comps[sid][concat_key] = appears
# compute polarity. Not used in the paper right now, but it'll be in the data!
for textcol in ['snippet', 'title']:
pol_key = '_'.join(
[link_type, comp_key, textcol, 'mean_polarity'])
serp_comps[sid][pol_key] = tmp.get(
'_'.join([comp_key, textcol, 'mean_polarity'])
)
serp_comps_df = pd.DataFrame.from_dict(serp_comps, orient='index')
#serp_comps_df.index.name = 'id'
# Future Warning here
# print(serp_df.head())
# print(serp_comps_df.head())
serp_df = serp_df.merge(serp_comps_df, on='id')
serp_df.reported_location = serp_df.reported_location.astype('category')
# ANCHOR: fix KP SOCIAL MEDIA
if args.include_kp:
kp_finder = wrap_finder(data, 'knowledge_panel')
for key, val in queries_to_kp_profiles.items():
relevant_row_mask = (serp_df['query'] == key) & (serp_df.has_knowledge_panel == True)
for domain in val:
serp_df.loc[relevant_row_mask, 'results_full_domain_appears_' + domain] = 1
indices = []
for index, row in serp_df[serp_df.has_knowledge_panel == True].iterrows():
kp_item = kp_finder(row['id']).iloc[0]
if kp_item.domain == 'wikipedia.org':
indices.append(index)
has_it_already = serp_df[serp_df['id'] == row['id']].iloc[0]['results_full_domain_appears_wikipedia.org']
if not has_it_already == 1:
serp_df.loc[serp_df['id'] == row['id'], 'results_full_domain_appears_wikipedia.org'] = 1
serp_df.describe(include='all').to_csv(path2 + '/serp_df.describe().csv')
# ANCHOR: plotting
ugc_ret_cols = []
big_ret_cols = []
cols = get_matching_columns(list(serp_df.columns.values), UGC_WHITELIST)
cols_with_nonzero_mean = [
x for x in cols if serp_df[x].mean() != 0
]
if cols_with_nonzero_mean:
serp_df[cols_with_nonzero_mean].describe().to_csv(
path2 + '/nz_ugcin_serp_df.csv')
# nz for non-zero (variable name was too long)
results_domain_fracs_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_frac' in x
]
results_domain_ranks_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_rank' in x
]
results_domain_appears_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_appears' in x
]
if args.plot_detailed:
_, domain_fracs_ax = plt.subplots(nrows=3)
_, axes2 = plt.subplots(nrows=4)
_, big_ax = plt.subplots(nrows=3)
_, dist_axes = plt.subplots(nrows=2)
_, personalization_ax = plt.subplots(nrows=2)
for index, subset in enumerate(RESULT_SUBSETS):
results_domain_fracs_cols_nz_subset = [
x for x in results_domain_fracs_cols_nz if subset + '_domain_frac' in x
]
results_domain_appears_cols_nz_subset = [
x for x in results_domain_appears_cols_nz if subset + '_domain_appears' in x
]
results_domain_rank_cols_nz_subset = [
x.replace('_domain_frac', '_domain_rank') for x in results_domain_fracs_cols_nz_subset
] if subset == FULL else []
results_domain_count_cols_nz_subset = [
x.replace('_domain_frac', '_domain_count') for x in results_domain_fracs_cols_nz_subset
] if subset == FULL else []
big_candidate_cols = [
x for x in list(serp_df.columns.values) if 'results_' + subset + '_domain_appears' in x
]
serp_df = serp_df.fillna({
x: 0 for x in big_candidate_cols
})
big_appears_cols = list(serp_df[big_candidate_cols].mean().sort_values(ascending=False).index)[:30]
big_frac_cols = [
x.replace('_domain_appears', '_domain_frac') for x in big_appears_cols
]
big_rank_cols = [
x.replace('_domain_appears', '_domain_rank') for x in big_appears_cols
] if subset == FULL else []
big_count_cols = [
x.replace('_domain_appears', '_domain_count') for x in big_appears_cols
] if subset == FULL else []
if results_domain_fracs_cols_nz_subset:
if args.plot_detailed:
serp_df[results_domain_fracs_cols_nz_subset].mean().sort_values().plot(
kind='barh', ax=domain_fracs_ax[index], title='Category: {}, Domain Fractions: {}'.format(category, subset))
serp_df[results_domain_appears_cols_nz_subset].mean().sort_values().plot(
kind='barh', ax=axes2[index], title='Domain Appears: {}'.format(subset))
ugc_ret_cols += results_domain_fracs_cols_nz_subset
ugc_ret_cols += results_domain_rank_cols_nz_subset
ugc_ret_cols += results_domain_count_cols_nz_subset
# why does this have special handling?
# It is possible that the domain_appears column is marked 1
# because the domain appeared only in the KP Profiles section
if results_domain_appears_cols_nz_subset:
ugc_ret_cols += results_domain_appears_cols_nz_subset
if big_appears_cols:
if args.plot_detailed:
serp_df[big_appears_cols].mean().sort_values().plot(
kind='barh', ax=big_ax[index], title='Big Appears: {}'.format(subset))
big_ret_cols += big_appears_cols
big_ret_cols += big_frac_cols
big_ret_cols += big_rank_cols
big_ret_cols += big_count_cols
if args.plot_detailed:
serp_df[results_domain_ranks_cols_nz].mean().sort_values().plot(
kind='barh', ax=axes2[3], title='Domain Ranks')
        wp_vals = serp_df.loc[
            serp_df['results_full_domain_rank_wikipedia.org'].notnull(),
            'results_full_domain_rank_wikipedia.org']
sns.distplot(
wp_vals, bins=list(range(1, 13)), norm_hist=True,
kde=False, color="b", ax=dist_axes[0])
dist_axes[0].axvline(wp_vals.mean(), color='b',
linestyle='dashed', linewidth=2)
try:
            tw_vals = serp_df.loc[
                serp_df['results_full_domain_rank_UserTweetCarousel'].notnull(),
                'results_full_domain_rank_UserTweetCarousel']
sns.distplot(
tw_vals, bins=list(range(1, 13)), norm_hist=True,
kde=False, color="g", ax=dist_axes[1])
dist_axes[1].axvline(tw_vals.mean(), color='g',
linestyle='dashed', linewidth=2)
        except Exception:
            # UserTweetCarousel rank data may be missing; skip this plot if so
            pass
# PERSONALIZATION
        jacc_vals = serp_df.loc[
            serp_df['results_avg_jacc'].notnull(), 'results_avg_jacc']
sns.distplot(
jacc_vals, norm_hist=True,
kde=False, color="b", ax=personalization_ax[0])
personalization_ax[0].axvline(
jacc_vals.mean(), color='b', linestyle='dashed', linewidth=2)
        edit_vals = serp_df.loc[
            serp_df['results_avg_edit'].notnull(), 'results_avg_edit']
sns.distplot(
edit_vals, norm_hist=True,
kde=False, color="g", ax=personalization_ax[1])
personalization_ax[1].axvline(
edit_vals.mean(), color='g', linestyle='dashed', linewidth=2)
outputs, errors = [], []
pval_summaries = {key: [] for key in RESULT_SUBSETS}
whitelist_summaries = {key: [] for key in RESULT_SUBSETS}
fisher_summaries = {key: [] for key in RESULT_SUBSETS}
query_comparison_listss = {key: [] for key in RESULT_SUBSETS}
comparison_df = None
ugc_ret_cols = [
x for x in ugc_ret_cols if x in list(serp_df.columns.values)
]
big_ret_cols = [
x for x in big_ret_cols if x in list(serp_df.columns.values)
]
all_cols = list(serp_df.columns.values)
for link_type in link_types:
path3 = '{}/{}'.format(path2, link_type)
cols_to_compare = []
link_type_cols = [
x for x in all_cols if link_type + '_' in x
]
        cols_must_include = [
            'full_domain_appears', 'top_three_domain_appears',
            'code_appears', 'top_three_code_appears',
        ]
        cols_to_compare = [
            x for x in link_type_cols
            if any(must in x for must in cols_must_include)
        ]
cols_to_compare = [
x for x in cols_to_compare if x[-3:] != 'nan'
]
serp_df = serp_df.fillna({
col: 0 for col in cols_to_compare
})
# SERPS that have NO TWEETS or NO NEWS (etc)
# will have nan values for any related calculations (e.g. avg_jacc of Tweets)
if link_type == 'results':
cols_to_fill = [
'has_knowledge_panel',
'has_top_ads',
'has_bottom_ads',
]
serp_df = serp_df.fillna({
col: 0 for col in cols_to_fill
})
for col in cols_to_fill:
cols_to_compare.append(col)
comparisons = []
rec = False
if args.comparison in ['urban-rural', 'all']:
comparisons.append(Comparison(
df_a=serp_df[(serp_df['urban_rural_code'] == 5) |
(serp_df['urban_rural_code'] == 6)],
name_a='rural',
df_b=serp_df[(serp_df['urban_rural_code'] == 1) |
(serp_df['urban_rural_code'] == 2)],
name_b='urban',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
if args.comparison in ['income', 'all']:
comparisons.append(Comparison(
df_a=serp_df[serp_df['median_income'] <= 45111],
name_a='low-income',
df_b=serp_df[serp_df['median_income'] > 45111],
name_b='high-income',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
if args.comparison in ['voting', 'all']:
comparisons.append(Comparison(
df_a=serp_df[serp_df['percent_dem'] <= 0.5],
name_a='GOP',
df_b=serp_df[serp_df['percent_dem'] > 0.5],
name_b='DEM',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
for comparison in comparisons:
out, summary, error, query_comparison_lists = comparison.print_results()
for key in RESULT_SUBSETS:
pval_summaries[key] += summary['pval'][key]
whitelist_summaries[key] += summary['whitelist'][key]
fisher_summaries[key] += summary['fisher'][key]
query_comparison_listss[key] += query_comparison_lists[key]
outputs += out
errors += error
# write out the comparisons
output_df = pd.DataFrame(outputs)
output_df.to_csv(path2 + '/comparisons.csv')
# write out a summary of statistically significant comparisons
paper_table_list = []
for key in RESULT_SUBSETS:
paper_table_list += pval_summaries[key]
paper_table_list += whitelist_summaries[key]
pval_summary_df = pd.DataFrame(pval_summaries[key])
pval_summary_df.to_csv(path2 + '/' + key + '_pval_summary.csv')
fisher_summary_df = pd.DataFrame(fisher_summaries[key])
fisher_summary_df.to_csv(path2 + '/' + key + '_fisher_summary.csv')
whitelist_summary_df = pd.DataFrame(whitelist_summaries[key])
whitelist_summary_df.to_csv(
path2 + '/' + key + '_whitelist_summary.csv')
query_comparison_df = pd.DataFrame(query_comparison_listss[key])
query_comparison_df.to_csv(
path3 + '/' + key + '_query_comparisons.csv')
# merged will hold the union of the whitelist summary and the pval summary
if not whitelist_summary_df.empty and not pval_summary_df.empty: