"""
Data Analysis
"""
import os
from string import ascii_lowercase
import csv
import argparse
import time
from collections import defaultdict
from constants import POPULAR_CATEGORIES, FULL, TOP_THREE, TOP, RESULT_SUBSETS
from data_helpers import get_dataframes, load_coded_as_dicts, prep_data, set_or_concat
from qual_code import TWITTER_DOMAIN, strip_twitter_screename
from plotters import plot_comparison, plot_importance
from profiles_in_kp import queries as queries_to_kp_profiles
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import average_precision_score
from scipy.stats import ttest_ind, fisher_exact
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.contingency_tables import mcnemar
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyxdameraulevenshtein import damerau_levenshtein_distance
UGC_WHITELIST = [
'wikipedia.org',
'UserTweetCarousel',
'SearchTweetCarousel',
'facebook.com',
'twitter.com',
'youtube.com',
'instagram.com',
'linkedin.com',
'yelp.com',
'tripadvisor.com',
]
class Comparison():
"""
A comparison entity
For comparing two groups within a set of results
e.g. urban vs. rural wikipedia incidence rate
urban vs. rural map incidence rate
"""
def __init__(
self, df_a, name_a, df_b, name_b, cols_to_compare,
print_all=False, recurse_on_queries=False
):
self.df_a = df_a
self.name_a = name_a
self.df_b = df_b
self.name_b = name_b
self.cols_to_compare = cols_to_compare
self.print_all = print_all
self.recurse_on_queries = recurse_on_queries
def print_results(self):
"""
Compare columns for the two groups belonging to this Comparison entity
Prints out the results
"""
ret = []
err = []
query_comparison_lists = {key: [] for key in RESULT_SUBSETS}
pval_summary = {key: [] for key in RESULT_SUBSETS}
whitelist_summary = {key: [] for key in RESULT_SUBSETS}
fisher_summary = {key: [] for key in RESULT_SUBSETS}
for col in self.cols_to_compare:
# a = list(self.df_a[col])
# b = list(self.df_b[col])
try:
filtered_df_a = self.df_a[self.df_a[col].notnull()]
a = list(filtered_df_a[col])
except KeyError:
if self.print_all:
print('Column {} missing from df_a, {}'.format(
col, self.name_a))
continue
try:
filtered_df_b = self.df_b[self.df_b[col].notnull()]
b = list(filtered_df_b[col])
except KeyError:
if self.print_all:
                    print('Column {} missing from df_b, {}'.format(
                        col, self.name_b))
continue
if not a and not b:
err.append('Skipping {} b/c two empty lists'.format(col))
continue
assert len(a) == len(b)
mean = np.mean(np.array(a + b), axis=0)
mean_a = np.mean(a)
mean_b = np.mean(b)
n = len(a) + len(b)
df = pd.DataFrame({'a': a, 'b': b}).melt()
_, pval = ttest_ind(a, b, equal_var=False)
#_, ztest_pval = proportions_ztest()
try:
tab = pd.crosstab(df.variable, df.value)
# df = pd.DataFrame({'a': a, 'b': b})
# tab = pd.crosstab(df.a, df.b)
# if 0 not in tab.columns:
# tab[0] = [0, 0]
# if 1 not in tab.columns:
# tab[1] = [0, 0]
# bunch = mcnemar(tab)
# fisher_pval = bunch.pvalue
#print(tab)
_, fisher_pval = fisher_exact(tab)
except Exception as ex:
print('ex', ex)
fisher_pval = 1
if mean_a == mean_b:
larger, smaller = mean_a, mean_b
winner = None
elif mean_a > mean_b:
larger, smaller = mean_a, mean_b
winner = self.name_a
else:
larger, smaller = mean_b, mean_a
winner = self.name_b
if smaller > 0:
mult_increase = round(larger / smaller, 2)
else:
mult_increase = float('nan')
marker = ''
if pval <= 0.001:
marker = '**'
elif pval <= 0.05:
marker = '*'
row_dict = {
'column': marker + col,
'winner': winner,
'mult_inc': mult_increase,
'add_inc': round(larger - smaller, 3),
'mean_a': round(mean_a, 3),
'mean_b': round(mean_b, 3),
'name_a': self.name_a,
'name_b': self.name_b,
'pval': pval,
'fisher_pval': fisher_pval,
'len(a)': len(a),
'len(b)': len(b),
'n': n,
'mean': mean,
}
ret.append(row_dict)
is_in_whitelist = False
for domain in UGC_WHITELIST:
if domain in col:
is_in_whitelist = True
key = None
for result_subset in RESULT_SUBSETS:
if result_subset + '_domain' in col or result_subset + '_code' in col:
key = result_subset
break
if key:
if is_in_whitelist:
whitelist_summary[key].append(row_dict)
if fisher_pval < 0.05:
fisher_summary[key].append(row_dict)
if marker:
pval_summary[key].append(row_dict)
if self.recurse_on_queries:
# now mark all the comparisons
queries = set(
list(self.df_a['query']) + list(self.df_b['query'])
)
# queries = set(
# list(
# filtered_df_a['query'].drop_duplicates()) +
# list(filtered_df_b['query'].drop_duplicates()
# )
# )
for query in queries:
query_a = filtered_df_a[filtered_df_a['query'] == query]
query_b = filtered_df_b[filtered_df_b['query'] == query]
query_comparison = Comparison(
df_a=query_a, name_a=self.name_a,
df_b=query_b, name_b=self.name_b,
cols_to_compare=[col],
print_all=self.print_all,
recurse_on_queries=False,
)
comparison_dicts = query_comparison.print_results()[
0]
comparison_dicts = [
x for x in comparison_dicts if x['mean'] != 0]
for d in comparison_dicts:
d['query'] = query
query_comparison_lists[key] += comparison_dicts
summary = {
'pval': pval_summary,
'whitelist': whitelist_summary,
'fisher': fisher_summary,
}
return ret, summary, err, query_comparison_lists
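# Illustrative usage of Comparison (a minimal sketch with made-up data and a
# hypothetical column name; real columns come from the serp_df built in main()):
#
#     df_rural = pd.DataFrame({'query': ['flu', 'flu'], 'wiki_appears': [1, 1]})
#     df_urban = pd.DataFrame({'query': ['flu', 'flu'], 'wiki_appears': [0, 1]})
#     comparison = Comparison(
#         df_a=df_rural, name_a='rural',
#         df_b=df_urban, name_b='urban',
#         cols_to_compare=['wiki_appears'],
#     )
#     rows, summary, errors, per_query = comparison.print_results()
#     # rows[0]['winner'] == 'rural'; rows[0]['mean_a'] == 1.0; rows[0]['mean_b'] == 0.5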
def get_matching_columns(columns, whitelist):
"""
Takes a list of columns and returns the ones that match whitelist
"""
ret = []
for x in whitelist:
for column in columns:
if x in column and column not in ret:
ret.append(column)
return ret
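# Illustrative example (hypothetical column names):
#
#     get_matching_columns(
#         ['results_full_domain_frac_twitter.com',
#          'results_full_domain_frac_cnn.com'],
#         ['twitter.com'])
#     # -> ['results_full_domain_frac_twitter.com']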
def encode_links_as_strings(links1, links2):
"""
Take two lists of pages and turn them into strings
For the sole purpose of calculating edit distance
"""
set1, set2 = set(links1), set(links2)
union = set1.union(set2)
mapping = {}
# will never have more than 10 results...
for item, letter in zip(list(union), ascii_lowercase):
mapping[item] = letter
string1 = ''.join([mapping[link] for link in links1])
string2 = ''.join([mapping[link] for link in links2])
return string1, string2
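# Illustrative example (the specific letters depend on set iteration order, so
# only the structure of the output is stable: shared links map to the same letter):
#
#     encode_links_as_strings(['a.com', 'b.com'], ['b.com', 'c.com'])
#     # -> e.g. ('ab', 'bc')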
def jaccard_similarity(x, y):
"""
set implementation of jaccard similarity
"""
intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
union_cardinality = len(set.union(*[set(x), set(y)]))
return intersection_cardinality / float(union_cardinality)
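# Illustrative example:
#
#     jaccard_similarity(['a.com', 'b.com', 'c.com'], ['b.com', 'c.com', 'd.com'])
#     # -> 0.5 (an intersection of 2 links over a union of 4)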
def wrap_finder(data, link_type):
"""
Take a df and return a function to get tweets or news in that df corresponding to a serp_id
"""
def finder(sid):
return data[(data.serp_id == sid) & (data.link_type == link_type)]
return finder
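# Illustrative usage (a minimal sketch with a toy DataFrame):
#
#     toy = pd.DataFrame({
#         'serp_id': [1, 1, 2],
#         'link_type': ['tweets', 'news', 'tweets'],
#         'domain': ['twitter.com', 'nytimes.com', 'twitter.com'],
#     })
#     find_tweets = wrap_finder(toy, 'tweets')
#     find_tweets(1)  # -> the single row with serp_id == 1 and link_type == 'tweets'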
class MetricCalculator():
"""
Calculates metrics used in the study
domain_fracs
domain_appears
domain_ranks
domain_counts
"""
def __init__(self, finders, sid):
self.finders = finders
self.sid = sid
def calc_domain_fracs(self, cols, use_codes=False):
"""
This is specific to a given SERP
Figure out how many domains of interest appear in search results
Currently using control queries is deprecated.
return a dict
"""
domains_to_count = defaultdict(int)
domains_to_ranksum = defaultdict(int)
        # `cols` is actually a full dataframe slice, not just columns; alias it
        df = cols
if not df.empty:
for _, row in df.iterrows():
if not use_codes:
domain = row.domain
else:
domain = str(row.domain) + ':' + str(row['code'])
rank = row['rank']
if isinstance(domain, float) and np.isnan(domain):
domains_to_count['none'] += 1
domains_to_ranksum['none'] += rank
elif domain == 'NewsCarousel' or 'TweetCarousel' in domain:
# was re-using "domain" here, looked sketchy
if domain == 'NewsCarousel':
subdf = self.finders['news'](self.sid).iloc[:3]
else: # must be tweets
subdf = self.finders['tweets'](self.sid).iloc[:3]
                    for _, subrow in subdf.iterrows():  # subdf is already limited to the top 3
domains_to_count[subrow.domain] += 1
domains_to_ranksum[subrow.domain] += rank
domains_to_count[domain] += 1
domains_to_ranksum[domain] += rank
else:
domains_to_count[domain] += 1
domains_to_ranksum[domain] += rank
frac_ret = {}
rank_ret = {}
num_counted = sum(domains_to_count.values())
for key, val in domains_to_count.items():
frac_ret[key] = val / num_counted
for key, val in domains_to_ranksum.items():
rank_ret[key] = val / domains_to_count[key]
domains_to_map = {}
for key in domains_to_count.keys():
y_true = [x == key for x in cols.domain]
y_score = [1/(i+1) for i, x in enumerate(cols['rank'])]
# print(y_true)
# print(y_score)
domains_to_map[key] = average_precision_score(y_true, y_score)
return frac_ret, rank_ret, domains_to_count, domains_to_map
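# Illustrative output of calc_domain_fracs (a minimal sketch: a toy slice with
# three ordinary result rows, so the NewsCarousel/TweetCarousel branch is not hit):
#
#     toy_cols = pd.DataFrame({
#         'domain': ['twitter.com', 'twitter.com', 'wikipedia.org'],
#         'rank': [1, 2, 3],
#     })
#     fracs, ranks, counts, maps = MetricCalculator(
#         finders={}, sid=None).calc_domain_fracs(toy_cols)
#     # fracs  -> {'twitter.com': 2/3, 'wikipedia.org': 1/3}
#     # ranks  -> {'twitter.com': 1.5, 'wikipedia.org': 3.0}  (mean rank per domain)
#     # counts -> {'twitter.com': 2, 'wikipedia.org': 1}
#     # maps   -> per-domain average precision with respect to the result ordering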
def compute_serp_features(
links, cols,
control_links, control_cols,
sid, finders,
):
"""
Computes features for a set of results corresponding to one serp
Args:
links - a list of links (as strings)
control_links - a list of links (as strings)
domains_col - a pandas series corresponding to the "domain" column
code_col - a pandas series corresponding to the "code" column
Returns:
A dictionary of computed values
ret: {
jaccard index with control,
edit distance with control,
domain_fracs for results, top3, top1,
}
"""
metric_calculator = MetricCalculator(finders=finders, sid=sid)
string, control_string = encode_links_as_strings(links, control_links)
ret = {}
if control_links and control_cols:
ret['control_jaccard'] = jaccard_similarity(
links, control_links
)
ret['control_edit'] = damerau_levenshtein_distance(
string, control_string
)
if 'knowledge_panel' in list(cols.link_type):
cols = cols.sort_values('link_type')
fracs, ranks, counts, maps = metric_calculator.calc_domain_fracs(cols)
# print(cols.domain)
# print('maps\n', maps)
# print(ranks)
# input()
ret[FULL] = {
'domain_fracs': fracs,
'domain_ranks': ranks,
'domain_counts': counts,
'domain_maps': maps,
}
top3_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:3])
ret[TOP_THREE] = {
'domain_fracs': top3_fracs
}
top_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:1])
ret[TOP] = {
'domain_fracs': top_fracs
}
for subset in RESULT_SUBSETS:
ret[subset]['domain_appears'] = {}
for key, val in ret[subset]['domain_fracs'].items():
if val > 0:
ret[subset]['domain_appears'][key] = 1
else:
ret[subset]['domain_appears'][key] = 0
code_fracs, code_ranks, code_counts, code_maps = metric_calculator.calc_domain_fracs(cols, use_codes=True)
ret[FULL]['code_fracs'] = code_fracs
ret[FULL]['code_ranks'] = code_ranks
ret[FULL]['code_counts'] = code_counts
ret[FULL]['code_maps'] = code_maps
top3_code_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:3], use_codes=True)
ret[TOP_THREE]['code_fracs'] = top3_code_fracs
top_code_fracs, _, _, _ = metric_calculator.calc_domain_fracs(cols.iloc[:1], use_codes=True)
ret[TOP]['code_fracs'] = top_code_fracs
for subset in RESULT_SUBSETS:
ret[subset]['code_appears'] = {}
for key, val in ret[subset]['code_fracs'].items():
if val > 0:
ret[subset]['code_appears'][key] = 1
else:
ret[subset]['code_appears'][key] = 0
return ret
def analyze_subset(data, location_set, config, finders):
"""
A subset consists of results of a certain TYPE for a certain QUERY
Args:
data - a dataframe object with rows matching a TYPE and QUERY
location_set - a set of strings corresponding to locations queried
"""
# d holds the results and editdistances
d = {}
for loc in location_set:
results = data[data.reported_location == loc]
if results.empty:
continue
treatment = results[results.is_control == 0]
links = list(treatment.link)
snippets = list(treatment.snippet)
titles = list(treatment.title)
if config.get('check_ranks'):
ranks = list(treatment['rank'])
largest_rank = ranks[-1]
perfect_sequence = set(range(1, largest_rank + 1))
missing_ranks = perfect_sequence.difference(set(ranks))
if missing_ranks and missing_ranks != set([1]):
print(results[['query', 'link', 'rank']], set(ranks), perfect_sequence, missing_ranks)
input()
if config.get('use_control'):
control = results[results.is_control == 1]
control_links = list(control.link)
if not control_links:
# 'Missing expected control links for loc {}'.format(loc))
continue
if not links:
# 'Missing expected links for loc {}'.format(loc))
continue
else:
control = pd.DataFrame(
data={
'domain': [],
'code': [],
'rank': [],
'domains_plus_codes': [],
'link_type': [],
}
)
control_links = []
first_row = results.iloc[0]
sid = first_row.serp_id
d[loc] = {}
d[loc]['links'] = links
d[loc]['has_' + first_row.link_type] = 1 if links else 0
d[loc]['domains'] = list(treatment.domain)
d[loc]['control_links'] = control_links
d[loc]['computed'] = compute_serp_features(
links,
treatment[['domain', 'code', 'rank', 'domains_plus_codes', 'link_type']],
control_links,
control[['domain', 'code', 'rank', 'domains_plus_codes', 'link_type']],
sid, finders
)
d[loc]['serp_id'] = sid
        # use a distinct name so the serp_id stored above is not shadowed
        sentiment_analyzer = SentimentIntensityAnalyzer()
        snippet_polarities = [
            sentiment_analyzer.polarity_scores(x)['compound']
            for x in snippets if x]
        title_polarities = [
            sentiment_analyzer.polarity_scores(x)['compound']
            for x in titles if x]
for polarities, textname in [
(snippet_polarities, 'snippet'),
(title_polarities, 'title')
]:
for prefix, subset in [
(FULL, polarities),
(TOP_THREE, polarities[:3]),
(TOP, polarities[:1]),
]:
if subset:
mean_polarity = sum(subset) / len(subset)
d[loc]['computed'][prefix + '_' + textname +
'_mean_polarity'] = mean_polarity
for loc in location_set:
if loc not in d:
continue
d[loc]['comparisons'] = {}
tmp = d[loc]['comparisons']
for comparison_loc in location_set:
if comparison_loc not in d:
continue
if loc == comparison_loc:
continue
tmp[comparison_loc] = {}
string1, string2 = encode_links_as_strings(
d[loc]['links'], d[comparison_loc]['links'])
tmp[comparison_loc]['edit'] = \
damerau_levenshtein_distance(
string1, string2
)
try:
jac = jaccard_similarity(
d[loc]['links'],
d[comparison_loc]['links']
)
except ZeroDivisionError:
jac = float('nan')
tmp[comparison_loc]['jaccard'] = jac
return d
def prep_paths(db, category):
"""
Creates paths in the filesystem and return the path names
"""
path1 = 'output'
path2 = '{}/{}'.format(path1, db)
if category:
path2 += '__' + category
for path in [path1, path2]:
try:
os.mkdir(path)
except OSError:
pass
return path1, path2
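# Illustrative example (hypothetical db name and category):
#
#     prep_paths('search.db', 'popular')
#     # -> ('output', 'output/search.db__popular'), creating both directories if needed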
def main(args, db, category):
"""Do analysis"""
data, serp_df = get_dataframes(db)
data = prep_data(data)
if args.group_popular:
pop_mask = serp_df['category'].isin(POPULAR_CATEGORIES)
serp_df.loc[pop_mask, 'category'] = 'popular'
pop_mask = data['category'].isin(POPULAR_CATEGORIES)
data.loc[pop_mask, 'category'] = 'popular'
categories = list(data['category'].drop_duplicates()) + ['all']
if category not in categories:
return None
if 'dbs' in db:
shortened_db = db[4:]
else:
shortened_db = db
_, path2 = prep_paths(shortened_db, category)
link_codes_file = 'link_codes.csv'
twitter_user_codes_file = 'twitter_user_codes.csv'
link_codes, twitter_user_codes = load_coded_as_dicts(
link_codes_file, twitter_user_codes_file)
for link, code in link_codes.items():
data.loc[data.link == link, 'code'] = code
twitter_data = data[data.domain == TWITTER_DOMAIN]
twitter_links = twitter_data.link.drop_duplicates()
for link in twitter_links:
screen_name = strip_twitter_screename(link)
code = twitter_user_codes.get(screen_name)
if not code:
# print('Could not get code for screen_name {}'.format(screen_name))
pass
data.loc[data.link == link, 'code'] = code
data.code = data.code.astype('category')
domains_plus_codes = [
str(x) + '_' + str(y) for x, y in zip(
list(data.domain),
list(data.code)
)
]
data = data.assign(domains_plus_codes=domains_plus_codes)
data.domains_plus_codes = data.domains_plus_codes.astype('category')
data.describe(include='all').to_csv(path2 + '/data.describe().csv')
tweets_finder = wrap_finder(data, 'tweets')
news_finder = wrap_finder(data, 'news')
finders = {
'tweets': tweets_finder,
'news': news_finder,
}
serp_df.reported_location.value_counts().to_csv(
path2 + '/values_counts_reported_location.csv')
serp_df['query'].value_counts().to_csv(path2 + '/values_counts_query.csv')
scraper_search_id_set = data.scraper_search_id.drop_duplicates()
link_types = [
'results',
#'knowledge_panel',
#'news'
#['results', 'tweets'],
#['results', 'knowledge_panel']
]
serp_comps = {}
config = {}
config['use_control'] = False
config['check_ranks'] = False
link_type_to_domains = {}
# go through each link type specified above
for i, link_type in enumerate(link_types):
if isinstance(link_type, list):
mask = data.link_type == link_type[0]
for x in link_type:
mask = (mask) | (data.link_type == x)
link_type_specific_data = data[mask]
link_type = '_and_'.join(link_type)
link_types[i] = link_type # carry this beyond the for loop
else:
link_type_specific_data = data[data.link_type == link_type]
# grab data from the given category, if applicable
if category in [
'trending', 'procon_popular', 'popular', 'top_insurance', 'top_loans',
'med_sample_first_20'
]:
link_type_specific_data = link_type_specific_data[
link_type_specific_data['category'] == category]
else:
if category != 'all':
raise ValueError('INVALID CATEGORY')
path3 = '{}/{}'.format(path2, link_type)
try:
os.mkdir(path3)
except OSError:
pass
link_type_specific_data.domain.value_counts().to_csv(
path3 + '/values_counts_domain.csv')
top_domains = list(
link_type_specific_data.domain.value_counts().to_dict().keys())[:30] + UGC_WHITELIST
top_domains = list(set(top_domains))
top_domains = [
domain for domain in top_domains if isinstance(domain, str)]
link_type_to_domains[link_type] = top_domains
for scraper_search_id in scraper_search_id_set:
filtered = link_type_specific_data[link_type_specific_data.scraper_search_id == scraper_search_id]
if filtered.empty:
continue
queries = list(filtered['query'].drop_duplicates())
if len(queries) != 1:
raise ValueError('Multiple queries found in a single serp')
location_set = filtered.reported_location.drop_duplicates()
d = analyze_subset(filtered, location_set, config, finders)
for loc, vals in d.items():
sid = vals['serp_id']
# we're gonna put stuff into a nested dict
tmp = d[loc]['computed']
dist_sum, jacc_sum, count = 0, 0, 0
for _, metrics in vals['comparisons'].items():
dist_sum += metrics['edit']
jacc_sum += metrics['jaccard']
count += 1
if count:
avg_edit = dist_sum / count
avg_jacc = jacc_sum / count
else:
avg_edit = avg_jacc = float('nan')
tmp[link_type + '_avg_edit'] = avg_edit
tmp[link_type + '_avg_jaccard'] = avg_jacc
# make sure we're NOT overwriting an already existent sub-dict!
# (this comment suggests a foolish programmer did this in the past)
if sid not in serp_comps:
serp_comps[sid] = {'id': sid}
serp_comps[sid][link_type + '_avg_edit'] = avg_edit
serp_comps[sid][link_type + '_avg_jacc'] = avg_jacc
has_type_key = 'has_' + link_type
serp_comps[sid][has_type_key] = d[loc].get(has_type_key, 0)
for comp_key in RESULT_SUBSETS:
domain_fracs = tmp[comp_key]['domain_fracs']
for domain_string, frac in domain_fracs.items():
for top_domain in top_domains:
# only do it if domain_string is in the top_domains list
if domain_string == top_domain:
concat_key = '_'.join(
[link_type, comp_key, 'domain_frac',
domain_string]
)
serp_comps[sid][concat_key] = frac
domain_appears_concat_key = concat_key.replace(
'_frac', '_appears')
did_it_appear = tmp[comp_key]['domain_appears'][domain_string]
serp_comps[sid][domain_appears_concat_key] = did_it_appear
# puts ranks, counts, maps into serp_comps
if comp_key == FULL:
domain_ranks_concat_key = concat_key.replace(
'_frac', '_rank')
domain_counts_concat_key = concat_key.replace(
'_frac', '_count')
domain_maps_concat_key = concat_key.replace(
'_frac', '_maps')
serp_comps[
sid][domain_ranks_concat_key
] = tmp[comp_key]['domain_ranks'][domain_string]
serp_comps[
sid][domain_counts_concat_key
] = tmp[comp_key]['domain_counts'][domain_string]
serp_comps[
sid][domain_maps_concat_key
] = tmp[comp_key]['domain_maps'][domain_string]
if args.coded_metrics:
# we will include each code as a unique domain
# so commercial facebook is different from journalist fb, etc.
code_appears = tmp[comp_key]['code_appears']
for code, appears in code_appears.items():
concat_key = '_'.join(
[link_type, comp_key, 'code_appears', str(code)]
)
serp_comps[sid][concat_key] = appears
# compute polarity. Not used in the paper right now, but it'll be in the data!
for textcol in ['snippet', 'title']:
pol_key = '_'.join(
[link_type, comp_key, textcol, 'mean_polarity'])
serp_comps[sid][pol_key] = tmp.get(
'_'.join([comp_key, textcol, 'mean_polarity'])
)
serp_comps_df = pd.DataFrame.from_dict(serp_comps, orient='index')
#serp_comps_df.index.name = 'id'
# Future Warning here
# print(serp_df.head())
# print(serp_comps_df.head())
serp_df = serp_df.merge(serp_comps_df, on='id')
serp_df.reported_location = serp_df.reported_location.astype('category')
# ANCHOR: fix KP SOCIAL MEDIA
if args.include_kp:
kp_finder = wrap_finder(data, 'knowledge_panel')
for key, val in queries_to_kp_profiles.items():
relevant_row_mask = (serp_df['query'] == key) & (serp_df.has_knowledge_panel == True)
for domain in val:
serp_df.loc[relevant_row_mask, 'results_full_domain_appears_' + domain] = 1
indices = []
for index, row in serp_df[serp_df.has_knowledge_panel == True].iterrows():
kp_item = kp_finder(row['id']).iloc[0]
if kp_item.domain == 'wikipedia.org':
indices.append(index)
has_it_already = serp_df[serp_df['id'] == row['id']].iloc[0]['results_full_domain_appears_wikipedia.org']
if not has_it_already == 1:
serp_df.loc[serp_df['id'] == row['id'], 'results_full_domain_appears_wikipedia.org'] = 1
serp_df.describe(include='all').to_csv(path2 + '/serp_df.describe().csv')
# ANCHOR: plotting
ugc_ret_cols = []
big_ret_cols = []
cols = get_matching_columns(list(serp_df.columns.values), UGC_WHITELIST)
cols_with_nonzero_mean = [
x for x in cols if serp_df[x].mean() != 0
]
if cols_with_nonzero_mean:
serp_df[cols_with_nonzero_mean].describe().to_csv(
path2 + '/nz_ugcin_serp_df.csv')
# nz for non-zero (variable name was too long)
results_domain_fracs_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_frac' in x
]
results_domain_ranks_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_rank' in x
]
results_domain_appears_cols_nz = [
x for x in cols_with_nonzero_mean if 'results_' in x and 'domain_appears' in x
]
if args.plot_detailed:
_, domain_fracs_ax = plt.subplots(nrows=3)
_, axes2 = plt.subplots(nrows=4)
_, big_ax = plt.subplots(nrows=3)
_, dist_axes = plt.subplots(nrows=2)
_, personalization_ax = plt.subplots(nrows=2)
for index, subset in enumerate(RESULT_SUBSETS):
results_domain_fracs_cols_nz_subset = [
x for x in results_domain_fracs_cols_nz if subset + '_domain_frac' in x
]
results_domain_appears_cols_nz_subset = [
x for x in results_domain_appears_cols_nz if subset + '_domain_appears' in x
]
results_domain_rank_cols_nz_subset = [
x.replace('_domain_frac', '_domain_rank') for x in results_domain_fracs_cols_nz_subset
] if subset == FULL else []
results_domain_count_cols_nz_subset = [
x.replace('_domain_frac', '_domain_count') for x in results_domain_fracs_cols_nz_subset
] if subset == FULL else []
big_candidate_cols = [
x for x in list(serp_df.columns.values) if 'results_' + subset + '_domain_appears' in x
]
serp_df = serp_df.fillna({
x: 0 for x in big_candidate_cols
})
big_appears_cols = list(serp_df[big_candidate_cols].mean().sort_values(ascending=False).index)[:30]
big_frac_cols = [
x.replace('_domain_appears', '_domain_frac') for x in big_appears_cols
]
big_rank_cols = [
x.replace('_domain_appears', '_domain_rank') for x in big_appears_cols
] if subset == FULL else []
big_count_cols = [
x.replace('_domain_appears', '_domain_count') for x in big_appears_cols
] if subset == FULL else []
if results_domain_fracs_cols_nz_subset:
if args.plot_detailed:
serp_df[results_domain_fracs_cols_nz_subset].mean().sort_values().plot(
kind='barh', ax=domain_fracs_ax[index], title='Category: {}, Domain Fractions: {}'.format(category, subset))
serp_df[results_domain_appears_cols_nz_subset].mean().sort_values().plot(
kind='barh', ax=axes2[index], title='Domain Appears: {}'.format(subset))
ugc_ret_cols += results_domain_fracs_cols_nz_subset
ugc_ret_cols += results_domain_rank_cols_nz_subset
ugc_ret_cols += results_domain_count_cols_nz_subset
# why does this have special handling?
# It is possible that the domain_appears column is marked 1
# because the domain appeared only in the KP Profiles section
if results_domain_appears_cols_nz_subset:
ugc_ret_cols += results_domain_appears_cols_nz_subset
if big_appears_cols:
if args.plot_detailed:
serp_df[big_appears_cols].mean().sort_values().plot(
kind='barh', ax=big_ax[index], title='Big Appears: {}'.format(subset))
big_ret_cols += big_appears_cols
big_ret_cols += big_frac_cols
big_ret_cols += big_rank_cols
big_ret_cols += big_count_cols
if args.plot_detailed:
serp_df[results_domain_ranks_cols_nz].mean().sort_values().plot(
kind='barh', ax=axes2[3], title='Domain Ranks')
        wp_vals = serp_df.loc[
            serp_df['results_full_domain_rank_wikipedia.org'].notnull(),
            'results_full_domain_rank_wikipedia.org']
sns.distplot(
wp_vals, bins=list(range(1, 13)), norm_hist=True,
kde=False, color="b", ax=dist_axes[0])
dist_axes[0].axvline(wp_vals.mean(), color='b',
linestyle='dashed', linewidth=2)
try:
            tw_vals = serp_df.loc[
                serp_df['results_full_domain_rank_UserTweetCarousel'].notnull(),
                'results_full_domain_rank_UserTweetCarousel']
sns.distplot(
tw_vals, bins=list(range(1, 13)), norm_hist=True,
kde=False, color="g", ax=dist_axes[1])
dist_axes[1].axvline(tw_vals.mean(), color='g',
linestyle='dashed', linewidth=2)
        except Exception:
            # UserTweetCarousel rank data may be missing; skip this plot if so
            pass
# PERSONALIZATION
        jacc_vals = serp_df.loc[
            serp_df['results_avg_jacc'].notnull(), 'results_avg_jacc']
sns.distplot(
jacc_vals, norm_hist=True,
kde=False, color="b", ax=personalization_ax[0])
personalization_ax[0].axvline(
jacc_vals.mean(), color='b', linestyle='dashed', linewidth=2)
        edit_vals = serp_df.loc[
            serp_df['results_avg_edit'].notnull(), 'results_avg_edit']
sns.distplot(
edit_vals, norm_hist=True,
kde=False, color="g", ax=personalization_ax[1])
personalization_ax[1].axvline(
edit_vals.mean(), color='g', linestyle='dashed', linewidth=2)
outputs, errors = [], []
pval_summaries = {key: [] for key in RESULT_SUBSETS}
whitelist_summaries = {key: [] for key in RESULT_SUBSETS}
fisher_summaries = {key: [] for key in RESULT_SUBSETS}
query_comparison_listss = {key: [] for key in RESULT_SUBSETS}
comparison_df = None
ugc_ret_cols = [
x for x in ugc_ret_cols if x in list(serp_df.columns.values)
]
big_ret_cols = [
x for x in big_ret_cols if x in list(serp_df.columns.values)
]
all_cols = list(serp_df.columns.values)
for link_type in link_types:
path3 = '{}/{}'.format(path2, link_type)
cols_to_compare = []
link_type_cols = [
x for x in all_cols if link_type + '_' in x
]
        cols_must_include = [
            'full_domain_appears', 'top_three_domain_appears',
            'code_appears', 'top_three_code_appears',
        ]
        cols_to_compare = [
            x for x in link_type_cols
            if any(must in x for must in cols_must_include)
        ]
cols_to_compare = [
x for x in cols_to_compare if x[-3:] != 'nan'
]
serp_df = serp_df.fillna({
col: 0 for col in cols_to_compare
})
# SERPS that have NO TWEETS or NO NEWS (etc)
# will have nan values for any related calculations (e.g. avg_jacc of Tweets)
if link_type == 'results':
cols_to_fill = [
'has_knowledge_panel',
'has_top_ads',
'has_bottom_ads',
]
serp_df = serp_df.fillna({
col: 0 for col in cols_to_fill
})
for col in cols_to_fill:
cols_to_compare.append(col)
comparisons = []
rec = False
if args.comparison in ['urban-rural', 'all']:
comparisons.append(Comparison(
df_a=serp_df[(serp_df['urban_rural_code'] == 5) |
(serp_df['urban_rural_code'] == 6)],
name_a='rural',
df_b=serp_df[(serp_df['urban_rural_code'] == 1) |
(serp_df['urban_rural_code'] == 2)],
name_b='urban',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
if args.comparison in ['income', 'all']:
comparisons.append(Comparison(
df_a=serp_df[serp_df['median_income'] <= 45111],
name_a='low-income',
df_b=serp_df[serp_df['median_income'] > 45111],
name_b='high-income',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
if args.comparison in ['voting', 'all']:
comparisons.append(Comparison(
df_a=serp_df[serp_df['percent_dem'] <= 0.5],
name_a='GOP',
df_b=serp_df[serp_df['percent_dem'] > 0.5],
name_b='DEM',
cols_to_compare=cols_to_compare,
print_all=args.print_all,
recurse_on_queries=rec
))
for comparison in comparisons:
out, summary, error, query_comparison_lists = comparison.print_results()
for key in RESULT_SUBSETS:
pval_summaries[key] += summary['pval'][key]
whitelist_summaries[key] += summary['whitelist'][key]
fisher_summaries[key] += summary['fisher'][key]
query_comparison_listss[key] += query_comparison_lists[key]
outputs += out
errors += error
# write out the comparisons
output_df = pd.DataFrame(outputs)
output_df.to_csv(path2 + '/comparisons.csv')
# write out a summary of statistically significant comparisons
paper_table_list = []
for key in RESULT_SUBSETS:
paper_table_list += pval_summaries[key]
paper_table_list += whitelist_summaries[key]
pval_summary_df = pd.DataFrame(pval_summaries[key])
pval_summary_df.to_csv(path2 + '/' + key + '_pval_summary.csv')
fisher_summary_df = pd.DataFrame(fisher_summaries[key])
fisher_summary_df.to_csv(path2 + '/' + key + '_fisher_summary.csv')
whitelist_summary_df = pd.DataFrame(whitelist_summaries[key])
whitelist_summary_df.to_csv(
path2 + '/' + key + '_whitelist_summary.csv')
query_comparison_df = pd.DataFrame(query_comparison_listss[key])
query_comparison_df.to_csv(
path3 + '/' + key + '_query_comparisons.csv')
# merged will hold the union of the whitelist summary and the pval summary
if not whitelist_summary_df.empty and not pval_summary_df.empty: