-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtasks.py
44 lines (38 loc) · 1.47 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
# Some useful functions for repetitive data-processing tasks before and after using the scripts
def get_gender_dictionaty(df):
    """Build a first-name -> gender lookup from a dataframe.

    The dataframe must provide a 'name' column and a 'gender' column.
    Rows whose gender equals the string 'unknown' are left out; every
    other row contributes one ``name: gender`` entry.  When a name occurs
    more than once, the entry from the last such row is the one kept.
    """
    # Boolean mask selecting the rows with a usable gender label.
    is_known = df['gender'] != 'unknown'
    name_gender = df.loc[is_known, ['name', 'gender']]
    return name_gender.set_index('name')['gender'].to_dict()
def save_lst_as_df(lst, col_name, path, header=True, index=False, encoding="utf-8"):
    """Wrap a list in a single-column dataframe, save it as CSV and return it.

    Args:
        lst: Values that become the rows of the column.
        col_name: Name of the single column.
        path: Destination handed to ``DataFrame.to_csv``; may be a string
            or a path-like object such as ``pathlib.Path``.
        header: Forwarded to ``to_csv``; whether to write the column name.
        index: Forwarded to ``to_csv``; whether to write the row index.
        encoding: Forwarded to ``to_csv``; text encoding of the file.

    Returns:
        The dataframe that was written.
    """
    df = pd.DataFrame({col_name: lst})
    df.to_csv(path, header=header, index=index, encoding=encoding)
    # Format rather than concatenate: "str + Path" raises TypeError, which
    # used to abort the call after the file had already been written.
    print(f"Dataframe saved at: {path}")
    return df
def print_stats(df, col):
    """Print summary statistics for the values of one column.

    Printed, in order:
      1. the number of rows in ``df``;
      2. the number of rows counted by ``value_counts()`` for ``col``
         (missing values are excluded by its default settings);
      3. the ratio of (2) to (1), or ``nan`` when ``df`` has no rows;
      4. a dict of absolute frequencies per distinct value;
      5. a dict of relative frequencies per distinct value (count / (2)).

    Args:
        df: Dataframe to summarise.
        col: Name of the column whose values are counted.
    """
    n_rows = len(df)
    counts = df[col].value_counts().to_dict()
    total = sum(counts.values())
    # An empty dataframe has no meaningful ratio; report NaN instead of
    # raising ZeroDivisionError.
    ratio = total / n_rows if n_rows else float("nan")

    print("Total in dataset: ", n_rows)
    print("Total gender determined: ", total)
    print("Ratio known/total: ", ratio)
    print(counts)
    # Build a new dict rather than rewriting `counts` while iterating it.
    # `total` is zero only when `counts` is empty, so no division by zero.
    print({value: count / total for value, count in counts.items()})
def concat_dataframes(df_list=None):
    """Vertically concatenate the given dataframes.

    Args:
        df_list: Dataframes to stack, passed on to ``pd.concat``.
            ``None`` (the default) is treated like an empty list.

    Returns:
        The result of ``pd.concat(df_list)``; original row indices are
        kept as they are.  When nothing is supplied (``None``, or an empty
        list/tuple) an empty ``DataFrame`` is returned, because
        ``pd.concat`` itself raises ValueError when given no objects.
    """
    # None replaces the former mutable default argument ([]).
    if df_list is None:
        return pd.DataFrame()
    if isinstance(df_list, (list, tuple)) and not df_list:
        return pd.DataFrame()
    return pd.concat(df_list)
# TODO: add filtering of 'unknown' or NaN values