-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocessing.py
70 lines (56 loc) · 2.66 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd
import numpy as np
class Processor:
    """
    Class to preprocess AQS data in specified format to feed into models.

    All methods are stateless: they take DataFrames in and return new
    DataFrames without modifying the inputs.
    """

    # Raw date/time columns that are superseded by the combined 'datetime' index.
    _RAW_TIME_COLUMNS = ['date_gmt', 'time_gmt', 'date_local', 'time_local']

    def __init__(self):
        pass

    def project_unique(self, df: pd.DataFrame, measurement: str,
                       verbose: bool = False) -> pd.DataFrame:
        """
        Keep only columns that have 2 or more unique values.

        The result is indexed by a 'datetime' index built from the
        'date_local' and 'time_local' columns; the raw date/time columns and
        'date_of_last_change' are removed, and 'sample_measurement' is
        renamed to ``measurement``.

        Args:
            df: Input frame. Must contain 'date_local' and 'time_local'.
            measurement: New name for the 'sample_measurement' column.
            verbose: If True, print the kept and removed column names.

        Returns:
            A new DataFrame; ``df`` itself is not modified.
        """
        unique_counts = {col: df[col].nunique() for col in df.columns}
        removed_cols = [col for col, count in unique_counts.items() if count <= 1]
        kept_cols = [col for col, count in unique_counts.items() if count > 1]

        # Build the timestamps from the *original* frame: 'date_local' or
        # 'time_local' may be constant (e.g. a single day of data) and would
        # then be missing from the projected frame.
        timestamps = pd.to_datetime(df['date_local'] + ' ' + df['time_local'])

        # Keep only variables we care about (changing)
        out = df[kept_cols].copy()
        out['datetime'] = timestamps
        out = out.set_index('datetime')

        # Any of these may already be gone if it was constant, so a missing
        # label must not raise.
        # NOTE: Should 'date_of_last_change' be dropped? (open question
        # carried over from the original code)
        out = out.drop(
            columns=self._RAW_TIME_COLUMNS + ['date_of_last_change'],
            errors='ignore',
        )

        if verbose:
            print('Kept the following columns:')
            print(out.columns)
            print()
            print('Removed the following columns:')
            print([col for col in removed_cols if col not in out.columns])
            print()

        out = out.rename({'sample_measurement': measurement}, axis=1)
        return out

    def process(self, df: pd.DataFrame, measurement: str,
                change_freq: bool = False, select_method: bool = False,
                drop_lat_lon: bool = True,
                remove_duplicates: bool = False) -> pd.DataFrame:
        """
        Reduce a raw frame to a datetime-indexed measurement series.

        Args:
            df: Input frame with 'date_local', 'time_local',
                'sample_measurement', 'latitude' and 'longitude' columns
                (plus 'method' when ``select_method`` is True).
            measurement: New name for the 'sample_measurement' column.
            change_freq: If True, re-sample the result onto a '1h' grid,
                forward-filling gaps.
            select_method: If True, keep only rows whose 'method' equals the
                first value returned by ``df['method'].unique()``.
            drop_lat_lon: If True, drop 'latitude' and 'longitude' from the
                result.
            remove_duplicates: Currently unused; accepted for interface
                compatibility.

        Returns:
            A new DataFrame indexed by 'datetime'; ``df`` is not modified.
        """
        if select_method:
            first_method = df['method'].unique()[0]
            df = df.loc[df['method'] == first_method]

        # Assemble the output in a fresh frame so the caller's DataFrame
        # never gains a 'datetime' column as a side effect.
        timestamps = pd.to_datetime(df['date_local'] + ' ' + df['time_local'])
        out = df[['sample_measurement', 'latitude', 'longitude']].copy()
        out.insert(0, 'datetime', timestamps)
        out = out.rename({'sample_measurement': measurement}, axis=1)

        # TODO: Fix this, currently just select the rows before the first
        # repeated timestamp to avoid a duplicate index, but this might
        # select the wrong data.
        is_duplicate = out.duplicated(subset='datetime', keep='first')
        duplicate_positions = np.where(is_duplicate)[0]
        if len(duplicate_positions) > 0:
            out = out.iloc[:duplicate_positions[0]]

        # Non-inplace set_index: 'out' may be a slice of another frame here.
        out = out.set_index('datetime')

        if change_freq:
            # Debug output kept from the original implementation.
            print(out.head())
            out = out.asfreq('1h', method='ffill')

        if drop_lat_lon:
            out = out.drop(['latitude', 'longitude'], axis=1)
        return out

    def join(self, dfs):
        """
        Outer-join a sequence of datetime-indexed frames and average per hour.

        Columns whose name contains 'latitude' or 'longitude' but is not
        exactly that name (e.g. suffixed copies created by the join) are
        dropped before resampling to '1h' means.

        Args:
            dfs: Non-empty sequence of DataFrames; the first is joined with
                all the others.

        Returns:
            The joined DataFrame resampled to hourly means.
        """
        joined = dfs[0].join(dfs[1:], how='outer')
        redundant_coords = [
            col for col in joined.columns
            if ('latitude' in col and col != 'latitude')
            or ('longitude' in col and col != 'longitude')
        ]
        joined = joined.drop(redundant_coords, axis=1)
        return joined.resample('1h').mean()