-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData_preparation.py
71 lines (63 loc) · 2.83 KB
/
Data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# First we create start and end time for each event class
import pandas as pd
data = pd.read_csv("cleaned_data.csv")
res = dict()
# find start and end time of each event class
for _, row in data.iterrows():
Project = row['Project']
Activity = row['Activity']
Person = row['Person']
Date = pd.to_datetime(row['Date'], format = '%d/%m/%Y')
Hours = row['Hours']
if Project not in res:
res[Project] = dict()
if Activity not in res[Project]:
res[Project][Activity] = dict()
if Person not in res[Project][Activity]:
res[Project][Activity][Person] = {'Start': '30/12/2022',
'End': '01/01/2015','Hours': 0}
if Date < pd.to_datetime(res[Project][Activity][Person]['Start']):
res[Project][Activity][Person]['Start'] = Date
if Date > pd.to_datetime(res[Project][Activity][Person]['End']):
res[Project][Activity][Person]['End'] = Date
res[Project][Activity][Person]['Hours'] += float(Hours)
print('Project,Activity,Person,Start,End,Hours')
for Project in res:
for Activity in res[Project]:
for Person in res[Project][Activity]:
str_ = str(Project) + ','+ str(Activity) +',' + str(Person) + ','+ \
str(res[Project][Activity][Person]['Start']) + ',' + \
str(res[Project][Activity][Person]['End']) +',' + \
str(res[Project][Activity][Person]['Hours'])
print(str_)
# ------------------ Use Excel and Power Query in the middle ------------------
# Then we add the start time and end time for each event based on the event
# classifier in the data via Excel and pivot based on the project, activity
# and person, afterwards, unpivot the table on the combination of the three
# values in Power Query. We use the unpivoted dataset to generate waiting nodes
# as follows.
# -----------------------------------------------------------------------------
import holidays # import holidays library to select waiting dates
data = pd.read_csv('unpivot.csv')
data = data.loc[data['Activity'] == 'AT1']
nl_holidays = holidays.NL()
def ho_weekends():
for _, row in data.iterrows():
Date = pd.to_datetime(row['Time'], format = '%Y-%m-%d')
if Date in nl_holidays:
print(Date)
for _, row in data.iterrows():
Date = pd.to_datetime(row['Time'], format = '%Y-%m-%d')
if Date.weekday() > 4:
print(Date)
ho_weekends()
data_normal = data.loc[-data['Date'].isin(ho_weekends)]
data_anomly = data.loc[data['Date'].isin(ho_weekends)]
data_anomly = data_anomly[data_anomly['Hours'] > 0]
data2 = pd.concat([data_normal, data_anomly]).sort_values('Date')
# select events in the interval of an event class
mask = (data2['Date'] >= data2['Start_Time']) & \
(data['Date'] <= data2['End_Time'])
data2 = data2.loc[mask]
AT1 = data2["Activity" == "AT1"]
AT1.to_csv("AT1.csv")