# DataAnalysis.py
# Forked from michaelshum123/ECE180-final-project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import csv
from time import localtime
import pandas as pd
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error as mse
class DataAnalysis(object):
    """Analysis and plotting helpers for one price/volume time series.

    Constructing an instance runs every analysis step once and stores the
    results as attributes; ``display`` then plots them.

    Methods:
        AutoRegressive: Price prediction with an AR model.
            data: history price data used for predicting future prices.
            testSize (int): number of steps to predict.
            test (bool): if True, hold out the last ``testSize`` points and
                measure the prediction error against them; if False,
                forecast ``testSize`` steps past the end of ``data``.
        Corr: Moving (windowed) correlation between two inputs.
            stat1, stat2: the two series.
            winSize (int): window size.
        dailyChange: Per-day relative change with respect to the open price.
            openPrice, closePrice: price data.
        fit: Fit the moving-average price to a smooth curve.
            method (str): 'poly' (polynomial), 'exp' (exponential),
                'sine' / 'sin' (sine) or 'log' (logarithmic).
            ord (int): polynomial order (ignored by the other methods).
        movingStat: Moving average and moving standard deviation of the price.
            size (int): window size.
        raiseOrfall: Relative increase or decrease over consecutive periods.
            data: price data.
            winSize (int): period length.
        separate: Split change data into non-negative and negative values.
            data: price change data.
            posit: start position of each period.
        ticks: Thin out labels for the x axis.
            tick: date labels.
            interval (int): distance between two successive ticks.
        trendLine: Price levels at two local extrema of the price.
        display: Plot graphs of the analysis data.
    """

    def __init__(self, date, price, oprice, volume, method='poly', ord=5,
                 testSize=30, test=True):
        """Run all analysis steps and store the results on the instance.

        Args:
            date: sequence of date labels, one per sample.
            price: closing prices.
            oprice: opening prices.
            volume: traded volume.
            method (str): curve type passed to ``fit``.
            ord (int): polynomial order passed to ``fit``.
            testSize (int): prediction horizon used by ``display``.
            test (bool): whether a stored prediction carries a test set.
        """
        super(DataAnalysis, self).__init__()
        self.price = price
        self.oprice = oprice
        self.oridate = date
        # Only every 60th date is kept as an x-axis label.
        self.date, self.tickPos = self.ticks(date, 60)
        self.volume = volume
        self.method = method
        self.ord = ord
        self.testSize = testSize
        self.test = test
        self.avePrice, self.varPrice = self.movingStat(10)
        self.aveVar = self.avePrice + self.varPrice
        self.dailyC = self.dailyChange(self.oprice, self.price)
        self.corr = self.Corr(self.price, self.volume)
        self.fitValue = self.fit(method, ord)
        # self.support, self.resistance = self.trendLine()
        self.perc, self.pos = self.raiseOrfall(self.avePrice)
        # Scale the volume down so it shares an axis with the percent bars.
        self.volume = [1e-5 * p for p in self.volume]
        self.r, self.rPos, self.d, self.dPos = self.separate(self.perc, self.pos)
        # The AR prediction step is disabled; ``display`` skips its figure
        # while this stays None. To enable it, assign the result of
        # ``self.AutoRegressive(self.price, self.oridate, self.testSize, self.test)``.
        self.prediction = None

    def AutoRegressive(self, data, date, testSize=30, test=True):
        """Predict prices with an autoregressive model.

        Args:
            data: history price data.
            date: unused; kept so existing callers keep working.
            testSize (int): number of steps to predict.
            test (bool): if True the last ``testSize`` points are held out as
                a test set and each prediction is made from the real
                preceding values; if False the model forecasts ``testSize``
                steps past the end of ``data``, feeding its own predictions
                back in.

        Returns:
            ``(pred, error, testData)`` when ``test`` is True, where ``error``
            is the mean squared error against ``testData``; otherwise
            ``(pred, None)``.

        Raises:
            ValueError: if ``test`` is True and ``testSize`` does not leave a
                non-empty training set and a non-empty test set.
        """
        # NOTE(review): ``AR`` is deprecated/removed in recent statsmodels
        # releases in favour of ``AutoReg`` -- confirm the installed version.
        data = np.array(data)
        if test:
            # ``data[:-0]`` would silently produce an empty training set.
            if not 0 < testSize < len(data):
                raise ValueError(
                    'testSize must be between 1 and len(data) - 1 in test mode')
            trainData = data[:-testSize]
            testData = data[-testSize:]
        else:
            trainData = data
            testData = None
        modelFit = AR(trainData).fit()
        winSize, coeff = modelFit.k_ar, modelFit.params
        predData = list(trainData[-winSize:])
        pred = []
        for i in range(testSize):
            x = list(predData[-winSize:])
            # coeff[0] is the constant term; coeff[k] weights the value k
            # steps back, so the window is walked from newest to oldest.
            y = coeff[0]
            for n in range(winSize):
                y += coeff[n + 1] * x[winSize - (n + 1)]
            predData.append(testData[i] if test else y)
            pred.append(y)
        if test:
            return pred, mse(testData, pred), testData
        return pred, None

    def Corr(self, stat1, stat2, winSize=10):
        """Return the windowed correlation of two series.

        Both series are divided by their overall Euclidean norm, then each
        window of ``winSize`` samples is reduced to the dot product of the
        two normalised windows. Entry ``winSize + i`` of the result belongs
        to the window starting at sample ``i``; the first ``winSize`` entries
        are NaN so the result lines up with the input on a plot.

        Args:
            stat1, stat2: the two series (not modified).
            winSize (int): window size.

        Returns:
            list of float with one entry per input sample (when the input
            has at least ``winSize`` samples).
        """
        # Work on float copies: normalising in place would overwrite the
        # caller's price/volume arrays.
        first = np.array(stat1, dtype=float)
        second = np.array(stat2, dtype=float)
        first /= np.linalg.norm(first)
        second /= np.linalg.norm(second)
        total = len(first)
        corr = [np.nan] * min(winSize, total)
        for i in range(total - winSize):
            window1 = first[i:i + winSize]
            window2 = second[i:i + winSize]
            corr.append(float(np.dot(window1, window2)))
        return corr

    def dailyChange(self, openPrice, closePrice):
        """Return ``(open - close) / open`` for every sample.

        NOTE(review): with this sign a falling price gives a positive value,
        the opposite of ``raiseOrfall`` -- confirm this is intended.
        """
        openPrice = np.asarray(openPrice, dtype=float)
        closePrice = np.asarray(closePrice, dtype=float)
        return (openPrice - closePrice) / openPrice

    def fit(self, method='poly', ord=3):
        """Fit the moving-average price with a smooth curve.

        The leading NaN samples of ``self.avePrice`` (the warm-up of the
        moving window) are skipped for the fit and returned as NaN.

        Args:
            method (str): 'poly', 'exp', 'sine' (or 'sin') or 'log'.
                For price, 'poly' with a high order seems to work best.
            ord (int): polynomial order; only used by 'poly'.

        Returns:
            numpy array of fitted values, same length as ``self.avePrice``.

        Raises:
            ValueError: if ``method`` is not one of the supported names.
        """
        def funExp(x, a, b):
            return a * np.exp(b * x)

        def funSin(x, a, w, t):
            return a * np.sin(w * x + t)

        def funLog(x, A, a, b):
            return A * np.log(a * x + b)

        curves = {'exp': funExp, 'sine': funSin, 'sin': funSin, 'log': funLog}
        if method != 'poly' and method not in curves:
            raise ValueError(
                "method must be one of 'poly', 'exp', 'sine', 'log'; got %r"
                % (method,))

        average = np.asarray(self.avePrice, dtype=float)
        returnValue = np.full(len(average), np.nan)
        valid = np.flatnonzero(~np.isnan(average))
        if valid.size == 0:
            # Fewer samples than the moving window: nothing to fit.
            return returnValue
        start = valid[0]
        data = average[start:]
        n = np.arange(len(data), dtype=float)
        if method == 'poly':
            assert isinstance(ord, int) and ord != 0
            fitPoly = np.poly1d(np.polyfit(n, data, ord))
            fitValue = fitPoly(n)
        else:
            curve = curves[method]
            # Fit against the same NaN-free samples that ``n`` indexes.
            fitCoeff, _ = curve_fit(curve, n, data)
            fitValue = curve(n, *fitCoeff)
        returnValue[start:] = fitValue
        return returnValue

    def movingStat(self, size=10):
        """Return the moving average and moving spread of ``self.price``.

        Args:
            size (int): window size.

        Returns:
            ``(ave, var)`` as pandas Series indexed 0..len-1. ``var`` is the
            rolling standard deviation (``std``), not the variance.
        """
        data = pd.Series(self.price, np.arange(len(self.price)))
        windows = data.rolling(window=size, center=False)
        return windows.mean(), windows.std()

    def raiseOrfall(self, data, winSize=15):
        """Return the relative price change over consecutive periods.

        Periods start at positions 0, winSize, 2 * winSize, ...; a period is
        only used while its start lies more than ``winSize`` samples before
        the end of ``data``. The change is measured between the first and
        the last sample of the period, relative to the first sample.

        Args:
            data: price data.
            winSize (int): period length in samples.

        Returns:
            ``(c, p)``: list of relative changes and list of the period
            start positions.

        Raises:
            ValueError: if ``winSize`` is smaller than 1.
        """
        if winSize < 1:
            raise ValueError('winSize must be a positive integer')
        values = np.asarray(data, dtype=float)
        c, p = [], []
        for begin in range(0, len(values) - winSize, winSize):
            first, last = values[begin], values[begin + winSize - 1]
            c.append(float((last - first) / first))
            p.append(begin)
        return c, p

    def separate(self, data, posit):
        """Split ``data`` into non-negative and negative values.

        Args:
            data: price change values.
            posit: position belonging to each value.

        Returns:
            ``(pos, posT, neg, negT)``: the non-negative values and their
            positions, then the remaining values and their positions. A NaN
            is not ``>= 0`` and therefore ends up in ``neg``.
        """
        pos, neg = [], []
        posT, negT = [], []
        for value, where in zip(data, posit):
            if value >= 0:
                pos.append(value)
                posT.append(where)
            else:
                neg.append(value)
                negT.append(where)
        return pos, posT, neg, negT

    def ticks(self, tick, interval=16):
        """Keep every ``interval``-th label for the x axis.

        Args:
            tick: sequence of labels (dates).
            interval (int): distance between two kept labels.

        Returns:
            ``(newTick, tickPos)``: the kept labels and their positions.
        """
        assert isinstance(interval, int)
        tickPos = [i for i in range(len(tick)) if i % interval == 0]
        newTick = [tick[i] for i in tickPos]
        return newTick, tickPos

    def trendLine(self):
        """Return the price at the second local maximum and last local minimum.

        NOTE(review): these statements appear to have been left behind as
        unreachable code after ``ticks``' return when only the ``def`` line
        was commented out; they are restored here as a method. Nothing in
        this class calls it (the call in ``__init__`` is commented out), and
        the choice of extrema is taken unchanged from the original code.

        Returns:
            ``(maxLevel, minLevel)``.

        Raises:
            ValueError: if the price has fewer than two local maxima or no
                local minimum.
        """
        price = np.asarray(self.price)
        maxPts, minPts = [], []
        for i in range(1, len(price) - 1):
            if price[i - 1] < price[i] > price[i + 1]:
                maxPts.append(i)
            elif price[i - 1] > price[i] < price[i + 1]:
                minPts.append(i)
        if len(maxPts) < 2 or not minPts:
            raise ValueError('not enough local extrema to derive trend levels')
        return price[maxPts[1]], price[minPts[-1]]

    def display(self):
        """Plot the price, volume/change, histogram and prediction figures.

        The prediction figure is only drawn when ``self.prediction`` holds a
        result of ``AutoRegressive``.
        """
        plt.figure(1)
        # Price plot
        # Original price, averaged price and its fitting curve
        n = np.arange(len(self.price))
        # supLine = len(n) * [self.support]
        # resLine = len(n) * [self.resistance]
        plt.plot(n, self.price, 'k.-', label='Original price')
        plt.plot(n, self.avePrice, 'r-', label='Moving average')
        plt.plot(n, self.fitValue, 'b-', label='Fitting curve')
        plt.bar(n, self.varPrice, align='center', color='y', label='Moving variance')
        # plt.plot(n, supLine, 'y-', linewidth= 2.0, label= 'Support line')
        # plt.plot(n, resLine, 'g-', linewidth= 2.0, label= 'Resistance line')
        plt.title('Price')
        plt.xticks(self.tickPos, self.date, rotation=70)
        plt.xlabel('Date')
        plt.ylabel('Price (USD)')
        plt.grid(True)
        plt.legend()

        fig, ax1 = plt.subplots()
        # Volume and Price change plot
        # Volume in blue bars
        # Price bars: Red - increase, Green - decrease
        ax1.bar(n, self.volume, align='center', color='b', label='Volume')
        # Bar width 15 matches the default period length of raiseOrfall.
        ax1.bar(self.rPos, self.r, 15, align='edge', color='r', alpha=0.7, label='Increase')
        ax1.bar(self.dPos, self.d, 15, align='edge', color='g', alpha=0.7, label='Decrease')
        plt.ylabel('Volume(BTC) / Percent(%)')
        plt.legend()
        plt.grid(True)
        ax2 = ax1.twinx()
        ax2.plot(n, self.corr, 'y-', linewidth=1.7, label='Correlation')
        plt.xticks(self.tickPos, self.date, rotation=70)
        plt.title('Volume and Price Change')
        plt.xlabel('Date')
        plt.legend()
        plt.grid(False)
        fig.tight_layout()
        plt.show()

        plt.subplots()
        plt.hist(self.dailyC, 'auto')
        # Use the full date range; self.date only holds the thinned-out
        # tick labels, so its last entry is not the last day of the data.
        plt.title('Daily Change Histogram from %s to %s' % (self.oridate[0], self.oridate[-1]))
        plt.xlabel('Percent')
        plt.ylabel('Number')
        plt.grid(True)
        plt.show()

        prediction = getattr(self, 'prediction', None)
        if prediction is None:
            # No prediction was computed, so there is nothing more to draw.
            return

        plt.subplots()
        if self.test:
            pred, error, testData = prediction
            n = np.arange(len(pred))
            plt.plot(n, testData, 'b-', label='Real price')
            plt.plot(n, pred, 'r-', label='Prediction')
            plt.title('Price prediction with error= %.2f' % error)
        else:
            pred, error = prediction
            n = np.arange(len(pred))
            plt.plot(n, pred, 'r-', label='Prediction')
            plt.title('Price prediction for %d days' % self.testSize)
        plt.xlabel('Time / days')
        plt.ylabel('Price / USD')
        plt.legend()
        plt.grid(True)
        plt.show()
# Load data:
# date: str, used for x-axis ticks.
# price: float
# volume: float
# oprice: float
# Path = '../CryptoCurrency/'
# fileName = 'ethereum'
# with open(Path + fileName + '.csv','r') as csvfile:
# reader = csv.reader(csvfile)
# date = [row[1] for row in reader] # Please input column index of date
# with open(Path + fileName + '.csv','r') as csvfile:
# reader = csv.reader(csvfile)
# price = [row[2] for row in reader] # Please input column index of close
# with open(Path + fileName + '.csv','r') as csvfile:
# reader = csv.reader(csvfile)
# volume = [row[6] for row in reader] # Please input column index of volume
# with open(Path + fileName + '.csv','r') as csvfile:
# reader = csv.reader(csvfile)
# oprice = [row[5] for row in reader] # Please input column index of open
#################################################################################
# Location of the input CSV: <Path><currency>_<period>_data.csv
Path = './Update/'
currency = 'BTC'
period = 'Last_month'
fname = ''.join([Path, currency, '_', period, '_data', '.csv'])
Data = pd.read_csv(fname)
# Pull the columns used by the analysis out of the table as arrays.
openPrice = Data['open'].values
closePrice = Data['close'].values
date = Data['timeDate'].values
volume = Data['volumefrom'].values
# Select the last period percent of the whole dataset.
# If using clean datasets, please use full data with period= 1
# period = 0.4
# start = -1 * int(period * (len(price) - 1))
# date = date[start:]
# price = [float(p) for p in price[start:]]
# volume = [float(v) for v in volume[start:]]
# oprice = [float(o) for o in oprice[start:]]
# Run the whole analysis with a 10th-order polynomial fit, then plot it.
dataAnalysit = DataAnalysis(
    date,
    closePrice,
    openPrice,
    volume,
    method='poly',
    ord=10,
    testSize=30,
    test=True,
)
dataAnalysit.display()