-
Notifications
You must be signed in to change notification settings - Fork 0
/
Correlation_script
88 lines (69 loc) · 3.43 KB
/
Correlation_script
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.linalg import svd
from matplotlib.pyplot import (figure, title, subplot, plot, hist, show,
xlabel, ylabel, xticks, yticks, colorbar, cm,
imshow, suptitle)
# Read the csv file into a pandas dataframe (df). `filename` is the path that
# is handed to pd.read_csv; it is an empty string here.
filename = ""
df = pd.read_csv(filename)

# The rest of the script works on numpy arrays rather than on the dataframe,
# so take the dataframe's contents as a single array.
raw_data = df.to_numpy()

# The attributes are taken from the first 13 columns of the file.
cols = range(13)

# Data matrix X: every row of raw_data, restricted to the attribute columns.
X = raw_data[:, cols]

# The attribute names come from the csv header entries of those same columns.
attributeNames = np.asarray(df.columns[cols])

# Number of data objects (N) and number of attributes (M), read off the
# shape of X.
N, M = X.shape
# Allocate three M x M matrices for the pairwise correlation results:
#   correlation_matrix      - every computed coefficient
#   correlation_matrix_med  - coefficients with 0.2 <= |r| < 0.4
#                             (reported as "weakly correlated")
#   correlation_matrix_high - coefficients with 0.4 <= |r| < 0.6
#                             (reported as "moderately correlated")
correlation_matrix = np.zeros((M, M))
correlation_matrix_high = np.zeros((M, M))
correlation_matrix_med = np.zeros((M, M))

# Compute the correlation coefficient, rounded to two decimals, for every
# pair of attributes. Only the upper triangle (j >= i) is filled in.
# Coefficients with |r| < 0.2 or |r| >= 0.6 are stored in correlation_matrix
# only; this includes the diagonal, where an attribute is paired with itself.
# A line of text is printed for every pair that falls into one of the two
# bands above.
for i in range(0, M):
    for j in range(i, M):
        correlation = round(np.corrcoef(X[:, i], X[:, j])[0, 1], 2)
        correlation_matrix[i, j] = correlation
        # Classification is by magnitude, so negative correlations count too.
        magnitude = abs(correlation)
        if 0.2 <= magnitude < 0.4:
            correlation_matrix_med[i, j] = correlation
            print(f'{attributeNames[i]} and {attributeNames[j]} are weakly correlated with correlation={correlation}\n'
                  )
        elif 0.4 <= magnitude < 0.6:
            correlation_matrix_high[i, j] = correlation
            print(f'{attributeNames[i]} and {attributeNames[j]} are moderately correlated with correlation={correlation}\n'
                  )
#%%
# Scatter plots for the attribute pairs stored in correlation_matrix_high.
# np.nonzero returns the row indices and column indices of all non-zero
# entries; each (row, col) pair names two attributes to plot against each
# other. The lookup is done once, outside the loop.
high_rows, high_cols = np.nonzero(correlation_matrix_high)
for row, col in zip(high_rows, high_cols):
    # One small figure per attribute pair.
    figure(figsize=(3, 3))
    plot(X[:, row], X[:, col], 'x')
    xlabel(f'{attributeNames[row]}')
    ylabel(f'{attributeNames[col]}')
    title('Scatter plot of data')
# NOTE(review): the original indentation was lost; show() is assumed to come
# after the loop so that all figures are displayed together - confirm.
show()
#%%
# Scatter plots for the attribute pairs stored in correlation_matrix_med.
# np.nonzero returns the row indices and column indices of all non-zero
# entries; each (row, col) pair names two attributes to plot against each
# other. The lookup is done once, outside the loop.
# Every pair is drawn in its own figure, so no shared figure or overall
# title is created before the loop.
med_rows, med_cols = np.nonzero(correlation_matrix_med)
for row, col in zip(med_rows, med_cols):
    # One small figure per attribute pair.
    figure(figsize=(3, 3))
    plot(X[:, row], X[:, col], 'x')
    xlabel(f'{attributeNames[row]}')
    ylabel(f'{attributeNames[col]}')
    title('Scatter plot of data')
# NOTE(review): the original indentation was lost; plt.show() is assumed to
# come after the loop so that all figures are displayed together - confirm.
plt.show()