# generate_data.py
from typing import Tuple

import numpy as np
from scipy.special import expit
from sklearn.model_selection import train_test_split
def generate_data_yfromx(
        seed: int,
        n: int = 125,
        p: int = 200,
        testing_size: float = 0.2,
        cor: float = 0.6,
        dist: str = 'unif',
        response: str = 'bernoulli'
) -> Tuple[np.ndarray, ...]:
    """
    Generate training and testing data with correlated features.

    :param seed: random seed
    :param n: total sample size
    :param p: number of features
    :param testing_size: proportion of the testing set
    :param cor: feature correlation
    :param dist: distribution of the design matrix, 'unif' for Uniform(-1, 1),
        anything else for N(0, 1)
    :param response: distribution of the response, 'bernoulli' for binary
        classification, anything else for normal
    :return: x_train(, x_test), y_train(, y_test)
    """
    np.random.seed(seed)  # seed numpy's generator; random.seed would not affect np.random
    # initialize design matrix
    if dist == 'unif':
        x = np.random.uniform(-1, 1, (n, p))
        xu = np.random.uniform(-1, 1, n)
        xv = np.random.uniform(-1, 1, n)
    else:
        x = np.random.normal(0, 1, (n, p))
        xu = np.random.normal(0, 1, n)
        xv = np.random.normal(0, 1, n)
    # mix in a shared component so any two features sharing xu (or xv)
    # have correlation t^2 / (1 + t^2) = cor
    t = np.sqrt(cor / (1 - cor))
    for j in range(4):
        x[:, j] = (x[:, j] + t * xu) / (1 + t)  # first four features share xu
    for j in range(4, p):
        x[:, j] = (x[:, j] + t * xv) / (1 + t)  # remaining features share xv
    # true f(x): additive signal in the first four features only
    truefx = (6 * x[:, 0]
              + np.sqrt(84) * x[:, 1] ** 3
              + np.sqrt(12 / (np.sin(6) / 12 + 1 / 2)) * np.sin(3 * x[:, 2])
              + np.sqrt(48 / (np.exp(2) + np.exp(-1) - np.exp(-2) - np.exp(1))) * np.exp(x[:, 3]))
    if response == 'bernoulli':
        prob = expit(truefx)  # inverse-logit link
        y = np.random.binomial(1, p=prob)  # binary labels from Bernoulli trials
    else:
        y = truefx + np.random.normal(0, 1, n)  # normal response with unit noise
    y = y.reshape(-1, 1)
    if testing_size > 0:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=testing_size, random_state=seed)
        return x_train, x_test, y_train, y_test
    return x, y
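# Usage sketch for generate_data_yfromx (an illustrative addition, not part of
# the original module). The wrapper name _example_yfromx and the argument
# values are hypothetical; the block only shows the intended call pattern.
def _example_yfromx():
    """Minimal sketch: draw a correlated binary-classification dataset."""
    x_train, x_test, y_train, y_test = generate_data_yfromx(
        seed=0, n=125, p=200, testing_size=0.2, cor=0.6,
        dist='unif', response='bernoulli')
    # with testing_size=0.2, expect (100, 200), (25, 200), (100, 1), (25, 1)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)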
def generate_data_xfromy(
        seed: int,
        n: int = 125,
        p: int = 200,
        testing_size: float = 0.2,
        cor: float = 0.6,
        s: int = 10,
        sparse: bool = False
) -> Tuple[np.ndarray, ...]:
    """
    Generate x from a normal distribution conditioning on two classes of y.

    :param seed: random seed
    :param n: total sample size
    :param p: number of features
    :param testing_size: proportion of the testing set
    :param cor: feature correlation
    :param s: number of nonzero mean features; ignored if sparse is False
    :param sparse: whether the class means of x are sparse
    :return: x_train(, x_test), y_train(, y_test)
    """
    np.random.seed(seed)  # seed numpy's generator; random.seed would not affect np.random
    if sparse:
        # class means alternate between 2 and -2 on the first s coordinates
        # (with opposite signs across the two classes) and are 0 elsewhere
        mu0 = np.array([2 if i % 2 == 0 else -2 for i in range(s)] + [0] * (p - s))
        mu1 = np.array([2 if i % 2 == 1 else -2 for i in range(s)] + [0] * (p - s))
    else:
        # dense class means: 0..p-1 for class 0 and the reverse for class 1
        mu0 = np.arange(p)
        mu1 = np.arange(p)[::-1]
    # generate covariance matrix from AR(1) structure: sigma[i, j] = cor ** |i - j|
    sigma = np.zeros((p, p))
    for i in range(p):
        for j in range(p):
            sigma[i, j] = cor ** abs(i - j)
    # generate y from a balanced Bernoulli, then x | y from the class-specific normal
    y = np.random.binomial(1, 0.5, n)
    x = np.zeros((n, p))
    for i in range(n):
        if y[i] == 0:
            x[i, :] = np.random.multivariate_normal(mu0, sigma)
        else:
            x[i, :] = np.random.multivariate_normal(mu1, sigma)
    if testing_size > 0:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=testing_size, random_state=seed)
        return x_train, x_test, y_train, y_test
    return x, y
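if __name__ == "__main__":
    # Minimal smoke test (an illustrative addition, not part of the original
    # module): draw a small sparse-mean dataset with generate_data_xfromy and
    # report the split shapes and class balance. All parameter values here
    # are hypothetical choices, not prescribed by the source.
    x_tr, x_te, y_tr, y_te = generate_data_xfromy(
        seed=1, n=60, p=20, testing_size=0.2, cor=0.6, s=4, sparse=True)
    print("train x:", x_tr.shape, "test x:", x_te.shape)
    print("train positive rate:", y_tr.mean())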