-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_loader.py
158 lines (139 loc) · 7.62 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 8 22:32:37 2021
@author: Sameitos
"""
from . import data_importer
class ECNO(data_importer.cls_data_loader):
    """
    Description:
        ECNO is a loader class for the enzyme commission (EC) number
        dataset. It forwards its arguments to
        ``data_importer.cls_data_loader`` with ``main_set`` fixed to
        'ec_dataset'. Feature data (X) and label data (y) are provided
        separately.

    Parameters:
        protein_feature: {'paac','aac','gaac','ctriad','ctdt','soc_number','kpssm'},
            (default = 'paac'): numerical features of protein sequences.
        set_type: {'random','similarity','temporal'}, (default = 'random'):
            split type of data. random: random splitting, similarity:
            similarity based splitting, temporal: splitting according to
            annotation time.
        ratio: {None, float, list}, (default = 0.2): used to split data
            into train, test and validation sets. If None, only X and y
            data are obtained. If ratio = a (float), train and test sets
            are formed and the test set is the fraction a of the total
            data size. If ratio = [a, b] with a and b in (0, 1), train,
            test and validation sets are formed; for example, with
            a = 0.2 and b = 0.1 the train fraction is 0.7, the test
            fraction is 0.2 and the validation fraction is 0.1 of the
            whole dataset. If set_type = 'temporal', ratio is treated as
            None automatically.
        label: {None, 'positive','negative'}, (default = None): If None,
            data is given directly; if 'negative', only the negative set
            is given; if 'positive', only the positive set is given.
        pre_determined: bool, (default = True): If False, data is given
            according to ratio; if True, the already split data is
            provided.
            NOTE(review): earlier documentation listed the default as
            False, but the signature default for this class is True.

    Returns:
        Multiple arrays that contain the training, test and validation
        datasets and their labels. {numpy array, list}
    """

    def __init__(self,
                 protein_feature='paac',
                 set_type='random',
                 ratio=0.2,
                 label=None,
                 pre_determined=True,
                 ):
        # Collect the parent-loader options first; only main_set is fixed
        # by this class, everything else is passed through unchanged.
        loader_options = {
            'ratio': ratio,
            'protein_feature': protein_feature,
            'set_type': set_type,
            'label': label,
            'pre_determined': pre_determined,
            'main_set': 'ec_dataset',
        }
        super().__init__(**loader_options)
class GOID(data_importer.cls_data_loader):
    """
    Description:
        GOID is a loader class for the gene ontology (GO) term dataset.
        It forwards its arguments to ``data_importer.cls_data_loader``
        with ``main_set`` fixed to 'go_dataset'. Feature data (X) and
        label data (y) are provided separately.

    Parameters:
        protein_feature: {'paac','aac','gaac','ctriad','ctdt','soc_number','kpssm'},
            (default = 'paac'): numerical features of protein sequences.
        set_type: {'random','similarity','temporal'}, (default = 'random'):
            split type of data. random: random splitting, similarity:
            similarity based splitting, temporal: splitting according to
            annotation time.
        ratio: {None, float, list}, (default = 0.2): used to split data
            into train, test and validation sets. If None, only X and y
            data are obtained. If ratio = a (float), train and test sets
            are formed and the test set is the fraction a of the total
            data size. If ratio = [a, b] with a and b in (0, 1), train,
            test and validation sets are formed; for example, with
            a = 0.2 and b = 0.1 the train fraction is 0.7, the test
            fraction is 0.2 and the validation fraction is 0.1 of the
            whole dataset. If set_type = 'temporal', ratio is treated as
            None automatically.
        label: {None, 'positive','negative'}, (default = None): If None,
            data is given directly; if 'negative', only the negative set
            is given; if 'positive', only the positive set is given.
        pre_determined: bool, (default = False): If False, data is given
            according to ratio; if True, the already split data is
            provided.

    Returns:
        Multiple arrays that contain the training, test and validation
        datasets and their labels. {numpy array, list}
    """

    def __init__(self,
                 protein_feature='paac',
                 set_type='random',
                 ratio=0.2,
                 label=None,
                 pre_determined=False,
                 ):
        # Collect the parent-loader options first; only main_set is fixed
        # by this class, everything else is passed through unchanged.
        loader_options = {
            'ratio': ratio,
            'protein_feature': protein_feature,
            'set_type': set_type,
            'label': label,
            'pre_determined': pre_determined,
            'main_set': 'go_dataset',
        }
        super().__init__(**loader_options)
class SelfGet(data_importer.casual_importer):
    """
    Description:
        SelfGet lets users import their own datasets from a delimited
        text file. It forwards its arguments unchanged to
        ``data_importer.casual_importer``. For a comma-delimited file
        with name == True, the expected layout is:

            Name(or ID),feature_1,feature_2,...,feature_n
            Name(or ID),feature_1,feature_2,...,feature_n
            Name(or ID),feature_1,feature_2,...,feature_n

    Parameters:
        delimiter: (default = tab character): the character that
            separates columns in the file.
        name: bool, (default = False): If True, the first column is
            treated as the name of each input; otherwise the first
            column is a feature column.
        label: bool, (default = False): If True, the last column is
            treated as the label of each input; otherwise the last
            column is a feature column.

    Returns:
        Feature matrix data and, depending on the attributes, label
        data. {list}
    """

    def __init__(self, delimiter='\t', name=False, label=False):
        # All parsing options are handed straight to the parent importer.
        parse_options = {
            'delimiter': delimiter,
            'name': name,
            'label': label,
        }
        super().__init__(**parse_options)
"""!!!!THIS CLASS IS OUT OF USE FOR NOW. THE DTI DATASETS ARE STILL IN THE PIPELINE;
THE DTI IMPORTER WILL BE READY TO USE ONCE THEY ARE AVAILABLE!!!!"""
# class DTI(data_importer.rgs_data_loader):
# '''
# DTI is a function to import drug-target interaction data. It gives X data
# and y data separately
# Parameters:
# ratio: {None, float, list}, (default = 0.2): used to split data
# into train, test, validation sets as given values. If left None,
# only X and y data can be obtained while float value gives train
# and test set. If ratio = a (float), then test will be a% of total
# data size. If ratio = [a,b] where a and b are in (0,1), train, test
# and validation sets are formed according to them. For example,
# If a = 0.2 and b = 0.1, train fraction is 0.7, test fraction is 0.2
# and validation fraction is 0.1 of all dataset size.
# protein_feature: {'paac','aac','gaac','ctriad','ctdt','soc_number','kpssm'},
# (default = 'paac'): numerical features of protein sequences
# set_type: {'random','target'}, (default = 'random'):
# split type of data, random:random splitting, target:
# similarity based splitting
# '''
# def __init__(self,protein_feature = 'paac',
# set_type = 'random',
# ratio = None):
# self.protein_feature = protein_feature
# self.set_type = set_type
# self.ratio = ratio
# super().__init__(ratio = self.ratio, protein_feature = self.protein_feature,
# set_type = self.set_type)