# one_hot_encoder.py
# Forked from ETHmodlab/BIMODAL
"""
Implementation of one-hot-encoder for SMILES strings
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import os
import sys
class SMILESEncoder():
    """One-hot encoder/decoder for SMILES strings over a fixed token set.

    Every character of a SMILES string is treated as one token. All strings
    handled in a single call must have the same length, because the
    per-string character arrays are stacked into one 2-D array.
    """

    def __init__(self):
        # Allowed tokens (adapted from default dictionary). The array is
        # sorted; a token's position in it is its one-hot column index.
        self._tokens = np.sort(['#', '=',
                                '\\', '/', '%', '@', '+', '-', '.',
                                '(', ')', '[', ']',
                                '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
                                'A', 'B', 'E', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'S', 'T', 'V',
                                'Z',
                                'a', 'b', 'c', 'd', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't'
                                ])

        # Dense uint8 encoder whose categories are pinned to the allowed tokens.
        # scikit-learn renamed `sparse` to `sparse_output` (1.2) and later
        # removed `sparse` (1.4), so try the new keyword first and fall back
        # to the old one on releases that do not know it.
        encoder_kwargs = dict(categories=[self._tokens], dtype=np.uint8)
        try:
            self._encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
        except TypeError:
            self._encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

        # Fit once on the fixed vocabulary so that decode() also works on an
        # instance that has never encoded anything.
        self._encoder.fit(self._tokens.reshape((-1, 1)))

    def encode_from_file(self, name='data'):
        """One-hot-encoding from .csv file

        Reads ``name + '.csv'`` if that file exists, otherwise
        ``name + '.tar.xz'``. If neither exists, prints a message and
        terminates the process with exit status 1.

        :param name: name of data file, without extension
        :return: encoded data
                 (file rows, file columns, molecule size, allowed token size)
        """
        csv_path = name + '.csv'
        xz_path = name + '.tar.xz'

        # Read data
        if os.path.isfile(csv_path):
            data = pd.read_csv(csv_path, header=None).values
        elif os.path.isfile(xz_path):
            # Skip first line since empty and last line since nan
            data = pd.read_csv(xz_path, compression='xz', header=None).values[1:-1]
        else:
            print('CAN NOT READ DATA')
            # Non-zero status so that calling scripts can detect the failure.
            sys.exit(1)

        # Store dimensions
        shape = data.shape
        print(shape)

        # Flatten to (rows * columns,). No squeeze afterwards: the array is
        # already 1-D, and squeezing a single-entry file would produce a 0-d
        # array that cannot be iterated.
        data = data.reshape(-1)

        # Return array with same first and second dimensions as input
        return self.encode(data).reshape((shape[0], shape[1], -1, len(self._tokens)))

    def encode(self, data):
        """One-hot-encoding

        :param data: input data (sample size,); equal-length SMILES strings
        :return one_hot: encoded data (sample size, molecule size, allowed token size)
        """
        # Split SMILES into characters
        chars = self.smiles_to_char(data)

        # Store dimensions; the encoder expects a single feature column
        n_samples, n_chars = chars.shape[0], chars.shape[1]
        column = chars.reshape((-1, 1))

        # Encode SMILES (the encoder was fitted on the token set in __init__)
        one_hot = self._encoder.transform(column)

        # Restore shape
        return one_hot.reshape((n_samples, n_chars, -1))

    def decode(self, one_hot):
        """Decode one-hot encoding to SMILES

        :param one_hot: one_hot data (sample size, molecule size, allowed token size)
        :return data: SMILES (sample size,)
        """
        # Store dimensions and flatten to one row per token for the encoder
        n_samples = one_hot.shape[0]
        rows = one_hot.reshape((-1, len(self._tokens)))

        # Decode SMILES
        chars = self._encoder.inverse_transform(rows)

        # Restore shape, then merge char to SMILES
        return self.char_to_smiles(chars.reshape((n_samples, -1)))

    def smiles_to_char(self, data):
        """Split SMILES into array of char

        :param data: input data (sample size,); equal-length SMILES strings
        :return char_data: character array (sample size, molecule size)
        :raises ValueError: raised by ``np.stack`` if ``data`` is empty or the
                            strings differ in length
        """
        return np.stack([np.array(list(s)) for s in data], axis=0)

    def char_to_smiles(self, char_data):
        """Merge array of char into SMILES

        :param char_data: input data (sample size, molecule size)
        :return data: SMILES strings (sample size,)
        """
        return np.array([''.join(row) for row in np.asarray(char_data)])