From 280287584a5ec6c162ad1f5fdebcad6128e996e2 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 15:50:06 +0200 Subject: [PATCH 1/9] Use pyproject instead of setup.py --- pyproject.toml | 37 +++++++++++++++++++++++++++++++++++ setup.py | 52 -------------------------------------------------- 2 files changed, 37 insertions(+), 52 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..cb65318 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,37 @@ +[project] +name = "molexpress" +version = "0.1.0" +description = "Graph Neural Networks with Keras 3." +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Alexander Kensert", email = "alexander.kensert@gmail.com" }, +] +keywords = [ + "python", + "keras-3", + "machine-learning", + "deep-learning", + "graph-neural-networks", + "graph-convolutional-networks", + "graphs", + "molecules", + "chemistry", + "cheminformatics", + "bioinformatics", +] +classifiers = [ + "Programming Language :: Python :: 3", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX :: Linux", +] +requires-python = ">=3.10.6" +dependencies = ["rdkit>=2023.9.5", "keras>=3", "torch", "numpy"] + +[project.urls] +homepage = "https://github.com/compomics/molexpress" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py deleted file mode 100644 index e34478d..0000000 --- a/setup.py +++ /dev/null @@ -1,52 +0,0 @@ -import setuptools -import os -import sys - -def get_version(): - version_path = os.path.join(os.path.dirname(__file__), 'molexpress') - sys.path.insert(0, version_path) - from _version import __version__ as version - return version - -with open("README.md", "r") as fh: - long_description = fh.read() - -install_requires = [ - "tensorflow>=2.16.1", # Installs Keras 3 - "rdkit>=2023.9.5", - "jupyter", # Optional, but needed for the notebooks -] - -setuptools.setup( - name='molexpress', - version=get_version(), - author="Alexander Kensert", - author_email="alexander.kensert@gmail.com", - description="Graph Neural Networks with Keras 3.", - long_description=long_description, - long_description_content_type="text/markdown", - license="MIT", - url="https://github.com/compomics/molexpress", - packages=setuptools.find_packages(include=["molexpress*"]), - install_requires=install_requires, - classifiers=[ - "Programming Language :: Python :: 3", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX :: Linux", - ], - python_requires=">=3.10.6", - keywords=[ - 'python', - 'keras-3', - 'machine-learning', - 'deep-learning', - 'graph-neural-networks', - 'graph-convolutional-networks', - 'graphs', - 'molecules', - 'chemistry', - 'cheminformatics', - 'bioinformatics', - ] -) From fbb5558e7c8a9da7e1e79951e61f5b21f3ee25ba Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 15:50:48 +0200 Subject: [PATCH 2/9] Add init --- molexpress/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 molexpress/__init__.py diff --git a/molexpress/__init__.py b/molexpress/__init__.py new file mode 100644 index 0000000..e69de29 From 25bad16fa274f225903ab8ce7f0b7c0b08b82221 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 15:51:46 +0200 Subject: [PATCH 3/9] Don't add torch as dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cb65318..49b7d8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ classifiers = [ "Operating System :: POSIX :: Linux", ] requires-python = ">=3.10.6" -dependencies = ["rdkit>=2023.9.5", "keras>=3", "torch", "numpy"] +dependencies = ["rdkit>=2023.9.5", "keras>=3", "numpy"] [project.urls] homepage = "https://github.com/compomics/molexpress" From e410f46a5bd991466eb15363a0cd4d9273e049a7 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 15:52:51 +0200 Subject: [PATCH 4/9] Add code style tool configs; relax python requirement to >= 3.8 --- pyproject.toml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 49b7d8d..07a6f0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: POSIX :: Linux", ] -requires-python = ">=3.10.6" +requires-python = ">=3.8" dependencies = ["rdkit>=2023.9.5", "keras>=3", "numpy"] [project.urls] @@ -35,3 +35,14 @@ homepage = "https://github.com/compomics/molexpress" [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" + +[tool.black] +line-length = 99 +target-version = ['py38'] + +[tool.ruff] +line-length = 99 +target-version = 'py38' From 6c5d4746381921e861884703a1710b3be4574f83 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 15:53:45 +0200 Subject: [PATCH 5/9] Keep version in init for auto discovery --- molexpress/__init__.py | 1 + molexpress/_version.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 molexpress/_version.py diff --git a/molexpress/__init__.py b/molexpress/__init__.py index e69de29..3dc1f76 100644 --- a/molexpress/__init__.py +++ b/molexpress/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/molexpress/_version.py b/molexpress/_version.py deleted file mode 100644 index 6853c36..0000000 --- a/molexpress/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.0.0' \ No newline at end of file From b622eb99f4cf88ffc1daddff95c48782f28fa14d Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 10 Apr 2024 16:20:55 +0200 Subject: [PATCH 6/9] Fix compatibility with Python >= 3.9 (minimal for Keras); Automated ruff formatting --- molexpress/datasets/encoders.py | 59 ++++++++++----------- molexpress/datasets/featurizers.py | 32 ++++++------ molexpress/layers/base_layer.py | 27 +++++----- molexpress/layers/gcn_conv.py | 83 +++++++++++++----------------- molexpress/layers/gin_conv.py | 34 ++++++------ molexpress/layers/readout.py | 10 ++-- molexpress/ops/chem_ops.py | 10 ++-- molexpress/ops/gnn_ops.py | 36 +++++++------ molexpress/types.py | 26 +++++----- pyproject.toml | 5 +- 10 files changed, 163 insertions(+), 159 deletions(-) diff --git a/molexpress/datasets/encoders.py b/molexpress/datasets/encoders.py index 9114622..9deed79 100644 --- a/molexpress/datasets/encoders.py +++ b/molexpress/datasets/encoders.py @@ -1,10 +1,12 @@ +from __future__ import annotations + import numpy as np from rdkit import Chem from molexpress.datasets import featurizers from molexpress.ops import chem_ops from molexpress import types - + class MolecularGraphEncoder: @@ -12,7 +14,7 @@ def __init__( self, atom_featurizers: list[featurizers.Featurizer], bond_featurizers: list[featurizers.Featurizer] = None, - self_loops: bool = False, + self_loops: bool = False, ) -> None: self.node_encoder = MolecularNodeEncoder(atom_featurizers) self.edge_encoder = MolecularEdgeEncoder( @@ -20,7 +22,7 @@ def __init__( ) def __call__( - self, + self, molecule: types.Molecule | types.SMILES | types.InChI ) -> np.ndarray: molecule = chem_ops.get_molecule(molecule) @@ -30,20 +32,20 @@ def __call__( def _collate_fn( data: list[tuple[types.MolecularGraph, np.ndarray]] ) -> tuple[types.MolecularGraph, np.ndarray]: - - """TODO: Not sure where to implement this collate function. + + """TODO: Not sure where to implement this collate function. Temporarily putting it here. Procedure: Merges list of graphs into a single disjoint graph. """ - x, y = list(zip(*data)) - + x, y = list(zip(*data)) + num_nodes = np.array([ graph['node_state'].shape[0] for graph in x ]) - + disjoint_graph = {} disjoint_graph['node_state'] = np.concatenate([ @@ -58,12 +60,12 @@ def _collate_fn( edge_src = np.concatenate([graph['edge_src'] for graph in x]) edge_dst = np.concatenate([graph['edge_dst'] for graph in x]) num_edges = np.array([graph['edge_src'].shape[0] for graph in x]) - indices = np.repeat(range(len(x)), num_edges) + indices = np.repeat(range(len(x)), num_edges) edge_incr = np.concatenate([[0], num_nodes[:-1]]) edge_incr = np.take_along_axis(edge_incr, indices, axis=0) disjoint_graph['edge_src'] = edge_src + edge_incr - disjoint_graph['edge_dst'] = edge_dst + edge_incr + disjoint_graph['edge_dst'] = edge_dst + edge_incr disjoint_graph['graph_indicator'] = np.repeat(range(len(x)), num_nodes) return disjoint_graph, np.stack(y) @@ -72,7 +74,7 @@ def _collate_fn( class Composer: """Wraps a list of featurizers. - + While a Featurizer encodes an atom or bond based on a single property, the Composer encodes an atom or bond based on multiple properties. @@ -84,13 +86,13 @@ class Composer: def __init__(self, featurizers: list[featurizers.Featurizer]) -> None: self.featurizers = featurizers assert all( - self.featurizers[0].output_dtype == f.output_dtype + self.featurizers[0].output_dtype == f.output_dtype for f in self.featurizers ), "'dtype' of features need to be consistent." def __call__(self, inputs: types.Atom | types.Bond) -> np.ndarray: return np.concatenate([f(inputs) for f in self.featurizers]) - + @property def output_dim(self): return sum(f.output_dim for f in self.featurizers) @@ -98,16 +100,16 @@ def output_dim(self): @property def output_dtype(self): return self.featurizers[0].output_dtype - + class MolecularEdgeEncoder: def __init__( - self, - featurizers: list[featurizers.Featurizer], + self, + featurizers: list[featurizers.Featurizer], self_loops: bool = False ) -> None: - self.featurizer = Composer(featurizers) + self.featurizer = Composer(featurizers) self.self_loops = self_loops self.output_dim = self.featurizer.output_dim self.output_dtype = self.featurizer.output_dtype @@ -122,19 +124,19 @@ def __call__(self, molecule: types.Molecule) -> np.ndarray: if molecule.GetNumBonds() == 0: edge_state = np.zeros( - shape=(0, self.output_dim), + shape=(0, self.output_dim), dtype=self.output_dtype ) return { - 'edge_src': edge_src, - 'edge_dst': edge_dst, + 'edge_src': edge_src, + 'edge_dst': edge_dst, 'edge_state': edge_state } - + bond_encodings = [] for i, j in zip(edge_src, edge_dst): - + bond = molecule.GetBondBetweenAtoms(int(i), int(j)) if bond is None: @@ -151,23 +153,22 @@ def __call__(self, molecule: types.Molecule) -> np.ndarray: bond_encodings.append(bond_encoding) return { - 'edge_src': edge_src, - 'edge_dst': edge_dst, + 'edge_src': edge_src, + 'edge_dst': edge_dst, 'edge_state': np.stack(bond_encodings) } - + class MolecularNodeEncoder: def __init__( - self, - featurizers: list[featurizers.Featurizer], + self, + featurizers: list[featurizers.Featurizer], ) -> None: - self.featurizer = Composer(featurizers) + self.featurizer = Composer(featurizers) def __call__(self, molecule: types.Molecule) -> np.ndarray: node_encodings = np.stack([ self.featurizer(atom) for atom in molecule.GetAtoms() ], axis=0) return {'node_state': np.stack(node_encodings)} - \ No newline at end of file diff --git a/molexpress/datasets/featurizers.py b/molexpress/datasets/featurizers.py index e9e22bb..5bc835f 100644 --- a/molexpress/datasets/featurizers.py +++ b/molexpress/datasets/featurizers.py @@ -1,4 +1,6 @@ -from abc import ABC +from __future__ import annotations + +from abc import ABC from abc import abstractmethod from rdkit.Chem import Lipinski @@ -63,18 +65,18 @@ class Featurizer(ABC): """Abstract featurizer. - + Featurizes a single atom or bond based on a single property. """ def __init__( - self, - output_dim: int = None, + self, + output_dim: int = None, output_dtype: str = 'float32' ) -> None: self._output_dim = int(output_dim) if output_dim is not None else 1 self._output_dtype = output_dtype - + @abstractmethod def call(self, x: types.Atom | types.Bond) -> types.Scalar: pass @@ -94,7 +96,7 @@ class OneHotFeaturizer(Featurizer): def __init__( self, - vocab: list[str] | list[int] = None, + vocab: list[str] | list[int] = None, oov: bool = False, output_dtype: str = 'float32', ): @@ -102,22 +104,22 @@ def __init__( vocab = DEFAULT_VOCABULARY.get(self.__class__.__name__) if vocab is None: raise ValueError("Need to supply a 'vocab'.") - - self.vocab = list(vocab) + + self.vocab = list(vocab) self.vocab.sort(key=lambda x: x if x is not None else "") self.oov = oov super().__init__( - output_dim=len(self.vocab) + int(self.oov), + output_dim=len(self.vocab) + int(self.oov), output_dtype=output_dtype ) if self.oov: self.vocab += [''] - + encodings = np.eye(self.output_dim, dtype=self.output_dtype) self.mapping = dict(zip(self.vocab, encodings)) - + def __call__(self, x: types.Atom | types.Bond) -> np.ndarray: feature = self.call(x) encoding = self.mapping.get( @@ -127,7 +129,7 @@ def __call__(self, x: types.Atom | types.Bond) -> np.ndarray: return encoding return np.zeros([self.output_dim], dtype=self.output_dtype) - + class FloatFeaturizer(Featurizer): """Abstract scalar floating point featurizer.""" @@ -138,13 +140,13 @@ def __call__(self, x: types.Atom | types.Bond) -> np.ndarray: class AtomType(OneHotFeaturizer): def call(self, inputs: types.Atom) -> str: - return inputs.GetSymbol() + return inputs.GetSymbol() class Hybridization(OneHotFeaturizer): def call(self, inputs: types.Atom) -> str: return inputs.GetHybridization().name.lower() - + class CIPCode(OneHotFeaturizer): def call(self, atom: types.Atom) -> str | None: @@ -274,7 +276,7 @@ def call(self, bond: types.Bond) -> str: class Stereo(OneHotFeaturizer): def call(self, bond: types.Bond) -> str: return bond.GetStereo().name.lower() - + class Conjugated(FloatFeaturizer): def call(self, bond: types.Bond) -> bool: diff --git a/molexpress/layers/base_layer.py b/molexpress/layers/base_layer.py index 87c7715..fa8bd1f 100644 --- a/molexpress/layers/base_layer.py +++ b/molexpress/layers/base_layer.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import keras -from molexpress import types +from molexpress import types class BaseLayer(keras.layers.Layer): @@ -8,7 +10,7 @@ class BaseLayer(keras.layers.Layer): """Base layer.""" def __init__( - self, + self, units: int, activation: keras.layers.Activation = None, use_bias: bool = True, @@ -58,7 +60,7 @@ def get_config(self) -> dict[str, types.Any]: return config def compute_output_shape( - self, + self, input_shape: dict[str, tuple[int, ...]] ) -> dict[str, tuple[int, ...]]: output_shape = input_shape @@ -73,9 +75,9 @@ def compute_output_shape( def add_kernel( self, - name: str, - shape: tuple[int, ...], - dtype: str = 'float32', + name: str, + shape: tuple[int, ...], + dtype: str = 'float32', **kwargs ) -> types.Variable: return self.add_weight( @@ -85,12 +87,12 @@ def add_kernel( **self._common_weight_kwargs('kernel'), **kwargs, ) - + def add_bias( - self, - name: str, - shape: tuple[int, ...] = None, - dtype: str = 'float32', + self, + name: str, + shape: tuple[int, ...] = None, + dtype: str = 'float32', **kwargs ) -> types.Variable: return self.add_weight( @@ -102,7 +104,7 @@ def add_bias( ) def _common_weight_kwargs( - self, + self, weight_type: str ) -> dict[str, types.Any]: initializer = getattr(self, f"{weight_type}_initializer", None) @@ -119,4 +121,3 @@ def _common_weight_kwargs( 'regularizer': regularizer, 'constraint': constraint, } - \ No newline at end of file diff --git a/molexpress/layers/gcn_conv.py b/molexpress/layers/gcn_conv.py index ccd6fb6..c7be2ed 100644 --- a/molexpress/layers/gcn_conv.py +++ b/molexpress/layers/gcn_conv.py @@ -1,12 +1,11 @@ -import keras +import keras from molexpress import types -from molexpress.ops import gnn_ops from molexpress.layers.base_layer import BaseLayer +from molexpress.ops import gnn_ops class GCNConv(BaseLayer): - def __init__( self, units: int, @@ -15,8 +14,8 @@ def __init__( normalization: bool = True, skip_connection: bool = True, dropout_rate: float = 0, - kernel_initializer: keras.initializers.Initializer = 'glorot_uniform', - bias_initializer: keras.initializers.Initializer = 'zeros', + kernel_initializer: keras.initializers.Initializer = "glorot_uniform", + bias_initializer: keras.initializers.Initializer = "zeros", kernel_regularizer: keras.regularizers.Regularizer = None, bias_regularizer: keras.regularizers.Regularizer = None, activity_regularizer: keras.regularizers.Regularizer = None, @@ -36,17 +35,14 @@ def __init__( kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs, - ) + ) self.dropout_rate = dropout_rate self.skip_connection = skip_connection self.normalization = normalization - def build( - self, - input_shape: dict[str, tuple[int, ...]] - ) -> None: - node_state_shape = input_shape['node_state'] - edge_state_shape = input_shape.get('edge_state') + def build(self, input_shape: dict[str, tuple[int, ...]]) -> None: + node_state_shape = input_shape["node_state"] + edge_state_shape = input_shape.get("edge_state") node_dim = node_state_shape[-1] @@ -54,18 +50,16 @@ def build( if self._transform_skip_connection: self.skip_connect_kernel = self.add_kernel( - name='skip_connect_kernel', shape=(node_dim, self.units) + name="skip_connect_kernel", shape=(node_dim, self.units) ) - self.node_kernel = self.add_kernel( - name='node_kernel', shape=(node_dim, self.units) - ) - - self.bias = self.add_bias(name='bias') + self.node_kernel = self.add_kernel(name="node_kernel", shape=(node_dim, self.units)) + + self.bias = self.add_bias(name="bias") if edge_state_shape is not None: self.edge_kernel = self.add_kernel( - name='edge_kernel', shape=(edge_state_shape[-1], self.units) + name="edge_kernel", shape=(edge_state_shape[-1], self.units) ) if self.normalization: @@ -75,34 +69,27 @@ def build( self.dropout = keras.layers.Dropout(self.dropout_rate) def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: - x = inputs.copy() - node_state = x.pop('node_state') - edge_src = x['edge_src'] - edge_dst = x['edge_dst'] - edge_state = x.get('edge_state') - edge_weight = x.get('edge_weight') + node_state = x.pop("node_state") + edge_src = x["edge_src"] + edge_dst = x["edge_dst"] + edge_state = x.get("edge_state") + edge_weight = x.get("edge_weight") node_state_updated = gnn_ops.transform( - state=node_state, - kernel=self.node_kernel, - bias=self.bias + state=node_state, kernel=self.node_kernel, bias=self.bias ) if edge_state is not None: - edge_state = gnn_ops.transform( - state=edge_state, - kernel=self.edge_kernel, - bias=None - ) + edge_state = gnn_ops.transform(state=edge_state, kernel=self.edge_kernel, bias=None) node_state_updated = gnn_ops.aggregate( - node_state=node_state_updated, - edge_src=edge_src, - edge_dst=edge_dst, - edge_state=edge_state, - edge_weight=edge_weight + node_state=node_state_updated, + edge_src=edge_src, + edge_dst=edge_dst, + edge_state=edge_state, + edge_weight=edge_weight, ) if self.normalization: @@ -113,21 +100,21 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: if self.skip_connection: if self._transform_skip_connection: - node_state = gnn_ops.transform( - state=node_state, kernel=self.skip_connect_kernel - ) + node_state = gnn_ops.transform(state=node_state, kernel=self.skip_connect_kernel) node_state_updated += node_state if self.dropout_rate: node_state_updated = self.dropout(node_state_updated) return dict(node_state=node_state_updated, **x) - + def get_config(self) -> dict[str, types.Any]: config = super().get_config() - config.update({ - 'normalization': self.normalization, - 'skip_connection': self.skip_connection, - 'dropout_rate': self.dropout_rate - }) - return config \ No newline at end of file + config.update( + { + "normalization": self.normalization, + "skip_connection": self.skip_connection, + "dropout_rate": self.dropout_rate, + } + ) + return config diff --git a/molexpress/layers/gin_conv.py b/molexpress/layers/gin_conv.py index ab73901..c2470b8 100644 --- a/molexpress/layers/gin_conv.py +++ b/molexpress/layers/gin_conv.py @@ -1,4 +1,6 @@ -import keras +from __future__ import annotations + +import keras from molexpress import types from molexpress.ops import gnn_ops @@ -36,13 +38,13 @@ def __init__( kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, **kwargs, - ) + ) self.dropout_rate = dropout_rate self.skip_connection = skip_connection self.normalization = normalization def build( - self, + self, input_shape: dict[str, tuple[int, ...]] ) -> None: node_state_shape = input_shape['node_state'] @@ -51,7 +53,7 @@ def build( node_dim = node_state_shape[-1] if edge_state_shape is not None: edge_dim = edge_state_shape[-1] - + self._transform_node_state = node_dim != self.units if self._transform_node_state: @@ -81,7 +83,7 @@ def build( self.epsilon = self.add_weight( name='epsilon', shape=(), initializer='zeros' ) - + if self.normalization: self.normalize = keras.layers.BatchNormalization() @@ -101,8 +103,8 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: if edge_state is not None and self._transform_edge_state: edge_state = gnn_ops.transform( - state=edge_state, - kernel=self.special_edge_kernel, + state=edge_state, + kernel=self.special_edge_kernel, bias=None ) @@ -114,18 +116,18 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: ) node_state_updated = gnn_ops.aggregate( - node_state=node_state, - edge_src=edge_src, - edge_dst=edge_dst, - edge_state=edge_state, + node_state=node_state, + edge_src=edge_src, + edge_dst=edge_dst, + edge_state=edge_state, edge_weight=edge_weight ) - + node_state_updated += (1 + self.epsilon) * node_state node_state_updated = gnn_ops.transform( state=node_state_updated, - kernel=self.node_kernel_1, + kernel=self.node_kernel_1, bias=self.node_bias_1 ) @@ -136,7 +138,7 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: node_state_updated = gnn_ops.transform( state=node_state_updated, - kernel=self.node_kernel_2, + kernel=self.node_kernel_2, bias=self.node_bias_2 ) @@ -150,7 +152,7 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: node_state_updated = self.dropout(node_state_updated) return dict(node_state=node_state_updated, **x) - + def get_config(self) -> dict[str, types.Any]: config = super().get_config() config.update({ @@ -158,4 +160,4 @@ def get_config(self) -> dict[str, types.Any]: 'skip_connection': self.skip_connection, 'dropout_rate': self.dropout_rate }) - return config \ No newline at end of file + return config diff --git a/molexpress/layers/readout.py b/molexpress/layers/readout.py index 45b0a5e..0172728 100644 --- a/molexpress/layers/readout.py +++ b/molexpress/layers/readout.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import keras from molexpress import types -from molexpress.ops import gnn_ops +from molexpress.ops import gnn_ops class Readout(keras.layers.Layer): @@ -10,7 +12,7 @@ def __init__(self, mode: str = 'mean', **kwargs) -> None: super().__init__(**kwargs) self.mode = mode if self.mode == 'max': - self._readout_fn = keras.ops.segment_max + self._readout_fn = keras.ops.segment_max elif self.mode == 'sum': self._readout_fn = keras.ops.segment_sum else: @@ -27,5 +29,5 @@ def call(self, inputs: types.MolecularGraph) -> types.Array: data=inputs['node_state'], segment_ids=graph_indicator, num_segments=None, - sorted=False, - ) \ No newline at end of file + sorted=False, + ) diff --git a/molexpress/ops/chem_ops.py b/molexpress/ops/chem_ops.py index 9a8ff0d..c0f1b84 100644 --- a/molexpress/ops/chem_ops.py +++ b/molexpress/ops/chem_ops.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import numpy as np -from rdkit import Chem +from rdkit import Chem from molexpress import types @@ -14,7 +16,7 @@ def get_molecule( if isinstance(molecule, Chem.Mol): return molecule - string = molecule + string = molecule if string.startswith('InChI'): molecule = Chem.MolFromInchi(string, sanitize=False) @@ -28,7 +30,7 @@ def get_molecule( if flag != Chem.SanitizeFlags.SANITIZE_NONE: if not catch_errors: return None - # Sanitize molecule again, without the sanitization step that caused + # Sanitize molecule again, without the sanitization step that caused # the error previously. Unrealistic molecules might pass without an error. Chem.SanitizeMol( molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL^flag) @@ -44,7 +46,7 @@ def get_adjacency( sparse: bool = True, dtype: str = 'int32', ) -> np.ndarray | tuple[np.ndarray, np.ndarray]: - + """Computes the (sparse) adjacency matrix of the molecule""" adjacency_matrix: np.ndarray = Chem.GetAdjacencyMatrix(molecule) diff --git a/molexpress/ops/gnn_ops.py b/molexpress/ops/gnn_ops.py index 6770471..e8b92c5 100644 --- a/molexpress/ops/gnn_ops.py +++ b/molexpress/ops/gnn_ops.py @@ -1,15 +1,17 @@ -import keras +from __future__ import annotations -from molexpress import types +import keras + +from molexpress import types def transform( - state, + state, kernel: types.Variable, bias: types.Variable = None, -) -> types.Array: +) -> types.Array: """Transforms node or edge states via learnable weights. - + Args: state: The current node or edge states to be updated. @@ -17,16 +19,16 @@ def transform( The learnable kernel. bias: The learnable bias. - + Returns: A transformed node state. """ state_transformed = keras.ops.matmul(state, kernel) if bias is not None: - state_transformed += bias + state_transformed += bias return state_transformed - + def aggregate( node_state: types.Array, edge_src: types.Array, @@ -35,16 +37,16 @@ def aggregate( edge_weight: types.Array = None, ) -> types.Array: """Aggregates node states based on edges. - + Given node A with edges AB and AC, the information (states) of nodes B and C will be passed to node A. Args: - node_state: + node_state: The current state of the nodes. edge_src: The indices of the source nodes. - edge_dst: + edge_dst: The indices of the destination nodes. edge_state: Optional edge states. @@ -56,20 +58,20 @@ def aggregate( """ num_nodes = keras.ops.shape(node_state)[0] - expected_rank = 2 + expected_rank = 2 current_rank = len(keras.ops.shape(edge_src)) for _ in range(expected_rank - current_rank): edge_src = keras.ops.expand_dims(edge_src, axis=-1) edge_dst = keras.ops.expand_dims(edge_dst, axis=-1) - + node_state_src = keras.ops.take_along_axis( node_state, edge_src, axis=0 ) if edge_weight is not None: - node_state_src *= edge_weight + node_state_src *= edge_weight if edge_state is not None: - node_state_src += edge_state + node_state_src += edge_state edge_dst = keras.ops.squeeze(edge_dst, axis=-1) @@ -88,7 +90,7 @@ def segment_mean( sorted: bool = False ) -> types.Array: """Performs a mean of data based on segment indices. - + A permutation invariant reduction of the node states to obtain an encoding of the graph. @@ -110,5 +112,5 @@ def segment_mean( segment_ids=segment_ids, num_segments=num_segments, sorted=sorted - ) + ) return x / keras.ops.cast(keras.ops.bincount(segment_ids), x.dtype)[:, None] diff --git a/molexpress/types.py b/molexpress/types.py index be12a48..3315e9b 100644 --- a/molexpress/types.py +++ b/molexpress/types.py @@ -1,12 +1,14 @@ -from typing import TypedDict -from typing import Protocol -from typing import TypeVar -from typing import Any +from __future__ import annotations -from rdkit import Chem +from typing import ( + Any, # noqa: F401 + Protocol, # noqa: F401 + TypedDict, + TypeVar, +) +from rdkit import Chem -Scalar = TypeVar("Scalar") Array = TypeVar("Array") Variable = TypeVar("Variable") @@ -14,17 +16,17 @@ DType = TypeVar("DType") Molecule = Chem.Mol -Atom = Chem.Atom +Atom = Chem.Atom Bond = Chem.Bond SMILES = TypeVar("SMILES", bound=str) -InChI = TypeVar("InChI", bound=str) +InChI = TypeVar("InChI", bound=str) class MolecularGraph(TypedDict): - node_state: Array + node_state: Array edge_src: Array - edge_dst: Array - edge_state: Array | None - edge_weight: Array | None + edge_dst: Array + edge_state: Array | None + edge_weight: Array | None graph_indicator: Array | None diff --git a/pyproject.toml b/pyproject.toml index 07a6f0b..d5b3438 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: POSIX :: Linux", ] -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = ["rdkit>=2023.9.5", "keras>=3", "numpy"] [project.urls] @@ -36,6 +36,9 @@ homepage = "https://github.com/compomics/molexpress" requires = ["setuptools"] build-backend = "setuptools.build_meta" +[tool.setuptools] +packages = ["molexpress"] + [tool.isort] profile = "black" From 399af15f1a5e35fbb36b944b5d417fe2fae258cf Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 11 Apr 2024 15:03:26 +0200 Subject: [PATCH 7/9] More ruff/black formatting --- molexpress/datasets/encoders.py | 85 ++++-------- molexpress/datasets/featurizers.py | 207 +++++++++++++++++++---------- molexpress/layers/__init__.py | 2 +- molexpress/layers/base_layer.py | 92 +++++-------- molexpress/layers/gin_conv.py | 78 +++++------ molexpress/layers/readout.py | 16 +-- molexpress/ops/chem_ops.py | 19 +-- molexpress/ops/gnn_ops.py | 18 +-- 8 files changed, 246 insertions(+), 271 deletions(-) diff --git a/molexpress/datasets/encoders.py b/molexpress/datasets/encoders.py index 9deed79..fa9cfa2 100644 --- a/molexpress/datasets/encoders.py +++ b/molexpress/datasets/encoders.py @@ -3,13 +3,12 @@ import numpy as np from rdkit import Chem +from molexpress import types from molexpress.datasets import featurizers from molexpress.ops import chem_ops -from molexpress import types class MolecularGraphEncoder: - def __init__( self, atom_featurizers: list[featurizers.Featurizer], @@ -17,22 +16,16 @@ def __init__( self_loops: bool = False, ) -> None: self.node_encoder = MolecularNodeEncoder(atom_featurizers) - self.edge_encoder = MolecularEdgeEncoder( - bond_featurizers, self_loops=self_loops - ) + self.edge_encoder = MolecularEdgeEncoder(bond_featurizers, self_loops=self_loops) - def __call__( - self, - molecule: types.Molecule | types.SMILES | types.InChI - ) -> np.ndarray: + def __call__(self, molecule: types.Molecule | types.SMILES | types.InChI) -> np.ndarray: molecule = chem_ops.get_molecule(molecule) return {**self.node_encoder(molecule), **self.edge_encoder(molecule)} @staticmethod def _collate_fn( - data: list[tuple[types.MolecularGraph, np.ndarray]] + data: list[tuple[types.MolecularGraph, np.ndarray]], ) -> tuple[types.MolecularGraph, np.ndarray]: - """TODO: Not sure where to implement this collate function. Temporarily putting it here. @@ -42,37 +35,30 @@ def _collate_fn( x, y = list(zip(*data)) - num_nodes = np.array([ - graph['node_state'].shape[0] for graph in x - ]) + num_nodes = np.array([graph["node_state"].shape[0] for graph in x]) disjoint_graph = {} - disjoint_graph['node_state'] = np.concatenate([ - graph['node_state'] for graph in x - ]) + disjoint_graph["node_state"] = np.concatenate([graph["node_state"] for graph in x]) - if 'edge_state' in x[0]: - disjoint_graph['edge_state'] = np.concatenate([ - graph['edge_state'] for graph in x - ]) + if "edge_state" in x[0]: + disjoint_graph["edge_state"] = np.concatenate([graph["edge_state"] for graph in x]) - edge_src = np.concatenate([graph['edge_src'] for graph in x]) - edge_dst = np.concatenate([graph['edge_dst'] for graph in x]) - num_edges = np.array([graph['edge_src'].shape[0] for graph in x]) + edge_src = np.concatenate([graph["edge_src"] for graph in x]) + edge_dst = np.concatenate([graph["edge_dst"] for graph in x]) + num_edges = np.array([graph["edge_src"].shape[0] for graph in x]) indices = np.repeat(range(len(x)), num_edges) edge_incr = np.concatenate([[0], num_nodes[:-1]]) edge_incr = np.take_along_axis(edge_incr, indices, axis=0) - disjoint_graph['edge_src'] = edge_src + edge_incr - disjoint_graph['edge_dst'] = edge_dst + edge_incr - disjoint_graph['graph_indicator'] = np.repeat(range(len(x)), num_nodes) + disjoint_graph["edge_src"] = edge_src + edge_incr + disjoint_graph["edge_dst"] = edge_dst + edge_incr + disjoint_graph["graph_indicator"] = np.repeat(range(len(x)), num_nodes) return disjoint_graph, np.stack(y) class Composer: - """Wraps a list of featurizers. While a Featurizer encodes an atom or bond based on a single property, @@ -86,8 +72,7 @@ class Composer: def __init__(self, featurizers: list[featurizers.Featurizer]) -> None: self.featurizers = featurizers assert all( - self.featurizers[0].output_dtype == f.output_dtype - for f in self.featurizers + self.featurizers[0].output_dtype == f.output_dtype for f in self.featurizers ), "'dtype' of features need to be consistent." def __call__(self, inputs: types.Atom | types.Bond) -> np.ndarray: @@ -103,11 +88,8 @@ def output_dtype(self): class MolecularEdgeEncoder: - def __init__( - self, - featurizers: list[featurizers.Featurizer], - self_loops: bool = False + self, featurizers: list[featurizers.Featurizer], self_loops: bool = False ) -> None: self.featurizer = Composer(featurizers) self.self_loops = self_loops @@ -115,35 +97,27 @@ def __init__( self.output_dtype = self.featurizer.output_dtype def __call__(self, molecule: types.Molecule) -> np.ndarray: - - edge_src, edge_dst = chem_ops.get_adjacency( - molecule, self_loops=self.self_loops) + edge_src, edge_dst = chem_ops.get_adjacency(molecule, self_loops=self.self_loops) if self.featurizer is None: - return {'edge_src': edge_src, 'edge_dst': edge_dst} + return {"edge_src": edge_src, "edge_dst": edge_dst} if molecule.GetNumBonds() == 0: - edge_state = np.zeros( - shape=(0, self.output_dim), - dtype=self.output_dtype - ) + edge_state = np.zeros(shape=(0, self.output_dim), dtype=self.output_dtype) return { - 'edge_src': edge_src, - 'edge_dst': edge_dst, - 'edge_state': edge_state + "edge_src": edge_src, + "edge_dst": edge_dst, + "edge_state": edge_state, } bond_encodings = [] for i, j in zip(edge_src, edge_dst): - bond = molecule.GetBondBetweenAtoms(int(i), int(j)) if bond is None: assert self.self_loops, "Found a bond to be None." - bond_encoding = np.zeros( - self.output_dim + 1, dtype=self.output_dtype - ) + bond_encoding = np.zeros(self.output_dim + 1, dtype=self.output_dtype) bond_encoding[-1] = 1 else: bond_encoding = self.featurizer(bond) @@ -153,14 +127,13 @@ def __call__(self, molecule: types.Molecule) -> np.ndarray: bond_encodings.append(bond_encoding) return { - 'edge_src': edge_src, - 'edge_dst': edge_dst, - 'edge_state': np.stack(bond_encodings) + "edge_src": edge_src, + "edge_dst": edge_dst, + "edge_state": np.stack(bond_encodings), } class MolecularNodeEncoder: - def __init__( self, featurizers: list[featurizers.Featurizer], @@ -168,7 +141,5 @@ def __init__( self.featurizer = Composer(featurizers) def __call__(self, molecule: types.Molecule) -> np.ndarray: - node_encodings = np.stack([ - self.featurizer(atom) for atom in molecule.GetAtoms() - ], axis=0) - return {'node_state': np.stack(node_encodings)} + node_encodings = np.stack([self.featurizer(atom) for atom in molecule.GetAtoms()], axis=0) + return {"node_state": np.stack(node_encodings)} diff --git a/molexpress/datasets/featurizers.py b/molexpress/datasets/featurizers.py index 5bc835f..17a5a17 100644 --- a/molexpress/datasets/featurizers.py +++ b/molexpress/datasets/featurizers.py @@ -1,79 +1,148 @@ from __future__ import annotations -from abc import ABC -from abc import abstractmethod - -from rdkit.Chem import Lipinski -from rdkit.Chem import Crippen -from rdkit.Chem import rdMolDescriptors -from rdkit.Chem import rdPartialCharges +import math +from abc import ABC, abstractmethod import numpy as np -import math +from rdkit.Chem import Crippen, Lipinski, rdMolDescriptors, rdPartialCharges from molexpress import types - DEFAULT_VOCABULARY = { - 'AtomType': { - 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', - 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', - 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', - 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', - 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', - 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', - 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', - 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', - 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', - 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', - 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', - 'Rg', 'Cn' - }, - 'Hybridization': { - 's', 'sp', 'sp2', 'sp3', 'sp3d', 'sp3d2', 'unspecified' - }, - 'CIPCode': { - 'R', 'S', 'None' - }, - 'FormalCharge': { - -3, -2, -1, 0, 1, 2, 3, 4 - }, - 'TotalNumHs': { - 0, 1, 2, 3, 4 - }, - 'TotalValence': { - 0, 1, 2, 3, 4, 5, 6, 7, 8 - }, - 'NumRadicalElectrons': { - 0, 1, 2, 3 - }, - 'Degree': { - 0, 1, 2, 3, 4, 5, 6, 7, 8 - }, - 'RingSize': { - 0, 3, 4, 5, 6, 7, 8 - }, - 'BondType': { - 'single', 'double', 'triple', 'aromatic' - }, - 'Stereo': { - 'stereoe', 'stereoz', 'stereoany', 'stereonone' + "AtomType": { + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + "Rf", + "Db", + "Sg", + "Bh", + "Hs", + "Mt", + "Ds", + "Rg", + "Cn", }, + "Hybridization": {"s", "sp", "sp2", "sp3", "sp3d", "sp3d2", "unspecified"}, + "CIPCode": {"R", "S", "None"}, + "FormalCharge": {-3, -2, -1, 0, 1, 2, 3, 4}, + "TotalNumHs": {0, 1, 2, 3, 4}, + "TotalValence": {0, 1, 2, 3, 4, 5, 6, 7, 8}, + "NumRadicalElectrons": {0, 1, 2, 3}, + "Degree": {0, 1, 2, 3, 4, 5, 6, 7, 8}, + "RingSize": {0, 3, 4, 5, 6, 7, 8}, + "BondType": {"single", "double", "triple", "aromatic"}, + "Stereo": {"stereoe", "stereoz", "stereoany", "stereonone"}, } class Featurizer(ABC): - """Abstract featurizer. Featurizes a single atom or bond based on a single property. """ - def __init__( - self, - output_dim: int = None, - output_dtype: str = 'float32' - ) -> None: + def __init__(self, output_dim: int = None, output_dtype: str = "float32") -> None: self._output_dim = int(output_dim) if output_dim is not None else 1 self._output_dtype = output_dtype @@ -91,14 +160,13 @@ def output_dtype(self) -> str: class OneHotFeaturizer(Featurizer): - """Abstract one-hot featurizer.""" def __init__( self, vocab: list[str] | list[int] = None, oov: bool = False, - output_dtype: str = 'float32', + output_dtype: str = "float32", ): if not vocab: vocab = DEFAULT_VOCABULARY.get(self.__class__.__name__) @@ -109,29 +177,23 @@ def __init__( self.vocab.sort(key=lambda x: x if x is not None else "") self.oov = oov - super().__init__( - output_dim=len(self.vocab) + int(self.oov), - output_dtype=output_dtype - ) + super().__init__(output_dim=len(self.vocab) + int(self.oov), output_dtype=output_dtype) if self.oov: - self.vocab += [''] + self.vocab += [""] encodings = np.eye(self.output_dim, dtype=self.output_dtype) self.mapping = dict(zip(self.vocab, encodings)) def __call__(self, x: types.Atom | types.Bond) -> np.ndarray: feature = self.call(x) - encoding = self.mapping.get( - feature, None if not self.oov else self.mapping[''] - ) + encoding = self.mapping.get(feature, None if not self.oov else self.mapping[""]) if encoding is not None: return encoding return np.zeros([self.output_dim], dtype=self.output_dtype) class FloatFeaturizer(Featurizer): - """Abstract scalar floating point featurizer.""" def __call__(self, x: types.Atom | types.Bond) -> np.ndarray: @@ -152,7 +214,7 @@ class CIPCode(OneHotFeaturizer): def call(self, atom: types.Atom) -> str | None: if atom.HasProp("_CIPCode"): return atom.GetProp("_CIPCode") - return 'None' + return "None" class ChiralCenter(FloatFeaturizer): @@ -262,7 +324,7 @@ class GasteigerCharge(FloatFeaturizer): def call(self, atom: types.Atom) -> float: mol = atom.GetOwningMol() rdPartialCharges.ComputeGasteigerCharges(mol) - val = atom.GetDoubleProp('_GasteigerCharge') + val = atom.GetDoubleProp("_GasteigerCharge") if val is not None and math.isfinite(val): return val return 0.0 @@ -286,6 +348,5 @@ def call(self, bond: types.Bond) -> bool: class Rotatable(FloatFeaturizer): def call(self, bond: types.Bond) -> bool: mol = bond.GetOwningMol() - atom_indices = tuple( - sorted([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])) + atom_indices = tuple(sorted([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])) return atom_indices in Lipinski._RotatableBonds(mol) diff --git a/molexpress/layers/__init__.py b/molexpress/layers/__init__.py index 0d79c61..fe99911 100644 --- a/molexpress/layers/__init__.py +++ b/molexpress/layers/__init__.py @@ -1,4 +1,4 @@ from molexpress.layers.base_layer import BaseLayer from molexpress.layers.gcn_conv import GCNConv from molexpress.layers.gin_conv import GINConv -from molexpress.layers.readout import Readout \ No newline at end of file +from molexpress.layers.readout import Readout diff --git a/molexpress/layers/base_layer.py b/molexpress/layers/base_layer.py index fa8bd1f..711cfa5 100644 --- a/molexpress/layers/base_layer.py +++ b/molexpress/layers/base_layer.py @@ -6,7 +6,6 @@ class BaseLayer(keras.layers.Layer): - """Base layer.""" def __init__( @@ -14,18 +13,16 @@ def __init__( units: int, activation: keras.layers.Activation = None, use_bias: bool = True, - kernel_initializer: keras.initializers.Initializer = 'glorot_uniform', - bias_initializer: keras.initializers.Initializer = 'zeros', + kernel_initializer: keras.initializers.Initializer = "glorot_uniform", + bias_initializer: keras.initializers.Initializer = "zeros", kernel_regularizer: keras.regularizers.Regularizer = None, bias_regularizer: keras.regularizers.Regularizer = None, activity_regularizer: keras.regularizers.Regularizer = None, kernel_constraint: keras.constraints.Constraint = None, bias_constraint: keras.constraints.Constraint = None, - **kwargs + **kwargs, ) -> None: - super().__init__( - activity_regularizer=activity_regularizer, **kwargs - ) + super().__init__(activity_regularizer=activity_regularizer, **kwargs) self.units = units self.use_bias = use_bias self.activation = keras.activations.get(activation) @@ -38,86 +35,65 @@ def __init__( def get_config(self) -> dict[str, types.Any]: config = super().get_config() - config.update({ - 'units': self.units, - 'activation': keras.activations.serialize(self.activation), - 'use_bias': self.use_bias, - 'kernel_initializer': keras.initializers.serialize( - self.kernel_initializer), - 'bias_initializer': keras.initializers.serialize( - self.bias_initializer), - 'kernel_regularizer': keras.regularizers.serialize( - self.kernel_regularizer), - 'bias_regularizer': keras.regularizers.serialize( - self.bias_regularizer), - 'activity_regularizer': keras.regularizers.serialize( - self.activity_regularizer), - 'kernel_constraint': keras.constraints.serialize( - self.kernel_constraint), - 'bias_constraint': keras.constraints.serialize( - self.bias_constraint), - }) + config.update( + { + "units": self.units, + "activation": keras.activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": keras.initializers.serialize(self.kernel_initializer), + "bias_initializer": keras.initializers.serialize(self.bias_initializer), + "kernel_regularizer": keras.regularizers.serialize(self.kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self.bias_regularizer), + "activity_regularizer": keras.regularizers.serialize(self.activity_regularizer), + "kernel_constraint": keras.constraints.serialize(self.kernel_constraint), + "bias_constraint": keras.constraints.serialize(self.bias_constraint), + } + ) return config def compute_output_shape( - self, - input_shape: dict[str, tuple[int, ...]] + self, input_shape: dict[str, tuple[int, ...]] ) -> dict[str, tuple[int, ...]]: output_shape = input_shape - output_shape['node_state'] = ( - *input_shape['node_state'][:-1], self.units - ) - if input_shape['edge_state'] is not None: - output_shape['edge_state'] = ( - *input_shape['edge_state'][:-1], self.units - ) + output_shape["node_state"] = (*input_shape["node_state"][:-1], self.units) + if input_shape["edge_state"] is not None: + output_shape["edge_state"] = (*input_shape["edge_state"][:-1], self.units) return output_shape def add_kernel( - self, - name: str, - shape: tuple[int, ...], - dtype: str = 'float32', - **kwargs + self, name: str, shape: tuple[int, ...], dtype: str = "float32", **kwargs ) -> types.Variable: return self.add_weight( name=name, shape=shape, dtype=dtype, - **self._common_weight_kwargs('kernel'), + **self._common_weight_kwargs("kernel"), **kwargs, ) def add_bias( - self, - name: str, - shape: tuple[int, ...] = None, - dtype: str = 'float32', - **kwargs + self, name: str, shape: tuple[int, ...] = None, dtype: str = "float32", **kwargs ) -> types.Variable: return self.add_weight( name=name, shape=shape if shape is not None else (self.units,), dtype=dtype, - **self._common_weight_kwargs('bias'), + **self._common_weight_kwargs("bias"), **kwargs, ) - def _common_weight_kwargs( - self, - weight_type: str - ) -> dict[str, types.Any]: + def _common_weight_kwargs(self, weight_type: str) -> dict[str, types.Any]: initializer = getattr(self, f"{weight_type}_initializer", None) regularizer = getattr(self, f"{weight_type}_regularizer", None) - regularizer = None if regularizer is None else regularizer.from_config( - regularizer.get_config() + regularizer = ( + None if regularizer is None else regularizer.from_config(regularizer.get_config()) ) constraint = getattr(self, f"{weight_type}_constraint", None) - constraint = None if constraint is None else constraint.from_config( - constraint.get_config() + constraint = ( + None if constraint is None else constraint.from_config(constraint.get_config()) ) return { - 'initializer': initializer, - 'regularizer': regularizer, - 'constraint': constraint, + "initializer": initializer, + "regularizer": regularizer, + "constraint": constraint, } diff --git a/molexpress/layers/gin_conv.py b/molexpress/layers/gin_conv.py index c2470b8..599e7b2 100644 --- a/molexpress/layers/gin_conv.py +++ b/molexpress/layers/gin_conv.py @@ -3,12 +3,11 @@ import keras from molexpress import types -from molexpress.ops import gnn_ops from molexpress.layers.base_layer import BaseLayer +from molexpress.ops import gnn_ops class GINConv(BaseLayer): - def __init__( self, units: int, @@ -17,8 +16,8 @@ def __init__( normalization: bool = True, skip_connection: bool = True, dropout_rate: float = 0, - kernel_initializer: keras.initializers.Initializer = 'glorot_uniform', - bias_initializer: keras.initializers.Initializer = 'zeros', + kernel_initializer: keras.initializers.Initializer = "glorot_uniform", + bias_initializer: keras.initializers.Initializer = "zeros", kernel_regularizer: keras.regularizers.Regularizer = None, bias_regularizer: keras.regularizers.Regularizer = None, activity_regularizer: keras.regularizers.Regularizer = None, @@ -43,12 +42,9 @@ def __init__( self.skip_connection = skip_connection self.normalization = normalization - def build( - self, - input_shape: dict[str, tuple[int, ...]] - ) -> None: - node_state_shape = input_shape['node_state'] - edge_state_shape = input_shape.get('edge_state') + def build(self, input_shape: dict[str, tuple[int, ...]]) -> None: + node_state_shape = input_shape["node_state"] + edge_state_shape = input_shape.get("edge_state") node_dim = node_state_shape[-1] if edge_state_shape is not None: @@ -58,31 +54,25 @@ def build( if self._transform_node_state: self.special_node_kernel = self.add_kernel( - name='special_node_kernel', shape=(node_dim, self.units) + name="special_node_kernel", shape=(node_dim, self.units) ) node_dim = self.units - self.node_kernel_1 = self.add_kernel( - name='node_kernel_2', shape=(node_dim, self.units) - ) - self.node_kernel_2 = self.add_kernel( - name='node_kernel_2', shape=(node_dim, self.units) - ) + self.node_kernel_1 = self.add_kernel(name="node_kernel_2", shape=(node_dim, self.units)) + self.node_kernel_2 = self.add_kernel(name="node_kernel_2", shape=(node_dim, self.units)) if self.use_bias: - self.node_bias_1 = self.add_bias(name='node_bias_1') - self.node_bias_2 = self.add_bias(name='node_bias_2') + self.node_bias_1 = self.add_bias(name="node_bias_1") + self.node_bias_2 = self.add_bias(name="node_bias_2") self._transform_edge_state = edge_dim != node_dim if edge_state_shape is not None and self._transform_edge_state: self.special_edge_kernel = self.add_kernel( - name='special_edge_kernel', shape=(edge_dim, node_dim) + name="special_edge_kernel", shape=(edge_dim, node_dim) ) - self.epsilon = self.add_weight( - name='epsilon', shape=(), initializer='zeros' - ) + self.epsilon = self.add_weight(name="epsilon", shape=(), initializer="zeros") if self.normalization: self.normalize = keras.layers.BatchNormalization() @@ -91,28 +81,22 @@ def build( self.dropout = keras.layers.Dropout(self.dropout_rate) def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: - x = inputs.copy() - node_state = x.pop('node_state') - edge_src = x['edge_src'] - edge_dst = x['edge_dst'] - edge_state = x.get('edge_state') - edge_weight = x.get('edge_weight') - + node_state = x.pop("node_state") + edge_src = x["edge_src"] + edge_dst = x["edge_dst"] + edge_state = x.get("edge_state") + edge_weight = x.get("edge_weight") if edge_state is not None and self._transform_edge_state: edge_state = gnn_ops.transform( - state=edge_state, - kernel=self.special_edge_kernel, - bias=None + state=edge_state, kernel=self.special_edge_kernel, bias=None ) if self._transform_node_state: node_state = gnn_ops.transform( - state=node_state, - kernel=self.special_node_kernel, - bias=None + state=node_state, kernel=self.special_node_kernel, bias=None ) node_state_updated = gnn_ops.aggregate( @@ -120,15 +104,13 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: edge_src=edge_src, edge_dst=edge_dst, edge_state=edge_state, - edge_weight=edge_weight + edge_weight=edge_weight, ) node_state_updated += (1 + self.epsilon) * node_state node_state_updated = gnn_ops.transform( - state=node_state_updated, - kernel=self.node_kernel_1, - bias=self.node_bias_1 + state=node_state_updated, kernel=self.node_kernel_1, bias=self.node_bias_1 ) if self.normalization: @@ -137,9 +119,7 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: node_state_updated = self.activation(node_state_updated) node_state_updated = gnn_ops.transform( - state=node_state_updated, - kernel=self.node_kernel_2, - bias=self.node_bias_2 + state=node_state_updated, kernel=self.node_kernel_2, bias=self.node_bias_2 ) if self.activation is not None: @@ -155,9 +135,11 @@ def call(self, inputs: types.MolecularGraph) -> types.MolecularGraph: def get_config(self) -> dict[str, types.Any]: config = super().get_config() - config.update({ - 'normalization': self.normalization, - 'skip_connection': self.skip_connection, - 'dropout_rate': self.dropout_rate - }) + config.update( + { + "normalization": self.normalization, + "skip_connection": self.skip_connection, + "dropout_rate": self.dropout_rate, + } + ) return config diff --git a/molexpress/layers/readout.py b/molexpress/layers/readout.py index 0172728..59fa7d5 100644 --- a/molexpress/layers/readout.py +++ b/molexpress/layers/readout.py @@ -7,26 +7,24 @@ class Readout(keras.layers.Layer): - - def __init__(self, mode: str = 'mean', **kwargs) -> None: + def __init__(self, mode: str = "mean", **kwargs) -> None: super().__init__(**kwargs) self.mode = mode - if self.mode == 'max': + if self.mode == "max": self._readout_fn = keras.ops.segment_max - elif self.mode == 'sum': + elif self.mode == "sum": self._readout_fn = keras.ops.segment_sum else: self._readout_fn = gnn_ops.segment_mean def build(self, input_shape: dict[str, tuple[int, ...]]) -> None: - if 'graph_indicator' not in input_shape: - raise ValueError( - "Cannot perform readout: 'graph_indicator' not found.") + if "graph_indicator" not in input_shape: + raise ValueError("Cannot perform readout: 'graph_indicator' not found.") def call(self, inputs: types.MolecularGraph) -> types.Array: - graph_indicator = keras.ops.cast(inputs['graph_indicator'], 'int32') + graph_indicator = keras.ops.cast(inputs["graph_indicator"], "int32") return self._readout_fn( - data=inputs['node_state'], + data=inputs["node_state"], segment_ids=graph_indicator, num_segments=None, sorted=False, diff --git a/molexpress/ops/chem_ops.py b/molexpress/ops/chem_ops.py index c0f1b84..c71c810 100644 --- a/molexpress/ops/chem_ops.py +++ b/molexpress/ops/chem_ops.py @@ -10,7 +10,6 @@ def get_molecule( molecule: types.Molecule | types.SMILES | types.InChI, catch_errors: bool = False, ) -> Chem.Mol | None: - """Generates an molecule object.""" if isinstance(molecule, Chem.Mol): @@ -18,7 +17,7 @@ def get_molecule( string = molecule - if string.startswith('InChI'): + if string.startswith("InChI"): molecule = Chem.MolFromInchi(string, sanitize=False) else: molecule = Chem.MolFromSmiles(string, sanitize=False) @@ -32,34 +31,28 @@ def get_molecule( return None # Sanitize molecule again, without the sanitization step that caused # the error previously. Unrealistic molecules might pass without an error. - Chem.SanitizeMol( - molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL^flag) + Chem.SanitizeMol(molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ flag) - Chem.AssignStereochemistry( - molecule, cleanIt=True, force=True, flagPossibleStereoCenters=True) + Chem.AssignStereochemistry(molecule, cleanIt=True, force=True, flagPossibleStereoCenters=True) return molecule + def get_adjacency( molecule: types.Molecule, self_loops: bool = False, sparse: bool = True, - dtype: str = 'int32', + dtype: str = "int32", ) -> np.ndarray | tuple[np.ndarray, np.ndarray]: - """Computes the (sparse) adjacency matrix of the molecule""" adjacency_matrix: np.ndarray = Chem.GetAdjacencyMatrix(molecule) if self_loops: - adjacency_matrix += np.eye( - adjacency_matrix.shape[0], dtype=adjacency_matrix.dtype - ) + adjacency_matrix += np.eye(adjacency_matrix.shape[0], dtype=adjacency_matrix.dtype) if not sparse: return adjacency_matrix.astype(dtype) edge_src, edge_dst = np.where(adjacency_matrix) return edge_src.astype(dtype), edge_dst.astype(dtype) - - diff --git a/molexpress/ops/gnn_ops.py b/molexpress/ops/gnn_ops.py index e8b92c5..c26c654 100644 --- a/molexpress/ops/gnn_ops.py +++ b/molexpress/ops/gnn_ops.py @@ -29,6 +29,7 @@ def transform( state_transformed += bias return state_transformed + def aggregate( node_state: types.Array, edge_src: types.Array, @@ -64,9 +65,7 @@ def aggregate( edge_src = keras.ops.expand_dims(edge_src, axis=-1) edge_dst = keras.ops.expand_dims(edge_dst, axis=-1) - node_state_src = keras.ops.take_along_axis( - node_state, edge_src, axis=0 - ) + node_state_src = keras.ops.take_along_axis(node_state, edge_src, axis=0) if edge_weight is not None: node_state_src *= edge_weight @@ -76,18 +75,16 @@ def aggregate( edge_dst = keras.ops.squeeze(edge_dst, axis=-1) node_state_updated = keras.ops.segment_sum( - data=node_state_src, - segment_ids=edge_dst, - num_segments=num_nodes, - sorted=False + data=node_state_src, segment_ids=edge_dst, num_segments=num_nodes, sorted=False ) return node_state_updated + def segment_mean( data: types.Array, segment_ids: types.Array, num_segments: int = None, - sorted: bool = False + sorted: bool = False, ) -> types.Array: """Performs a mean of data based on segment indices. @@ -108,9 +105,6 @@ def segment_mean( New data that has been reduced. """ x = keras.ops.segment_sum( - data=data, - segment_ids=segment_ids, - num_segments=num_segments, - sorted=sorted + data=data, segment_ids=segment_ids, num_segments=num_segments, sorted=sorted ) return x / keras.ops.cast(keras.ops.bincount(segment_ids), x.dtype)[:, None] From 955040e1c8c85a39d9f069d900f6490c16f5fcb1 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 11 Apr 2024 15:29:55 +0200 Subject: [PATCH 8/9] Add test CI (only linting for now) --- .github/workflows/test.yml | 36 ++++++++++++++++++++++++++++++++++++ pyproject.toml | 16 +++++++++------- 2 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..53d06d3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,36 @@ +name: Tests + +on: + push: + branches: + - main + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + + - name: Run Ruff + run: ruff check --output-format=github . + + # - name: Install package + # run: pip install . + + # - name: Test with pytest + # run: | + # pytest diff --git a/pyproject.toml b/pyproject.toml index d5b3438..be7ca30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [project] name = "molexpress" -version = "0.1.0" -description = "Graph Neural Networks with Keras 3." +description = "Graph Neural Networks for molecules with Keras 3." readme = "README.md" license = { file = "LICENSE" } +dynamic = ["version"] authors = [ { name = "Alexander Kensert", email = "alexander.kensert@gmail.com" }, ] @@ -29,6 +29,9 @@ classifiers = [ requires-python = ">=3.9" dependencies = ["rdkit>=2023.9.5", "keras>=3", "numpy"] +[project.optional-dependencies] +dev = ["ruff", "isort"] + [project.urls] homepage = "https://github.com/compomics/molexpress" @@ -42,10 +45,9 @@ packages = ["molexpress"] [tool.isort] profile = "black" -[tool.black] -line-length = 99 -target-version = ['py38'] - [tool.ruff] line-length = 99 -target-version = 'py38' +target-version = 'py39' + +[tool.ruff.format] +docstring-code-format = true From 1c8b0536b2c5d28e787650971e6400ff76c483b9 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 11 Apr 2024 15:54:56 +0200 Subject: [PATCH 9/9] Fix linting errors --- .gitignore | 3 +++ molexpress/datasets/encoders.py | 1 - molexpress/layers/__init__.py | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index a61da04..eb1ab5d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ notebooks/_*.ipynb +# vscode +.vscode + # jupyter MANIFEST build diff --git a/molexpress/datasets/encoders.py b/molexpress/datasets/encoders.py index 2850a5b..7743c52 100644 --- a/molexpress/datasets/encoders.py +++ b/molexpress/datasets/encoders.py @@ -1,7 +1,6 @@ from __future__ import annotations import numpy as np -from rdkit import Chem from molexpress import types from molexpress.datasets import featurizers diff --git a/molexpress/layers/__init__.py b/molexpress/layers/__init__.py index fe99911..d48b332 100644 --- a/molexpress/layers/__init__.py +++ b/molexpress/layers/__init__.py @@ -1,4 +1,4 @@ -from molexpress.layers.base_layer import BaseLayer -from molexpress.layers.gcn_conv import GCNConv -from molexpress.layers.gin_conv import GINConv -from molexpress.layers.readout import Readout +from molexpress.layers.base_layer import BaseLayer as BaseLayer +from molexpress.layers.gcn_conv import GCNConv as GCNConv +from molexpress.layers.gin_conv import GINConv as GINConv +from molexpress.layers.readout import Readout as Readout