From 2ca12ed4183de911ec3ce3a4619db2cdeec77be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Zasso?= Date: Fri, 24 Feb 2023 15:41:37 +0100 Subject: [PATCH] feat: implement SmilesParser to allow SMARTS features (#156) Closes: https://github.com/cheminfo/openchemlib-js/issues/151 --- __tests__/SmilesParser.js | 144 ++++++++++++++++++ __tests__/__snapshots__/library.js.snap | 11 ++ __tests__/library.js | 1 + minimal.d.ts | 3 + .../research/gwt/minimal/JSMolecule.java | 16 +- .../research/gwt/minimal/JSSmilesParser.java | 78 ++++++++++ types.d.ts | 77 +++++++++- 7 files changed, 319 insertions(+), 11 deletions(-) create mode 100644 __tests__/SmilesParser.js create mode 100644 src/com/actelion/research/gwt/minimal/JSSmilesParser.java diff --git a/__tests__/SmilesParser.js b/__tests__/SmilesParser.js new file mode 100644 index 00000000..fd8b3b22 --- /dev/null +++ b/__tests__/SmilesParser.js @@ -0,0 +1,144 @@ +'use strict'; + +const { SmilesParser, Molecule } = require('../minimal'); + +it.each([ + ['COCO', { atoms: 4 }], + ['CC(=O)O', { atoms: 4 }], +])('should parse normal SMILES %s', (smiles, { atoms }) => { + const parser = new SmilesParser(); + const mol = parser.parseMolecule(smiles); + expect(mol.isFragment()).toBe(false); + expect(mol.getAllAtoms()).toBe(atoms); +}); + +it.each([ + ['COCO', { atoms: 4 }], + ['[C,c]', { atoms: 1 }], + ['[R0]', { atoms: 1 }], +])('should parse SMARTS %s', (smarts, { atoms }) => { + const parser = new SmilesParser({ smartsMode: 'smarts' }); + const mol = parser.parseMolecule(smarts); + expect(mol.isFragment()).toBe(true); + expect(mol.getAllAtoms()).toBe(atoms); +}); + +it('should guess SMARTS', () => { + const parser = new SmilesParser({ smartsMode: 'guess' }); + const molNormal = parser.parseMolecule('COCO'); + expect(molNormal.isFragment()).toBe(false); + const molSmarts = parser.parseMolecule('[C,c]'); + expect(molSmarts.isFragment()).toBe(true); +}); + +it('should optionally not parse CACTVS', () => { + const cactvs = 
'[C;z3]'; + const parserWithCactvs = new SmilesParser({ + smartsMode: 'smarts', + }); + const molecule = parserWithCactvs.parseMolecule(cactvs); + expect(molecule.getAllAtoms()).toBe(1); + const parserWithoutCactvs = new SmilesParser({ + smartsMode: 'smarts', + noCactvs: true, + }); + expect(() => { + parserWithoutCactvs.parseMolecule(cactvs); + }).toThrow(/'z'/); +}); + +it('should optionally skip coordinate templates', () => { + const cubane = 'C12C3C4C1C5C2C3C45'; + const molecule = new Molecule(0, 0); + const parserWithTemplates = new SmilesParser(); + parserWithTemplates.setRandomSeed(1); + parserWithTemplates.parseMolecule(cubane, { molecule }); + const coords1 = molecule.getIDCoordinates(); + const parserWithoutTemplates = new SmilesParser({ + skipCoordinateTemplates: true, + }); + parserWithoutTemplates.setRandomSeed(1); + parserWithoutTemplates.parseMolecule(cubane, { molecule }); + expect(molecule.getIDCoordinates()).not.toBe(coords1); +}); + +it('should optionally make hydrogens explicit', () => { + const smiles = '[CH4]'; + const molecule = new Molecule(0, 0); + const parserWithoutExplicitH = new SmilesParser({ smartsMode: 'smarts' }); + parserWithoutExplicitH.parseMolecule(smiles, { molecule }); + expect(molecule.getAllAtoms()).toBe(1); + const parserWithExplicitH = new SmilesParser({ + smartsMode: 'smarts', + makeHydrogenExplicit: true, + }); + parserWithExplicitH.parseMolecule(smiles, { molecule }); + expect(molecule.getAllAtoms()).toBe(5); +}); + +it('should allow to set random seed', () => { + const smiles = 'C1CN2CCN1CC2'; + const parser = new SmilesParser(); + const coords1 = parser.parseMolecule(smiles).getIDCoordinates(); + const coords2 = parser.parseMolecule(smiles).getIDCoordinates(); + // TODO: Find a SMILES that goes through the random branch of coordinate invention. 
+ // expect(coords1).not.toBe(coords2); + expect(coords1).toBe(coords2); + parser.setRandomSeed(1); + const coords3 = parser.parseMolecule(smiles).getIDCoordinates(); + const coords4 = parser.parseMolecule(smiles).getIDCoordinates(); + expect(coords3).toBe(coords4); +}); + +it('should create smarts warnings', () => { + const parserWithoutWarnings = new SmilesParser({ + smartsMode: 'smarts', + }); + parserWithoutWarnings.parseMolecule('[R9]'); + expect(parserWithoutWarnings.getSmartsWarning()).toBe(''); + const parserWithWarnings = new SmilesParser({ + smartsMode: 'smarts', + createSmartsWarnings: true, + }); + parserWithWarnings.parseMolecule('[R9]'); + expect(parserWithWarnings.getSmartsWarning()).toBe( + 'Unresolved SMARTS features: R9', + ); +}); + +it('should parse into the passed molecule', () => { + const parser = new SmilesParser(); + const molecule = new Molecule(0, 0); + const mol = parser.parseMolecule('COCO', { molecule }); + expect(mol.toSmiles()).toBe('COCO'); + expect(mol).toBe(molecule); +}); + +it('should should optionally not invent coordinates', () => { + const parser = new SmilesParser(); + const molecule = new Molecule(0, 0); + parser.parseMolecule('COCO', { molecule, noCoordinates: false }); + expect(molecule.getAtomX(0)).not.toBe(0); + parser.parseMolecule('COCO', { molecule, noCoordinates: true }); + expect(molecule.getAtomX(0)).toBe(0); +}); + +it('should should optionally not parse stereo features', () => { + const parser = new SmilesParser(); + const vitaminA = 'C/C(=C\\CO)/C=C/C=C(/C)\\C=C\\C1=C(C)CCCC1(C)C'; + const molecule = new Molecule(0, 0); + parser.parseMolecule(vitaminA, { molecule, noStereo: false }); + const idCodeWithStereo = molecule.getIDCode(); + parser.parseMolecule(vitaminA, { molecule, noStereo: true }); + const idCodeWithoutStereo = molecule.getIDCode(); + expect(idCodeWithStereo).not.toBe(idCodeWithoutStereo); +}); + +it('should parse reactions', () => { + const parser = new SmilesParser(); + const reaction = 
parser.parseReaction('COCO>>COC.O'); + expect(reaction.getProducts()).toBe(1); + expect(reaction.getReactants()).toBe(1); + expect(reaction.getCatalysts()).toBe(0); + expect(reaction.toSmiles()).toBe('COCO>>COC.O'); +}); diff --git a/__tests__/__snapshots__/library.js.snap b/__tests__/__snapshots__/library.js.snap index 6cbd8f0e..71c8f651 100644 --- a/__tests__/__snapshots__/library.js.snap +++ b/__tests__/__snapshots__/library.js.snap @@ -350,6 +350,15 @@ exports[`prototype properties of SSSearcherWithIndex 1`] = ` exports[`prototype properties of SVGRenderer 1`] = `[]`; +exports[`prototype properties of SmilesParser 1`] = ` +[ + "getSmartsWarning", + "parseMolecule", + "parseReaction", + "setRandomSeed", +] +`; + exports[`prototype properties of StructureEditor 1`] = ` [ "getIDCode", @@ -719,6 +728,8 @@ exports[`static properties of SVGRenderer 1`] = ` ] `; +exports[`static properties of SmilesParser 1`] = `[]`; + exports[`static properties of StructureEditor 1`] = ` [ "addPasteHandler", diff --git a/__tests__/library.js b/__tests__/library.js index 6ce9cb4e..eb7f2e12 100644 --- a/__tests__/library.js +++ b/__tests__/library.js @@ -10,6 +10,7 @@ const minimalAPI = [ 'Reaction', 'RingCollection', 'SDFileParser', + 'SmilesParser', 'SSSearcher', 'SSSearcherWithIndex', 'Util', diff --git a/minimal.d.ts b/minimal.d.ts index edc91e32..706ef45a 100644 --- a/minimal.d.ts +++ b/minimal.d.ts @@ -8,6 +8,9 @@ export { IDepictorOptions, Reaction, SDFileParser, + ISmilesParserOptions, + ISmilesParserParseMoleculeOptions, + SmilesParser, SSSearcher, SSSearcherWithIndex, Util, diff --git a/src/com/actelion/research/gwt/minimal/JSMolecule.java b/src/com/actelion/research/gwt/minimal/JSMolecule.java index 4a6e337d..41e70bb0 100644 --- a/src/com/actelion/research/gwt/minimal/JSMolecule.java +++ b/src/com/actelion/research/gwt/minimal/JSMolecule.java @@ -41,9 +41,9 @@ public static native JSMolecule fromSmiles(String smiles, JavaScriptObject optio throws Exception /*-{ options = 
options || {}; - var coordinates = !options.noCoordinates; - var stereo = !options.noStereo; - return @com.actelion.research.gwt.minimal.JSMolecule::fromSmiles(Ljava/lang/String;ZZ)(smiles, coordinates, stereo); + var createCoordinates = !options.noCoordinates; + var readStereoFeatures = !options.noStereo; + return @com.actelion.research.gwt.minimal.JSMolecule::fromSmiles(Ljava/lang/String;ZZ)(smiles, createCoordinates, readStereoFeatures); }-*/; public static JSMolecule fromMolfile(String molfile) throws Exception { @@ -248,13 +248,11 @@ private void addImplicitHydrogens(int atomNumber) { } @JsIgnore - public static JSMolecule fromSmiles(String smiles, boolean ensure2DCoordinates, - boolean readStereoFeatures) throws Exception { + public static JSMolecule fromSmiles(String smiles, boolean createCoordinates, + boolean readStereoFeatures) throws Exception { + SmilesParser parser = new SmilesParser(); JSMolecule mol = new JSMolecule(); - new SmilesParser().parse(mol.oclMolecule, smiles.getBytes(), false, readStereoFeatures); - if (ensure2DCoordinates) { - mol.inventCoordinates(); - } + parser.parse(mol.oclMolecule, smiles.getBytes(), createCoordinates, readStereoFeatures); return mol; } diff --git a/src/com/actelion/research/gwt/minimal/JSSmilesParser.java b/src/com/actelion/research/gwt/minimal/JSSmilesParser.java new file mode 100644 index 00000000..7ec6e4bf --- /dev/null +++ b/src/com/actelion/research/gwt/minimal/JSSmilesParser.java @@ -0,0 +1,78 @@ +package com.actelion.research.gwt.minimal; + +import com.actelion.research.chem.*; +import com.google.gwt.core.client.JavaScriptObject; +import jsinterop.annotations.*; + +@JsType(name = "SmilesParser") +public class JSSmilesParser { + private SmilesParser oclParser; + + public JSSmilesParser(JavaScriptObject options) { + init(options); + } + + private native void init(JavaScriptObject options) + /*-{ + options = options || {}; + + var smartsMode = options.smartsMode || 'smiles'; + var createSmartsWarnings = 
options.createSmartsWarnings || false; + var skipCoordinateTemplates = options.skipCoordinateTemplates || false; + var makeHydrogenExplicit = options.makeHydrogenExplicit || false; + var noCactvs = options.noCactvs || false; + this.@com.actelion.research.gwt.minimal.JSSmilesParser::init(Ljava/lang/String;ZZZZ)(smartsMode, createSmartsWarnings, skipCoordinateTemplates, makeHydrogenExplicit, noCactvs); + }-*/; + + private void init(String smartsMode, boolean createSmartsWarnings, + boolean skipCoordinateTemplates, boolean makeHydrogenExplicit, + boolean noCactvs) { + int mode = SmilesParser.SMARTS_MODE_IS_SMILES; + switch (smartsMode) { + case "smarts": + mode = SmilesParser.SMARTS_MODE_IS_SMARTS; + break; + case "guess": + mode = SmilesParser.SMARTS_MODE_GUESS; + break; + } + if (makeHydrogenExplicit) { + mode |= SmilesParser.MODE_MAKE_HYDROGEN_EXPLICIT; + } + if (skipCoordinateTemplates) { + mode |= SmilesParser.MODE_SKIP_COORDINATE_TEMPLATES; + } + if (noCactvs) { + mode |= SmilesParser.MODE_NO_CACTUS_SYNTAX; + } + oclParser = new SmilesParser(mode, createSmartsWarnings); + } + + public void setRandomSeed(int seed) { + oclParser.setRandomSeed((long)seed); + } + + public native JSMolecule parseMolecule(String smiles, JavaScriptObject options) + /*-{ + options = options || {}; + + var molecule = options.molecule || @com.actelion.research.gwt.minimal.JSMolecule::new()(); + var createCoordinates = !options.noCoordinates; + var readStereoFeatures = !options.noStereo; + return this.@com.actelion.research.gwt.minimal.JSSmilesParser::parseMolecule(Lcom/actelion/research/gwt/minimal/JSMolecule;Ljava/lang/String;ZZ)(molecule, smiles, createCoordinates, readStereoFeatures); + }-*/; + + private JSMolecule parseMolecule(JSMolecule molecule, String smiles, + boolean createCoordinates, boolean readStereoFeatures) throws Exception { + oclParser.parse(molecule.getStereoMolecule(), smiles.getBytes(), createCoordinates, readStereoFeatures); + return molecule; + } + + public 
JSReaction parseReaction(String smiles) throws Exception { + return new JSReaction(oclParser.parseReaction(smiles)); + } + + public String getSmartsWarning() { + return oclParser.getSmartsWarning(); + } +} diff --git a/types.d.ts b/types.d.ts index fcce8bc3..2c03581e 100644 --- a/types.d.ts +++ b/types.d.ts @@ -2,12 +2,14 @@ export interface IMoleculeFromSmilesOptions { /** - * Disable extra coordinate computation. Default: false. + * Disable coordinate invention. + * @default `false` */ noCoordinates?: boolean; /** - * Disable stereo features parsing. Default: false. + * Disable stereo features parsing. + * @default `false` */ noStereo?: boolean; } @@ -2573,6 +2575,77 @@ export declare class Molecule { setAssignParitiesToNitrogen(b: boolean): void; } +export interface ISmilesParserOptions { + /** + * Enable SMARTS parsing with `'smarts'` or `'guess'`. + * @default `'smiles'` + */ + smartsMode?: 'smiles' | 'smarts' | 'guess'; + + createSmartsWarnings?: boolean; + + skipCoordinateTemplates?: boolean; + + makeHydrogenExplicit?: boolean; + + /** + * Disable parsing of CACTVS syntax. + */ + noCactvs?: boolean; +} + +export interface ISmilesParserParseMoleculeOptions { + /** + * Molecule to parse into. + */ + molecule?: Molecule; + + /** + * Disable coordinate invention. + * @default `false` + */ + noCoordinates?: boolean; + + /** + * Disable stereo features parsing. + * @default `false` + */ + noStereo?: boolean; +} + +export declare class SmilesParser { + /** + * Create a SMILES parser. + */ + constructor(options?: ISmilesParserOptions); + + /** + * Set the random seed used to invent coordinates. + * @param seed + */ + setRandomSeed(seed: number): void; + + /** + * Parse a SMILES string and return a molecule. + */ + parseMolecule( + smiles: string, + options?: ISmilesParserParseMoleculeOptions, + ): Molecule; + + /** + * Parse a SMILES string and return a reaction. 
+ */ + parseReaction(smiles: string): Reaction; + + /** + * If `createSmartsWarnings` was set to `true` in the constructor options, this method + * returns a message describing the SMARTS features that could not be interpreted in the + * most recently parsed SMILES/SMARTS pattern. + */ + getSmartsWarning(): string; +} + export interface MolecularFormula { absoluteWeight: number;