diff --git a/README.md b/README.md index 0193076..3f2c3e8 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,10 @@ Running SDFEater without parameters displays help. ## Output formats You can specify the output format using `-f,--format`. Available output formats: -* `cypher` - [Cypher](https://neo4j.com/developer/cypher-query-language/) compound, atoms, bonds and relation ready to [import to the Neo4j graph database](https://neo4j.com/developer/kb/export-sub-graph-to-cypher-and-import/), +* `cypher` - [Cypher](https://neo4j.com/developer/cypher-query-language/) molecule, atoms, bonds and relation ready to [import to the Neo4j graph database](https://neo4j.com/developer/kb/export-sub-graph-to-cypher-and-import/), * `cvme` - [CVME](http://cs.aalto.fi/en/current/events/2017-09-22-002/) file format based on SKOS, -* `smiles` - plain text SMILES (if available in the compound property) -* `inchi` - plain text InChI (if available in the compound property) +* `smiles` - plain text SMILES (if available in the molecule property) +* `inchi` - plain text InChI (if available in the molecule property) * `turtle` - [Terse RDF Triple Language](https://www.w3.org/TR/turtle/) (based on [MolecularEntitly](https://bioschemas.org/types/MolecularEntity/) type) * `ntriples` - [N-Triples](https://www.w3.org/TR/n-triples/) (based on [MolecularEntitly](https://bioschemas.org/types/MolecularEntity/) type) * `rdfxml` - [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) (based on [MolecularEntitly](https://bioschemas.org/types/MolecularEntity/) type) diff --git a/src/main/java/pl/edu/uwb/ii/sdfeater/File.java b/src/main/java/pl/edu/uwb/ii/sdfeater/File.java index 01525f8..b4f5647 100644 --- a/src/main/java/pl/edu/uwb/ii/sdfeater/File.java +++ b/src/main/java/pl/edu/uwb/ii/sdfeater/File.java @@ -58,7 +58,7 @@ class File { * Reads and retrieves data from the input file and then writes it to the * appropriate program structures * - * @param c Compound object to which values from the file will be entered + * @param molecule Molecule object to which values from the file will be entered * @param format Output format: c - Cypher, r - cvme, s - smiles, n - inchi * @param urls Try to generate full database URLs instead of IDs * (true/false) @@ -66,7 +66,7 @@ class File { * cypher format (true/false) * */ - void parse(Compound c, char format, boolean urls, boolean periodic) { + void parse(Molecule molecule, char format, boolean urls, boolean periodic) { try { FileInputStream fstream = new FileInputStream(filename); BufferedReader br = new BufferedReader(new InputStreamReader(fstream)); @@ -110,12 +110,12 @@ void parse(Compound c, char format, boolean urls, boolean periodic) { tokens = strLine.split("\\s+"); if (tokens.length == 16) { - c.atoms.add(new Atom(tokens[3], Float.parseFloat(tokens[0]), Float.parseFloat(tokens[1]), Float.parseFloat(tokens[2]))); + molecule.atoms.add(new Atom(tokens[3], Float.parseFloat(tokens[0]), Float.parseFloat(tokens[1]), Float.parseFloat(tokens[2]))); } // V2000, V3000; comment text exclusion if ((tokens.length == 7 && !tokens[6].startsWith("V") && isInt(tokens[0]) || tokens.length == 6 && isInt(tokens[0]))) { - c.bonds.add(new Bond(Integer.parseInt(tokens[0]), Byte.parseByte(tokens[2]), Integer.parseInt(tokens[1]), Byte.parseByte(tokens[3]))); + molecule.bonds.add(new Bond(Integer.parseInt(tokens[0]), Byte.parseByte(tokens[2]), Integer.parseInt(tokens[1]), Byte.parseByte(tokens[3]))); } } else if (molfileReady && !strLine.matches("M\\s+\\w+.*")) { // SDF file parse @@ -125,42 +125,42 @@ void parse(Compound c, char format, boolean urls, boolean periodic) { } else if (strLine.startsWith("$$$$")) { switch (format) { case 'c': - c.printCypherCompound(); + molecule.printCypherMolecule(); if (periodic) { - c.printCypherAtomsWithPeriodicTableData(); + molecule.printCypherAtomsWithPeriodicTableData(); } else { - c.printCypherAtoms(); + molecule.printCypherAtoms(); } - c.printCypherBonds(); + molecule.printCypherBonds(); System.out.println(';'); break; case 'r': - c.printChemSKOSCompound(); - c.printChemSKOSAtomsAndBonds(); + molecule.printChemSKOSMolecule(); + molecule.printChemSKOSAtomsAndBonds(); break; case 's': - c.printSMILES(); + molecule.printSMILES(); break; case 'i': - c.printInChI(); + molecule.printInChI(); break; case 't': case 'n': case 'j': case 'x': case 'h': - c.addToJenaModel(); + molecule.addToJenaModel(); break; case 'a': - c.printRDFaCompound(); + molecule.printRDFaMolecule(); break; case 'm': - c.printMicrodataCompound(); + molecule.printMicrodataMolecule(); break; default: break; } - c.clearAll(); + molecule.clearAll(); molfileReady = false; //} else if (strLine.isEmpty()) { } else if (!strLine.isEmpty()) { @@ -168,104 +168,104 @@ void parse(Compound c, char format, boolean urls, boolean periodic) { // Database links switch (pName) { case "Agricola Citation Links": - c.addPropertyByName(pName, "https://agricola.nal.usda.gov/cgi-bin/Pwebrecon.cgi?Search_Arg=" + strLine + "&DB=local&CNT=25&Search_Code=GKEY%5E&STARTDB=AGRIDB"); + molecule.addPropertyByName(pName, "https://agricola.nal.usda.gov/cgi-bin/Pwebrecon.cgi?Search_Arg=" + strLine + "&DB=local&CNT=25&Search_Code=GKEY%5E&STARTDB=AGRIDB"); break; case "ArrayExpress Database Links": - c.addPropertyByName(pName, "https://www.ebi.ac.uk/arrayexpress/experiments/" + strLine); + molecule.addPropertyByName(pName, "https://www.ebi.ac.uk/arrayexpress/experiments/" + strLine); break; case "BioModels Database Links": - c.addPropertyByName(pName, "https://www.ebi.ac.uk/biomodels-main/" + strLine); + molecule.addPropertyByName(pName, "https://www.ebi.ac.uk/biomodels-main/" + strLine); break; case "ChEBI ID": - c.addPropertyByName(pName, "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=" + strLine.substring(6)); + molecule.addPropertyByName(pName, "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=" + strLine.substring(6)); break; case "DrugBank Database Links": - c.addPropertyByName(pName, "https://www.drugbank.ca/drugs/" + strLine); + molecule.addPropertyByName(pName, "https://www.drugbank.ca/drugs/" + strLine); break; case "ECMDB Database Links": - c.addPropertyByName(pName, "http://ecmdb.ca/compounds/" + strLine); + molecule.addPropertyByName(pName, "http://ecmdb.ca/compounds/" + strLine); break; case "HMDB Database Links": // metabolites - c.addPropertyByName(pName, "http://www.hmdb.ca/metabolites/" + strLine); + molecule.addPropertyByName(pName, "http://www.hmdb.ca/metabolites/" + strLine); break; case "IntAct Database Links": - c.addPropertyByName(pName, "https://www.ebi.ac.uk/intact/interaction/" + strLine); + molecule.addPropertyByName(pName, "https://www.ebi.ac.uk/intact/interaction/" + strLine); break; case "IntEnz Database Links": strLine = strLine.replaceAll(" ", "+"); - c.addPropertyByName(pName, "http://www.ebi.ac.uk/intenz/query?q=" + strLine); + molecule.addPropertyByName(pName, "http://www.ebi.ac.uk/intenz/query?q=" + strLine); break; case "KEGG COMPOUND Database Links": - c.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?cpd:" + strLine); + molecule.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?cpd:" + strLine); break; case "KEGG DRUG Database Links": - c.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?dr:" + strLine); + molecule.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?dr:" + strLine); break; case "KEGG GLYCAN Database Links": - c.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?gl:" + strLine); + molecule.addPropertyByName(pName, "http://www.genome.jp/dbget-bin/www_bget?gl:" + strLine); break; case "KNApSAcK Database Links": - c.addPropertyByName(pName, "http://kanaya.naist.jp/knapsack_jsp/information.jsp?word=" + strLine); + molecule.addPropertyByName(pName, "http://kanaya.naist.jp/knapsack_jsp/information.jsp?word=" + strLine); break; case "LIPID MAPS instance Database Links": - c.addPropertyByName(pName, "http://www.lipidmaps.org/data/LMSDRecord.php?LMID=" + strLine); + molecule.addPropertyByName(pName, "http://www.lipidmaps.org/data/LMSDRecord.php?LMID=" + strLine); break; case "MetaCyc Database Links": - c.addPropertyByName(pName, "https://metacyc.org/compound?orgid=META&id=" + strLine); + molecule.addPropertyByName(pName, "https://metacyc.org/compound?orgid=META&id=" + strLine); break; case "Patent Database Links": - c.addPropertyByName(pName, "https://worldwide.espacenet.com/searchResults?query=" + strLine); + molecule.addPropertyByName(pName, "https://worldwide.espacenet.com/searchResults?query=" + strLine); break; case "PDBeChem Database Links": - c.addPropertyByName(pName, "http://www.ebi.ac.uk/pdbe-srv/pdbechem/chemicalCompound/show/" + strLine); + molecule.addPropertyByName(pName, "http://www.ebi.ac.uk/pdbe-srv/pdbechem/chemicalCompound/show/" + strLine); break; case "PubChem Database Links": // custom key value for compound and substance links switch (strLine.substring(0, 3)) { case "CID": - c.addPropertyByName("PubChem Database Compound Links", "https://pubchem.ncbi.nlm.nih.gov/compound/" + strLine.substring(5)); + molecule.addPropertyByName("PubChem Database Molecule Links", "https://pubchem.ncbi.nlm.nih.gov/compound/" + strLine.substring(5)); break; case "SID": - c.addPropertyByName("PubChem Database Substance Links", "https://pubchem.ncbi.nlm.nih.gov/substance/" + strLine.substring(5)); + molecule.addPropertyByName("PubChem Database Substance Links", "https://pubchem.ncbi.nlm.nih.gov/substance/" + strLine.substring(5)); break; } break; case "PubMed Central Citation Links": - c.addPropertyByName(pName, "https://www.ncbi.nlm.nih.gov/pmc/articles/" + strLine + "/"); + molecule.addPropertyByName(pName, "https://www.ncbi.nlm.nih.gov/pmc/articles/" + strLine + "/"); break; case "PubMed Citation Links": - c.addPropertyByName(pName, "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + strLine); + molecule.addPropertyByName(pName, "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + strLine); break; case "Reactome Database Links": - c.addPropertyByName(pName, "https://reactome.org/content/detail/" + strLine); + molecule.addPropertyByName(pName, "https://reactome.org/content/detail/" + strLine); break; case "RESID Database Links": - c.addPropertyByName(pName, "http://pir.georgetown.edu/cgi-bin/resid?id=" + strLine); + molecule.addPropertyByName(pName, "http://pir.georgetown.edu/cgi-bin/resid?id=" + strLine); break; case "Rhea Database Links": - c.addPropertyByName(pName, "https://www.rhea-db.org/reaction?id=" + strLine); + molecule.addPropertyByName(pName, "https://www.rhea-db.org/reaction?id=" + strLine); break; case "SABIO-RK Database Links": - c.addPropertyByName(pName, "http://sabio.h-its.org/reacdetails.jsp?reactid=" + strLine); + molecule.addPropertyByName(pName, "http://sabio.h-its.org/reacdetails.jsp?reactid=" + strLine); break; case "UM-BBD compID Database Links": - c.addPropertyByName(pName, "http://eawag-bbd.ethz.ch/servlets/pageservlet?ptype=c&compID=" + strLine); + molecule.addPropertyByName(pName, "http://eawag-bbd.ethz.ch/servlets/pageservlet?ptype=c&compID=" + strLine); break; case "UniProt Database Links": - c.addPropertyByName(pName, "https://www.uniprot.org/uniprot/" + strLine); + molecule.addPropertyByName(pName, "https://www.uniprot.org/uniprot/" + strLine); break; case "Wikipedia Database Links": - c.addPropertyByName(pName, "https://en.wikipedia.org/wiki/" + strLine); + molecule.addPropertyByName(pName, "https://en.wikipedia.org/wiki/" + strLine); break; case "YMDB Database Links": - c.addPropertyByName(pName, "http://www.ymdb.ca/compounds/" + strLine); + molecule.addPropertyByName(pName, "http://www.ymdb.ca/compounds/" + strLine); break; default: - c.addPropertyByName(pName, strLine); + molecule.addPropertyByName(pName, strLine); } } else { - c.addPropertyByName(pName, strLine); + molecule.addPropertyByName(pName, strLine); } } } diff --git a/src/main/java/pl/edu/uwb/ii/sdfeater/Compound.java b/src/main/java/pl/edu/uwb/ii/sdfeater/Molecule.java similarity index 97% rename from src/main/java/pl/edu/uwb/ii/sdfeater/Compound.java rename to src/main/java/pl/edu/uwb/ii/sdfeater/Molecule.java index 76b1685..4910b6b 100644 --- a/src/main/java/pl/edu/uwb/ii/sdfeater/Compound.java +++ b/src/main/java/pl/edu/uwb/ii/sdfeater/Molecule.java @@ -40,18 +40,22 @@ import static pl.edu.uwb.ii.sdfeater.SDFEater.periodic_table_data; /** - * Class that stores information about chemical compound + * Class that stores information about chemical molecule * - * @author Łukasz Szeremeta 2017-2018 + * @author Łukasz Szeremeta 2017-2019 * @author Dominik Tomaszuk 2017-2018 */ -class Compound { +class Molecule { /** * Consts for UUID */ private static final byte STRIKE = 0; private static final byte UNDERLINE = 1; + + /** + * Incremental ID + */ private static final AtomicLong idCounter = new AtomicLong(); /** * Stores atoms data @@ -63,21 +67,24 @@ class Compound { */ final List bonds = new ArrayList<>(); /** - * Stores all properties of the chemical compound + * Stores all properties of the chemical molecule */ private final Map> properties = new HashMap<>(); private UUID uuid; - Compound() { + Molecule() { uuid = UUID.randomUUID(); } + /** + * Create incremental ID + */ private static String createID() { return String.valueOf(idCounter.getAndIncrement()); } /** - * Set compound property name + * Set molecule property name * * @param propertyName property name (key) */ @@ -154,11 +161,11 @@ List getPropertiesByName(String propertyName) { } /** - * Print main compound data in Cypher + * Print main molecule data in Cypher */ - void printCypherCompound() { + void printCypherMolecule() { StringBuilder val_tmp = new StringBuilder(); - StringBuilder query_str = new StringBuilder("CREATE (c" + addUUID(UNDERLINE) + ":Compound {"); + StringBuilder query_str = new StringBuilder("CREATE (c" + addUUID(UNDERLINE) + ":Molecule {"); for (Map.Entry> entry : properties.entrySet()) { String key = entry.getKey(); @@ -185,9 +192,9 @@ void printCypherCompound() { } /** - * Print main compound data in CVME + * Print main molecule data in CVME */ - void printChemSKOSCompound() { + void printChemSKOSMolecule() { StringBuilder val_tmp = new StringBuilder(); StringBuilder query_str = new StringBuilder(); @@ -303,7 +310,7 @@ void printChemSKOSCompound() { } else if ("Patent Database Links".equals(key)) { String value = values.get(0); query_str.append(" cvme:patent ").append(printValueAsNumberOrStringCVME(value)).append(" .\n"); - } else if ("PubChem Database Compound Links".equals(key)) { + } else if ("PubChem Database Molecule Links".equals(key)) { String value = values.get(0); query_str.append(" rdfs:seeAlso ").append(printValueAsNumberOrStringCVME(value)).append(" .\n"); } else if ("PubChem Database Substance Links".equals(key)) { @@ -316,7 +323,7 @@ void printChemSKOSCompound() { } /** - * Add main compound data to Jena model + * Add main molecule data to Jena model */ void addToJenaModel() { Resource me = ResourceFactory.createResource(); @@ -372,9 +379,9 @@ void addToJenaModel() { } /** - * Print main compound data in RDFa + * Print main molecule data in RDFa */ - void printRDFaCompound() { + void printRDFaMolecule() { StringBuilder output_str = new StringBuilder(); for (Map.Entry> entry : properties.entrySet()) { String key = entry.getKey(); @@ -422,9 +429,9 @@ void printRDFaCompound() { } /** - * Print main compound data in Microdata + * Print main molecule data in Microdata */ - void printMicrodataCompound() { + void printMicrodataMolecule() { StringBuilder output_str = new StringBuilder(); for (Map.Entry> entry : properties.entrySet()) { String key = entry.getKey(); @@ -653,7 +660,7 @@ void printInChI() { } /** - * Print atoms data and Compound-Atom relations in Cypher + * Print atoms data and Molecule-Atom relations in Cypher */ void printCypherAtoms() { if (!atoms.isEmpty()) { @@ -663,12 +670,12 @@ void printCypherAtoms() { it++; } - printCypherCompoundAtomRelation(); + printCypherMoleculeAtomRelation(); } } /** - * Print atoms data with additional periodic table data and Compound-Atom + * Print atoms data with additional periodic table data and Molecule-Atom * relations in Cypher */ void printCypherAtomsWithPeriodicTableData() { @@ -699,7 +706,7 @@ void printCypherAtomsWithPeriodicTableData() { } System.out.print(str); - printCypherCompoundAtomRelation(); + printCypherMoleculeAtomRelation(); } } @@ -714,9 +721,9 @@ private Map getAtomPeriodicDataByAtomSymbol(String symbol) { } /** - * Print Compound-Atom relations in Cypher + * Print Molecule-Atom relations in Cypher */ - private void printCypherCompoundAtomRelation() { + private void printCypherMoleculeAtomRelation() { if (!atoms.isEmpty()) { StringBuilder query_str = new StringBuilder("CREATE"); @@ -840,7 +847,7 @@ private String addUUID(byte dash) { } /** - * Prepare program structures for new compound + * Prepare program structures for new molecule */ void clearAll() { properties.clear(); diff --git a/src/main/java/pl/edu/uwb/ii/sdfeater/SDFEater.java b/src/main/java/pl/edu/uwb/ii/sdfeater/SDFEater.java index 74d3c37..2eb388c 100644 --- a/src/main/java/pl/edu/uwb/ii/sdfeater/SDFEater.java +++ b/src/main/java/pl/edu/uwb/ii/sdfeater/SDFEater.java @@ -80,7 +80,7 @@ private static void initializeJenaModel() { * @param args the command line arguments */ public static void main(String[] args) { - Compound c = new Compound(); + Molecule molecule = new Molecule(); Options options = new Options(); Option input = new Option("i", "input", true, "input file path"); input.setRequired(true); @@ -106,40 +106,40 @@ public static void main(String[] args) { if (format.equalsIgnoreCase("cypher")) { if (cmd.hasOption("urls") && cmd.hasOption("periodic")) { loadPeriodicTableData(); - file.parse(c, 'c', true, true); + file.parse(molecule, 'c', true, true); } else if (!cmd.hasOption("urls") && cmd.hasOption("periodic")) { loadPeriodicTableData(); - file.parse(c, 'c', false, true); + file.parse(molecule, 'c', false, true); } else if (cmd.hasOption("urls") && !cmd.hasOption("periodic")) { - file.parse(c, 'c', true, false); + file.parse(molecule, 'c', true, false); } else if (!cmd.hasOption("urls") && !cmd.hasOption("periodic")) { - file.parse(c, 'c', false, false); + file.parse(molecule, 'c', false, false); } } else if (format.equalsIgnoreCase("cvme")) { - file.parse(c, 'r', true, false); + file.parse(molecule, 'r', true, false); } else if (format.equalsIgnoreCase("smiles")) { - file.parse(c, 's', false, false); + file.parse(molecule, 's', false, false); } else if (format.equalsIgnoreCase("inchi")) { - file.parse(c, 'i', false, false); + file.parse(molecule, 'i', false, false); } else if (format.equalsIgnoreCase("turtle")) { initializeJenaModel(); - file.parse(c, 't', false, false); + file.parse(molecule, 't', false, false); } else if (format.equalsIgnoreCase("ntriples")) { initializeJenaModel(); - file.parse(c, 'n', false, false); + file.parse(molecule, 'n', false, false); } else if (format.equalsIgnoreCase("jsonld")) { initializeJenaModel(); - file.parse(c, 'j', false, false); + file.parse(molecule, 'j', false, false); } else if (format.equalsIgnoreCase("rdfxml")) { initializeJenaModel(); - file.parse(c, 'x', false, false); + file.parse(molecule, 'x', false, false); } else if (format.equalsIgnoreCase("rdfthrift")) { initializeJenaModel(); - file.parse(c, 'h', false, false); + file.parse(molecule, 'h', false, false); } else if (format.equalsIgnoreCase("rdfa")) { - file.parse(c, 'a', false, false); + file.parse(molecule, 'a', false, false); } else if (format.equalsIgnoreCase("microdata")) { - file.parse(c, 'm', false, false); + file.parse(molecule, 'm', false, false); } }