diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java index 1b2d5b1b0..fa4f2002f 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java @@ -1,6 +1,7 @@ package uk.ac.ebi.rdf2json.annotators; import java.util.Set; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,12 +37,38 @@ public static void annotateShortForms(OntologyGraph graph) { } String shortForm = extractShortForm(graph, ontologyBaseUris, preferredPrefix, c.uri); - String curie = shortForm.replaceFirst("_", ":"); + + /* + CURIEs are derived from the short form by the following rules: + If the short form is the preferred prefix followed by an underscore "_" and a suffix containing no underscores, then replace the first underscore with a colon ":" (NOTE(review): if the preferred prefix itself contains "_", that underscore is the one replaced, not the separator - confirm this is intended) + If there is only one underscore "_" AND the characters after the underscore are all digits, then replace the underscore with a colon ":" + If none of the other rules apply (e.g. only one underscore "_" followed by characters that are not all digits), then keep the CURIE the same as the short form + If there are multiple underscores AND only digits after the last underscore, then replace the last underscore with a colon ":" + */ + String curie; + // Pattern for: preferredPrefix, then an underscore, then a suffix with no underscores + String preferredPrefixPattern = "^(?:" + Pattern.quote(preferredPrefix) + ")_([^_]+)$"; + // Pattern for: single underscore, suffix is all digits + String singleUnderscoreDigitsPattern = "^[^_]+_(\\d+)$"; + // Pattern for: anything, then an underscore, then an all-digit suffix (only tested after the single-underscore pattern has failed) + String multipleUnderscoresDigitsPattern = "^(.*)_(\\d+)$"; + if (shortForm.matches(preferredPrefixPattern)) { + curie = shortForm.replaceFirst("_", ":"); + } else if (shortForm.matches(singleUnderscoreDigitsPattern)) { + curie = 
shortForm.replaceFirst("_", ":"); + } else if (shortForm.matches(multipleUnderscoresDigitsPattern)) { + // Multiple underscores, suffix is digits + // Replace the last underscore with a colon + curie = shortForm.replaceFirst("_(?=\\d+$)", ":"); + } else { + // No transformation needed + curie = shortForm; + } c.properties.addProperty("shortForm", PropertyValueLiteral.fromString(shortForm)); c.properties.addProperty("curie", PropertyValueLiteral.fromString(curie)); - } } + } long endTime3 = System.nanoTime(); logger.info("annotate short forms: {}", ((endTime3 - startTime3) / 1000 / 1000 / 1000));