diff --git a/schemas/src/digital-objects/asct-b.yaml b/schemas/src/digital-objects/asct-b.yaml index 9c6591b..4643f62 100644 --- a/schemas/src/digital-objects/asct-b.yaml +++ b/schemas/src/digital-objects/asct-b.yaml @@ -41,16 +41,6 @@ classes: - ccf_pref_label - ccf_asctb_type - ccf_is_provisional - slot_usage: - ccf_pref_label: - annotations: - owl: AnnotationAssertion, AnnotationProperty - ccf_asctb_type: - annotations: - owl: AnnotationAssertion, AnnotationProperty - ccf_is_provisional: - annotations: - owl: AnnotationAssertion, AnnotationProperty AnatomicalStructure: is_a: AsctbConcept @@ -62,9 +52,6 @@ classes: syntax: "({uberon}|{fma}):\\d+|{asctb_temp}[a-zA-Z0-9-]+" interpolated: true partial_match: false - ccf_part_of: - annotations: - owl: AnnotationAssertion, AnnotationProperty annotations: owl: Class @@ -80,12 +67,6 @@ classes: syntax: "({cl}|{pcl}|{lmha}):\\d+|{asctb_temp}[a-zA-Z0-9-]+" interpolated: true partial_match: false - ccf_ct_isa: - annotations: - owl: AnnotationAssertion, AnnotationProperty - ccf_located_in: - annotations: - owl: AnnotationAssertion, AnnotationProperty annotations: owl: Class owl.template: |- @@ -117,9 +98,6 @@ classes: syntax: "{hgnc}:\\d+|{asctb_temp}[a-zA-Z0-9-]+|{hgnco}:[a-zA-Z0-9-]+" interpolated: true partial_match: false - ccf_biomarker_type: - annotations: - owl: AnnotationAssertion, AnnotationProperty annotations: owl: Class @@ -127,44 +105,42 @@ classes: slots: - members - references - slot_usage: - references: - structured_pattern: - syntax: "^https:\\/\\/doi\\.org\\/10\\.\\d+\\/.*" - interpolated: true - partial_match: false - CellTypeInstance: + AsctbRecord: mixins: - Named - Instance slots: - - ccf_located_in - - ccf_has_biomarker_set + - record_number + - anatomical_structure_list + - cell_type_list + - gene_marker_list + - protein_marker_list + - lipid_marker_list + - metabolites_marker_list + - proteoforms_marker_list + - references slot_usage: - ccf_located_in: - annotations: - owl: AnnotationAssertion, AnnotationProperty - ccf_has_biomarker_set: + references: annotations: owl: AnnotationAssertion, AnnotationProperty - BiomarkerSet: + CellMarkerDescriptor: mixins: - Named - Instance slots: - - members + - target_cell_type + - cell_type_location + - biomarker_set - references + - derived_from slot_usage: - members: + references: annotations: owl: AnnotationAssertion, AnnotationProperty - references: - structured_pattern: - syntax: "^https:\\/\\/doi\\.org\\/10\\.\\d+\\/.*" - interpolated: true - partial_match: false + derived_from: + range: AsctbRecord annotations: owl: AnnotationAssertion, AnnotationProperty @@ -173,7 +149,8 @@ classes: - anatomical_structures - cell_types - biomarkers - - cell_type_instances + - asctb_record + - cell_marker_descriptor AsctbMetadata: class_uri: dcat:Dataset @@ -224,10 +201,14 @@ slots: multivalued: true inlined_as_list: true range: Biomarker - cell_type_instances: + asctb_record: multivalued: true inlined_as_list: true - range: CellTypeInstance + range: AsctbRecord + cell_marker_descriptor: + multivalued: true + inlined_as_list: true + range: CellMarkerDescriptor members: multivalued: true range: Biomarker @@ -237,36 +218,118 @@ slots: required: false range: CharacterizingMarkerSet multivalued: true - ccf_has_biomarker_set: - required: false - range: BiomarkerSet - multivalued: true - inlined_as_list: true ccf_pref_label: required: true slot_uri: ccf:ccf_pref_label + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_asctb_type: required: true slot_uri: ccf:ccf_asctb_type + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_is_provisional: required: true slot_uri: ccf:ccf_is_provisional range: boolean + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_part_of: required: false slot_uri: ccf:ccf_part_of range: AnatomicalStructure multivalued: true + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_located_in: required: false slot_uri: ccf:ccf_located_in range: AnatomicalStructure multivalued: true + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_ct_isa: required: false slot_uri: ccf:ccf_ct_isa range: CellType multivalued: true + annotations: + owl: AnnotationAssertion, AnnotationProperty ccf_biomarker_type: required: true slot_uri: ccf:ccf_biomarker_type + annotations: + owl: AnnotationAssertion, AnnotationProperty + record_number: + required: true + slot_uri: ccf:record_number + range: integer + annotations: + owl: AnnotationAssertion, AnnotationProperty + anatomical_structure_list: + required: true + multivalued: true + range: AnatomicalStructure + slot_uri: ccf:anatomical_structure_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + cell_type_list: + required: true + multivalued: true + range: CellType + slot_uri: ccf:cell_type_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + gene_marker_list: + required: true + multivalued: true + range: Biomarker + slot_uri: ccf:gene_marker_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + protein_marker_list: + required: false + multivalued: true + range: Biomarker + slot_uri: ccf:protein_marker_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + lipid_marker_list: + required: false + multivalued: true + range: Biomarker + slot_uri: ccf:lipid_marker_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + metabolites_marker_list: + required: false + multivalued: true + range: Biomarker + slot_uri: ccf:metabolites_marker_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + proteoforms_marker_list: + required: false + multivalued: true + range: Biomarker + slot_uri: ccf:proteoforms_marker_item + annotations: + owl: AnnotationAssertion, AnnotationProperty + target_cell_type: + required: true + slot_uri: ccf:target_cell_type + range: CellType + annotations: + owl: AnnotationAssertion, AnnotationProperty + cell_type_location: + required: true + slot_uri: ccf:cell_type_location + range: AnatomicalStructure + annotations: + owl: AnnotationAssertion, AnnotationProperty + biomarker_set: + multivalued: true + range: Biomarker + slot_uri: ccf:biomarker + annotations: + owl: AnnotationAssertion, AnnotationProperty diff --git a/src/normalization/normalize-asct-b.js b/src/normalization/normalize-asct-b.js index 28fa75e..6edaea8 100644 --- a/src/normalization/normalize-asct-b.js +++ b/src/normalization/normalize-asct-b.js @@ -5,13 +5,13 @@ import { resolve } from 'path'; import sh from 'shelljs'; import { info, more, warning } from '../utils/logging.js'; import { makeASCTBData } from './asct-b-utils/api.functions.js'; +import { BM_TYPE } from './asct-b-utils/api.model.js'; import { getPatchesForAnatomicalStructure, getPatchesForBiomarker, getPatchesForCellType, isAsIdValid, isCtIdValid, - isDoiValid, isIdValid, normalizeDoi, } from './patches.js'; @@ -67,7 +67,8 @@ function normalizeData(context, data) { anatomical_structures: normalizeAsData(context, data), cell_types: normalizeCtData(context, data), biomarkers: normalizeBmData(context, data), - cell_type_instances: normalizeCtInstanceData(context, data), + asctb_record: normalizeAsctbRecord(context, data), + cell_marker_descriptor: normalizeCellMarkerDescriptor(context, data) }; } @@ -150,10 +151,15 @@ function normalizeCtData(context, data) { .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) .map(({ id, name }) => generateIdWhenEmpty(id, name)) .filter((id) => passIdFilterCriteria(context, id)); + + // Get the references const references = row.references - .filter(({ id }) => checkNotEmpty(id)) - .map(({ id }) => normalizeDoi(id)) - .filter((doi) => passDoiFilterCriteria(context, doi)); + .map((ref) => { + const refId = checkNotEmpty(ref.id) ? ref.id : "N/A"; + return checkIsDoi(refId) ? normalizeDoi(refId) : refId; + }); + + // Get the last cell type as the primary cell const last_ct = valid_ct.pop(); if (last_ct) { addCharacterizingBiomarkers(collector, last_ct, biomarkers, references); @@ -254,90 +260,120 @@ function normalizeBm(collector, { id: bm_id, name: bm_name, b_type, is_provision return collector; } -function normalizeCtInstanceData(context, data) { +function normalizeAsctbRecord(context, data) { return data.reduce((collector, row, index) => { - row.cell_types - .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) - .map(({ id, name }) => { - return { - id: generateInstanceId(context, name, index), - type_of: generateIdWhenEmpty(id, name), - label: generateInstanceLabel(context, name, index), - }; - }) - .filter(({ type_of }) => passCtIdFilterCriteria(context, type_of)) - .reduce(normalizeCtInstance, collector); + // Determine record number + const recordNumber = index + 1; - // Add ccf_located_in relationship that is between AS and CT - const valid_as = row.anatomical_structures + // Populate all valid anatomical structure concepts + const validAs = row.anatomical_structures .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) .map(({ id, name }) => generateIdWhenEmpty(id, name)) .filter((id) => passAsIdFilterCriteria(context, id)); - const valid_ct_instance = row.cell_types + + // Populate all valid cell type concepts + const validCt = row.cell_types .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) - .map(({ id, name }) => ({ id: generateIdWhenEmpty(id, name), name })) - .filter(({ id }) => passCtIdFilterCriteria(context, id)) - .map(({ name }) => generateInstanceId(context, name, index)); - // Each CT instance will be associated with all AS via the ccf_located_in relationship - valid_ct_instance.forEach((ct_instance) => { - valid_as.forEach((as) => addLocatedIn(collector, ct_instance, as)) + .map(({ id, name }) => generateIdWhenEmpty(id, name)) + .filter((id) => passCtIdFilterCriteria(context, id)); + + // Populate all valid biomarker concepts + const validBm = row.biomarkers + .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) + .map(({ id, name, b_type }) => ({ + id: generateIdWhenEmpty(id, name), + b_type + })) + .filter(({ id }) => passIdFilterCriteria(context, id)); + + // Populate all valid references + const validReferences = row.references + .map((ref) => { + const refId = checkNotEmpty(ref.id) ? ref.id : "N/A"; + return checkIsDoi(refId) ? normalizeDoi(refId) : refId; + }); + + // Collect all the items + collector.push({ + id: generateAsctbRecordId(context, recordNumber), + label: `Record ${recordNumber}`, + type_of: [`AsctbRecord`], + record_number: recordNumber, + anatomical_structure_list: validAs, + cell_type_list: validCt, + gene_marker_list: validBm.filter(({ b_type }) => b_type === BM_TYPE.G).map(getConceptId), + protein_marker_list: validBm.filter(({ b_type }) => b_type === BM_TYPE.P).map(getConceptId), + lipid_marker_list: validBm.filter(({ b_type }) => b_type === BM_TYPE.BL).map(getConceptId), + metabolites_marker_list: validBm.filter(({ b_type }) => b_type === BM_TYPE.BM).map(getConceptId), + proteoforms_marker_list: validBm.filter(({ b_type }) => b_type === BM_TYPE.BF).map(getConceptId), + references: validReferences }); + return collector; + }, []); +} - // Add has_biomarker relationship between CT and BM - const biomarkers = row.biomarkers +function normalizeCellMarkerDescriptor(context, data) { + return data.reduce((collector, row, index) => { + // Determine record number + const recordNumber = index + 1; + + // Populate all valid anatomical structure concepts + const validAs = row.anatomical_structures + .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) + .map(({ id, name }) => generateIdWhenEmpty(id, name)) + .filter((id) => passAsIdFilterCriteria(context, id)); + const lastAs = validAs.pop(); + + // Populate all valid cell type concepts + const validCt = row.cell_types + .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) + .map(({ id, name }) => ({ + id: generateIdWhenEmpty(id, name), + name + })) + .filter(({ id }) => passCtIdFilterCriteria(context, id)); + const lastCt = validCt.pop(); + + // Populate all valid biomarker concepts + const validBm = row.biomarkers .filter(({ id, name }) => checkNotEmpty(id) || checkNotEmpty(name)) .map(({ id, name }) => generateIdWhenEmpty(id, name)) .filter((id) => passIdFilterCriteria(context, id)); - const references = row.references - .filter(({ doi }) => checkNotEmpty(doi)) - .map(({ doi }) => normalizeDoi(doi)) - .filter((doi) => passDoiFilterCriteria(context, doi)); - const last_ct_instance = valid_ct_instance.pop(); - if (last_ct_instance) { - addBiomarkerSetInstances(collector, last_ct_instance, biomarkers, references); - } + + // Populate all valid references + const validReferences = row.references + .map((ref) => { + const refId = checkNotEmpty(ref.id) ? ref.id : "N/A"; + return checkIsDoi(refId) ? normalizeDoi(refId) : refId; + }); + + // Collect all the items + collector.push({ + id: generateCellMarkerDescriptorId(context, recordNumber), + label: `Cell marker descriptor for ${lastCt.name}`, + type_of: [`CellMarkerDescriptor`], + target_cell_type: lastCt.id, + cell_type_location: lastAs, + biomarker_set: validBm, + references: validReferences, + derived_from: generateAsctbRecordId(context, recordNumber) + }); return collector; }, []); } -function normalizeCtInstance(collector, { id, label, type_of }) { - const obj = { - id, - label, - type_of: [type_of], - ccf_located_in: [], - ccf_has_biomarker_set: [], - }; - collector.push(obj); - return collector; +function getConceptId({ id }) { + return id; } -function addBiomarkerSetInstances(collector, ctInstance, biomarkers, references) { - const foundInstance = collector.find((instanceInCollector) => instanceInCollector.id === ctInstance); - if (foundInstance) { - const oldHasBiomarker = foundInstance.ccf_has_biomarker_set; - const newHasBiomarker = [ - ...oldHasBiomarker, - { - id: `${ctInstance}_biomarker_set`, - label: 'Biomarker Set', - type_of: ['BiomarkerSet'], - members: removeDuplicates(biomarkers), - references: removeDuplicates(references), - }, - ]; - foundInstance.ccf_has_biomarker_set = removeDuplicates(newHasBiomarker); - } -} - -function generateInstanceId(context, ctName, ctIndex) { +function generateAsctbRecordId(context, recordNumber) { const { type: doType, name: doName, version: doVersion } = context.selectedDigitalObject; - return `${context.purlIri}${doType}/${doName}#${normalizeName(ctName)}_${doVersion}_R${ctIndex}`; + return `${context.purlIri}${doType}/${doName}/${doVersion}#R${recordNumber}`; } -function generateInstanceLabel(context, ctName, ctIndex) { - const { name: doName, version: doVersion } = context.selectedDigitalObject; - return `Instance of ${ctName} in ${doName} (${doVersion}) [R${ctIndex}]`; +function generateCellMarkerDescriptorId(context, recordNumber) { + const { type: doType, name: doName, version: doVersion } = context.selectedDigitalObject; + return `${context.purlIri}${doType}/${doName}/${doVersion}#R${recordNumber}-cell-marker-descriptor`; } function generateIdWhenEmpty(id, name) { @@ -360,6 +396,11 @@ function checkNotEmpty(str) { return str && str.trim() !== ''; } +function checkIsDoi(str) { + const doiRegex = /(10\.\d{4,9}\/[\w\-._;()/:]+)/i; + return doiRegex.test(str); +} + function removeDuplicates(array) { return Array.from(new Set(array.map(JSON.stringify)), JSON.parse); } @@ -374,8 +415,4 @@ function passAsIdFilterCriteria(context, id) { function passCtIdFilterCriteria(context, id) { return isCtIdValid(id) || !context.excludeBadValues; -} - -function passDoiFilterCriteria(context, doi) { - return isDoiValid(doi) || !context.excludeBadValues; -} +} \ No newline at end of file diff --git a/src/normalization/patches.js b/src/normalization/patches.js index c751d43..0b62b48 100644 --- a/src/normalization/patches.js +++ b/src/normalization/patches.js @@ -309,10 +309,18 @@ export function getPatchesForBiomarker() { export function normalizeDoi(doi) { let normDoi = doi.replace(/\s+/g, ''); + + // Case 1: 10.1016/j.exphem.2018.09.004 if (/^10\.\d+\/.*/.test(normDoi)) { normDoi = normDoi.replace(/^(10\.\d+\/.*)/, 'https://doi.org/$1'); - } else { - normDoi = normDoi.replace(/^(DOI|doi):\s*|^(https?:\/\/)?doi\.org\//, 'https://doi.org/'); + } + // Case 2: DOI:10.1016/j.exphem.2018.09.004 + else if (/^(?:DOI|doi):.*/.test(normDoi)) { + normDoi = normDoi.replace(/^(DOI|doi):\s*/, 'https://doi.org/'); + } + // Case 3: doi.org/10.1016/j.exphem.2018.09.004 + else if (/^doi\.org\/.*/.test(normDoi)) { + normDoi = normDoi.replace(/^doi\.org\//, 'https://doi.org/'); } return normDoi; } @@ -328,8 +336,3 @@ export function isAsIdValid(id) { export function isCtIdValid(id) { return /(CL|PCL|LMHA):\d+|https\:\/\/purl.org\/ccf\/ASCTB\-TEMP\_[a-zA-Z0-9\-]+/.test(id); } - -export function isDoiValid(doi) { - const doiString = doi.replace(/\s+/g, ''); - return /^https:\/\/doi\.org\/10\.\d+\/.*/.test(doiString); -}