diff --git a/Jenkinsfile b/Jenkinsfile
index a831986..8b6b92e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2,7 +2,7 @@ pipeline {
     agent {
         docker {
             reuseNode false
-            image 'caufieldjh/ubuntu20-python-3-8-5-dev:4-with-dbs-v6'
+            image 'caufieldjh/ubuntu20-python-3-9-14-dev:2'
         }
     }
     // No scheduled builds for now
@@ -14,10 +14,7 @@ pipeline {
         S3BUCKETNAME = 'kg-hub-public-data'
         S3PROJECTDIR = 'kg-bioportal' // no trailing slash
         MERGEDKGNAME_BASE = "kg_bioportal"
-        MERGEDKGNAME_GENERIC = "merged-kg"
-        // Ontologies to merge in this run, if not using --merge_all flag
-        ONTOSET = 'CCONT,GRO-CPGA,STY,HP,PMO,CDPEO,GRO-CPD,ISO19115CC,TEDDY,NMOBR,IDQA,RDFS,LUNGMAP_M_CELL,PCO,ISSVA,IOBC,APADISORDERS,TESTEX,ONL-DP,XEO,EXTRACT,CHEMINF,ECSO,FAST-GENREFORM,VODANAKENYA,CTX,ISO19115DI,CARO,TEO,COMODI,IRD,OGDI,VEO,OHPI,GEXO,CIDO,GMM,RNAO,BCTT,MADS-RDF,GAZ,OBA,OSM,TRANS,BP-METADATA,PE,PCMO,UO,NMR,NEOMARK3,EVI,MCHVODANATERMS,EO1,APACOMPUTER,ICECI,DISDRIV,ONTONEO,ENM,ONTODM-CORE,UBERON,ISO19115TCC,SBO,CU-VO,SHR,ETHOPD,SPO,HOIP,ISO19115ROLES,DCT,WETAXTOPICS,PECO,IRDG,SEQ,HL7,SEDI,CASE-BASE-ONTO,AHOL,AD-DROP,TM-CONST,MATR,APATANDT,BCO,FLYGLYCODB,RXNORM,HOOM,HIO,PTS,CRISP,OCMR,TAXRANK,OMO,SO,ODNAE,ROCKNROLLTEST,GO,OBI,FOBI,PLANA,HIVO004,AGROMOP,ONTOPBM,ADMO,PCAO,EDAM,BE,ONE,CODO,FOVT,OCE,OFSMR,OMIM,KISAO,NOMEN,DEB,HCDR,ID-AMR,DERMLEX,BTO_ONTOLOGY,OBOREL,MOC,ALLERGYDETECTOR,ADALAB,MS,RDL,AERO,TML,MATRCOMPOUND,CEDARVS,PACO,MEGO,BRSO,TGMA,RPO,EHDAA2,GENO,MCBCC,HAMIDEHSGH,RNPRIO,FAST-TITLE,CWD,VODANA-MIGRANTS,AMINO-ACID,INTO,TADS,RCTONT,MIM,SITBAC,PP,OM,DLORO,ETANC,SIO,IMGT-ONTOLOGY,CLO,RVO,APO,HMIS033B,RXNO,MOOCCUADO,KENYAANC,UPA,EXO,OBS,SYMP,IBD,IAML-MOP,OBOE-SBC,EPO,FIX,OLATDV,OA,CONTSONTO,SNOMEDCT,NCBITAXON,ERO,ISO-ANNOTATIONS,BRCT,HRDO,MAMO,CHEAR,BCGO,RADLEX,MATRROCKIGNEOUS,MOSAIC,CYTO,PDO_CAS,PDO,AGROCYMAC,VODANA-UG,MIXSCV,FB-BT,CANCO,SD3,REPRODUCE-ME,BCS7,CN,NCCO,EP,PDQ,FENICS,VDOT,NEOMARK4,FISH-AST,EPIE,MA,PANET,TCO,CLAO,OGR,ODAE,PPO,NATPRO,FAST-EVENT-SKOS,WEAR,CVAO,GLYCORDF,ISO19108TO,CMPO,OAE,ISO19115PR,PIERO,MPO,TAO,PHMAMMADO,STO-DRAFT,NPOKB,EDAM-BIOIMAGING,CISAVIADO,ROLEO,DCM,ONTOPARON_SOCIAL,MNV,INFRARISK,NCRO,CDO,RNRMU,NMOSP,BCTEO,ONTOTOXNUC,DERMO,ICDO,WB-BT,ATO,VFB_DRIVERS,MDDB,NLN,GMO,SAO,EMAPA,BHN,DOID,OCRE,TCDO,TM-MER,ISO19115CON,GEOSPECIES,VARIO,UGANDA_DISEASES,SCIO,AHSO,TM-OTHER-FACTORS,KORO,ENVO,MCCV,ECG,UNITSONT,ONTOSINASC,ECAO,REX,NEO,AO,ACESO,FAST-FORMGENRE,EHDAA,LOINC,NERO,CLYH,MERA,ONTODM-KDD,PLIO,CANONT,TRAK,PO,PHYLONT,MOP,BSAO,OPTION-ONTOLOGY,ELD,CVDO,TDWGSPEC,RDA-ISSUANCE,TEST_A,FHHO,ZONMW-GENERIC,COHSI2STUDY,IDO-COVID-19,ADW,NIHSS,GFO,PEAO,DDPHENO,TRON,HAROREADO,CKDO,OARCS,LUNGMAP-HUMAN,ICO,HIVMT,PATEL,GLYCO,CARRE,EDDA_PT,suicideo,BRO,PATO,REXO,MMUSDV,BIOMO,ICD10,CHIRO,LAND-SURFACE,MLTX,GO-PLUS,OBIWS,DCAT-FDC,HOM,CHD,MCCL,MELO,NIFDYS,ONTOAVIDA,ECTO,HSO,PE-O,HUPSON,SOS,NCIT,PR,BIOMODELS,ESFO,MFO,LEPAO,BAO,EHDA,FIRE,ADO,ATC,REPO,JERM,EDDA,NMDCO,PHFUMIADO,COPDO,OMRSE,GRO,FYPO,LUNGMAP-MOUSE,TXPO,BDO'
     }

     options {
         timestamps()
@@ -42,11 +39,9 @@ pipeline {
                 sh 'echo "$GIT_BRANCH"'
                 sh 'cat env.txt'
                 sh 'cat branch.txt'
-                sh "echo $BUILDSTARTDATE > dow.txt"
                 sh "echo $BUILDSTARTDATE"
                 sh "echo $MERGEDKGNAME_BASE"
-                sh "echo $MERGEDKGNAME_GENERIC"
-                sh "python3.8 --version"
+                sh "python3.9 --version"
                 sh "id"
                 sh "whoami" // this should be jenkinsuser
                 // if the above fails, then the docker host didn't start the docker
@@ -66,88 +61,48 @@ pipeline {
                             url: 'https://github.com/ncbo/kg-bioportal',
                             branch: 'main'
                         )
-                        sh '/usr/bin/python3.8 -m venv venv'
+                        sh '/usr/bin/python3.9 -m venv venv'
                         sh '. venv/bin/activate'
-                        // Now move on to the actual install + reqs
                         sh './venv/bin/pip install .'
                         sh './venv/bin/pip install awscli boto3 s3cmd'
                     }
                 }
             }
         }

-        // the download step uses s3cmd instead of the standard kghub_downloader
-        // this is so we can access the private object
-
         stage('Download') {
             steps {
                 dir('./gitrepo') {
                     script {
+                        // Get the names of all BioPortal ontologies
+                        sh ". venv/bin/activate && kgbioportal get-ontology-list --api_key ${NCBO_API_KEY} --output data/raw/"
+
+                        // Now download all
+                        // or at least in the future, do them all.
+                        // For now just do a few
+                        sh "printf 'ENVO\nPO\nSEPIO\n' > data/raw/ontologylist.tsv"

-                        // Verify that the project directory is defined, or it will make a mess
-                        // when it uploads everything to the wrong directory
-                        if (S3PROJECTDIR.replaceAll("\\s","") == '') {
-                            error("Project name contains only whitespace. Will not continue.")
-                        }
-                        withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
-                            sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG get s3://$S3BUCKETNAME/frozen_incoming_data/bioportal_transformed/bioportal_transformed.tar.gz data/raw/bioportal_transformed.tar.gz'
-                        }
+                        // Download the ontologies
+                        sh ". venv/bin/activate && kgbioportal download --api_key ${NCBO_API_KEY} --ontology_file data/raw/ontologylist.tsv --output_dir data/raw/"
                     }
                 }
             }
         }

-        // Transform step just moves and decompresses the raw sources
-
+        // Transform the downloaded ontologies
         stage('Transform') {
             steps {
                 dir('./gitrepo') {
-                    sh '. venv/bin/activate && env && mv data/raw/* ../ && tar -xvzf ../bioportal_transformed.tar.gz -C ../'
-                    sh 'du -a ../'
-                    sh 'pwd'
+                    sh ". venv/bin/activate && kgbioportal transform --input_dir data/raw/ --output_dir data/transformed/"
                 }
             }
         }

-        // Currently using cat-merge
-        stage('Merge') {
-            steps {
-                dir('./gitrepo') {
-                    sh 'echo "Starting that big merge."'
-                    sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
-                    sh 'echo "Finished that big merge."'
-                    //sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
-
-                    sh 'gunzip data/merged/merged-kg.tar.gz'
-                    sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
-                    sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
-                    sh 'gzip data/merged/merged-kg.tar'
-                    //sh '. venv/bin/activate && python3.8 run.py catmerge --include_only $ONTOSET'
-                    //sh 'cp merged_graph_stats.yaml merged_graph_stats_$BUILDSTARTDATE.yaml'
-                    //sh 'tar -rvfz data/merged/merged-kg.tar.gz merged_graph_stats_$BUILDSTARTDATE.yaml'
-                }
-            }
-        }

         stage('Publish') {
             steps {
                 dir('./gitrepo') {
                     script {

-                        // make sure we aren't going to clobber existing data
-                        withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
-                            REMOTE_BUILD_DIR_CONTENTS = sh (
-                                script: '. venv/bin/activate && s3cmd -c $S3CMD_CFG ls s3://$S3BUCKETNAME/$S3PROJECTDIR/$BUILDSTARTDATE/',
-                                returnStdout: true
-                            ).trim()
-                            echo "REMOTE_BUILD_DIR_CONTENTS (THIS SHOULD BE EMPTY): '${REMOTE_BUILD_DIR_CONTENTS}'"
-                            if("${REMOTE_BUILD_DIR_CONTENTS}" != ''){
-                                echo "Will not overwrite existing remote S3 directory: $S3PROJECTDIR/$BUILDSTARTDATE"
-                                sh 'exit 1'
-                            } else {
-                                echo "remote directory $S3PROJECTDIR/$BUILDSTARTDATE is empty, proceeding"
-                            }
-                        }
-
                        if (env.GIT_BRANCH != 'origin/main') {
                            echo "Will not push if not on main branch."
                        } else {
@@ -157,21 +112,13 @@ pipeline {
                                string(credentialsId: 'aws_kg_hub_access_key', variable: 'AWS_ACCESS_KEY_ID'),
                                string(credentialsId: 'aws_kg_hub_secret_key', variable: 'AWS_SECRET_ACCESS_KEY')]) {
-                                //
-                                // make $BUILDSTARTDATE/ directory and sync to s3 bucket
-                                // Don't create any index - none of this will be public
-                                //
-                                sh 'mkdir $BUILDSTARTDATE/'
-                                sh 'cp -p data/merged/merged-kg.tar.gz $BUILDSTARTDATE/${MERGEDKGNAME_BASE}.tar.gz'
-                                sh 'cp Jenkinsfile $BUILDSTARTDATE/'
-
-                                // Add updated stats
-                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr graph_stats.yaml $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/graph_stats.yaml'
-
-                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/'
-                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG rm -r s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
-                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE/* s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
+                                // Index, then upload
+                                sh '. venv/bin/activate && multi_indexer -v --directory data/transformed/ --prefix https://kghub.io/$S3PROJECTDIR/ -x -u'
+                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate data/transformed/ s3://kg-hub-public-data/$S3PROJECTDIR/'
+                                // Now update the index for the whole project
+                                sh '. venv/bin/activate && multi_indexer -v --prefix https://kghub.io/$S3PROJECTDIR/ -b kg-hub-public-data -r $S3PROJECTDIR -x'
+                                sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate ./index.html s3://kg-hub-public-data/$S3PROJECTDIR/'
                            }
                        }
                    }
                }
            }
diff --git a/src/kg_bioportal/downloader.py b/src/kg_bioportal/downloader.py
index 1a626e0..55d26f8 100644
--- a/src/kg_bioportal/downloader.py
+++ b/src/kg_bioportal/downloader.py
@@ -52,6 +52,7 @@ def __init__(

         return None

+    # TODO: save NCBO ID and version for each ontology, then pass to transformer
     def download(self, onto_list: list = []) -> None:
         """Downloads data files from list of ontologies into data directory.

diff --git a/src/kg_bioportal/transformer.py b/src/kg_bioportal/transformer.py
index 8f11436..5cf2c54 100644
--- a/src/kg_bioportal/transformer.py
+++ b/src/kg_bioportal/transformer.py
@@ -12,6 +12,8 @@
 # TODO: Fix KGX hijacking logging
 # TODO: Save KGX logs to a file for each ontology
 # TODO: Address BNodes
+# TODO: get version from BioPortal API (in the downloader)
+

 class Transformer:

@@ -77,14 +79,15 @@ def transform_all(self) -> None:
         logging.info(f"Found {len(filepaths)} ontologies to transform.")

         for filepath in filepaths:
-            if not self.transform(filepath):
+            if not self.transform(filepath, version="latest"):
                 logging.error(f"Error transforming {filepath}.")
             else:
                 logging.info(f"Transformed {filepath}.")

         return None

-    def transform(self, ontology: str) -> bool:
+    # TODO: use NCBO ID to name the output, not the filename
+    def transform(self, ontology: str, version: str) -> bool:
         """Transforms a single ontology to KGX nodes and edges.

         Args:
@@ -97,7 +100,9 @@
         logging.info(f"Transforming {ontology} to nodes and edges.")

         ontology_name = os.path.splitext(os.path.basename(ontology))[0]
-        owl_output_path = os.path.join(self.output_dir, f"{ontology_name}.owl")
+        owl_output_path = os.path.join(
+            self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}.owl"
+        )

         # Convert
         if not robot_convert(
@@ -109,7 +114,12 @@
             status = False

         # Relax
-        relaxed_outpath = os.path.join(self.output_dir, f"{ontology_name}_relaxed.owl")
+        relaxed_outpath = os.path.join(
+            self.output_dir,
+            f"{ontology_name}",
+            f"{version}",
+            f"{ontology_name}_relaxed.owl",
+        )
         if not robot_relax(
             robot_path=self.robot_path,
             input_path=owl_output_path,
@@ -120,7 +130,9 @@

         # Transform to KGX nodes + edges
         txr = KGXTransformer(stream=True)
-        outfilename = os.path.join(self.output_dir, f"{ontology_name}")
+        outfilename = os.path.join(
+            self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}"
+        )
         nodefilename = outfilename + "_nodes.tsv"
         edgefilename = outfilename + "_edges.tsv"
         input_args = {
@@ -139,7 +151,9 @@
                 input_args=input_args,
                 output_args=output_args,
             )
-            logging.info(f"Nodes and edges written to {nodefilename} and {edgefilename}.")
+            logging.info(
+                f"Nodes and edges written to {nodefilename} and {edgefilename}."
+            )
             status = True
         except Exception as e:
             logging.error(f"Error transforming {ontology} to KGX nodes and edges: {e}")
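
Note on the version-handling TODOs in downloader.py and transformer.py: transform_all() currently hardcodes version="latest", so every ontology's output lands under data/transformed/{ontology_name}/latest/. Below is a minimal sketch of one way the downloader could resolve a real version string per ontology. It assumes BioPortal's public REST API at data.bioontology.org and its latest_submission endpoint; the helper name get_ontology_version() and its fallback behavior are illustrative, not part of the current kg_bioportal code.

import requests

BIOPORTAL_API = "https://data.bioontology.org"


def get_ontology_version(acronym: str, api_key: str) -> str:
    """Fetch the version string of an ontology's latest BioPortal submission.

    Hypothetical helper sketching the TODOs above; falls back to "latest"
    (the placeholder transform_all() passes today) when the submission
    record has no version field.
    """
    response = requests.get(
        f"{BIOPORTAL_API}/ontologies/{acronym}/latest_submission",
        params={"apikey": api_key},
        timeout=30,
    )
    response.raise_for_status()
    submission = response.json()
    return submission.get("version") or "latest"

With a version resolved at download time, the outputs for ENVO (one of the three ontologies pinned in the Jenkinsfile) would land in data/transformed/ENVO/{version}/ rather than data/transformed/ENVO/latest/, matching the new os.path.join() layout introduced in transformer.py above.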
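
On the transformer side, the loop in transform_all() could then thread those saved versions through instead of the hardcoded placeholder. A fragment-level sketch, assuming a hypothetical versions_by_file mapping populated by the downloader (today this line is simply version="latest"):

for filepath in filepaths:
    # versions_by_file is hypothetical, keyed by raw file path
    version = versions_by_file.get(filepath, "latest")
    if not self.transform(filepath, version=version):
        logging.error(f"Error transforming {filepath}.")
    else:
        logging.info(f"Transformed {filepath}.")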