Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new Jenkins build process #55

Merged
merged 3 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 20 additions & 72 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ pipeline {
agent {
docker {
reuseNode false
image 'caufieldjh/ubuntu20-python-3-8-5-dev:4-with-dbs-v6'
image 'caufieldjh/ubuntu20-python-3-9-14-dev:2'
}
}
// No scheduled builds for now
Expand All @@ -14,10 +14,7 @@ pipeline {
S3BUCKETNAME = 'kg-hub-public-data'
S3PROJECTDIR = 'kg-bioportal' // no trailing slash
MERGEDKGNAME_BASE = "kg_bioportal"
MERGEDKGNAME_GENERIC = "merged-kg"

// Ontologies to merge in this run, if not using --merge_all flag
ONTOSET = 'CCONT,GRO-CPGA,STY,HP,PMO,CDPEO,GRO-CPD,ISO19115CC,TEDDY,NMOBR,IDQA,RDFS,LUNGMAP_M_CELL,PCO,ISSVA,IOBC,APADISORDERS,TESTEX,ONL-DP,XEO,EXTRACT,CHEMINF,ECSO,FAST-GENREFORM,VODANAKENYA,CTX,ISO19115DI,CARO,TEO,COMODI,IRD,OGDI,VEO,OHPI,GEXO,CIDO,GMM,RNAO,BCTT,MADS-RDF,GAZ,OBA,OSM,TRANS,BP-METADATA,PE,PCMO,UO,NMR,NEOMARK3,EVI,MCHVODANATERMS,EO1,APACOMPUTER,ICECI,DISDRIV,ONTONEO,ENM,ONTODM-CORE,UBERON,ISO19115TCC,SBO,CU-VO,SHR,ETHOPD,SPO,HOIP,ISO19115ROLES,DCT,WETAXTOPICS,PECO,IRDG,SEQ,HL7,SEDI,CASE-BASE-ONTO,AHOL,AD-DROP,TM-CONST,MATR,APATANDT,BCO,FLYGLYCODB,RXNORM,HOOM,HIO,PTS,CRISP,OCMR,TAXRANK,OMO,SO,ODNAE,ROCKNROLLTEST,GO,OBI,FOBI,PLANA,HIVO004,AGROMOP,ONTOPBM,ADMO,PCAO,EDAM,BE,ONE,CODO,FOVT,OCE,OFSMR,OMIM,KISAO,NOMEN,DEB,HCDR,ID-AMR,DERMLEX,BTO_ONTOLOGY,OBOREL,MOC,ALLERGYDETECTOR,ADALAB,MS,RDL,AERO,TML,MATRCOMPOUND,CEDARVS,PACO,MEGO,BRSO,TGMA,RPO,EHDAA2,GENO,MCBCC,HAMIDEHSGH,RNPRIO,FAST-TITLE,CWD,VODANA-MIGRANTS,AMINO-ACID,INTO,TADS,RCTONT,MIM,SITBAC,PP,OM,DLORO,ETANC,SIO,IMGT-ONTOLOGY,CLO,RVO,APO,HMIS033B,RXNO,MOOCCUADO,KENYAANC,UPA,EXO,OBS,SYMP,IBD,IAML-MOP,OBOE-SBC,EPO,FIX,OLATDV,OA,CONTSONTO,SNOMEDCT,NCBITAXON,ERO,ISO-ANNOTATIONS,BRCT,HRDO,MAMO,CHEAR,BCGO,RADLEX,MATRROCKIGNEOUS,MOSAIC,CYTO,PDO_CAS,PDO,AGROCYMAC,VODANA-UG,MIXSCV,FB-BT,CANCO,SD3,REPRODUCE-ME,BCS7,CN,NCCO,EP,PDQ,FENICS,VDOT,NEOMARK4,FISH-AST,EPIE,MA,PANET,TCO,CLAO,OGR,ODAE,PPO,NATPRO,FAST-EVENT-SKOS,WEAR,CVAO,GLYCORDF,ISO19108TO,CMPO,OAE,ISO19115PR,PIERO,MPO,TAO,PHMAMMADO,STO-DRAFT,NPOKB,EDAM-BIOIMAGING,CISAVIADO,ROLEO,DCM,ONTOPARON_SOCIAL,MNV,INFRARISK,NCRO,CDO,RNRMU,NMOSP,BCTEO,ONTOTOXNUC,DERMO,ICDO,WB-BT,ATO,VFB_DRIVERS,MDDB,NLN,GMO,SAO,EMAPA,BHN,DOID,OCRE,TCDO,TM-MER,ISO19115CON,GEOSPECIES,VARIO,UGANDA_DISEASES,SCIO,AHSO,TM-OTHER-FACTORS,KORO,ENVO,MCCV,ECG,UNITSONT,ONTOSINASC,ECAO,REX,NEO,AO,ACESO,FAST-FORMGENRE,EHDAA,LOINC,NERO,CLYH,MERA,ONTODM-KDD,PLIO,CANONT,TRAK,PO,PHYLONT,MOP,BSAO,OPTION-ONTOLOGY,ELD,CVDO,TDWGSPEC,RDA-ISSUANCE,TEST_A,FHHO,ZONMW-GENERIC,COHSI2STUDY,IDO-COVID-19,AD
W,NIHSS,GFO,PEAO,DDPHENO,TRON,HAROREADO,CKDO,OARCS,LUNGMAP-HUMAN,ICO,HIVMT,PATEL,GLYCO,CARRE,EDDA_PT,suicideo,BRO,PATO,REXO,MMUSDV,BIOMO,ICD10,CHIRO,LAND-SURFACE,MLTX,GO-PLUS,OBIWS,DCAT-FDC,HOM,CHD,MCCL,MELO,NIFDYS,ONTOAVIDA,ECTO,HSO,PE-O,HUPSON,SOS,NCIT,PR,BIOMODELS,ESFO,MFO,LEPAO,BAO,EHDA,FIRE,ADO,ATC,REPO,JERM,EDDA,NMDCO,PHFUMIADO,COPDO,OMRSE,GRO,FYPO,LUNGMAP-MOUSE,TXPO,BDO'
}
options {
timestamps()
Expand All @@ -42,11 +39,10 @@ pipeline {
sh 'echo "$GIT_BRANCH"'
sh 'cat env.txt'
sh 'cat branch.txt'
sh "echo $BUILDSTARTDATE > dow.txt"
sh "echo $BUILDSTARTDATE"
sh "echo $MERGEDKGNAME_BASE"
sh "echo $MERGEDKGNAME_GENERIC"
sh "python3.8 --version"
sh "python3.9 --version"
sh "id"
sh "whoami" // this should be jenkinsuser
// if the above fails, then the docker host didn't start the docker
Expand All @@ -66,88 +62,48 @@ pipeline {
url: 'https://github.com/ncbo/kg-bioportal',
branch: 'main'
)
sh '/usr/bin/python3.8 -m venv venv'
sh '/usr/bin/python3.9 -m venv venv'
sh '. venv/bin/activate'
// Now move on to the actual install + reqs
sh './venv/bin/pip install .'
sh './venv/bin/pip install awscli boto3 s3cmd'
}
}
}

// the download step uses s3cmd instead of the standard kghub_downloader
// this is so we can access the private object

stage('Download') {
steps {
dir('./gitrepo') {
script {
// Get the names of all BioPortal ontologies
sh ". venv/bin/activate && kgbioportal get-ontology-list --api_key ${NCBO_API_KEY} --output data/raw/"

// Now download all
// or at least in the future, do them all.
// For now just do a few
sh "printf 'ENVO\nPO\nSEPIO\n' > data/raw/ontologylist.tsv"

// Verify that the project directory is defined, or it will make a mess
// when it uploads everything to the wrong directory
if (S3PROJECTDIR.replaceAll("\\s","") == '') {
error("Project name contains only whitespace. Will not continue.")
}
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG get s3://$S3BUCKETNAME/frozen_incoming_data/bioportal_transformed/bioportal_transformed.tar.gz data/raw/bioportal_transformed.tar.gz'
}
// Download the ontologies
sh ". venv/bin/activate && kgbioportal download --api_key ${NCBO_API_KEY} --ontology_file data/raw/ontologylist.tsv --output_dir data/raw/"

}
}
}
}

// Transform step just moves and decompresses the raw sources

// Transform the downloaded ontologies
stage('Transform') {
steps {
dir('./gitrepo') {
sh '. venv/bin/activate && env && mv data/raw/* ../ && tar -xvzf ../bioportal_transformed.tar.gz -C ../'
sh 'du -a ../'
sh 'pwd'
sh ". venv/bin/activate && kgbioportal transform --input_dir data/raw/ --output_dir data/transformed/"
}
}
}

// Currently using cat-merge
stage('Merge') {
steps {
dir('./gitrepo') {
sh 'echo "Starting that big merge."'
sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
sh 'echo "Finished that big merge."'
//sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
sh 'gunzip data/merged/merged-kg.tar.gz'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
sh 'gzip data/merged/merged-kg.tar'
//sh '. venv/bin/activate && python3.8 run.py catmerge --include_only $ONTOSET'
//sh 'cp merged_graph_stats.yaml merged_graph_stats_$BUILDSTARTDATE.yaml'
//sh 'tar -rvfz data/merged/merged-kg.tar.gz merged_graph_stats_$BUILDSTARTDATE.yaml'
}
}
}

stage('Publish') {
steps {
dir('./gitrepo') {
script {

// make sure we aren't going to clobber existing data
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
REMOTE_BUILD_DIR_CONTENTS = sh (
script: '. venv/bin/activate && s3cmd -c $S3CMD_CFG ls s3://$S3BUCKETNAME/$S3PROJECTDIR/$BUILDSTARTDATE/',
returnStdout: true
).trim()
echo "REMOTE_BUILD_DIR_CONTENTS (THIS SHOULD BE EMPTY): '${REMOTE_BUILD_DIR_CONTENTS}'"
if("${REMOTE_BUILD_DIR_CONTENTS}" != ''){
echo "Will not overwrite existing remote S3 directory: $S3PROJECTDIR/$BUILDSTARTDATE"
sh 'exit 1'
} else {
echo "remote directory $S3PROJECTDIR/$BUILDSTARTDATE is empty, proceeding"
}
}

if (env.GIT_BRANCH != 'origin/main') {
echo "Will not push if not on main branch."
} else {
Expand All @@ -157,21 +113,13 @@ pipeline {
string(credentialsId: 'aws_kg_hub_access_key', variable: 'AWS_ACCESS_KEY_ID'),
string(credentialsId: 'aws_kg_hub_secret_key', variable: 'AWS_SECRET_ACCESS_KEY')]) {

//
// make $BUILDSTARTDATE/ directory and sync to s3 bucket
// Don't create any index - none of this will be public
//
sh 'mkdir $BUILDSTARTDATE/'
sh 'cp -p data/merged/merged-kg.tar.gz $BUILDSTARTDATE/${MERGEDKGNAME_BASE}.tar.gz'
sh 'cp Jenkinsfile $BUILDSTARTDATE/'

// Add updated stats
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr graph_stats.yaml $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/graph_stats.yaml'

sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG rm -r s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE/* s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
// Index, then upload
sh '. venv/bin/activate && multi_indexer -v --directory data/transformed/ --prefix https://kghub.io/$S3PROJECTDIR/ -x -u'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate data/transformed/ s3://kg-hub-public-data/$S3PROJECTDIR/'

// Now update the index for the whole project
sh '. venv/bin/activate && multi_indexer -v --prefix https://kghub.io/$S3PROJECTDIR/ -b kg-hub-public-data -r $S3PROJECTDIR -x'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate ./index.html s3://kg-hub-public-data/$S3PROJECTDIR/'
}

}
Expand Down
1 change: 1 addition & 0 deletions src/kg_bioportal/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(

return None

# TODO: save NCBO ID and version for each ontology, then pass to transformer
def download(self, onto_list: list = []) -> None:
"""Downloads data files from list of ontologies into data directory.

Expand Down
26 changes: 20 additions & 6 deletions src/kg_bioportal/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# TODO: Fix KGX hijacking logging
# TODO: Save KGX logs to a file for each ontology
# TODO: Address BNodes
# TODO: get version from BioPortal API (in the downloader)


class Transformer:

Expand Down Expand Up @@ -77,14 +79,15 @@ def transform_all(self) -> None:
logging.info(f"Found {len(filepaths)} ontologies to transform.")

for filepath in filepaths:
if not self.transform(filepath):
if not self.transform(filepath, version="latest"):
logging.error(f"Error transforming {filepath}.")
else:
logging.info(f"Transformed {filepath}.")

return None

def transform(self, ontology: str) -> bool:
# TODO: use NCBO ID to name the output, not the filename
def transform(self, ontology: str, version: str) -> bool:
"""Transforms a single ontology to KGX nodes and edges.

Args:
Expand All @@ -97,7 +100,9 @@ def transform(self, ontology: str) -> bool:

logging.info(f"Transforming {ontology} to nodes and edges.")
ontology_name = os.path.splitext(os.path.basename(ontology))[0]
owl_output_path = os.path.join(self.output_dir, f"{ontology_name}.owl")
owl_output_path = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}.owl"
)

# Convert
if not robot_convert(
Expand All @@ -109,7 +114,12 @@ def transform(self, ontology: str) -> bool:
status = False

# Relax
relaxed_outpath = os.path.join(self.output_dir, f"{ontology_name}_relaxed.owl")
relaxed_outpath = os.path.join(
self.output_dir,
f"{ontology_name}",
f"{version}",
f"{ontology_name}_relaxed.owl",
)
if not robot_relax(
robot_path=self.robot_path,
input_path=owl_output_path,
Expand All @@ -120,7 +130,9 @@ def transform(self, ontology: str) -> bool:

# Transform to KGX nodes + edges
txr = KGXTransformer(stream=True)
outfilename = os.path.join(self.output_dir, f"{ontology_name}")
outfilename = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}"
)
nodefilename = outfilename + "_nodes.tsv"
edgefilename = outfilename + "_edges.tsv"
input_args = {
Expand All @@ -139,7 +151,9 @@ def transform(self, ontology: str) -> bool:
input_args=input_args,
output_args=output_args,
)
logging.info(f"Nodes and edges written to {nodefilename} and {edgefilename}.")
logging.info(
f"Nodes and edges written to {nodefilename} and {edgefilename}."
)
status = True
except Exception as e:
logging.error(f"Error transforming {ontology} to KGX nodes and edges: {e}")
Expand Down
Loading