Add tr gwas #26

Open · wants to merge 112 commits into base: main

Commits (112)
0db77ab
add plink2 docker
nicholema Oct 21, 2024
526b431
add wdl to run gwas genome wide
nicholema Oct 21, 2024
66097ff
add wdl to run gwas genome wide
nicholema Oct 21, 2024
19afd91
fix bugs
nicholema Oct 22, 2024
f37668d
add script to run tr_gwas.wdl
nicholema Oct 22, 2024
28050ac
change pgen path
nicholema Oct 22, 2024
39f4761
change phenotype and cohorts path
nicholema Oct 22, 2024
ab44141
fix minor bugs
nicholema Oct 22, 2024
b45192e
add gcs client
nicholema Oct 22, 2024
d564ca2
fix bug
nicholema Oct 22, 2024
8ed33b2
fix bug
nicholema Oct 22, 2024
61fb1ae
fix bugs: add gs prefix
nicholema Oct 22, 2024
ccf7bec
add project
nicholema Oct 22, 2024
65b0e70
fix minor bug: add python
nicholema Oct 22, 2024
00915e4
fix minor bug
nicholema Oct 22, 2024
6abd82a
change into python3 and add project and token
nicholema Oct 22, 2024
14c58d6
change into python3
nicholema Oct 22, 2024
7f34adf
add project
nicholema Oct 23, 2024
35fcdfc
add project
nicholema Oct 23, 2024
9df4a3c
delete check if ancestry_pred_path
nicholema Oct 23, 2024
44f64c3
fix download ancestry file bug
nicholema Oct 23, 2024
e3df252
fix download ancestry file bug
nicholema Oct 23, 2024
8278f92
update docker
nicholema Oct 23, 2024
57a8987
change gcloud config
nicholema Oct 23, 2024
b737514
add DownloadAncestry
nicholema Oct 24, 2024
af1b31f
add print statement to debug download ancestry
nicholema Oct 24, 2024
e3febb9
add ancestry argument
nicholema Oct 24, 2024
08a6a7f
download ancestry_pred
nicholema Oct 24, 2024
69b8059
add import gcsfs
nicholema Oct 24, 2024
3335b55
add token and project
nicholema Oct 24, 2024
2a1444e
add token and project
nicholema Oct 24, 2024
38ada1b
remove duplicated pd.read ancestry
nicholema Oct 24, 2024
55778c2
change cohort to plink format
nicholema Oct 24, 2024
7c10542
print project and token
nicholema Oct 24, 2024
8e97380
fix bugs
nicholema Oct 24, 2024
bed38dd
fix bugs
nicholema Oct 24, 2024
6932ced
fix bugs
nicholema Oct 24, 2024
52ab7ba
fix bugs
nicholema Oct 24, 2024
0bc38ab
fix bugs
nicholema Oct 24, 2024
9a3079e
fix bugs
nicholema Oct 24, 2024
3701644
fix bugs
nicholema Oct 24, 2024
90b79e6
fix bugs
nicholema Oct 24, 2024
d546331
fix bugs
nicholema Oct 24, 2024
7c008ea
fix bugs
nicholema Oct 24, 2024
137fa4f
delete gcsfs
nicholema Oct 24, 2024
a048330
fix bugs
nicholema Oct 24, 2024
1c9caed
change plink2 docker name
nicholema Oct 24, 2024
b0fa982
change plink2 docker name
nicholema Oct 24, 2024
cce4ba3
return dataframe
nicholema Oct 25, 2024
e1bdb2f
delete debug comment and set convert_phenotype to Array[File]
nicholema Oct 25, 2024
c45407b
change to Array[File]pheno, covar
nicholema Oct 25, 2024
a1ad60f
return dataframe
nicholema Oct 25, 2024
4ec124a
add return covar and phenotype output
nicholema Oct 25, 2024
df913ea
return dataframe,remove to_csv
nicholema Oct 25, 2024
5dd9048
change output name
nicholema Oct 25, 2024
190c90c
fix bugs
nicholema Oct 25, 2024
a155230
fix bugs
nicholema Oct 25, 2024
788e234
add return files
nicholema Oct 25, 2024
2407f0b
remove array
nicholema Oct 25, 2024
846ab21
remove extra main()
nicholema Oct 25, 2024
a376533
remove exit code
nicholema Oct 25, 2024
d81e1e1
fix bug
nicholema Oct 25, 2024
fdd99ff
fix bugs
nicholema Oct 25, 2024
6db4784
fix bugs
nicholema Oct 25, 2024
47758a1
fix bugs
nicholema Oct 25, 2024
294eeb4
fix bugs
nicholema Oct 25, 2024
a6cf960
fix bugs
nicholema Oct 25, 2024
273f977
change pheno name
nicholema Oct 25, 2024
e58c3c4
change pheno name
nicholema Oct 25, 2024
a1c6faa
check bucket
nicholema Oct 25, 2024
56ff63a
add workspace_bucket
nicholema Oct 25, 2024
141ddef
fix bugs
nicholema Oct 25, 2024
81cd528
change outfile_covar name
nicholema Oct 25, 2024
aabe6ba
increase disk and memory for run_tr_gwas
nicholema Oct 25, 2024
587f66f
change plink gwas outname
nicholema Oct 25, 2024
d4de44e
change gwas outname and add gwas_logs
nicholema Oct 25, 2024
ed522b8
change concat and debug psam
nicholema Oct 25, 2024
fa5128e
debug psam
nicholema Oct 25, 2024
0f4e1d7
fix bugs
nicholema Oct 26, 2024
cd6027f
fix bugs
nicholema Nov 3, 2024
dcad759
change PFILEARRARY
nicholema Nov 4, 2024
d63ded5
fix bugs
nicholema Nov 4, 2024
5e98362
fix bugs
nicholema Nov 4, 2024
7c3d239
add print gwas_outfiles to debug
nicholema Nov 4, 2024
aa148c7
add print gwas_outfiles to debug
nicholema Nov 4, 2024
12850e4
add space seperated list
nicholema Nov 4, 2024
4d345e5
use chrx_annotated as pfile name
nicholema Nov 4, 2024
f5030bd
add full path to chrom_outprefix
nicholema Nov 4, 2024
01179eb
add back chrom prefix
nicholema Nov 4, 2024
bf16c58
remove extra echo statements
nicholema Nov 5, 2024
b77689d
add options to run gwas to targeted phenotype
nicholema Nov 15, 2024
05c31ea
increase vif to 2000 to run gwas on EUR
nicholema Nov 15, 2024
f33c513
add target cohort
nicholema Nov 15, 2024
4378005
add readme
nicholema Nov 15, 2024
95a2eb7
add readme
nicholema Nov 15, 2024
1bd5ea5
add detailed usage
nicholema Nov 15, 2024
f6f5bca
debug line --vif 2000
nicholema Nov 15, 2024
3f9df7a
remove PC6 if EUR or non-AFR cohort
nicholema Nov 22, 2024
51952e7
delete non-AFR option
nicholema Nov 22, 2024
7591374
fix bugs
nicholema Nov 22, 2024
477c353
debug:print covar_file
nicholema Nov 23, 2024
fcd08d0
fix minor bug
nicholema Nov 23, 2024
d8c3e1c
fix minor bug
nicholema Nov 23, 2024
eff1a92
fix minor bug
nicholema Nov 24, 2024
af6e497
fix minor bug
nicholema Nov 24, 2024
249905a
add rm pc6 if covar file is eur or non-afr
nicholema Nov 25, 2024
b120c83
fix minor bug
nicholema Nov 25, 2024
0121879
made changes for pull request
nicholema Dec 2, 2024
4b17e00
use latest plink2 path
nicholema Dec 2, 2024
c428508
delete commented lines
nicholema Dec 2, 2024
bdff2a8
fix bugs
nicholema Dec 2, 2024
8cbe3e0
fix bugs
nicholema Dec 2, 2024
41 changes: 41 additions & 0 deletions docker/plink2/Dockerfile
@@ -0,0 +1,41 @@
FROM ubuntu:20.04

RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install -qqy \
python3 \
python3-pip \
wget \
build-essential \
curl \
unzip \
&& rm -rf /var/lib/apt/lists/*


# Install Python dependencies
RUN pip3 install pandas

# Download and install gcloud
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz
RUN mkdir -p /usr/local/gcloud \
&& tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
&& /usr/local/gcloud/google-cloud-sdk/install.sh
ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin

# Copy python script
COPY convert_phenotype_plink.py /usr/bin/
RUN chmod +x /usr/bin/convert_phenotype_plink.py

# Download and install PLINK2
RUN wget https://s3.amazonaws.com/plink2-assets/alpha6/plink2_linux_x86_64_20241124.zip \
&& unzip plink2_linux_x86_64_20241124.zip \
&& mv plink2 /usr/bin/ \
&& chmod +x /usr/bin/plink2 \
&& rm plink2_linux_x86_64_20241124.zip









18 changes: 18 additions & 0 deletions docker/plink2/Makefile
@@ -0,0 +1,18 @@
# Definitions
repository = gcr.io/ucsd-medicine-cast
identifier = plink2
version = 0.0.0
git_commit ?= $(shell git log --pretty=oneline -n 1 | cut -f1 -d " ")
name = ${repository}/${identifier}
tag = ${version}--${git_commit}

# Steps
build:
# do the docker build
docker build --no-cache -t ${name}:${tag} .
docker tag ${name}:${tag} ${name}:latest

push:
# Requires ~/.dockercfg
docker push ${name}:${tag}
docker push ${name}:latest
134 changes: 134 additions & 0 deletions docker/plink2/convert_phenotype_plink.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3



"""
Convert AoU phenotypes to plink friendly format.
Output cleaned phenotype and combined covariate files
for the specified phenotype

Usage:
./convert_phenotype_plink.py --phenotype <phenotype>

"""

import os
import pandas as pd
import subprocess
import sys
import csv
import argparse


# Get token and set up project
token_fetch_command = subprocess.run(['gcloud', 'auth', 'application-default', 'print-access-token'], \
capture_output=True, check=True, encoding='utf-8')
token = str.strip(token_fetch_command.stdout)
project = os.getenv("GCS_REQUESTER_PAYS_PROJECT")
bucket = os.getenv("WORKSPACE_BUCKET")
#print(f"Workspace Bucket: {bucket}")
#print all the environmental variables
#print(dict(os.environ))

def GetPTCovarPath(phenotype):
return os.path.join(bucket, \
"phenotypes", "%s_phenocovar.csv"%phenotype)

def DownloadPT(filename):
"""
Download phenotype_covar.csv locally

Arguments
---------
filename : str
GCP path
"""
cmd = "gsutil cp {filename} .".format(filename=filename)
output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout.read()
print(output.decode("utf-8"))

return filename.split("/")[-1] # Return local filename


def GetFloatFromPC(x):
x = x.replace("[","").replace("]","")

return float(x)

def LoadAncestry(ancestry_pred_path,project):
"""
Download ancestry_pred.tsv locally

Arguments
---------
ancestry_pred_path : str
GCP path
project : str
google project for downloading
"""
if ancestry_pred_path.startswith("gs://"):
os.system(f"gsutil -u {project} cp {ancestry_pred_path} .")
ancestry_pred_path = "ancestry_preds.tsv"
ancestry = pd.read_csv(ancestry_pred_path, sep="\t")
ancestry.rename({"research_id": "IID"}, axis=1, inplace=True)
ancestry['IID'] = ancestry['IID'].astype(str)
num_pcs = len(ancestry["pca_features"].values[0].split(","))
pcols = ["PC_%s"%i for i in range(num_pcs)]
ancestry[pcols] = ancestry["pca_features"].str.split(",", expand=True)
for p in pcols:
ancestry[p] = ancestry[p].apply(lambda x: GetFloatFromPC(x), 1)
ancestry.insert(0, 'FID', 0)

return ancestry

def convert_csv_to_plink(ptfile):
df = pd.read_csv(ptfile)
df.insert(0, 'FID', 0)
df.rename(columns={"person_id": "IID"}, inplace=True)
df['IID'] = df['IID'].astype(str)

return df

def main():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--phenotype", help="Comma-separated list of phenotypes to process. If not provided, process all available phenotypes", type=str, required=True)
parser.add_argument("--num-pcs", help="Number of PCs to use as covariates", type=int, default=10)
parser.add_argument("--ancestry-pred-path", help="Path to ancestry predictions",type=str, default="gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv")
parser.add_argument("--ptcovars", help="Comma-separated list of phenotype-specific covariates. Default: age", type=str, default="age")
parser.add_argument("--sharedcovars", help="Comma-separated list of shared covariates (besides PCs). Default: sex_at_birth_Male", type=str, default="sex_at_birth_Male")
parser.add_argument("--dryrun", help="Don't actually run the workflow. Just set up", action="store_true")
args = parser.parse_args()


# Set up paths
if args.phenotype.endswith(".csv"):
ptcovar_path = args.phenotype
else:
ptcovar_path = GetPTCovarPath(args.phenotype)


# Get covarlist
pcols = ["PC_%s"%i for i in range(1, args.num_pcs+1)]
shared_covars = [item for item in args.sharedcovars.split(",") if item != ""]
pt_covars = [item for item in args.ptcovars.split(",") if item != ""]
covars = pt_covars + shared_covars

# Set up data frame with phenotype and covars
ancestry = LoadAncestry(args.ancestry_pred_path,project)
plink = convert_csv_to_plink(DownloadPT(ptcovar_path))

# Extract phenotype and covars only
data = pd.merge(plink[["FID","IID"]+covars], ancestry[["IID"]+pcols],on=["IID"],how="inner")
plink_pheno = plink[["FID","IID","phenotype"]]

# Output files
pheno_file_path = f"{args.phenotype}_pheno_plink.txt"
covar_file_path = f"{args.phenotype}_covar_combined.txt"

plink_pheno.to_csv(pheno_file_path, sep="\t", index=False)
data.to_csv(covar_file_path, sep="\t", index=False)

sys.stderr.write(f"Done converting {args.phenotype} to plink format.\n")

if __name__ == "__main__":
    main()
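As a quick sanity check on the converter's outputs (a sketch, not part of the PR; it assumes the file names written by `main()` above and uses `platelet_count` as a hypothetical example phenotype):

```
#!/usr/bin/env python3
"""Illustrative sanity check for convert_phenotype_plink.py outputs."""
import pandas as pd

phenotype = "platelet_count"  # hypothetical example phenotype
pheno = pd.read_csv(f"{phenotype}_pheno_plink.txt", sep="\t", dtype={"IID": str})
covar = pd.read_csv(f"{phenotype}_covar_combined.txt", sep="\t", dtype={"IID": str})

# Both tables should start with the FID/IID columns plink2 expects
assert list(pheno.columns[:2]) == ["FID", "IID"]
assert list(covar.columns[:2]) == ["FID", "IID"]

# The covariate table is an inner merge with ancestry, so its samples
# should be a subset of the phenotype table's samples
missing = set(covar["IID"]) - set(pheno["IID"])
print(f"{len(missing)} covariate samples lack a phenotype entry (expected 0)")
```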
94 changes: 94 additions & 0 deletions tr-gwas/README.md
@@ -0,0 +1,94 @@
# Running TR-GWAS genome-wide on the All of Us workbench

The goal of this workflow is to run TR-GWAS genome-wide with plink2 across different phenotypes and cohorts.

For most of our use cases, users do not interact with the WDL described here directly, but rather call launcher scripts which handle setting up inputs and calling the WDL using cromshell. Sections below give additional WDL details which can be helpful for development/testing or debugging.


## Setup
In all cases you'll have to run the following steps in the AoU workbench before starting a workflow:

1. Start the cromwell environment (pink circle icon on the right).
2. Start a cloud environment (Jupyter icon) with:
* "General Analysis" environment
* 2 CPU/7.5GB RAM
* Compute type: Standard VM
* Automatically pause after: 30 minutes
* Storage disk options: standard disk, 120GB
3. Open a terminal (terminal icon) and run the commands below:

```
git clone https://github.com/cast-genomics/cast-workflows/
cd cast-workflows/tr-gwas
../utils/configure-cromshell.py
```

## Run TR-GWAS on targeted phenotypes and cohorts

Example test run of the TR-GWAS workflow:

```
./tr_gwas_aou.py \
--name test \
--phenotype platelet_count \
--cohort EUR,AFR
```

If successful, this will print out a friendly turtle along with the job ID. Use the command below to check the status of your job; the run takes around 10-20 minutes, and the status will eventually change to "Succeeded".

```
cromshell status $JOBID
```

If you see "Failed", you can look at the logs to see what happened:

```
cromshell logs -s ALL $JOBID
cromshell logs -s ALL -des $JOBID    # -des gives descriptive log info
cromshell logs -s Failed -des $JOBID
```

You can check the output:
```
cromshell list-outputs $JOBID
```

## Run a full job on all samples and all phenotypes

```
./tr_gwas_aou.py \
--name test_all
```
## Detailed usage

Required arguments:

* `--name <STR>`: name of the run

Optional arguments:

* `--phenotype <STR>`: names of targeted phenotypes, comma-separated. Default: None (run all phenotypes)
* `--cohort <STR>`: names of targeted cohorts, comma-separated. Options: AFR, EUR, NOT_AFR, ALL. Default: None (run all cohorts)

Warning: this script manually removes the PC6 covariate when running on the EUR and NOT_AFR cohorts.
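For reference, removing PC6 amounts to dropping one column from the combined covariate table before it is passed to plink2 via `--covar`; a minimal sketch, assuming the `PC_6` column name and file naming used by `convert_phenotype_plink.py` (the workflow may implement this differently):

```
import pandas as pd

# Hypothetical file names, following the converter's output naming
covar = pd.read_csv("platelet_count_covar_combined.txt", sep="\t")

# Drop PC_6 for EUR / NOT_AFR covariate files; errors="ignore" keeps this a
# no-op if the column is absent
covar.drop(columns=["PC_6"], errors="ignore").to_csv(
    "platelet_count_covar_combined_noPC6.txt", sep="\t", index=False)
```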


## Sample preprocessing

Samples are preprocessed in `cast-workflows/gwas/aou/sample_preprocessing`.

## Output files

PLINK-friendly phenotype and covariate files:
```
<phenotype>_covar_combined.txt
<phenotype>_pheno_plink.txt
```

TR-GWAS results:
```
<phenotype>_<cohort>_gwas.tab
```
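
For downstream inspection, the per-cohort results table can be loaded with pandas; a sketch, assuming the columns follow the usual plink2 `--glm` naming (e.g. `ID`, `P`):

```
import pandas as pd

# Hypothetical file name following the pattern above
results = pd.read_csv("platelet_count_EUR_gwas.tab", sep="\t")

# Sort by p-value and show the strongest associations
pcol = "P" if "P" in results.columns else results.columns[-1]
print(results.sort_values(pcol).head(10).to_string(index=False))
```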

## TODO

1. Look at pairwise correlations of all covariates plus the phenotype and remove highly correlated covariates (see the sketch below).
2. Filter any categorical covariates as we do in Margoliash et al.; statin usage and sex are examples of categorical covariates we have used.
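
A minimal sketch of item 1, assuming the phenotype and covariate files produced by `convert_phenotype_plink.py` and an arbitrary correlation cutoff of 0.8:

```
import pandas as pd

# Hypothetical input files for one phenotype
pheno = pd.read_csv("platelet_count_pheno_plink.txt", sep="\t")
covar = pd.read_csv("platelet_count_covar_combined.txt", sep="\t")

# Pairwise absolute correlation of the phenotype plus all covariates
merged = pheno.merge(covar, on=["FID", "IID"])
corr = merged.drop(columns=["FID", "IID"]).corr().abs()

# Flag pairs above the cutoff; one member of each pair could be dropped
cutoff = 0.8
cols = list(corr.columns)
flagged = [(a, b, round(corr.loc[a, b], 3))
           for i, a in enumerate(cols)
           for b in cols[i + 1:]
           if corr.loc[a, b] > cutoff]
print(flagged)
```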