Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-863] Add patch release #21

Merged
merged 41 commits into from
Sep 13, 2024
Merged
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
fb28674
Add patch release
thomasyu888 Aug 15, 2024
a2c0152
Add patch release
thomasyu888 Aug 15, 2024
c77a853
Use genie container
thomasyu888 Aug 15, 2024
3a627c9
Create dashboard html module
thomasyu888 Aug 16, 2024
b6564db
Add staging option
thomasyu888 Aug 16, 2024
0f3ccc9
Lint
thomasyu888 Aug 16, 2024
10d7f05
Update process config
thomasyu888 Aug 16, 2024
8613e0d
Add missing module
thomasyu888 Aug 16, 2024
bcdb828
Add channel values
thomasyu888 Aug 16, 2024
e8b08f8
Patch
thomasyu888 Aug 16, 2024
20b90cb
Positional arguments
thomasyu888 Aug 16, 2024
946afde
Bump memory for patch release
thomasyu888 Aug 16, 2024
3809302
Remove quote
thomasyu888 Aug 16, 2024
34ca5c2
cd into /root/Genie
thomasyu888 Aug 16, 2024
ea22341
Add readme
thomasyu888 Aug 16, 2024
318b50b
Compare two folders that should contain identical data
thomasyu888 Aug 18, 2024
3971366
Add production
thomasyu888 Aug 18, 2024
00f2257
Since project_id is specified, use project_id to determine if product…
thomasyu888 Aug 18, 2024
5414262
Fix args
thomasyu888 Aug 18, 2024
b6e1919
Remove subprocess
thomasyu888 Aug 18, 2024
72da9d4
Modularize patch code by splitting off patch_file function
thomasyu888 Aug 18, 2024
89c7be5
Comment out code and shuffle things around
thomasyu888 Aug 18, 2024
15ba576
the release name is no longer included in the filename
thomasyu888 Aug 18, 2024
9e0fb00
Patch
thomasyu888 Aug 18, 2024
312a2f8
Add patch cna file function
thomasyu888 Aug 18, 2024
ef6b2ec
Revoke access for data clinical file, and shuffle code around
thomasyu888 Aug 19, 2024
27de200
Patch releases
thomasyu888 Aug 19, 2024
4bc8e2e
Remove dead code and add TODOs
thomasyu888 Aug 19, 2024
965e609
Update scripts/patch_release/patch.py
thomasyu888 Aug 20, 2024
532269a
Add returns
thomasyu888 Aug 20, 2024
2a4f312
Merge branch 'modularize-patch-code' of github.com:Sage-Bionetworks-W…
thomasyu888 Aug 20, 2024
8c0bbd1
Merge pull request #22 from Sage-Bionetworks-Workflows/modularize-pat…
thomasyu888 Aug 20, 2024
ff8b4bd
lint
thomasyu888 Aug 20, 2024
4915f38
Add schema
thomasyu888 Aug 27, 2024
b7b3f4e
Add release
thomasyu888 Aug 27, 2024
d9cffb1
Rename
thomasyu888 Aug 27, 2024
4e385d9
Add argparse
thomasyu888 Aug 27, 2024
efe387b
Add compare release module
thomasyu888 Aug 27, 2024
d86aa42
Update schema and workflow
thomasyu888 Aug 27, 2024
5ee64be
Use
thomasyu888 Aug 27, 2024
264bf13
Update scripts/patch_release/compare_patch.py
thomasyu888 Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add patch release
thomasyu888 committed Aug 15, 2024
commit fb2867457d24c559eed367d17e6e70399c1c579c
5 changes: 5 additions & 0 deletions scripts/patch_release/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Base image: the Synapse Python client image, pinned to tag v2.6.0.
# NOTE(review): patch.py imports the `genie` package -- confirm this image
# provides it.
FROM sagebionetworks/synapsepythonclient:v2.6.0

# Working directory for the instructions below and for the running container.
WORKDIR /patch_release

# Copy the whole build context into the working directory.
COPY . .
358 changes: 358 additions & 0 deletions scripts/patch_release/patch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
"""
Patch releases occur when samples need to be retracted due to
patients withdrawing consent.

Patches should always occur on the latest consortium release for
the specific public release. Due to the GENIE retraction policy,
it is best to retract data on the 3rd consortium release of the
subsequent release series.
"""
import argparse
import os
import shutil
import tempfile

import pandas as pd
import synapseclient

from genie import process_functions
from genie import create_case_lists
from genie import dashboard_table_updater


# Run time functions
def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> None:
    """
    Rewrite a meta file in place, swapping one version string for another.

    Args:
        meta_file_path (str): Location of the meta file to rewrite.
        old_version (str): Version text to look for.
        new_version (str): Version text substituted for every occurrence.

    Returns:
        None
    """
    # The file is read completely before it is reopened (and truncated) for writing.
    with open(meta_file_path, "r") as reader:
        original_text = reader.read()
    with open(meta_file_path, "w") as writer:
        writer.write(original_text.replace(old_version, new_version))


def store_file(
    syn: synapseclient.Synapse, new_path: str, new_release_synid: str, release_name: str
) -> None:
    """
    Upload a local file to a Synapse release folder.

    The Synapse entity name is the file's base name with any
    ``_<release_name>`` fragment removed; the uploaded content comes from
    ``new_path`` unchanged.

    Args:
        syn (synapseclient.Synapse): The Synapse client object.
        new_path (str): Local path of the file to upload.
        new_release_synid (str): Synapse ID of the folder that receives the file.
        release_name (str): Release name to strip from the entity name.

    Returns:
        None
    """
    release_suffix = f"_{release_name}"
    # Strip the suffix from the full path first, then take the base name.
    stripped_path = new_path.replace(release_suffix, "")
    file_entity = synapseclient.File(
        new_path, name=os.path.basename(stripped_path), parentId=new_release_synid
    )
    syn.store(file_entity)


def _removed_values(before, after) -> list:
    """Return the values of ``before`` that no longer appear in ``after``."""
    # ``Series.unique()`` returns a numpy array, which has no ``.isin``;
    # wrap it in a Series so the membership test is available.
    before_series = pd.Series(before)
    return before_series[~before_series.isin(after)].tolist()


def _keep_seq_assay_rows(seq_assay_ids, remove_seqassays, remove_centers) -> list:
    """Flag each SEQ_ASSAY_ID that is neither removed nor prefixed by a removed center."""
    center_prefixes = tuple(remove_centers)
    return [
        seq not in remove_seqassays and not seq.startswith(center_prefixes)
        for seq in seq_assay_ids
    ]


def _patched_path(tempdir: str, ent_path: str, old_release: str, new_release: str) -> str:
    """Build the temp-dir path of a patched file, renaming old release to new."""
    return os.path.join(
        tempdir, os.path.basename(ent_path).replace(old_release, new_release)
    )


def _write_and_store(
    syn: "synapseclient.Synapse",
    text: str,
    path: str,
    new_release_synid: str,
    new_release: str,
) -> None:
    """Write ``text`` to ``path`` and upload the file to the new release folder."""
    with open(path, "w") as out_file:
        out_file.write(text)
    store_file(syn, path, new_release_synid, new_release)


def patch_release_workflow(
    release_synid: str, new_release_synid: str, retracted_sample_synid: str
):
    """
    Create a patch release by removing retracted samples from a release.

    Every release file is downloaded from ``release_synid``, filtered down to
    the samples/patients that are kept, and stored in ``new_release_synid``.
    Case lists are regenerated and the dashboard tables are updated at the end.

    Args:
        release_synid (str): Synapse id of the release folder to patch.
        new_release_synid (str): Synapse id of the new release folder.
            The folder MUST be created on Synapse first.
        retracted_sample_synid (str): Synapse id of the samples-to-retract
            CSV (must have a SAMPLE_ID column). Example: syn27734047
            (12.3 consortium). If you are creating the 11.1-public patch,
            you will be using the 12.3-consortium samples to retract file.

    Raises:
        KeyError: If an expected release file is missing from the release folder.
    """
    syn = synapseclient.login()

    # Extra centers / SEQ_ASSAY_IDs to drop. They start empty and are extended
    # below when the retraction removes every sample of a center or assay.
    remove_centers = []
    remove_seqassays = []
    old_release = syn.get(release_synid).name
    new_release = syn.get(new_release_synid).name
    # Data base mapping synid
    database_mapping_synid = "syn10967259"

    retracted_samples_ent = syn.get(retracted_sample_synid)
    retracted_samplesdf = pd.read_csv(retracted_samples_ent.path)
    release_files = syn.getChildren(release_synid)

    # Map release file name -> Synapse id
    file_mapping = {
        release_file["name"]: release_file["id"] for release_file in release_files
    }
    case_list_folder_synid = syn.store(
        synapseclient.Folder("case_lists", parentId=new_release_synid)
    ).id

    sample_synid = file_mapping["data_clinical_sample.txt"]
    patient_synid = file_mapping["data_clinical_patient.txt"]
    clinical_synid = file_mapping["data_clinical.txt"]
    cna_synid = file_mapping["data_CNA.txt"]
    fusion_synid = file_mapping["data_fusions.txt"]
    gene_synid = file_mapping["data_gene_matrix.txt"]
    maf_synid = file_mapping["data_mutations_extended.txt"]
    # Two possible file names are accepted for genomic information and seg data.
    genomic_info_synid = file_mapping.get("genie_combined.bed")
    if genomic_info_synid is None:
        genomic_info_synid = file_mapping["genomic_information.txt"]
    seg_synid = file_mapping.get("genie_public_data_cna_hg19.seg")
    if seg_synid is None:
        seg_synid = file_mapping["genie_private_data_cna_hg19.seg"]
    assay_info_synid = file_mapping["assay_information.txt"]

    # Sample and patient column to cBioPortal mappings
    mapping_table = syn.tableQuery("SELECT * FROM syn9621600")
    mapping = mapping_table.asDataFrame()

    # All patched files are written to a temporary directory that is removed
    # when the block exits, including when an error is raised.
    with tempfile.TemporaryDirectory() as tempdir:
        # Obtain samples retracted
        sample_ent = syn.get(sample_synid, followLink=True)
        sampledf = pd.read_csv(sample_ent.path, sep="\t", comment="#")
        # The center is the second "-"-separated token of PATIENT_ID.
        sampledf["CENTER"] = [
            patient.split("-")[1] for patient in sampledf.PATIENT_ID
        ]
        # Retract samples from SEQ_ASSAY_ID, CENTER and retract samples list
        sampledf = sampledf[~sampledf["SEQ_ASSAY_ID"].isin(remove_seqassays)]
        sampledf = sampledf[~sampledf["CENTER"].isin(remove_centers)]
        final_sampledf = sampledf[
            ~sampledf["SAMPLE_ID"].isin(retracted_samplesdf.SAMPLE_ID)
        ]
        # SEQ_ASSAY_IDs and centers that lost all of their samples must be
        # added to the removal lists so gene panels, genomic information and
        # assay information are filtered accordingly.
        remove_seqassays.extend(
            _removed_values(
                sampledf["SEQ_ASSAY_ID"].unique(),
                final_sampledf["SEQ_ASSAY_ID"].unique(),
            )
        )
        remove_centers.extend(
            _removed_values(
                sampledf["CENTER"].unique(), final_sampledf["CENTER"].unique()
            )
        )

        final_sampledf = final_sampledf.drop(columns="CENTER")

        keep_samples = final_sampledf["SAMPLE_ID"].drop_duplicates()
        keep_patients = final_sampledf["PATIENT_ID"].drop_duplicates()

        patient_ent = syn.get(patient_synid, followLink=True)
        patientdf = pd.read_csv(patient_ent.path, sep="\t", comment="#")
        patientdf = patientdf[patientdf["PATIENT_ID"].isin(keep_patients)]

        clinicaldf = final_sampledf.merge(patientdf, on="PATIENT_ID", how="outer")

        clin_ent = syn.get(clinical_synid, followLink=True)
        full_clin_df = pd.read_csv(clin_ent.path, sep="\t", comment="#")
        clinical_path = os.path.join(tempdir, os.path.basename(clin_ent.path))
        # GEN-646: Make sure to subset the clinical dataframe or else
        # There will be issues downstream. The dashboard code along with
        # public release code rely on the merged clinical file.
        full_clin_df = full_clin_df[full_clin_df["SAMPLE_ID"].isin(keep_samples)]
        full_clin_df.to_csv(clinical_path, sep="\t", index=False)
        store_file(syn, clinical_path, new_release_synid, new_release)

        sample_path = _patched_path(
            tempdir, sample_ent.path, old_release, new_release
        )
        patient_path = _patched_path(
            tempdir, patient_ent.path, old_release, new_release
        )
        # NOTE(review): sampledf.columns still includes the derived CENTER
        # column here -- confirm addClinicalHeaders expects that.
        process_functions.addClinicalHeaders(
            clinicaldf,
            mapping,
            patientdf.columns,
            sampledf.columns,
            sample_path,
            patient_path,
        )
        store_file(syn, sample_path, new_release_synid, new_release)
        store_file(syn, patient_path, new_release_synid, new_release)

        # Patch CNA file: keep Hugo_Symbol plus one column per kept sample
        cna_ent = syn.get(cna_synid, followLink=True)
        cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#")
        cna_cols = ["Hugo_Symbol"]
        cna_cols.extend(keep_samples.tolist())
        cna_cols_idx = cnadf.columns.isin(cna_cols)
        if not cna_cols_idx.all():
            cnadf = cnadf[cnadf.columns[cna_cols_idx]]
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(cnadf),
            _patched_path(tempdir, cna_ent.path, old_release, new_release),
            new_release_synid,
            new_release,
        )

        # Patch Fusion file
        fusion_ent = syn.get(fusion_synid, followLink=True)
        fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#")
        fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)]
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(fusiondf),
            _patched_path(tempdir, fusion_ent.path, old_release, new_release),
            new_release_synid,
            new_release,
        )

        # Patch SEG file
        seg_ent = syn.get(seg_synid, followLink=True)
        segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#")
        segdf = segdf[segdf.ID.isin(keep_samples)]
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(segdf),
            _patched_path(tempdir, seg_ent.path, old_release, new_release),
            new_release_synid,
            new_release,
        )

        # Patch gene matrix file
        gene_ent = syn.get(gene_synid, followLink=True)
        genedf = pd.read_csv(gene_ent.path, sep="\t", comment="#")
        genedf = genedf[genedf.SAMPLE_ID.isin(keep_samples)]
        # Missing values are written out as the literal string "NA".
        genedf[genedf.isnull()] = "NA"
        gene_path = _patched_path(tempdir, gene_ent.path, old_release, new_release)
        genedf.to_csv(gene_path, sep="\t", index=False)
        store_file(syn, gene_path, new_release_synid, new_release)

        # Patch maf file
        maf_ent = syn.get(maf_synid, followLink=True)
        mafdf = pd.read_csv(maf_ent.path, sep="\t", comment="#")
        mafdf = mafdf[mafdf["Tumor_Sample_Barcode"].isin(keep_samples)]
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(mafdf),
            _patched_path(tempdir, maf_ent.path, old_release, new_release),
            new_release_synid,
            new_release,
        )

        # Patch genomic information file
        genome_info_ent = syn.get(genomic_info_synid, followLink=True)
        genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#")
        genome_info_df = genome_info_df[
            _keep_seq_assay_rows(
                genome_info_df["SEQ_ASSAY_ID"], remove_seqassays, remove_centers
            )
        ]
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(genome_info_df),
            _patched_path(tempdir, genome_info_ent.path, old_release, new_release),
            new_release_synid,
            new_release,
        )

        # Create cBioPortal gene panel and meta files
        for name, synid in file_mapping.items():
            if name.startswith("data_gene_panel"):
                seq_name = name.replace("data_gene_panel_", "").replace(".txt", "")
                # Gene panels of removed SEQ_ASSAY_IDs are not carried over.
                if seq_name not in remove_seqassays:
                    gene_panel_ent = syn.get(synid, followLink=True)
                    new_panel_path = _patched_path(
                        tempdir, gene_panel_ent.path, old_release, new_release
                    )
                    shutil.copyfile(gene_panel_ent.path, new_panel_path)
                    store_file(syn, new_panel_path, new_release_synid, new_release)
            elif name.startswith("meta") or "_meta_" in name:
                meta_ent = syn.get(synid, followLink=True)
                new_meta_path = os.path.join(
                    tempdir, os.path.basename(meta_ent.path)
                )
                shutil.copyfile(meta_ent.path, new_meta_path)
                # Point the copied meta file at the new release name.
                revise_meta_file(new_meta_path, old_release, new_release)
                store_file(syn, new_meta_path, new_release_synid, new_release)

        # Patch assay information file
        assay_ent = syn.get(assay_info_synid, followLink=True)
        assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#")
        assaydf = assaydf[
            _keep_seq_assay_rows(
                assaydf["SEQ_ASSAY_ID"], remove_seqassays, remove_centers
            )
        ]
        assay_path = _patched_path(tempdir, assay_ent.path, old_release, new_release)
        _write_and_store(
            syn,
            process_functions.removePandasDfFloat(assaydf),
            assay_path,
            new_release_synid,
            new_release,
        )

        # Create cBioPortal case lists
        case_list_path = os.path.join(tempdir, "case_lists")
        os.makedirs(case_list_path, exist_ok=True)
        create_case_lists.main(
            clinical_path, assay_path, case_list_path, "genie_private"
        )
        for case_filename in os.listdir(case_list_path):
            case_path = os.path.join(case_list_path, case_filename)
            store_file(syn, case_path, case_list_folder_synid, new_release)

    # Update dashboard tables
    # You may have to execute this twice in case the file view isn't updated,
    # so a first "limit 1" query is issued and its result is discarded.
    syn.tableQuery(f"select * from {database_mapping_synid} limit 1")
    database_mapping = syn.tableQuery(f"select * from {database_mapping_synid}")
    database_mappingdf = database_mapping.asDataFrame()
    dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release)


def main():
    """
    Parse the command-line arguments and run the patch release workflow.

    Three positional Synapse ids are expected, in this order:
    ``release_synid``, ``new_release_synid`` and ``retracted_sample_synid``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Create a patch release: remove retracted samples from an "
            "existing release and store the patched files in Synapse."
        )
    )

    parser.add_argument(
        "release_synid",
        type=str,
        help="The Synapse Id of the consortium release folder",
    )
    parser.add_argument(
        "new_release_synid",
        type=str,
        help=(
            "The Synapse Id of the new release folder "
            "(the folder must be created on Synapse first)"
        ),
    )
    parser.add_argument(
        "retracted_sample_synid",
        type=str,
        help="The Synapse Id of the samples_to_retract.csv file generated in the current 3rd consortium release.",
    )

    args = parser.parse_args()

    patch_release_workflow(
        release_synid=args.release_synid,
        new_release_synid=args.new_release_synid,
        retracted_sample_synid=args.retracted_sample_synid,
    )


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()