From d34726fca19c0668498c3b07ae3bd370e812f3ba Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 12:27:52 -0400 Subject: [PATCH 01/24] initial peppy imports working --- rules/common.smk | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index f6b9635..63feb61 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,13 +1,36 @@ import pandas as pd +from peppy import Project, SNAKEMAKE_CONFIG_KEY as pep_to_snake, SAMPLE_NAME_COLNAME as PEPPY_SAMPLE_COLUMN from snakemake.utils import validate +SAMPLE_COLUMN = "sample" + report: "../report/workflow.rst" ###### Config file and sample sheets ##### -configfile: "config.yaml" +p = Project("prjcfg.yaml") +#print("CONFIGFILE: {}".format(configfile)) +#configfile: "config.yaml" +configfile: getattr(p, pep_to_snake) +print("CONFIG: {}".format(config)) validate(config, schema="../schemas/config.schema.yaml") -samples = pd.read_table(config["samples"]).set_index("sample", drop=False) +sample_sheet_file = config["samples"] +dt = pd.read_table(sample_sheet_file) + +print("DT: {}".format(dt)) + +#samples = dt.set_index("sample", drop=False) +samples = p.sheet +if SAMPLE_COLUMN in samples.columns and SAMPLE_COLUMN in samples.columns: + raise Exception("Two sample identifier columns in samples sheet: {}".format(sample_sheet_file)) +samples.rename({PEPPY_SAMPLE_COLUMN: SAMPLE_COLUMN}, axis=1, inplace=True) + +print("SAMPLES: {}".format(samples)) + +print("CONFIG SAMPLES: {}".format(config["samples"])) + +#print("SAMPLES:\n{}".format("\n".join("{} ({})".format(str(s), type(s)) for s in samples))) + validate(samples, schema="../schemas/samples.schema.yaml") units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) From 59eb4d485e0bb847a6e7a1b8fadab1fc5f39dec1 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 15:31:03 -0400 Subject: [PATCH 02/24] more peppy interop --- config.yaml | 2 +- rules/common.smk | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index fc84797..46c2d58 100644 --- a/config.yaml +++ b/config.yaml @@ -4,7 +4,7 @@ units: units.tsv ref: name: GRCh38.86 # Path to the reference genome, ideally as it is provided by the GATK bundle. - genome: data/ref/genome.chr21.fa + genome: $GENOMES/hg38/hg38.fa # Path to any database of known variants, ideally as it is provided by the GATK bundle. known-variants: data/ref/dbsnp.vcf.gz diff --git a/rules/common.smk b/rules/common.smk index 63feb61..ae28c51 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,5 +1,7 @@ +import os import pandas as pd from peppy import Project, SNAKEMAKE_CONFIG_KEY as pep_to_snake, SAMPLE_NAME_COLNAME as PEPPY_SAMPLE_COLUMN +from peppy.utils import expandpath from snakemake.utils import validate SAMPLE_COLUMN = "sample" @@ -15,6 +17,7 @@ print("CONFIG: {}".format(config)) validate(config, schema="../schemas/config.schema.yaml") sample_sheet_file = config["samples"] +print("SAMPLE SHEET FILE: {}".format(sample_sheet_file)) dt = pd.read_table(sample_sheet_file) print("DT: {}".format(dt)) @@ -37,8 +40,12 @@ units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index validate(units, schema="../schemas/units.schema.yaml") +print("GENOME: {}".format(config["ref"]["genome"])) + # contigs in reference genome -contigs = pd.read_table(config["ref"]["genome"] + ".fai", +refgen = expandpath(config["ref"]["genome"]) +print("REFGEN: {}".format(refgen)) +contigs = pd.read_table(refgen + ".fai", header=None, usecols=[0], squeeze=True, dtype=str) From f92ae7c76882cf1f28df48f12230aebc90b8e09d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 16:11:20 -0400 Subject: [PATCH 03/24] set the index; use master config --- config.yaml | 2 +- rules/common.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index 46c2d58..fc84797 100644 --- a/config.yaml +++ b/config.yaml @@ -4,7 +4,7 @@ units: units.tsv ref: name: GRCh38.86 # Path to the reference genome, ideally as it is provided by the GATK bundle. - genome: $GENOMES/hg38/hg38.fa + genome: data/ref/genome.chr21.fa # Path to any database of known variants, ideally as it is provided by the GATK bundle. known-variants: data/ref/dbsnp.vcf.gz diff --git a/rules/common.smk b/rules/common.smk index ae28c51..8716a52 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -22,11 +22,11 @@ dt = pd.read_table(sample_sheet_file) print("DT: {}".format(dt)) -#samples = dt.set_index("sample", drop=False) samples = p.sheet if SAMPLE_COLUMN in samples.columns and SAMPLE_COLUMN in samples.columns: raise Exception("Two sample identifier columns in samples sheet: {}".format(sample_sheet_file)) samples.rename({PEPPY_SAMPLE_COLUMN: SAMPLE_COLUMN}, axis=1, inplace=True) +samples = dt.set_index(SAMPLE_COLUMN, drop=False) print("SAMPLES: {}".format(samples)) From 484976b559dc06a374b54dfecd3713d0521a146d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 16:17:31 -0400 Subject: [PATCH 04/24] cleanup --- rules/common.smk | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index 8716a52..0b4e6ff 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -10,41 +10,27 @@ report: "../report/workflow.rst" ###### Config file and sample sheets ##### p = Project("prjcfg.yaml") -#print("CONFIGFILE: {}".format(configfile)) -#configfile: "config.yaml" configfile: getattr(p, pep_to_snake) -print("CONFIG: {}".format(config)) validate(config, schema="../schemas/config.schema.yaml") sample_sheet_file = config["samples"] print("SAMPLE SHEET FILE: {}".format(sample_sheet_file)) dt = pd.read_table(sample_sheet_file) -print("DT: {}".format(dt)) - samples = p.sheet if SAMPLE_COLUMN in samples.columns and SAMPLE_COLUMN in samples.columns: raise Exception("Two sample identifier columns in samples sheet: {}".format(sample_sheet_file)) samples.rename({PEPPY_SAMPLE_COLUMN: SAMPLE_COLUMN}, axis=1, inplace=True) samples = dt.set_index(SAMPLE_COLUMN, drop=False) -print("SAMPLES: {}".format(samples)) - -print("CONFIG SAMPLES: {}".format(config["samples"])) - -#print("SAMPLES:\n{}".format("\n".join("{} ({})".format(str(s), type(s)) for s in samples))) - validate(samples, schema="../schemas/samples.schema.yaml") units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index validate(units, schema="../schemas/units.schema.yaml") -print("GENOME: {}".format(config["ref"]["genome"])) - # contigs in reference genome refgen = expandpath(config["ref"]["genome"]) -print("REFGEN: {}".format(refgen)) contigs = pd.read_table(refgen + ".fai", header=None, usecols=[0], squeeze=True, dtype=str) From b3661dc6bf799c985f3968b88bdfa3c73be54cfb Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 16:18:06 -0400 Subject: [PATCH 05/24] remove additional print --- rules/common.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/rules/common.smk b/rules/common.smk index 0b4e6ff..a023dc4 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -14,7 +14,6 @@ configfile: getattr(p, pep_to_snake) validate(config, schema="../schemas/config.schema.yaml") sample_sheet_file = config["samples"] -print("SAMPLE SHEET FILE: {}".format(sample_sheet_file)) dt = pd.read_table(sample_sheet_file) samples = p.sheet From ba4debb5471166a7e06e1d6b54b0faa204f10a6f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 13 Mar 2019 16:24:09 -0400 Subject: [PATCH 06/24] more cleanup --- rules/common.smk | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index a023dc4..bc0529a 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -4,8 +4,6 @@ from peppy import Project, SNAKEMAKE_CONFIG_KEY as pep_to_snake, SAMPLE_NAME_COL from peppy.utils import expandpath from snakemake.utils import validate -SAMPLE_COLUMN = "sample" - report: "../report/workflow.rst" ###### Config file and sample sheets ##### @@ -13,14 +11,10 @@ p = Project("prjcfg.yaml") configfile: getattr(p, pep_to_snake) validate(config, schema="../schemas/config.schema.yaml") -sample_sheet_file = config["samples"] -dt = pd.read_table(sample_sheet_file) - samples = p.sheet -if SAMPLE_COLUMN in samples.columns and SAMPLE_COLUMN in samples.columns: - raise Exception("Two sample identifier columns in samples sheet: {}".format(sample_sheet_file)) -samples.rename({PEPPY_SAMPLE_COLUMN: SAMPLE_COLUMN}, axis=1, inplace=True) -samples = dt.set_index(SAMPLE_COLUMN, drop=False) +if "sample" in samples.columns and PEPPY_SAMPLE_COLUMN in samples.columns: + raise Exception("Two sample identifier columns in samples sheet: {}".format(config["samples"])) +samples = samples.rename({PEPPY_SAMPLE_COLUMN: "sample"}, axis=1).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") From e1af9a167c7d9622ef5a02369dbeb0673501d5a7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 12:33:44 -0400 Subject: [PATCH 07/24] peppy files --- prjcfg.yaml | 6 ++++++ samples_peppy.tsv | 4 ++++ 2 files changed, 10 insertions(+) create mode 100644 prjcfg.yaml create mode 100644 samples_peppy.tsv diff --git a/prjcfg.yaml b/prjcfg.yaml new file mode 100644 index 0000000..739863e --- /dev/null +++ b/prjcfg.yaml @@ -0,0 +1,6 @@ +metadata: +# output_dir: $PROCESSED/microtest + sample_annotation: samples_peppy.tsv + +snake_config: "config.yaml" + diff --git a/samples_peppy.tsv b/samples_peppy.tsv new file mode 100644 index 0000000..cc9db9b --- /dev/null +++ b/samples_peppy.tsv @@ -0,0 +1,4 @@ +sample_name +A +B + From d5b6d468ba0310a56728f494518fab1c8334926e Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 13:30:45 -0400 Subject: [PATCH 08/24] minimize changes, shorten names --- rules/common.smk | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index bc0529a..65f88a0 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,20 +1,16 @@ import os import pandas as pd -from peppy import Project, SNAKEMAKE_CONFIG_KEY as pep_to_snake, SAMPLE_NAME_COLNAME as PEPPY_SAMPLE_COLUMN -from peppy.utils import expandpath -from snakemake.utils import validate - -report: "../report/workflow.rst" +from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL ###### Config file and sample sheets ##### p = Project("prjcfg.yaml") -configfile: getattr(p, pep_to_snake) +configfile: p.snake_config validate(config, schema="../schemas/config.schema.yaml") samples = p.sheet -if "sample" in samples.columns and PEPPY_SAMPLE_COLUMN in samples.columns: +if "sample" in samples.columns and PEP_SAMPLE_COL in samples.columns: raise Exception("Two sample identifier columns in samples sheet: {}".format(config["samples"])) -samples = samples.rename({PEPPY_SAMPLE_COLUMN: "sample"}, axis=1).set_index("sample", drop=False) +samples = samples.rename({PEP_SAMPLE_COL: "sample"}, axis=1).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") @@ -23,8 +19,7 @@ units.index = units.index.set_levels([i.astype(str) for i in units.index.levels] validate(units, schema="../schemas/units.schema.yaml") # contigs in reference genome -refgen = expandpath(config["ref"]["genome"]) -contigs = pd.read_table(refgen + ".fai", +contigs = pd.read_table(config["ref"]["genome"] + ".fai", header=None, usecols=[0], squeeze=True, dtype=str) From 5964ecf32f1849dc70eb10424dd614626ca13411 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 13:40:20 -0400 Subject: [PATCH 09/24] remove unused import --- rules/common.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/rules/common.smk b/rules/common.smk index 65f88a0..89a4d55 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,4 +1,3 @@ -import os import pandas as pd from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL From 6b54e14c9fba7186b546cf6d5c294abe4ac54c60 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 13:44:13 -0400 Subject: [PATCH 10/24] get back validate --- rules/common.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/rules/common.smk b/rules/common.smk index 89a4d55..195494d 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,5 +1,6 @@ import pandas as pd from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL +from snakemake.utils import validate ###### Config file and sample sheets ##### p = Project("prjcfg.yaml") From 37ded42fb821ba27cc41654ce203ab8037a8920e Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 16:44:21 -0400 Subject: [PATCH 11/24] need to check files entry --- prjcfg.yaml | 1 + rules/common.smk | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/prjcfg.yaml b/prjcfg.yaml index 739863e..d980e49 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,6 +1,7 @@ metadata: # output_dir: $PROCESSED/microtest sample_annotation: samples_peppy.tsv + sample_subannotation: units_peppy.tsv snake_config: "config.yaml" diff --git a/rules/common.smk b/rules/common.smk index 195494d..733f066 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -2,6 +2,25 @@ import pandas as pd from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL from snakemake.utils import validate + +def peppy_rename(df): + return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1) + + +def peppy_units(df): + if "unit" in df.columns: + return df + def count_names(names): + def go(rem, n, curr, acc): + if rem == []: + return acc + [n] + h, t = rem[0], rem[1:] + return go(t, n + 1, curr, acc) if h == curr else go(t, 1, h, acc + [n]) + return go(names[1:], 1, names[0], []) if names else [] + df.insert(1, "unit", [i for n in count_names(list(df[PEP_SAMPLE_COL])) for i in range(1, n + 1)]) + return df + + ###### Config file and sample sheets ##### p = Project("prjcfg.yaml") configfile: p.snake_config @@ -10,12 +29,17 @@ validate(config, schema="../schemas/config.schema.yaml") samples = p.sheet if "sample" in samples.columns and PEP_SAMPLE_COL in samples.columns: raise Exception("Two sample identifier columns in samples sheet: {}".format(config["samples"])) -samples = samples.rename({PEP_SAMPLE_COL: "sample"}, axis=1).set_index("sample", drop=False) +samples = peppy_rename(samples).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") -units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) +subann = peppy_rename(p.sample_subannotation).applymap(str) +units = peppy_units(subann).set_index(["sample", "unit"], drop=False) +#units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index + +print("UNITS:\n{}".format(units)) + validate(units, schema="../schemas/units.schema.yaml") # contigs in reference genome From ca76544e8edbb0436210133a27ceb0ffbb8a4c4d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 17:22:52 -0400 Subject: [PATCH 12/24] guards and cleanup --- rules/common.smk | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index 733f066..d31b956 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -4,6 +4,8 @@ from snakemake.utils import validate def peppy_rename(df): + if df is None: + return None return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1) @@ -33,13 +35,9 @@ samples = peppy_rename(samples).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") -subann = peppy_rename(p.sample_subannotation).applymap(str) -units = peppy_units(subann).set_index(["sample", "unit"], drop=False) -#units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) +units = peppy_units(peppy_rename(p.sample_subannotation)).set_index(["sample", "unit"], drop=False) units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index -print("UNITS:\n{}".format(units)) - validate(units, schema="../schemas/units.schema.yaml") # contigs in reference genome From e85a876a8cbd467697b73babd3212ba526c1ab25 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 18:04:28 -0400 Subject: [PATCH 13/24] clear unused KV in project config --- prjcfg.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/prjcfg.yaml b/prjcfg.yaml index d980e49..cee7a6b 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,5 +1,4 @@ metadata: -# output_dir: $PROCESSED/microtest sample_annotation: samples_peppy.tsv sample_subannotation: units_peppy.tsv From 6711da24446ea3aaaa5f1c5c69c7fc7b2dce67fb Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 18:04:51 -0400 Subject: [PATCH 14/24] condense and explain --- rules/common.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index d31b956..35414ab 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -4,12 +4,16 @@ from snakemake.utils import validate def peppy_rename(df): + """ Rename peppy's column for sample name identification to snakemake's. """ if df is None: return None + if "sample" in df.columns and PEP_SAMPLE_COL in df.columns: + raise Exception("Multiple sample identifier columns present: {}".format(", ".join(["sample", PEP_SAMPLE_COL]))) return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1) def peppy_units(df): + """ Add unit/subsample indices to peppy a data frame. """ if "unit" in df.columns: return df def count_names(names): @@ -29,8 +33,6 @@ configfile: p.snake_config validate(config, schema="../schemas/config.schema.yaml") samples = p.sheet -if "sample" in samples.columns and PEP_SAMPLE_COL in samples.columns: - raise Exception("Two sample identifier columns in samples sheet: {}".format(config["samples"])) samples = peppy_rename(samples).set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") From 6fde15f62635bdb41b6406aa173c8967747e3ee7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 14 Mar 2019 18:05:44 -0400 Subject: [PATCH 15/24] peppy-compatible subannotation / units sheet --- units_peppy.tsv | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 units_peppy.tsv diff --git a/units_peppy.tsv b/units_peppy.tsv new file mode 100644 index 0000000..98f46b9 --- /dev/null +++ b/units_peppy.tsv @@ -0,0 +1,4 @@ +sample_name unit platform fq1 fq2 +A 1 ILLUMINA data/reads/a.chr21.1.fq data/reads/a.chr21.2.fq +B 1 ILLUMINA data/reads/b.chr21.1.fq data/reads/b.chr21.2.fq +B 2 ILLUMINA data/reads/b.chr21.1.fq From bd8f36b19750180687a36466bd6f3d7bbc1c7a91 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 26 Apr 2019 17:39:36 -0400 Subject: [PATCH 16/24] see about adding units dynamically --- units_peppy.tsv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/units_peppy.tsv b/units_peppy.tsv index 98f46b9..57eaf1e 100644 --- a/units_peppy.tsv +++ b/units_peppy.tsv @@ -1,4 +1,4 @@ -sample_name unit platform fq1 fq2 -A 1 ILLUMINA data/reads/a.chr21.1.fq data/reads/a.chr21.2.fq -B 1 ILLUMINA data/reads/b.chr21.1.fq data/reads/b.chr21.2.fq -B 2 ILLUMINA data/reads/b.chr21.1.fq +sample_name platform fq1 fq2 +A ILLUMINA data/reads/a.chr21.1.fq data/reads/a.chr21.2.fq +B ILLUMINA data/reads/b.chr21.1.fq data/reads/b.chr21.2.fq +B ILLUMINA data/reads/b.chr21.1.fq From 6e491675f00011fe17353bb1ae6a908e44dfeaea Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 26 Apr 2019 17:39:51 -0400 Subject: [PATCH 17/24] using SnakeProject --- rules/common.smk | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/rules/common.smk b/rules/common.smk index 35414ab..0a68f2b 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,43 +1,18 @@ import pandas as pd -from peppy import Project, SAMPLE_NAME_COLNAME as PEP_SAMPLE_COL +from peppy import SnakeProject from snakemake.utils import validate -def peppy_rename(df): - """ Rename peppy's column for sample name identification to snakemake's. """ - if df is None: - return None - if "sample" in df.columns and PEP_SAMPLE_COL in df.columns: - raise Exception("Multiple sample identifier columns present: {}".format(", ".join(["sample", PEP_SAMPLE_COL]))) - return df.rename({PEP_SAMPLE_COL: "sample"}, axis=1) - - -def peppy_units(df): - """ Add unit/subsample indices to peppy a data frame. """ - if "unit" in df.columns: - return df - def count_names(names): - def go(rem, n, curr, acc): - if rem == []: - return acc + [n] - h, t = rem[0], rem[1:] - return go(t, n + 1, curr, acc) if h == curr else go(t, 1, h, acc + [n]) - return go(names[1:], 1, names[0], []) if names else [] - df.insert(1, "unit", [i for n in count_names(list(df[PEP_SAMPLE_COL])) for i in range(1, n + 1)]) - return df - - ###### Config file and sample sheets ##### -p = Project("prjcfg.yaml") +p = SnakeProject("prjcfg.yaml") configfile: p.snake_config validate(config, schema="../schemas/config.schema.yaml") -samples = p.sheet -samples = peppy_rename(samples).set_index("sample", drop=False) +samples = p.sample_table validate(samples, schema="../schemas/samples.schema.yaml") -units = peppy_units(peppy_rename(p.sample_subannotation)).set_index(["sample", "unit"], drop=False) +units = p.subsample_table units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index validate(units, schema="../schemas/units.schema.yaml") From 682bb5d99a7e188c7ccca50f2a1bf9bec3704ea1 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 12:56:12 -0400 Subject: [PATCH 18/24] use Snakemake naming --- samples_peppy.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples_peppy.tsv b/samples_peppy.tsv index cc9db9b..19b8191 100644 --- a/samples_peppy.tsv +++ b/samples_peppy.tsv @@ -1,4 +1,4 @@ -sample_name +sample A B From 0d2fc1f854afc45f6e60c584ad024ca5882f60fa Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 12:58:12 -0400 Subject: [PATCH 19/24] use base anns file due to identical content --- prjcfg.yaml | 2 +- samples_peppy.tsv | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 samples_peppy.tsv diff --git a/prjcfg.yaml b/prjcfg.yaml index cee7a6b..4243e5b 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,5 +1,5 @@ metadata: - sample_annotation: samples_peppy.tsv + sample_annotation: samples.tsv sample_subannotation: units_peppy.tsv snake_config: "config.yaml" diff --git a/samples_peppy.tsv b/samples_peppy.tsv deleted file mode 100644 index 19b8191..0000000 --- a/samples_peppy.tsv +++ /dev/null @@ -1,4 +0,0 @@ -sample -A -B - From e7301f3e7eb83fa475f4a30dcb800c4214fc115e Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 13:06:26 -0400 Subject: [PATCH 20/24] add prj cfg that uses the base files --- prjcfg_native.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 prjcfg_native.yaml diff --git a/prjcfg_native.yaml b/prjcfg_native.yaml new file mode 100644 index 0000000..f5d96fe --- /dev/null +++ b/prjcfg_native.yaml @@ -0,0 +1,6 @@ +metadata: + sample_annotation: samples.tsv + sample_subannotation: units.tsv + +snake_config: "config.yaml" + From 1750ab61711325fdc7899da69a4d63b17fb4dbb0 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 16:31:01 -0400 Subject: [PATCH 21/24] use the native encoding --- rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/common.smk b/rules/common.smk index 0a68f2b..415a81e 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -4,7 +4,7 @@ from snakemake.utils import validate ###### Config file and sample sheets ##### -p = SnakeProject("prjcfg.yaml") +p = SnakeProject("prjcfg_native.yaml") configfile: p.snake_config validate(config, schema="../schemas/config.schema.yaml") From 6192d29dbfcdcda4d766dd35b1857572fa2acdfe Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 29 Apr 2019 16:40:38 -0400 Subject: [PATCH 22/24] condense config files --- prjcfg.yaml | 2 +- prjcfg_native.yaml | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 prjcfg_native.yaml diff --git a/prjcfg.yaml b/prjcfg.yaml index 4243e5b..f5d96fe 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,6 +1,6 @@ metadata: sample_annotation: samples.tsv - sample_subannotation: units_peppy.tsv + sample_subannotation: units.tsv snake_config: "config.yaml" diff --git a/prjcfg_native.yaml b/prjcfg_native.yaml deleted file mode 100644 index f5d96fe..0000000 --- a/prjcfg_native.yaml +++ /dev/null @@ -1,6 +0,0 @@ -metadata: - sample_annotation: samples.tsv - sample_subannotation: units.tsv - -snake_config: "config.yaml" - From f728c3753fba886d59ce8d15911ffb70a6b52929 Mon Sep 17 00:00:00 2001 From: Vince Date: Fri, 21 Jun 2019 10:27:19 -0400 Subject: [PATCH 23/24] update to reflect peppy updates; https://github.com/snakemake-workflows/dna-seq-gatk-variant-calling/pull/8#issuecomment-504316445 --- prjcfg.yaml | 5 ++--- rules/common.smk | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/prjcfg.yaml b/prjcfg.yaml index f5d96fe..e5676aa 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,6 +1,5 @@ metadata: - sample_annotation: samples.tsv - sample_subannotation: units.tsv + sample_table: samples.tsv + sample_subtable: units.tsv snake_config: "config.yaml" - diff --git a/rules/common.smk b/rules/common.smk index 415a81e..73bf0ac 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -13,7 +13,6 @@ samples = p.sample_table validate(samples, schema="../schemas/samples.schema.yaml") units = p.subsample_table -units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index validate(units, schema="../schemas/units.schema.yaml") From 0e37cfbf3c046420adeb910b8d5cf96b5eb52034 Mon Sep 17 00:00:00 2001 From: Vince Date: Wed, 19 Feb 2020 15:10:25 -0500 Subject: [PATCH 24/24] fix name mistake; https://github.com/snakemake-workflows/dna-seq-gatk-variant-calling/pull/8/files#r380673348 --- prjcfg.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prjcfg.yaml b/prjcfg.yaml index e5676aa..d0aace4 100644 --- a/prjcfg.yaml +++ b/prjcfg.yaml @@ -1,5 +1,5 @@ metadata: sample_table: samples.tsv - sample_subtable: units.tsv + subsample_table: units.tsv snake_config: "config.yaml"