diff --git a/.test/config_ecoli/config.yaml b/.test/config_ecoli/config.yaml index eae0a1b..7bfff2e 100644 --- a/.test/config_ecoli/config.yaml +++ b/.test/config_ecoli/config.yaml @@ -37,10 +37,10 @@ settings: # ---------------------------------------------------------------------- kmers_gwas: - # TODO Convert kmers table to PLINK - # PLINK-conversion: - # # Set to true in order to convert k-mers table to PLINK format (.bed) - # activate: false + # Convert kmers table to PLINK .bed format + kmers_table_to_bed: + # Set to true in order to convert k-mers table to PLINK format (.bed) + activate: true use_kmers_kinship: # Set to true in order to use k-mers based kinship matrix @@ -167,13 +167,39 @@ params: extra: "" # ---------------------------------------------------------------------- - # merge_kemrs + # merge_kmers Params # ---------------------------------------------------------------------- merge_kmers: # Number of threads for kmc threads: 8 + # ---------------------------------------------------------------------- + # convert_kmers_table_to_plink Params + # ---------------------------------------------------------------------- + + kmers_table_to_bed: + # Phenotypes to use (based on the phenos.tsv file) + # Only the individuals that exist in the given + # phenotype file will be used + phenos: ["resistence"] + # Number of threads for kmers_table_to_bed + threads: 32 + # k-mer length (should be between 15-31) + kmer_len: 31 + # Minor allele count + # (Default is: 5) + minor_allele_count: 5 + # Minor allele frequency + # (Default is: 0.05) + minor_allele_freq: 0.05 + # Maximal number of variants in each PLINK bed file + # Set by -b option + # Use a very large number if you want to avoid splitting + max_num_var: 10000000000 + # Set to true in order to keep only unique presence/absence patterns + only_unique: true + # ---------------------------------------------------------------------- # kmersGWAS Params # ---------------------------------------------------------------------- @@ -192,9 +218,6 @@ params: # Minor allele frequency # (Default is: 0.05) minor_allele_freq: 0.05 - - # Maximal number of variants in each PLINK bed file - max_num_var: 10000000 # Number of k-mers to filter from first step # (Default is: 10001) diff --git a/config/config.yaml b/config/config.yaml index 35eaa1c..77434e8 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -37,10 +37,10 @@ settings: # ---------------------------------------------------------------------- kmers_gwas: - # TODO Convert kmers table to PLINK - # PLINK-conversion: + # Convert kmers table to PLINK .bed format + kmers_table_to_bed: # Set to true in order to convert k-mers table to PLINK format (.bed) - # activate: false + activate: false use_kmers_kinship: # Set to true in order to use k-mers based kinship matrix @@ -163,16 +163,42 @@ params: # counted. This parameter depends on the coverage, but should be # the same for all the individuals. count_thresh: 2 - + # Additional parameters for kmc extra: "" # ---------------------------------------------------------------------- - # merge_kmers + # merge_kmers Params # ---------------------------------------------------------------------- merge_kmers: # Number of threads for kmc threads: 8 + + # ---------------------------------------------------------------------- + # convert_kmers_table_to_plink Params + # ---------------------------------------------------------------------- + + kmers_table_to_bed: + # Phenotypes to use (based on the phenos.tsv file) + # Only the individuals that exist in the given + # phenotype file will be used + phenos: ["resistence"] + # Number of threads for kmers_table_to_bed + threads: 32 + # k-mer length (should be between 15-31) + kmer_len: 31 + # Minor allele count + # (Default is: 5) + minor_allele_count: 5 + # Minor allele frequency + # (Default is: 0.05) + minor_allele_freq: 0.05 + # Maximal number of variants in each PLINK bed file + # Set by -b option + # Use a very large number if you want to avoid splitting + max_num_var: 10000000 + # Set to true in order to keep only unique presence/absence patterns + only_unique: true # ---------------------------------------------------------------------- # kmersGWAS Params @@ -192,9 +218,6 @@ params: # Minor allele frequency # (Default is: 0.05) minor_allele_freq: 0.05 - - # Maximal number of variants in each PLINK bed file - max_num_var: 10000000 # Number of k-mers to filter from first step # (Default is: 10001) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 582de0d..0f60f0a 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -144,17 +144,21 @@ KMERSGWAS_BIN_PATH = os.path.join(KMERSGWAS_DIR, "bin/") # Common Helper Functions # ================================================================================================= -def sra_only(sample, library): +# Check if the sample is SRA only +def sra_only(sample, library): return pd.isnull(samples.loc[(sample, library), "fq1"]) and pd.isnull(samples.loc[(sample, library), "fq2"]) \ and not pd.isnull(samples.loc[(sample, library), "sra"]) -def is_sra_se(sample, library): +# Check if the sample is SRA only and single-end +def is_sra_se(sample, library): return sra_only(sample, library) and config["settings"]["single_end"] -def is_sra_pe(sample, library): +# Check if the sample is SRA only and paired-end +def is_sra_pe(sample, library): return sra_only(sample, library) and not config["settings"]["single_end"] -def get_individual_fastq(wildcards): +# Get individual FASTQ files +def get_individual_fastq(wildcards): # Adapted from: https://github.com/snakemake-workflows/chipseq """Get individual raw FASTQ files from samples sheet.""" fastqs = samples.loc[(wildcards.sample, wildcards.library), ["fq1", "fq2"]].dropna() @@ -174,7 +178,8 @@ def get_individual_fastq(wildcards): else: return samples.loc[ (wildcards.sample, wildcards.library), "fq2" ] -def get_fastqs(wildcards): +# Get FASTQ files +def get_fastqs(wildcards): # Adapted from: https://github.com/snakemake-workflows/chipseq """Get raw FASTQ files from samples sheet.""" fastqs = samples.loc[(wildcards.sample, wildcards.library), ["fq1", "fq2"]].dropna() @@ -190,6 +195,7 @@ def get_fastqs(wildcards): u = samples.loc[ (wildcards.sample, wildcards.library), ["fq1", "fq2"] ].dropna() return [ f"{u.fq1}", f"{u.fq2}" ] +# Check if FASTQ files are gzipped def ends_with_gz(samp_tab): fqs = samp_tab.loc[:, ['fq1']] if not pd.isnull(fqs.fq1).all(): @@ -200,7 +206,8 @@ def ends_with_gz(samp_tab): else: return False -def get_multiqc_input(wildcards): +# Get MultiQC input +def get_multiqc_input(wildcards): """Get multiqc input.""" multiqc_input = [] multiqc_input.extend( @@ -222,18 +229,12 @@ def get_multiqc_input(wildcards): ) return multiqc_input -def get_phenos(wildcards): +# Get phenotypes +def get_phenos(wildcards): """Get fastq files using samples sheet.""" pheno_names = phenos.loc[(wildcards.pheno), ["pheno_path"]].dropna() return {"pheno_path": pheno_names.pheno_path} -def get_input_path_for_generate_input_lists(): - """Get input path of reads to create input lists.""" - if config["settings"]["trimming"]["activate"]: - return "results/trimmed" - else: - return "results/reads" - def get_plink_prefix(): """Get prefix fopr the SNPs plink file.""" plink_path = config["settings"]["kmers_gwas"]["use_snps_kinship"]["snps_plink"] @@ -289,18 +290,25 @@ def get_target_output(wildcards): Get all requested inputs (target outputs) for rule all. """ - align_kmers = config["settings"]["align_kmers"]["activate"] - align_reads_with_kmers = config["settings"]["align_reads_with_kmers"]["activate"] - assemble_reads_with_kmers = config["settings"]["assemble_reads"]["activate"] - blast_assembled_reads = config["settings"]["blast_contigs"]["activate"] + # Activate/Deactivate rules + align_kmers = config["settings"]["align_kmers"]["activate"] # Activate align kmers + align_reads_with_kmers = config["settings"]["align_reads_with_kmers"]["activate"] # Activate align reads with kmers + assemble_reads_with_kmers = config["settings"]["assemble_reads"]["activate"] # Activate assemble reads with kmers + blast_assembled_reads = config["settings"]["blast_contigs"]["activate"] # Activate BLAST assembled reads + convert_kmers_table_to_plink = config["settings"]["kmers_gwas"]["kmers_table_to_bed"]["activate"] # Activate convert kmers table to plink - target_output = [] + target_output = [] # List of target outputs + ### Add target outputs for rule all ### + + # MultiQC target_output.extend( expand( "results/qc/multiqc.html" ) ), + + # KMC, k-mers stats target_output.extend( expand( [ @@ -313,11 +321,25 @@ def get_target_output(wildcards): ] ) ), + + # Combine and filter k-mers target_output.extend( expand( "results/plots/kmers_list/kmer_allele_counts.pdf" ) ), + + # Convert the k-mers table to PLINK + if convert_kmers_table_to_plink: + # If convert_kmers_table_to_plink is activated + target_output.extend( + expand( + "results/kmers_table/plink/pheno_{pheno}/convert_kmers_table_to_plink.done", + pheno= config["params"]["kmers_table_to_bed"]["phenos"] + ) + ), + + # kmersGWAS target_output.extend( expand( "results/kmers_gwas/{pheno}/kmers/pass_threshold_5per", @@ -325,6 +347,7 @@ def get_target_output(wildcards): ) ), + # Align k-mers if align_kmers: target_output.extend( expand( @@ -332,6 +355,7 @@ def get_target_output(wildcards): ) ), + # Align reads with k-mers if align_reads_with_kmers: target_output.extend( expand( @@ -339,6 +363,7 @@ def get_target_output(wildcards): ) ), + # Assemble reads with k-mers if assemble_reads_with_kmers: target_output.extend( expand( @@ -346,6 +371,7 @@ def get_target_output(wildcards): ) ), + # BLAST assembled reads if blast_assembled_reads: target_output.extend( expand( @@ -353,6 +379,7 @@ def get_target_output(wildcards): ) ), + # Return target outputs return target_output # ================================================================================================= \ No newline at end of file diff --git a/workflow/rules/generate_kmers_table.smk b/workflow/rules/generate_kmers_table.smk index 8cc1061..e5c8a5b 100644 --- a/workflow/rules/generate_kmers_table.smk +++ b/workflow/rules/generate_kmers_table.smk @@ -29,5 +29,44 @@ rule create_kmers_table: """ # ================================================================================================= -# TODO: Convert the k-mers table to PLINK binary file +# Convert the k-mers table to PLINK binary file +# ================================================================================================= + +rule convert_kmers_table_to_plink: + input: + kmers_table = "results/kmers_table/kmers_table.table", + phenotype = lambda wildcards: phenos.loc[(wildcards.pheno), ["pheno_path"]], + kmersGWAS_bin = rules.extract_kmersGWAS.output.kmersGWAS_bin, + output: + bed = "results/kmers_table/plink/pheno_{pheno}/kmers_table.presence_absence.0.bed", + done = touch("results/kmers_table/plink/pheno_{pheno}/convert_kmers_table_to_plink.done"), + params: + input_prefix = lambda w, input: os.path.splitext(input.kmers_table)[0], + output_prefix = lambda w, output: os.path.splitext(output.bed)[0].rsplit(".", 1)[0], + kmer_len = config["params"]["kmers_table_to_bed"]["kmer_len"], + max_num_var = config["params"]["kmers_table_to_bed"]["max_num_var"], + mac = config["params"]["kmers_table_to_bed"]["minor_allele_count"], + maf = config["params"]["kmers_table_to_bed"]["minor_allele_freq"], + only_unique = "-u" if config["params"]["kmers_table_to_bed"]["only_unique"] else "", + conda: + "../envs/kmers_gwas.yaml" + log: + "logs/create_kmers_table/convert_kmers_table_to_plink.pheno_{pheno}.log" + message: + "Converting the k-mers table to PLINK binary file..." + shell: + """ + export LD_LIBRARY_PATH=$CONDA_PREFIX/lib + + {input.kmersGWAS_bin}/kmers_table_to_bed \ + -t {params.input_prefix} \ + -p {input.phenotype} \ + -o {params.output_prefix} \ + -k {params.kmer_len} \ + --maf {params.maf} --mac {params.mac} \ + -b {params.max_num_var} \ + {params.only_unique} \ + >>{log} 2>&1 + """ + # ================================================================================================= \ No newline at end of file