Skip to content

Commit

Permalink
feat: add a rule to convert kmers table to PLINK
Browse files Browse the repository at this point in the history
* use kmers_table_to_bed function from kmersGWAS library
* add parameters for kmers_table_to_bed on config.yaml
* add comments to common.smk
  • Loading branch information
akcorut committed Mar 21, 2023
1 parent 373b406 commit 8c4bfad
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 36 deletions.
39 changes: 31 additions & 8 deletions .test/config_ecoli/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ settings:
# ----------------------------------------------------------------------

kmers_gwas:
# TODO Convert kmers table to PLINK
# PLINK-conversion:
# # Set to true in order to convert k-mers table to PLINK format (.bed)
# activate: false
# Convert kmers table to PLINK .bed format
kmers_table_to_bed:
# Set to true in order to convert k-mers table to PLINK format (.bed)
activate: true

use_kmers_kinship:
# Set to true in order to use k-mers based kinship matrix
Expand Down Expand Up @@ -167,13 +167,39 @@ params:
extra: ""

# ----------------------------------------------------------------------
# merge_kemrs
# merge_kmers Params
# ----------------------------------------------------------------------

merge_kmers:
# Number of threads for kmc
threads: 8

# ----------------------------------------------------------------------
# convert_kmers_table_to_plink Params
# ----------------------------------------------------------------------

kmers_table_to_bed:
# Phenotypes to use (based on the phenos.tsv file)
# Only the individuals that exist in the given
# phenotype file will be used
phenos: ["resistence"]
# Number of threads for kmers_table_to_bed
threads: 32
# k-mer length (should be between 15-31)
kmer_len: 31
# Minor allele count
# (Default is: 5)
minor_allele_count: 5
# Minor allele frequency
# (Default is: 0.05)
minor_allele_freq: 0.05
# Maximal number of variants in each PLINK bed file
# Set by -b option
# Use a very large number if you want to avoid splitting
max_num_var: 10000000000
# Set to true in order to keep only unique presence/absence patterns
only_unique: true

# ----------------------------------------------------------------------
# kmersGWAS Params
# ----------------------------------------------------------------------
Expand All @@ -192,9 +218,6 @@ params:
# Minor allele frequency
# (Default is: 0.05)
minor_allele_freq: 0.05

# Maximal number of variants in each PLINK bed file
max_num_var: 10000000

# Number of k-mers to filter from first step
# (Default is: 10001)
Expand Down
39 changes: 31 additions & 8 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ settings:
# ----------------------------------------------------------------------

kmers_gwas:
# TODO Convert kmers table to PLINK
# PLINK-conversion:
# Convert kmers table to PLINK .bed format
kmers_table_to_bed:
# Set to true in order to convert k-mers table to PLINK format (.bed)
# activate: false
activate: false

use_kmers_kinship:
# Set to true in order to use k-mers based kinship matrix
Expand Down Expand Up @@ -163,16 +163,42 @@ params:
# counted. This parameter depends on the coverage, but should be
# the same for all the individuals.
count_thresh: 2

# Additional parameters for kmc
extra: ""

# ----------------------------------------------------------------------
# merge_kmers
# merge_kmers Params
# ----------------------------------------------------------------------

merge_kmers:
# Number of threads for kmc
threads: 8

# ----------------------------------------------------------------------
# convert_kmers_table_to_plink Params
# ----------------------------------------------------------------------

kmers_table_to_bed:
# Phenotypes to use (based on the phenos.tsv file)
# Only the individuals that exist in the given
# phenotype file will be used
phenos: ["resistence"]
# Number of threads for kmers_table_to_bed
threads: 32
# k-mer length (should be between 15-31)
kmer_len: 31
# Minor allele count
# (Default is: 5)
minor_allele_count: 5
# Minor allele frequency
# (Default is: 0.05)
minor_allele_freq: 0.05
# Maximal number of variants in each PLINK bed file
# Set by -b option
# Use a very large number if you want to avoid splitting
max_num_var: 10000000
# Set to true in order to keep only unique presence/absence patterns
only_unique: true

# ----------------------------------------------------------------------
# kmersGWAS Params
Expand All @@ -192,9 +218,6 @@ params:
# Minor allele frequency
# (Default is: 0.05)
minor_allele_freq: 0.05

# Maximal number of variants in each PLINK bed file
max_num_var: 10000000

# Number of k-mers to filter from first step
# (Default is: 10001)
Expand Down
65 changes: 46 additions & 19 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,21 @@ KMERSGWAS_BIN_PATH = os.path.join(KMERSGWAS_DIR, "bin/")
# Common Helper Functions
# =================================================================================================

def sra_only(sample, library):
# Check if the sample is SRA only
def sra_only(sample, library):
return pd.isnull(samples.loc[(sample, library), "fq1"]) and pd.isnull(samples.loc[(sample, library), "fq2"]) \
and not pd.isnull(samples.loc[(sample, library), "sra"])

def is_sra_se(sample, library):
# Check if the sample is SRA only and single-end
def is_sra_se(sample, library):
return sra_only(sample, library) and config["settings"]["single_end"]

def is_sra_pe(sample, library):
# Check if the sample is SRA only and paired-end
def is_sra_pe(sample, library):
return sra_only(sample, library) and not config["settings"]["single_end"]

def get_individual_fastq(wildcards):
# Get individual FASTQ files
def get_individual_fastq(wildcards):
# Adapted from: https://github.com/snakemake-workflows/chipseq
"""Get individual raw FASTQ files from samples sheet."""
fastqs = samples.loc[(wildcards.sample, wildcards.library), ["fq1", "fq2"]].dropna()
Expand All @@ -174,7 +178,8 @@ def get_individual_fastq(wildcards):
else:
return samples.loc[ (wildcards.sample, wildcards.library), "fq2" ]

def get_fastqs(wildcards):
# Get FASTQ files
def get_fastqs(wildcards):
# Adapted from: https://github.com/snakemake-workflows/chipseq
"""Get raw FASTQ files from samples sheet."""
fastqs = samples.loc[(wildcards.sample, wildcards.library), ["fq1", "fq2"]].dropna()
Expand All @@ -190,6 +195,7 @@ def get_fastqs(wildcards):
u = samples.loc[ (wildcards.sample, wildcards.library), ["fq1", "fq2"] ].dropna()
return [ f"{u.fq1}", f"{u.fq2}" ]

# Check if FASTQ files are gzipped
def ends_with_gz(samp_tab):
fqs = samp_tab.loc[:, ['fq1']]
if not pd.isnull(fqs.fq1).all():
Expand All @@ -200,7 +206,8 @@ def ends_with_gz(samp_tab):
else:
return False

def get_multiqc_input(wildcards):
# Get MultiQC input
def get_multiqc_input(wildcards):
"""Get multiqc input."""
multiqc_input = []
multiqc_input.extend(
Expand All @@ -222,18 +229,12 @@ def get_multiqc_input(wildcards):
)
return multiqc_input

def get_phenos(wildcards):
# Get phenotypes
def get_phenos(wildcards):
"""Get fastq files using samples sheet."""
pheno_names = phenos.loc[(wildcards.pheno), ["pheno_path"]].dropna()
return {"pheno_path": pheno_names.pheno_path}

def get_input_path_for_generate_input_lists():
"""Get input path of reads to create input lists."""
if config["settings"]["trimming"]["activate"]:
return "results/trimmed"
else:
return "results/reads"

def get_plink_prefix():
"""Get prefix fopr the SNPs plink file."""
plink_path = config["settings"]["kmers_gwas"]["use_snps_kinship"]["snps_plink"]
Expand Down Expand Up @@ -289,18 +290,25 @@ def get_target_output(wildcards):
Get all requested inputs (target outputs) for rule all.
"""

align_kmers = config["settings"]["align_kmers"]["activate"]
align_reads_with_kmers = config["settings"]["align_reads_with_kmers"]["activate"]
assemble_reads_with_kmers = config["settings"]["assemble_reads"]["activate"]
blast_assembled_reads = config["settings"]["blast_contigs"]["activate"]
# Activate/Deactivate rules
align_kmers = config["settings"]["align_kmers"]["activate"] # Activate align kmers
align_reads_with_kmers = config["settings"]["align_reads_with_kmers"]["activate"] # Activate align reads with kmers
assemble_reads_with_kmers = config["settings"]["assemble_reads"]["activate"] # Activate assemble reads with kmers
blast_assembled_reads = config["settings"]["blast_contigs"]["activate"] # Activate BLAST assembled reads
convert_kmers_table_to_plink = config["settings"]["kmers_gwas"]["kmers_table_to_bed"]["activate"] # Activate convert kmers table to plink

target_output = []
target_output = [] # List of target outputs

### Add target outputs for rule all ###

# MultiQC
target_output.extend(
expand(
"results/qc/multiqc.html"
)
),

# KMC, k-mers stats
target_output.extend(
expand(
[
Expand All @@ -313,46 +321,65 @@ def get_target_output(wildcards):
]
)
),

# Combine and filter k-mers
target_output.extend(
expand(
"results/plots/kmers_list/kmer_allele_counts.pdf"
)
),

# Convert the k-mers table to PLINK
if convert_kmers_table_to_plink:
# If convert_kmers_table_to_plink is activated
target_output.extend(
expand(
"results/kmers_table/plink/pheno_{pheno}/convert_kmers_table_to_plink.done",
pheno= config["params"]["kmers_table_to_bed"]["phenos"]
)
),

# kmersGWAS
target_output.extend(
expand(
"results/kmers_gwas/{pheno}/kmers/pass_threshold_5per",
pheno= pheno_names
)
),

# Align k-mers
if align_kmers:
target_output.extend(
expand(
"results/align_kmers/align_kmers.done"
)
),

# Align reads with k-mers
if align_reads_with_kmers:
target_output.extend(
expand(
"results/align_reads_with_kmers/align_reads_with_kmers.done"
)
),

# Assemble reads with k-mers
if assemble_reads_with_kmers:
target_output.extend(
expand(
"results/align_contigs/align_contigs.done"
)
),

# BLAST assembled reads
if blast_assembled_reads:
target_output.extend(
expand(
"results/blast_contigs/blast_contigs.done"
)
),

# Return target outputs
return target_output

# =================================================================================================
41 changes: 40 additions & 1 deletion workflow/rules/generate_kmers_table.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,44 @@ rule create_kmers_table:
"""

# =================================================================================================
# TODO: Convert the k-mers table to PLINK binary file
# Convert the k-mers table to PLINK binary file
# =================================================================================================

rule convert_kmers_table_to_plink:
input:
kmers_table = "results/kmers_table/kmers_table.table",
phenotype = lambda wildcards: phenos.loc[(wildcards.pheno), ["pheno_path"]],
kmersGWAS_bin = rules.extract_kmersGWAS.output.kmersGWAS_bin,
output:
bed = "results/kmers_table/plink/pheno_{pheno}/kmers_table.presence_absence.0.bed",
done = touch("results/kmers_table/plink/pheno_{pheno}/convert_kmers_table_to_plink.done"),
params:
input_prefix = lambda w, input: os.path.splitext(input.kmers_table)[0],
output_prefix = lambda w, output: os.path.splitext(output.bed)[0].rsplit(".", 1)[0],
kmer_len = config["params"]["kmers_table_to_bed"]["kmer_len"],
max_num_var = config["params"]["kmers_table_to_bed"]["max_num_var"],
mac = config["params"]["kmers_table_to_bed"]["minor_allele_count"],
maf = config["params"]["kmers_table_to_bed"]["minor_allele_freq"],
only_unique = "-u" if config["params"]["kmers_table_to_bed"]["only_unique"] else "",
conda:
"../envs/kmers_gwas.yaml"
log:
"logs/create_kmers_table/convert_kmers_table_to_plink.pheno_{pheno}.log"
message:
"Converting the k-mers table to PLINK binary file..."
shell:
"""
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib
{input.kmersGWAS_bin}/kmers_table_to_bed \
-t {params.input_prefix} \
-p {input.phenotype} \
-o {params.output_prefix} \
-k {params.kmer_len} \
--maf {params.maf} --mac {params.mac} \
-b {params.max_num_var} \
{params.only_unique} \
>>{log} 2>&1
"""

# =================================================================================================

0 comments on commit 8c4bfad

Please sign in to comment.