Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --binning, --zoomify and --balancing arguments to pipeline #76

Merged
merged 17 commits into from
Aug 24, 2023
Merged
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
- name: Install dependencies
run: |
conda config --add channels bioconda
conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest pytest-cov pylint codecov mappy
conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy
pip install -Ur requirements.txt
pip install cooler pytest-pylint
pip install pytest-pylint
- name: Lint and test
run: |
pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff
Expand Down
20 changes: 18 additions & 2 deletions hicstuff/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,9 @@ class Pipeline(AbstractCommand):
pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
[--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
[--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
[--quality-min=30] [--read-len=INT] [--remove-centromeres=0] [--size=0]
[--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
[--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30]
[--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq]
[--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]

arguments:
input1: Forward fastq file, if start_stage is "fastq", sam
Expand Down Expand Up @@ -811,6 +812,12 @@ class Pipeline(AbstractCommand):
at different steps of the pipeline.
-P, --prefix=STR Overrides default filenames and prefixes all
output files with a custom name.
-b,--binning=INT Bin the resulting matrix to a given resolution
(only used if `--matfmt cool")
-z, --zoomify Zoomify binned cool matrix
(only used if mat_fmt == "cool" and binning is set)
-B, --balancing_args=STR Arguments to pass to `cooler balance`
(default: "") (only used if zoomify == True)
-q, --quality-min=INT Minimum mapping quality for selecting
contacts. [default: 30].
-r, --remove-centromeres=INT Integer. Number of kb that will be remove around
Expand Down Expand Up @@ -858,6 +865,12 @@ def execute(self):
if not self.args["--outdir"]:
self.args["--outdir"] = os.getcwd()

if not self.args["--binning"]:
self.args["--binning"] = "0"

if not self.args["--balancing_args"]:
self.args["--balancing_args"] = ""

if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
logger.error("matfmt must be either bg2, cool or graal.")
raise ValueError
Expand All @@ -879,6 +892,9 @@ def execute(self):
force=self.args["--force"],
mapping=self.args["--mapping"],
mat_fmt=self.args["--matfmt"],
binning=int(self.args["--binning"]),
zoomify=self.args["--zoomify"],
balancing_args=self.args["--balancing_args"],
min_qual=int(self.args["--quality-min"]),
min_size=int(self.args["--size"]),
no_cleanup=self.args["--no-cleanup"],
Expand Down
2 changes: 1 addition & 1 deletion hicstuff/digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def write_frag_info(
frag_length = len(frag)
if frag_length > 0:
end_pos = start_pos + frag_length
gc_content = SeqUtils.GC(frag) / 100.0
gc_content = SeqUtils.gc_fraction(frag) / 100.0

current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
current_id,
Expand Down
2 changes: 1 addition & 1 deletion hicstuff/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,7 @@ def gc_bins(genome_path, frags):
seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)]
# Fill GC values for bins in the chromosome
idx = np.flatnonzero(mask)
gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0
gc_bins[idx] = np.array(list(map(SeqUtils.gc_fraction, seqs))) / 100.0

return gc_bins

Expand Down
84 changes: 82 additions & 2 deletions hicstuff/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import subprocess as sp
import gzip
from Bio import SeqIO
import cooler
import pandas as pd
import numpy as np
import pysam as ps
Expand Down Expand Up @@ -339,6 +340,67 @@ def pairs2cool(pairs_file, cool_file, bins_file):
os.remove(bins_tmp)


def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
"""
Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.

Parameters
----------

pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
info_contigs : pathlib.Path or str
The path to the contigs info file
"""

# Get chrom sizes
contigs = pd.read_csv(info_contigs, sep="\t")
chroms = dict(zip(contigs.contig, contigs.length))

# Save chrom sizes as separate file
chroms_tmp = info_contigs + ".chroms"
pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None)

# Run `cooler cload pairs`
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ")
sp.call(cooler_cmd + [chroms_tmp+":"+str(binning), pairs_file, cool_file], shell=False)
os.remove(chroms_tmp)


def cool2mcool(cool_file, output_file, balance_args):
"""
Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.

Parameters
----------

cool_file : str
Path to an existing binned .cool file.
output_file : str
Path to the new multi-resolution .mcool file.
balance_args : str
Arguments passed to `cooler balance` (default: "")
"""

# Get resolutions
clr = cooler.Cooler(cool_file)
res = clr.binsize
multires = [res, res*2, res*5, res*10, res*20, res*50, res*100]

# Run cooler zoomify
cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000)

# Balance
for res in multires:
cooler_cmd = "cooler balance".split(" ")
sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False)


def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
Expand Down Expand Up @@ -468,7 +530,10 @@ def full_pipeline(
filter_events=False,
force=False,
mapping="normal",
mat_fmt="graal",
mat_fmt="cool",
binning=0,
zoomify=True,
balancing_args="",
min_qual=30,
min_size=0,
no_cleanup=False,
Expand Down Expand Up @@ -497,6 +562,13 @@ def full_pipeline(
input2 : str
Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
in BAM format, or None, depending on the value of start_stage.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
zoomify : bool
Whether to zoomify binned cool matrix (only used if mat_fmt == "cool" and binning is set)
balancing_args : str
Arguments to pass to `chromosight balance` (default: None) (only used if zoomify == True)
enzyme : int or strtest_data/genome/seq.fa
circular : bool
Use if the genome is circular.
Expand Down Expand Up @@ -940,6 +1012,14 @@ def _out_file(fname):
# Name matrix file in .cool
cool_file = os.path.splitext(mat)[0] + ".cool"
pairs2cool(use_pairs, cool_file, fragments_list)

if (binning > 0):
binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool"
pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs)
if (zoomify is True):
mcool_file = os.path.splitext(mat)[0] + ".mcool"
cool2mcool(binned_cool_file, mcool_file, balancing_args)

else:
pairs2matrix(
use_pairs,
Expand All @@ -958,7 +1038,7 @@ def _out_file(fname):
pairs_filtered,
bam1,
bam2,
pairs_pcr,
# pairs_pcr,
tmp_genome,
]
# Do not delete files that were given as input
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ requests
scikit-learn
pysam
pyfastx
cooler
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@
install_requires=REQUIREMENTS,
long_description_content_type="text/markdown",
entry_points={"console_scripts": ["hicstuff=hicstuff.main:main"]},
extras_require={"mappy": ["mappy"], "cooler": ["cooler"]},
extras_require={"mappy": ["mappy"]},
)
26 changes: 25 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,26 @@ def test_full_pipeline_frags():
)


def test_full_pipeline_frags_with_balancing():
mapping_to_enzyme = {
'normal': "DpnII",
'cutsite': "DpnII,HpaII"
}
for mapping, enzyme in mapping_to_enzyme.items():
hpi.full_pipeline(
input1="test_data/sample.reads_for.fastq.gz",
input2="test_data/sample.reads_rev.fastq.gz",
genome="test_data/genome/seq",
enzyme=enzyme,
mapping=mapping,
out_dir="test_out",
plot=True,
pcr_duplicates=True,
filter_events=True,
no_cleanup=True,
force=True,
)


def test_full_pipeline_frags_gzipped_genome():
mapping_to_enzyme = {
Expand All @@ -86,6 +106,10 @@ def test_full_pipeline_frags_gzipped_genome():
genome="test_data/genome/seq.fa.gz",
enzyme=enzyme,
mapping=mapping,
mat_fmt="cool",
binning = 1000,
zoomify = True,
balancing_args = "--max-iters 10 --min-nnz 10",
out_dir="test_out",
plot=True,
pcr_duplicates=True,
Expand Down Expand Up @@ -122,7 +146,7 @@ def test_full_pipeline_bin(mapping, aligner):
prefix="test",
distance_law=True,
start_stage=stage,
mat_fmt="cooler",
mat_fmt="cool",
no_cleanup=True,
force=True,
)
Expand Down