Skip to content

Commit

Permalink
Merge pull request #76 from js2264/master
Browse files Browse the repository at this point in the history
Add `--binning`, `--zoomify` and `--balancing` arguments to pipeline
  • Loading branch information
js2264 authored Aug 24, 2023
2 parents ef278d2 + 749c895 commit 00f37df
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 10 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
- name: Install dependencies
run: |
conda config --add channels bioconda
conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest pytest-cov pylint codecov mappy
conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy
pip install -Ur requirements.txt
pip install cooler pytest-pylint
pip install pytest-pylint
- name: Lint and test
run: |
pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff
Expand Down
20 changes: 18 additions & 2 deletions hicstuff/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,9 @@ class Pipeline(AbstractCommand):
pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
[--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
[--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
[--quality-min=30] [--read-len=INT] [--remove-centromeres=0] [--size=0]
[--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
[--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30]
[--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq]
[--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
arguments:
input1: Forward fastq file, if start_stage is "fastq", sam
Expand Down Expand Up @@ -811,6 +812,12 @@ class Pipeline(AbstractCommand):
at different steps of the pipeline.
-P, --prefix=STR Overrides default filenames and prefixes all
output files with a custom name.
-b,--binning=INT Bin the resulting matrix to a given resolution
(only used if `--matfmt cool")
-z, --zoomify Zoomify binned cool matrix
(only used if mat_fmt == "cool" and binning is set)
-B, --balancing_args=STR Arguments to pass to `cooler balance`
(default: "") (only used if zoomify == True)
-q, --quality-min=INT Minimum mapping quality for selecting
contacts. [default: 30].
-r, --remove-centromeres=INT Integer. Number of kb that will be remove around
Expand Down Expand Up @@ -858,6 +865,12 @@ def execute(self):
if not self.args["--outdir"]:
self.args["--outdir"] = os.getcwd()

if not self.args["--binning"]:
self.args["--binning"] = "0"

if not self.args["--balancing_args"]:
self.args["--balancing_args"] = ""

if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
logger.error("matfmt must be either bg2, cool or graal.")
raise ValueError
Expand All @@ -879,6 +892,9 @@ def execute(self):
force=self.args["--force"],
mapping=self.args["--mapping"],
mat_fmt=self.args["--matfmt"],
binning=int(self.args["--binning"]),
zoomify=self.args["--zoomify"],
balancing_args=self.args["--balancing_args"],
min_qual=int(self.args["--quality-min"]),
min_size=int(self.args["--size"]),
no_cleanup=self.args["--no-cleanup"],
Expand Down
2 changes: 1 addition & 1 deletion hicstuff/digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def write_frag_info(
frag_length = len(frag)
if frag_length > 0:
end_pos = start_pos + frag_length
gc_content = SeqUtils.GC(frag) / 100.0
gc_content = SeqUtils.gc_fraction(frag) / 100.0

current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
current_id,
Expand Down
2 changes: 1 addition & 1 deletion hicstuff/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,7 @@ def gc_bins(genome_path, frags):
seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)]
# Fill GC values for bins in the chromosome
idx = np.flatnonzero(mask)
gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0
gc_bins[idx] = np.array(list(map(SeqUtils.gc_fraction, seqs))) / 100.0

return gc_bins

Expand Down
84 changes: 82 additions & 2 deletions hicstuff/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import subprocess as sp
import gzip
from Bio import SeqIO
import cooler
import pandas as pd
import numpy as np
import pysam as ps
Expand Down Expand Up @@ -339,6 +340,67 @@ def pairs2cool(pairs_file, cool_file, bins_file):
os.remove(bins_tmp)


def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
"""
Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
info_contigs : pathlib.Path or str
The path to the contigs info file
"""

# Get chrom sizes
contigs = pd.read_csv(info_contigs, sep="\t")
chroms = dict(zip(contigs.contig, contigs.length))

# Save chrom sizes as separate file
chroms_tmp = info_contigs + ".chroms"
pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None)

# Run `cooler cload pairs`
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ")
sp.call(cooler_cmd + [chroms_tmp+":"+str(binning), pairs_file, cool_file], shell=False)
os.remove(chroms_tmp)


def cool2mcool(cool_file, output_file, balance_args):
"""
Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
cool_file : str
Path to an existing binned .cool file.
output_file : str
Path to the new multi-resolution .mcool file.
balance_args : str
Arguments passed to `cooler balance` (default: "")
"""

# Get resolutions
clr = cooler.Cooler(cool_file)
res = clr.binsize
multires = [res, res*2, res*5, res*10, res*20, res*50, res*100]

# Run cooler zoomify
cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000)

# Balance
for res in multires:
cooler_cmd = "cooler balance".split(" ")
sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False)


def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
Expand Down Expand Up @@ -468,7 +530,10 @@ def full_pipeline(
filter_events=False,
force=False,
mapping="normal",
mat_fmt="graal",
mat_fmt="cool",
binning=0,
zoomify=True,
balancing_args="",
min_qual=30,
min_size=0,
no_cleanup=False,
Expand Down Expand Up @@ -497,6 +562,13 @@ def full_pipeline(
input2 : str
Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
in BAM format, or None, depending on the value of start_stage.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
zoomify : bool
Whether to zoomify binned cool matrix (only used if mat_fmt == "cool" and binning is set)
balancing_args : str
Arguments to pass to `chromosight balance` (default: None) (only used if zoomify == True)
enzyme : int or strtest_data/genome/seq.fa
circular : bool
Use if the genome is circular.
Expand Down Expand Up @@ -940,6 +1012,14 @@ def _out_file(fname):
# Name matrix file in .cool
cool_file = os.path.splitext(mat)[0] + ".cool"
pairs2cool(use_pairs, cool_file, fragments_list)

if (binning > 0):
binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool"
pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs)
if (zoomify is True):
mcool_file = os.path.splitext(mat)[0] + ".mcool"
cool2mcool(binned_cool_file, mcool_file, balancing_args)

else:
pairs2matrix(
use_pairs,
Expand All @@ -958,7 +1038,7 @@ def _out_file(fname):
pairs_filtered,
bam1,
bam2,
pairs_pcr,
# pairs_pcr,
tmp_genome,
]
# Do not delete files that were given as input
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ requests
scikit-learn
pysam
pyfastx
cooler
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@
install_requires=REQUIREMENTS,
long_description_content_type="text/markdown",
entry_points={"console_scripts": ["hicstuff=hicstuff.main:main"]},
extras_require={"mappy": ["mappy"], "cooler": ["cooler"]},
extras_require={"mappy": ["mappy"]},
)
26 changes: 25 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,26 @@ def test_full_pipeline_frags():
)


def test_full_pipeline_frags_with_balancing():
mapping_to_enzyme = {
'normal': "DpnII",
'cutsite': "DpnII,HpaII"
}
for mapping, enzyme in mapping_to_enzyme.items():
hpi.full_pipeline(
input1="test_data/sample.reads_for.fastq.gz",
input2="test_data/sample.reads_rev.fastq.gz",
genome="test_data/genome/seq",
enzyme=enzyme,
mapping=mapping,
out_dir="test_out",
plot=True,
pcr_duplicates=True,
filter_events=True,
no_cleanup=True,
force=True,
)


def test_full_pipeline_frags_gzipped_genome():
mapping_to_enzyme = {
Expand All @@ -86,6 +106,10 @@ def test_full_pipeline_frags_gzipped_genome():
genome="test_data/genome/seq.fa.gz",
enzyme=enzyme,
mapping=mapping,
mat_fmt="cool",
binning = 1000,
zoomify = True,
balancing_args = "--max-iters 10 --min-nnz 10",
out_dir="test_out",
plot=True,
pcr_duplicates=True,
Expand Down Expand Up @@ -122,7 +146,7 @@ def test_full_pipeline_bin(mapping, aligner):
prefix="test",
distance_law=True,
start_stage=stage,
mat_fmt="cooler",
mat_fmt="cool",
no_cleanup=True,
force=True,
)
Expand Down

0 comments on commit 00f37df

Please sign in to comment.