Merge pull request #76 from js2264/master

Add `--binning`, `--zoomify` and `--balancing` arguments to pipeline
koszullab · Aug 24, 2023 · 00f37df · 00f37df
2 parents ef278d2 + 749c895
commit 00f37df
Show file tree

Hide file tree

Showing 8 changed files with 131 additions and 10 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -21,9 +21,9 @@ jobs:
     - name: Install dependencies
       run: |
         conda config --add channels bioconda
-        conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest  pytest-cov pylint codecov mappy
+        conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy
         pip install -Ur requirements.txt
-        pip install cooler pytest-pylint
+        pip install pytest-pylint
     - name: Lint and test
       run: |
         pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff

diff --git a/hicstuff/commands.py b/hicstuff/commands.py
@@ -743,8 +743,9 @@ class Pipeline(AbstractCommand):
         pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
                  [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
                  [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
-                 [--quality-min=30] [--read-len=INT] [--remove-centromeres=0] [--size=0]
-                 [--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
+                 [--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30] 
+                 [--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq] 
+                 [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
 
     arguments:
         input1:             Forward fastq file, if start_stage is "fastq", sam
@@ -811,6 +812,12 @@ class Pipeline(AbstractCommand):
                                       at different steps of the pipeline.
         -P, --prefix=STR              Overrides default filenames and prefixes all
                                       output files with a custom name.
+        -b,--binning=INT              Bin the resulting matrix to a given resolution
+                                      (only used if `--matfmt cool")
+        -z, --zoomify                 Zoomify binned cool matrix 
+                                      (only used if mat_fmt == "cool" and binning is set)
+        -B, --balancing_args=STR      Arguments to pass to `cooler balance` 
+                                      (default: "") (only used if zoomify == True)
         -q, --quality-min=INT         Minimum mapping quality for selecting
                                       contacts. [default: 30].
         -r, --remove-centromeres=INT  Integer. Number of kb that will be remove around
@@ -858,6 +865,12 @@ def execute(self):
         if not self.args["--outdir"]:
             self.args["--outdir"] = os.getcwd()
 
+        if not self.args["--binning"]:
+            self.args["--binning"] = "0"
+
+        if not self.args["--balancing_args"]:
+            self.args["--balancing_args"] = ""
+
         if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
             logger.error("matfmt must be either bg2, cool or graal.")
             raise ValueError
@@ -879,6 +892,9 @@ def execute(self):
             force=self.args["--force"],
             mapping=self.args["--mapping"],
             mat_fmt=self.args["--matfmt"],
+            binning=int(self.args["--binning"]),
+            zoomify=self.args["--zoomify"],
+            balancing_args=self.args["--balancing_args"],
             min_qual=int(self.args["--quality-min"]),
             min_size=int(self.args["--size"]),
             no_cleanup=self.args["--no-cleanup"],

diff --git a/hicstuff/digest.py b/hicstuff/digest.py
@@ -112,7 +112,7 @@ def write_frag_info(
                     frag_length = len(frag)
                     if frag_length > 0:
                         end_pos = start_pos + frag_length
-                        gc_content = SeqUtils.GC(frag) / 100.0
+                        gc_content = SeqUtils.gc_fraction(frag) / 100.0
 
                         current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                             current_id,

diff --git a/hicstuff/io.py b/hicstuff/io.py
@@ -1166,7 +1166,7 @@ def gc_bins(genome_path, frags):
         seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)]
         # Fill GC values for bins in the chromosome
         idx = np.flatnonzero(mask)
-        gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0
+        gc_bins[idx] = np.array(list(map(SeqUtils.gc_fraction, seqs))) / 100.0
 
     return gc_bins
 

diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
@@ -13,6 +13,7 @@
 import subprocess as sp
 import gzip
 from Bio import SeqIO
+import cooler
 import pandas as pd
 import numpy as np
 import pysam as ps
@@ -339,6 +340,67 @@ def pairs2cool(pairs_file, cool_file, bins_file):
     os.remove(bins_tmp)
 
 
+def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
+    """
+    Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
+    
+    Parameters
+    ----------
+
+    pairs_file : str
+        Path to the pairs file containing input contact data.
+    cool_file : str
+        Path to the output cool file name to generate.
+    binning : int
+        If mat_fmt is set to "cool", the cool file will be further binned to 
+        this resolution.
+    info_contigs : pathlib.Path or str
+        The path to the contigs info file
+    """
+
+    # Get chrom sizes
+    contigs = pd.read_csv(info_contigs, sep="\t")
+    chroms = dict(zip(contigs.contig, contigs.length))
+
+    # Save chrom sizes as separate file
+    chroms_tmp = info_contigs + ".chroms"
+    pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None)
+
+    # Run `cooler cload pairs`
+    cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ")
+    sp.call(cooler_cmd + [chroms_tmp+":"+str(binning), pairs_file, cool_file], shell=False)
+    os.remove(chroms_tmp)
+
+
+def cool2mcool(cool_file, output_file, balance_args):
+    """
+    Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
+    
+    Parameters
+    ----------
+
+    cool_file : str
+        Path to an existing binned .cool file.
+    output_file : str
+        Path to the new multi-resolution .mcool file.
+    balance_args : str
+        Arguments passed to `cooler balance` (default: "")
+    """
+
+    # Get resolutions
+    clr = cooler.Cooler(cool_file)
+    res = clr.binsize
+    multires = [res, res*2, res*5, res*10, res*20, res*50, res*100]
+
+    # Run cooler zoomify
+    cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000)
+
+    # Balance 
+    for res in multires: 
+        cooler_cmd = "cooler balance".split(" ")
+        sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False)
+
+
 def pairs2matrix(
     pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
 ):
@@ -468,7 +530,10 @@ def full_pipeline(
     filter_events=False,
     force=False,
     mapping="normal",
-    mat_fmt="graal",
+    mat_fmt="cool",
+    binning=0,
+    zoomify=True,
+    balancing_args="",
     min_qual=30,
     min_size=0,
     no_cleanup=False,
@@ -497,6 +562,13 @@ def full_pipeline(
     input2 : str
         Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
         in BAM format, or None, depending on the value of start_stage.
+    binning : int
+        If mat_fmt is set to "cool", the cool file will be further binned to 
+        this resolution.
+    zoomify : bool
+        Whether to zoomify binned cool matrix (only used if mat_fmt == "cool" and binning is set)
+    balancing_args : str
+        Arguments to pass to `chromosight balance` (default: None) (only used if zoomify == True)
     enzyme : int or strtest_data/genome/seq.fa
     circular : bool
         Use if the genome is circular.
@@ -940,6 +1012,14 @@ def _out_file(fname):
         # Name matrix file in .cool
         cool_file = os.path.splitext(mat)[0] + ".cool"
         pairs2cool(use_pairs, cool_file, fragments_list)
+
+        if (binning > 0):
+            binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool"
+            pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs)
+            if (zoomify is True):
+                mcool_file = os.path.splitext(mat)[0] + ".mcool"
+                cool2mcool(binned_cool_file, mcool_file, balancing_args)
+
     else:
         pairs2matrix(
             use_pairs,
@@ -958,7 +1038,7 @@ def _out_file(fname):
             pairs_filtered,
             bam1,
             bam2,
-            pairs_pcr,
+            # pairs_pcr,
             tmp_genome,
         ]
         # Do not delete files that were given as input

diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ requests
 scikit-learn
 pysam
 pyfastx
+cooler
diff --git a/setup.py b/setup.py
@@ -64,5 +64,5 @@
     install_requires=REQUIREMENTS,
     long_description_content_type="text/markdown",
     entry_points={"console_scripts": ["hicstuff=hicstuff.main:main"]},
-    extras_require={"mappy": ["mappy"], "cooler": ["cooler"]},
+    extras_require={"mappy": ["mappy"]},
 )
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
@@ -73,6 +73,26 @@ def test_full_pipeline_frags():
         )
 
 
+def test_full_pipeline_frags_with_balancing():
+    mapping_to_enzyme = {
+        'normal': "DpnII",
+        'cutsite': "DpnII,HpaII"
+    }
+    for mapping, enzyme in mapping_to_enzyme.items():
+        hpi.full_pipeline(
+            input1="test_data/sample.reads_for.fastq.gz",
+            input2="test_data/sample.reads_rev.fastq.gz",
+            genome="test_data/genome/seq",
+            enzyme=enzyme,
+            mapping=mapping,
+            out_dir="test_out",
+            plot=True,
+            pcr_duplicates=True,
+            filter_events=True,
+            no_cleanup=True,
+            force=True,
+        )
+
 
 def test_full_pipeline_frags_gzipped_genome():
     mapping_to_enzyme = {
@@ -86,6 +106,10 @@ def test_full_pipeline_frags_gzipped_genome():
             genome="test_data/genome/seq.fa.gz",
             enzyme=enzyme,
             mapping=mapping,
+            mat_fmt="cool",
+            binning = 1000, 
+            zoomify = True, 
+            balancing_args = "--max-iters 10 --min-nnz 10",
             out_dir="test_out",
             plot=True,
             pcr_duplicates=True,
@@ -122,7 +146,7 @@ def test_full_pipeline_bin(mapping, aligner):
             prefix="test",
             distance_law=True,
             start_stage=stage,
-            mat_fmt="cooler",
+            mat_fmt="cool",
             no_cleanup=True,
             force=True,
         )
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,4 @@ requests @@
     scikit-learn
     pysam
     pyfastx
+    cooler