From 70382ea3fee8702b39c542cf0af2e54fedcca02f Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Thu, 24 Aug 2023 23:12:04 +0200
Subject: [PATCH 01/11] feat: switch to `mat_fmt=cool` by default (implying
 --zoomify & default balancing)

---
 hicstuff/commands.py | 18 +++++++++++-------
 hicstuff/pipeline.py | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/hicstuff/commands.py b/hicstuff/commands.py
index 4e8866c..01a5029 100644
--- a/hicstuff/commands.py
+++ b/hicstuff/commands.py
@@ -743,7 +743,7 @@ class Pipeline(AbstractCommand):
         pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
                  [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
                  [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
-                 [--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30] 
+                 [--binning=INT] [--zoomify=BOOL] [--balancing_args=STR] [--quality-min=30] 
                  [--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq] 
                  [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
 
@@ -802,7 +802,7 @@ class Pipeline(AbstractCommand):
                                       Can be "bg2" for 2D Bedgraph format,
                                       "cool" for Mirnylab's cooler software, or
                                       "graal" for graal-compatible plain text
-                                      COO format. [default: graal]
+                                      COO format. [default: cool]
         -n, --no-cleanup              If enabled, intermediary BED files will
                                       be kept after generating the contact map.
                                       Disabled by defaut.
@@ -812,9 +812,10 @@ class Pipeline(AbstractCommand):
                                       at different steps of the pipeline.
         -P, --prefix=STR              Overrides default filenames and prefixes all
                                       output files with a custom name.
-        -b,--binning=INT              Bin the resulting matrix to a given resolution
+        -b,--binning=INT              Bin the contact matrix to a given resolution. 
+                                      By default, the contact matrix is not binned. 
                                       (only used if `--matfmt cool")
-        -z, --zoomify                 Zoomify binned cool matrix 
+        -z, --zoomify=BOOL            Zoomify binned cool matrix [default: True]
                                       (only used if mat_fmt == "cool" and binning is set)
         -B, --balancing_args=STR      Arguments to pass to `cooler balance` 
                                       (default: "") (only used if zoomify == True)
@@ -868,8 +869,11 @@ def execute(self):
         if not self.args["--binning"]:
             self.args["--binning"] = "0"
 
+        if not self.args["--zoomify"]:
+            self.args["--zoomify"] = "True"
+            
         if not self.args["--balancing_args"]:
-            self.args["--balancing_args"] = ""
+            self.args["--balancing_args"] = None
 
         if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
             logger.error("matfmt must be either bg2, cool or graal.")
@@ -878,7 +882,7 @@ def execute(self):
         read_len = self.args["--read-len"]
         if read_len is not None:
             read_len = int(read_len)
-
+        
         hpi.full_pipeline(
             genome=self.args["--genome"],
             input1=self.args["<input1>"],
@@ -893,7 +897,7 @@ def execute(self):
             mapping=self.args["--mapping"],
             mat_fmt=self.args["--matfmt"],
             binning=int(self.args["--binning"]),
-            zoomify=self.args["--zoomify"],
+            zoomify=eval(self.args["--zoomify"]),
             balancing_args=self.args["--balancing_args"],
             min_qual=int(self.args["--quality-min"]),
             min_size=int(self.args["--size"]),
diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
index 0b014f2..6184063 100644
--- a/hicstuff/pipeline.py
+++ b/hicstuff/pipeline.py
@@ -533,7 +533,7 @@ def full_pipeline(
     mat_fmt="cool",
     binning=0,
     zoomify=True,
-    balancing_args="",
+    balancing_args=None,
     min_qual=30,
     min_size=0,
     no_cleanup=False,
@@ -746,6 +746,17 @@ def _out_file(fname):
     hcl.set_file_handler(log_file)
     generate_log_header(log_file, input1, input2, genome, enzyme)
 
+    # If defautl `mat_fmt` or set `mat_fmt=cool`, notify the user
+    if mat_fmt == 'cool':
+        try:
+            import cooler
+            logger.info("The default output format is now `.cool`. The Hi-C "
+                        "matrix will be generated with cooler v%s " 
+                        "(Abdennur & Mirny, Bioinformatics 2020).", 
+                        cooler.__version__
+                        )
+        except: raise
+    
     # If the user chose bowtie2 and supplied an index, extract fasta from it
     # For later steps of the pipeline (digestion / frag attribution)
     # Check if the genome is an index or fasta file

From 77f508c488142e40b5308f473f842ffdb941cf0b Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Thu, 24 Aug 2023 23:12:59 +0200
Subject: [PATCH 02/11] fix: workflow to bin -> zoomify and/or balance

---
 hicstuff/pipeline.py | 61 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
index 6184063..efea325 100644
--- a/hicstuff/pipeline.py
+++ b/hicstuff/pipeline.py
@@ -372,7 +372,7 @@ def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
     os.remove(chroms_tmp)
 
 
-def cool2mcool(cool_file, output_file, balance_args):
+def cool2mcool(cool_file, output_file):
     """
     Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
     
@@ -383,8 +383,6 @@ def cool2mcool(cool_file, output_file, balance_args):
         Path to an existing binned .cool file.
     output_file : str
         Path to the new multi-resolution .mcool file.
-    balance_args : str
-        Arguments passed to `cooler balance` (default: "")
     """
 
     # Get resolutions
@@ -395,10 +393,34 @@ def cool2mcool(cool_file, output_file, balance_args):
     # Run cooler zoomify
     cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000)
 
+def balance(cool_file, balancing_args):
+    """
+    Balance a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
+    
+    Parameters
+    ----------
+
+    cool_file : str
+        Path to an existing binned .(m)cool file.
+    balancing_args : str
+        Extra rguments passed to `cooler balance` (default: None)
+    """
+
     # Balance 
-    for res in multires: 
+    if (cooler.fileops.is_multires_file(cool_file)):
+        paths = cooler.fileops.list_coolers(cool_file)
+        for path in paths: 
+            cooler_cmd = "cooler balance".split(" ")
+            if balancing_args is not None:
+                sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file+"::"+path], shell=False)
+            else:
+                sp.call(cooler_cmd + [cool_file+"::"+path], shell=False)
+    else:
         cooler_cmd = "cooler balance".split(" ")
-        sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False)
+        if balancing_args is not None:
+            sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file], shell=False)
+        else:
+            sp.call(cooler_cmd + [cool_file], shell=False)
 
 
 def pairs2matrix(
@@ -1020,17 +1042,26 @@ def _out_file(fname):
 
     # Build matrix from pairs.
     if mat_fmt == "cool":
-        # Name matrix file in .cool
-        cool_file = os.path.splitext(mat)[0] + ".cool"
-        pairs2cool(use_pairs, cool_file, fragments_list)
-        
-        if (binning > 0):
-            binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool"
-            pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs)
-            if (zoomify is True):
+
+        # If binning is **not** set, parse the pairs into a **un-binned** cool
+        if (binning == 0):
+            # Name matrix file in .cool
+            mat = os.path.splitext(mat)[0] + ".cool"
+            pairs2cool(use_pairs, mat, fragments_list)
+
+        # If binning is set, proceed to bin the pairs instead
+        else:
+            pairs2binnedcool(use_pairs, mat, binning, info_contigs)
+
+            # If zoomify == True, zoomify binned cool
+            if zoomify:
                 mcool_file = os.path.splitext(mat)[0] + ".mcool"
-                cool2mcool(binned_cool_file, mcool_file, balancing_args)
-                
+                cool2mcool(mat, mcool_file)
+                mat = mcool_file
+            
+            # Balance binned matrix
+            balance(mat, balancing_args)
+            
     else:
         pairs2matrix(
             use_pairs,

From 58cc0b080f9d994029595d8762129b43faddce22 Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Thu, 24 Aug 2023 23:28:18 +0200
Subject: [PATCH 03/11] fix: make sure final pairs file is retained

---
 hicstuff/pipeline.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
index efea325..b3d7a1c 100644
--- a/hicstuff/pipeline.py
+++ b/hicstuff/pipeline.py
@@ -1043,15 +1043,19 @@ def _out_file(fname):
     # Build matrix from pairs.
     if mat_fmt == "cool":
 
+        # Name matrix file in .cool
+        mat = os.path.splitext(mat)[0] + ".cool"
+        
         # If binning is **not** set, parse the pairs into a **un-binned** cool
         if (binning == 0):
-            # Name matrix file in .cool
-            mat = os.path.splitext(mat)[0] + ".cool"
+            ## THIS NEEDS TO BE FIXED AT SOME POINT
             pairs2cool(use_pairs, mat, fragments_list)
 
         # If binning is set, proceed to bin the pairs instead
         else:
-            pairs2binnedcool(use_pairs, mat, binning, info_contigs)
+            cool_file = os.path.splitext(mat)[0] + ".cool"
+            pairs2binnedcool(use_pairs, cool_file, binning, info_contigs)
+            mat = cool_file
 
             # If zoomify == True, zoomify binned cool
             if zoomify:
@@ -1061,7 +1065,6 @@ def _out_file(fname):
             
             # Balance binned matrix
             balance(mat, balancing_args)
-            
     else:
         pairs2matrix(
             use_pairs,
@@ -1072,15 +1075,20 @@ def _out_file(fname):
             tmp_dir=tmp_dir,
         )
 
+    # Move final pairs file to main dir. 
+    p = pathlib.Path(use_pairs).absolute()
+    parent_dir = p.parents[1]
+    p.rename(parent_dir / p.name)
+
     # Clean temporary files
     if not no_cleanup:
         tempfiles = [
             pairs,
             pairs_idx,
             pairs_filtered,
+            pairs_pcr,
             bam1,
             bam2,
-            # pairs_pcr,
             tmp_genome,
         ]
         # Do not delete files that were given as input
@@ -1089,6 +1097,15 @@ def _out_file(fname):
             tempfiles.remove(input2)
         except ValueError:
             pass
+        
+        # Delete single-resolution matrix if `--zoomify` is set
+        if binning > 0 and zoomify: 
+            try:
+                tempfiles.append(cool_file)
+            except ValueError:
+                pass
+        
+        # Remove the rest of tempfiles
         for file in tempfiles:
             try:
                 os.remove(file)

From f3dffc51e16279ab7e890811410fa1000f2fe2df Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Thu, 24 Aug 2023 23:28:41 +0200
Subject: [PATCH 04/11] test: more tests for binning/zoomify/balance
 combinations

---
 tests/test_pipeline.py | 101 +++++++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 20 deletions(-)

diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 396579f..3637180 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -72,26 +72,87 @@ def test_full_pipeline_frags():
             force=True,
         )
 
-
-def test_full_pipeline_frags_with_balancing():
-    mapping_to_enzyme = {
-        'normal': "DpnII",
-        'cutsite': "DpnII,HpaII"
-    }
-    for mapping, enzyme in mapping_to_enzyme.items():
-        hpi.full_pipeline(
-            input1="test_data/sample.reads_for.fastq.gz",
-            input2="test_data/sample.reads_rev.fastq.gz",
-            genome="test_data/genome/seq",
-            enzyme=enzyme,
-            mapping=mapping,
-            out_dir="test_out",
-            plot=True,
-            pcr_duplicates=True,
-            filter_events=True,
-            no_cleanup=True,
-            force=True,
-        )
+# Testing:
+# no binning (no zoomify, no balancing)
+# binning: no zoomify (balancing)
+# binning: no zoomify, tweaked balancing
+# binning: zoomify, (balancing)
+# binning: zoomify, tweaked balancing
+
+def test_full_pipeline_frags_with_binning_balancing():
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=True,
+        force=True,
+    )
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=True,
+        force=True,
+        binning = 1000,
+        zoomify = False,
+    )
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=True,
+        force=True,
+        binning = 1000,
+        zoomify = False,
+        balancing_args="--min-nnz 1 --mad-max 1",
+    )
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=True,
+        force=True,
+        binning = 1000,
+    )
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=True,
+        force=True,
+        binning = 1000,
+        balancing_args="--min-nnz 1 --mad-max 1",
+    )
 
 
 def test_full_pipeline_frags_gzipped_genome():

From 3299ffb91956f8ea068d1b0822d7fdde6a7cc467 Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Mon, 28 Aug 2023 13:28:34 +0200
Subject: [PATCH 05/11] fix: np.array error from updated numpy

---
 hicstuff/distance_law.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hicstuff/distance_law.py b/hicstuff/distance_law.py
index 13cf2cd..25ee949 100644
--- a/hicstuff/distance_law.py
+++ b/hicstuff/distance_law.py
@@ -559,7 +559,7 @@ def normalize_distance_law(xs, ps, inf=3000, sup=None):
         List of ps each normalized separately.
     """
     # Sanity check: xs and ps have the same dimension
-    if np.shape(xs) != np.shape(ps):
+    if np.shape(np.asarray(xs, dtype="object")) != np.shape(np.asarray(ps, dtype="object")):
         logger.error("xs and ps should have the same dimension.")
         sys.exit(1)
     # Define the length of shortest chromosomes as a lower bound for the sup boundary

From a75f2c14752d36afea3242f27d54628dd29bf484 Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Mon, 28 Aug 2023 15:04:08 +0200
Subject: [PATCH 06/11] fix: added check for number of fastq/bam entries in
 each file

---
 hicstuff/io.py       | 63 ++++++++++++++++++++++++++++++++++++++++++++
 hicstuff/pipeline.py | 19 +++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/hicstuff/io.py b/hicstuff/io.py
index e606422..d06799c 100644
--- a/hicstuff/io.py
+++ b/hicstuff/io.py
@@ -6,6 +6,7 @@
 import bz2
 import io
 import os
+import pysam
 import functools
 import sys
 import numpy as np
@@ -1431,3 +1432,65 @@ def check_is_fasta(in_file):
         fasta = False
 
     return fasta
+
+def check_fastq_entries(in_file):
+    """
+    Check how many reads are in the input fastq. Requires zcat.
+    
+    Parameters
+    ----------
+    in_file : str
+        Path to the input file.
+
+    Returns
+    -------
+    int :
+        How many reads listed in the input fastq
+    """
+
+    with open(in_file, 'rb') as f:
+        is_gzip = f.read(2) == b'\x1f\x8b'
+    
+    if is_gzip:
+        n_lines = sp.run(
+            "zcat < {f} | wc -l".format(f = in_file),
+            stdout=sp.PIPE,
+            stderr=sp.PIPE,
+            shell = True, 
+            encoding = 'utf-8'
+        ).stdout.removesuffix("\n")
+    else:
+        n_lines = sp.run(
+            "wc -l {f}".format(f = in_file),
+            stdout=sp.PIPE,
+            stderr=sp.PIPE,
+            shell = True, 
+            encoding = 'utf-8'
+        ).stdout.removesuffix("\n")
+
+    n_reads = int(n_lines)/4
+    return n_reads
+
+def check_bam_entries(in_file):
+    """
+    Check how many reads are in the input bam
+    
+    Parameters
+    ----------
+    in_file : str
+        Path to the input file.
+
+    Returns
+    -------
+    int :
+        How many reads listed in the input bam
+    """
+
+    n_reads = sp.run(
+        ["samtools", "view", "-c", in_file],
+        stdout=sp.PIPE,
+        stderr=sp.PIPE,
+        encoding = 'utf-8'
+    ).stdout.removesuffix("\n")
+
+    return int(n_reads)
diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
index b3d7a1c..569731f 100644
--- a/hicstuff/pipeline.py
+++ b/hicstuff/pipeline.py
@@ -880,6 +880,15 @@ def _out_file(fname):
     # Perform genome alignment
     if start_stage == 0:
         
+        # Check number of reads in both fastqs
+        logger.info("Checking content of fastq files.")
+        nreads_input1 = hio.check_fastq_entries(reads1)
+        nreads_input2 = hio.check_fastq_entries(reads2)
+        if (nreads_input1 != nreads_input2):
+            logger.error("Fastq files do not have the same number of reads.")
+        else:
+            logger.info("{n} reads found in each fastq file.".format(n = nreads_input1))
+        
         # Define mapping choice (default normal):
         if mapping == "normal":
             iterative = False
@@ -942,6 +951,16 @@ def _out_file(fname):
     # Starting from bam files
     if start_stage <= 1:
 
+        # Check number of reads in both fastqs
+        if (bam1 == input1):
+            logger.info("Checking content of bam files.")
+            nreads_input1 = hio.check_bam_entries(bam1)
+            nreads_input2 = hio.check_bam_entries(bam2)
+            if (nreads_input1 != nreads_input2):
+                logger.error("Bam files do not have the same number of reads.")
+            else:
+                logger.info("{n} reads found in each bam file.".format(n = nreads_input1))
+
         fragments_updated = True
         # Generate info_contigs and fragments_list output files
         hcd.write_frag_info(

From 5963b24948464e66d9795b654b5fde408d8f09b9 Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Tue, 29 Aug 2023 13:13:00 +0200
Subject: [PATCH 07/11] feat: add --exclude to pipeline to exclude chromosoms
 from generated cool matrix

---
 hicstuff/commands.py | 17 +++++++++++++----
 hicstuff/pipeline.py | 25 ++++++++++++++-----------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/hicstuff/commands.py b/hicstuff/commands.py
index 01a5029..fbdbfd0 100644
--- a/hicstuff/commands.py
+++ b/hicstuff/commands.py
@@ -741,10 +741,11 @@ class Pipeline(AbstractCommand):
 
     usage:
         pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
-                 [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
-                 [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
-                 [--binning=INT] [--zoomify=BOOL] [--balancing_args=STR] [--quality-min=30] 
-                 [--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq] 
+                 [--duplicates] [--enzyme=5000] [--exclude=STR] [--filter] [--force] 
+                 [--mapping=normal] [--matfmt=graal] [--no-cleanup] [--outdir=DIR] 
+                 [--plot] [--prefix=PREFIX] [--binning=INT] [--zoomify=BOOL] 
+                 [--balancing_args=STR] [--quality-min=30] [--read-len=INT] 
+                 [--remove-centromeres=0] [--size=0] [--start-stage=fastq] 
                  [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]
 
     arguments:
@@ -767,6 +768,10 @@ class Pipeline(AbstractCommand):
                                       option.
         -C, --circular                Enable if the genome is circular.
                                       Discordant with the centromeres option.
+        -E, --exclude=STR             Exclude specific chromosomes from the 
+                                      generated matrix. Multiple chromosomes 
+                                      can be listed separated by commas (e.g. 
+                                      `--exclude "chrM,2u"`) [default: None].
         -d, --distance-law            If enabled, generates a distance law file
                                       with the values of the probabilities to
                                       have a contact between two distances for
@@ -875,6 +880,9 @@ def execute(self):
         if not self.args["--balancing_args"]:
             self.args["--balancing_args"] = None
 
+        if not self.args["--exclude"]:
+            self.args["--exclude"] = None
+
         if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
             logger.error("matfmt must be either bg2, cool or graal.")
             raise ValueError
@@ -890,6 +898,7 @@ def execute(self):
             aligner=self.args["--aligner"],
             centromeres=self.args["--centromeres"],
             circular=self.args["--circular"],
+            exclude=self.args["--exclude"],
             distance_law=self.args["--distance-law"],
             enzyme=self.args["--enzyme"],
             filter_events=self.args["--filter"],
diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py
index 569731f..82dd0c3 100644
--- a/hicstuff/pipeline.py
+++ b/hicstuff/pipeline.py
@@ -314,7 +314,7 @@ def filter_pcr_dup(pairs_idx_file, filtered_file):
         )
 
 
-def pairs2cool(pairs_file, cool_file, bins_file):
+def pairs2cool(pairs_file, cool_file, bins_file, exclude):
     """
     Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
     
@@ -329,18 +329,20 @@ def pairs2cool(pairs_file, cool_file, bins_file):
         Path to the file containing genomic segmentation information. (fragments_list.txt).
     """
 
-    # Make bins file compatible with cooler cload
+    # Exclude some chromosomes from bins
     bins_tmp = bins_file + ".cooler"
     bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
+    bins = bins[~bins[1].isin(exclude.split(','))]
     bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
 
+    # Make cool 
     cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
     cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
     sp.call(cooler_cmd.format(**cool_args), shell=True)
     os.remove(bins_tmp)
 
 
-def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
+def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs, exclude):
     """
     Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
     
@@ -358,13 +360,11 @@ def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs):
         The path to the contigs info file
     """
 
-    # Get chrom sizes
-    contigs = pd.read_csv(info_contigs, sep="\t")
-    chroms = dict(zip(contigs.contig, contigs.length))
-
-    # Save chrom sizes as separate file
+    # Exclude some chromosomes from bins
     chroms_tmp = info_contigs + ".chroms"
-    pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None)
+    chroms = pd.read_csv(info_contigs, sep="\t", usecols=[0, 1], skiprows=1, header=None)
+    chroms = chroms[~chroms[0].isin(exclude.split(','))]
+    chroms.to_csv(chroms_tmp, index=False, sep='\t', header=False)
 
     # Run `cooler cload pairs`
     cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ")
@@ -546,6 +546,7 @@ def full_pipeline(
     input2=None,
     aligner="bowtie2",
     centromeres=None,
+    exclude=None,
     circular=False,
     distance_law=False,
     enzyme=5000,
@@ -646,6 +647,8 @@ def full_pipeline(
     centromeres : None or str
         If not None, path of file with Positions of the centromeres separated by a
         space and in the same order than the chromosomes.
+    exclude : None or str
+        If not None, the name of the chromosomes to remove (e.g. "2u,chrM")
     read_len : int
         Maximum read length to expect in the fastq file. Optionally used in iterative
         alignment mode. Estimated from the first read by default. Useful if input fastq
@@ -1068,12 +1071,12 @@ def _out_file(fname):
         # If binning is **not** set, parse the pairs into a **un-binned** cool
         if (binning == 0):
             ## THIS NEEDS TO BE FIXED AT SOME POINT
-            pairs2cool(use_pairs, mat, fragments_list)
+            pairs2cool(use_pairs, mat, fragments_list, exclude)
 
         # If binning is set, proceed to bin the pairs instead
         else:
             cool_file = os.path.splitext(mat)[0] + ".cool"
-            pairs2binnedcool(use_pairs, cool_file, binning, info_contigs)
+            pairs2binnedcool(use_pairs, cool_file, binning, info_contigs, exclude)
             mat = cool_file
 
             # If zoomify == True, zoomify binned cool

From 70771a5dceff604817c9b5e51ff43c75e85afb43 Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Tue, 29 Aug 2023 14:53:58 +0200
Subject: [PATCH 08/11] fix: cutsite doc

---
 hicstuff/cutsite.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hicstuff/cutsite.py b/hicstuff/cutsite.py
index bae6175..bbd2e75 100644
--- a/hicstuff/cutsite.py
+++ b/hicstuff/cutsite.py
@@ -185,15 +185,15 @@ def cutsite_read(ligation_sites, seq, qual, seed_size=0):
     Returns:
     --------
     list of str
-        List of string of the sequences. The split is made at the start of the
-        ligation sites.
+        List of cut sequences. The split is made 4 bases after the start of 
+        the ligation site.
     list of str
         List of string of the qualities.
 
     Examples:
     ---------
     >>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF")
-    (['AA', 'GAGTATTC'], ['FF', 'F--FAFAF'])
+    (['AAGAGT', 'ATTC'], ['FFF--F', 'AFAF'])
     """
 
     # Find the ligation sites.

From 786aed086fa968eabc230023846e584866c132bb Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Tue, 29 Aug 2023 15:20:18 +0200
Subject: [PATCH 09/11] GHA: revamp with micromamba

---
 .github/workflows/build.yml | 32 ++++++++++++++++++++------------
 environment.yml             | 24 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 12 deletions(-)
 create mode 100644 environment.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4baa675..d668780 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -9,23 +9,31 @@ jobs:
       max-parallel: 5
 
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.9
-      uses: actions/setup-python@v2
+
+    - name: Checkout repo
+      uses: actions/checkout@v2
+    
+    - name: Create micromamba env. for package
+      uses: mamba-org/setup-micromamba@v1
       with:
-        python-version: 3.9
-    - name: Add conda to system path
+        generate-run-shell: true
+        environment-file: environment.yml
+
+    - name: Install package
       run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-    - name: Install dependencies
+        pip install .
+      shell: micromamba-shell {0}
+
+    - name: Check installed package
       run: |
-        conda config --add channels bioconda
-        conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy
-        pip install -Ur requirements.txt
-        pip install pytest-pylint
+        hicstuff --version
+      shell: micromamba-shell {0}
+
     - name: Lint and test
       run: |
+        pip install pytest-pylint pytest pytest-cov pylint codecov mappy
         pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff
         pytest --cov=hicstuff
         codecov
+      shell: micromamba-shell {0}
+
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..9407551
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,24 @@
+name: env-name
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python >= 3.7
+  - pip
+  - bowtie2 
+  - bwa 
+  - minimap2
+  - samtools
+  - numpy
+  - scipy
+  - pandas >= 1.5.0
+  - matplotlib >= 3.4.0
+  - docopt
+  - biopython
+  - requests
+  - scikit-learn
+  - pysam
+  - htslib
+  - pyfastx
+  - cooler
\ No newline at end of file

From dd4dbad3852e973c82bd8178ea40bb86843b285b Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Tue, 29 Aug 2023 15:38:23 +0200
Subject: [PATCH 10/11] tests

---
 tests/test_pipeline.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 3637180..28970b8 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -153,6 +153,22 @@ def test_full_pipeline_frags_with_binning_balancing():
         binning = 1000,
         balancing_args="--min-nnz 1 --mad-max 1",
     )
+    hpi.full_pipeline(
+        input1="test_data/sample.reads_for.fastq.gz",
+        input2="test_data/sample.reads_rev.fastq.gz",
+        genome="test_data/genome/seq",
+        enzyme="DpnII,HpaII",
+        mapping="normal",
+        out_dir="test_out",
+        plot=True,
+        pcr_duplicates=True,
+        filter_events=True,
+        no_cleanup=False,
+        force=True,
+        binning = 1000,
+        exclude = 'seq2',
+        balancing_args="--min-nnz 1 --mad-max 1",
+    )
 
 
 def test_full_pipeline_frags_gzipped_genome():

From 2d328754862a766d3da0136d17cc03c6d8f0bf5a Mon Sep 17 00:00:00 2001
From: js2264 <jacquesserizay@gmail.com>
Date: Fri, 1 Sep 2023 00:00:05 +0200
Subject: [PATCH 11/11] bump to v3.2.0

---
 Dockerfile          | 2 +-
 doc/conf.py         | 4 ++--
 hicstuff/version.py | 2 +-
 setup.py            | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 91c26c6..b156163 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM continuumio/miniconda3:4.9.2
 
-LABEL Name=hicstuff Version=3.1.7
+LABEL Name=hicstuff Version=3.2.0
 
 COPY * ./ /app/
 WORKDIR /app
diff --git a/doc/conf.py b/doc/conf.py
index dad2f71..4a7539a 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -24,9 +24,9 @@
 author = "Cyril Matthey-Doret"
 
 # The short X.Y version
-version = "3.1"
+version = "3.2"
 # The full version, including alpha/beta/rc tags
-release = "3.1.7"
+release = "3.2.0"
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/hicstuff/version.py b/hicstuff/version.py
index 47ed839..573cf70 100644
--- a/hicstuff/version.py
+++ b/hicstuff/version.py
@@ -1 +1 @@
-__version__ = '3.1.7'
+__version__ = '3.2.0'
diff --git a/setup.py b/setup.py
index 783a5a2..99d7c8b 100644
--- a/setup.py
+++ b/setup.py
@@ -29,8 +29,8 @@
 name = "hicstuff"
 
 MAJOR = 3
-MINOR = 1
-MAINTENANCE = 7
+MINOR = 2
+MAINTENANCE = 0
 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE)
 
 LICENSE = "BSD-3-Clause"