diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 09a43f7..4baa675 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,9 +21,9 @@ jobs: - name: Install dependencies run: | conda config --add channels bioconda - conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest pytest-cov pylint codecov mappy + conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy pip install -Ur requirements.txt - pip install cooler pytest-pylint + pip install pytest-pylint - name: Lint and test run: | pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff diff --git a/hicstuff/commands.py b/hicstuff/commands.py index 4d4cba7..4e8866c 100644 --- a/hicstuff/commands.py +++ b/hicstuff/commands.py @@ -743,8 +743,9 @@ class Pipeline(AbstractCommand): pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law] [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal] [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX] - [--quality-min=30] [--read-len=INT] [--remove-centromeres=0] [--size=0] - [--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE [] + [--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30] + [--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq] + [--threads=1] [--tmpdir=DIR] --genome=FILE [] arguments: input1: Forward fastq file, if start_stage is "fastq", sam @@ -811,6 +812,12 @@ class Pipeline(AbstractCommand): at different steps of the pipeline. -P, --prefix=STR Overrides default filenames and prefixes all output files with a custom name. + -b,--binning=INT Bin the resulting matrix to a given resolution + (only used if `--matfmt cool") + -z, --zoomify Zoomify binned cool matrix + (only used if mat_fmt == "cool" and binning is set) + -B, --balancing_args=STR Arguments to pass to `cooler balance` + (default: "") (only used if zoomify == True) -q, --quality-min=INT Minimum mapping quality for selecting contacts. [default: 30]. -r, --remove-centromeres=INT Integer. Number of kb that will be remove around @@ -858,6 +865,12 @@ def execute(self): if not self.args["--outdir"]: self.args["--outdir"] = os.getcwd() + if not self.args["--binning"]: + self.args["--binning"] = "0" + + if not self.args["--balancing_args"]: + self.args["--balancing_args"] = "" + if self.args["--matfmt"] not in ("graal", "bg2", "cool"): logger.error("matfmt must be either bg2, cool or graal.") raise ValueError @@ -879,6 +892,9 @@ def execute(self): force=self.args["--force"], mapping=self.args["--mapping"], mat_fmt=self.args["--matfmt"], + binning=int(self.args["--binning"]), + zoomify=self.args["--zoomify"], + balancing_args=self.args["--balancing_args"], min_qual=int(self.args["--quality-min"]), min_size=int(self.args["--size"]), no_cleanup=self.args["--no-cleanup"], diff --git a/hicstuff/digest.py b/hicstuff/digest.py index daea36e..c98ad3f 100755 --- a/hicstuff/digest.py +++ b/hicstuff/digest.py @@ -112,7 +112,7 @@ def write_frag_info( frag_length = len(frag) if frag_length > 0: end_pos = start_pos + frag_length - gc_content = SeqUtils.GC(frag) / 100.0 + gc_content = SeqUtils.gc_fraction(frag) / 100.0 current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( current_id, diff --git a/hicstuff/io.py b/hicstuff/io.py index 976a070..e606422 100644 --- a/hicstuff/io.py +++ b/hicstuff/io.py @@ -1166,7 +1166,7 @@ def gc_bins(genome_path, frags): seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)] # Fill GC values for bins in the chromosome idx = np.flatnonzero(mask) - gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0 + gc_bins[idx] = np.array(list(map(SeqUtils.gc_fraction, seqs))) / 100.0 return gc_bins diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 77d2c3c..0b014f2 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -13,6 +13,7 @@ import subprocess as sp import gzip from Bio import SeqIO +import cooler import pandas as pd import numpy as np import pysam as ps @@ -339,6 +340,67 @@ def pairs2cool(pairs_file, cool_file, bins_file): os.remove(bins_tmp) +def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs): + """ + Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations. + + Parameters + ---------- + + pairs_file : str + Path to the pairs file containing input contact data. + cool_file : str + Path to the output cool file name to generate. + binning : int + If mat_fmt is set to "cool", the cool file will be further binned to + this resolution. + info_contigs : pathlib.Path or str + The path to the contigs info file + """ + + # Get chrom sizes + contigs = pd.read_csv(info_contigs, sep="\t") + chroms = dict(zip(contigs.contig, contigs.length)) + + # Save chrom sizes as separate file + chroms_tmp = info_contigs + ".chroms" + pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None) + + # Run `cooler cload pairs` + cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ") + sp.call(cooler_cmd + [chroms_tmp+":"+str(binning), pairs_file, cool_file], shell=False) + os.remove(chroms_tmp) + + +def cool2mcool(cool_file, output_file, balance_args): + """ + Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations. + + Parameters + ---------- + + cool_file : str + Path to an existing binned .cool file. + output_file : str + Path to the new multi-resolution .mcool file. + balance_args : str + Arguments passed to `cooler balance` (default: "") + """ + + # Get resolutions + clr = cooler.Cooler(cool_file) + res = clr.binsize + multires = [res, res*2, res*5, res*10, res*20, res*50, res*100] + + # Run cooler zoomify + cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000) + + # Balance + for res in multires: + cooler_cmd = "cooler balance".split(" ") + sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False) + + def pairs2matrix( pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None ): @@ -468,7 +530,10 @@ def full_pipeline( filter_events=False, force=False, mapping="normal", - mat_fmt="graal", + mat_fmt="cool", + binning=0, + zoomify=True, + balancing_args="", min_qual=30, min_size=0, no_cleanup=False, @@ -497,6 +562,13 @@ def full_pipeline( input2 : str Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads in BAM format, or None, depending on the value of start_stage. + binning : int + If mat_fmt is set to "cool", the cool file will be further binned to + this resolution. + zoomify : bool + Whether to zoomify binned cool matrix (only used if mat_fmt == "cool" and binning is set) + balancing_args : str + Arguments to pass to `chromosight balance` (default: None) (only used if zoomify == True) enzyme : int or strtest_data/genome/seq.fa circular : bool Use if the genome is circular. @@ -940,6 +1012,14 @@ def _out_file(fname): # Name matrix file in .cool cool_file = os.path.splitext(mat)[0] + ".cool" pairs2cool(use_pairs, cool_file, fragments_list) + + if (binning > 0): + binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool" + pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs) + if (zoomify is True): + mcool_file = os.path.splitext(mat)[0] + ".mcool" + cool2mcool(binned_cool_file, mcool_file, balancing_args) + else: pairs2matrix( use_pairs, @@ -958,7 +1038,7 @@ def _out_file(fname): pairs_filtered, bam1, bam2, - pairs_pcr, + # pairs_pcr, tmp_genome, ] # Do not delete files that were given as input diff --git a/requirements.txt b/requirements.txt index 55a6139..dad2784 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ requests scikit-learn pysam pyfastx +cooler diff --git a/setup.py b/setup.py index 5d8c9d8..f9694c8 100644 --- a/setup.py +++ b/setup.py @@ -64,5 +64,5 @@ install_requires=REQUIREMENTS, long_description_content_type="text/markdown", entry_points={"console_scripts": ["hicstuff=hicstuff.main:main"]}, - extras_require={"mappy": ["mappy"], "cooler": ["cooler"]}, + extras_require={"mappy": ["mappy"]}, ) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index d83aebb..396579f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -73,6 +73,26 @@ def test_full_pipeline_frags(): ) +def test_full_pipeline_frags_with_balancing(): + mapping_to_enzyme = { + 'normal': "DpnII", + 'cutsite': "DpnII,HpaII" + } + for mapping, enzyme in mapping_to_enzyme.items(): + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme=enzyme, + mapping=mapping, + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + ) + def test_full_pipeline_frags_gzipped_genome(): mapping_to_enzyme = { @@ -86,6 +106,10 @@ def test_full_pipeline_frags_gzipped_genome(): genome="test_data/genome/seq.fa.gz", enzyme=enzyme, mapping=mapping, + mat_fmt="cool", + binning = 1000, + zoomify = True, + balancing_args = "--max-iters 10 --min-nnz 10", out_dir="test_out", plot=True, pcr_duplicates=True, @@ -122,7 +146,7 @@ def test_full_pipeline_bin(mapping, aligner): prefix="test", distance_law=True, start_stage=stage, - mat_fmt="cooler", + mat_fmt="cool", no_cleanup=True, force=True, )