diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4baa675..d668780 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,23 +9,31 @@ jobs: max-parallel: 5 steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 + + - name: Checkout repo + uses: actions/checkout@v2 + + - name: Create micromamba env. for package + uses: mamba-org/setup-micromamba@v1 with: - python-version: 3.9 - - name: Add conda to system path + generate-run-shell: true + environment-file: environment.yml + + - name: Install package run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH - - name: Install dependencies + pip install . + shell: micromamba-shell {0} + + - name: Check installed package run: | - conda config --add channels bioconda - conda install -c conda-forge python=3.9 minimap2 bowtie2=2.4.5 bwa samtools htslib pysam pytest cooler pytest-cov pylint codecov mappy - pip install -Ur requirements.txt - pip install pytest-pylint + hicstuff --version + shell: micromamba-shell {0} + - name: Lint and test run: | + pip install pytest-pylint pytest pytest-cov pylint codecov mappy pytest --pylint --pylint-error-types=EF --pylint-rcfile=.pylintrc --doctest-modules --doctest-modules hicstuff pytest --cov=hicstuff codecov + shell: micromamba-shell {0} + diff --git a/Dockerfile b/Dockerfile index 91c26c6..b156163 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda3:4.9.2 -LABEL Name=hicstuff Version=3.1.7 +LABEL Name=hicstuff Version=3.2.0 COPY * ./ /app/ WORKDIR /app diff --git a/doc/conf.py b/doc/conf.py index dad2f71..4a7539a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -24,9 +24,9 @@ author = "Cyril Matthey-Doret" # The short X.Y version -version = "3.1" +version = "3.2" # The full version, including alpha/beta/rc tags -release = "3.1.7" +release = "3.2.0" # -- General configuration --------------------------------------------------- diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..9407551 --- /dev/null +++ b/environment.yml @@ -0,0 +1,24 @@ +name: env-name +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python >= 3.7 + - pip + - bowtie2 + - bwa + - minimap2 + - samtools + - numpy + - scipy + - pandas >= 1.5.0 + - matplotlib >= 3.4.0 + - docopt + - biopython + - requests + - scikit-learn + - pysam + - htslib + - pyfastx + - cooler \ No newline at end of file diff --git a/hicstuff/commands.py b/hicstuff/commands.py index 4e8866c..fbdbfd0 100644 --- a/hicstuff/commands.py +++ b/hicstuff/commands.py @@ -741,10 +741,11 @@ class Pipeline(AbstractCommand): usage: pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law] - [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal] - [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX] - [--binning=INT] [--zoomify] [--balancing_args=STR] [--quality-min=30] - [--read-len=INT] [--remove-centromeres=0] [--size=0] [--start-stage=fastq] + [--duplicates] [--enzyme=5000] [--exclude=STR] [--filter] [--force] + [--mapping=normal] [--matfmt=graal] [--no-cleanup] [--outdir=DIR] + [--plot] [--prefix=PREFIX] [--binning=INT] [--zoomify=BOOL] + [--balancing_args=STR] [--quality-min=30] [--read-len=INT] + [--remove-centromeres=0] [--size=0] [--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE [] arguments: @@ -767,6 +768,10 @@ class Pipeline(AbstractCommand): option. -C, --circular Enable if the genome is circular. Discordant with the centromeres option. + -E, --exclude=STR Exclude specific chromosomes from the + generated matrix. Multiple chromosomes + can be listed separated by commas (e.g. + `--exclude "chrM,2u"`) [default: None]. -d, --distance-law If enabled, generates a distance law file with the values of the probabilities to have a contact between two distances for @@ -802,7 +807,7 @@ class Pipeline(AbstractCommand): Can be "bg2" for 2D Bedgraph format, "cool" for Mirnylab's cooler software, or "graal" for graal-compatible plain text - COO format. [default: graal] + COO format. [default: cool] -n, --no-cleanup If enabled, intermediary BED files will be kept after generating the contact map. Disabled by defaut. @@ -812,9 +817,10 @@ class Pipeline(AbstractCommand): at different steps of the pipeline. -P, --prefix=STR Overrides default filenames and prefixes all output files with a custom name. - -b,--binning=INT Bin the resulting matrix to a given resolution + -b,--binning=INT Bin the contact matrix to a given resolution. + By default, the contact matrix is not binned. (only used if `--matfmt cool") - -z, --zoomify Zoomify binned cool matrix + -z, --zoomify=BOOL Zoomify binned cool matrix [default: True] (only used if mat_fmt == "cool" and binning is set) -B, --balancing_args=STR Arguments to pass to `cooler balance` (default: "") (only used if zoomify == True) @@ -868,8 +874,14 @@ def execute(self): if not self.args["--binning"]: self.args["--binning"] = "0" + if not self.args["--zoomify"]: + self.args["--zoomify"] = "True" + if not self.args["--balancing_args"]: - self.args["--balancing_args"] = "" + self.args["--balancing_args"] = None + + if not self.args["--exclude"]: + self.args["--exclude"] = None if self.args["--matfmt"] not in ("graal", "bg2", "cool"): logger.error("matfmt must be either bg2, cool or graal.") @@ -878,7 +890,7 @@ def execute(self): read_len = self.args["--read-len"] if read_len is not None: read_len = int(read_len) - + hpi.full_pipeline( genome=self.args["--genome"], input1=self.args[""], @@ -886,6 +898,7 @@ def execute(self): aligner=self.args["--aligner"], centromeres=self.args["--centromeres"], circular=self.args["--circular"], + exclude=self.args["--exclude"], distance_law=self.args["--distance-law"], enzyme=self.args["--enzyme"], filter_events=self.args["--filter"], @@ -893,7 +906,7 @@ def execute(self): mapping=self.args["--mapping"], mat_fmt=self.args["--matfmt"], binning=int(self.args["--binning"]), - zoomify=self.args["--zoomify"], + zoomify=eval(self.args["--zoomify"]), balancing_args=self.args["--balancing_args"], min_qual=int(self.args["--quality-min"]), min_size=int(self.args["--size"]), diff --git a/hicstuff/cutsite.py b/hicstuff/cutsite.py index bae6175..bbd2e75 100644 --- a/hicstuff/cutsite.py +++ b/hicstuff/cutsite.py @@ -185,15 +185,15 @@ def cutsite_read(ligation_sites, seq, qual, seed_size=0): Returns: -------- list of str - List of string of the sequences. The split is made at the start of the - ligation sites. + List of cut sequences. The split is made 4 bases after the start of + the ligation site. list of str List of string of the qualities. Examples: --------- >>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF") - (['AA', 'GAGTATTC'], ['FF', 'F--FAFAF']) + (['AAGAGT', 'ATTC'], ['FFF--F', 'AFAF']) """ # Find the ligation sites. diff --git a/hicstuff/distance_law.py b/hicstuff/distance_law.py index 13cf2cd..25ee949 100644 --- a/hicstuff/distance_law.py +++ b/hicstuff/distance_law.py @@ -559,7 +559,7 @@ def normalize_distance_law(xs, ps, inf=3000, sup=None): List of ps each normalized separately. """ # Sanity check: xs and ps have the same dimension - if np.shape(xs) != np.shape(ps): + if np.shape(np.asarray(xs, dtype="object")) != np.shape(np.asarray(ps, dtype="object")): logger.error("xs and ps should have the same dimension.") sys.exit(1) # Define the length of shortest chromosomes as a lower bound for the sup boundary diff --git a/hicstuff/io.py b/hicstuff/io.py index e606422..d06799c 100644 --- a/hicstuff/io.py +++ b/hicstuff/io.py @@ -6,6 +6,7 @@ import bz2 import io import os +import pysam import functools import sys import numpy as np @@ -1431,3 +1432,65 @@ def check_is_fasta(in_file): fasta = False return fasta + +def check_fastq_entries(in_file): + """ + Check how many reads are in the input fastq. Requires zcat. + + Parameters + ---------- + in_file : str + Path to the input file. + + Returns + ------- + int : + How many reads listed in the input fastq + """ + + with open(in_file, 'rb') as f: + is_gzip = f.read(2) == b'\x1f\x8b' + + if is_gzip: + n_lines = sp.run( + "zcat < {f} | wc -l".format(f = in_file), + stdout=sp.PIPE, + stderr=sp.PIPE, + shell = True, + encoding = 'utf-8' + ).stdout.removesuffix("\n") + else: + n_lines = sp.run( + "wc -l {f}".format(f = in_file), + stdout=sp.PIPE, + stderr=sp.PIPE, + shell = True, + encoding = 'utf-8' + ).stdout.removesuffix("\n") + + n_reads = int(n_lines)/4 + return n_reads + +def check_bam_entries(in_file): + """ + Check how many reads are in the input bam + + Parameters + ---------- + in_file : str + Path to the input file. + + Returns + ------- + int : + How many reads listed in the input bam + """ + + n_reads = sp.run( + ["samtools", "view", "-c", in_file], + stdout=sp.PIPE, + stderr=sp.PIPE, + encoding = 'utf-8' + ).stdout.removesuffix("\n") + + return int(n_reads) diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 0b014f2..82dd0c3 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -314,7 +314,7 @@ def filter_pcr_dup(pairs_idx_file, filtered_file): ) -def pairs2cool(pairs_file, cool_file, bins_file): +def pairs2cool(pairs_file, cool_file, bins_file, exclude): """ Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations. @@ -329,18 +329,20 @@ def pairs2cool(pairs_file, cool_file, bins_file): Path to the file containing genomic segmentation information. (fragments_list.txt). """ - # Make bins file compatible with cooler cload + # Exclude some chromosomes from bins bins_tmp = bins_file + ".cooler" bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None) + bins = bins[~bins[1].isin(exclude.split(','))] bins.to_csv(bins_tmp, sep="\t", header=False, index=False) + # Make cool cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}" cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file} sp.call(cooler_cmd.format(**cool_args), shell=True) os.remove(bins_tmp) -def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs): +def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs, exclude): """ Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations. @@ -358,13 +360,11 @@ def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs): The path to the contigs info file """ - # Get chrom sizes - contigs = pd.read_csv(info_contigs, sep="\t") - chroms = dict(zip(contigs.contig, contigs.length)) - - # Save chrom sizes as separate file + # Exclude some chromosomes from bins chroms_tmp = info_contigs + ".chroms" - pd.Series(chroms).to_csv(chroms_tmp, index=True, sep='\t', header=None) + chroms = pd.read_csv(info_contigs, sep="\t", usecols=[0, 1], skiprows=1, header=None) + chroms = chroms[~chroms[0].isin(exclude.split(','))] + chroms.to_csv(chroms_tmp, index=False, sep='\t', header=False) # Run `cooler cload pairs` cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ") @@ -372,7 +372,7 @@ def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs): os.remove(chroms_tmp) -def cool2mcool(cool_file, output_file, balance_args): +def cool2mcool(cool_file, output_file): """ Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations. @@ -383,8 +383,6 @@ def cool2mcool(cool_file, output_file, balance_args): Path to an existing binned .cool file. output_file : str Path to the new multi-resolution .mcool file. - balance_args : str - Arguments passed to `cooler balance` (default: "") """ # Get resolutions @@ -395,10 +393,34 @@ def cool2mcool(cool_file, output_file, balance_args): # Run cooler zoomify cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000) +def balance(cool_file, balancing_args): + """ + Balance a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations. + + Parameters + ---------- + + cool_file : str + Path to an existing binned .(m)cool file. + balancing_args : str + Extra rguments passed to `cooler balance` (default: None) + """ + # Balance - for res in multires: + if (cooler.fileops.is_multires_file(cool_file)): + paths = cooler.fileops.list_coolers(cool_file) + for path in paths: + cooler_cmd = "cooler balance".split(" ") + if balancing_args is not None: + sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file+"::"+path], shell=False) + else: + sp.call(cooler_cmd + [cool_file+"::"+path], shell=False) + else: cooler_cmd = "cooler balance".split(" ") - sp.call(cooler_cmd + balance_args.split(" ") + [output_file+"::/resolutions/"+str(res)], shell=False) + if balancing_args is not None: + sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file], shell=False) + else: + sp.call(cooler_cmd + [cool_file], shell=False) def pairs2matrix( @@ -524,6 +546,7 @@ def full_pipeline( input2=None, aligner="bowtie2", centromeres=None, + exclude=None, circular=False, distance_law=False, enzyme=5000, @@ -533,7 +556,7 @@ def full_pipeline( mat_fmt="cool", binning=0, zoomify=True, - balancing_args="", + balancing_args=None, min_qual=30, min_size=0, no_cleanup=False, @@ -624,6 +647,8 @@ def full_pipeline( centromeres : None or str If not None, path of file with Positions of the centromeres separated by a space and in the same order than the chromosomes. + exclude : None or str + If not None, the name of the chromosomes to remove (e.g. "2u,chrM") read_len : int Maximum read length to expect in the fastq file. Optionally used in iterative alignment mode. Estimated from the first read by default. Useful if input fastq @@ -746,6 +771,17 @@ def _out_file(fname): hcl.set_file_handler(log_file) generate_log_header(log_file, input1, input2, genome, enzyme) + # If defautl `mat_fmt` or set `mat_fmt=cool`, notify the user + if mat_fmt == 'cool': + try: + import cooler + logger.info("The default output format is now `.cool`. The Hi-C " + "matrix will be generated with cooler v%s " + "(Abdennur & Mirny, Bioinformatics 2020).", + cooler.__version__ + ) + except: raise + # If the user chose bowtie2 and supplied an index, extract fasta from it # For later steps of the pipeline (digestion / frag attribution) # Check if the genome is an index or fasta file @@ -847,6 +883,15 @@ def _out_file(fname): # Perform genome alignment if start_stage == 0: + # Check number of reads in both fastqs + logger.info("Checking content of fastq files.") + nreads_input1 = hio.check_fastq_entries(reads1) + nreads_input2 = hio.check_fastq_entries(reads2) + if (nreads_input1 != nreads_input2): + logger.error("Fastq files do not have the same number of reads.") + else: + logger.info("{n} reads found in each fastq file.".format(n = nreads_input1)) + # Define mapping choice (default normal): if mapping == "normal": iterative = False @@ -909,6 +954,16 @@ def _out_file(fname): # Starting from bam files if start_stage <= 1: + # Check number of reads in both fastqs + if (bam1 == input1): + logger.info("Checking content of bam files.") + nreads_input1 = hio.check_bam_entries(bam1) + nreads_input2 = hio.check_bam_entries(bam2) + if (nreads_input1 != nreads_input2): + logger.error("Bam files do not have the same number of reads.") + else: + logger.info("{n} reads found in each bam file.".format(n = nreads_input1)) + fragments_updated = True # Generate info_contigs and fragments_list output files hcd.write_frag_info( @@ -1009,17 +1064,29 @@ def _out_file(fname): # Build matrix from pairs. if mat_fmt == "cool": + # Name matrix file in .cool - cool_file = os.path.splitext(mat)[0] + ".cool" - pairs2cool(use_pairs, cool_file, fragments_list) + mat = os.path.splitext(mat)[0] + ".cool" - if (binning > 0): - binned_cool_file = os.path.splitext(mat)[0] + "_" + str(binning) + ".cool" - pairs2binnedcool(use_pairs, binned_cool_file, binning, info_contigs) - if (zoomify is True): + # If binning is **not** set, parse the pairs into a **un-binned** cool + if (binning == 0): + ## THIS NEEDS TO BE FIXED AT SOME POINT + pairs2cool(use_pairs, mat, fragments_list, exclude) + + # If binning is set, proceed to bin the pairs instead + else: + cool_file = os.path.splitext(mat)[0] + ".cool" + pairs2binnedcool(use_pairs, cool_file, binning, info_contigs, exclude) + mat = cool_file + + # If zoomify == True, zoomify binned cool + if zoomify: mcool_file = os.path.splitext(mat)[0] + ".mcool" - cool2mcool(binned_cool_file, mcool_file, balancing_args) - + cool2mcool(mat, mcool_file) + mat = mcool_file + + # Balance binned matrix + balance(mat, balancing_args) else: pairs2matrix( use_pairs, @@ -1030,15 +1097,20 @@ def _out_file(fname): tmp_dir=tmp_dir, ) + # Move final pairs file to main dir. + p = pathlib.Path(use_pairs).absolute() + parent_dir = p.parents[1] + p.rename(parent_dir / p.name) + # Clean temporary files if not no_cleanup: tempfiles = [ pairs, pairs_idx, pairs_filtered, + pairs_pcr, bam1, bam2, - # pairs_pcr, tmp_genome, ] # Do not delete files that were given as input @@ -1047,6 +1119,15 @@ def _out_file(fname): tempfiles.remove(input2) except ValueError: pass + + # Delete single-resolution matrix if `--zoomify` is set + if binning > 0 and zoomify: + try: + tempfiles.append(cool_file) + except ValueError: + pass + + # Remove the rest of tempfiles for file in tempfiles: try: os.remove(file) diff --git a/hicstuff/version.py b/hicstuff/version.py index 47ed839..573cf70 100644 --- a/hicstuff/version.py +++ b/hicstuff/version.py @@ -1 +1 @@ -__version__ = '3.1.7' +__version__ = '3.2.0' diff --git a/setup.py b/setup.py index 783a5a2..99d7c8b 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,8 @@ name = "hicstuff" MAJOR = 3 -MINOR = 1 -MAINTENANCE = 7 +MINOR = 2 +MAINTENANCE = 0 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE) LICENSE = "BSD-3-Clause" diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 396579f..28970b8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -72,26 +72,103 @@ def test_full_pipeline_frags(): force=True, ) - -def test_full_pipeline_frags_with_balancing(): - mapping_to_enzyme = { - 'normal': "DpnII", - 'cutsite': "DpnII,HpaII" - } - for mapping, enzyme in mapping_to_enzyme.items(): - hpi.full_pipeline( - input1="test_data/sample.reads_for.fastq.gz", - input2="test_data/sample.reads_rev.fastq.gz", - genome="test_data/genome/seq", - enzyme=enzyme, - mapping=mapping, - out_dir="test_out", - plot=True, - pcr_duplicates=True, - filter_events=True, - no_cleanup=True, - force=True, - ) +# Testing: +# no binning (no zoomify, no balancing) +# binning: no zoomify (balancing) +# binning: no zoomify, tweaked balancing +# binning: zoomify, (balancing) +# binning: zoomify, tweaked balancing + +def test_full_pipeline_frags_with_binning_balancing(): + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + ) + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + binning = 1000, + zoomify = False, + ) + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + binning = 1000, + zoomify = False, + balancing_args="--min-nnz 1 --mad-max 1", + ) + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + binning = 1000, + ) + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=True, + force=True, + binning = 1000, + balancing_args="--min-nnz 1 --mad-max 1", + ) + hpi.full_pipeline( + input1="test_data/sample.reads_for.fastq.gz", + input2="test_data/sample.reads_rev.fastq.gz", + genome="test_data/genome/seq", + enzyme="DpnII,HpaII", + mapping="normal", + out_dir="test_out", + plot=True, + pcr_duplicates=True, + filter_events=True, + no_cleanup=False, + force=True, + binning = 1000, + exclude = 'seq2', + balancing_args="--min-nnz 1 --mad-max 1", + ) def test_full_pipeline_frags_gzipped_genome():