v0.2.4

wangyibin · Jan 25, 2025 · 69221f4 · 69221f4
1 parent 5a63831
commit 69221f4
Show file tree

Hide file tree

Showing 40 changed files with 1,959 additions and 646 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,8 +1,24 @@
 # Release notes #
-Data: 2025-01-15
+
+## [v0.2.4]
+Data: 2025-01-22
+## Enhancement
+- `pairs2depth`, speed up by pipeline
+- `alleles`, add `--trim-length` to trim the both end of contigs to remove the 
+                effect of overlapping from hifiasm assembly graph.
+- `pipeline`
+    - `--preset precision`: Optimize parameters to improve accuracy at the expense of anchor rate
+    - `--preset sensitive`: Using in some complex genome,  which contain many fragmented contigs and low signals contigs
+
 ## Bug fixes
 - `activate_cphasing`, fixed bug that exit shell window when pixi failed to install
-- `pipeline`, fixed bug that program can not exit when error occurred 
+- `pipeline`
+    - fixed bug that program can not exit when error occurred 
+    - fixed bug that alleles parameters cannot affect in phasing mode
+    - fixed bug that reported in issue #14, which `--chimeric-correct` mode cannot use correct fasta in 3.hyperpartition
+- `plot`, fixed bug that program can not coarsen adjusted matrix to plot another binsize matrix
+
+## [v0.2.3]
 Data: 2025-01-14
 ## Enhancement
 - `pipeline`, report peak memory usage
@@ -13,8 +29,8 @@ Data: 2025-01-14
 ## Bug fixes
 - `pairs2cool`, fixed bug that "bin1_id > bin2_id" in some cases
 
-Data: 2025-01-10
 ## [v0.2.2]
+Data: 2025-01-10
 ## Enhancement
 - `pipeline`, When mode=phasing, pipeline integrating 1.alleles into 3.hyperpartition to speed up
 ## Bug fixes

diff --git a/README.md b/README.md
@@ -27,6 +27,9 @@ git clone https://github.com/wangyibin/CPhasing.git
 
 ## activate environment (For the first configuration, run it when the network is accessible.)
 source ./CPhasing/bin/activate_cphasing
+
+## deactivate environment
+exit
 ```
 
 ### Via Anaconda
@@ -43,22 +46,6 @@ export PYTHONPATH=/path/to/CPhasing:$PYTHONPATH
 ## The hic pipeline require GLIBCXX_3.4.29, or you can add this command to your environment (.bash_profile)
 export LD_LIBRARY_PATH=/path/to/anaconda3/envs/cphasing/lib:$LD_LIBRARY_PATH
 ```
-### Custom install the dependencies
-#### Install C-Phasing
-```bash
-## Download C-Phasing and install python dependencies
-git clone https://github.com/wangyibin/CPhasing.git
-cd CPhasing
-pip install .
-
-## Add following to the .bash_profile or .bashrc
-export PATH=/path/to/CPhasing/bin:$PATH
-```
-#### Dependencies
-1. For core function
-    - [pigz](https://github.com/madler/pigz)
-2. For Pore-C pipeline
-    - [minimap2](https://github.com/lh3/minimap2)(>= v2.24)
 
 
 ## One command pipeline of C-Phasing
@@ -73,8 +60,8 @@ cphasing pipeline -f draft.asm.fasta -pcd sample.fastq.gz -t 10 -n 8:4
 ```bash
 cphasing pipeline -f draft.asm.fasta -pcd sample1.fastq.gz -pcd sample2.fastq.gz -t 10 -n 8:4
 ```  
-    
-Note: If you want to run on cluster system and submit them to multiple nodes, you can use `cphasing mapper` and `cphasing-rs porec-merge` to generate the merged `porec.gz` file and input it by `-pct` parameter. 
+> [!NOTE]
+> If you want to run on cluster system and submit them to multiple nodes, you can use `cphasing mapper` and `cphasing-rs porec-merge` to generate the merged `porec.gz` file and input it by `-pct` parameter. 
 
 - Start from a **pore-c table (porec.gz)**, which is generated by `cphasing mapper`.
 ```bash
@@ -85,8 +72,10 @@ cphasing pipeline -f draft.asm.fasta -pct sample.porec.gz -t 10 -n 8:4
 ```bash
 cphasing pipeline -f draft.asm.fasta -hic1 Lib_R1.fastq.gz -hic2 Lib_R2.fastq.gz -t 10 -n 8:4
 ```
-Note1: If you want to run multiple samples, you can use `cphasing hic mapper` and `cphasing-rs pairs-merge` to generate the merged `pairs.gz` file, and input it by `-prs` parameter.  
-Note2: If the total length of your input genome is larger than 8 Gb, the `-hic-mapper-k 27 -hic-mapper-w 14` should be specified, to avoid the error of chromap. 
+> [!NOTE]
+> If you want to run multiple samples, you can use `cphasing hic mapper` and `cphasing-rs pairs-merge` to generate the merged `pairs.gz` file, and input it by `-prs` parameter.  
+> [!NOTE]
+> If the total length of your input genome is larger than 8 Gb, the `-hic-mapper-k 27 -hic-mapper-w 14` should be specified, to avoid the error of chromap. 
 
 
 - Start from a 4DN pairs file,
@@ -117,7 +106,8 @@ cphasing pairs2mnd sample.pairs.gz -o sample.mnd.txt
 cphasing utils agp2assembly groups.agp > groups.assembly
 bash ~/software/3d-dna/visualize/run-assembly-visualizer.sh sample.assembly sample.mnd.txt
 ```
-Note: if chimeric corrected, please use `groups.corrected.agp` and generate a new `corrected.pairs.gz` by `cphasing-rs pairs-break`
+> [!NOTE]
+> if chimeric corrected, please use `groups.corrected.agp` and generate a new `corrected.pairs.gz` by `cphasing-rs pairs-break`
 - After curation
 ```bash
 ## convert assembly to agp
@@ -130,120 +120,23 @@ cphasing agp2fasta groups.review.agp draft.asm.fasta --contigs > contigs.fasta
 cphasing agp2fasta groups.review.agp draft.asm.fasta > groups.review.asm.fasta
 ```
 
-
 ### Rename 
 Rename and orient chromosome according a monoploid reference (or genome of closely related species).
 ```bash
 cphasing rename -r mono.fasta -f draft.asm.fasta -a groups.review.agp -t 20
 ```
-Note: To reduce the time consumed, we only align the first haplotype (g1) to the monoploid, which the orientation among different haplotypes has already been set to the same in the `scaffolding` step. If not, you can set `—unphased` to align all haplotypes to the monoploid to adjust the orientation.  
+> [!NOTE]
+> To reduce the time consumed, we only align the first haplotype (g1) to the monoploid, which the orientation among different haplotypes has already been set to the same in the `scaffolding` step. If not, you can set `—unphased` to align all haplotypes to the monoploid to adjust the orientation.  
 
-----------------------------------------------
 
+----------------------------------------------
 
 ## Pipeline of Ultra-long data [Optional]
 C-Phasing enable to use ultra-long to correct chimeric and identify the high confidence regions (HCRs) to help assembly.  
 ### **[hitig tutorial](cphasing/hitig)**
 
 
-## Step by step pipeline of Pore-C data  
-0. **mapping**  
-    > Mapping pore-c data into contig-level assembly and output the pore-c table and 4DN pairs.
-    ```bash
-    ## results are `sample.porec.gz` and `sample.pairs.gz`  
-    cphasing mapper draft.asm.fasta sample.fastq.gz -t 10
-
-    ```  
-    > For Hi-C data please use `cphasing hic mapper`.
-
-    Note: If you are mapping multiple pore-c data, the multiple `pairs.gz` files should be merged by following steps:
-    ```bash
-    cphasing-rs pairs-merge sample1.pairs.gz sample2.pairs.gz -o sample.merge.pairs.gz
-    ```
-
-1. **alleles** (Optional for phasing mode)  
-    > Identify the inter-allelic contig pairs.
-    ```bash
-    ## result is `draft.asm.allele.table`
-    cphasing alleles -f draft.asm.fasta
-    ```
-    Note: the memory usage of this step is depend on the genome size of input fasta, when input the length larger 10 Gb, the peak memory will reach 130 Gb, or more. If you want to reduce the memory usage, you can increase the `k` and `w`, for example, when we benchmark a 20 Gb hexaploid, we set the `k` and `w` to 27 and 24, respectively.
-
-2. **prepare**  
-    > Prepare some data for subsequence analysis  
-    ```bash  
-    ## results are `sample.counts_AAGCTT.txt`, `sample.clm.gz`, `sample.split.contacts`, `sample.contacts`
-    cphasing prepare draft.asm.fasta sample.pairs.gz 
-    ```  
-
-3. **hyperpartition**  
-    ```bash  
-    ## result is `output.clusters.txt`
-    ## for haploid scaffolding 
-    cphasing hyperpartition sample.porec.gz draft.asm.contigsizes output.clusters.txt
-    ## for polyploid or diploid phasing must add prune information and use the incremental partition mode
-    ### chromsome number aware, 8:4 indicate that this is a tetraploid with 8 chromosome in each haplotype
-    cphasing hyperpartition sample.porec.gz draft.asm.contigsizes output.clusters.txt -pt prune.contig.table -inc -n 8:4 -t 4
-    ### auto generate groups
-    cphasing hyperpartition sample.porec.gz draft.asm.contigsizes output.clusters.txt -pt prune.contig.table -inc -t 4
-    ```  
-
-    Note: If you are not satisfied with the number of groups, you can adjust the resolution parameters (`-r1` or `-r2`) to make difference groups. `-r1` controls the first cluster while `-r2` controls the second cluster in phasing mode. Increasing the resolution can generate more groups while decreasing it will get fewer groups.  
-    If you want to group some contigs, you can input a contig list with the `-wl` parameter.
-
-4. **scaffolding**  
-    ```bash  
-    ## result is `groups.agp`
-    cphasing scaffolding output.clusters.txt sample.counts_AAGCTT.txt sample.clm.gz -sc sample.split.contacts -f draft.asm.fasta -t 4
-
-    ## for polyploid can specified allele table to adjust the orientation of different haplotypes
-    cphasing scaffolding output.clusters.txt sample.counts_AAGCTT.txt sample.clm -at draft.asm.allele.table -sc sample.split.contacts -f draft.asm.fasta 
-    ```  
-5. **plot**  
-    > Adjust the contig-level contacts matrix into chromosome-level and plot a heatmap.  
-
-    ```bash  
-    ## result is `sample.10000.cool` and `groups.wg.png`
-    cphasing pairs2cool sample.pairs.gz draft.asm.contigsizes sample.10000.cool
-    cphasing plot -a groups.agp -m sample.10000.cool -o groups.wg.png
-    ```  
-
-    > `plot` also enable only plot the heatmap from a matrix file (.cool)
-
-    ```bash
-    ## directly plot the heatmap in the input resolution, 
-    cphasing plot -m sample.500k.chrom.cool -o sample.500k.chrom.png
-
-    ## or first coarsen small binsize to larger binsize, e.g. 10k to 500k, specify the `-bs 500k` parameter.
-
-    cphasing plot -m sample.10k.chrom.cool -o sample.500k.chrom.png -bs 500k
-    ```
-
-    Note: If the number of bins in matrix is too large (large genome with small binsize), the memory may overflow. To fix it, users can improve the binsize or specified several chromosomes with `-c` parameter. 
-
-    ```bash
-    cphasing plot -m sample.100k.chrom -c Chr01g1,Chr01g2,Chr01g3,Chr01g4 -o Chr01.100k.png 
-    ```
-
-## FAQ
-### The results of the first round partition are unsatisfactory.
-In our two-round partition algorithm, the first round partition depends on the h-trans errors between homologous chromosomes; if you input a contig assembly with low level switch errors or input a high accuracy pore-c data, the h-trans will be not enough to cluster all contigs to correct homologous groups, resulting in unsatisfactory results. You can set the `-q1 0` for `hyperpartition` to increase the rate of h-trans errors. However, this parameter may raise error of `out of memory` when you input huge pore-c data in porec table or hic contacts in pairs file. 
-
-### How to set the `-n` parameter when assembling an aneuploid genome. 
-The aneuploid genome, such as modern cultivated sugarcane, contains unequal homologous chromosomes. The `-n` parameter can be set to zero (`-n 10:0`) to automatically partition contigs into different chromosomes within a homologous group.     
-However, we also allow the user to input a file with two columns: the first column is the index(1-base) of the first round partition, and the second column is the chromosome number of each homologous. And then specified the `-n 10:second.number.tsv` in `cphasing pipeline` or `cphasing hyperpartition`.
-- `second.number.tsv`
-```text
-1    13
-2    12
-3    12
-4    11
-5    10
-6    12
-7    12
-8    10
-9    12
-10    12
-```
-
+## More
+More details please check the documentation:  
+[Documentation](https://wangyibin.github.io/CPhasing/latest) | [中文文档](https://wangyibin.github.io/CPhasing/latest/zh)
 
diff --git a/bin/cphasing-rs b/bin/cphasing-rs
diff --git a/bin/deactivate_cphasing b/bin/deactivate_cphasing
@@ -13,7 +13,7 @@ software_dir="$(dirname "$script_dir")"
 
 if [ ! -f $software_dir/pixi.toml ]; then
     echo "pixi.toml not found in $software_dir"
-    exit 1
+    return 1
 fi
 
 is_tty=$(tty)   

diff --git a/cphasing/__init__.py b/cphasing/__init__.py
@@ -29,7 +29,7 @@
 __email__ = ("[email protected]", "[email protected]")
 __license__ = "BSD"
 __status__ = "Development"
-__version__ = "0.2.3.r287"
+__version__ = "0.2.4.r288"
 __epilog__ =  f"""
             \b
             Version: {__version__} | \n

diff --git a/cphasing/_config.py b/cphasing/_config.py
@@ -14,4 +14,24 @@
 
 ## Datatype of HyperGraph order
 HYPERGRAPH_ORDER_DTYPE = "int8"
-HYPERGRAPH_COL_DTYPE = "int32"
+HYPERGRAPH_COL_DTYPE = "int32"
+
+
+## Default parameters for CLI 
+ALLELES_TRIM_LENGTH = 25_000
+
+## HyperGraph construction
+EDGE_LENGTH = "500k"
+
+
+## HyperPartition
+HCR_LOWER = 0.01
+HCR_UPPER = 1.5
+MIN_CONTACTS = 5
+MIN_CIS_WEIGHT = 5.0
+MIN_WEIGHT = 0.1
+
+MIN_SCAFFOLD_LENGTH = 2e6 
+MIN_LENGTH = 10_000
+
+
diff --git a/cphasing/algorithms/hypergraph.py b/cphasing/algorithms/hypergraph.py
@@ -240,9 +240,10 @@ def clique_expansion_init(H, P_allelic_idx=None,
 
         if min_weight > 0:
             mask = A >= min_weight
-            A.multiply(mask)
-
-
+            # mask = (A >= min_weight).astype(bool)
+            # mask += (A <= -min_weight).astype(bool)
+            A = A.multiply(mask)
+
         if P_allelic_idx or P_weak_idx:
             # P = np.ones((H.shape[0], H.shape[0]), dtype=np.int8)
             # P[np.diag_indices_from(P)] = 0
@@ -288,7 +289,7 @@ def to_contacts(A, nodes,
 
         df = pd.DataFrame({0: V[A.row], 1: V[A.col], 2: A.data})
         df = df[df[0] <= df[1]]
-        df = df[df[2] >= min_weight]
+        df = df[(df[2] >= min_weight) | (df[2] <= -min_weight)]
         df = df[abs(df[2]) != np.inf]
         # df = df.query(f"0 <= 1 & 2 >= {min_weight} & abs(2) != inf")
         df.to_csv(output, sep='\t', header=False, index=False)
@@ -461,6 +462,7 @@ def IRMM(H, A=None,
     --------
     >>> IRMM(H, vertices)
     """
+
     if max_round > 1:
         import dask.array as da 
     # if P_allelic_idx or P_weak_idx:
@@ -497,6 +499,8 @@ def IRMM(H, A=None,
 
         if min_weight > 0:
             mask = A >= min_weight
+            # mask = (A >= min_weight).astype(bool)
+            # mask += (A <= -min_weight).astype(bool)
             A = A.multiply(mask)
         # normalization
         if NW is not None:

diff --git a/cphasing/algorithms/recluster.py b/cphasing/algorithms/recluster.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+"""
+recluster the contigs on existing clusters
+"""
+
+import argparse
+import logging
+import os
+import os.path as op
+import sys
+
+import pandas as pd
+import numpy as np
+
+from ..core import AlleleTable, PruneTable, ClusterTable
+
+
+class Recluster:
+    """
+    Developping...
+    """
+    def __init__(self, clustertable, contacts, countre, prunetable, min_length=50000):
+        self.clustertable = clustertable
+        self.contacts = contacts
+        self.countre = countre
+        self.prunetable = prunetable
+
+        self.min_length = min_length
+
+
+
+