wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chromFa.tar.gz
wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/hg38-human/hg38.blacklist.bed.gz
gunzip hg38.blacklist.bed.gz
tar -xvf hg38.chromFa.tar.gz
for file in `ls chroms/ | grep _`;do
rm chroms/$file
done
genome=hg38
genome_fa_dir=./chroms/
genome_chrom_size=./hg38.chrom.sizes
# for example: DpnII-->GATC; HindIII-->AAGCTT. This would depend on which enzyme you use for Hi-C experiment.
cutsite=GATC
enzyme=DPNII
blackregion=./hg38.blacklist.bed
lib=HiCorr/bin/generateReference_lib/
output=./${genome}_${enzyme}
mkdir $output
chmod +x $lib/*
## start build references #################################################################################
# 1. generate fragment bed file
$lib/find_RE_sites.pl $genome_fa_dir $genome_chrom_size $cutsite $lib> $genome.cutting.sites
python3 $lib/sites_to_frag.py $genome_chrom_size $genome.cutting.sites | awk '{print $0,$3-$2+1}' OFS='\t' > $genome.$enzyme.frag.bed
# 2. generate ~5kb anchor (Here we average each fragment into ~5kb anchor)
python3 $lib/generate.fragment.py $genome.$enzyme.frag.bed 5000 > ${genome}_${enzyme}_frag_2_anchor
python3 $lib/get_aveg_frag_length.py ${genome}_${enzyme}_frag_2_anchor anchor.bed > ${genome}_${enzyme}_anchors_avg.bed
mkdir $genome.anchor.5kb
cat ${genome}_${enzyme}_anchors_avg.bed | cut -f1-4 | awk '{print>"'$genome'"".anchor.5kb/"$1".bed"}'
# 3. divided all anchor based on their length into 20 equal size groups
$lib/get_group_range.pl ${genome}_${enzyme}_anchors_avg.bed 6 20 > ${genome}_anchor_length.groups
# 4. generate all possible trans contact matric
$lib/count_trans_pairs_by_GC.pl ${genome}_${enzyme}_anchors_avg.bed ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups > $genome.trans.possible.pairs
# 5. generate anchor to blacklist file by overlaping 5kb anchor and blacklist region
bedtools intersect -wa -a ${genome}_${enzyme}_anchors_avg.bed -b $blackregion | cut -f1-4 | sort -u > ${genome}_5kb_anchors_blacklist
# 6. generate all possible pairs within 2Mb (blacklist has been removed)
$lib/list_full_matrix.pl ${genome}_${enzyme}_anchors_avg.bed 2000000 | python3 $lib/remove.blacklist.py ${genome}_5kb_anchors_blacklist > ${genome}.full.matrix
# 7. Set up 400 groups for distance within 0-2Mb (5kb per group)
### "$genome.dist.5kb.group" contains distance groups (401 groups for 200,0000)
# 1 -1 1000
# 2 1001 5000
# 3 5001 10000
# 4 10001 15000
# ...
# 400 1990001 1995000
#401 1995001 2e+06
cp HiCorr/bin/dist.401.group ${genome}.dist.401.group
# 8. To create length and distance statistical file for full matrix
$lib/get_group_statistics.pl ${genome}.full.matrix ${genome}_${enzyme}_anchors_avg.bed ${genome}_anchor_length.groups $genome.dist.5kb.group | awk '{print $0,0}' OFS='\t' > $genome.full.dist.stat.5kb