-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_HiC1Dmetrics.py
151 lines (129 loc) · 5.98 KB
/
run_HiC1Dmetrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
import os
import sys, subprocess, time
import numpy as np
import pandas as pd
import argparse
import cooler
# Command-line interface. Arguments are parsed at module level and exposed as
# module globals that the __main__ section below reads.
parser = argparse.ArgumentParser(
    description="Compare two Hi-C samples with HiC1Dmetrics ('h1d two') over "
                "fixed-size regions of one chromosome.")
parser.add_argument("--regfile", "-i", type=str, required=True,
                    help="Tab-separated region table with a header row and "
                         "chrom, start and end columns.")
parser.add_argument("--coolfile1", "-f", type=str, required=True,
                    help="Cooler file of sample 1; opened as "
                         "<file>::resolutions/<resolution>.")
parser.add_argument("--coolfile2", "-s", type=str, required=True,
                    help="Cooler file of sample 2; opened as "
                         "<file>::resolutions/<resolution>.")
parser.add_argument("--sample_name1", "-a", type=str, required=True,
                    help="Label of sample 1, used in output file names.")
parser.add_argument("--sample_name2", "-b", type=str, required=True,
                    help="Label of sample 2, used in output file names.")
parser.add_argument("--resolution", "-r", type=int, required=True,
                    help="Bin size; selects the resolutions/<r> group in both "
                         "cooler files.")
parser.add_argument("--chrom", "-c", type=str, required=True,
                    help="Chromosome name; only regions on it are processed.")
parser.add_argument("--window", "-w", type=int, required=True,
                    help="Region length; only regions whose end - start equals "
                         "this value are processed.")
parser.add_argument("--processes", "-n", type=int, required=True,
                    help="Maximum number of h1d commands run at the same time.")
# Without a default, omitting --outdir left args.outdir as None and every
# output path started with the literal directory name "None".
parser.add_argument("--outdir", "-o", type=str, default=".",
                    help="Output directory (default: current directory).")
args = parser.parse_args()

reg_file = args.regfile
cool_file1 = args.coolfile1
cool_file2 = args.coolfile2
sample_name1 = args.sample_name1
sample_name2 = args.sample_name2
res = args.resolution
chrom = args.chrom
window_size = args.window
num_processes = args.processes
outdir = args.outdir
## From basenji
def exec_par(cmds, max_proc=None, verbose=False):
    """Run shell commands, keeping at most ``max_proc`` of them running at once.

    Blocks until every command has exited. A non-zero exit status is reported
    on stderr but does not stop the remaining commands.

    Args:
        cmds: Sequence of shell command strings. Each one is executed with
            ``shell=True``, so callers are responsible for quoting.
        max_proc: Maximum number of concurrent processes. ``None`` launches
            all commands at once.
        verbose: If true, echo each command to stderr when it is launched.

    Raises:
        ValueError: If ``max_proc`` is smaller than 1 while there are commands
            to run (previously this looped forever without launching anything).
    """
    cmds = list(cmds)
    total = len(cmds)
    if total == 0:
        return

    if max_proc is None:
        max_proc = total
    if max_proc < 1:
        raise ValueError(f'max_proc must be at least 1, got {max_proc}')

    def report(cmd, status):
        # Make failures visible; they were silently ignored before.
        if status != 0:
            print(f'Warning: exit status {status} from: {cmd}', file=sys.stderr)

    if max_proc == 1:
        # Strictly sequential: wait on each process directly, no polling delay.
        for cmd in cmds:
            if verbose:
                print(cmd, file=sys.stderr)
            report(cmd, subprocess.Popen(cmd, shell=True).wait())
        return

    running = []   # (command, Popen) pairs that have not been reaped yet
    launched = 0
    while launched < total:
        # launch jobs up to max
        while len(running) < max_proc and launched < total:
            cmd = cmds[launched]
            if verbose:
                print(cmd, file=sys.stderr)
            running.append((cmd, subprocess.Popen(cmd, shell=True)))
            launched += 1

        # Everything has been started: skip polling and block in the final wait.
        if launched == total:
            break

        # are any jobs finished
        still_running = []
        for cmd, proc in running:
            status = proc.poll()
            if status is None:
                still_running.append((cmd, proc))
            else:
                report(cmd, status)

        # if none finished, sleep
        if len(still_running) == len(running):
            time.sleep(1)
        running = still_running

    # wait for all to finish
    for cmd, proc in running:
        report(cmd, proc.wait())
def _label_matrix_with_coords(matrix, start, res):
    """Return ``matrix`` with bin coordinates added as first row and first column.

    Coordinates start at ``start`` rounded down to a multiple of ``res`` and
    advance by ``res`` per bin; the top-left corner cell is NaN.
    """
    coords = (start // res) * res + res * np.arange(matrix.shape[0])
    with_header_row = np.vstack([coords, matrix])
    return np.column_stack((np.append(np.nan, coords), with_header_row))


def run_HiC1Dmetrics_target_matrix(df,res,genome_hic_cool_1,genome_hic_cool_2,sample_name1,sample_name2,num_processes,outdir):
    """Dump per-region contact matrices for two samples and run ``h1d two`` on them.

    For every row of ``df`` the balanced matrix of the region is fetched from
    both coolers, written as a coordinate-labelled tab-separated text file
    under ``{outdir}/{chrom}/{res}/``, and one ``h1d two`` command per metric
    (ISC, CIC, SSC, deltaDLR, CD) is queued. All queued commands are then
    executed through ``exec_par``.

    Args:
        df: Region table with ``chrom``, ``start`` and ``end`` columns; the
            positional row index is used in the output file names.
        res: Bin size, used for the coordinate labels, the output paths and
            the ``h1d`` command line.
        genome_hic_cool_1: Object for sample 1 providing
            ``matrix(balance=True).fetch(region)`` (a cooler in this script).
        genome_hic_cool_2: Same, for sample 2.
        sample_name1: Label of sample 1 in file names.
        sample_name2: Label of sample 2 in file names.
        num_processes: Maximum number of concurrent ``h1d`` commands.
        outdir: Root output directory.
    """
    # Values passed to h1d via -p; they do not depend on the region.
    # NOTE(review): both pairs equal 5x / 15x of a resolution (20480 and 2048);
    # presumably any other resolution should scale the same way -- confirm.
    if res == 20480:
        param = 102400
        param_DLR = 307200
    else:
        param = 10240
        param_DLR = 30720

    samples = ((genome_hic_cool_1, sample_name1), (genome_hic_cool_2, sample_name2))
    hic1d_jobs = []
    for i in range(df.shape[0]):
        region = df.iloc[i]
        chrom = region['chrom']
        start = region['start']
        end = region['end']
        mseq_str = '%s:%d-%d' % (chrom, start, end)

        region_dir = f'{outdir}/{chrom}/{res}'
        # exist_ok replaces a bare "except: pass" that also hid real failures
        # such as permission errors.
        os.makedirs(region_dir, exist_ok=True)

        matrix_files = []
        for clr, sample_name in samples:
            contacts = clr.matrix(balance=True).fetch(mseq_str)
            matrix_file = f'{region_dir}/{sample_name}_{res}_{i}_regions.txt'
            np.savetxt(matrix_file, _label_matrix_with_coords(contacts, start, res), delimiter='\t')
            matrix_files.append(matrix_file)

        for metric in ['ISC','CIC','SSC','deltaDLR','CD']:
            cmd = f'h1d two {metric} {matrix_files[0]} {matrix_files[1]}'
            cmd += f' {res} {chrom} --datatype matrix'
            if metric == 'deltaDLR':
                cmd += f' -p {param_DLR}'
            elif metric != 'CD':
                # CD is the only metric run without -p.
                cmd += f' -p {param}'
            cmd += f' -o {region_dir}/{sample_name1}_vs_{sample_name2}_{i}_{metric}'
            hic1d_jobs.append(cmd)

    exec_par(hic1d_jobs, num_processes, verbose=True)
def extract_HiC1Dmetrics_results(df,res,sample_name1,sample_name2,outdir):
    """Summarise the per-region h1d bedGraph outputs into one TSV table.

    For each metric (ISC, CIC, SSC, deltaDLR, CD) and each row of ``df`` the
    file ``{outdir}/{chrom}/{res}/{sample_name1}_vs_{sample_name2}_{i}_{metric}.bedGraph``
    is read and the mean of the absolute values of its fourth column is stored
    in a new column of ``df`` named after the metric.

    ``df`` is modified in place: the metric columns are added and the
    ``window`` column is removed. The resulting table is written to
    ``{outdir}/{sample_name1}_vs_{sample_name2}_comp_HiC1Dmetrics_results_{res}.tsv``.
    """
    bedgraph_columns = ['chrom','start','end','value']

    def mean_abs_value(region_idx, metric):
        # Mean absolute signal of one metric track for one region.
        region_chrom = df.iloc[region_idx]['chrom']
        track = pd.read_table(
            f'{outdir}/{region_chrom}/{res}/{sample_name1}_vs_{sample_name2}_{region_idx}_{metric}.bedGraph',
            header=None,
            sep='\t',
            names=bedgraph_columns,
        )
        return track['value'].abs().mean()

    region_count = df.shape[0]
    for metric in ('ISC','CIC','SSC','deltaDLR','CD'):
        df[metric] = [mean_abs_value(idx, metric) for idx in range(region_count)]

    df.drop(columns=['window'],inplace=True)
    summary_path = f'{outdir}/{sample_name1}_vs_{sample_name2}_comp_HiC1Dmetrics_results_{res}.tsv'
    df.to_csv(summary_path,sep='\t',index=False)
if __name__ == '__main__':
    # Load the region table and keep only regions of the requested length on
    # the requested chromosome.
    regs = pd.read_table(reg_file,header=0,sep='\t')
    regs['window'] = regs['end']-regs['start']
    # Explicit copy: extract_HiC1Dmetrics_results() adds and drops columns on
    # this frame in place, which on a filtered slice of `regs` triggers pandas
    # chained-assignment warnings.
    regs_sub = regs[(regs['window']==window_size) & (regs['chrom']==chrom)].copy()

    # Open both cooler files at the requested resolution.
    clr_s1 = cooler.Cooler(f'{cool_file1}::resolutions/{res}')
    clr_s2 = cooler.Cooler(f'{cool_file2}::resolutions/{res}')

    # Run h1d on every selected region, then collect the results into one TSV.
    run_HiC1Dmetrics_target_matrix(regs_sub,res,clr_s1,clr_s2,sample_name1,sample_name2,num_processes,outdir)
    extract_HiC1Dmetrics_results(regs_sub,res,sample_name1,sample_name2,outdir)