-
Notifications
You must be signed in to change notification settings - Fork 3
/
normalizeBedgraph.py
executable file
·60 lines (45 loc) · 1.43 KB
/
normalizeBedgraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# normalizeBedgraph.py
# 6/16/14
# normalizes one bedgraph at a time
import csv, sys, math
csv.register_dialect("textdialect", delimiter='\t')
csv.register_dialect("spacedialect", delimiter=' ')
ifn = sys.argv[1]
sizes = sys.argv[2]
ofn = sys.argv[3]
# get biggest allowed sequence sizes
ifile = open(sizes, 'r')
reader = csv.reader(ifile, 'textdialect')
chrToLimit = {}
for row in reader:
chrToLimit[row[0]] = int(row[1])
ifile.close()
# normalize by scaling everything to a total read density of 500 million
ifile = open(ifn, 'r')
reader = csv.reader(ifile, 'textdialect')
sum = 0
for row in reader:
if len(row) < 4: continue
if '_' in row[0] and 'chr' in row[0]: continue
sum += math.fabs((int(row[2]) - int(row[1])) * float(row[3]))
multFactor = 500000000/sum
ifile.seek(0)
ofile = open(ofn, 'w')
writer = csv.writer(ofile, 'textdialect')
pastRow = ['0', 0, 0, 0]
for row in reader:
if len(row) < 4: continue
if '_' in row[0] and 'chr' in row[0]: continue
start, stop, val = int(row[1]), int(row[2]), float(row[-1])
if val == 0: continue
if start > chrToLimit[row[0]] or stop > chrToLimit[row[0]]: continue
val = val * multFactor
row[-1] = val
if start == int(pastRow[2]) and val == float(pastRow[-1]):
pastRow[2] = stop
else:
if pastRow[0] != '0': writer.writerow(pastRow)
pastRow = row
if pastRow[0] != '0': writer.writerow(pastRow)
ifile.close()
ofile.close()