-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCalcFalloff.py
More file actions
223 lines (186 loc) · 8.08 KB
/
CalcFalloff.py
File metadata and controls
223 lines (186 loc) · 8.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
""" CalcFalloff -- Calculates the read falloff around a gene
This is code for finding the average(?) expression across all genes as a
function of position. We'd like to see the upstream area (which should be
relatively low), the gene itself (relatively high, and nearly constant across
the gene), and then the downstream, which ought to fall off quickly in the
no-drug case, and fall off much more slowly in the +drug case.
"""
from __future__ import division
from argparse import ArgumentParser, FileType
import numpy as np
import progressbar
import sys
import Utils
import bx.bbi.bigwig_file as bigwig
def parse_args():
parser = ArgumentParser()
parser.add_argument('-d', '--downstream', type=int, default=300)
parser.add_argument('-u', '--upstream', type=int, default=300)
parser.add_argument('-r', '--resolution', type=int, default=200)
parser.add_argument('-c', '--clearance', type=int, default=50)
parser.add_argument('-g', '--gtf-filename',
default='E_coli_k12.EB1_e_coli_k12.13.gtf')
parser.add_argument('-t', '--use-non-terminated', dest='use_terminated',
action='store_false')
parser.add_argument('-T', '--use-terminated', dest='use_terminated',
action='store_true')
parser.add_argument('-G', '--terminated-genes', default=False)
parser.add_argument('-S', '--genome-size', type=int, default=4639675)
parser.add_argument('bigwig', type=FileType('rb'))
global args
args = parser.parse_args()
if args.terminated_genes:
terminated_genes = set()
for line in open(args.terminated_genes):
terminated_genes.add(line.strip())
args.terminated_genes = terminated_genes
return args
def find_upper_limits(end, strand, to_left, to_right):
""" """
if strand == '+':
dist = args.downstream
else:
dist = args.upstream
no_gene_pos = end + dist
gene_pos = to_right[end] - args.clearance
downstream = max(min(no_gene_pos, gene_pos),
end)
return downstream
def find_lower_limits(start, strand, to_left, to_right):
""" """
if strand == "+":
dist = args.upstream
else:
dist = args.downstream
no_gene_pos = start - dist
gene_pos = nearest_to_left[start] + args.clearance
upstream = min(max(no_gene_pos, gene_pos),
start)
return upstream
def calc_deltas(us, ue, ds, de, strand):
delta_up = np.zeros(args.upstream)
delta_down = np.zeros(args.downstream)
if strand == '+':
delta_up[args.upstream - ue + us:] += 1
delta_down[:de - ds] += 1
elif strand == '-':
delta_up[args.upstream - de + ds:] += 1
delta_down[:ue - us] += 1
return delta_up, delta_down
if __name__ == "__main__":
args = parse_args()
gtf_data = Utils.parse_gtf(args.gtf_filename)
nearest_to_left, nearest_to_right = Utils.find_nearest_genes(gtf_data,
args.genome_size)
reads = bigwig.BigWigFile(args.bigwig)
# number of reads (calculated with pileup)
upstream = np.zeros(args.upstream)
# number of genes used at each position
upstream_n = np.zeros(args.upstream)
downstream = np.zeros(args.downstream)
downstream_n = np.zeros(args.downstream)
gene_cov = np.zeros(args.resolution)
gene_cov_n = 0
dists = []
Utils.filter_gtf(gtf_data)
pbar = progressbar.ProgressBar(widgets=['Pileups: ',
progressbar.Percentage(), ' ',
progressbar.Bar(), ' ',
progressbar.ETA()],
maxval=len(gtf_data))
all_ratios = {}
for gene in pbar(gtf_data):
start = gene.start
end = gene.end
strand = gene.strand
chrom = gene.chrom # Should always be "Chromosome"
if start == 4275492:
continue
# soxR-1 has sraL with an overlapping region with sraL. I'm a little
# surprised this seems to be the only problem...
if args.terminated_genes:
name = Utils.guess_gene_name(gene.other)
if args.use_terminated and name not in args.terminated_genes:
continue
if not args.use_terminated and name in args.terminated_genes:
continue
# Look to the left of the gene
# (upstream for the + strand, downstream for the - strand)
upstream_start = find_lower_limits(start, strand, nearest_to_left,
nearest_to_right)
upstream_end = start
downstream_start = end
downstream_end = find_upper_limits(end, strand, nearest_to_left,
nearest_to_right)
delta_up, delta_down = calc_deltas(upstream_start, upstream_end,
downstream_start, downstream_end,
strand)
dists.append(upstream_end - upstream_start)
total_downstream = 0.0
if upstream_start < upstream_end:
for n, pos in zip(reads.get_as_array(chrom, upstream_start,
upstream_end),
range(upstream_start, upstream_end)):
if not upstream_start <= pos < upstream_end:
continue
if strand == '+':
i = args.upstream - start + pos
upstream[i] += n
elif strand == '-':
i = start - pos - 1
total_downstream += n
downstream[i] += n
# Look at the gene itself
last_pct = -1
total_reads = 0
for n, pos in zip(reads.get_as_array(chrom, start, end),
range(start, end)):
if not start <= pos < end:
continue
if last_pct == -1:
# Genes with any expression whatsoever should get included
gene_cov_n += 1
upstream_n += delta_up
downstream_n += delta_down
last_pct = 0
# Note the from future import __division__
if strand == '+':
hi = int(np.floor(args.resolution *
(pos - start) / (end - start)))
elif strand == '-':
hi = int(np.floor(args.resolution *
(end - pos) / (end - start)))
total_reads += n
if last_pct == hi:
gene_cov[hi] += n
else:
gene_cov[last_pct:hi] += n
last_pct = hi
mean_reads = total_reads / float(end - start)
# Look to the right of the gene
# (downstream for the + strand, upstream for the - strand)
dists.append(downstream_end - downstream_start)
if downstream_start < downstream_end:
for n, pos in zip(reads.get_as_array(chrom, downstream_start,
downstream_end),
range(downstream_start, downstream_end)):
if not downstream_start < pos < downstream_end:
continue
if strand == '+':
i = pos - end - 1
downstream[i] += n
total_downstream += n
elif strand == '-':
i = args.upstream - pos + end
upstream[i] += n
if sum(delta_down) and total_reads:
all_ratios[gene.other] = total_downstream / (sum(delta_down) *
total_reads)
avg_upstream = upstream / upstream_n
avg_gene_cov = gene_cov / gene_cov_n
avg_downstream = downstream / downstream_n
print np.median(avg_upstream), (np.median(avg_upstream) /
np.median(avg_gene_cov))
print np.median(avg_gene_cov), 1
print np.median(avg_downstream), (np.median(avg_downstream) /
np.median(avg_gene_cov))