-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUTR_compare.py
More file actions
125 lines (112 loc) · 5.83 KB
/
UTR_compare.py
File metadata and controls
125 lines (112 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import pysam
import bx.bbi.bigwig_file as bigwig
import sys
import progressbar
from Utils import parse_gtf, find_nearest_genes, filter_gtf
import argparse
from Bio import SeqIO
GENOME_SIZE = 4639675 # E. coli K12 MG1655 from Ensembl
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--gtf-fname', '-G')
parser.add_argument('--no-drug-fname', '-0')
parser.add_argument('--drug-fname', '-d')
parser.add_argument('--genome', '-g', type=open)
parser.add_argument('--outfh', '-o', type=argparse.FileType('w'))
parser.add_argument('--downstream_start', '-S', type=int, default=0)
parser.add_argument('--downstream_end', '-E', type=int, default=100)
parser.add_argument('--downstream_region', '-D', type=int, default=50)
parser.add_argument('--downstream_cleareance', '-C', type=int, default=50)
parser.add_argument('--ratio-high', '-R', type=float, default=1e100)
parser.add_argument('--ratio_low', '-r', type=float, default=10.0)
parser.add_argument('--keep-neg', dest='keep_pos', default=1,
action='store_const', const=-1)
parser.add_argument('--auto-name-out', '-A', default=False,
action="store_true")
args = parser.parse_args()
print args.keep_pos
if args.auto_name_out:
name = 'hidrug' if (args.keep_pos > 0) else 'drug_nonresp'
name += '_utrs_'
name += '%d_%d_%gxlo_%gxhi.fa' % (args.downstream_start,
args.downstream_end,
args.ratio_low,
args.ratio_high)
print "making file: ", name
args.outfh = open(name, 'w')
args.Genome = {r.name: r
for r in SeqIO.parse(args.genome, 'fasta')}
args.nonfh = False
return args
if __name__ == "__main__":
args = parse_args()
gtf_data = parse_gtf(args.gtf_fname)
nearest_to_left, nearest_to_right = find_nearest_genes(gtf_data,
GENOME_SIZE)
no_drug_reads = bigwig.BigWigFile(open(args.no_drug_fname))
drug_reads = bigwig.BigWigFile(open(args.drug_fname))
filter_gtf(gtf_data)
all_ratios = {}
outputted_data = []
pbar = progressbar.ProgressBar(widgets=['Pileups: ',
progressbar.Percentage(), ' ',
progressbar.Bar(), ' ',
progressbar.ETA()],
maxval=len(gtf_data))
for gene in pbar(gtf_data):
gene_data = gene.other.split(';')
gene_data = {dat.split()[0]: dat.split()[1] for dat in gene_data if dat}
gene_id = gene_data.get('gene_name', 'unknown_gene').strip('"')
if gene.strand == '+':
if ((gene.end + args.downstream_region)
> (nearest_to_right[gene.end + 3] - args.downstream_clearance)):
continue
drug_pileup = drug_reads.get_as_array(gene.chrom, gene.end,
gene.end + args.downstream_region)
no_drug_pileup = no_drug_reads.get_as_array(gene.chrom, gene.end,
gene.end + args.downstream_region)
drug_counts = sum(drug_pileup)
no_drug_counts = sum(no_drug_pileup)
if no_drug_counts:
all_ratios[gene.other] = drug_counts / no_drug_counts
if (args.ratio_low < all_ratios[gene.other] < args.ratio_high
and args.outfh):
seq = args.Genome[gene.chrom][gene.end + args.downstream_start:
gene.end + args.downstream_end]
seq.id = gene_id
if gene_id not in outputted_data:
seq.description = '%d-%d' % (gene.start, gene.end)
SeqIO.write(seq, args.outfh, 'fasta')
outputted_data.append(gene_id)
elif gene.strand == '-':
if ((gene.start - args.downstream_region)
< (nearest_to_left[gene.start - 3] + args.downstream_clearance)):
continue
drug_pileup = drug_reads.get_as_array(gene.chrom,
gene.start - args.downstream_region,
gene.start)
no_drug_pileup = no_drug_reads.get_as_array(gene.chrom,
gene.start - args.downstream_region,
gene.start)
drug_counts = sum(drug_pileup)
no_drug_counts = sum(no_drug_pileup)
if no_drug_counts:
all_ratios[gene.other] = drug_counts / no_drug_counts
if (args.ratio_low < all_ratios[gene.other] < args.ratio_high
and args.outfh):
seq = args.Genome[gene.chrom][gene.start - args.downstream_end:
gene.start - args.downstream_start]
seq = seq.reverse_complement()
seq.id = gene_id
if gene_id not in outputted_data:
seq.description = '%d-%d' % (gene.start, gene.end)
SeqIO.write(seq, args.outfh, 'fasta')
outputted_data.append(gene_id)
elif all_ratios[gene.other] < 1 and args.nonfh:
seq = args.Genome[gene.chrom][gene.start - args.downstream_end:
gene.start - args.downstream_start]
seq = seq.reverse_complement()
seq.id = gene_id
seq.description = '%d-%d' % (gene.start, gene.end)
SeqIO.write(seq, nonfh, 'fasta')
args.outfh.close()