-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUtils.py
More file actions
98 lines (80 loc) · 3.07 KB
/
Utils.py
File metadata and controls
98 lines (80 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from collections import namedtuple
import progressbar
Gene = namedtuple("Gene", ['chrom', 'start', 'end', 'strand', 'type', 'other'])
def parse_gtf(filename):
""" Parses the GTF in filename into a list of Genes
At the moment, this strips out everything that's a comment or on the F
chromosome, as well as anything that has the exact same extent as the
previous entry.
"""
genelist = []
last_pos = ('', 0, 0)
for line in open(filename):
if line.startswith('#') or line.startswith('F'):
continue
data = line.split('\t')
if (data[0], data[3], data[4]) != last_pos:
genelist.append(Gene(chrom=data[0],
start=int(data[3]),
end=int(data[4]),
strand=data[6],
type=data[2],
other=data[-1].strip()))
last_pos = data[0], data[3], data[4]
return genelist
def find_nearest_genes(gtf_data, genome_size):
"""For each base on the genome find the nearest genes to the left and right
This is useful for doing lookups, although it may be too large to scale to
non-bacterial organisms.
"""
nearest_to_left = [0] * genome_size
nearest_to_right = [0] * genome_size
old_start = 0
old_end = 0
first_start = 0
first_end = 0
gene = None
pbar = progressbar.ProgressBar(maxval=len(gtf_data))
for gene in pbar(gtf_data):
if gene.chrom != 'Chromosome':
continue
for i in range(old_start, gene.start):
nearest_to_right[i] = gene.start
if first_end == 0:
first_start = gene.start
first_end = gene.end
else:
for i in range(old_end, gene.end):
nearest_to_left[i] = old_end
old_start = gene.start
old_end = gene.end
assert gene # Should catch an empty gtf file
for i in range(first_end):
nearest_to_left[i] = gene.end
for i in range(gene.end, len(nearest_to_left)):
nearest_to_left[i] = gene.end
for i in range(gene.start, len(nearest_to_right)):
nearest_to_right[i] = first_start
return nearest_to_left, nearest_to_right
def filter_gtf(gtf_data):
""" Remove non-CDS entries from the GTF data"""
for entry in gtf_data[:]:
if entry.type != 'CDS' and entry.type != 'operon':
gtf_data.remove(entry)
def guess_gene_name(annotation_string):
""" Guess name based on "Other" field of GTF/GFF entry"""
guess_order = ['operon', 'operon_', 'gene', 'gene_', 'transcript']
entries = annotation_string.split(';')
names = {}
for entry in entries:
entry = entry.strip()
if 'name' in entry.lower():
kind = entry[:entry.lower().find('name')]
name = entry[entry.find('"') + 1:-1] # Within the quotes
names[kind] = name
for guess in guess_order:
if guess in names:
return names[guess]
if len(names) == 1:
return names.values()[0]
return annotation_string