PythonCourse-2012/Utils.py at master · petercombs/PythonCourse-2012 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from collections import namedtuple
import progressbar

Gene = namedtuple("Gene", ['chrom', 'start', 'end', 'strand', 'type', 'other'])


def parse_gtf(filename):
    """ Parses the GTF in filename into a list of Genes

    At the moment, this strips out everything that's a comment or on the F
    chromosome, as well as anything that has the exact same extent as the
    previous entry.
    """
    genelist = []
    last_pos = ('', 0, 0)
    for line in open(filename):
        if line.startswith('#') or line.startswith('F'):
            continue

        data = line.split('\t')
        if (data[0], data[3], data[4]) != last_pos:
            genelist.append(Gene(chrom=data[0],
                                 start=int(data[3]),
                                 end=int(data[4]),
                                 strand=data[6],
                                 type=data[2],
                                 other=data[-1].strip()))
            last_pos = data[0], data[3], data[4]
    return genelist


def find_nearest_genes(gtf_data, genome_size):
    """For each base on the genome find the nearest genes to the left and right

    This is useful for doing lookups, although it may be too large to scale to
    non-bacterial organisms.
    """
    nearest_to_left = [0] * genome_size
    nearest_to_right = [0] * genome_size

    old_start = 0
    old_end = 0
    first_start = 0
    first_end = 0
    gene = None

    pbar = progressbar.ProgressBar(maxval=len(gtf_data))

    for gene in pbar(gtf_data):
        if gene.chrom != 'Chromosome':
            continue
        for i in range(old_start, gene.start):
            nearest_to_right[i] = gene.start
        if first_end == 0:
            first_start = gene.start
            first_end = gene.end
        else:
            for i in range(old_end, gene.end):
                nearest_to_left[i] = old_end

        old_start = gene.start
        old_end = gene.end

    assert gene  # Should catch an empty gtf file

    for i in range(first_end):
        nearest_to_left[i] = gene.end
    for i in range(gene.end, len(nearest_to_left)):
        nearest_to_left[i] = gene.end
    for i in range(gene.start, len(nearest_to_right)):
        nearest_to_right[i] = first_start
    return nearest_to_left, nearest_to_right


def filter_gtf(gtf_data):
    """ Remove non-CDS entries from the GTF data"""
    for entry in gtf_data[:]:
        if entry.type != 'CDS' and entry.type != 'operon':
            gtf_data.remove(entry)


def guess_gene_name(annotation_string):
    """ Guess name based on "Other" field of GTF/GFF entry"""
    guess_order = ['operon', 'operon_', 'gene', 'gene_', 'transcript']
    entries = annotation_string.split(';')
    names = {}
    for entry in entries:
        entry = entry.strip()
        if 'name' in entry.lower():
            kind = entry[:entry.lower().find('name')]
            name = entry[entry.find('"') + 1:-1]  # Within the quotes
            names[kind] = name
    for guess in guess_order:
        if guess in names:
            return names[guess]
    if len(names) == 1:
        return names.values()[0]
    return annotation_string