python-根据gtf绘制基因全部转录本结构图

程序员文章站 2022-07-14 14:18:20

...

根据基因组注释文件gtf绘制基因全部转录本的结构图，利用python进行实现，并实现了GUI

可以下载各种gtf，从NCBI,ENSEMBL,UCSC,GENCODE都可以，但是要根据相应的版本修改代码

重点是得到所有基因的转录本个数，以及每个转录本的外显子的坐标。

先上效果图，这是人类基因组中的ANXA1基因的全部转录本

python-根据gtf绘制基因全部转录本结构图

下面直接上代码

#!/usr/bin/env python
#  coding:utf-8
import re
import os
import os.path
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.lines as lines
from tkinter import *
from tkinter import ttk
import time
import imageio
import shutil



class showGene(object):
    def __init__(self):

        list = os.listdir(os.getcwd())  # 列出文件夹下所有的目录与文件
        list2 = []
        for i in range(0, len(list)):
            if list[i][-4:-1] + list[i][-1] == '.gtf':
                list2.append(list[i])
        #选择基因组以及要显示的基因的GUI界面
        self.top = Tk()
        self.genename = StringVar()
        self.genedata = StringVar()
        self.getinfoFm = Frame(self.top)
        self.data = Label(self.getinfoFm, text="Chooes the gtf of a species").grid(column=0, row=0)  # 添加一个标签，并将其列设置为1，行设置为0
        self.gene = Label(self.getinfoFm, text="       Enter a genename:").grid(column=1, row=0)
        self.choosedata = ttk.Combobox(self.getinfoFm, width = 15, textvariable=self.genedata)
        self.choosedata['values'] = [''] + list2
        self.choosedata.current(0)
        self.choosedata.grid(column = 0, row = 1)
        self.genenameEntry = Entry(self.getinfoFm, textvariable=self.genename, width=12, )
        self.genenameEntry.grid(column = 1, row = 1)
        self.getinfoFm.pack()
        #按钮界面
        self.buttonFm = Frame(self.top)
        self.showB = Button(self.buttonFm, text = 'SHOW', command = self.show,
            activeforeground = 'white',
            activebackground = 'blue').grid(column = 0, row = 0)
        self.quitB = Button(self.buttonFm, text = 'QUIT', command = self.top.quit,
            activeforeground = 'white',
            activebackground = 'blue').grid(column = 1, row = 0)
        self.buttonFm.pack()

    def show(self):
        global genetxt_path
        transcript_num = 0
        gene_exist = 0
        transcript_list = []
        line_width = 5
        if self.genedata.get() == '' or self.genename.get() == '':
            return

        #gtf文件路径，默认为可执行文件（exe）所在文件夹
        data_path = os.getcwd() + '/' + self.genedata.get()
        genename = self.genename.get()
        #储存各基因结果（txt文本和png图片）的文件夹
        result_path = os.getcwd() + '/My result/' + genename
        #若路径不存在，创造路径
        if not os.path.exists(result_path):
            os.makedirs(result_path)
        #临时文件（基因的信息）的路径
        genetxt_path = result_path + '/' + genename + '.txt'

        f = open(data_path, 'r')
        fp = open(genetxt_path, 'w')
        allLines = f.readlines()
        for eachLine in allLines:
            if not eachLine.startswith('#'):
                eachLine_list = eachLine.split('\t')
                name = re.search('gene_name "(.*?)";', eachLine_list[-1]).group(1)
                if name == genename:
                    gene_exist = 1
                    fp.write(eachLine)
                    if eachLine_list[2] == 'transcript':
                        transcript_num += 1
                        transcript_name = re.search('transcript_name "(.*?)";', eachLine_list[-1]).group(1)
                        if transcript_name not in transcript_list:
                            transcript_list.append(transcript_name)
        f.close()
        fp.close()
        #若gtf文件中不存在相应基因的信息，弹出提示
        if gene_exist == 0:
            self.failedFm = Frame(self.top)
            self.faillb = Label(self.failedFm, text="genename no exist,please check and retry").pack()
            self.failedFm.pack()
            self.top.update()
            time.sleep(3)
            self.genename.set('')
            self.failedFm.destroy()
            shutil.rmtree(result_path)
        #反之向下执行
        else:
            fig = plt.figure(1)
            num = 0
            tmp_colors = ['lime', 'red', 'blue', 'yellow', 'yellow', 'w']
            names_tmp_colors = ['gene', 'CDS', 'exon', 'three_prime_utr', 'five_prime_utr', 'stop_codon']
            colors_legend_name = ['gene', 'CDS_exon', 'non_CDS_exon', 'UTR_exon']
            color_dict = dict(zip(names_tmp_colors, tmp_colors))
            png_path = result_path + '/' + genename + '.png'
            #读取之前保存的包含基因信息的txt文件，若不存在，弹出警告信息
            fp = open(genetxt_path, 'r')
            allLines = fp.readlines()
            for eachLine in allLines:
                eachLine_list = eachLine.split('\t')
                if eachLine_list[2] == 'gene':
                    #判断方向
                    if eachLine_list[6] == '+':
                        arr = '->'
                    else:
                        arr = '<-'

                    ax = fig.add_axes([0.2, 0.2, 0.5, 0.6])
                    arrow = mpatches.FancyArrowPatch(
                        (int(eachLine_list[3]), 0.1),
                        (int(eachLine_list[4]), 0.1),
                        arrowstyle=arr,
                        mutation_scale=25, lw=1, color='lime', antialiased=True)
                    # 画箭头
                    ax.add_patch(arrow)
                    # 坐标轴标签
                    ax.set_xlim(int(eachLine_list[3]), int(eachLine_list[4]))
                    ax.set_ylim(-0.5, transcript_num + 1)
                    ax.set_xticks(np.linspace(int(eachLine_list[3]), int(eachLine_list[4]), 5))
                    ax.set_yticks([0.1] + list(range(1, transcript_num + 1)))
                    ax.set_yticklabels(['gene'] + transcript_list)
                    ax.set_xticklabels([str(j) for j in [int(i) for i in np.linspace
                        (int(eachLine_list[3]), int(eachLine_list[4]),5)]])
                    # 坐标轴显示
                    ax.spines['top'].set_visible(False)
                    ax.spines['left'].set_visible(False)
                    ax.spines['right'].set_visible(False)
                    ax.get_xaxis().tick_bottom()
                    ax.get_yaxis().tick_left()
                    ax.get_xaxis().set_tick_params(direction='out')
                    ax.tick_params(axis=u'y', which=u'both', length=0)  # 纵坐标刻度线不显示（length=0）
                    # 坐标轴字体大小
                    for tick in ax.xaxis.get_major_ticks():
                        tick.label.set_fontsize(6)
                    for tick in ax.yaxis.get_major_ticks():
                        tick.label.set_fontsize(6)


                elif eachLine_list[2] == 'transcript':
                    num = num + 1
                    line1 = [(int(eachLine_list[3]), num), (int(eachLine_list[4]), num)]
                    (line1_xs, line1_ys) = zip(*line1)
                    ax.add_line(lines.Line2D(line1_xs, line1_ys, linewidth=0.2,
                                            solid_capstyle='butt', solid_joinstyle='miter',
                                            antialiased=False, color='black'))

                elif eachLine_list[2] in color_dict.keys():
                    # 添加结构图
                    line2 = [(int(eachLine_list[3]) - 0.5, num), (int(eachLine_list[4]) + 0.5, num)]
                    (line2_xs, line2_ys) = zip(*line2)
                    ax.add_line(lines.Line2D(line2_xs, line2_ys,
                                            solid_capstyle='butt', solid_joinstyle='miter',
                                            linewidth=int(line_width), alpha=1,
                                            color=color_dict[eachLine_list[2]],
                                            antialiased=False))
                # 做图例
                # add_axes 是在一张图上指定特定区域作图，第一个数字为从左边%74处，下面20%处开始，宽20%，高60%区域作图
                ax_legend = fig.add_axes([0.76, 0.2, 0.2, 0.6])
                for i in range(len(colors_legend_name)):
                    line3 = [(0, (9 - i) * 0.1), (0.1, (9 - i) * 0.1)]
                    (line3_xs, line3_ys) = zip(*line3)
                    ax_legend.add_line(lines.Line2D(line3_xs, line3_ys, linewidth=5,
                                                    color=color_dict[names_tmp_colors[i]],
                                                    solid_capstyle='butt', solid_joinstyle='miter',
                                                    antialiased=False))
                    ax_legend.text(0.2, (8.9 - i) * 0.1, colors_legend_name[i], fontsize=6)
                    ax_legend.set_axis_off()
                fig.suptitle('\n\n\nchr' + str(eachLine_list[0]) + ': ' + genename, fontsize=10)
                # 保存图片
                fig.savefig(png_path, dpi=150)

            #显示图片
            self.img_gif = PhotoImage(file = png_path)
            self.label_img = Label(self.top, image=self.img_gif)
            self.clearB = Button(self.top, text = 'CLEAR', command = self.clear,
                                    activeforeground = 'white',
                                    activebackground = 'blue')
            self.clearB.pack()
            self.label_img.pack()




    def clear(self):
        global genetxt_path
        self.top.update()
        self.label_img.destroy()
        self.clearB.destroy()
        #删除保存基因信息的txt文件，只保留图片
        os.remove(genetxt_path)


def main():
    d = showGene()
    mainloop()

if __name__ == '__main__':
    main()

注意！

1.默认自带了人、小鼠以及斑马鱼的基因组GTF文件，别的物种的gtf文件可以自行下载，将解压生成的gtf文件放入软件所在文件夹即可

2.生成的图片会自动保存在软件所在文件夹的 My result 文件夹（软件自动生成）中

另外我将这个脚本转成了exe，可以直接在电脑上运行（无需安装python），感兴趣的朋友可以下载试试

链接：https://pan.baidu.com/s/1_ncNtcrpJs7r8PshsR4lgw 密码：tpwy

上一篇： CS231N-Lecture3 Loss Functions and Optimization

下一篇：手写call、apply、bind函数