生信技能树-task1-人类基因组外显子区域长度

程序员文章站 2024-03-02 19:49:22

...

目标：统计人类基因组外显子区域长度

题目数据来源为：ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt

perl

open F,"CCDS.current.txt";
while(<F>){#一行一行读取数据
  next if /^#/;#pass掉带#符号开头的行，即首行
  chomp;#去掉末尾的换行符
  @arr=split /\t/;#以制表符切割读取的每一行数据
  next unless /\[(.*?)\]/;#无外显子坐标的行pass
  @start_end=split /,\s+/,$1;#切割文件的第10列
  foreach(@start_end){
    @loci=split /-/;#以“-”切割每个外显子
    foreach(@loci[0][email protected][1]){$hash{"$arr[0]:$_"}=1;}#以染色体:坐标符号 格式存入perl哈希中，目的去重
  }
}
close F;

$length=keys %hash;#得到哈希键值长度
print "外显子总长度:$length\n";

输出结果

外显子总长度:35185772

用时 0m39.897s

python3

import re

chr_pos = set()#定义set数据,目的去重
with open("CCDS.current.txt", 'r') as file:
    for line in file:
        if re.match('^#',line): 
            continue #跳过首行
        line = line.split("\t")
        if line[9] != "-":#pass掉第10列为"-"的数据行
            chr = line[0] #获得该行数据的染色体编号
            exons = line[9].lstrip("[").rstrip("]").split(", ") #去掉字符串[],以", "切割字符串,得到exons列表
            for exon in exons:#对切割得到的exons列表进行循环
                exon=exon.split("-")
                for i in range(int(exon[0]),int(exon[1])+1):
                    chr_pos.add(line[0]+":"+str(i))#依据染色体号:外显子坐标，利用set数据去重

print(len(chr_pos))

输出结果：

35185772

用时 0m43.192s

R语言

if(T){
t1 <- Sys.time()#把程序运行之前的时间赋值给t1
data <- read.table("CCDS.current.txt", sep='\t')[c(1,10)]
get_gene <-function(data_item){
  # 目的就是去掉第10列的外面的中括号"[]"
  # 用到的数据data_item[2]为：[1013573-1013575, 1013983-1014477],输出的数据为"1013573-1013575, 1013983-1014477"
  #data_item=substr(data_item[2], 2, nchar(data_item[2])-1)
  data_item=sub("\\[(.*?)\\]", "\\1",data_item[2])
  data_item #返回值
}

get_length <- function(exon){
  # 输入的数据为'111-113'
  # 输出结果为c(111:113),即c(111,112,113)
  loc <- strsplit(exon,"-")[[1]] # 注：strsplit的输出结果为列表
  a <- c(loc[1]:loc[2])
  a
}

exon_length = NULL
chrs <- unique(data[,1])#得到染色体编号:1-23,X,Y
for (i in chrs){
  # paste 函数把i号染色体的所有外显子的坐标合并为一个character对象
  # gene的格式为'111-112, 115-135, 125-138, 254-258,...'
  gene <- paste(apply(data[which(data[1]==i & data[2] != '-'),], 1, get_gene),collapse=', ')#apply函数,1表示按行依次用get_gene函数处理
  # exon的格式为c('111-112','115-135', '125-138', '254-258', ...)
  exon <- unique(strsplit(gene,", ")[[1]]) # 输入的数据为c('111-112, 115-135, 125-138, 254-258,...');输出的数据为c('111-112','115-135', '125-138', '254-258', ...)
  exon_length <-  c(exon_length,length(unique(unlist(lapply(exon, get_length))))) # 输入的数据为lapply(c('111-112','115-135', '125-138', '254-258', ...),fun)
  #lapply的输出结果为列表,unlist则把列表变为向量,unique的目的是去重复计算的坐标,length计算该染色体的所有外显子坐标长度,一个坐标对应长度1，所以length得到的就是该染色体的外显子总长度
  #c(exon_length,....),累计所有已经计算过的染色体外显子长度
}

barplot(exon_length,names.arg=chrs,xlab='Chromosomes',ylab='length of exons',col = rainbow( length( chrs )) ) #绘制条形图
print(paste('The length of all exons is',sum(exon_length)))
difftime(Sys.time(), t1, units = 'secs')# 计算执行完成后时间与t1的间隔
}

"The length of all exons is 35185772"
Time difference of 6.63437 secs

参考

https://mp.weixin.qq.com/s/xS-h0TQnCwjvgcRA7Ru3NQ