200105 CAGProject
程序员文章站
2022-03-11 21:33:59
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import linecache
import csv
import codecs
#读取fasta文件
f = open(r'C:\Users\Administrator\Dropbox\Code\CAG Project\Upload-20191231\01.rawdata\Raw_F3_8STR.fasta',encoding='gb18030', errors = "ignore")
with f as file:
lines = file.readlines()
#将fasta文件中每一个测序文件进行提取
#将Fasta文件中所有包含测序文件名的行号提取出来
seqfile = []
i = 0
while i < len(lines):
if re.search("m64083", lines[i]):
seqfile.append(i)
i = i+1
else:
i = i+1
#将Fasta文件中所有测序文件名提取出来
seqfile_title = []
i = 0
while i < len(seqfile):
seqfile_title.append(lines[seqfile[i]])
i = i+1
#将Fasta文件中每个测序结果提取出来
n = 0
m = 0
seqfile_seq = []
seqnumber = len(seqfile)
#提取前n-1个测序结果
while n <seqnumber -1 :
minnumber = seqfile[n]
maxnumber = seqfile[n+1]
seqinfo = ""
m = minnumber + 1
while m < maxnumber:
seqinfo = seqinfo + lines[m].strip()
m = m + 1
seqfile_seq.append(seqinfo)
n = n+1
#提取最后一个测序结果
seqinfo = ""
m = seqfile[-1] + 1
filelength = len(lines)
while m < filelength:
seqinfo = seqinfo + lines[m].strip()
m = m +1
seqfile_seq.append(seqinfo)
###########################################################################################
#################################以下为数据处理部分#########################################
###########################################################################################
###################################先进行数据分类###########################################
#根据是否有引物方向将全体数据分成{双引物,单引物,无引物}三组
Primer_title = []
Primer_seq = []
Primer_number = 0
Single_primer_title = []
Single_primer_seq = []
Single_primer_number = 0
Zero_primer_title = []
Zero_primer_seq = []
Zero_primer_number = 0
n = 0
while n< len(seqfile_seq):
# 匹配正向序列
if re.search("GGCCTTCGAGTCCCTCAAGTCCTTCCAG(\S{0,10000})ACGGAAGGCATTTCATGAGGACGGCTTTCAAGAAGC", seqfile_seq[n]) or \
re.search( "GCTTCTTGAAAGCCGTCCTCATGAAATGCCTTCCGT(\S{0,10000})TGGAAGGACTTGAGGGACTCGAAGGCC", seqfile_seq[n]):
Primer_title.append(seqfile_title[n])
Primer_seq.append(seqfile_seq[n])
Primer_number = Primer_number + 1
else:
if re.search("GGCCTTCGAGTCCCTCAAGTCCTTCCAG", seqfile_seq[n]) or \
re.search("GCTTCTTGAAAGCCGTCCTCATGAAATGCCTTCCGTG", seqfile_seq[n]) or \
re.search("ACGGAAGGCATTTCATGAGGACGGCTTTCAAGAAGC", seqfile_seq[n]) or \
re.search("CTGGAAGGACTTGAGGGACTCGAAGGCC", seqfile_seq[n]):
Single_primer_title.append(seqfile_title[n])
Single_primer_seq.append(seqfile_seq[n])
Single_primer_number = Single_primer_number + 1
else:
Zero_primer_title.append(seqfile_title[n])
Zero_primer_seq.append(seqfile_seq[n])
Zero_primer_number = Zero_primer_number + 1
n = n+1
#计算每类数据占全数据的百分比
total_number = len(seqfile_title)
percent_Primer_number = (round(Primer_number/total_number*100, 2))
percent_Single_primer_number = (round(Single_primer_number/total_number*100, 2))
percent_Zero_primer_number = (round(Zero_primer_number/total_number*100, 2))
print("根据有无引物序列进行分类\n共检测到序列",Primer_number + Single_primer_number + Zero_primer_number,
"条\n含有双端引物的序列共有:",Primer_number,"条, 占全部序列的", percent_Primer_number,
"%\n含有单端引物的序列共有:",Single_primer_number,"条, 占全部序列的", percent_Single_primer_number,
"%\n不含任何引物的序列共有:",Zero_primer_number, "条, 占全部序列的", percent_Zero_primer_number,"%\n")
#根据引物方向将全体数据分成{正向测序、反向测序、异常}三组
Sense_tile = []
Sense_seq = []
Sense_CAG_seq = []
Sense_number = 0
Antisense_title = []
Antisense_seq = []
Antisense_CAG_seq = []
Antisense_number = 0
No_primer_title = []
No_primer_seq = []
No_primer_number = 0
n = 0
while n< len(seqfile_seq):
# print("n=", n)
# 匹配正向序列
if re.search("GTCCTTCCAG(\S{0,10000})CCGCCGCCACCG", seqfile_seq[n]) or re.search("GTCCTTCCAG(\S{0,10000})CCGCCACCGCCG", seqfile_seq[n]) :
# print("Enter Sense match")
if re.search("CCGCCGCCACCG", seqfile_seq[n]):
Sense_tile.append(seqfile_title[n])
Sense_seq.append(seqfile_seq[n])
Sense_number = Sense_number + 1
CAGsearch = re.search("GTCCTTCCAG(\S{0,10000})CCGCCGCCACCG" , seqfile_seq[n]).group(1) # 输出全部CAG序列
Sense_CAG_seq.append(CAGsearch)
# print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "Sense\nSense Sequencing number is ", Sense_number)
else:
if re.search("CCGCCACCGCCG", seqfile_seq[n]):
Sense_tile.append(seqfile_title[n])
Sense_seq.append(seqfile_seq[n])
Sense_number = Sense_number + 1
CAGsearch = re.search("GTCCTTCCAG(\S{0,10000})CCGCCACCGCCG", seqfile_seq[n]).group(1) # 输出全部CAG序列
Sense_CAG_seq.append(CAGsearch)
# print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "Sense\nSense Sequencing number is ", Sense_number)
else:
No_primer_title.append(seqfile_title[n])
No_primer_seq.append(seqfile_seq[n])
No_primer_number = No_primer_number + 1
# print(seqfile_title[n], seqfile_seq[n], "\n", "正向匹配无下游引物", "\nNo primer number is ", No_primer_number)
# 匹配逆向序列
else:
# 反向序列
if re.search("CGGTGGCGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]) or re.search("CGGCGGTGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]) :
# print("Enter Antiense match")
if re.search("CGGTGGCGGCGG", seqfile_seq[n]):
Antisense_title.append(seqfile_title[n])
Antisense_seq.append(seqfile_seq[n])
Antisense_number = Antisense_number + 1
CAGsearch = re.search("CGGTGGCGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]).group(1) # 输出全部CAG序列
Antisense_CAG_seq.append(CAGsearch)
# print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "#Antisense\nAntisense Sequencing number is #", Antisense_number)
else:
if re.search("CGGCGGTGGCGG", seqfile_seq[n]):
Antisense_title.append(seqfile_title[n])
Antisense_seq.append(seqfile_seq[n])
Antisense_number = Antisense_number + 1
CAGsearch = re.search("CGGCGGTGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]).group(1) # 输出全部CAG序列
Antisense_CAG_seq.append(CAGsearch)
# print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "#Antisense\nAntisense Sequencing number is #", Antisense_number)
else:
No_primer_title.append(seqfile_title[n])
No_primer_seq.append(seqfile_seq[n])
No_primer_number = No_primer_number + 1
# print(seqfile_title[n], seqfile_seq[n], "\n", "#反向匹配无下游引物", "\nNo primer number is ", No_primer_number)
else:
# print("Enter Error match")
No_primer_title.append(seqfile_title[n])
No_primer_seq.append(seqfile_seq[n])
No_primer_number = No_primer_number + 1
# print(seqfile_title[n], seqfile_seq[n], "\n", "###匹配无引物", "\nNo primer number is ", No_primer_number)
n = n + 1
percent_Sense_number = (round(Sense_number/total_number*100, 2))
percent_AntiSense_number = (round(Antisense_number/total_number*100, 2))
percent_No_primer_number = (round(No_primer_number/total_number*100, 2))
print("根据CAG序列进行分类\n共检测到序列", Sense_number + Antisense_number + No_primer_number,
"条\n测序为Sense方向序列共有:", Sense_number,"条, 占全部序列的", percent_Sense_number,
"%\n测序为AntiSense方向序列共有:", Antisense_number, "条, 占全部序列的", percent_AntiSense_number,
"%\n无法检测测序方向序列共有:", No_primer_number, "条, 占全部序列的", percent_No_primer_number, "%\n")
###################################再进行CAG计数###########################################
#利用CAG_Seq中字符总数/3计算CAG个数
def Length_Count(CAG):
CAGnumber = []
for i in CAG:
CAGtext = str(i)
CAGcount = int(len(CAGtext)/3)
CAGnumber.append(CAGcount)
return CAGnumber
Sense_Length_Count = Length_Count(Sense_CAG_seq)
AntiSense_Length_Count = Length_Count(Antisense_CAG_seq)
#将list中数字转换为str格式
def NumbertoStr(Number):
StrList = []
for i in Number:
StrList.append(str(i))
return StrList
Sense_Length_Count = NumbertoStr(Sense_Length_Count)
AntiSense_Length_Count = NumbertoStr(AntiSense_Length_Count)
#转换CAG序列文本中CAG为Q数量
def CAGtoPolyQ_TEXT(CAG):
CAG_TEXT = []
for i in CAG:
CAGtext = str(i)
if re.search("CAGCAG", CAGtext): # 匹配正向序列
CAGtext = re.sub("CAG", "Q", CAGtext) # Chang CAG sequence to Q
CAGtext = re.sub("CAA", "Q", CAGtext) #Chang CAA sequence to Q
CAG_TEXT.append(CAGtext)
else:
CAGtext = re.sub("TGC", "Q", CAGtext)
CAGtext = re.sub("TTC", "Q", CAGtext)
CAG_TEXT.append(CAGtext)
return CAG_TEXT
Sense_CAGtoQ_TEXT = CAGtoPolyQ_TEXT(Sense_CAG_seq)
AntiSense_CAGtoQ_TEXT = CAGtoPolyQ_TEXT(Antisense_CAG_seq)
#将list中PolyQ个数进行统计
def CAGtoPloyQ_COUNT(CAG):
CAG_Number = []
for i in CAG:
PolyQnumber = int(i.count("Q"))
CAG_Number.append(PolyQnumber)
return CAG_Number
Sense_CAGtoQ_Number = CAGtoPloyQ_COUNT(Sense_CAGtoQ_TEXT)
AntiSense_CAGtoQ_Number = CAGtoPloyQ_COUNT(AntiSense_CAGtoQ_TEXT)
Sense_CAGtoQ_Number = NumbertoStr(Sense_CAGtoQ_Number)
AntiSense_CAGtoQ_Number = NumbertoStr(AntiSense_CAGtoQ_Number)
Sense_file = zip(Sense_tile, Sense_seq, Sense_CAG_seq, Sense_Length_Count, Sense_CAGtoQ_Number)
Sense_file_list = list(Sense_file)
headers = ['SeqTitle', 'SeqInfo', 'SeqCAGInfo', 'Seq_CAGLength', 'Seq_QLength']
with open(r'C:\Users\Administrator\Dropbox\Code\Sense_Result.csv', 'w', newline='') as f:
f_csv = csv.writer(f)
f_csv.writerow(headers)
f_csv.writerows(Sense_file_list)
上一篇: 详解Go语言的计时器
下一篇: 办公车辆管理系统的实现(内附项目源码)
推荐阅读