欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

200105 CAGProject

程序员文章站 2022-03-11 21:33:59
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import linecache
import csv
import codecs

#读取fasta文件
f = open(r'C:\Users\Administrator\Dropbox\Code\CAG Project\Upload-20191231\01.rawdata\Raw_F3_8STR.fasta',encoding='gb18030', errors = "ignore")

with f as file:
    lines = file.readlines()

#将fasta文件中每一个测序文件进行提取
#将Fasta文件中所有包含测序文件名的行号提取出来
seqfile = []
i = 0
while i < len(lines):
     if re.search("m64083", lines[i]):
         seqfile.append(i)
         i = i+1
     else:
         i = i+1

#将Fasta文件中所有测序文件名提取出来
seqfile_title = []
i = 0
while i < len(seqfile):
    seqfile_title.append(lines[seqfile[i]])
    i = i+1

#将Fasta文件中每个测序结果提取出来
n = 0
m = 0
seqfile_seq = []
seqnumber = len(seqfile)

#提取前n-1个测序结果
while n <seqnumber -1 :
    minnumber = seqfile[n]
    maxnumber = seqfile[n+1]
    seqinfo = ""

    m = minnumber + 1
    while m < maxnumber:
        seqinfo = seqinfo + lines[m].strip()
        m = m + 1

    seqfile_seq.append(seqinfo)
    n = n+1

#提取最后一个测序结果
seqinfo = ""
m = seqfile[-1] + 1
filelength = len(lines)

while m < filelength:
    seqinfo = seqinfo + lines[m].strip()
    m = m +1

seqfile_seq.append(seqinfo)

###########################################################################################
#################################以下为数据处理部分#########################################
###########################################################################################


###################################先进行数据分类###########################################

#根据是否有引物方向将全体数据分成{双引物,单引物,无引物}三组
Primer_title = []
Primer_seq = []
Primer_number = 0

Single_primer_title = []
Single_primer_seq = []
Single_primer_number = 0

Zero_primer_title = []
Zero_primer_seq = []
Zero_primer_number = 0

n = 0
while n< len(seqfile_seq):
    # 匹配正向序列
    if re.search("GGCCTTCGAGTCCCTCAAGTCCTTCCAG(\S{0,10000})ACGGAAGGCATTTCATGAGGACGGCTTTCAAGAAGC", seqfile_seq[n]) or \
        re.search( "GCTTCTTGAAAGCCGTCCTCATGAAATGCCTTCCGT(\S{0,10000})TGGAAGGACTTGAGGGACTCGAAGGCC", seqfile_seq[n]):
        Primer_title.append(seqfile_title[n])
        Primer_seq.append(seqfile_seq[n])
        Primer_number = Primer_number + 1

    else:
        if  re.search("GGCCTTCGAGTCCCTCAAGTCCTTCCAG", seqfile_seq[n]) or \
            re.search("GCTTCTTGAAAGCCGTCCTCATGAAATGCCTTCCGTG", seqfile_seq[n]) or \
            re.search("ACGGAAGGCATTTCATGAGGACGGCTTTCAAGAAGC", seqfile_seq[n]) or \
            re.search("CTGGAAGGACTTGAGGGACTCGAAGGCC", seqfile_seq[n]):
            Single_primer_title.append(seqfile_title[n])
            Single_primer_seq.append(seqfile_seq[n])
            Single_primer_number = Single_primer_number + 1

        else:
            Zero_primer_title.append(seqfile_title[n])
            Zero_primer_seq.append(seqfile_seq[n])
            Zero_primer_number = Zero_primer_number + 1

    n = n+1

#计算每类数据占全数据的百分比
total_number = len(seqfile_title)
percent_Primer_number = (round(Primer_number/total_number*100, 2))
percent_Single_primer_number = (round(Single_primer_number/total_number*100, 2))
percent_Zero_primer_number = (round(Zero_primer_number/total_number*100, 2))


print("根据有无引物序列进行分类\n共检测到序列",Primer_number + Single_primer_number + Zero_primer_number,
      "条\n含有双端引物的序列共有:",Primer_number,"条, 占全部序列的", percent_Primer_number,
      "%\n含有单端引物的序列共有:",Single_primer_number,"条, 占全部序列的", percent_Single_primer_number,
      "%\n不含任何引物的序列共有:",Zero_primer_number, "条, 占全部序列的", percent_Zero_primer_number,"%\n")

#根据引物方向将全体数据分成{正向测序、反向测序、异常}三组
Sense_tile = []
Sense_seq = []
Sense_CAG_seq = []
Sense_number = 0

Antisense_title = []
Antisense_seq = []
Antisense_CAG_seq = []
Antisense_number = 0

No_primer_title = []
No_primer_seq = []
No_primer_number = 0

n = 0
while n< len(seqfile_seq):
#    print("n=", n)
    # 匹配正向序列
    if re.search("GTCCTTCCAG(\S{0,10000})CCGCCGCCACCG", seqfile_seq[n]) or re.search("GTCCTTCCAG(\S{0,10000})CCGCCACCGCCG", seqfile_seq[n]) :
#        print("Enter Sense match")
        if re.search("CCGCCGCCACCG", seqfile_seq[n]):
            Sense_tile.append(seqfile_title[n])
            Sense_seq.append(seqfile_seq[n])
            Sense_number = Sense_number + 1
            CAGsearch = re.search("GTCCTTCCAG(\S{0,10000})CCGCCGCCACCG" , seqfile_seq[n]).group(1)  # 输出全部CAG序列
            Sense_CAG_seq.append(CAGsearch)
#            print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "Sense\nSense Sequencing number is ", Sense_number)

        else:
            if re.search("CCGCCACCGCCG", seqfile_seq[n]):
                Sense_tile.append(seqfile_title[n])
                Sense_seq.append(seqfile_seq[n])
                Sense_number = Sense_number + 1
                CAGsearch = re.search("GTCCTTCCAG(\S{0,10000})CCGCCACCGCCG", seqfile_seq[n]).group(1)  # 输出全部CAG序列
                Sense_CAG_seq.append(CAGsearch)
#                print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "Sense\nSense Sequencing number is ", Sense_number)
            else:
                No_primer_title.append(seqfile_title[n])
                No_primer_seq.append(seqfile_seq[n])
                No_primer_number = No_primer_number + 1
#                print(seqfile_title[n], seqfile_seq[n], "\n", "正向匹配无下游引物", "\nNo primer number is ", No_primer_number)
        # 匹配逆向序列
    else:
         # 反向序列
         if re.search("CGGTGGCGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]) or re.search("CGGCGGTGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]) :
#             print("Enter Antiense match")
             if re.search("CGGTGGCGGCGG", seqfile_seq[n]):
                Antisense_title.append(seqfile_title[n])
                Antisense_seq.append(seqfile_seq[n])
                Antisense_number = Antisense_number + 1
                CAGsearch = re.search("CGGTGGCGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]).group(1)  # 输出全部CAG序列
                Antisense_CAG_seq.append(CAGsearch)
#                print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n",  "#Antisense\nAntisense Sequencing number is #", Antisense_number)

             else:
                 if re.search("CGGCGGTGGCGG", seqfile_seq[n]):
                    Antisense_title.append(seqfile_title[n])
                    Antisense_seq.append(seqfile_seq[n])
                    Antisense_number = Antisense_number + 1
                    CAGsearch = re.search("CGGCGGTGGCGG(\S{0,10000})CTGGAAGGAC", seqfile_seq[n]).group(1)  # 输出全部CAG序列
                    Antisense_CAG_seq.append(CAGsearch)
#                    print(seqfile_title[n], seqfile_seq[n], "\n", CAGsearch, "\n", "#Antisense\nAntisense Sequencing number is #", Antisense_number)
                 else:
                    No_primer_title.append(seqfile_title[n])
                    No_primer_seq.append(seqfile_seq[n])
                    No_primer_number = No_primer_number + 1
#                    print(seqfile_title[n], seqfile_seq[n], "\n", "#反向匹配无下游引物", "\nNo primer number is ", No_primer_number)
         else:
#            print("Enter Error match")
            No_primer_title.append(seqfile_title[n])
            No_primer_seq.append(seqfile_seq[n])
            No_primer_number = No_primer_number + 1
#            print(seqfile_title[n], seqfile_seq[n], "\n", "###匹配无引物",  "\nNo primer number is ", No_primer_number)

    n = n + 1

percent_Sense_number = (round(Sense_number/total_number*100, 2))
percent_AntiSense_number = (round(Antisense_number/total_number*100, 2))
percent_No_primer_number = (round(No_primer_number/total_number*100, 2))

print("根据CAG序列进行分类\n共检测到序列", Sense_number + Antisense_number + No_primer_number,
      "条\n测序为Sense方向序列共有:", Sense_number,"条, 占全部序列的", percent_Sense_number,
      "%\n测序为AntiSense方向序列共有:", Antisense_number, "条, 占全部序列的", percent_AntiSense_number,
      "%\n无法检测测序方向序列共有:", No_primer_number, "条, 占全部序列的", percent_No_primer_number, "%\n")

###################################再进行CAG计数###########################################

#利用CAG_Seq中字符总数/3计算CAG个数
def Length_Count(CAG):
    CAGnumber = []
    for i in CAG:
        CAGtext = str(i)
        CAGcount = int(len(CAGtext)/3)
        CAGnumber.append(CAGcount)
    return CAGnumber

Sense_Length_Count = Length_Count(Sense_CAG_seq)
AntiSense_Length_Count = Length_Count(Antisense_CAG_seq)

#将list中数字转换为str格式
def NumbertoStr(Number):
    StrList = []
    for i in Number:
        StrList.append(str(i))

    return StrList

Sense_Length_Count = NumbertoStr(Sense_Length_Count)
AntiSense_Length_Count = NumbertoStr(AntiSense_Length_Count)

#转换CAG序列文本中CAG为Q数量
def CAGtoPolyQ_TEXT(CAG):
    CAG_TEXT = []
    for i in CAG:
        CAGtext = str(i)
        if re.search("CAGCAG", CAGtext):  # 匹配正向序列
            CAGtext = re.sub("CAG", "Q", CAGtext)  # Chang CAG sequence to Q
            CAGtext = re.sub("CAA", "Q", CAGtext)  #Chang CAA sequence to Q
            CAG_TEXT.append(CAGtext)
        else:
            CAGtext = re.sub("TGC", "Q", CAGtext)
            CAGtext = re.sub("TTC", "Q", CAGtext)
            CAG_TEXT.append(CAGtext)

    return CAG_TEXT

Sense_CAGtoQ_TEXT = CAGtoPolyQ_TEXT(Sense_CAG_seq)
AntiSense_CAGtoQ_TEXT = CAGtoPolyQ_TEXT(Antisense_CAG_seq)

#将list中PolyQ个数进行统计
def CAGtoPloyQ_COUNT(CAG):
    CAG_Number = []
    for i in CAG:
        PolyQnumber = int(i.count("Q"))
        CAG_Number.append(PolyQnumber)

    return CAG_Number

Sense_CAGtoQ_Number = CAGtoPloyQ_COUNT(Sense_CAGtoQ_TEXT)
AntiSense_CAGtoQ_Number = CAGtoPloyQ_COUNT(AntiSense_CAGtoQ_TEXT)

Sense_CAGtoQ_Number = NumbertoStr(Sense_CAGtoQ_Number)
AntiSense_CAGtoQ_Number = NumbertoStr(AntiSense_CAGtoQ_Number)

Sense_file = zip(Sense_tile, Sense_seq, Sense_CAG_seq, Sense_Length_Count, Sense_CAGtoQ_Number)
Sense_file_list = list(Sense_file)

headers = ['SeqTitle', 'SeqInfo', 'SeqCAGInfo', 'Seq_CAGLength', 'Seq_QLength']

with open(r'C:\Users\Administrator\Dropbox\Code\Sense_Result.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(Sense_file_list)
相关标签: 生物信息学

推荐阅读