计算基因组外显子长度

时间：2017-01-10 07:49:23 阅读：370 评论：0 收藏：0 [点我收藏+]

下载基因组外显子信心

运行下列代码得到外显子大约36M

import re

import os

from collections import OrderedDict

from operator import itemgetter

os.chdir(‘/Users/yangqin/Desktop/biotree/python/‘)

exonLength = 0 # 外显子长度尚未计算

overlapExons = OrderedDict()

with open(‘CCDS.current.txt‘, ‘rt‘) as f: # 把CCCDS.current.txt打开定义为f

for line in f: # 逐行读取f

if line.startswith(‘#‘): # 如果该行以井号开头，该行为表头

continue # 跳过该行

line = line.rstrip() # 把右边的空格去掉，_.rstrip()把右边的空格去掉

lst = line.split(‘\t‘) # 用split分开该行的各列数据，以\t分隔

if lst[-2] == ‘-‘: # 如果倒数第二列是‘-’，没有外显子

continue # 跳过该行

lst[-2] = re.sub(‘\[|\]‘, ‘ ‘, lst[-2]) # 正则表达式，去掉倒数第二列两端的[]，换成空格

exons = lst[-2].split(‘, ‘) # 把该列（每个基因）多个外显子隔开

for exon in exons: # 对于每一个外显子

start = int(exon.split(‘-‘)[0]) # 拿到该外显子的起始坐标

end = int(exon.split(‘-‘)[1]) # 拿到该外显子的终止坐标

coordinate = lst[0] + ‘:‘ + exon # 定义每一个外显子，以防重复计算

if coordinate not in overlapExons.keys(): # 如果该外显子尚未计算

overlapExons[coordinate] = 1 # 加入该外显子

exonLength += end - start # 计算长度

print(exonLength)

36061455

原文：http://www.cnblogs.com/qiniqnyang/p/6267106.html

踩

(0)

评论一句话评论（0）

分享档案

更多>