标签:art log html cat return pen [] weight 重复
生物信息学原理作业第四弹:DNA序列组装(贪婪算法)
原理:生物信息学(孙啸)
大致思想:
1. 找到权值最大的边;
2. 除去以最大权值边的起始顶点为起始顶点的边;
3. 除去以最大权值边为终点为终点的边;
4. 重复上述步骤,得到所有符合条件的边;
5. 拼接得到的边;
6. 加入孤立点(如果有)。
附上Python代码,如果有问题我会及时更正(确实不太熟算法)
转载请保留出处!
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Dec 4 15:04:39 2017
4 @author: zxzhu
5 python3.6
6 """
7 from functools import reduce
8 def get_weight(s1,s2): #通过两条序列的overlap计算出权值
9 l = min(len(s1),len(s2))
10 while l>0:
11 if s2[:l] == s1[-l:]:
12 return l
13 else:
14 l-=1
15 return 0
16
17 def print_result(s1,s2): #将两条序列去除首尾overlap后合并
18 weight = get_weight(s1,s2)
19 s = s1 + s2[weight:]
20 #print(s)
21 return s
22
23 def dir_graph(l,t=3): #得到满足条件的有向图(权值大于等于t)
24 graph = {}
25 for i in l:
26 for j in l:
27 if i!=j:
28 weight = get_weight(i,j)
29 if weight >= t:
30 key = (i,j)
31 graph[key] = weight
32 return graph
33
34 def rm_path(graph,path): #贪婪算法加入一条边后应该去除与该边首尾顶点相同的边
35 key = graph.keys()
36 rm_key = []
37 for i in key:
38 if i[1] == path[1] or i[0] == path[0]:
39 rm_key.append(i)
40 for i in rm_key:
41 graph.pop(i)
42 return graph
43
44 def get_path(graph,path = []): #得到满足条件的所有边
45 while graph:
46 max_weight = 0
47 for i in graph.keys():
48 if graph[i] > max_weight:
49 max_weight = graph[i]
50 cur_path = i
51 path.append(cur_path)
52 graph = rm_path(graph,cur_path)
53 get_path(graph,path)
54 return path
55
56 def out_num(path,V): #计算某顶点的出度
57 count = 0
58 for i in path:
59 if i[0] == V:
60 count+=1
61 return count
62
63 def get_last_V(path,last_V = None): #得到最后一条边
64 index = 0
65 if last_V: #非随机寻找出度为0的顶点
66 for i in path:
67 if i[1] == last_V:
68 return i,index
69 else:
70 index+=1
71 return None #没有找到指向last_V的顶点
72 else: #随机寻找出度为0的顶点
73 for i in path:
74 if out_num(path,i[1]) == 0:
75 return i,index
76 else:
77 index+=1
78
79 def assemble(cur_V,path,new_path = []): #给满足条件的边排序
80 while path:
81 path.pop(cur_V[1])
82 new_path.insert(0,cur_V[0])
83 cur_V = get_last_V(path,last_V = cur_V[0][0])
84 if cur_V:
85 assemble(cur_V,path,new_path)
86 else:
87 cur_V = get_last_V(path)
88 assemble(cur_V,path,new_path)
89 return new_path
90
91 def align_isolated(path,sequence): #加入孤立顶点
92 new_path = reduce(lambda x,y:x+y,path)
93 for i in sequence:
94 if i not in new_path:
95 new_path.append(i)
96 return new_path
97
98
99 x = ‘CCTTTTGG‘
100 y = ‘TTGGCAATCACT‘
101 w = ‘AGTATTGGCAATC‘
102 u = ‘ATGCAAACCT‘
103 z = ‘AATCGATG‘
104 v = ‘TCACTCCTTTT‘
105 a = ‘CATAA‘
106 b = ‘ATCA‘
107 c = ‘TGCAT‘
108 sequence = [x,y,w,u,z]
109 sequence1 = [a,b,c]
110 graph = dir_graph(sequence1,t=2)
111 path = get_path(graph)
112 path = [list(i) for i in path] #将path中的tuple元素换成list
113 #print(path)
114 start = get_last_V(path) #起始出度为0的顶点所在的边
115 new_path = assemble(start,path) #排序后的边
116 #print(new_path)
117 new_path = align_isolated(new_path,sequence1) #加入孤立顶点
118 #print(new_path)
119 result = reduce(print_result,new_path) #组装
120 print(result)
标签:art log html cat return pen [] weight 重复
原文地址:http://www.cnblogs.com/zxzhu/p/7979097.html