ovlp_to_graph.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600
  1. #import os
  2. #import random
  3. #random.seed(int(os.environ['PYTHONHASHSEED']))
  4. #import warnings
  5. #warnings.warn('PYTHONHASHSEED={}'.format(os.environ['PYTHONHASHSEED']))
  6. from future.utils import viewitems
  7. from future.utils import itervalues
  8. from builtins import object
  9. import networkx as nx
  10. import os
  11. import shlex
  12. import subprocess
  13. import sys
  14. # Makes chimer_nodes stable; maybe others.
  15. from ..util.ordered_set import OrderedSet as set
  16. # Not sure if adds to stability, but at least adds determinism.
  17. from collections import OrderedDict as dict
  18. DEBUG_LOG_LEVEL = 0
  19. class SGNode(object):
  20. """
  21. class representing a node in the string graph
  22. """
  23. def __init__(self, node_name):
  24. self.name = node_name
  25. self.out_edges = []
  26. self.in_edges = []
  27. def add_out_edge(self, out_edge):
  28. self.out_edges.append(out_edge)
  29. def add_in_edge(self, in_edge):
  30. self.in_edges.append(in_edge)
  31. class SGEdge(object):
  32. """
  33. class representing an edge in the string graph
  34. """
  35. def __init__(self, in_node, out_node):
  36. self.in_node = in_node
  37. self.out_node = out_node
  38. self.attr = {}
  39. def set_attribute(self, attr, value):
  40. self.attr[attr] = value
  41. def reverse_end(node_name):
  42. if (node_name == 'NA'):
  43. return node_name
  44. if (len(node_name) < 2 or (node_name[-2:] not in [':B', ':E'])):
  45. raise Exception(
  46. 'Invalid node name. Node name passed to method: "{node_name}", expected format: "(%d)+:[BE]" or "NA".'.format(node_name=node_name))
  47. node_id, end = node_name.split(":")
  48. new_end = "B" if end == "E" else "E"
  49. return node_id + ":" + new_end
class StringGraph(object):
    """
    class representing the string graph

    Nodes are read ends ("<read_id>:B"/"<read_id>:E"); edges are overlaps
    between read ends.  Edges are never deleted once added: the mark_*
    methods only flag them in ``self.e_reduce``, and the get_* accessors
    filter on that flag.

    NOTE: at module level ``set`` is OrderedSet and ``dict`` is
    OrderedDict, so ``set()``/``dict()`` calls below build ordered
    containers for deterministic iteration ({} literals stay builtin dict).
    """

    def __init__(self):
        self.nodes = {}            # node name -> SGNode
        self.edges = {}            # (in_name, out_name) -> SGEdge
        self.n_mark = {}           # scratch marks for mark_tr_edges: "vacant"/"inplay"/"eliminated"
        self.e_reduce = {}         # (in_name, out_name) -> True when the edge has been reduced
        self.repeat_overlap = {}   # never written in this class -- purpose unclear, TODO confirm
        self.best_out = {}         # node name -> best-scoring out-neighbor (set by mark_best_overlap)
        self.best_in = {}          # node name -> best-scoring in-neighbor (set by mark_best_overlap)

    def add_node(self, node_name):
        """
        add a node into the graph by given a node name

        Idempotent: an existing node is left untouched.
        """
        if node_name not in self.nodes:
            self.nodes[node_name] = SGNode(node_name)

    def add_edge(self, in_node_name, out_node_name, **attributes):
        """
        add an edge into the graph by given a pair of nodes

        Creates missing endpoint nodes on demand.  If the edge already
        exists, only its attributes are merged/overwritten.
        """
        if (in_node_name, out_node_name) not in self.edges:
            self.add_node(in_node_name)
            self.add_node(out_node_name)
            in_node = self.nodes[in_node_name]
            out_node = self.nodes[out_node_name]
            edge = SGEdge(in_node, out_node)
            self.edges[(in_node_name, out_node_name)] = edge
            in_node.add_out_edge(edge)
            out_node.add_in_edge(edge)
        edge = self.edges[(in_node_name, out_node_name)]
        for (k, v) in viewitems(attributes):
            edge.attr[k] = v

    def init_reduce_dict(self):
        # Start with every known edge un-reduced.  Must run after all
        # add_edge() calls and before any mark_* method.
        for e in self.edges:
            self.e_reduce[e] = False

    def bfs_nodes(self, n, exclude=None, depth=5):
        """Collect SGNodes reachable from `n` via out_edges, skipping `exclude`.

        NOTE(review): ``dp`` is incremented once per *popped* candidate,
        not per BFS level, so ``depth`` bounds the number of expansion
        steps rather than true graph distance -- confirm intent.
        """
        all_nodes = set()
        all_nodes.add(n)
        candidate_nodes = set()
        candidate_nodes.add(n)
        dp = 1
        while dp < depth and len(candidate_nodes) > 0:
            v = candidate_nodes.pop()
            for e in v.out_edges:
                w = e.out_node
                if w == exclude:
                    continue
                if w not in all_nodes:
                    all_nodes.add(w)
                    # Only fan out through nodes that themselves continue.
                    if len(w.out_edges) > 0:
                        candidate_nodes.add(w)
            dp += 1
        return all_nodes

    def mark_chimer_edges(self):
        """Detect likely chimeric reads and reduce all their edges.

        A candidate node is both a branch target (>=2 live in-edges
        somewhere) and a branch source (>=2 live out-edges somewhere).
        It is declared chimeric when its out-neighborhood and the
        out-neighborhood of its predecessors are disjoint AND their
        bfs_nodes flows do not meet, i.e. the node bridges two otherwise
        unconnected regions.  Returns (chimer_nodes, chimer_edges);
        both include the reverse-complement duals.
        """
        multi_in_nodes = {}
        multi_out_nodes = {}
        for n_name in self.nodes:
            n = self.nodes[n_name]
            out_nodes = [e.out_node for e in n.out_edges if self.e_reduce[(
                e.in_node.name, e.out_node.name)] == False]
            in_nodes = [e.in_node for e in n.in_edges if self.e_reduce[(
                e.in_node.name, e.out_node.name)] == False]
            if len(out_nodes) >= 2:
                multi_out_nodes[n_name] = out_nodes
            if len(in_nodes) >= 2:
                multi_in_nodes[n_name] = in_nodes
        chimer_candidates = set()
        out_set = set()
        in_set = set()
        for n_name in multi_out_nodes:
            out_nodes = set(multi_out_nodes[n_name])
            out_set |= out_nodes
        for n_name in multi_in_nodes:
            in_nodes = set(multi_in_nodes[n_name])
            in_set |= in_nodes
        chimer_candidates = out_set & in_set
        chimer_nodes = []
        chimer_edges = set()
        for n in chimer_candidates:  # sort, or OrderedSet
            # test_set: everything reachable in one hop through any predecessor,
            # excluding n itself.
            out_nodes = set([e.out_node for e in n.out_edges])
            test_set = set()
            for in_node in [e.in_node for e in n.in_edges]:
                test_set = test_set | set(
                    [e.out_node for e in in_node.out_edges])
            test_set -= set([n])
            if len(out_nodes & test_set) == 0:
                flow_node1 = set()
                flow_node2 = set()
                for v in list(out_nodes):
                    flow_node1 |= self.bfs_nodes(v, exclude=n)
                for v in list(test_set):
                    flow_node2 |= self.bfs_nodes(v, exclude=n)
                if len(flow_node1 & flow_node2) == 0:
                    # Disjoint flows: n is a bridge between unrelated regions;
                    # reduce every incident edge and its dual.
                    for e in n.out_edges:
                        v, w = e.in_node.name, e.out_node.name
                        if self.e_reduce[(v, w)] != True:
                            self.e_reduce[(v, w)] = True
                            chimer_edges.add((v, w))
                            rv = reverse_end(w)
                            rw = reverse_end(v)
                            self.e_reduce[(rv, rw)] = True
                            chimer_edges.add((rv, rw))
                    for e in n.in_edges:
                        v, w = e.in_node.name, e.out_node.name
                        if self.e_reduce[(v, w)] != True:
                            self.e_reduce[(v, w)] = True
                            chimer_edges.add((v, w))
                            rv = reverse_end(w)
                            rw = reverse_end(v)
                            self.e_reduce[(rv, rw)] = True
                            chimer_edges.add((rv, rw))
                    chimer_nodes.append(n.name)
                    chimer_nodes.append(reverse_end(n.name))
        return chimer_nodes, chimer_edges

    def mark_spur_edge(self):
        """Reduce short dead-end branches (spurs) off branching nodes.

        For a node with more than one live out-edge, any out-neighbor
        that has no out-edges at all is a spur tip; the edge to it (and
        its dual) is reduced.  Symmetric for the in-edge side.  Returns
        the set of removed edge keys (duals included).
        """
        removed_edges = set()
        for v in self.nodes:
            if len([e for e in self.nodes[v].out_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] != True]) > 1:
                for out_edge in self.nodes[v].out_edges:
                    w = out_edge.out_node.name
                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[(v, w)] != True:
                        self.e_reduce[(v, w)] = True
                        removed_edges.add((v, w))
                        v2, w2 = reverse_end(w), reverse_end(v)
                        self.e_reduce[(v2, w2)] = True
                        removed_edges.add((v2, w2))
            if len([e for e in self.nodes[v].in_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] != True]) > 1:
                for in_edge in self.nodes[v].in_edges:
                    w = in_edge.in_node.name
                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[(w, v)] != True:
                        self.e_reduce[(w, v)] = True
                        removed_edges.add((w, v))
                        v2, w2 = reverse_end(w), reverse_end(v)
                        self.e_reduce[(w2, v2)] = True
                        removed_edges.add((w2, v2))
        return removed_edges

    def mark_tr_edges(self):
        """
        transitive reduction

        Myers-style string-graph transitive reduction: an edge v->x is
        redundant when some v->w->x exists whose combined length is
        within FUZZ of v->x's span.  Redundant edges and their duals are
        flagged in self.e_reduce.  NOTE: sorts each node's out_edges
        list in place (by "length") as a side effect.
        """
        n_mark = self.n_mark
        e_reduce = self.e_reduce
        FUZZ = 500  # base-pair tolerance for length comparisons
        for n in self.nodes:
            n_mark[n] = "vacant"

        for (n_name, node) in viewitems(self.nodes):
            out_edges = node.out_edges
            if len(out_edges) == 0:
                continue
            out_edges.sort(key=lambda x: x.attr["length"])
            # Mark all direct successors as candidates ("inplay").
            for e in out_edges:
                w = e.out_node
                n_mark[w.name] = "inplay"
            max_len = out_edges[-1].attr["length"]
            max_len += FUZZ
            # Eliminate successors also reachable via a 2-edge path
            # within the length budget.
            for e in out_edges:
                e_len = e.attr["length"]
                w = e.out_node
                if n_mark[w.name] == "inplay":
                    w.out_edges.sort(key=lambda x: x.attr["length"])
                    for e2 in w.out_edges:
                        if e2.attr["length"] + e_len < max_len:
                            x = e2.out_node
                            if n_mark[x.name] == "inplay":
                                n_mark[x.name] = "eliminated"
            # Also eliminate via each successor's shortest out-edge and
            # any sub-FUZZ out-edge (Myers' small-overlap cases).
            for e in out_edges:
                e_len = e.attr["length"]
                w = e.out_node
                w.out_edges.sort(key=lambda x: x.attr["length"])
                if len(w.out_edges) > 0:
                    x = w.out_edges[0].out_node
                    if n_mark[x.name] == "inplay":
                        n_mark[x.name] = "eliminated"
                for e2 in w.out_edges:
                    if e2.attr["length"] < FUZZ:
                        x = e2.out_node
                        if n_mark[x.name] == "inplay":
                            n_mark[x.name] = "eliminated"
            # Reduce edges to eliminated successors (and duals); reset marks.
            for out_edge in out_edges:
                v = out_edge.in_node
                w = out_edge.out_node
                if n_mark[w.name] == "eliminated":
                    e_reduce[(v.name, w.name)] = True
                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
                    e_reduce[(v_name, w_name)] = True
                n_mark[w.name] = "vacant"

    def mark_best_overlap(self):
        """
        find the best overlapped edges

        Keeps, per node, only the highest-scoring live out-edge and
        in-edge; every other live edge (and its dual) is reduced.
        Winners are recorded in self.best_out / self.best_in.  Returns
        the set of removed edge keys.
        """
        best_edges = set()
        removed_edges = set()
        for v in self.nodes:
            out_edges = self.nodes[v].out_edges
            if len(out_edges) > 0:
                # In-place sort, best score first (scores stored negated upstream).
                out_edges.sort(key=lambda e: -e.attr["score"])
                for e in out_edges:
                    if self.e_reduce[(e.in_node.name, e.out_node.name)] != True:
                        best_edges.add((e.in_node.name, e.out_node.name))
                        self.best_out[v] = e.out_node.name
                        break
            in_edges = self.nodes[v].in_edges
            if len(in_edges) > 0:
                in_edges.sort(key=lambda e: -e.attr["score"])
                for e in in_edges:
                    if self.e_reduce[(e.in_node.name, e.out_node.name)] != True:
                        best_edges.add((e.in_node.name, e.out_node.name))
                        self.best_in[v] = e.in_node.name
                        break

        if DEBUG_LOG_LEVEL > 1:
            print("X", len(best_edges))

        for (e_n, e) in viewitems(self.edges):
            v = e_n[0]
            w = e_n[1]
            if self.e_reduce[(v, w)] != True:
                if (v, w) not in best_edges:
                    self.e_reduce[(v, w)] = True
                    removed_edges.add((v, w))
                    v2, w2 = reverse_end(w), reverse_end(v)
                    self.e_reduce[(v2, w2)] = True
                    removed_edges.add((v2, w2))
        return removed_edges

    def resolve_repeat_edges(self):
        """Reduce cross edges that jump into/out of simple paths.

        First collects "simple" nodes (exactly one live in- and one live
        out-edge).  For each such node v, other out-edges of v's sole
        predecessor (and, symmetrically, other in-edges of v's sole
        successor) are reduced when the far endpoint is branchy
        (>1 live in/out edges), is not itself simple, and shares no
        one-hop neighborhood with v.  Returns removed edge keys
        (NOTE(review): duals are NOT reduced here, unlike other mark_*).
        """
        edges_to_reduce = []
        nodes_to_test = set()
        for (v_n, v) in viewitems(self.nodes):
            out_nodes = []
            for e in v.out_edges:
                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                    out_nodes.append(e.out_node.name)
            in_nodes = []
            for e in v.in_edges:
                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                    in_nodes.append(e.in_node.name)
            if len(out_nodes) == 1 and len(in_nodes) == 1:
                nodes_to_test.add(v_n)

        for v_n in list(nodes_to_test):
            v = self.nodes[v_n]
            out_nodes = []
            for e in v.out_edges:
                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                    out_nodes.append(e.out_node.name)
            in_nodes = []
            for e in v.in_edges:
                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                    in_nodes.append(e.in_node.name)

            # Side 1: alternative out-edges of v's single predecessor.
            in_node_name = in_nodes[0]
            for out_edge in self.nodes[in_node_name].out_edges:
                vv = out_edge.in_node.name
                ww = out_edge.out_node.name
                ww_out = self.nodes[ww].out_edges
                v_out = self.nodes[v_n].out_edges
                ww_out_nodes = set([n.out_node.name for n in ww_out])
                v_out_nodes = set([n.out_node.name for n in v_out])
                o_overlap = len(ww_out_nodes & v_out_nodes)
                ww_in_count = 0
                for e in self.nodes[ww].in_edges:
                    if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                        ww_in_count += 1
                if ww != v_n and\
                        self.e_reduce[(vv, ww)] == False and\
                        ww_in_count > 1 and\
                        ww not in nodes_to_test and\
                        o_overlap == 0:
                    edges_to_reduce.append((vv, ww))

            # Side 2: alternative in-edges of v's single successor.
            out_node_name = out_nodes[0]
            for in_edge in self.nodes[out_node_name].in_edges:
                vv = in_edge.in_node.name
                ww = in_edge.out_node.name
                vv_in = self.nodes[vv].in_edges
                v_in = self.nodes[v_n].in_edges
                vv_in_nodes = set([n.in_node.name for n in vv_in])
                v_in_nodes = set([n.in_node.name for n in v_in])
                i_overlap = len(vv_in_nodes & v_in_nodes)
                vv_out_count = 0
                for e in self.nodes[vv].out_edges:
                    if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
                        vv_out_count += 1
                if vv != v_n and\
                        self.e_reduce[(vv, ww)] == False and\
                        vv_out_count > 1 and\
                        vv not in nodes_to_test and\
                        i_overlap == 0:
                    edges_to_reduce.append((vv, ww))

        removed_edges = set()
        for e in edges_to_reduce:
            self.e_reduce[e] = True
            removed_edges.add(e)
        return removed_edges

    def get_out_edges_for_node(self, name, mask=True):
        """Live (un-reduced) out-edges of node `name`.

        NOTE(review): `mask` is accepted but never used.
        """
        rtn = []
        for e in self.nodes[name].out_edges:
            v = e.in_node
            w = e.out_node
            if self.e_reduce[(v.name, w.name)] == False:
                rtn.append(e)
        return rtn

    def get_in_edges_for_node(self, name, mask=True):
        """Live (un-reduced) in-edges of node `name`.

        NOTE(review): `mask` is accepted but never used.
        """
        rtn = []
        for e in self.nodes[name].in_edges:
            v = e.in_node
            w = e.out_node
            if self.e_reduce[(v.name, w.name)] == False:
                rtn.append(e)
        return rtn

    def get_best_out_edge_for_node(self, name, mask=True):
        """Highest-"score" live out-edge of `name`.

        Raises IndexError when the node has no live out-edges.
        NOTE(review): `mask` is accepted but never used.
        """
        rtn = []
        for e in self.nodes[name].out_edges:
            v = e.in_node
            w = e.out_node
            if self.e_reduce[(v.name, w.name)] == False:
                rtn.append(e)
        rtn.sort(key=lambda e: e.attr["score"])
        return rtn[-1]

    def get_best_in_edge_for_node(self, name, mask=True):
        """Highest-"score" live in-edge of `name`.

        Raises IndexError when the node has no live in-edges.
        NOTE(review): `mask` is accepted but never used.
        """
        rtn = []
        for e in self.nodes[name].in_edges:
            v = e.in_node
            w = e.out_node
            if self.e_reduce[(v.name, w.name)] == False:
                rtn.append(e)
        rtn.sort(key=lambda e: e.attr["score"])
        return rtn[-1]
  375. def reverse_edge(e):
  376. e1, e2 = e
  377. return reverse_end(e2), reverse_end(e1)
  378. def reverse_path(p):
  379. p = p[::-1]
  380. return [reverse_end(n) for n in p]
def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
    """Try to collapse a bundle (bubble-like region) of the unitig graph.

    Walks forward from `start_node` in `ug` (a networkx multigraph with
    edge keys), growing a frontier of `tips` until it converges back to
    a single end node, or gives up (too many tips, too deep, too wide,
    too long, a loop, or no progress).

    u_edge_data maps (u, v, key) -> tuple whose [0] is length and [1]
    is score (only those two indices are read here).

    Returns (converage, data, data_r) where `converage` is the success
    flag ("converage" is the original's spelling of converge/coverage,
    kept as-is), data = (start_node, end_node, bundle_edges, length,
    score, depth), and data_r is always None.
    """

    tips = set()
    bundle_edges = set()
    bundle_nodes = set()

    # Restrict the search to a bounded neighborhood of the start node.
    local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
    length_to_node = {start_node: 0}
    score_to_node = {start_node: 0}

    v = start_node
    end_node = start_node

    if DEBUG_LOG_LEVEL > 1:
        print()
        print()
        print("start", start_node)

    # Seed the frontier with the start node's out-edges, skipping any
    # whose target's reverse-complement is already in the bundle.
    bundle_nodes.add(v)
    for vv, ww, kk in local_graph.out_edges(v, keys=True):
        max_score = 0  # NOTE(review): unused in this loop
        max_length = 0  # NOTE(review): unused in this loop
        if (vv, ww, kk) not in bundle_edges and\
                reverse_end(ww) not in bundle_nodes:
            bundle_edges.add((vv, ww, kk))
            tips.add(ww)

    for v in list(tips):
        bundle_nodes.add(v)

    depth = 1
    width = 1.0
    converage = False

    while 1:
        if DEBUG_LOG_LEVEL > 1:
            print("# of tips", len(tips))

        # Too many parallel paths: give up.
        if len(tips) > 4:
            converage = False
            break

        # Exactly one tip left: the bundle has converged.
        if len(tips) == 1:
            end_node = tips.pop()

            if DEBUG_LOG_LEVEL > 1:
                print("end", end_node)

            if end_node not in length_to_node:
                # End node never got scored; pull length/score through its
                # best-scoring already-visited predecessor.
                v = end_node
                max_score_edge = None
                max_score = 0
                for uu, vv, kk in local_graph.in_edges(v, keys=True):
                    if uu not in length_to_node:
                        continue
                    score = u_edge_data[(uu, vv, kk)][1]
                    if score > max_score:
                        max_score = score
                        max_score_edge = (uu, vv, kk)
                length_to_node[v] = length_to_node[max_score_edge[0]
                                                   ] + u_edge_data[max_score_edge][0]
                score_to_node[v] = score_to_node[max_score_edge[0]
                                                 ] + u_edge_data[max_score_edge][1]

            converage = True
            break

        depth += 1
        width = 1.0 * len(bundle_edges) / depth

        if depth > 10 and width > width_cutoff:
            converage = False
            break

        if depth > depth_cutoff:
            converage = False
            break

        tips_list = list(tips)

        tip_updated = False
        loop_detect = False
        length_limit_reached = False

        for v in tips_list:
            if DEBUG_LOG_LEVEL > 1:
                print("process", v)

            if len(local_graph.out_edges(v, keys=True)) == 0:  # dead end route
                # NOTE(review): this print is NOT guarded by DEBUG_LOG_LEVEL,
                # unlike every other diagnostic here -- possibly unintended.
                print("no out edge", v)
                continue

            max_score_edge = None
            max_score = 0

            extend_tip = True

            # A tip may only extend once ALL its predecessors have been
            # reached; pick the best-scoring predecessor edge.
            for uu, vv, kk in local_graph.in_edges(v, keys=True):
                if DEBUG_LOG_LEVEL > 1:
                    print("in_edges", uu, vv, kk)
                    print(uu, "in length_to_node", uu in length_to_node)

                if uu not in length_to_node:
                    extend_tip = False
                    break

                score = u_edge_data[(uu, vv, kk)][1]

                if score > max_score:
                    max_score = score
                    max_score_edge = (uu, vv, kk)

            if extend_tip:
                length_to_node[v] = length_to_node[max_score_edge[0]
                                                   ] + u_edge_data[max_score_edge][0]
                score_to_node[v] = score_to_node[max_score_edge[0]
                                                 ] + u_edge_data[max_score_edge][1]

                if length_to_node[v] > length_cutoff:
                    length_limit_reached = True
                    converage = False
                    break

                v_updated = False
                for vv, ww, kk in local_graph.out_edges(v, keys=True):
                    if DEBUG_LOG_LEVEL > 1:
                        print("test", vv, ww, kk)

                    # Reaching an already-scored node means a cycle.
                    if ww in length_to_node:
                        loop_detect = True
                        if DEBUG_LOG_LEVEL > 1:
                            print("loop_detect", ww)
                        break

                    if (vv, ww, kk) not in bundle_edges and\
                            reverse_end(ww) not in bundle_nodes:
                        if DEBUG_LOG_LEVEL > 1:
                            print("add", ww)
                        tips.add(ww)
                        bundle_edges.add((vv, ww, kk))
                        tip_updated = True
                        v_updated = True

                if v_updated:
                    if DEBUG_LOG_LEVEL > 1:
                        print("remove", v)
                    tips.remove(v)

                    if len(tips) == 1:
                        break

            if loop_detect:
                converage = False
                break

        if length_limit_reached:
            converage = False
            break

        if loop_detect:
            converage = False
            break

        # No tip advanced this round: stuck, give up.
        if not tip_updated:
            converage = False
            break

    for v in list(tips):
        bundle_nodes.add(v)

    data = start_node, end_node, bundle_edges, length_to_node[
        end_node], score_to_node[end_node], depth
    data_r = None  # reverse-direction bundle intentionally not computed

    if DEBUG_LOG_LEVEL > 1:
        print(converage, data, data_r)
    return converage, data, data_r
def generate_string_graph(args):
    """Build the string graph from args.overlap_file and write edge lists.

    Pipeline: parse overlap records (skipping self-, contained, short,
    low-identity, and duplicate overlaps); add the two dual string-graph
    edges per usable overlap; then run transitive reduction, optional
    chimer-bridge removal, spur removal, and either repeat-edge
    resolution (args.lfc) or best-overlap marking.

    Side effects: writes "sg_edges_list" (and "chimers_nodes" unless
    args.disable_chimer_bridge_removal) into the current directory.

    Returns (nxsg, nxsg_r, edge_data): a networkx DiGraph of the live
    ("G") edges, its reverse, and per-edge tuples
    (rid, sp, tp, length, score, identity, type_).
    """
    overlap_file = args.overlap_file

    contained_reads = set()
    chimer_ids = set()    # NOTE(review): never populated or read below

    filter_reads = False  # hard-wired off; the seqs-based filter is dormant
    seqs = set()

    G = nx.Graph()        # NOTE(review): unused below
    edges = set()         # NOTE(review): unused below
    overlap_data = []
    contained_reads = set()
    overlap_count = {}

    # loop through the overlapping data to load the data in the a python array
    # contained reads are identified

    def process_fields(l):
        # Parse one whitespace-split overlap record; mutates the enclosing
        # overlap_data / contained_reads / overlap_count.
        # work around for some ill formed data record
        # if len(l) != 13:
        #    continue
        f_id, g_id, score, identity = l[:4]
        if f_id == g_id:  # don't need self-self overlapping
            return
        if filter_reads:
            if g_id not in seqs:
                return
            if f_id not in seqs:
                return
        score = int(score)
        identity = float(identity)
        contained = l[12]
        if contained == "contained":
            contained_reads.add(f_id)
            return
        if contained == "contains":
            contained_reads.add(g_id)
            return
        if contained == "none":
            return
        if identity < args.min_idt:  # only take record with >96% identity as overlapped reads
            return
        f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
        g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
        # only used reads longer than the 4kb for assembly
        if f_len < args.min_len:
            return
        if g_len < args.min_len:
            return
        """
        # double check for proper overlap
        # this is not necessary when using DALIGNER for overlapper
        # it may be useful if other overlappers give fuzzier alignment boundary
        if f_start > 24 and f_len - f_end > 24:  # allow 24 base tolerance on both sides of the overlapping
            return
        if g_start > 24 and g_len - g_end > 24:
            return
        if g_strain == 0:
            if f_start < 24 and g_len - g_end > 24:
                return
            if g_start < 24 and f_len - f_end > 24:
                return
        else:
            if f_start < 24 and g_start > 24:
                return
            if g_start < 24 and f_len - f_end > 24:
                return
        """
        overlap_data.append((f_id, g_id, score, identity,
                             f_strain, f_start, f_end, f_len,
                             g_strain, g_start, g_end, g_len))

        overlap_count[f_id] = overlap_count.get(f_id, 0) + 1
        overlap_count[g_id] = overlap_count.get(g_id, 0) + 1

    with open(overlap_file) as f:
        n = 0
        for line in f:
            # A line starting with '-' is the end-of-file marker.
            if line.startswith('-'):
                break
            l = line.strip().split()
            process_fields(l)
            n += 1
        else:
            # This happens only if we did not 'break' from the for-loop.
            msg = 'No end-of-file marker for overlap_file {!r} after {} lines.'.format(
                overlap_file, n)
            raise Exception(msg)

    overlap_set = set()
    sg = StringGraph()
    for od in overlap_data:
        f_id, g_id, score, identity = od[:4]
        if f_id in contained_reads:
            continue
        if g_id in contained_reads:
            continue
        f_s, f_b, f_e, f_l = od[4:8]
        g_s, g_b, g_e, g_l = od[8:12]
        overlap_pair = [f_id, g_id]
        overlap_pair.sort()
        overlap_pair = tuple(overlap_pair)
        if overlap_pair in overlap_set:  # don't allow duplicated records
            continue
        else:
            overlap_set.add(overlap_pair)

        if g_s == 1:  # reversed alignment, swapping the begin and end coordinates
            g_b, g_e = g_e, g_b

        # build the string graph edges for each overlap
        if f_b > 0:
            if g_b < g_e:
                """
                     f.B         f.E
                  f  ----------->
                  g         ------------->
                            g.B           g.E
                """
                # NOTE(review): f_b == 0 is unreachable inside `f_b > 0`.
                if f_b == 0 or g_e - g_l == 0:
                    continue
                sg.add_edge("%s:B" % g_id, "%s:B" % f_id, label=(f_id, f_b, 0),
                            length=abs(f_b - 0),
                            score=-score,
                            identity=identity)
                sg.add_edge("%s:E" % f_id, "%s:E" % g_id, label=(g_id, g_e, g_l),
                            length=abs(g_e - g_l),
                            score=-score,
                            identity=identity)
            else:
                """
                     f.B         f.E
                  f  ----------->
                  g         <-------------
                            g.E           g.B
                """
                if f_b == 0 or g_e == 0:
                    continue
                sg.add_edge("%s:E" % g_id, "%s:B" % f_id, label=(f_id, f_b, 0),
                            length=abs(f_b - 0),
                            score=-score,
                            identity=identity)
                sg.add_edge("%s:E" % f_id, "%s:B" % g_id, label=(g_id, g_e, 0),
                            length=abs(g_e - 0),
                            score=-score,
                            identity=identity)
        else:
            if g_b < g_e:
                """
                     f.B         f.E
                  f  ----------->
                  g  ------------->
                     g.B           g.E
                """
                if g_b == 0 or f_e - f_l == 0:
                    continue
                sg.add_edge("%s:B" % f_id, "%s:B" % g_id, label=(g_id, g_b, 0),
                            length=abs(g_b - 0),
                            score=-score,
                            identity=identity)
                sg.add_edge("%s:E" % g_id, "%s:E" % f_id, label=(f_id, f_e, f_l),
                            length=abs(f_e - f_l),
                            score=-score,
                            identity=identity)
            else:
                """
                     f.B         f.E
                  f  ----------->
                  g  <-------------
                     g.E           g.B
                """
                if g_b - g_l == 0 or f_e - f_l == 0:
                    continue
                sg.add_edge("%s:B" % f_id, "%s:E" % g_id, label=(g_id, g_b, g_l),
                            length=abs(g_b - g_l),
                            score=-score,
                            identity=identity)
                sg.add_edge("%s:B" % g_id, "%s:E" % f_id, label=(f_id, f_e, f_l),
                            length=abs(f_e - f_l),
                            score=-score,
                            identity=identity)

    sg.init_reduce_dict()

    sg.mark_tr_edges()  # mark those edges that transitive redundant

    if DEBUG_LOG_LEVEL > 1:
        print(sum([1 for c in itervalues(sg.e_reduce) if c == True]))
        print(sum([1 for c in itervalues(sg.e_reduce) if c == False]))

    if not args.disable_chimer_bridge_removal:
        chimer_nodes, chimer_edges = sg.mark_chimer_edges()

        with open("chimers_nodes", "w") as f:
            for n in chimer_nodes:
                print(n, file=f)
        del chimer_nodes
    else:
        chimer_edges = set()  # empty set

    spur_edges = sg.mark_spur_edge()

    removed_edges = set()
    if args.lfc == True:
        removed_edges = sg.resolve_repeat_edges()
    else:
        # mark those edges that are best overlap edges
        removed_edges = sg.mark_best_overlap()

    spur_edges.update(sg.mark_spur_edge())

    if DEBUG_LOG_LEVEL > 1:
        print(sum([1 for c in itervalues(sg.e_reduce) if c == False]))

    out_f = open("sg_edges_list", "w")
    nxsg = nx.DiGraph()
    edge_data = {}
    # One output line per edge with its classification:
    # G=good/live, C=chimer, R=repeat/best-overlap-removed, S=spur,
    # TR=transitively reduced.
    for v, w in sg.edges:  # sort, or OrderedDict
        e = sg.edges[(v, w)]
        rid, sp, tp = e.attr["label"]
        score = e.attr["score"]
        identity = e.attr["identity"]
        length = abs(sp - tp)

        if sg.e_reduce[(v, w)] != True:
            type_ = "G"
            label = "%s:%d-%d" % (rid, sp, tp)
            nxsg.add_edge(v, w, label=label, length=length, score=score)
            edge_data[(v, w)] = (rid, sp, tp, length, score, identity, type_)
            if w in sg.best_in:
                # NOTE(review): `nxsg.node` is the networkx 1.x API
                # (renamed to `.nodes` in networkx 2) -- confirm pinned version.
                nxsg.node[w]["best_in"] = v
        elif (v, w) in chimer_edges:
            type_ = "C"
        elif (v, w) in removed_edges:
            type_ = "R"
        elif (v, w) in spur_edges:
            type_ = "S"
        elif sg.e_reduce[(v, w)] == True:
            # Final catch-all: any remaining reduced edge is transitive.
            type_ = "TR"

        line = '%s %s %s %5d %5d %5d %5.2f %s' % (
            v, w, rid, sp, tp, score, identity, type_)
        print(line, file=out_f)
    out_f.close()

    nxsg_r = nxsg.reverse()

    return nxsg, nxsg_r, edge_data
def construct_compound_paths(ug, u_edge_data):
    """Find "compound paths" (bundles of parallel unitig edges, e.g. bubbles)
    in the unitig multigraph ``ug``.

    For every branch node with out-degree > 1, ``find_bundle`` is asked for a
    fully-covered bundle (the constants 48, 16, 500000 are passed straight
    through to it; see ``find_bundle`` for their meaning).  Candidates are
    then filtered so that (a) no unitig edge belongs to more than one
    compound path and (b) every surviving compound path has its
    reverse-complement twin present as well.

    Returns a dict keyed by (start_node, "NA", end_node) with values
    (width, length, score, bundle_edges).  Does not modify its arguments.
    """
    # --- classify nodes of ug by in/out degree ---
    source_nodes = set()
    sink_nodes = set()
    simple_nodes = set()
    branch_nodes = set()

    all_nodes = ug.nodes()
    for n in all_nodes:
        in_degree = len(ug.in_edges(n))
        out_degree = len(ug.out_edges(n))
        if in_degree == 0:
            source_nodes.add(n)
        if out_degree == 0:
            sink_nodes.add(n)
        if in_degree == 1 and out_degree == 1:
            simple_nodes.add(n)
        if in_degree > 1 or out_degree > 1:
            branch_nodes.add(n)

    # print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)

    # Phase 0: one bundle candidate per branching start node.
    compound_paths_0 = []
    for p in list(branch_nodes):
        if ug.out_degree(p) > 1:
            coverage, data, data_r = find_bundle(
                ug, u_edge_data, p, 48, 16, 500000)
            if coverage == True:
                start_node, end_node, bundle_edges, length, score, depth = data
                # width = edges per unit depth (only reported downstream).
                compound_paths_0.append(
                    (start_node, "NA", end_node, 1.0 * len(bundle_edges) / depth, length, score, bundle_edges))

    # Process larger bundles first so overlapping smaller ones are dropped.
    compound_paths_0.sort(key=lambda x: -len(x[6]))

    # Phase 1: greedily accept bundles whose edges (in either orientation)
    # are not already claimed by an accepted bundle; register the
    # reverse-complement twin of each accepted bundle at the same time.
    edge_to_cpath = {}
    compound_paths_1 = {}
    for s, v, t, width, length, score, bundle_edges in compound_paths_0:
        if DEBUG_LOG_LEVEL > 1:
            print("constructing utg, test ", s, v, t)
        overlapped = False
        for vv, ww, kk in list(bundle_edges):
            if (vv, ww, kk) in edge_to_cpath:
                if DEBUG_LOG_LEVEL > 1:
                    print("remove overlapped utg", (s, v, t), (vv, ww, kk))
                overlapped = True
                break
            rvv = reverse_end(vv)
            rww = reverse_end(ww)
            rkk = reverse_end(kk)
            if (rww, rvv, rkk) in edge_to_cpath:
                if DEBUG_LOG_LEVEL > 1:
                    print("remove overlapped r utg", (s, v, t), (rww, rvv, rkk))
                overlapped = True
                break

        if not overlapped:
            if DEBUG_LOG_LEVEL > 1:
                print("constructing", s, v, t)

            bundle_edges_r = []
            # Reverse-complement bundle runs from rev(t) to rev(s).
            rs = reverse_end(t)
            rt = reverse_end(s)

            for vv, ww, kk in list(bundle_edges):
                edge_to_cpath.setdefault((vv, ww, kk), set())
                edge_to_cpath[(vv, ww, kk)].add((s, t, v))
                rvv = reverse_end(ww)
                rww = reverse_end(vv)
                rkk = reverse_end(kk)
                edge_to_cpath.setdefault((rvv, rww, rkk), set())
                edge_to_cpath[(rvv, rww, rkk)].add(
                    (rs, rt, v))  # assert v == "NA"
                bundle_edges_r.append((rvv, rww, rkk))

            compound_paths_1[(s, v, t)] = width, length, score, bundle_edges
            compound_paths_1[(rs, v, rt)
                             ] = width, length, score, bundle_edges_r

    # Phase 2: keep only bundles whose reverse-complement twin also survived,
    # rebuilding edge_to_cpath for the ownership test below.
    compound_paths_2 = {}
    edge_to_cpath = {}
    for s, v, t in compound_paths_1:
        rs = reverse_end(t)
        rt = reverse_end(s)
        if (rs, "NA", rt) not in compound_paths_1:
            if DEBUG_LOG_LEVEL > 1:
                print("non_compliment bundle", s, v, t, len(compound_paths_1[(s, v, t)][-1]))
            continue
        width, length, score, bundle_edges = compound_paths_1[(s, v, t)]
        compound_paths_2[(s, v, t)] = width, length, score, bundle_edges
        for vv, ww, kk in list(bundle_edges):
            edge_to_cpath.setdefault((vv, ww, kk), set())
            edge_to_cpath[(vv, ww, kk)].add((s, t, v))

    # Phase 3: drop bundles whose start node has an out-edge claimed by
    # more than one compound path (ambiguous ownership).
    compound_paths_3 = {}
    for (k, val) in viewitems(compound_paths_2):
        start_node, NA, end_node = k
        rs = reverse_end(end_node)
        rt = reverse_end(start_node)
        assert (rs, "NA", rt) in compound_paths_2

        contained = False
        for vv, ww, kk in ug.out_edges(start_node, keys=True):
            if len(edge_to_cpath.get((vv, ww, kk), [])) > 1:
                contained = True

        if not contained:
            compound_paths_3[k] = val
            if DEBUG_LOG_LEVEL > 1:
                print("compound", k)

    # Final pass: enforce reverse-complement symmetry once more, since
    # phase 3 may have removed one twin of a pair.
    compound_paths = {}
    for s, v, t in compound_paths_3:
        rs = reverse_end(t)
        rt = reverse_end(s)
        if (rs, "NA", rt) not in compound_paths_3:
            continue
        compound_paths[(s, v, t)] = compound_paths_3[(s, v, t)]

    return compound_paths
def identify_simple_paths(sg2, edge_data):
    # utg construction phase 1, identify all simple paths
    """Collapse maximal chains of degree-(1,1) nodes in ``sg2`` into simple
    paths (the first phase of unitig construction).

    Each forward path is walked together with its reverse-complement path
    (via ``reverse_end``), and both directions are recorded so the result
    stays symmetric.  ``edge_data`` values are indexed at [3] (length) and
    [4] (score) to accumulate per-path totals.

    Returns a dict: (start, second_node, end) -> (path_length, path_score,
    node_list).  Does not modify ``sg2`` or ``edge_data``.
    """
    simple_paths = dict()
    s_nodes = set()            # potential path start nodes (branching/source)
    t_nodes = set()            # potential path end nodes (branching/sink)
    simple_nodes = set()       # in-degree == out-degree == 1

    all_nodes = sg2.nodes()
    for n in all_nodes:
        in_degree = len(sg2.in_edges(n))
        out_degree = len(sg2.out_edges(n))
        if in_degree == 1 and out_degree == 1:
            simple_nodes.add(n)
        else:
            if out_degree != 0:
                s_nodes.add(n)
            if in_degree != 0:
                t_nodes.add(n)

    # Edges not yet assigned to any simple path.
    free_edges = set(sg2.edges())

    if DEBUG_LOG_LEVEL > 1:
        for s in list(simple_nodes):
            print("simple_node", s)
        for s in list(s_nodes):
            print("s_node", s)
        for s in list(t_nodes):
            print("t_node", s)

        # Sanity check: the graph should contain the reverse-complement of
        # every edge.
        for v, w in free_edges:
            if (reverse_end(w), reverse_end(v)) not in free_edges:
                print("bug", v, w)
                print(reverse_end(w), reverse_end(v))

    while free_edges:
        if s_nodes:
            # Prefer starting at a branching/source node.
            n = s_nodes.pop()
            if DEBUG_LOG_LEVEL > 1:
                print("initial utg 1", n)
        else:
            # Only cycles of simple nodes remain; peek (pop+re-add) at an
            # arbitrary free edge and start from its tail.
            e = free_edges.pop()
            free_edges.add(e)
            n = e[0]
            if DEBUG_LOG_LEVEL > 1:
                print("initial utg 2", n)

        path = []
        path_length = 0
        path_score = 0
        for v, w in sg2.out_edges(n):
            if (v, w) not in free_edges:
                continue
            rv = reverse_end(v)
            rw = reverse_end(w)

            # Seed the forward path with edge (v, w) ...
            path_length = 0
            path_score = 0
            v0 = v
            w0 = w
            path = [v, w]
            path_edges = set()
            path_edges.add((v, w))
            path_length += edge_data[(v, w)][3]
            path_score += edge_data[(v, w)][4]
            free_edges.remove((v, w))

            # ... and the reverse path with its complement edge (rw, rv).
            r_path_length = 0
            r_path_score = 0
            rv0 = rv
            rw0 = rw
            r_path = [rv, rw]  # need to reverse again
            r_path_edges = set()
            r_path_edges.add((rw, rv))
            r_path_length += edge_data[(rw, rv)][3]
            r_path_score += edge_data[(rw, rv)][4]
            free_edges.remove((rw, rv))

            # Extend through the chain of simple nodes, consuming both the
            # forward edge and its reverse-complement at each step.
            while w in simple_nodes:
                w, w_ = list(sg2.out_edges(w))[0]
                if (w, w_) not in free_edges:
                    break
                rw_, rw = reverse_end(w_), reverse_end(w)

                # Guard against walking onto the path's own complement
                # (palindromic loop).
                if (rw_, rw) in path_edges:
                    break

                path.append(w_)
                path_edges.add((w, w_))
                path_length += edge_data[(w, w_)][3]
                path_score += edge_data[(w, w_)][4]
                free_edges.remove((w, w_))

                r_path.append(rw_)
                r_path_edges.add((rw_, rw))
                r_path_length += edge_data[(rw_, rw)][3]
                r_path_score += edge_data[(rw_, rw)][4]
                free_edges.remove((rw_, rw))

                w = w_

            simple_paths[(v0, w0, path[-1])] = path_length, path_score, path
            r_path.reverse()
            assert r_path[0] == reverse_end(path[-1])
            simple_paths[(r_path[0], rw0, rv0)
                         ] = r_path_length, r_path_score, r_path

            if DEBUG_LOG_LEVEL > 1:
                print(path_length, path_score, path)

            #dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
            #dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)

    return simple_paths
def identify_spurs(ug, u_edge_data, spur_len):
    # identify spurs in the utg graph
    # Currently, we use ad-hoc logic filtering out shorter utg, but we can
    # add proper alignment comparison later to remove redundant utgs
    # Side-effect: Modifies u_edge_data
    """Remove short dead-end branches ("spurs") from the unitig graph.

    A spur is a path, shorter than ``spur_len``, from a source node to a
    nearby branch node that also receives edges from outside the source's
    local (radius-10) neighborhood.  Removed unitig edges (and their
    reverse-complements) are relabeled "spur:2" in ``u_edge_data``.

    Returns a pruned copy of ``ug``; ``ug`` itself is left untouched.
    """
    ug2 = ug.copy()

    # Worklist of current source nodes (in-degree 0).
    s_candidates = set()
    for v in ug2.nodes():
        if ug2.in_degree(v) == 0:
            s_candidates.add(v)

    while len(s_candidates) > 0:
        n = s_candidates.pop()
        # A previous removal round may have given n an in-edge again.
        if ug2.in_degree(n) != 0:
            continue
        # Only look for spur targets within a bounded neighborhood.
        n_ego_graph = nx.ego_graph(ug2, n, radius=10)
        n_ego_node_set = set(n_ego_graph.nodes())
        for b_node in n_ego_graph.nodes():
            if ug2.in_degree(b_node) <= 1:
                continue

            with_extern_node = False
            b_in_nodes = [e[0] for e in ug2.in_edges(b_node)]

            if len(b_in_nodes) == 1:
                continue

            # The branch node must also be fed from outside the ego graph;
            # otherwise removing the path would orphan real structure.
            for v in b_in_nodes:
                if v not in n_ego_node_set:
                    with_extern_node = True
                    break

            if not with_extern_node:
                continue

            # Measure the total unitig length along the shortest path
            # from the source to the branch node.
            s_path = nx.shortest_path(ug2, n, b_node)
            v1 = s_path[0]
            total_length = 0
            for v2 in s_path[1:]:
                for s, t, v in ug2.out_edges(v1, keys=True):
                    if t != v2:
                        continue
                    length, score, edges, type_ = u_edge_data[(s, t, v)]
                    total_length += length
                v1 = v2

            if total_length >= spur_len:
                continue

            # Short enough: remove the path's edges (and their
            # reverse-complements) and relabel them as spurs.
            v1 = s_path[0]
            for v2 in s_path[1:]:
                for s, t, v in list(ug2.out_edges(v1, keys=True)):
                    if t != v2:
                        continue
                    length, score, edges, type_ = u_edge_data[(s, t, v)]
                    rs = reverse_end(t)
                    rt = reverse_end(s)
                    rv = reverse_end(v)
                    try:
                        # The reverse edge may already be gone if a previous
                        # iteration removed it; best-effort removal.
                        ug2.remove_edge(s, t, key=v)
                        ug2.remove_edge(rs, rt, key=rv)
                        u_edge_data[(s, t, v)] = length, score, edges, "spur:2"
                        u_edge_data[(rs, rt, rv)
                                    ] = length, score, edges, "spur:2"
                    except Exception:
                        pass

                    # Removal may expose a new source node; enqueue it.
                    if ug2.in_degree(v2) == 0:
                        s_candidates.add(v2)
                v1 = v2
                # NOTE(review): this `break` stops after the first hop of
                # s_path; the rest of the spur is apparently picked up via
                # newly exposed sources in later iterations — confirm intended.
                break
    return ug2
  1005. def remove_dup_simple_path(ug, u_edge_data):
  1006. # identify simple dup path
  1007. # if there are many multiple simple path of length connect s and t, e.g. s->v1->t, and s->v2->t, we will only keep one
  1008. # Side-effect: Modifies u_edge_data
  1009. ug2 = ug.copy()
  1010. simple_edges = set()
  1011. dup_edges = {}
  1012. for s, t, v in u_edge_data:
  1013. length, score, edges, type_ = u_edge_data[(s, t, v)]
  1014. if len(edges) > 3:
  1015. continue
  1016. if type_ == "simple":
  1017. if (s, t) in simple_edges:
  1018. dup_edges[(s, t)].append(v)
  1019. else:
  1020. simple_edges.add((s, t))
  1021. dup_edges[(s, t)] = [v]
  1022. for s, t in dup_edges:
  1023. vl = dup_edges[(s, t)]
  1024. vl.sort()
  1025. for v in vl[1:]:
  1026. ug2.remove_edge(s, t, key=v)
  1027. length, score, edges, type_ = u_edge_data[(s, t, v)]
  1028. u_edge_data[(s, t, v)] = length, score, edges, "simple_dup"
  1029. return ug2
def construct_c_path_from_utgs(ug, u_edge_data, sg):
    # Side-effects: None, I think.
    """Chain unitig edges of ``ug`` into candidate contig paths.

    Starting from branching/source nodes (falling back to arbitrary free
    edges), walk forward through nodes with out-degree 1.  Extension through
    a node with multiple in-edges is allowed only when the incoming unitig
    agrees with the string graph's recorded best overlap
    (``sg.node[t]["best_in"]``).

    Returns a list of tuples
    (path_start, path_key, path_end, path_length, path_score, path, n_edges),
    where ``path`` is a list of (s, t, via) unitig-edge keys.
    """
    s_nodes = set()          # walk start nodes (non-simple with out-edges)
    #t_nodes = set()
    simple_nodes = set()     # in-degree == out-degree == 1
    simple_out = set()       # out-degree == 1 (eligible for extension)
    #simple_in = set()

    all_nodes = ug.nodes()
    for n in all_nodes:
        in_degree = len(ug.in_edges(n))
        out_degree = len(ug.out_edges(n))
        if in_degree == 1 and out_degree == 1:
            simple_nodes.add(n)
        else:
            if out_degree != 0:
                s_nodes.add(n)
            # if in_degree != 0:
            #    t_nodes.add(n)
        if out_degree == 1:
            simple_out.add(n)
        # if in_degree == 1:
        #    simple_in.add(n)

    c_path = []

    # Unitig edges not yet covered by any contig path.
    free_edges = set()
    for s, t, v in ug.edges(keys=True):
        free_edges.add((s, t, v))

    while free_edges:
        if s_nodes:
            n = s_nodes.pop()
        else:
            # Only cycles remain; start from an arbitrary free edge's tail.
            e = free_edges.pop()
            n = e[0]

        # One candidate contig per out-edge of the start node.
        for s, t, v in ug.out_edges(n, keys=True):
            path_start = n
            path_end = None
            path_key = None
            path = []
            path_length = 0
            path_score = 0
            path_nodes = set()
            path_nodes.add(s)
            if DEBUG_LOG_LEVEL > 1:
                print("check 1", s, t, v)
            path_key = t
            t0 = s
            while t in simple_out:
                # Stop if we would revisit a node or its reverse-complement.
                if t in path_nodes:
                    break
                rt = reverse_end(t)
                if rt in path_nodes:
                    break

                length, score, path_or_edges, type_ = u_edge_data[(t0, t, v)]

                """
                If the next node has two in-edges and the current path has the best overlap,
                we will extend the contigs. Otherwise, we will terminate the contig extension.
                This can help reduce some mis-assemblies but it can still construct long contigs
                when there is an oppertunity (assuming the best overlap has the highest
                likelihood to be correct.)
                """

                if len(ug.in_edges(t, keys=True)) > 1:
                    best_in_node = sg.node[t]["best_in"]

                    # For a simple unitig the second-to-last path node is the
                    # in-neighbor of t; it must be the best overlap.
                    if type_ == "simple" and best_in_node != path_or_edges[-2]:
                        break
                    if type_ == "compound":
                        # Collect the in-neighbors of t among the bundle's
                        # member unitigs that end at t.
                        t_in_nodes = set()
                        for ss, vv, tt in path_or_edges:
                            if tt != t:
                                continue
                            length, score, path_or_edges, type_ = u_edge_data[(
                                ss, vv, tt)]
                            if path_or_edges[-1] == tt:
                                t_in_nodes.add(path_or_edges[-2])
                        if best_in_node not in t_in_nodes:
                            break
                # ----------------

                path.append((t0, t, v))
                path_nodes.add(t)
                path_length += length
                path_score += score
                # t is "simple_out" node
                assert len(ug.out_edges(t, keys=True)) == 1
                t0, t, v = list(ug.out_edges(t, keys=True))[0]

            # Append the terminating edge (the one that left the loop).
            path.append((t0, t, v))
            length, score, path_or_edges, type_ = u_edge_data[(t0, t, v)]
            path_length += length
            path_score += score
            path_nodes.add(t)
            path_end = t

            c_path.append((path_start, path_key, path_end,
                           path_length, path_score, path, len(path)))
            if DEBUG_LOG_LEVEL > 1:
                print("c_path", path_start, path_key, path_end, path_length, path_score, len(path))
            for e in path:
                if e in free_edges:
                    free_edges.remove(e)

    if DEBUG_LOG_LEVEL > 1:
        print("left over edges:", len(free_edges))
    return c_path
def ovlp_to_graph(args):
    # transitivity reduction, remove spurs, remove putative edges caused by repeats
    """Run the full overlap-to-contig pipeline.

    Stages: build the string graph from overlaps, collapse simple paths into
    unitigs, prune spurs and duplicate/compound paths, then chain unitigs
    into contigs.

    Side-effects: writes "c_path", "utg_data" and "ctg_paths" (plus
    "utg_data0" when DEBUG_LOG_LEVEL > 1) into the current directory.
    """
    sg, sg_r, edge_data = generate_string_graph(args)

    #dual_path = {}

    # Simple digraph containing only the edges that survived reduction
    # (type "G" = good).
    sg2 = nx.DiGraph()

    for v, w in edge_data:
        assert (reverse_end(w), reverse_end(v)) in edge_data
        # if (v, w) in masked_edges:
        #    continue
        rid, sp, tp, length, score, identity, type_ = edge_data[(v, w)]
        if type_ != "G":
            continue
        label = "%s:%d-%d" % (rid, sp, tp)
        sg2.add_edge(v, w, label=label, length=length, score=score)

    simple_paths = identify_simple_paths(sg2, edge_data)

    # Unitig graph: one multigraph edge per simple path, keyed by the
    # path's second node ("via").
    ug = nx.MultiDiGraph()
    u_edge_data = {}
    circular_path = set()

    for s, v, t in simple_paths:
        length, score, path = simple_paths[(s, v, t)]
        u_edge_data[(s, t, v)] = (length, score, path, "simple")
        if s != t:
            ug.add_edge(s, t, key=v, type_="simple",
                        via=v, length=length, score=score)
        else:
            # Self-loop simple path: a circular unitig, kept aside.
            circular_path.add((s, t, v))

    if DEBUG_LOG_LEVEL > 1:
        with open("utg_data0", "w") as f:
            for s, t, v in u_edge_data:
                rs = reverse_end(t)
                rt = reverse_end(s)
                rv = reverse_end(v)
                assert (rs, rt, rv) in u_edge_data
                length, score, path_or_edges, type_ = u_edge_data[(s, t, v)]

                if type_ == "compound":
                    path_or_edges = "|".join(
                        [ss + "~" + vv + "~" + tt for ss, tt, vv in path_or_edges])
                else:
                    path_or_edges = "~".join(path_or_edges)
                print(s, v, t, type_, length, score, path_or_edges, file=f)

    ug2 = identify_spurs(ug, u_edge_data, 50000)

    ug2 = remove_dup_simple_path(ug2, u_edge_data)

    # phase 2, finding all "consistent" compound paths
    compound_paths = construct_compound_paths(ug2, u_edge_data)
    compound_path_file = open("c_path", "w")

    # Replace each compound path's member edges with a single "compound"
    # edge keyed "NA"; member edges are marked "contained".
    ug2_edges = set(ug2.edges(keys=True))
    edges_to_remove = set()
    for s, v, t in compound_paths:
        width, length, score, bundle_edges = compound_paths[(s, v, t)]
        print(s, v, t, width, length, score, "|".join(
            [e[0] + "~" + e[2] + "~" + e[1] for e in bundle_edges]), file=compound_path_file)
        for ss, tt, vv in bundle_edges:
            if (ss, tt, vv) in ug2_edges:
                edges_to_remove.add((ss, tt, vv))

    for s, t, v in edges_to_remove:
        ug2.remove_edge(s, t, v)
        length, score, edges, type_ = u_edge_data[(s, t, v)]
        if type_ != "spur":
            u_edge_data[(s, t, v)] = length, score, edges, "contained"

    for s, v, t in compound_paths:
        width, length, score, bundle_edges = compound_paths[(s, v, t)]
        u_edge_data[(s, t, v)] = (length, score, bundle_edges, "compound")
        ug2.add_edge(s, t, key=v, via=v, type_="compound",
                     length=length, score=score)

        assert v == "NA"
        rs = reverse_end(t)
        rt = reverse_end(s)
        assert (rs, v, rt) in compound_paths
        #dual_path[ (s, v, t) ] = (rs, v, rt)
        #dual_path[ (rs, v, rt) ] = (s, v, t)

    compound_path_file.close()

    # remove short utg using local flow consistent rule
    r"""
      short UTG like this can be removed, this kind of utg are likely artifects of repeats
      >____ _____>
           \__UTG_>__/
      <____/ \_____<
    """
    ug_edge_to_remove = set()
    for s, t, v in ug2.edges(keys=True):
        # The degree pattern (1-in/2-out at s, 2-in/1-out at t) matches the
        # repeat-bridge sketch above; only short bridges (< 60 kb) qualify.
        if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
           ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
            length, score, path_or_edges, type_ = u_edge_data[(s, t, v)]
            if length < 60000:
                rs = reverse_end(t)
                rt = reverse_end(s)
                rv = reverse_end(v)
                ug_edge_to_remove.add((s, t, v))
                ug_edge_to_remove.add((rs, rt, rv))
    for s, t, v in list(ug_edge_to_remove):
        ug2.remove_edge(s, t, key=v)
        length, score, edges, type_ = u_edge_data[(s, t, v)]
        u_edge_data[(s, t, v)] = length, score, edges, "repeat_bridge"

    ug = ug2

    # Repeat the aggresive spur filtering with slightly larger spur length.
    ug2 = identify_spurs(ug, u_edge_data, 80000)

    ug = ug2

    # Dump the final unitig table.
    with open("utg_data", "w") as f:
        for s, t, v in u_edge_data:
            length, score, path_or_edges, type_ = u_edge_data[(s, t, v)]

            if v == "NA":
                # Compound edge: members joined by "|".
                path_or_edges = "|".join(
                    [ss + "~" + vv + "~" + tt for ss, tt, vv in path_or_edges])
            else:
                path_or_edges = "~".join(path_or_edges)
            print(s, v, t, type_, length, score, path_or_edges, file=f)

    # contig construction from utgs
    c_path = construct_c_path_from_utgs(ug, u_edge_data, sg)

    free_edges = set()
    for s, t, v in ug.edges(keys=True):
        free_edges.add((s, t, v))

    ctg_id = 0

    ctg_paths = open("ctg_paths", "w")

    # Emit longer candidate contigs first (sort by descending path length).
    c_path.sort(key=lambda x: -x[3])

    for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
        length = 0
        score = 0
        length_r = 0
        score_r = 0

        # Truncate the path at the first edge already used (in either
        # orientation) by a previously emitted contig.
        non_overlapped_path = []
        non_overlapped_path_r = []
        for s, t, v in path:
            if v != "NA":
                rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
            else:
                rs, rt, rv = reverse_end(t), reverse_end(s), "NA"

            if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
                non_overlapped_path.append((s, t, v))
                non_overlapped_path_r.append((rs, rt, rv))
                length += u_edge_data[(s, t, v)][0]
                score += u_edge_data[(s, t, v)][1]
                length_r += u_edge_data[(rs, rt, rv)][0]
                score_r += u_edge_data[(rs, rt, rv)][1]
            else:
                break

        if len(non_overlapped_path) == 0:
            continue

        s0, t0, v0 = non_overlapped_path[0]
        end_node = non_overlapped_path[-1][1]

        # Emit the forward (F) and reverse (R) records for this contig.
        c_type_ = "ctg_linear" if (end_node != s0) else "ctg_circular"
        print("%06dF" % ctg_id, c_type_, s0 + "~" + v0 + "~" +
              t0, end_node, length, score, "|".join(
                  [c[0] + "~" + c[2] + "~" + c[1] for c in non_overlapped_path]), file=ctg_paths)

        non_overlapped_path_r.reverse()
        s0, t0, v0 = non_overlapped_path_r[0]
        end_node = non_overlapped_path_r[-1][1]
        print("%06dR" % ctg_id, c_type_, s0 + "~" + v0 + "~" +
              t0, end_node, length_r, score_r, "|".join(
                  [c[0] + "~" + c[2] + "~" + c[1] for c in non_overlapped_path_r]), file=ctg_paths)

        ctg_id += 1
        # Mark both orientations' edges as consumed.
        for e in non_overlapped_path:
            if e in free_edges:
                free_edges.remove(e)
        for e in non_overlapped_path_r:
            if e in free_edges:
                free_edges.remove(e)

    # Finally, emit the circular unitigs that were set aside earlier.
    for s, t, v in list(circular_path):
        length, score, path, type_ = u_edge_data[(s, t, v)]
        print("%6d" % ctg_id, "ctg_circular", s +
              "~" + v + "~" + t, t, length, score, s + "~" + v + "~" + t, file=ctg_paths)
        ctg_id += 1

    ctg_paths.close()
  1290. def main(argv=sys.argv):
  1291. import argparse
  1292. parser = argparse.ArgumentParser(description='a example string graph assembler that is desinged for handling diploid genomes',
  1293. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  1294. parser.add_argument(
  1295. '--overlap-file', default='preads.ovl',
  1296. help='a file that contains the overlap information.')
  1297. parser.add_argument(
  1298. '--min_len', type=int, default=4000,
  1299. help=argparse.SUPPRESS)
  1300. parser.add_argument(
  1301. '--min-len', type=int, default=4000,
  1302. help='minimum length of the reads to be considered for assembling')
  1303. parser.add_argument(
  1304. '--min_idt', type=float, default=96,
  1305. help=argparse.SUPPRESS)
  1306. parser.add_argument(
  1307. '--min-idt', type=float, default=96,
  1308. help='minimum alignment identity of the reads to be considered for assembling')
  1309. parser.add_argument(
  1310. '--lfc', action="store_true", default=False,
  1311. help='use local flow constraint method rather than best overlap method to resolve knots in string graph')
  1312. parser.add_argument(
  1313. '--disable_chimer_bridge_removal', action="store_true", default=False,
  1314. help=argparse.SUPPRESS)
  1315. parser.add_argument(
  1316. '--disable-chimer-bridge-removal', action="store_true", default=False,
  1317. help='disable chimer induced bridge removal')
  1318. args = parser.parse_args(argv[1:])
  1319. ovlp_to_graph(args)
# Script entry point: run the assembler with the process's command line.
if __name__ == "__main__":
    main(sys.argv)