graph_to_utgs.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. from builtins import zip
  2. from builtins import range
  3. from falcon_kit import kup, falcon, DWA
  4. from falcon_kit.fc_asm_graph import AsmGraph
  5. import networkx as nx
  6. import sys
  7. RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
  8. def rc(seq):
  9. return "".join([RCMAP[c] for c in seq[::-1]])
  10. def get_aln_data(t_seq, q_seq):
  11. aln_data = []
  12. K = 8
  13. seq0 = t_seq
  14. lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
  15. sa_ptr = kup.allocate_seq(len(seq0))
  16. sda_ptr = kup.allocate_seq_addr(len(seq0))
  17. kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
  18. q_id = "dummy"
  19. kmer_match_ptr = kup.find_kmer_pos_for_seq(
  20. q_seq, len(q_seq), K, sda_ptr, lk_ptr)
  21. kmer_match = kmer_match_ptr[0]
  22. aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
  23. aln_range = aln_range_ptr[0]
  24. x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
  25. for i in range(kmer_match.count)]))
  26. kup.free_kmer_match(kmer_match_ptr)
  27. s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
  28. if e1 - s1 > 100:
  29. alignment = DWA.align(q_seq[s1:e1], e1 - s1,
  30. seq0[s2:e2], e2 - s2,
  31. 1500, 1)
  32. if alignment[0].aln_str_size > 100:
  33. aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
  34. seq0), alignment[0].aln_str_size, alignment[0].dist))
  35. aln_str1 = alignment[0].q_aln_str
  36. aln_str0 = alignment[0].t_aln_str
  37. DWA.free_alignment(alignment)
  38. kup.free_kmer_lookup(lk_ptr)
  39. kup.free_seq_array(sa_ptr)
  40. kup.free_seq_addr_array(sda_ptr)
  41. return aln_data, x, y
  42. def main(argv=sys.argv):
  43. G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
  44. G_asm.load_sg_seq("preads4falcon.fasta")
  45. utg_out = open("utgs.fa", "w")
  46. for utg in G_asm.utg_data:
  47. s, t, v = utg
  48. type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
  49. if type_ == "simple":
  50. path_or_edges = path_or_edges.split("~")
  51. seq = G_asm.get_seq_from_path(path_or_edges)
  52. print(">%s~%s~%s-%d %d %d" % (
  53. s, v, t, 0, length, score), file=utg_out)
  54. print(seq, file=utg_out)
  55. if type_ == "compound":
  56. c_graph = nx.DiGraph()
  57. all_alt_path = []
  58. path_or_edges = [c.split("~") for c in path_or_edges.split("|")]
  59. for ss, vv, tt in path_or_edges:
  60. type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)]
  61. sub_path = sub_path.split("~")
  62. v1 = sub_path[0]
  63. for v2 in sub_path[1:]:
  64. c_graph.add_edge(
  65. v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1])
  66. v1 = v2
  67. shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
  68. score = nx.shortest_path_length(c_graph, s, t, "e_score")
  69. all_alt_path.append((score, shortest_path))
  70. # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
  71. while 1:
  72. if s == t:
  73. break
  74. n0 = shortest_path[0]
  75. for n1 in shortest_path[1:]:
  76. c_graph.remove_edge(n0, n1)
  77. n0 = n1
  78. try:
  79. shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
  80. score = nx.shortest_path_length(c_graph, s, t, "e_score")
  81. #a_ctg_data.append( (s, t, shortest_path) )
  82. all_alt_path.append((score, shortest_path))
  83. except nx.exception.NetworkXNoPath:
  84. break
  85. # if len(shortest_path) < 2:
  86. # break
  87. all_alt_path.sort()
  88. all_alt_path.reverse()
  89. shortest_path = all_alt_path[0][1]
  90. score, atig_path = all_alt_path[0]
  91. atig_output = []
  92. atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
  93. sub_seqs = []
  94. total_length = 0
  95. total_score = 0
  96. for vv, ww in atig_path_edges:
  97. r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)]
  98. e_seq = G_asm.sg_edge_seqs[(vv, ww)]
  99. rid, ss, tt = r
  100. sub_seqs.append(e_seq)
  101. total_length += abs(ss - tt)
  102. total_score += aln_score
  103. base_seq = "".join(sub_seqs)
  104. atig_output.append(
  105. (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1))
  106. duplicated = True
  107. for score, atig_path in all_alt_path[1:]:
  108. atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
  109. sub_seqs = []
  110. total_length = 0
  111. total_score = 0
  112. for vv, ww in atig_path_edges:
  113. r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)]
  114. e_seq = G_asm.sg_edge_seqs[(vv, ww)]
  115. rid, ss, tt = r
  116. sub_seqs.append(e_seq)
  117. total_length += abs(ss - tt)
  118. total_score += aln_score
  119. seq = "".join(sub_seqs)
  120. aln_data, x, y = get_aln_data(base_seq, seq)
  121. if len(aln_data) != 0:
  122. idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
  123. cov = 1.0 * (aln_data[-1][3] -
  124. aln_data[-1][2]) / aln_data[-1][4]
  125. if idt < 0.96 or cov < 0.98:
  126. duplicated = False
  127. atig_output.append(
  128. (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov))
  129. else:
  130. duplicated = False
  131. atig_output.append(
  132. (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0))
  133. # if len(atig_output) == 1:
  134. # continue
  135. sub_id = 0
  136. for data in atig_output:
  137. v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
  138. print(">%s~%s~%s-%d %d %d" % (
  139. v0, "NA", w0, sub_id, total_length, total_score), file=utg_out)
  140. print(seq, file=utg_out)
  141. sub_id += 1
  142. if __name__ == "__main__":
  143. main(sys.argv)