graph_to_contig.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. """
  2. TODO: (from convo w/ Ivan)
  3. the issue with this script (but would still like to re-read it to refresh my memory). The script loads all edge sequences and tries to do two things at once: create p_ctg and a_ctg sequences, and align the bubbles using those sequences
  4. If we generate:
  5. 1. All paths first (as tiling paths) for all p_ctg and all a_ctg without loading sequences - this should not consume much space (take a look at *_tiling_paths files).
  6. 2. Load the first read of each tiling path fully, and only edge sequences for every transition, we can generate the output sequences with the same memory/disk consumption.
  7. 3. Align bubbles after that.
  8. Our resource consumption should be same
  9. Bubbles?
  10. It aligns them to produce the identity score
  11. After that the dedup_a_tigs.py script is used to deduplicate fake a_ctg.
  12. But that script is simple, and only depends on the alignment info that the previous script stored in the a_ctg header.
  13. """
  14. from builtins import zip
  15. from builtins import range
  16. import argparse
  17. import logging
  18. import sys
  19. import networkx as nx
  20. from ..FastaReader import open_fasta_reader
  21. from ..io import open_progress
  22. RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
  23. def log(msg):
  24. sys.stderr.write(msg)
  25. sys.stderr.write('\n')
  26. def rc(seq):
  27. return "".join([RCMAP[c] for c in seq[::-1]])
  28. def reverse_end(node_id):
  29. node_id, end = node_id.split(":")
  30. new_end = "B" if end == "E" else "E"
  31. return node_id + ":" + new_end
  32. def yield_first_seq(one_path_edges, seqs):
  33. if one_path_edges and one_path_edges[0][0] != one_path_edges[-1][1]:
  34. # If non-empty, and non-circular,
  35. # prepend the entire first read.
  36. (vv, ww) = one_path_edges[0]
  37. (vv_rid, vv_letter) = vv.split(":")
  38. if vv_letter == 'E':
  39. first_seq = seqs[vv_rid]
  40. else:
  41. assert vv_letter == 'B'
  42. first_seq = "".join([RCMAP[c] for c in seqs[vv_rid][::-1]])
  43. yield first_seq
  44. def compose_ctg(seqs, edge_data, ctg_id, path_edges, proper_ctg):
  45. total_score = 0
  46. total_length = 0
  47. edge_lines = []
  48. sub_seqs = []
  49. # If required, add the first read to the path sequence.
  50. if proper_ctg:
  51. sub_seqs = list(yield_first_seq(path_edges, seqs))
  52. total_length = 0 if len(sub_seqs) == 0 else len(sub_seqs[0])
  53. # Splice-in the rest of the path sequence.
  54. for vv, ww in path_edges:
  55. rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
  56. sub_seqs.append(e_seq)
  57. edge_lines.append('%s %s %s %s %d %d %d %0.2f' % (
  58. ctg_id, vv, ww, rid, s, t, aln_score, idt))
  59. total_length += abs(s - t)
  60. total_score += aln_score
  61. return edge_lines, sub_seqs, total_score, total_length
  62. def run(improper_p_ctg, proper_a_ctg, preads_fasta_fn, sg_edges_list_fn, utg_data_fn, ctg_paths_fn):
  63. """improper==True => Neglect the initial read.
  64. We used to need that for unzip.
  65. """
  66. reads_in_layout = set()
  67. with open_progress(sg_edges_list_fn) as f:
  68. for l in f:
  69. l = l.strip().split()
  70. """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
  71. v, w, rid, s, t, aln_score, idt, type_ = l
  72. if type_ != "G":
  73. continue
  74. r1 = v.split(":")[0]
  75. reads_in_layout.add(r1)
  76. r2 = w.split(":")[0]
  77. reads_in_layout.add(r2)
  78. seqs = {}
  79. # load all p-read name into memory
  80. with open_fasta_reader(preads_fasta_fn) as f:
  81. for r in f:
  82. if r.name not in reads_in_layout:
  83. continue
  84. seqs[r.name] = r.sequence.upper() # name == rid-string
  85. edge_data = {}
  86. with open_progress(sg_edges_list_fn) as f:
  87. for l in f:
  88. l = l.strip().split()
  89. """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
  90. v, w, rid, s, t, aln_score, idt, type_ = l
  91. if type_ != "G":
  92. continue
  93. r1, dir1 = v.split(":")
  94. reads_in_layout.add(r1) # redundant, but harmless
  95. r2, dir2 = w.split(":")
  96. reads_in_layout.add(r2) # redundant, but harmless
  97. s = int(s)
  98. t = int(t)
  99. aln_score = int(aln_score)
  100. idt = float(idt)
  101. if s < t:
  102. e_seq = seqs[rid][s:t]
  103. assert 'E' == dir2
  104. else:
  105. # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
  106. # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
  107. e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
  108. assert 'B' == dir2
  109. edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)
  110. utg_data = {}
  111. with open_progress(utg_data_fn) as f:
  112. for l in f:
  113. l = l.strip().split()
  114. s, v, t, type_, length, score, path_or_edges = l
  115. if type_ not in ["compound", "simple", "contained"]:
  116. continue
  117. length = int(length)
  118. score = int(score)
  119. if type_ in ("simple", "contained"):
  120. path_or_edges = path_or_edges.split("~")
  121. else:
  122. path_or_edges = [tuple(e.split("~"))
  123. for e in path_or_edges.split("|")]
  124. utg_data[(s, v, t)] = type_, length, score, path_or_edges
  125. p_ctg_out = open("p_ctg.fa", "w")
  126. a_ctg_out = open("a_ctg_all.fa", "w")
  127. p_ctg_t_out = open("p_ctg_tiling_path", "w")
  128. a_ctg_t_out = open("a_ctg_all_tiling_path", "w")
  129. layout_ctg = set()
  130. with open_progress(ctg_paths_fn) as f:
  131. for l in f:
  132. l = l.strip().split()
  133. ctg_id, c_type_, i_utig, t0, length, score, utgs = l
  134. ctg_id = ctg_id
  135. s0 = i_utig.split("~")[0]
  136. if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
  137. continue
  138. else:
  139. layout_ctg.add((s0, t0))
  140. ctg_label = i_utig + "~" + t0
  141. length = int(length)
  142. utgs = utgs.split("|")
  143. one_path = []
  144. total_score = 0
  145. total_length = 0
  146. #a_ctg_data = []
  147. a_ctg_group = {}
  148. for utg in utgs:
  149. s, v, t = utg.split("~")
  150. type_, length, score, path_or_edges = utg_data[(s, v, t)]
  151. total_score += score
  152. total_length += length
  153. if type_ == "simple":
  154. if len(one_path) != 0:
  155. one_path.extend(path_or_edges[1:])
  156. else:
  157. one_path.extend(path_or_edges)
  158. if type_ == "compound":
  159. c_graph = nx.DiGraph()
  160. all_alt_path = []
  161. for ss, vv, tt in path_or_edges:
  162. type_, length, score, sub_path = utg_data[(ss, vv, tt)]
  163. v1 = sub_path[0]
  164. for v2 in sub_path[1:]:
  165. c_graph.add_edge(
  166. v1, v2, e_score=edge_data[(v1, v2)][3])
  167. v1 = v2
  168. shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
  169. score = nx.shortest_path_length(c_graph, s, t, "e_score")
  170. all_alt_path.append((score, shortest_path))
  171. # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
  172. while 1:
  173. n0 = shortest_path[0]
  174. for n1 in shortest_path[1:]:
  175. c_graph.remove_edge(n0, n1)
  176. n0 = n1
  177. try:
  178. shortest_path = nx.shortest_path(
  179. c_graph, s, t, "e_score")
  180. score = nx.shortest_path_length(
  181. c_graph, s, t, "e_score")
  182. #a_ctg_data.append( (s, t, shortest_path) )
  183. all_alt_path.append((score, shortest_path))
  184. except nx.exception.NetworkXNoPath:
  185. break
  186. # if len(shortest_path) < 2:
  187. # break
  188. # Is sorting required, if we are appending the shortest paths in order?
  189. all_alt_path.sort()
  190. all_alt_path.reverse()
  191. shortest_path = all_alt_path[0][1]
  192. # The longest branch in the compound unitig is added to the primary path.
  193. if len(one_path) != 0:
  194. one_path.extend(shortest_path[1:])
  195. else:
  196. one_path.extend(shortest_path)
  197. a_ctg_group[(s, t)] = all_alt_path
  198. if len(one_path) == 0:
  199. continue
  200. one_path_edges = list(zip(one_path[:-1], one_path[1:]))
  201. # Compose the primary contig.
  202. p_edge_lines, p_ctg_seq_chunks, p_total_score, p_total_length = compose_ctg(seqs, edge_data, ctg_id, one_path_edges, (not improper_p_ctg))
  203. # Write out the tiling path.
  204. p_ctg_t_out.write('\n'.join(p_edge_lines))
  205. p_ctg_t_out.write('\n')
  206. # Write the sequence.
  207. # Using the `total_score` instead of `p_total_score` intentionally. Sum of
  208. # edge scores is not identical to sum of unitig scores.
  209. p_ctg_out.write('>%s %s %s %d %d\n' % (ctg_id, ctg_label, c_type_, p_total_length, total_score))
  210. p_ctg_out.write(''.join(p_ctg_seq_chunks))
  211. p_ctg_out.write('\n')
  212. a_id = 0
  213. for v, w in a_ctg_group:
  214. atig_output = []
  215. # Compose the base sequence.
  216. for sub_id in range(len(a_ctg_group[(v, w)])):
  217. score, atig_path = a_ctg_group[(v, w)][sub_id]
  218. atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
  219. a_ctg_id = '%s-%03d-%02d' % (ctg_id, a_id + 1, sub_id)
  220. a_edge_lines, sub_seqs, a_total_score, a_total_length = compose_ctg(
  221. seqs, edge_data, a_ctg_id, atig_path_edges, proper_a_ctg)
  222. seq = ''.join(sub_seqs)
  223. # Keep the placeholder for these values for legacy purposes, but mark
  224. # them as for deletion.
  225. # The base a_ctg will also be output to the same file, for simplicity.
  226. delta_len = 0
  227. idt = 1.0
  228. cov = 1.0
  229. atig_output.append((v, w, atig_path, a_total_length, a_total_score, seq, atig_path_edges, a_ctg_id, a_edge_lines, delta_len, idt, cov))
  230. if len(atig_output) == 1:
  231. continue
  232. for sub_id, data in enumerate(atig_output):
  233. v, w, tig_path, a_total_length, a_total_score, seq, atig_path_edges, a_ctg_id, a_edge_lines, delta_len, a_idt, cov = data
  234. # Write out the tiling path.
  235. a_ctg_t_out.write('\n'.join(a_edge_lines))
  236. a_ctg_t_out.write('\n')
  237. # Write the sequence.
  238. a_ctg_out.write('>%s %s %s %d %d %d %d %0.2f %0.2f\n' % (a_ctg_id, v, w, a_total_length, a_total_score, len(atig_path_edges), delta_len, idt, cov))
  239. a_ctg_out.write(''.join(seq))
  240. a_ctg_out.write('\n')
  241. a_id += 1
  242. a_ctg_out.close()
  243. p_ctg_out.close()
  244. a_ctg_t_out.close()
  245. p_ctg_t_out.close()
  246. def main(argv=sys.argv):
  247. description = 'Generate the primary and alternate contig fasta files and tiling paths, given the string graph.'
  248. epilog = """
  249. We write these:
  250. p_ctg_out = open("p_ctg.fa", "w")
  251. a_ctg_out = open("a_ctg_all.fa", "w")
  252. p_ctg_t_out = open("p_ctg_tiling_path", "w")
  253. a_ctg_t_out = open("a_ctg_all_tiling_path", "w")
  254. """
  255. parser = argparse.ArgumentParser(
  256. description=description,
  257. formatter_class=argparse.RawDescriptionHelpFormatter,
  258. epilog=epilog)
  259. parser.add_argument('--improper-p-ctg', action='store_true',
  260. help='Skip the initial read in each p_ctg path.')
  261. parser.add_argument('--proper-a-ctg', action='store_true',
  262. help='Skip the initial read in each a_ctg path.')
  263. parser.add_argument('--preads-fasta-fn', type=str,
  264. default='./preads4falcon.fasta',
  265. help='Input. Preads file, required to construct the contigs.')
  266. parser.add_argument('--sg-edges-list-fn', type=str,
  267. default='./sg_edges_list',
  268. help='Input. File containing string graph edges, produced by ovlp_to_graph.py.')
  269. parser.add_argument('--utg-data-fn', type=str,
  270. default='./utg_data',
  271. help='Input. File containing unitig data, produced by ovlp_to_graph.py.')
  272. parser.add_argument('--ctg-paths-fn', type=str,
  273. default='./ctg_paths',
  274. help='Input. File containing contig paths, produced by ovlp_to_graph.py.')
  275. args = parser.parse_args(argv[1:])
  276. run(**vars(args))
  277. if __name__ == "__main__":
  278. logging.basicConfig(level=logging.INFO)
  279. main(sys.argv)