collect_pread_gfa.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. import argparse
  2. import os
  3. import sys
  4. import json
  5. from falcon_kit.fc_asm_graph import AsmGraph
  6. from falcon_kit.FastaReader import FastaReader
  7. from falcon_kit.gfa_graph import *
  8. import falcon_kit.tiling_path
  9. def load_seqs(fasta_fn, store_only_seq_len):
  10. """
  11. If store_only_seq_len is True, then the seq is discarded and
  12. only it's length stored.
  13. """
  14. seqs = {}
  15. f = FastaReader(fasta_fn)
  16. if store_only_seq_len == False:
  17. for r in f:
  18. seqs[r.name.split()[0]] = (len(r.sequence), r.sequence.upper())
  19. else:
  20. for r in f:
  21. seqs[r.name.split()[0]] = (len(r.sequence), '*')
  22. return seqs
  23. def load_pread_overlaps(fp_in):
  24. preads_overlap_dict = {}
  25. for line in fp_in:
  26. sl = line.strip().split()
  27. if len(sl) < 13:
  28. continue
  29. # Example line: 000000009 000000082 -3004 99.90 0 4038 7043 7043 1 6488 9492 9492 overlap 000000F.5000003.0 000000F.5000003.0
  30. preads_overlap_dict[(sl[0], sl[1])] = sl[0:4] + [int(val) for val in sl[4:12]] + sl[12:]
  31. # Overlaps are not always symmetrically represented in the preads.ovl for some reason, so add the
  32. # reverse overlap here as well, but do not overwrite existing (just to be safe).
  33. if (sl[1], sl[0]) not in preads_overlap_dict:
  34. preads_overlap_dict[(sl[1], sl[0])] = [sl[1], sl[0], sl[2], sl[3]] + [int(val) for val in sl[8:12]] + [int(val) for val in sl[4:8]] + sl[12:]
  35. return preads_overlap_dict
  36. def load_sg_edges(fp_in):
  37. """
  38. Loads all sg_edges_list so that haplotig paths can be reversed if needed.
  39. with open(os.path.join(fc_asm_path, "sg_edges_list"), 'r') as fp:
  40. sg_edges_dict = load_sg_edges(fp)
  41. """
  42. sg_edges_dict = {}
  43. for line in fp_in:
  44. sl = line.strip().split()
  45. if len(sl) < 8:
  46. continue
  47. # Example line: 000000512:B 000000679:E 000000679 4290 7984 4290 99.95 TR
  48. sg_edges_dict[(sl[0], sl[1])] = sl[0:3] + [int(val) for val in sl[3:6]] + [float(sl[6])] + sl[7:]
  49. return sg_edges_dict
  50. def add_node(gfa_graph, v, preads_dict):
  51. v_name, v_orient = v.split(':')
  52. v_len, v_seq = preads_dict[v_name]
  53. gfa_graph.add_node(v_name, v_len, v_seq)
  54. def add_edge(gfa_graph, v, w, edge_split_line, preads_overlap_dict, sg_edges_dict):
  55. edge_name = 'edge-%d' % (len(gfa_graph.edges))
  56. v_name, v_orient = v.split(':')
  57. w_name, w_orient = w.split(':')
  58. v_orient = '+' if v_orient == 'E' else '-'
  59. w_orient = '+' if w_orient == 'E' else '-'
  60. cigar = '*'
  61. # Get the SG edge and the overlap, and set the tags and labels.
  62. sg_edge = sg_edges_dict[(v, w)]
  63. overlap = preads_overlap_dict[(v_name, w_name)]
  64. labels = {'tp': edge_split_line, 'sg_edge': sg_edge, 'overlap': overlap}
  65. tags = {}
  66. # Example overlap:
  67. # 000000001 000000170 -6104 99.75 0 1909 8010 8010 1 1250 7354 7354 overlap 000000F.5000003.0 000000F.5000003.0
  68. # Handle the overlap coordinates - GFA format requires the coordinates to be with
  69. # respect to the fwd strand, and the M4 format reports overlaps on the
  70. # strand of the alignment.
  71. _, _, score, idt, v_rev, v_start, v_end, v_len, w_rev, w_start, w_end, w_len = overlap[0:12]
  72. if v_rev == 1:
  73. v_start, v_end = v_end, v_start
  74. v_start = v_len - v_start
  75. v_end = v_len - v_end
  76. if w_rev == 1:
  77. w_start, w_end = w_end, w_start
  78. w_start = w_len - w_start
  79. w_end = w_len - w_end
  80. gfa_graph.add_edge(edge_name, v_name, v_orient, w_name, w_orient, v_start, v_end, w_start, w_end, cigar, tags = tags, labels = labels)
  81. def add_tiling_paths_to_gfa(gfa_graph, tiling_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
  82. # Add nodes.
  83. for ctg_id, tiling_path in tiling_paths.items():
  84. for edge in tiling_path.edges:
  85. add_node(gfa_graph, edge.v, preads_dict)
  86. add_node(gfa_graph, edge.w, preads_dict)
  87. # Add edges.
  88. for ctg_id, tiling_path in tiling_paths.items():
  89. for edge in tiling_path.edges:
  90. add_edge(gfa_graph, edge.v, edge.w, edge.get_split_line(), preads_overlap_dict, sg_edges_dict)
  91. # Add path.
  92. for ctg_id, tiling_path in tiling_paths.items():
  93. path_nodes = []
  94. path_cigars = []
  95. if len(tiling_path.edges) == 0:
  96. continue
  97. # Add the first node to the path.
  98. v = tiling_path.edges[0].v
  99. v_name, v_orient = v.split(':')
  100. cigar = '%dM' % (tiling_path.coords[v]) # This will be 0 if the contig is improper, and length of v otherwise.
  101. path_nodes.append(v_name)
  102. path_cigars.append(cigar)
  103. # Add the rest of the nodes.
  104. for edge in tiling_path.edges:
  105. w_name, w_orient = edge.w.split(':')
  106. cigar = '%dM' % (abs(edge.e - edge.b))
  107. path_nodes.append(w_name)
  108. path_cigars.append(cigar)
  109. gfa_graph.add_path(ctg_id, path_nodes, path_cigars)
  110. def add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
  111. asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
  112. for v, w in asm_graph.sg_edges:
  113. add_node(gfa_graph, v, preads_dict)
  114. add_node(gfa_graph, w, preads_dict)
  115. for v, w in asm_graph.sg_edges:
  116. edge_data = asm_graph.sg_edges[(v, w)]
  117. if edge_data[-1] != 'G':
  118. continue
  119. add_edge(gfa_graph, v, w, edge_data, preads_overlap_dict, sg_edges_dict)
  120. def run(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
  121. preads_fasta, p_ctg_fasta, a_ctg_fasta,
  122. sg_edges_list, preads_ovl, utg_data, ctg_paths,
  123. add_string_graph, write_reads,
  124. min_p_len, min_a_len, only_these_contigs):
  125. """
  126. This method produces a GFAGraph object containing info required
  127. to write both the GFA-1 and GFA-2 formatted assemblies.
  128. However, it does not write the GFA formats directly, but instead
  129. dumps a JSON file to disk.
  130. The JSON file is converted to a GFA-1 or a GFA-2 with outside scripts.
  131. The graphical output is produced from either the entire string
  132. graph (only the non-filtered edges are considered) or from only
  133. the tiling paths. String graph can show the neighborhood of contig
  134. breaks, whereas the tiling path output is more sparse.
  135. Output is written to stdout.
  136. """
  137. gfa_graph = GFAGraph()
  138. # Load preads.
  139. preads_dict = load_seqs(preads_fasta, (not write_reads))
  140. # Load the pread overlaps
  141. with open(preads_ovl, 'r') as fp:
  142. preads_overlap_dict = load_pread_overlaps(fp)
  143. # Load the SG edges.
  144. with open(sg_edges_list, 'r') as fp:
  145. sg_edges_dict = load_sg_edges(fp)
  146. # Load the primary and associate contig files.
  147. p_ctg_seqs = load_seqs(p_ctg_fasta, True)
  148. a_ctg_seqs = load_seqs(a_ctg_fasta, True)
  149. # Collect the sequence lengths from the above dicts.
  150. p_ctg_lens = {key: val[0] for key, val in p_ctg_seqs.items()}
  151. a_ctg_lens = {key: val[0] for key, val in a_ctg_seqs.items()}
  152. # Create whitelists for filtering contigs.
  153. p_ctg_whitelist = set(p_ctg_seqs.keys())
  154. a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys())])
  155. if only_these_contigs:
  156. p_ctg_whitelist = set(open(only_these_contigs).read().splitlines()) & set(p_ctg_whitelist)
  157. a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys()) if key.split('-')[0].split('_')[0] in p_ctg_whitelist])
  158. # Load the tiling paths and assign coordinates.
  159. p_paths = falcon_kit.tiling_path.load_tiling_paths(p_ctg_tiling_path, whitelist_seqs=p_ctg_whitelist, contig_lens=p_ctg_lens)
  160. a_paths = falcon_kit.tiling_path.load_tiling_paths(a_ctg_tiling_path, whitelist_seqs=a_ctg_whitelist, contig_lens=a_ctg_lens)
  161. add_tiling_paths_to_gfa(gfa_graph, p_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
  162. add_tiling_paths_to_gfa(gfa_graph, a_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
  163. if add_string_graph:
  164. add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
  165. fp_out.write(serialize_gfa(gfa_graph))
  166. fp_out.write('\n')
  167. def parse_args(argv):
  168. parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
  169. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  170. parser.add_argument('--p-ctg-tiling-path', type=str, default='p_ctg_tiling_path',
  171. help='location of the p_ctg tiling path file')
  172. parser.add_argument('--a-ctg-tiling-path', type=str, default='a_ctg_tiling_path',
  173. help='location of the a_ctg tiling path file')
  174. parser.add_argument('--preads-fasta', type=str, default='preads4falcon.fasta',
  175. help='path to the preads4falcon.fasta file')
  176. parser.add_argument('--p-ctg-fasta', type=str, default='p_ctg.fa',
  177. help='path to the primary contigs file')
  178. parser.add_argument('--a-ctg-fasta', type=str, default='a_ctg.fa',
  179. help='path to the associate contigs file')
  180. parser.add_argument('--sg-edges-list', type=str, default='sg_edges_list',
  181. help='string graph edges file from Falcon assembly')
  182. parser.add_argument('--preads-ovl', type=str, default='preads.ovl',
  183. help='the preads overlap file')
  184. parser.add_argument('--utg-data', type=str,
  185. default='utg_data', help='unitig data file from Falcon')
  186. parser.add_argument('--ctg-paths', type=str, default='ctg_paths',
  187. help='contig paths file from Falcon assembly')
  188. parser.add_argument('--add-string-graph', action='store_true',
  189. help="in addition to tiling paths, output other edges and nodes from the final string graph")
  190. parser.add_argument('--write-reads', '-r', action='store_true',
  191. help="output read sequences in S lines")
  192. # parser.add_argument('--write-contigs', '-c', action='store_true',
  193. # help="output contig sequences as S lines")
  194. parser.add_argument('--min-p-len', type=int, default=0,
  195. help='primary contig paths with length smaller than this will not be reported')
  196. parser.add_argument('--min-a-len', type=int, default=0,
  197. help='associate contig paths with length smaller than this will not be reported')
  198. parser.add_argument('--only-these-contigs', type=str, default='',
  199. help='limit output to specified contigs listed in file (one per line)')
  200. args = parser.parse_args(argv[1:])
  201. return args
  202. def main(argv=sys.argv):
  203. args = parse_args(argv)
  204. run(sys.stdout, **vars(args))
  205. if __name__ == '__main__': # pragma: no cover
  206. main()