collect_contig_gfa.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. import argparse
  2. import os
  3. import sys
  4. import json
  5. from falcon_kit.gfa_graph import GFAGraph, serialize_gfa, deserialize_gfa
  6. import falcon_kit.mains.collect_pread_gfa
  7. import falcon_kit.tiling_path
  8. def run(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
  9. p_ctg_fasta, a_ctg_fasta,
  10. write_contigs,
  11. min_p_len, min_a_len, only_these_contigs):
  12. gfa_graph = GFAGraph()
  13. # Load the primary and associate contig files.
  14. p_ctg_dict = falcon_kit.mains.collect_pread_gfa.load_seqs(p_ctg_fasta, (not write_contigs))
  15. p_ctg_lens = {key: val[0] for key, val in p_ctg_dict.items()}
  16. p_ctg_seqs = {key: val[1] for key, val in p_ctg_dict.items()}
  17. a_ctg_dict = falcon_kit.mains.collect_pread_gfa.load_seqs(a_ctg_fasta, (not write_contigs))
  18. a_ctg_lens = {key: val[0] for key, val in a_ctg_dict.items()}
  19. a_ctg_seqs = {key: val[1] for key, val in a_ctg_dict.items()}
  20. # Create whitelists for filtering contigs.
  21. p_ctg_whitelist = set(p_ctg_seqs.keys())
  22. a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys())])
  23. if only_these_contigs:
  24. p_ctg_whitelist = set(open(only_these_contigs).read().splitlines()) & set(p_ctg_whitelist)
  25. a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys()) if key.split('-')[0].split('_')[0] in p_ctg_whitelist])
  26. # Load the tiling paths and assign coordinates.
  27. p_paths = falcon_kit.tiling_path.load_tiling_paths(p_ctg_tiling_path, whitelist_seqs=p_ctg_whitelist, contig_lens=p_ctg_lens)
  28. a_paths = falcon_kit.tiling_path.load_tiling_paths(a_ctg_tiling_path, whitelist_seqs=a_ctg_whitelist, contig_lens=a_ctg_lens)
  29. # Find the associate contig placement. `a_placement` is a dict:
  30. # placement[p_ctg_id][a_ctg_id] = (start, end, p_ctg_id, a_ctg_id, first_node, last_node)
  31. a_placement = falcon_kit.tiling_path.find_a_ctg_placement(p_paths, a_paths)
  32. # Add the nodes.
  33. for ctg_id, tiling_path in p_paths.items():
  34. gfa_graph.add_node(ctg_id, p_ctg_lens[ctg_id], p_ctg_seqs[ctg_id])
  35. for ctg_id, tiling_path in a_paths.items():
  36. gfa_graph.add_node(ctg_id, a_ctg_lens[ctg_id], a_ctg_seqs[ctg_id])
  37. # Add edges between primary and associate contigs.
  38. for p_ctg_id, a_dict in a_placement.items():
  39. for a_ctg_id, placement in a_dict.items():
  40. start, end, p_ctg_id, a_ctg_id, first_node, last_node = placement
  41. a_ctg_len = a_ctg_lens[a_ctg_id]
  42. # edge_name = 'edge-%d-out-%s-to-%s' % (len(gfa_graph.edges), a_ctg_id, p_ctg_id)
  43. edge_name = 'edge-%d' % (len(gfa_graph.edges))
  44. gfa_graph.add_edge(edge_name, p_ctg_id, '+', a_ctg_id, '+', start, start, 0, 0, '*', tags = {}, labels = {})
  45. # edge_name = 'edge-%d-in-%s-to-%s' % (len(gfa_graph.edges), a_ctg_id, p_ctg_id)
  46. edge_name = 'edge-%d' % (len(gfa_graph.edges))
  47. gfa_graph.add_edge(edge_name, a_ctg_id, '+', p_ctg_id, '+', a_ctg_len, a_ctg_len, end, end, '*', tags = {}, labels = {})
  48. # Add circular edges to the primary contigs, if they exist.
  49. for ctg_id, tiling_path in p_paths.items():
  50. if len(tiling_path.edges) == 0:
  51. continue
  52. if tiling_path.edges[0].v != tiling_path.edges[-1].w:
  53. continue
  54. p_len = p_ctg_lens[ctg_id]
  55. edge_name = 'edge-%d' % (len(gfa_graph.edges))
  56. gfa_graph.add_edge(edge_name, ctg_id, '+', ctg_id, '+', p_len, p_len, 0, 0, '*', tags = {}, labels = {})
  57. fp_out.write(serialize_gfa(gfa_graph))
  58. fp_out.write('\n')
  59. def parse_args(argv):
  60. parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
  61. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  62. parser.add_argument('--p-ctg-tiling-path', type=str, default='p_ctg_tiling_path',
  63. help='location of the p_ctg tiling path file')
  64. parser.add_argument('--a-ctg-tiling-path', type=str, default='a_ctg_tiling_path',
  65. help='location of the a_ctg tiling path file')
  66. parser.add_argument('--p-ctg-fasta', type=str, default='p_ctg.fa',
  67. help='path to the primary contigs file')
  68. parser.add_argument('--a-ctg-fasta', type=str, default='a_ctg.fa',
  69. help='path to the associate contigs file')
  70. parser.add_argument('--write-contigs', '-c', action='store_true',
  71. help="output contig sequences as S lines")
  72. parser.add_argument('--min-p-len', type=int, default=0,
  73. help='primary contig paths with length smaller than this will not be reported')
  74. parser.add_argument('--min-a-len', type=int, default=0,
  75. help='associate contig paths with length smaller than this will not be reported')
  76. parser.add_argument('--only-these-contigs', type=str, default='',
  77. help='limit output to specified contigs listed in file (one per line)')
  78. args = parser.parse_args(argv[1:])
  79. return args
  80. def main(argv=sys.argv):
  81. args = parse_args(argv)
  82. run(sys.stdout, **vars(args))
  83. if __name__ == '__main__': # pragma: no cover
  84. main()