reduce_preads.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. """
  2. Creates a reduced version of preads4falcon.fasta file by writing only the preads
  3. which are incident with 'G' edges in the final assembly graph.
  4. """
  5. import argparse
  6. import logging
  7. import sys
  8. from ..FastaReader import open_fasta_reader
  9. from ..io import open_progress
  10. default_sg_edges_list_fns = ['./sg_edges_list']
  11. def run(fp_out, preads_fasta_fn, sg_edges_list_fns):
  12. # Workaround the Argparse issue. It does not override
  13. # the default argument value when the parameter is
  14. # used in the append mode, but instead adds to the default
  15. # list. https://bugs.python.org/issue16399
  16. # Instead, we will not specify the default value, and
  17. # check if the list is emptu here here, so that the user
  18. # can specify exactly the paths to the file(s).
  19. if not sg_edges_list_fns:
  20. sg_edges_list_fns = default_sg_edges_list_fns
  21. reads_in_layout = set()
  22. for fn in sg_edges_list_fns:
  23. with open_progress(fn) as fp_in:
  24. for l in fp_in:
  25. l = l.strip().split()
  26. """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
  27. v, w, rid, s, t, aln_score, idt, type_ = l
  28. if type_ != "G":
  29. continue
  30. r1 = v.split(":")[0]
  31. reads_in_layout.add(r1)
  32. r2 = w.split(":")[0]
  33. reads_in_layout.add(r2)
  34. with open_fasta_reader(preads_fasta_fn) as f:
  35. for r in f:
  36. if r.name not in reads_in_layout:
  37. continue
  38. fp_out.write('>{}\n{}\n'.format(r.name, r.sequence.upper()))
  39. def main(argv=sys.argv):
  40. description = 'Create a reduced set of preads, with only those used in the final layout. Write to stdout.'
  41. parser = argparse.ArgumentParser(
  42. description=description,
  43. formatter_class=argparse.RawDescriptionHelpFormatter)
  44. parser.add_argument('--preads-fasta-fn', type=str,
  45. default='preads4falcon.fasta',
  46. help='Preads file, required to construct the contigs.')
  47. parser.add_argument('--sg-edges-list-fns', action='append',
  48. help='One or more files containing string graph edges, produced by ovlp_to_graph.py.')
  49. args = parser.parse_args(argv[1:])
  50. run(sys.stdout, **vars(args))
  51. if __name__ == "__main__":
  52. logging.basicConfig(level=logging.INFO)
  53. main(sys.argv)