dedup_a_tp.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. from falcon_kit.FastaReader import open_fasta_reader
  2. import argparse
  3. import sys
  4. def load_headers(fp_in):
  5. """
  6. Loads all a_ctg IDs from the a_ctg.fa, which is already deduplicated.
  7. """
  8. ret = set()
  9. for r in fp_in:
  10. a_ctg_id = r.name.split()[0]
  11. ret.add(a_ctg_id)
  12. return ret
  13. def run(fp_out, a_ctg, a_ctg_all_tiling_path):
  14. with open_fasta_reader(a_ctg) as fp_in:
  15. a_ctg_ids = load_headers(fp_in)
  16. with open(a_ctg_all_tiling_path, 'r') as fp_in:
  17. for line in fp_in:
  18. line = line.strip()
  19. if len(line) == 0: # pragma: no cover
  20. continue # pragma: no cover
  21. sl = line.split()
  22. if sl[0] not in a_ctg_ids:
  23. continue
  24. fp_out.write('%s\n' % (line))
  25. def parse_args(argv):
  26. parser = argparse.ArgumentParser(description='Extracts all tiling paths from a_ctg_all_tiling_paths for which there is a header in a_ctg.fa (which was already deduplicated).',
  27. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  28. parser.add_argument('--a-ctg', type=str,
  29. help="Path to the a_ctg.fa file.", default='a_ctg.fa')
  30. parser.add_argument('--a-ctg-all-tiling-path', type=str,
  31. help="Path to the a_ctg_all_tiling_path file.", default='a_ctg_all_tiling_path')
  32. args = parser.parse_args(argv[1:])
  33. return args
  34. def main(argv=sys.argv):
  35. args = parse_args(argv)
  36. run(sys.stdout, **vars(args))
  37. if __name__ == "__main__": # pragma: no cover
  38. main(sys.argv) # pragma: no cover