zmw_collect.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. """
  2. Performs a single pass over an input FASTA (file or streamed), and collects
  3. all ZMWs. For each ZMW it calculates the expected molecular size by picking
  4. the internal median subread length.
  5. The script writes one line per ZMW indicating:
  6. movie_zmw median_insert_length total_insert_sum num_passes
  7. Author: Ivan Sovic
  8. """
  9. from falcon_kit.mains.fasta_filter import ZMWTuple
  10. import falcon_kit.FastaReader as FastaReader
  11. import falcon_kit.mains.fasta_filter as fasta_filter
  12. import falcon_kit.io as io
  13. import os
  14. import sys
  15. import argparse
  16. import logging
  17. import contextlib
  18. import itertools
  19. LOG = logging.getLogger()
  20. def yield_record(fp_in):
  21. fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
  22. for record in fasta_records:
  23. yield record
  24. def run(fp_out, yield_zmw_tuple_func):
  25. for zmw_id, zmw_subreads in itertools.groupby(yield_zmw_tuple_func, lambda x: x.zmw_id):
  26. zmw_subreads_list = list(zmw_subreads)
  27. zrec = fasta_filter.internal_median_zmw_subread(zmw_subreads_list)
  28. movie_zmw = zrec.movie_name + '/' + zrec.zmw_id
  29. zmw_unique_molecular_size = zrec.seq_len
  30. zmw_total_size = sum([zmw.seq_len for zmw in zmw_subreads_list])
  31. zmw_num_passes = len(zmw_subreads_list)
  32. fp_out.write('{}\t{}\t{}\t{}\n'.format(movie_zmw, zmw_unique_molecular_size, zmw_total_size, zmw_num_passes))
  33. def parse_args(argv):
  34. parser = argparse.ArgumentParser(description="For a given streamed FASTA file, it collects all subreads per ZMW, "\
  35. "calculates the median insert size, and writes out a TSV file with base counts.",
  36. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  37. args = parser.parse_args(argv[1:])
  38. return args
  39. def main(argv=sys.argv):
  40. args = parse_args(argv)
  41. logging.basicConfig(level=logging.INFO)
  42. run(sys.stdout, fasta_filter.yield_zmwtuple(yield_record(sys.stdin), whitelist_set=None, store_record=False))
  43. if __name__ == "__main__": # pragma: no cover
  44. main(sys.argv) # pragma: no cover