1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253 |
- """
- Performs a single pass over an input FASTA (file or streamed), and collects
- all ZMWs. For each ZMW it calculates the expected molecular size by picking
- the internal median subread length.
- The script writes one line per ZMW indicating:
- movie_zmw median_insert_length total_insert_sum num_passes
- Author: Ivan Sovic
- """
- from falcon_kit.mains.fasta_filter import ZMWTuple
- import falcon_kit.FastaReader as FastaReader
- import falcon_kit.mains.fasta_filter as fasta_filter
- import falcon_kit.io as io
- import os
- import sys
- import argparse
- import logging
- import contextlib
- import itertools
- LOG = logging.getLogger()
- def yield_record(fp_in):
- fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
- for record in fasta_records:
- yield record
- def run(fp_out, yield_zmw_tuple_func):
- for zmw_id, zmw_subreads in itertools.groupby(yield_zmw_tuple_func, lambda x: x.zmw_id):
- zmw_subreads_list = list(zmw_subreads)
- zrec = fasta_filter.internal_median_zmw_subread(zmw_subreads_list)
- movie_zmw = zrec.movie_name + '/' + zrec.zmw_id
- zmw_unique_molecular_size = zrec.seq_len
- zmw_total_size = sum([zmw.seq_len for zmw in zmw_subreads_list])
- zmw_num_passes = len(zmw_subreads_list)
- fp_out.write('{}\t{}\t{}\t{}\n'.format(movie_zmw, zmw_unique_molecular_size, zmw_total_size, zmw_num_passes))
- def parse_args(argv):
- parser = argparse.ArgumentParser(description="For a given streamed FASTA file, it collects all subreads per ZMW, "\
- "calculates the median insert size, and writes out a TSV file with base counts.",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- args = parser.parse_args(argv[1:])
- return args
- def main(argv=sys.argv):
- args = parse_args(argv)
- logging.basicConfig(level=logging.INFO)
- run(sys.stdout, fasta_filter.yield_zmwtuple(yield_record(sys.stdin), whitelist_set=None, store_record=False))
- if __name__ == "__main__": # pragma: no cover
- main(sys.argv) # pragma: no cover
|