"""A simple script to scatter a (filtered) subreadset into units of input files. """ from pbcore.io import (SubreadSet, ExternalResource) # pylint: disable=import-error import argparse import logging import os import sys import copy log = logging.getLogger(__name__) def split_dataset(subreadset, out_prefix): """ Takes an input dataset, and for each entry generates one separate dataset file, while maintaining all the filters. Returns a FOFN of the generated datasets. To create an example filtered dataset for testing: dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000' """ out_prefix_abs = os.path.abspath(out_prefix) dset = SubreadSet(subreadset, strict=True, skipCounts=True) fns = dset.toFofn() log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns))) fofn = [] for i, bam_fn in enumerate(fns): out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i) new_dataset = SubreadSet(bam_fn, skipCounts=True) new_dataset.newUuid() new_dataset._filters = copy.deepcopy(dset._filters) new_dataset.write(out_fn) fofn.append(out_fn) return fofn def run_split_dataset(subreadset, out_prefix): out_prefix_abs = os.path.abspath(out_prefix) fofn = split_dataset(subreadset, out_prefix_abs) out_fofn_fn = '{}.fofn'.format(out_prefix_abs) with open(out_fofn_fn, 'w') as ofs: for fn in fofn: ofs.write('{}\n'.format(fn)) log.info('Wrote {} chunks into "{}"'.format(len(fofn), out_fofn_fn)) def main(argv=sys.argv): description = """Scatter subreadsets in units of one input file. """ epilog = """ """ parser = argparse.ArgumentParser( description=description, epilog=epilog, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('subreadset', help='Input subreadset XML filename. Can be filtered.') parser.add_argument('out_prefix', help='Prefix of the output sub-datasets.') args = parser.parse_args(argv[1:]) run_split_dataset(args.subreadset, args.out_prefix) if __name__ == "__main__": logging.basicConfig() logging.getLogger().setLevel(logging.DEBUG) main(sys.argv)