- """A simple script to scatter a (filtered) subreadset into units of input files.
- """
- from pbcore.io import (SubreadSet, ExternalResource) # pylint: disable=import-error
- import argparse
- import logging
- import os
- import sys
- import copy
- log = logging.getLogger(__name__)

def split_dataset(subreadset, out_prefix):
    """
    Take an input dataset and, for each external resource (BAM file), generate
    one separate dataset file while maintaining all of the filters.
    Return the list of generated dataset filenames (a FOFN).

    To create an example filtered dataset for testing:
        dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
        dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)
    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        # Wrap the single BAM in its own dataset, give it a fresh UUID, and
        # copy the parent's filters so each chunk stays filtered.
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)  # pylint: disable=protected-access
        new_dataset.write(out_fn)
        fofn.append(out_fn)
    return fofn
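
# Illustrative direct use (paths and resource count are hypothetical, not from
# the source): with two BAM resources in the input, the call returns the two
# generated chunk filenames following the '{prefix}.{index:05}.subreadset.xml' pattern.
#   split_dataset('test.filtered.subreadset.xml', '/tmp/chunk')
#   # -> ['/tmp/chunk.00000.subreadset.xml', '/tmp/chunk.00001.subreadset.xml']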

def run_split_dataset(subreadset, out_prefix):
    """Split the dataset and write the generated chunk filenames,
    one per line, to '<out_prefix>.fofn'."""
    out_prefix_abs = os.path.abspath(out_prefix)
    fofn = split_dataset(subreadset, out_prefix_abs)
    out_fofn_fn = '{}.fofn'.format(out_prefix_abs)
    with open(out_fofn_fn, 'w') as ofs:
        for fn in fofn:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(fofn), out_fofn_fn))

def main(argv=sys.argv):
    description = 'Scatter a subreadset into sub-datasets, one per input file.'
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('subreadset',
                        help='Input subreadset XML filename. Can be filtered.')
    parser.add_argument('out_prefix',
                        help='Prefix of the output sub-datasets.')
    args = parser.parse_args(argv[1:])
    run_split_dataset(args.subreadset, args.out_prefix)

if __name__ == "__main__":
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    main(sys.argv)
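
# Example invocation (script and file names are illustrative; the input XML can
# be built with the `dataset create` / `dataset filter` commands shown in the
# split_dataset docstring):
#   python scatter_subreadset.py test.filtered.subreadset.xml chunk
# This writes chunk.00000.subreadset.xml, chunk.00001.subreadset.xml, ... plus
# chunk.fofn listing the generated chunk datasets, one absolute path per line.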