dataset_split.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. """A simple script to scatter a (filtered) subreadset into units of input files.
  2. """
  3. from pbcore.io import (SubreadSet, ExternalResource) # pylint: disable=import-error
  4. import argparse
  5. import logging
  6. import os
  7. import sys
  8. import copy
  9. log = logging.getLogger(__name__)
  10. def split_dataset(subreadset, out_prefix):
  11. """
  12. Takes an input dataset, and for each entry generates one separate dataset
  13. file, while maintaining all the filters.
  14. Returns a FOFN of the generated datasets.
  15. To create an example filtered dataset for testing:
  16. dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
  17. dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
  18. """
  19. out_prefix_abs = os.path.abspath(out_prefix)
  20. dset = SubreadSet(subreadset, strict=True, skipCounts=True)
  21. fns = dset.toFofn()
  22. log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
  23. fofn = []
  24. for i, bam_fn in enumerate(fns):
  25. out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
  26. new_dataset = SubreadSet(bam_fn, skipCounts=True)
  27. new_dataset.newUuid()
  28. new_dataset._filters = copy.deepcopy(dset._filters)
  29. new_dataset.write(out_fn)
  30. fofn.append(out_fn)
  31. return fofn
  32. def run_split_dataset(subreadset, out_prefix):
  33. out_prefix_abs = os.path.abspath(out_prefix)
  34. fofn = split_dataset(subreadset, out_prefix_abs)
  35. out_fofn_fn = '{}.fofn'.format(out_prefix_abs)
  36. with open(out_fofn_fn, 'w') as ofs:
  37. for fn in fofn:
  38. ofs.write('{}\n'.format(fn))
  39. log.info('Wrote {} chunks into "{}"'.format(len(fofn), out_fofn_fn))
  40. def main(argv=sys.argv):
  41. description = """Scatter subreadsets in units of one input file.
  42. """
  43. epilog = """
  44. """
  45. parser = argparse.ArgumentParser(
  46. description=description,
  47. epilog=epilog,
  48. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  49. parser.add_argument('subreadset',
  50. help='Input subreadset XML filename. Can be filtered.')
  51. parser.add_argument('out_prefix',
  52. help='Prefix of the output sub-datasets.')
  53. args = parser.parse_args(argv[1:])
  54. run_split_dataset(args.subreadset, args.out_prefix)
  55. if __name__ == "__main__":
  56. logging.basicConfig()
  57. logging.getLogger().setLevel(logging.DEBUG)
  58. main(sys.argv)