calc_cutoff.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. from .. import functional as f
  2. import argparse
  3. import uuid
  4. import json
  5. import os
  6. import sys
  7. import traceback
  8. def main(argv=sys.argv):
  9. import argparse
  10. description = """
  11. Given the result of 'DBstats -u -b1' on stdin,
  12. print the lowest read-length required for sufficient coverage of the genome
  13. (i.e. 'length_cutoff').
  14. """
  15. epilog = """
  16. This is useful when length_cutoff is not provided but the genome-size
  17. can be estimated. The purpose is to *reduce* the amount of data seen by
  18. DALIGNER, since otherwise it will miss many alignments when it
  19. encounters resource limits.
  20. Note: If PBFALCON_ERRFILE is defined (and its directory is writable),
  21. we will write errors there in addition to stderr.
  22. """
  23. parser = argparse.ArgumentParser(description=description, epilog=epilog,
  24. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  25. parser.add_argument('--coverage', type=float, default=20,
  26. help='Desired coverage ratio (i.e. over-sampling)')
  27. parser.add_argument('genome_size', type=int,
  28. help='Estimated number of bases in genome. (haploid?)')
  29. parser.add_argument('capture', # default='-', # I guess default is not allowed for required args.
  30. help='File with captured output of DBstats. (Otherwise, stdin.)')
  31. args = parser.parse_args(argv[1:])
  32. target = int(args.genome_size * args.coverage)
  33. def capture():
  34. # This generator ensures that our file is closed at end-of-program.
  35. if args.capture != '-':
  36. with open(args.capture) as sin:
  37. yield sin
  38. else:
  39. yield sys.stdin
  40. for sin in capture():
  41. stats = sin.read()
  42. try:
  43. cutoff = f.calc_cutoff(target, stats)
  44. except Exception as e:
  45. tb = traceback.format_exc()
  46. msg = 'User-provided genome_size: {}\nDesired coverage: {}\n'.format(
  47. args.genome_size, args.coverage)
  48. # pbfalcon wants us to write errs here.
  49. errfile = os.environ.get('PBFALCON_ERRFILE')
  50. if errfile:
  51. with open(errfile, 'w') as ofs:
  52. ofs.write(tb + msg)
  53. # this is propagated to SMRT Link UI
  54. # see PacBioAlarm class in pbcommand.models.common for details
  55. with open("alarms.json", "w") as alarms_out:
  56. alarms_out.write(json.dumps([
  57. {
  58. "exception": e.__class__.__name__,
  59. "info": tb,
  60. "message": str(e) + "\n" + msg,
  61. "name": e.__class__.__name__,
  62. "severity": "ERROR",
  63. "owner": "python3",
  64. "id": str(uuid.uuid4())
  65. }]))
  66. raise Exception(tb + msg)
  67. sys.stdout.write(str(cutoff))
  68. if __name__ == "__main__":
  69. main(sys.argv)