ovlp_stats.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. from falcon_kit.multiproc import Pool
  2. import falcon_kit.util.io as io
  3. import argparse
  4. import shlex
  5. import subprocess as sp
  6. import sys
  7. import traceback
  8. Reader = io.CapturedProcessReaderContext
  9. def filter_stats(readlines, min_len):
  10. current_q_id = None
  11. ave_idt = 0.0
  12. all_over_len = 0.0
  13. overlap_data = {"5p": 0, "3p": 0}
  14. q_id = None
  15. rtn_data = []
  16. q_l = 0
  17. for l in readlines():
  18. l = l.strip().split()
  19. q_id, t_id = l[:2]
  20. if q_id != current_q_id:
  21. left_count = overlap_data["5p"]
  22. right_count = overlap_data["3p"]
  23. if (current_q_id != None and
  24. (left_count > 0 or right_count > 0)):
  25. rtn_data.append((current_q_id, q_l, left_count, right_count))
  26. overlap_data = {"5p": 0, "3p": 0}
  27. current_q_id = q_id
  28. ave_idt = 0.0
  29. all_over_len = 0.0
  30. overlap_len = -int(l[2])
  31. idt = float(l[3])
  32. q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
  33. t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
  34. if q_l < min_len or t_l < min_len:
  35. continue
  36. if idt < 90:
  37. continue
  38. if l[-1] in ("contains", "overlap"):
  39. ave_idt += idt * overlap_len
  40. all_over_len += overlap_len
  41. if q_s == 0:
  42. overlap_data["5p"] += 1
  43. if q_e == q_l:
  44. overlap_data["3p"] += 1
  45. if q_id != None:
  46. left_count = overlap_data["5p"]
  47. right_count = overlap_data["3p"]
  48. if (left_count > 0 or right_count > 0):
  49. rtn_data.append((q_id, q_l, left_count, right_count))
  50. return rtn_data
  51. def run_filter_stats(db_fn, fn, min_len):
  52. try:
  53. cmd = "LA4Falcon -mo {} {}".format(db_fn, fn)
  54. reader = Reader(cmd)
  55. with reader:
  56. return fn, filter_stats(reader.readlines, min_len)
  57. except Exception:
  58. stack = traceback.format_exc()
  59. io.LOG(stack)
  60. raise
  61. def run_ovlp_stats(exe_pool, db_fn, file_list, min_len):
  62. inputs = []
  63. for fn in file_list:
  64. if len(fn) != 0:
  65. inputs.append((run_filter_stats, db_fn, fn, min_len))
  66. for res in exe_pool.imap(io.run_func, inputs):
  67. for l in res[1]:
  68. print(" ".join([str(c) for c in l]))
  69. def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
  70. io.LOG('starting ovlp_stats')
  71. file_list = io.validated_fns(fofn)
  72. io.LOG('fofn {!r}: {}'.format(fofn, file_list))
  73. io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
  74. n_core = min(n_core, len(file_list))
  75. exe_pool = Pool(n_core)
  76. try:
  77. run_ovlp_stats(exe_pool, db_fn, file_list, min_len)
  78. io.LOG('finished ovlp_stats')
  79. except KeyboardInterrupt:
  80. io.LOG('terminating ovlp_stats workers...')
  81. exe_pool.terminate()
  82. def ovlp_stats(db_fn, fofn, min_len, n_core, stream, debug, silent):
  83. if debug:
  84. n_core = 0
  85. silent = False
  86. if silent:
  87. io.LOG = io.write_nothing
  88. if stream:
  89. global Reader
  90. Reader = io.StreamedProcessReaderContext
  91. try_run_ovlp_stats(n_core, db_fn, fofn, min_len)
  92. def parse_args(argv):
  93. parser = argparse.ArgumentParser(description='a simple multi-processes LAS ovelap data filter',
  94. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  95. parser.add_argument('--n-core', type=int, default=4,
  96. help='number of processes used for generating consensus; '
  97. '0 for main process only')
  98. parser.add_argument('--fofn', type=str, required=True,
  99. help='file contains the path of all LAS file to be processed in parallel')
  100. parser.add_argument('--min-len', type=int, default=2500,
  101. help="min length of the reads")
  102. parser.add_argument('--db-fn', default='./1-preads_ovl/preads.db',
  103. help="DAZZLER DB of preads")
  104. parser.add_argument('--stream', action='store_true',
  105. help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
  106. parser.add_argument('--debug', '-g', action='store_true',
  107. help="single-threaded, plus other aids to debugging")
  108. parser.add_argument('--silent', action='store_true',
  109. help="suppress cmd reporting on stderr")
  110. return parser.parse_args(argv[1:])
  111. def main(argv=sys.argv):
  112. args = parse_args(argv)
  113. ovlp_stats(**vars(args))
  114. if __name__ == "__main__":
  115. main(sys.argv)