align_dw.py 3.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. import argparse
  2. import sys
  3. from falcon_kit import kup, falcon, DWA
  4. class TooLongError(Exception): pass
  5. def log(msg):
  6. sys.stderr.write(msg)
  7. sys.stderr.write('\n')
  8. def get_aln_data(t_seq, q_seq):
  9. """
  10. Inputs are Unicode.
  11. """
  12. t_seq = t_seq.encode('ascii')
  13. q_seq = q_seq.encode('ascii')
  14. aln_data = []
  15. #x = []
  16. #y = []
  17. K = 8
  18. seq0 = t_seq
  19. lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
  20. sa_ptr = kup.allocate_seq(len(seq0))
  21. sda_ptr = kup.allocate_seq_addr(len(seq0))
  22. kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
  23. q_id = "dummy"
  24. kmer_match_ptr = kup.find_kmer_pos_for_seq(
  25. q_seq, len(q_seq), K, sda_ptr, lk_ptr)
  26. kmer_match = kmer_match_ptr[0]
  27. if kmer_match.count != 0:
  28. aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
  29. aln_range = aln_range_ptr[0]
  30. #x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
  31. # for i in range(kmer_match.count)]))
  32. s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
  33. log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
  34. s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
  35. max_len = 250000 # to keep allocations < 16GB, given band_tol=1500
  36. if (e1 - s1) >= max_len or (e2 - s2) >= max_len:
  37. # DW.align() would crash, so raise here.
  38. # (500000 is the approx. upper bound for int overflow,
  39. # but some users run out of memory anyway.)
  40. raise TooLongError('q_len={} or t_len={} are too big, over 500k'.format(
  41. (e1-s1), (e2-s2)))
  42. if e1 - s1 > 100:
  43. log('Calling DW_banded.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
  44. s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
  45. alignment = DWA.align(q_seq[s1:e1], e1 - s1,
  46. seq0[s2:e2], e2 - s2,
  47. 1500, 1)
  48. if alignment[0].aln_str_size > 100:
  49. aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
  50. seq0), alignment[0].aln_str_size, alignment[0].dist))
  51. #aln_str1 = alignment[0].q_aln_str
  52. #aln_str0 = alignment[0].t_aln_str
  53. DWA.free_alignment(alignment)
  54. kup.free_aln_range(aln_range_ptr)
  55. kup.free_kmer_match(kmer_match_ptr)
  56. kup.free_kmer_lookup(lk_ptr)
  57. kup.free_seq_array(sa_ptr)
  58. kup.free_seq_addr_array(sda_ptr)
  59. return aln_data #, x, y
  60. def get_aln_results(ref_seq, query_seq, min_seq_len):
  61. # Align the a_ctg against the base.
  62. log('Aligning (DW): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
  63. delta_len = len(query_seq) - len(ref_seq)
  64. idt = 0.0
  65. cov = 0.0
  66. if len(ref_seq) > min_seq_len and len(query_seq) > min_seq_len:
  67. try:
  68. aln_data = get_aln_data(ref_seq, query_seq)
  69. if len(aln_data) != 0:
  70. idt = 1.0 - 1.0 * \
  71. aln_data[-1][-1] / aln_data[-1][-2]
  72. cov = 1.0 * \
  73. (aln_data[-1][3] - aln_data[-1]
  74. [2]) / aln_data[-1][4]
  75. else:
  76. log('len(aln_data) == 0!')
  77. except TooLongError:
  78. log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs. len(ref_seq) = {}, len(query_seq) = {}'.format(len(ref_seq), len(query_seq)))
  79. idt = -1.0
  80. cov = -1.0
  81. return delta_len, idt, cov