align_edlib.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. import argparse
  2. import sys
  3. from falcon_kit import kup, falcon
  4. import edlib
  5. def log(msg):
  6. sys.stderr.write(msg)
  7. sys.stderr.write('\n')
  8. def count_cigar_ops(cigar):
  9. """
  10. For curious people: regexes are very slow for parsing CIGAR strings.
  11. cigar: Unicode
  12. """
  13. b = 0
  14. num_m, num_i, num_d = 0, 0, 0
  15. for i in range(len(cigar)):
  16. if cigar[i] <= '9':
  17. continue
  18. # Check if there are no digits before the op char.
  19. assert(b < i)
  20. count = int(cigar[b:i])
  21. op = cigar[i]
  22. b = i + 1
  23. if op == 'D':
  24. num_d += count
  25. elif op == 'I':
  26. num_i += count
  27. elif op in ['M', '=', 'X']:
  28. num_m += count
  29. else: # pragma: no cover
  30. pass # pragma: no cover
  31. # Check if there are dangling ops.
  32. assert(b == len(cigar))
  33. total_len = num_d + num_i + num_m
  34. return num_m, num_i, num_d, total_len
  35. def get_aln_data(t_seq, q_seq):
  36. """
  37. Inputs in bytes.
  38. """
  39. aln_data = []
  40. K = 8
  41. seq0 = t_seq
  42. lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
  43. sa_ptr = kup.allocate_seq(len(seq0))
  44. sda_ptr = kup.allocate_seq_addr(len(seq0))
  45. kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
  46. q_id = "dummy"
  47. kmer_match_ptr = kup.find_kmer_pos_for_seq(
  48. q_seq, len(q_seq), K, sda_ptr, lk_ptr)
  49. kmer_match = kmer_match_ptr[0]
  50. if kmer_match.count != 0:
  51. aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
  52. aln_range = aln_range_ptr[0]
  53. s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
  54. log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
  55. s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
  56. if e1 - s1 > 100:
  57. log('Calling edlib.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
  58. s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
  59. # Align using Edlib instead of DWA.
  60. edlib_result = edlib.align(q_seq[s1:e1], seq0[s2:e2], mode="NW")
  61. delta_l = len(q_seq) - len(t_seq)
  62. cov = float(e1 - s1) / float(len(q_seq))
  63. idt = float(e1 - s1 - edlib_result['editDistance']) / float(e1 - s1)
  64. aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0),
  65. delta_l, idt, cov))
  66. kup.free_aln_range(aln_range_ptr)
  67. kup.free_kmer_match(kmer_match_ptr)
  68. kup.free_kmer_lookup(lk_ptr)
  69. kup.free_seq_array(sa_ptr)
  70. kup.free_seq_addr_array(sda_ptr)
  71. return aln_data #, x, y
  72. def get_global_aln_results(ref_seq, query_seq, min_seq_len):
  73. """
  74. Aligns two sequences globally, and returns (delta_len, idt, cov) values
  75. compatible with the legacy deduplication code.
  76. Currently unused - it was used in an intermediate version, and might be useful
  77. at some point in the future.
  78. Inputs in Unicode.
  79. """
  80. ref_seq = ref_seq.encode('ascii')
  81. query_seq = query_seq.encode('ascii')
  82. log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
  83. delta_len = len(query_seq) - len(ref_seq)
  84. idt = 0.0
  85. cov = 0.0
  86. if len(ref_seq) < min_seq_len or len(query_seq) < min_seq_len:
  87. return delta_len, idt, cov
  88. result = edlib.align(query_seq, ref_seq, mode="NW", task="path")
  89. # result map is Unicode
  90. cigar = result['cigar']
  91. num_m, num_i, num_d, total_len = count_cigar_ops(result['cigar'])
  92. num_eq = (num_m + num_i + num_d) - result['editDistance']
  93. num_x = num_m - num_eq
  94. idt_query = float(num_eq) / float(num_eq + num_x + num_i)
  95. idt_ref = float(num_eq) / float(num_eq + num_x + num_d)
  96. idt = min(idt_query, idt_ref)
  97. cov = 1.0
  98. log(' - Alignment stats: num_m = %d, num_i = %d, num_d = %d, total_len = %d, num_eq = %d, num_x = %d' % (num_m, num_i, num_d, total_len, num_eq, num_x))
  99. return delta_len, idt, cov
  100. def get_aln_results(ref_seq, query_seq, min_seq_len):
  101. """
  102. Runs the legacy mapping code, and aligns the selected region using Edlib
  103. instead of the legacy DWA alignment with quadratic memory.
  104. Inputs in Unicode.
  105. """
  106. ref_seq = ref_seq.encode('ascii')
  107. query_seq = query_seq.encode('ascii')
  108. log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
  109. delta_len = len(query_seq) - len(ref_seq)
  110. idt = 0.0
  111. cov = 0.0
  112. if len(ref_seq) < min_seq_len or len(query_seq) < min_seq_len:
  113. return delta_len, idt, cov
  114. aln_data = get_aln_data(ref_seq, query_seq)
  115. if len(aln_data) != 0:
  116. delta_len, idt, cov = aln_data[-1][8:11]
  117. else:
  118. log('len(aln_data) == 0!')
  119. return delta_len, idt, cov