falcon_kit.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. __all__ = [
  2. 'kup', 'DWA', 'falcon',
  3. 'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
  4. 'Alignment', 'get_alignment',
  5. ]
  6. from ctypes import *
  7. import os
  8. import ext_falcon
  9. #module_path = os.path.split(__file__)[0]
  10. seq_coor_t = c_int
  11. base_t = c_uint8
  12. class KmerLookup(Structure):
  13. _fields_ = [("start", seq_coor_t),
  14. ("last", seq_coor_t),
  15. ("count", seq_coor_t)]
  16. class KmerMatch(Structure):
  17. _fields_ = [("count", seq_coor_t),
  18. ("query_pos", POINTER(seq_coor_t)),
  19. ("target_pos", POINTER(seq_coor_t))]
  20. class AlnRange(Structure):
  21. _fields_ = [("s1", seq_coor_t),
  22. ("e1", seq_coor_t),
  23. ("s2", seq_coor_t),
  24. ("e2", seq_coor_t),
  25. ("score", c_long)]
  26. class ConsensusData(Structure):
  27. _fields_ = [("sequence", c_char_p),
  28. ("eff_cov", POINTER(c_uint))]
  29. try:
  30. falcon_dll = CDLL(ext_falcon.__file__)
  31. except OSError:
  32. # It seems that setup.py has changed the __file__ it attaches to an extension module.
  33. # I have no idea why or why, but this works around it.
  34. falcon_dll = CDLL(os.path.join(os.path.dirname(__file__),
  35. '..', os.path.basename(ext_falcon.__file__)))
  36. kup = falcon_dll
  37. kup.allocate_kmer_lookup.argtypes = [seq_coor_t]
  38. kup.allocate_kmer_lookup.restype = POINTER(KmerLookup)
  39. kup.init_kmer_lookup.argtypes = [POINTER(KmerLookup), seq_coor_t]
  40. kup.free_kmer_lookup.argtypes = [POINTER(KmerLookup)]
  41. kup.allocate_seq.argtypes = [seq_coor_t]
  42. kup.allocate_seq.restype = POINTER(base_t)
  43. kup.init_seq_array.argtypes = [POINTER(base_t), seq_coor_t]
  44. kup.free_seq_array.argtypes = [POINTER(base_t)]
  45. kup.allocate_seq_addr.argtypes = [seq_coor_t]
  46. kup.allocate_seq_addr.restype = POINTER(seq_coor_t)
  47. kup.free_seq_addr_array.argtypes = [POINTER(seq_coor_t)]
  48. kup.add_sequence.argtypes = [seq_coor_t, c_uint, POINTER(c_char), seq_coor_t, POINTER(seq_coor_t),
  49. POINTER(c_uint8), POINTER(KmerLookup)]
  50. kup.mask_k_mer.argtypes = [c_long, POINTER(KmerLookup), c_long]
  51. kup.find_kmer_pos_for_seq.argtypes = [POINTER(c_char), seq_coor_t, c_uint, POINTER(seq_coor_t),
  52. POINTER(KmerLookup)]
  53. kup.find_kmer_pos_for_seq.restype = POINTER(KmerMatch)
  54. kup.free_kmer_match.argtypes = [POINTER(KmerMatch)]
  55. kup.find_best_aln_range.argtypes = [
  56. POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
  57. kup.find_best_aln_range.restype = POINTER(AlnRange)
  58. kup.find_best_aln_range2.argtypes = [
  59. POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
  60. kup.find_best_aln_range2.restype = POINTER(AlnRange)
  61. kup.free_aln_range.argtypes = [POINTER(AlnRange)]
  62. class Alignment(Structure):
  63. """
  64. typedef struct {
  65. seq_coor_t aln_str_size ;
  66. seq_coor_t dist ;
  67. seq_coor_t aln_q_s;
  68. seq_coor_t aln_q_e;
  69. seq_coor_t aln_t_s;
  70. seq_coor_t aln_t_e;
  71. char * q_aln_str;
  72. char * t_aln_str;
  73. } alignment;
  74. """
  75. _fields_ = [("aln_str_size", seq_coor_t),
  76. ("dist", seq_coor_t),
  77. ("aln_q_s", seq_coor_t),
  78. ("aln_q_e", seq_coor_t),
  79. ("aln_t_s", seq_coor_t),
  80. ("aln_t_e", seq_coor_t),
  81. ("q_aln_str", c_char_p),
  82. ("t_aln_str", c_char_p)]
  83. DWA = falcon_dll
  84. DWA.align.argtypes = [POINTER(c_char), c_long, POINTER(
  85. c_char), c_long, c_long, c_int]
  86. DWA.align.restype = POINTER(Alignment)
  87. DWA.free_alignment.argtypes = [POINTER(Alignment)]
  88. falcon = falcon_dll
  89. falcon.generate_consensus.argtypes = [
  90. POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double]
  91. falcon.generate_consensus.restype = POINTER(ConsensusData)
  92. falcon.free_consensus_data.argtypes = [POINTER(ConsensusData)]
  93. def get_alignment(seq1, seq0):
  94. K = 8
  95. lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
  96. sa_ptr = kup.allocate_seq(len(seq0))
  97. sda_ptr = kup.allocate_seq_addr(len(seq0))
  98. kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
  99. kmer_match_ptr = kup.find_kmer_pos_for_seq(
  100. seq1, len(seq1), K, sda_ptr, lk_ptr)
  101. kmer_match = kmer_match_ptr[0]
  102. aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 10, 50)
  103. #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
  104. kup.free_kmer_match(kmer_match_ptr)
  105. aln_range = aln_range_ptr[0]
  106. s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
  107. kup.free_aln_range(aln_range_ptr)
  108. if e1 - s1 > 500:
  109. #s1 = 0 if s1 < 14 else s1 - 14
  110. #s2 = 0 if s2 < 14 else s2 - 14
  111. e1 = len(seq1) if e1 >= len(seq1) - 2 * K else e1 + K * 2
  112. e2 = len(seq0) if e2 >= len(seq0) - 2 * K else e2 + K * 2
  113. alignment = DWA.align(seq1[s1:e1], e1 - s1,
  114. seq0[s2:e2], e2 - s2,
  115. 100,
  116. 0)
  117. # print seq1[s1:e1]
  118. # print seq0[s2:e2]
  119. # if alignment[0].aln_str_size > 500:
  120. #aln_str1 = alignment[0].q_aln_str
  121. #aln_str0 = alignment[0].t_aln_str
  122. aln_size = alignment[0].aln_str_size
  123. aln_dist = alignment[0].dist
  124. aln_q_s = alignment[0].aln_q_s
  125. aln_q_e = alignment[0].aln_q_e
  126. aln_t_s = alignment[0].aln_t_s
  127. aln_t_e = alignment[0].aln_t_e
  128. # print "X,",alignment[0].aln_q_s, alignment[0].aln_q_e
  129. # print "Y,",alignment[0].aln_t_s, alignment[0].aln_t_e
  130. # print aln_str1
  131. # print aln_str0
  132. DWA.free_alignment(alignment)
  133. kup.free_seq_addr_array(sda_ptr)
  134. kup.free_seq_array(sa_ptr)
  135. kup.free_kmer_lookup(lk_ptr)
  136. if e1 - s1 > 500 and aln_size > 500:
  137. return s1, s1 + aln_q_e - aln_q_s, s2, s2 + aln_t_e - aln_t_s, aln_size, aln_dist
  138. else:
  139. return None