kbmpoa.h
  1. /*
  2. *
  3. * Copyright (c) 2018, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef KMER_BIN_MAP_PO_MSA_CNS_RJ_H
  20. #define KMER_BIN_MAP_PO_MSA_CNS_RJ_H
  21. #include "kbm.h"
  22. #include "wtpoa.h"
/**
 * Streaming iterator state: slices the KBM alignment hits of one query read
 * into fixed-size, sliding windows (blocks) of layout sequences for POA
 * consensus. Filled by reset_kbmblock(), drained by iter_kbmblock().
 */
typedef struct {
	u4i chridx, cidx, bidx;  // padding cursor chr idx, current query idx, current block idx
	String *rdtag;           // query read name ("anonymous" when none given)
	BaseBank *rdseq;         // borrowed 2-bit sequence bank holding the query read
	u8i rdoff;               // offset of the query read inside rdseq
	u4i rdlen;               // query read length (bases)
	KBMAux *aux;             // borrowed mapping context: hits, cigars, reference reads
	layseqr *seqs;           // recycling pool of lay_seq_t window pieces
	u4v *heap;               // min-heap of pool offsets, ordered by (bidx, rdidx)
	u2i bsize, bstep; // block size, and slide steps
	u4i sidx, lidx, hidx;    // last emitted seq idx, lookahead seq idx, next hit idx
} KBMBlock;
  35. static inline KBMBlock* init_kbmblock(u2i bsize, u2i bstep){
  36. KBMBlock *kb;
  37. kb = malloc(sizeof(KBMBlock));
  38. kb->rdtag = init_string(1024);
  39. kb->rdseq = NULL;
  40. kb->rdoff = 0;
  41. kb->rdlen = 0;
  42. kb->aux = NULL;
  43. kb->seqs = init_layseqr(32);
  44. kb->heap = init_u4v(32);
  45. bsize = ((bsize + KBM_BIN_SIZE - 1) / KBM_BIN_SIZE) * KBM_BIN_SIZE;
  46. if(bsize == 0) bsize = KBM_BIN_SIZE;
  47. bstep = ((bstep + KBM_BIN_SIZE - 1) / KBM_BIN_SIZE) * KBM_BIN_SIZE;
  48. if(bstep == 0) bstep = KBM_BIN_SIZE;
  49. kb->bsize = bsize;
  50. kb->bstep = bstep;
  51. kb->chridx = 0;
  52. kb->cidx = 0;
  53. kb->bidx = 0;
  54. kb->sidx = MAX_U4;
  55. kb->lidx = MAX_U4;
  56. kb->hidx = 0;
  57. return kb;
  58. }
  59. static inline void free_kbmblock(KBMBlock *kb){
  60. free_string(kb->rdtag);
  61. free_layseqr(kb->seqs);
  62. free(kb);
  63. }
/**
 * Re-arm the iterator for a new query read: store the read's identity,
 * sort its hits by query begin, reset all cursors, and seed the recycled
 * pool with a sentinel sequence used to terminate padding emission.
 * rdseq and aux are borrowed; the caller keeps ownership.
 */
static inline void reset_kbmblock(KBMBlock *kb, char *rdtag, u4i chridx, BaseBank *rdseq, u8i rdoff, u4i rdlen, KBMAux *aux){
	lay_seq_t *stop;
	kb->chridx = chridx;
	kb->cidx = chridx;
	kb->bidx = 0;
	clear_string(kb->rdtag);
	if(rdtag){
		append_string(kb->rdtag, rdtag, strlen(rdtag));
	} else {
		append_string(kb->rdtag, "anonymous", strlen("anonymous"));
	}
	kb->rdseq = rdseq;
	kb->rdoff = rdoff;
	kb->rdlen = rdlen;
	kb->aux = aux;
	recyc_all_layseqr(kb->seqs);
	clear_u4v(kb->heap);
	kb->sidx = MAX_U4;
	kb->lidx = MAX_U4;
	// iter_kbmblock consumes hits in ascending query-begin order
	sort_array(aux->hits->buffer, aux->hits->size, kbm_map_t, num_cmpgt(a.qb, b.qb));
	kb->hidx = 0;
	// Sentinel: first pop after recyc_all presumably yields pool slot 0, which
	// iter_kbmblock dereferences directly as ref_layseqr(kb->seqs, 0). Its
	// chridx+1 stops _push_padding_ref_kbmblock's while-loop — TODO confirm
	// pop order against layseqr implementation.
	stop = pop_layseqr(kb->seqs);
	stop->chridx = chridx + 1;
	stop->bidx = 0;
}
/**
 * Push padding windows of the query read itself (rdidx == 0) onto the heap,
 * for every block index up to and including sq->bidx (or until the end of
 * the read). Because pop_layseqr may reallocate the pool, `sq` is tracked by
 * its pool offset and re-resolved before use and before returning.
 * Returns the (possibly relocated) pointer for `sq`.
 */
static inline lay_seq_t* _push_padding_ref_kbmblock(KBMBlock *kb, lay_seq_t *sq){
	lay_seq_t *st;
	u8i off;
	u4i sqidx, len;
	sqidx = offset_layseqr(kb->seqs, sq); // survive pool reallocation
	while(kb->chridx == sq->chridx && kb->bidx <= sq->bidx){
		st = pop_layseqr(kb->seqs);
		st->chridx = kb->cidx;
		st->bidx = kb->bidx;
		st->rdidx = 0;  // 0 denotes the query read itself
		st->rddir = 0;
		st->rdoff = st->bidx * kb->bstep;
		st->rbeg = st->rdoff;
		off = kb->rdoff;
		len = kb->rdlen;
		st->rend = num_min(st->rbeg + kb->bsize, len); // clamp last window to read end
		st->rdtag = kb->rdtag->string;
		clear_basebank(st->seq);
		fast_fwdbits2basebank(st->seq, kb->rdseq->bits, off + st->rdoff, st->rend - st->rbeg);
		array_heap_push(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, offset_layseqr(kb->seqs, st),
			num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
		if(st->rend >= len){
			// read fully covered: disable further padding until next reset
			kb->chridx = MAX_U4;
			kb->bidx = 0;
			break;
		} else {
			kb->bidx ++;
		}
		sq = ref_layseqr(kb->seqs, sqidx); // pool may have moved after pop
	}
	sq = ref_layseqr(kb->seqs, sqidx);
	return sq;
}
/**
 * Produce the next layout sequence: one window of one aligned read, emitted
 * in (block index, read index) heap order. The previously returned sequence
 * is recycled on entry. Hits are consumed lazily: each hit's cigar is walked
 * (from the end) to split the target read into current (sc) and overlapping
 * lookahead (sl) windows. Returns NULL when hits, padding, and heap are all
 * exhausted.
 */
static inline lay_seq_t* iter_kbmblock(void *obj){
	KBMBlock *kb;
	kbm_map_t *hit;
	lay_seq_t *sc, *sl;
	u8i coff, tsoff;
	u4i off, rdoff, rdlen, nxt, val, len, clen, bt;
	kb = (KBMBlock*)obj;
	// recycle the sequence handed out by the previous call
	if(kb->sidx != MAX_U4){
		recyc_layseqr(kb->seqs, kb->sidx);
		kb->sidx = MAX_U4;
	}
	sc = NULL;
	sl = NULL;
	do {
		if(kb->heap->size == 0){
			kb->lidx = MAX_U4; // heap empty: force a refill below
		}
		// refill: consume hits until a lookahead window (lidx) is queued
		while(kb->lidx == MAX_U4){
			// skip hits masked out upstream (mat == 0, e.g. self-alignments)
			while(kb->hidx < kb->aux->hits->size){
				if(kb->aux->hits->buffer[kb->hidx].mat == 0) kb->hidx ++;
				else break;
			}
			if(kb->hidx >= kb->aux->hits->size){
				// no more hits: flush remaining query padding windows.
				// NOTE(review): slot 0 is presumably the sentinel seq set up by
				// reset_kbmblock — confirm against layseqr pop order.
				sc = ref_layseqr(kb->seqs, 0);
				sc = _push_padding_ref_kbmblock(kb, sc);
				break;
			}
			hit = ref_kbmmapv(kb->aux->hits, kb->hidx ++);
			tsoff = kb->aux->kbm->reads->buffer[hit->tidx].seqoff * KBM_BIN_SIZE;
			rdlen = kb->aux->kbm->reads->buffer[hit->tidx].bincnt * KBM_BIN_SIZE;
			off = hit->qb;
			// starting offset within the target read, respecting orientation
			rdoff = hit->qdir? Int(rdlen - hit->te) : hit->tb;
			{
				// open the current window (sc) for this hit
				encap_layseqr(kb->seqs, 2);
				sc = pop_layseqr(kb->seqs);
				sc->chridx = kb->cidx;
				sc->bidx = (off / kb->bstep);
				sc->rdidx = hit->tidx;
				sc->rddir = hit->qdir;
				sc->rdoff = rdoff;
				sc->rbeg = off;
				sc->rend = 0;
				sc->rdtag = kb->aux->kbm->reads->buffer[hit->tidx].tag;
				clear_basebank(sc->seq);
			}
			sl = NULL;
			{
				// if the hit begins inside the overlap of the previous block,
				// also open a lookahead window (sl) for that previous block
				nxt = (off / kb->bstep) * kb->bstep;
				if(nxt && off - nxt < UInt(kb->bsize - kb->bstep)){
					if(sc->bidx){
						sl = pop_layseqr(kb->seqs);
						sl->chridx = kb->cidx;
						sl->bidx = sc->bidx - 1;
						sl->rdidx = hit->tidx;
						sl->rddir = hit->qdir;
						sl->rdoff = rdoff;
						sl->rbeg = off;
						sl->rend = 0;
						sl->rdtag = kb->aux->kbm->reads->buffer[hit->tidx].tag;
						clear_basebank(sl->seq);
					}
					nxt += kb->bsize - kb->bstep; // next boundary: end of previous block
				} else {
					nxt += kb->bstep; // next boundary: start of next block
				}
			}
			coff = hit->cgoff;
			clen = hit->cglen;
			// walk the cigar from its last operation backwards
			while(clen){
				bt = get_bitsvec(kb->aux->cigars, coff + clen - 1);
				clen --;
				val = KBM_BIN_SIZE;
				// low 2 bits encode op; 0 presumably means match on both — TODO confirm
				bt = (bt & 0x03)? (bt & 0x03) : 3;
				while(val){
					if(bt & 0b001){
						// op advances the query: cap at the next block boundary
						len = num_min(val, nxt - off);
						off += len;
					} else {
						len = val;
					}
					val -= len;
					if(bt & 0b010){
						// op advances the target: append target bases to open windows
						{
							if(sc){
								if(sc->seq->size == 0){
									sc->rbeg = (bt & 0b001)? off - len : off;
								}
								if(hit->qdir){
									revbits2basebank(sc->seq, kb->aux->kbm->rdseqs->bits, tsoff + rdlen - (rdoff + len), len);
								} else {
									fwdbits2basebank(sc->seq, kb->aux->kbm->rdseqs->bits, tsoff + rdoff, len);
								}
								sc->rend = off;
							}
							if(sl){
								if(sl->seq->size == 0){
									sl->rbeg = (bt & 0b001)? off - len : off;
								}
								if(hit->qdir){
									revbits2basebank(sl->seq, kb->aux->kbm->rdseqs->bits, tsoff + rdlen - (rdoff + len), len);
								} else {
									fwdbits2basebank(sl->seq, kb->aux->kbm->rdseqs->bits, tsoff + rdoff, len);
								}
								sl->rend = off;
							}
						}
						rdoff += len;
					}
					if(off == nxt){
						// crossed a block boundary
						if(sl){
							// close the lookahead window and queue it (if non-empty)
							if(sl->rend > sl->rbeg){
								u4i scidx;
								scidx = offset_layseqr(kb->seqs, sc);
								sl = _push_padding_ref_kbmblock(kb, sl);
								sc = ref_layseqr(kb->seqs, scidx);
								if(kb->lidx == MAX_U4){
									kb->lidx = offset_layseqr(kb->seqs, sl);
								}
								array_heap_push(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, offset_layseqr(kb->seqs, sl),
									num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
							}
							sl = NULL;
							nxt += 2 * kb->bstep - kb->bsize;
						} else {
							// promote sc to lookahead and open the next window
							u4i scidx;
							scidx = offset_layseqr(kb->seqs, sc);
							encap_layseqr(kb->seqs, 1);
							sl = ref_layseqr(kb->seqs, scidx);
							sc = pop_layseqr(kb->seqs);
							sc->chridx = kb->cidx;
							sc->bidx = sl->bidx + 1;
							sc->rdidx = hit->tidx;
							sc->rddir = hit->qdir;
							sc->rdoff = rdoff;
							sc->rbeg = off;
							sc->rend = 0;
							sc->rdtag = kb->aux->kbm->reads->buffer[hit->tidx].tag;
							clear_basebank(sc->seq);
							nxt += kb->bsize - kb->bstep;
						}
					}
				}
			}
			// flush whatever windows remain open at the end of the cigar
			if(sl && sl->rend > sl->rbeg){
				u4i scidx;
				scidx = offset_layseqr(kb->seqs, sc);
				sl = _push_padding_ref_kbmblock(kb, sl);
				sc = ref_layseqr(kb->seqs, scidx);
				if(kb->lidx == MAX_U4){
					kb->lidx = offset_layseqr(kb->seqs, sl);
				}
				array_heap_push(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, offset_layseqr(kb->seqs, sl),
					num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
			}
			if(sc && sc->rend > sc->rbeg){
				// NOTE(review): sl may be NULL here, making offset_layseqr(kb->seqs, sl)
				// dubious; the result is only used to restore sl, which is not read
				// again afterwards — verify against layseqr's offset macro.
				u4i scidx;
				scidx = offset_layseqr(kb->seqs, sl);
				sc = _push_padding_ref_kbmblock(kb, sc);
				sl = ref_layseqr(kb->seqs, scidx);
				if(kb->lidx == MAX_U4){
					kb->lidx = offset_layseqr(kb->seqs, sc);
				}
				array_heap_push(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, offset_layseqr(kb->seqs, sc),
					num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
			}
		}
		if(kb->heap->size == 0) break; // nothing queued: iteration is done
		if(kb->lidx != MAX_U4){
			// only emit windows strictly behind the lookahead window
			sl = ref_layseqr(kb->seqs, kb->lidx);
			kb->sidx = kb->heap->buffer[0];
			sc = ref_layseqr(kb->seqs, kb->sidx);
			if(sc->chridx == sl->chridx && sc->bidx + 1 >= sl->bidx){
				kb->lidx = MAX_U4; // too close: refill more hits first
				continue;
			}
			array_heap_remove(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, 0,
				num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
			// convert absolute query coordinates to block-local ones
			sc->rbeg -= sc->bidx * kb->bstep;
			sc->rend -= sc->bidx * kb->bstep;
		} else {
			// hits exhausted: drain the heap unconditionally
			kb->sidx = kb->heap->buffer[0];
			array_heap_remove(kb->heap->buffer, kb->heap->size, kb->heap->cap, u4i, 0,
				num_cmpx(ref_layseqr(kb->seqs, a)->bidx, ref_layseqr(kb->seqs, b)->bidx, ref_layseqr(kb->seqs, a)->rdidx, ref_layseqr(kb->seqs, b)->rdidx));
			sc = ref_layseqr(kb->seqs, kb->sidx);
			sc->rbeg -= sc->bidx * kb->bstep;
			sc->rend -= sc->bidx * kb->bstep;
		}
		return sc;
	} while(1);
	kb->sidx = MAX_U4;
	return NULL;
}
  314. static inline void info_kbmblock(void *obj, lay_seq_t *sq, lay_blk_t *bk){
  315. KBMBlock *kb;
  316. kb = (KBMBlock*)obj;
  317. if(sq == NULL) return;
  318. bk->node1 = sq->bidx;
  319. bk->node2 = sq->bidx + 1;
  320. bk->reftag = kb->rdtag->string;
  321. bk->reflen = kb->rdlen;
  322. bk->refoff = sq->bidx * kb->bstep;
  323. }
/**
 * Correct one read by mapping it against the KBM index, building a POA
 * consensus from the alignments, then re-mapping the consensus.
 * Returns 1 on success (aux->hits then holds the consensus mappings),
 * 0 when the read is too short, has fewer than corr_min hits, or yields
 * an empty consensus.
 * corr_min/corr_max bound the number of alignments used; corr_cov scales
 * the minimum alignment length; layf, when non-NULL, receives the layout.
 * NOTE: the returned ctg is repaid to cc before returning — callers must
 * not retain it (see comment at the bottom).
 */
static inline int map_kbmpoa(CTGCNS *cc, KBMAux *aux, char *rdtag, u4i qidx, BaseBank *rdseq, u8i seqoff, u4i seqlen, u4i corr_min, u4i corr_max, float corr_cov, FILE *layf){
	ctg_cns_t *ctg;
	KBMBlock *kb;
	u4i i;
	int self_aln, max_hit, min_aln, min_mat;
	kb = (KBMBlock*)cc->obj;
	reset_ctgcns(cc, kb, iter_kbmblock, info_kbmblock);
	seqlen = kbm_cvt_length(seqlen);
	// too short to produce a useful correction
	if(seqlen < 4 * KBM_BIN_SIZE + UInt(aux->par->min_aln)) return 0;
	// self_aln: the read lives inside the index's own sequence bank
	if(rdseq && rdseq != aux->kbm->rdseqs){
		self_aln = 0;
	} else {
		self_aln = 1;
		rdseq = aux->kbm->rdseqs;
	}
	// save mapping parameters, tighten them for correction, restore after
	max_hit = aux->par->max_hit;
	min_aln = aux->par->min_aln;
	min_mat = aux->par->min_mat;
	aux->par->self_aln = 0;
	aux->par->max_hit = corr_max;
	aux->par->min_aln = num_max(seqlen * corr_cov, min_aln);
	aux->par->min_mat = num_max(aux->par->min_aln * aux->par->min_sim, min_mat);
	query_index_kbm(aux, rdtag, qidx, rdseq, seqoff, seqlen);
	map_kbm(aux);
	// NOTE(review): self_aln is forced to 0 here rather than restored to its
	// pre-call value (unlike max_hit/min_aln/min_mat) — confirm intentional.
	aux->par->self_aln = 0;
	aux->par->max_hit = max_hit;
	aux->par->min_aln = min_aln;
	aux->par->min_mat = min_mat;
	if(KBM_LOG){
		fprintf(KBM_LOGF, ">>> Contained alignments\n"); fflush(KBM_LOGF);
		for(i=0;i<aux->hits->size;i++){
			fprint_hit_kbm(aux, i, KBM_LOGF);
		}
	}
	if(aux->hits->size < corr_min){
		return 0; // not enough coverage to correct
	}
	if(self_aln){
		// mask the read's alignment against itself (mat==0 is skipped later)
		for(i=0;i<aux->hits->size;i++){
			if(aux->hits->buffer[i].tidx == qidx){
				aux->hits->buffer[i].mat = 0;
			}
		}
	}
	reset_kbmblock(kb, rdtag, qidx, rdseq, seqoff, seqlen, aux);
	if(layf){
		// dump the layout, then re-arm both iterators since printing consumed them
		print_lays_ctgcns(cc, layf);
		fflush(layf);
		reset_kbmblock(kb, rdtag, qidx, rdseq, seqoff, seqlen, aux);
		reset_ctgcns(cc, kb, iter_kbmblock, info_kbmblock);
	}
	if((ctg = iter_ctgcns(cc))){
		if(KBM_LOG){
			fprintf(KBM_LOGF, ">%s corrected\n", ctg->tag->string);
			print_lines_basebank(ctg->cns, 0, ctg->cns->size, KBM_LOGF, 100);
			fflush(KBM_LOGF);
		}
	} else {
		return 0;
	}
	clear_kbmmapv(aux->hits);
	// clamp/normalize consensus length to a KBM-compatible size
	if(ctg->cns->size > seqlen){
		ctg->cns->size = seqlen;
		normalize_basebank(ctg->cns);
	} else if(ctg->cns->size < seqlen){
		ctg->cns->size = kbm_cvt_length(ctg->cns->size);
		normalize_basebank(ctg->cns);
	}
	if(ctg->cns->size == 0){
		repay_ctgcns(cc, ctg);
		return 0;
	}
	// re-map the corrected sequence; results are left in aux->hits
	query_index_kbm(aux, rdtag, qidx, ctg->cns, 0, ctg->cns->size);
	map_kbm(aux);
	repay_ctgcns(cc, ctg); // Please make sure ctg is not used unless this function return
	return 1;
}
  401. #endif