tripoa.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. /*
  2. *
  3. * Copyright (c) 2018, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef TRI_PO_MSA_CNS_RJ_H
  20. #define TRI_PO_MSA_CNS_RJ_H
  21. #include "poacns.h"
  22. #include "ksw.h"
  23. typedef struct {
  24. SeqBank *seqs;
  25. u2v *rbegs, *rends;
  26. u4i longest_idx, seqmax;
  27. int8_t matrix[16];
  28. int fail_skip;
  29. u1i ksize; // 11
  30. float kdup; // 0.1
  31. float keqs; // 0.2
  32. uuhash *khash;
  33. u1v *qry, *ref;
  34. u2i winlen, winmin;
  35. u2v *regs[2];
  36. POG *pogs[3];
  37. u4v *kidxs;
  38. f4v *kords;
  39. BaseBank *cns;
  40. int is_tripog, refmode;
  41. int shuffle; // 0: no shuffling, 1: by kmer, 2, random
  42. } TriPOG;
  43. //static inline TriPOG* init_tripog(u4i seqmax, int refmode, int winlen, int winmin, int fail_skip, int M, int X, int I, int D, int W, int use_sse, int rW, u4i min_cnt, float min_freq){
  44. static inline TriPOG* init_tripog(u4i seqmax, int shuffle, int winlen, int winmin, int fail_skip, POGPar *par){
  45. TriPOG *tp;
  46. u4i i;
  47. tp = malloc(sizeof(TriPOG));
  48. tp->seqs = init_seqbank();
  49. tp->rbegs = init_u2v(32);
  50. tp->rends = init_u2v(32);
  51. tp->seqmax = seqmax;
  52. tp->longest_idx = 0;
  53. tp->ksize = 11;
  54. tp->kdup = 0.1;
  55. tp->keqs = 0.2;
  56. tp->khash = init_uuhash(13);
  57. tp->ref = init_u1v(winlen);
  58. tp->qry = init_u1v(1024);
  59. tp->regs[0] = init_u2v(32);
  60. tp->regs[1] = init_u2v(32);
  61. tp->refmode = par->refmode;
  62. tp->fail_skip = fail_skip;
  63. tp->winlen = winlen;
  64. tp->winmin = winmin;
  65. tp->pogs[0] = init_pog(*par);
  66. #if 0
  67. if(winlen > 0){
  68. tp->winlen = winlen;
  69. tp->winmin = winmin;
  70. tp->pogs[0] = init_pog(refmode, M, X, I, D, 0, 0, use_sse, 0, min_cnt, min_freq);
  71. } else {
  72. tp->winlen = 0;
  73. tp->winmin = 0;
  74. tp->pogs[0] = init_pog(refmode, M, X, I, D, W, - winlen, use_sse, rW, min_cnt, min_freq);
  75. }
  76. #endif
  77. tp->pogs[1] = init_pog(*par);
  78. tp->pogs[1]->par->W_score = 0;
  79. tp->pogs[2] = init_pog(*par);
  80. tp->pogs[2]->par->W_score = 0;
  81. //tp->pogs[1] = init_pog(refmode, M, X, I, D, W, 0, use_sse, rW, min_cnt, min_freq);
  82. //tp->pogs[2] = init_pog(refmode, M, X, I, D, W, 0, use_sse, rW, min_cnt, min_freq);
  83. //tp->pogs[1]->near_dialog = 1;
  84. //tp->pogs[2]->near_dialog = 1;
  85. tp->kidxs = init_u4v(32);
  86. tp->kords = init_f4v(32);
  87. tp->cns = init_basebank();
  88. for(i=0;i<16;i++){
  89. tp->matrix[i] = ((i / 4) == (i % 4))? par->M : par->X;
  90. }
  91. tp->is_tripog = 0;
  92. tp->shuffle = shuffle;
  93. return tp;
  94. }
  95. static inline void free_tripog(TriPOG *tp){
  96. free_seqbank(tp->seqs);
  97. free_u2v(tp->rbegs);
  98. free_u2v(tp->rends);
  99. free_uuhash(tp->khash);
  100. free_u1v(tp->qry);
  101. free_u1v(tp->ref);
  102. free_u2v(tp->regs[0]);
  103. free_u2v(tp->regs[1]);
  104. free_pog(tp->pogs[0]);
  105. free_pog(tp->pogs[1]);
  106. free_pog(tp->pogs[2]);
  107. free_u4v(tp->kidxs);
  108. free_f4v(tp->kords);
  109. free_basebank(tp->cns);
  110. free(tp);
  111. }
  112. static inline void beg_tripog(TriPOG *tp){
  113. clear_seqbank(tp->seqs);
  114. clear_u2v(tp->rbegs);
  115. clear_u2v(tp->rends);
  116. tp->longest_idx = 0;
  117. tp->is_tripog = 0;
  118. }
  119. static inline void push_tripog(TriPOG *tp, char *seq, u4i len, u2i refbeg, u2i refend){
  120. if(!tp->shuffle && tp->seqs->nseq >= tp->seqmax) return;
  121. push_seqbank(tp->seqs, NULL, 0, seq, len);
  122. push_u2v(tp->rbegs, refbeg);
  123. push_u2v(tp->rends, refend);
  124. if(tp->seqs->rdlens->buffer[tp->longest_idx] < len){
  125. tp->longest_idx = tp->seqs->nseq - 1;
  126. }
  127. }
  128. static inline void fwdbitpush_tripog(TriPOG *tp, u8i *bits, u8i off, u4i len, u2i refbeg, u2i refend){
  129. if(!tp->shuffle && tp->seqs->nseq >= tp->seqmax) return;
  130. fwdbitpush_seqbank(tp->seqs, NULL, 0, bits, off, len);
  131. push_u2v(tp->rbegs, refbeg);
  132. push_u2v(tp->rends, refend);
  133. if(tp->seqs->rdlens->buffer[tp->longest_idx] < len){
  134. tp->longest_idx = tp->seqs->nseq - 1;
  135. }
  136. }
  137. static inline void revbitpush_tripog(TriPOG *tp, u8i *bits, u8i off, u4i len, u2i refbeg, u2i refend){
  138. if(!tp->shuffle && tp->seqs->nseq >= tp->seqmax) return;
  139. revbitpush_seqbank(tp->seqs, NULL, 0, bits, off, len);
  140. push_u2v(tp->rbegs, refbeg);
  141. push_u2v(tp->rends, refend);
  142. if(tp->seqs->rdlens->buffer[tp->longest_idx] < len){
  143. tp->longest_idx = tp->seqs->nseq - 1;
  144. }
  145. }
  146. static inline void direct_run_tripog(TriPOG *tp){
  147. POG *g;
  148. u4i ridx;
  149. clear_basebank(tp->cns);
  150. if(tp->refmode){
  151. } else {
  152. tp->pogs[0]->par->W = tp->pogs[1]->par->W;
  153. tp->pogs[0]->par->rW = tp->pogs[1]->par->rW;
  154. tp->pogs[0]->par->W_score = tp->pogs[1]->par->W * tp->pogs[1]->par->M * 8;
  155. }
  156. g = tp->pogs[0];
  157. beg_pog(g);
  158. for(ridx=0;ridx<tp->seqs->nseq;ridx++){
  159. fwdbitpush_pog_core(g, tp->seqs->rdseqs->bits, tp->seqs->rdoffs->buffer[ridx], tp->seqs->rdlens->buffer[ridx], tp->rbegs->buffer[ridx], tp->rends->buffer[ridx]);
  160. }
  161. end_pog(g);
  162. if(g->cns->size == 0){
  163. fast_fwdbits2basebank(tp->cns, tp->seqs->rdseqs->bits, tp->seqs->rdoffs->buffer[tp->longest_idx], tp->seqs->rdlens->buffer[tp->longest_idx]);
  164. } else {
  165. fast_fwdbits2basebank(tp->cns, g->cns->bits, 0, g->cns->size);
  166. }
  167. if(tp->refmode){
  168. } else {
  169. tp->pogs[0]->par->W = 0;
  170. tp->pogs[0]->par->rW = 0;
  171. tp->pogs[0]->par->W_score = 0;
  172. }
  173. }
  174. static inline void shuffle_reads_by_kmers_tripog(TriPOG *tp){
  175. SeqBank *sb;
  176. uuhash *khash;
  177. u4v *kidxs;
  178. f4v *kords;
  179. u2v *rbegs, *rends;
  180. uuhash_t *u;
  181. u8i roff;
  182. u4i ridx, i, ksize, kmer, kmask, rlen, khit, mincnt;
  183. double logv;
  184. int exists;
  185. sb = tp->seqs;
  186. if(sb->nseq == 0) return;
  187. khash = tp->khash;
  188. kidxs = tp->kidxs;
  189. kords = tp->kords;
  190. rbegs = tp->rbegs;
  191. rends = tp->rends;
  192. ksize = tp->ksize;
  193. clear_u4v(kidxs);
  194. clear_f4v(kords);
  195. clear_uuhash(khash);
  196. kmask = MAX_U4 >> ((16 - ksize) << 1);
  197. mincnt = tp->refmode? 1 : 2;
  198. for(ridx=0;ridx<sb->nseq;ridx++){
  199. rlen = sb->rdlens->buffer[ridx];
  200. kmer = 0;
  201. roff = sb->rdoffs->buffer[ridx];
  202. for(i=0;i<rlen;i++){
  203. kmer = ((kmer << 2) | get_basebank(sb->rdseqs, roff + i)) & kmask;
  204. if(i + 1 < ksize) continue;
  205. u = prepare_uuhash(khash, kmer, &exists);
  206. if(exists){
  207. if(((u->val >> 16) & 0x7FFFU) == ridx + 1){
  208. u->val |= 1U << 31;
  209. } else {
  210. u->val = (u->val & 0x8000FFFFU) | ((ridx + 1) << 16);
  211. }
  212. u->val ++;
  213. } else {
  214. u->key = kmer;
  215. u->val = (0U << 31) | ((ridx + 1) << 16) | 1;
  216. }
  217. }
  218. if(tp->refmode) break;
  219. }
  220. logv = log(1.2);
  221. for(ridx=0;ridx<sb->nseq;ridx++){
  222. rlen = sb->rdlens->buffer[ridx];
  223. kmer = 0;
  224. roff = sb->rdoffs->buffer[ridx];
  225. khit = 0;
  226. for(i=0;i<rlen;i++){
  227. kmer = ((kmer << 2) | get_basebank(sb->rdseqs, roff + i)) & kmask;
  228. if(i + 1 < ksize) continue;
  229. u = get_uuhash(khash, kmer);
  230. if(u && (u->val & 0x80000000U) == 0 && (u->val & 0xFFFFU) >= mincnt){
  231. khit ++;
  232. }
  233. }
  234. if(tp->refmode){
  235. if(ridx == 0){
  236. push_f4v(kords, 3e+38F);
  237. } else {
  238. push_f4v(kords, ((double)khit) * logv / log(num_max(rlen, sb->rdlens->buffer[0])));
  239. }
  240. } else {
  241. push_f4v(kords, ((double)khit) * logv / log(rlen));
  242. }
  243. push_u4v(kidxs, ridx);
  244. }
  245. sort_array(kidxs->buffer, kidxs->size, u4i, num_cmpgt(kords->buffer[b], kords->buffer[a]));
  246. if(cns_debug > 1){
  247. for(i=0;i<kidxs->size;i++){
  248. fprintf(stderr, "SHUFFLE[%u] %u\t%u\t%0.4f\n", i, kidxs->buffer[i], sb->rdlens->buffer[kidxs->buffer[i]], kords->buffer[kidxs->buffer[i]]);
  249. }
  250. }
  251. for(i=0;i<kidxs->size;i++){
  252. push_u8v(sb->rdoffs, sb->rdoffs->buffer[kidxs->buffer[i]]);
  253. push_u4v(sb->rdlens, sb->rdlens->buffer[kidxs->buffer[i]]);
  254. push_u2v(rbegs, rbegs->buffer[kidxs->buffer[i]]);
  255. push_u2v(rends, rends->buffer[kidxs->buffer[i]]);
  256. }
  257. remove_array_u8v(sb->rdoffs, 0, kidxs->size);
  258. remove_array_u4v(sb->rdlens, 0, kidxs->size);
  259. remove_array_u2v(rbegs, 0, kidxs->size);
  260. remove_array_u2v(rends, 0, kidxs->size);
  261. if(tp->seqmax && sb->nseq > tp->seqmax){
  262. if(cns_debug > 1){
  263. fprintf(stderr, "SEQMAX: %u -> %u\n", sb->nseq, tp->seqmax);
  264. }
  265. sb->nseq = tp->seqmax;
  266. sb->rdoffs->size = tp->seqmax;
  267. sb->rdlens->size = tp->seqmax;
  268. rbegs->size = tp->seqmax;
  269. rends->size = tp->seqmax;
  270. }
  271. }
  272. static inline void subsample_reads_tripog(TriPOG *tp){
  273. SeqBank *sb;
  274. u4v *kidxs;
  275. u2v *rbegs, *rends;
  276. f4v *kords;
  277. u4i ridx, i;
  278. f4i cutoff;
  279. sb = tp->seqs;
  280. if(sb->nseq == 0 || sb->nseq <= tp->seqmax) return;
  281. kidxs = tp->kidxs;
  282. kords = tp->kords;
  283. rbegs = tp->rbegs;
  284. rends = tp->rends;
  285. clear_u4v(kidxs);
  286. clear_f4v(kords);
  287. if(tp->refmode){
  288. push_u4v(kidxs, 0);
  289. push_f4v(kords, 200.0);
  290. ridx = 1;
  291. } else {
  292. ridx = 0;
  293. }
  294. for(;ridx<sb->nseq;ridx++){
  295. push_u4v(kidxs, ridx);
  296. push_f4v(kords, 100.0 * drand48());
  297. }
  298. sort_array(kidxs->buffer, kidxs->size, u4i, num_cmpgt(kords->buffer[b], kords->buffer[a]));
  299. cutoff = kords->buffer[kidxs->buffer[tp->seqmax - 1]];
  300. clear_u4v(kidxs);
  301. for(i=0;i<kords->size;i++){
  302. if(kords->buffer[i] >= cutoff){
  303. push_u4v(kidxs, i);
  304. }
  305. }
  306. if(cns_debug > 1){
  307. fprintf(stderr, "RANDOM CUTOFF = %f\n", cutoff);
  308. for(i=0;i<kidxs->size;i++){
  309. fprintf(stderr, "SHUFFLE[%u] %u\t%0.4f\n", i, kidxs->buffer[i], kords->buffer[kidxs->buffer[i]]);
  310. }
  311. }
  312. for(i=0;i<kidxs->size;i++){
  313. push_u8v(sb->rdoffs, sb->rdoffs->buffer[kidxs->buffer[i]]);
  314. push_u4v(sb->rdlens, sb->rdlens->buffer[kidxs->buffer[i]]);
  315. push_u2v(rbegs, rbegs->buffer[kidxs->buffer[i]]);
  316. push_u2v(rends, rends->buffer[kidxs->buffer[i]]);
  317. }
  318. remove_array_u8v(sb->rdoffs, 0, kords->size);
  319. remove_array_u4v(sb->rdlens, 0, kords->size);
  320. remove_array_u2v(rbegs, 0, kords->size);
  321. remove_array_u2v(rends, 0, kords->size);
  322. if(cns_debug > 1){
  323. fprintf(stderr, "SEQMAX: %u -> %u\n", sb->nseq, tp->seqmax);
  324. }
  325. sb->nseq = tp->seqmax;
  326. }
  327. static inline void end_tripog(TriPOG *tp){
  328. POG *g;
  329. kswr_t R;
  330. u4i ridx, rlen, b, e, failed;
  331. switch(tp->shuffle){
  332. case 1: shuffle_reads_by_kmers_tripog(tp); break;
  333. case 2: subsample_reads_tripog(tp); break;
  334. }
  335. if(tp->winlen == 0){
  336. return direct_run_tripog(tp);
  337. }
  338. tp->longest_idx = 0;
  339. for(ridx=1;ridx<tp->seqs->nseq;ridx++){
  340. if(tp->seqs->rdlens->buffer[ridx] > tp->seqs->rdlens->buffer[tp->longest_idx]){
  341. tp->longest_idx = ridx;
  342. }
  343. }
  344. if(tp->seqs->nseq < 2){
  345. return direct_run_tripog(tp);
  346. }
  347. rlen = tp->seqs->rdlens->buffer[0];
  348. if(rlen < 2 * tp->winlen){
  349. return direct_run_tripog(tp);
  350. }
  351. clear_u2v(tp->regs[0]);
  352. clear_u2v(tp->regs[1]);
  353. // selecting unique window
  354. {
  355. uuhash_t *u;
  356. u4i i, j, hit, nb, kmer, kcnt, kdup, keqs, ktot, kmask, roff;
  357. reset_iter_uuhash(tp->khash);
  358. while((u = ref_iter_uuhash(tp->khash))){
  359. u->val &= 0xFFFFU;
  360. }
  361. kmask = MAX_U4 >> ((16 - tp->ksize) << 1);
  362. for(ridx=0;ridx<1;ridx++){
  363. rlen = tp->seqs->rdlens->buffer[ridx];
  364. kmer = 0;
  365. roff = tp->seqs->rdoffs->buffer[ridx];
  366. for(i=0;i<rlen;i++){
  367. kmer = ((kmer << 2) | get_basebank(tp->seqs->rdseqs, roff + i)) & kmask;
  368. if(i + 1 < tp->ksize) continue;
  369. u = get_uuhash(tp->khash, kmer);
  370. u->val += 1U << 16;
  371. }
  372. }
  373. nb = ((rlen - tp->winlen) * 2 / 3 / tp->winlen / 2) * 2;
  374. b = (rlen - tp->winlen) / 2;
  375. hit = 0;
  376. roff = tp->seqs->rdoffs->buffer[0];
  377. for(j=0;j<nb;){
  378. e = b + tp->winlen;
  379. kmer = 0;
  380. kdup = 0;
  381. keqs = 0;
  382. ktot = e - b + 1 - tp->ksize;
  383. for(i=b;i<e;i++){
  384. kmer = ((kmer << 2) | get_basebank(tp->seqs->rdseqs, roff + i)) & kmask;
  385. if(i + 1 - b < tp->ksize) continue;
  386. kcnt = getval_uuhash(tp->khash, kmer);
  387. if((kcnt >> 16) > 1){
  388. kdup ++;
  389. } else if((kcnt & 0xFFFFU) > 1){
  390. keqs ++;
  391. }
  392. }
  393. if(cns_debug > 1){
  394. fprintf(stderr, "Selecting anchor[%4d,%4d]: ktot=%d keqs=%d kdup=%d\n", b, e, ktot, keqs, kdup);
  395. }
  396. if(kdup < UInt(tp->kdup * ktot) && keqs >= UInt(tp->keqs * ktot)){
  397. hit = 1;
  398. break;
  399. }
  400. j ++;
  401. if(j & 0x01){
  402. b = (rlen - tp->winlen) / 2 + tp->winlen * ((j + 1) >> 1);
  403. } else {
  404. b = (rlen - tp->winlen) / 2 - tp->winlen * ((j + 1) >> 1);
  405. }
  406. }
  407. if(hit == 0){
  408. return direct_run_tripog(tp);
  409. }
  410. }
  411. push_u2v(tp->regs[0], b);
  412. push_u2v(tp->regs[1], e);
  413. failed = 0;
  414. for(ridx=1;ridx<tp->seqs->nseq;ridx++){
  415. clear_and_encap_u1v(tp->qry, tp->winlen);
  416. bitseq_basebank(tp->seqs->rdseqs, tp->seqs->rdoffs->buffer[0] + b, tp->winlen, tp->qry->buffer);
  417. rlen = tp->seqs->rdlens->buffer[ridx];
  418. clear_and_encap_u1v(tp->ref, rlen);
  419. bitseq_basebank(tp->seqs->rdseqs, tp->seqs->rdoffs->buffer[ridx], rlen, tp->ref->buffer);
  420. R = ksw_align(tp->winlen, tp->qry->buffer, rlen, tp->ref->buffer, 4, tp->matrix, - (tp->pogs[0]->par->I + tp->pogs[0]->par->D)/2, 1, KSW_XSTART, NULL);
  421. if(R.qb <= -1 || R.tb <= -1 || R.qe <= -1 || R.te <= -1){
  422. if(cns_debug > 1){
  423. fprintf(stderr, "FAILED_ALIGN: READ%u [%d,%d=%d][%d,%d=%d]\n", ridx, R.qb, R.qe, R.qe - R.qb, R.tb, R.te, R.te - R.tb);
  424. }
  425. if(tp->fail_skip){
  426. push_u2v(tp->regs[0], MAX_U2);
  427. push_u2v(tp->regs[1], MAX_U2);
  428. failed ++;
  429. continue;
  430. } else {
  431. return direct_run_tripog(tp);
  432. }
  433. }
  434. if(R.qe + 1 - R.qb < tp->winmin || R.te + 1 - R.tb < tp->winmin){
  435. if(cns_debug > 1){
  436. fprintf(stderr, "FAILED_ALIGN: READ%u [%d,%d=%d][%d,%d=%d]\n", ridx, R.qb, R.qe, R.qe - R.qb, R.tb, R.te, R.te - R.tb);
  437. }
  438. if(tp->fail_skip){
  439. push_u2v(tp->regs[0], MAX_U2);
  440. push_u2v(tp->regs[1], MAX_U2);
  441. failed ++;
  442. continue;
  443. } else {
  444. return direct_run_tripog(tp);
  445. }
  446. }
  447. push_u2v(tp->regs[0], R.tb);
  448. push_u2v(tp->regs[1], R.te + 1);
  449. }
  450. if(failed * 2 >= tp->seqs->nseq){
  451. return direct_run_tripog(tp);
  452. }
  453. // building cns for fast aligned regions
  454. g = tp->pogs[0];
  455. g->par->alnmode = POG_ALNMODE_GLOBAL;
  456. beg_pog(g);
  457. for(ridx=0;ridx<tp->seqs->nseq;ridx++){
  458. if(tp->regs[0]->buffer[ridx] == MAX_U2) continue;
  459. fwdbitpush_pog(g, tp->seqs->rdseqs->bits, tp->seqs->rdoffs->buffer[ridx] + tp->regs[0]->buffer[ridx], tp->regs[1]->buffer[ridx] - tp->regs[0]->buffer[ridx]);
  460. }
  461. if(0){
  462. print_seqs_pog(g, "p0.fa", NULL);
  463. }
  464. end_pog(g);
  465. g->par->alnmode = POG_ALNMODE_OVERLAP;
  466. // finding a best break point
  467. {
  468. u2v *rs;
  469. u1i *s;
  470. u2i i, idx, bst, bsi, cnt;
  471. rs = init_u2v(256);
  472. bst = 0;
  473. bsi = MAX_U2;
  474. s = g->msa->buffer + g->msa_len * g->seqs->nseq;
  475. for(idx=0;idx<g->msa_len;idx++){
  476. if(s[idx] == 4) continue;
  477. cnt = 0;
  478. for(ridx=0;ridx<g->seqs->nseq;ridx++){
  479. if(g->msa->buffer[g->msa_len * ridx + idx] == s[idx]) cnt ++;
  480. }
  481. if(cnt > bst){
  482. bst = cnt;
  483. bsi = idx;
  484. clear_u2v(rs);
  485. push_u2v(rs, idx);
  486. } else if(cnt == bst){
  487. push_u2v(rs, idx);
  488. }
  489. }
  490. if(rs->size){
  491. bsi = rs->buffer[rs->size / 2];
  492. }
  493. free_u2v(rs);
  494. if(bsi == MAX_U2){
  495. return direct_run_tripog(tp);
  496. }
  497. if(cns_debug > 1){
  498. fprintf(stderr, "BREAKPOINT[MSA]: %u/%u\n", bsi, g->msa_len);
  499. }
  500. // transform coordinate
  501. i = 0;
  502. for(ridx=0;ridx<tp->seqs->nseq;ridx++){
  503. if(tp->regs[0]->buffer[ridx] == MAX_U2) continue;
  504. cnt = 0;
  505. s = g->msa->buffer + g->msa_len * i;
  506. for(idx=0;idx<bsi;idx++){
  507. if(s[idx] != 4) cnt ++;
  508. }
  509. tp->regs[0]->buffer[ridx] = tp->regs[0]->buffer[ridx] + cnt;
  510. tp->regs[1]->buffer[ridx] = tp->regs[0]->buffer[ridx];
  511. if(cns_debug > 1){
  512. fprintf(stderr, "BREAKPOINT[%u:%u]: %u/%u\n", i, ridx, tp->regs[0]->buffer[ridx], tp->seqs->rdlens->buffer[ridx]);
  513. }
  514. i ++;
  515. }
  516. }
  517. // forward cns to seqs' ends
  518. g = tp->pogs[2];
  519. beg_pog(g);
  520. for(ridx=0;ridx<tp->seqs->nseq;ridx++){
  521. if(tp->regs[0]->buffer[ridx] == MAX_U2) continue;
  522. fwdbitpush_pog(g, tp->seqs->rdseqs->bits, tp->seqs->rdoffs->buffer[ridx] + tp->regs[1]->buffer[ridx], tp->seqs->rdlens->buffer[ridx] - tp->regs[1]->buffer[ridx]);
  523. }
  524. if(0){
  525. print_seqs_pog(g, "p2.fa", NULL);
  526. }
  527. end_pog(g);
  528. if(g->cns->size == 0){
  529. return direct_run_tripog(tp);
  530. }
  531. // backward cns to seqs' begs
  532. g = tp->pogs[1];
  533. beg_pog(g);
  534. for(ridx=0;ridx<tp->seqs->nseq;ridx++){
  535. if(tp->regs[0]->buffer[ridx] == MAX_U2) continue;
  536. revbitpush_pog(g, tp->seqs->rdseqs->bits, tp->seqs->rdoffs->buffer[ridx], tp->regs[0]->buffer[ridx]);
  537. }
  538. if(0){
  539. print_seqs_pog(g, "p1.fa", NULL);
  540. }
  541. end_pog(g);
  542. if(g->cns->size == 0){
  543. return direct_run_tripog(tp);
  544. }
  545. // merge two parts
  546. clear_basebank(tp->cns);
  547. fast_revbits2basebank(tp->cns, tp->pogs[1]->cns->bits, 0, tp->pogs[1]->cns->size);
  548. fast_fwdbits2basebank(tp->cns, tp->pogs[2]->cns->bits, 0, tp->pogs[2]->cns->size);
  549. tp->is_tripog = 1;
  550. }
  551. #endif