dagcns.h 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899
  1. /*
  2. *
  3. * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "dna.h"
  20. #include "chararray.h"
  21. #include "list.h"
  22. #include "queue.h"
  23. #include "hashset.h"
  24. #include "general_graph.h"
  25. #include <math.h>
  26. #include <float.h>
  27. #ifndef PACBIO_PROBS_DAGCNS_RJ_H
  28. #define PACBIO_PROBS_DAGCNS_RJ_H
  29. #define DAGCNS_MAX_LEN 0x3FFF // 16k
  30. static int dagcns_debug = 0;
  31. typedef struct {
  32. int x, y;
  33. u4i dnidx;
  34. f8i probs[2];
  35. } bdp_node_t;
  36. define_list(bdpnodev, bdp_node_t);
  37. typedef struct {
  38. f8i prob;
  39. u1i cigar, base;
  40. } bdp_edge_t;
  41. define_list(bdpedgev, bdp_edge_t);
  42. define_simple_geg_callback(bdp, bdpnodev, bdp_node_t, bdpedgev, bdp_edge_t);
  43. typedef struct {
  44. u4i gnidx, dnidx;
  45. } bdp_link_t;
  46. define_list(bdplinkv, bdp_link_t);
  47. typedef struct {
  48. uint32_t nodes[2];
  49. uint32_t links[2];
  50. uint32_t cov:28, visit:1, closed:1, cns:1, alt:1;
  51. double score;
  52. } dagedge_t;
  53. define_list(dagedgev, dagedge_t);
  54. #define NODE_MAX_FW_EDGE 0xFFFFFFFFU
  55. typedef struct dagnode_t {
  56. uint32_t pos:28, base:2, cns:1, visit:1;
  57. uint32_t fw_edge;
  58. uint32_t edges[2];
  59. f8i aux;
  60. } dagnode_t;
  61. define_list(dagnodev, dagnode_t);
  62. typedef struct {
  63. uint32_t pos;
  64. uint32_t bases[4];
  65. } dagsnp_t;
  66. define_list(dagsnpv, dagsnp_t);
  67. typedef struct {
  68. u8list *cns;
  69. u32list *deps;
  70. dagnodev *nodes;
  71. dagedgev *edges;
  72. u32list *trash;
  73. String *alns[2];
  74. int W, M, X, I, D, E;
  75. f4i pM, pX, pI, pD; // log(prob_M)
  76. double ref_penalty, alt_penalty; // 0.5 and 0.2
  77. double cns_score;
  78. uint32_t cns_head, backbone_size;
  79. } DAGCNS;
  80. static inline DAGCNS* init_dagcns(int W, int M, int X, int I, int D, int E, f4i pM, f4i pX, f4i pI, f4i pD){
  81. DAGCNS *g;
  82. g = malloc(sizeof(DAGCNS));
  83. g->cns = init_u8list(1024);
  84. g->deps = init_u32list(1024);
  85. g->nodes = init_dagnodev(1024);
  86. g->edges = init_dagedgev(1024);
  87. g->trash = init_u32list(1024);
  88. g->alns[0] = init_string(1024);
  89. g->alns[1] = init_string(1024);
  90. g->cns_score = 0;
  91. g->cns_head = 0xFFFFFFFFU;
  92. g->backbone_size = 0;
  93. g->W = W;
  94. g->M = M;
  95. g->X = X;
  96. g->I = I;
  97. g->D = D;
  98. g->E = E;
  99. g->pM = pM;
  100. g->pX = pX;
  101. g->pI = pI;
  102. g->pD = pD;
  103. g->ref_penalty = 0.5;
  104. g->alt_penalty = 0.2;
  105. return g;
  106. }
  107. static inline void free_dagcns(DAGCNS *g){
  108. free_dagnodev(g->nodes);
  109. free_dagedgev(g->edges);
  110. free_u32list(g->trash);
  111. free_u8list(g->cns);
  112. free_u32list(g->deps);
  113. free_string(g->alns[0]);
  114. free_string(g->alns[1]);
  115. free(g);
  116. }
  117. static inline void reset_dagcns(DAGCNS *g){
  118. clear_dagnodev(g->nodes);
  119. clear_dagedgev(g->edges);
  120. clear_u32list(g->trash);
  121. clear_u8list(g->cns);
  122. clear_u32list(g->deps);
  123. g->cns_score = 0;
  124. g->cns_head = 0xFFFFFFFFU;
  125. g->backbone_size = 0;
  126. }
  127. static uint32_t prepare_node_dagcns(DAGCNS *g, uint32_t pos, uint8_t base){
  128. dagnode_t *n;
  129. n = next_ref_dagnodev(g->nodes);
  130. n->pos = pos;
  131. n->base = base;
  132. n->cns = 0;
  133. n->aux = 0;
  134. n->visit = 0;
  135. n->edges[0] = 0xFFFFFFFFU;
  136. n->edges[1] = 0xFFFFFFFFU;
  137. n->fw_edge = NODE_MAX_FW_EDGE;
  138. return g->nodes->size - 1;
  139. }
  140. static inline dagedge_t* find_edge_by_node_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
  141. dagnode_t *n;
  142. dagedge_t *e;
  143. n = ref_dagnodev(g->nodes, nid1);
  144. if(n->edges[dir] != 0xFFFFFFFFU){
  145. e = ref_dagedgev(g->edges, n->edges[dir]);
  146. while(1){
  147. if(e->nodes[dir] == nid2) return e;
  148. if(e->links[dir] == 0xFFFFFFFFU) break;
  149. e = ref_dagedgev(g->edges, e->links[dir]);
  150. }
  151. }
  152. return NULL;
  153. }
  154. static inline dagedge_t* add_edge_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
  155. dagnode_t *n;
  156. dagedge_t *e;
  157. uint32_t eid;
  158. n = ref_dagnodev(g->nodes, nid1);
  159. if(pop_u32list(g->trash, &eid)){
  160. e = ref_dagedgev(g->edges, eid);
  161. } else {
  162. eid = g->edges->size;
  163. e = next_ref_dagedgev(g->edges);
  164. }
  165. e->nodes[!dir] = nid1;
  166. e->nodes[dir] = nid2;
  167. e->cov = 1;
  168. e->score = 0;
  169. e->visit = 0;
  170. e->closed = 0;
  171. e->cns = 0;
  172. e->alt = 0;
  173. e->links[dir] = n->edges[dir];
  174. n->edges[dir] = eid;
  175. n = ref_dagnodev(g->nodes, nid2);
  176. e->links[!dir] = n->edges[!dir];
  177. n->edges[!dir] = eid;
  178. return e;
  179. }
  180. static inline dagedge_t* prepare_edge_dagcns(DAGCNS *g, uint32_t nid1, uint32_t nid2, int dir){
  181. dagedge_t *e;
  182. e = find_edge_by_node_dagcns(g, nid1, nid2, dir);
  183. if(e){ e->cov ++; return e; }
  184. return add_edge_dagcns(g, nid1, nid2, dir);
  185. }
  186. static inline void gen_pregraph_dagcns(DAGCNS *g){
  187. dagedge_t *e;
  188. uint32_t i;
  189. clear_dagnodev(g->nodes);
  190. clear_dagedgev(g->edges);
  191. clear_u32list(g->trash);
  192. clear_u32list(g->deps);
  193. g->backbone_size = g->cns->size;
  194. for(i=0;i<g->cns->size;i++){
  195. push_u32list(g->deps, 0);
  196. prepare_node_dagcns(g, i, g->cns->buffer[i]);
  197. if(i){ // make sure the graph is conntective even the alignment is partial
  198. e = add_edge_dagcns(g, i - 1, i, 0);
  199. e->cov = 0;
  200. }
  201. }
  202. }
  203. static inline int remove_edge_dagcns(DAGCNS *g, uint32_t eid){
  204. dagnode_t *n;
  205. dagedge_t *e;
  206. uint32_t i, lst;
  207. for(i=0;i<2;i++){
  208. e = ref_dagedgev(g->edges, eid);
  209. lst = e->links[i];
  210. n = ref_dagnodev(g->nodes, e->nodes[!i]);
  211. if(n->edges[i] == eid){
  212. n->edges[i] = lst;
  213. } else if(n->edges[i] == 0xFFFFFFFFU){
  214. //fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort();
  215. return 0;
  216. } else {
  217. e = ref_dagedgev(g->edges, n->edges[i]);
  218. while(1){
  219. if(e->links[i] == eid){
  220. e->links[i] = lst; break;
  221. } else if(e->links[i] == 0xFFFFFFFFU){
  222. //fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort();
  223. return 0;
  224. } else {
  225. e = ref_dagedgev(g->edges, e->links[i]);
  226. }
  227. }
  228. }
  229. }
  230. push_u32list(g->trash, eid);
  231. return 1;
  232. }
  233. #define MIN_SCORE -0x0FFFFFFF
  234. static inline f8i log_sum(f8i a, f8i b){
  235. f8i c;
  236. c = num_max(a, b);
  237. if(c - a >= 10 || c - b >= 10) return c;
  238. return logl(expl(a - c) + expl(b - c)) + c;
  239. }
  240. typedef struct {
  241. int x, y, d;
  242. } bdp_beg_t;
  243. define_list(bdpbegv, bdp_beg_t);
  244. static inline void fprint_dot_bdpgraph(GEGraph *g, bdpnodev *bnodes, bdpedgev *bedges, char *prefix, char *suffix){
  245. static const char *colors[2][2] = {{"blue", "green"}, {"red", "gray"}};
  246. FILE *out;
  247. ge_node_t *n;
  248. bdp_node_t *bn;
  249. ge_edge_t *e;
  250. bdp_edge_t *be;
  251. u8i i;
  252. out = open_file_for_write(prefix, suffix, 1);
  253. fprintf(out, "digraph {\nnode [shape=record]\nrankdir=LR\n");
  254. for(i=0;i<g->nodes->size;i++){
  255. n = ref_genodev(g->nodes, i);
  256. if(n->closed) continue;
  257. bn = ref_bdpnodev(bnodes, offset_genodev(g->nodes, n));
  258. fprintf(out, " N%llu [label=\"{N%llu|%d|%d}|{%.4Lf|%.4Lf}\"]\n", i, i, bn->x, bn->y, bn->probs[0], bn->probs[1]);
  259. }
  260. for(i=1;i<g->edges->size;i++){
  261. e = ref_geedgev(g->edges, i);
  262. if(e->closed) continue;
  263. be = ref_bdpedgev(bedges, offset_geedgev(g->edges, e));
  264. fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%c:%.4Lf\" color=%s]\n", (u8i)e->node1, (u8i)e->node2, "+-"[e->dir1], "+-"[e->dir2], e->cov, "MIDX"[be->cigar], be->prob, colors[e->dir1][e->dir2]);
  265. fprintf(out, " N%llu -> N%llu [label=\"%c%c:%d:%c:%.4Lf\" color=%s]\n", (u8i)e->node2, (u8i)e->node1, "-+"[e->dir2], "-+"[e->dir1], e->cov, "MIDX"[be->cigar], be->prob, colors[!e->dir2][!e->dir1]);
  266. }
  267. fprintf(out, "}\n");
  268. fclose(out);
  269. }
  270. static inline u4i dp_matrix2alignment_graph(DAGCNS *dag, u1i *query, u1i *z, int x, int y, GEGraph *g, bdpnodev *bnodes, bdpedgev *bedges){
  271. ge_node_t *gn, *gn2;
  272. ge_edge_ref_t *gf;
  273. ge_edge_t *ge;
  274. bdp_node_t *bn, *bn2;
  275. bdp_edge_t *be;
  276. bdp_beg_t BEG;
  277. UUhash *bnhash;
  278. UUhash_t *u;
  279. bdpbegv *stack;
  280. f8i cigar_probs[4], curator;
  281. u8v *idxs;
  282. u1i *target;
  283. u4i i, k, nidx, nlst, nbegs[2], visit, ret;
  284. int d, f, n_col, ops[2], base, exists;
  285. cigar_probs[0] = dag->pM;
  286. cigar_probs[1] = dag->pI;
  287. cigar_probs[2] = dag->pD;
  288. cigar_probs[3] = dag->pX;
  289. n_col = dag->cns->size;
  290. target = dag->cns->buffer;
  291. bdp_set_callbacks_gegraph(g, bnodes, bedges);
  292. reset_gegraph(g);
  293. stack = init_bdpbegv(4);
  294. push_bdpbegv(stack, (bdp_beg_t){x, y, 0});
  295. idxs = init_u8v(4);
  296. nbegs[0] = nbegs[1] = 0;
  297. bnhash = init_UUhash(1023);
  298. ret = 0;
  299. while(stack->size){
  300. BEG = stack->buffer[--stack->size];
  301. u = prepare_UUhash(bnhash, (((b8i)BEG.x) << 32) | BEG.y, &exists);
  302. if(exists){
  303. nidx = u->val;
  304. } else {
  305. gn = add_node_gegraph(g);
  306. nidx = offset_genodev(g->nodes, gn);
  307. u->key = (((b8i)BEG.x) << 32) | BEG.y;
  308. u->val = nidx;
  309. bn = ref_bdpnodev(bnodes, nidx);
  310. bn->x = BEG.x; bn->y = BEG.y;
  311. bn->dnidx = MAX_VALUE_U4;
  312. }
  313. nlst = nidx;
  314. x = BEG.x; y = BEG.y;
  315. d = BEG.d;
  316. f = 0;
  317. clear_u8v(idxs);
  318. while(x >= 0 && y >= 0){
  319. d = (z[x * n_col + y] >> (d << 1)) & 0x03;
  320. if(d == 0){
  321. if(query[x] == target[y]){
  322. if(z[x * n_col + y] & (1 << 6)){
  323. f = 1;
  324. break;
  325. } else if(BEG.d == 0){
  326. z[x * n_col + y] |= 1 << 6;
  327. push_bdpbegv(stack, (bdp_beg_t){x, y, 1});
  328. push_bdpbegv(stack, (bdp_beg_t){x, y, 2});
  329. }
  330. ops[0] = 0; ops[1] = 3;
  331. } else {
  332. ops[0] = 1; ops[1] = 2;
  333. }
  334. //x --; y --;
  335. } else if(d == 1){
  336. ops[0] = 1; ops[1] = 3;
  337. //x --;
  338. } else {
  339. ops[0] = 2; ops[1] = 3;
  340. //y --;
  341. }
  342. for(i=0;i<2;i++){
  343. if(ops[i] == 3) break;
  344. base = query[x];
  345. if(ops[i] == 0){ x --; y --; }
  346. else if(ops[i] == 1){ x --; }
  347. else { y --; }
  348. u = prepare_UUhash(bnhash, (((b8i)x) << 32) | y, &exists);
  349. if(exists){
  350. nidx = u->val;
  351. } else {
  352. gn = add_node_gegraph(g);
  353. nidx = offset_genodev(g->nodes, gn);
  354. u->key = (((b8i)x) << 32) | y;
  355. u->val = nidx;
  356. bn = ref_bdpnodev(bnodes, nidx);
  357. bn->x = x; bn->y = y;
  358. bn->dnidx = MAX_VALUE_U4;
  359. }
  360. ge = prepare_edge_gegraph(g, nlst, 1, nidx, 1, &exists);
  361. if(exists){
  362. ge->cov ++;
  363. } else {
  364. ge->cov = 1;
  365. be = ref_bdpedgev(bedges, offset_geedgev(g->edges, ge));
  366. be->cigar = ops[i];
  367. be->base = base;
  368. }
  369. nlst = nidx;
  370. if(BEG.d) push_u8v(idxs, offset_geedgev(g->edges, ge));
  371. }
  372. }
  373. if(BEG.d){
  374. if(f == 0){
  375. // not a bubble
  376. for(i=0;i<idxs->size;i++){
  377. ge = ref_geedgev(g->edges, idxs->buffer[i]);
  378. ge->cov --;
  379. if(ge->cov == 0) cut_edge_gegraph(g, ge);
  380. }
  381. } else ret ++;
  382. } else {
  383. ret ++;
  384. nbegs[0] = g->nodes->size > 1? g->nodes->size - 2 : 0;
  385. nbegs[1] = 0;
  386. }
  387. }
  388. free_bdpbegv(stack);
  389. free_UUhash(bnhash);
  390. u4i cnt = 0;
  391. for(i=0;i<g->nodes->size;i++){
  392. gn = ref_genodev(g->nodes, i);
  393. if(gn->edges[0].cnt || gn->edges[1].cnt){ cnt++; continue; }
  394. gn->closed = 1;
  395. }
  396. // calculate probabilities (forward + backward)
  397. if(nbegs[0] == nbegs[1]){
  398. free_u8v(idxs);
  399. return ret;
  400. }
  401. for(k=0;k<2;k++){
  402. bn = ref_bdpnodev(bnodes, nbegs[k]);
  403. bn->probs[k] = 0;
  404. clear_u8v(idxs);
  405. push_u8v(idxs, nbegs[k]);
  406. visit = k + 1;
  407. while(idxs->size){
  408. gn = ref_genodev(g->nodes, idxs->buffer[idxs->size - 1]);
  409. bn = ref_bdpnodev(bnodes, idxs->buffer[idxs->size - 1]);
  410. idxs->size --;
  411. geg_beg_iter_edges(g, gn, k, gf, ge);
  412. if(ge->closed) continue;
  413. be = ref_bdpedgev(bedges, offset_geedgev(g->edges, ge));
  414. gn2 = ref_genodev(g->nodes, gf->flg? ge->node1 : ge->node2);
  415. bn2 = ref_bdpnodev(bnodes, gf->flg? ge->node1 : ge->node2);
  416. if(gn2->bt_visit == visit){
  417. bn2->probs[k] = log_sum(bn2->probs[k], bn->probs[k] + cigar_probs[be->cigar]); // p1 + p2
  418. } else {
  419. gn2->bt_visit = visit;
  420. gn2->unvisit = gn2->edges[!k].cnt;
  421. bn2->probs[k] = bn->probs[k] + cigar_probs[be->cigar]; // p1 * p2
  422. }
  423. if(gn2->unvisit) gn2->unvisit --;
  424. if(gn2->unvisit == 0){
  425. push_u8v(idxs, offset_genodev(g->nodes, gn2));
  426. }
  427. geg_end_iter_edges();
  428. }
  429. }
  430. free_u8v(idxs);
  431. // calculate edge prob
  432. curator = -FLT_MAX;
  433. for(i=1;i<g->edges->size;i++){
  434. ge = ref_geedgev(g->edges, i);
  435. if(ge->closed) continue;
  436. be = ref_bdpedgev(bedges, i);
  437. be->prob = cigar_probs[be->cigar] + ref_bdpnodev(bnodes, ge->node1)->probs[ge->dir1] + ref_bdpnodev(bnodes, ge->node2)->probs[!ge->dir2];
  438. if(be->prob > curator) curator = be->prob;
  439. }
  440. for(i=1;i<g->edges->size;i++){
  441. ge = ref_geedgev(g->edges, i);
  442. if(ge->closed) continue;
  443. be = ref_bdpedgev(bedges, i);
  444. be->prob = expl(be->prob - curator);
  445. }
  446. //fprint_dot_bdpgraph(g, bnodes, bedges, "geg.dot", NULL);
  447. return nbegs[0];
  448. }
  449. static inline u4i branched_dynamic_programming_alignment(DAGCNS *g, u1i *query, int ql, GEGraph *geg, bdpnodev *bnodes, bdpedgev *bedges, u1v *mem_buffer){
  450. u4i nbeg;
  451. int *rh, *re;
  452. u1i *z, *zi, d, *target;
  453. int tl, n_col;
  454. int i, j, jc, jb, je, h1, h, m, e, f, t;
  455. int mi, mj, max;
  456. int W, M, X, I, D, E;
  457. tl = g->cns->size;
  458. target = g->cns->buffer;
  459. if(ql <= 0 || tl <= 0){ return 0; }
  460. n_col = tl;
  461. encap_u1v(mem_buffer, kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x(((long long)ql) * n_col));
  462. rh = (int*)(mem_buffer->buffer + mem_buffer->size);
  463. re = (int*)(mem_buffer->buffer + mem_buffer->size + kswx_roundup8x((tl + 2) * sizeof(int)));
  464. z = (mem_buffer->buffer + mem_buffer->size + kswx_roundup8x((tl + 2) * sizeof(int)) + kswx_roundup8x((tl + 2) * sizeof(int)));
  465. W = g->W;
  466. M = g->M;
  467. X = g->X;
  468. I = g->I;
  469. D = g->D;
  470. E = g->E;
  471. // banded DP, global alignment
  472. rh[0] = 0;
  473. re[1] = 0 + D + E;
  474. for(j=2;j<=tl&&j<=W;j++) rh[j] = rh[j-1] + E;
  475. for(;j<tl;j++) rh[j] = MIN_SCORE;
  476. for(j=0;j<=tl;j++) re[j] = MIN_SCORE;
  477. mi = mj = 0;
  478. max = MIN_SCORE;
  479. for(i=0;i<ql;i++){
  480. f = MIN_SCORE;
  481. jc = (i * tl / ql); // in case of biased gaps
  482. jb = jc > W? jc - W : 0;
  483. je = jc + W + 1 < tl? jc + W + 1 : tl;
  484. h1 = jb == 0? (I + E * (i + 1)) : MIN_SCORE;
  485. zi = &z[i * n_col];
  486. for(j=jb;j<je;j++){
  487. m = rh[j] + ((query[i] == target[j])? M : X);
  488. rh[j] = h1;
  489. e = re[j];
  490. d = m >= e? 0 : 1;
  491. h = m >= e? m : e;
  492. d = h >= f? d : 2;
  493. h = h >= f? h : f;
  494. h1 = h;
  495. t = m + I + E;
  496. e = e + E;
  497. d |= e > t? 1<<2 : 0;
  498. e = e > t? e : t;
  499. re[j] = e;
  500. t = m + D + E;
  501. f = f + E;
  502. d |= f > t? 2<<4 : 0;
  503. f = f > t? f : t;
  504. zi[j] = d;
  505. }
  506. rh[j] = h1; re[j] = MIN_SCORE;
  507. if(i + 1 == ql){
  508. for(j=jb;j<je;j++){
  509. if(rh[j + 1] > max){
  510. max = rh[j + 1];
  511. mi = i; mj = j;
  512. }
  513. }
  514. } else if(je == tl){
  515. if(h1 > max){
  516. max = h1;
  517. mi = i; mj = tl - 1;
  518. }
  519. }
  520. }
  521. if(max == MIN_SCORE) return 0;
  522. nbeg = dp_matrix2alignment_graph(g, query, z, mi, mj, geg, bnodes, bedges);
  523. return nbeg;
  524. }
  525. static inline void bdpgraph2dagcns(DAGCNS *dg, GEGraph *gg, bdpnodev *bnodes, bdpedgev *bedges, u4i nbeg, bdplinkv *stack){
  526. dagnode_t *dn, *dn2;
  527. dagedge_t *de;
  528. ge_node_t *gn, *gn2;
  529. ge_edge_ref_t *gf;
  530. ge_edge_t *ge;
  531. bdp_node_t *bn, *bn2;
  532. bdp_edge_t *be;
  533. bdp_link_t T;
  534. u4i i, j, beg, end;
  535. int open;
  536. for(i=0;i<gg->nodes->size;i++) gg->nodes->buffer[i].bt_visit = 0;
  537. clear_bdplinkv(stack);
  538. push_bdplinkv(stack, (bdp_link_t){nbeg, 0xFFFFFFFFU});
  539. beg = bnodes->buffer[nbeg].y;
  540. end = beg;
  541. open = 0;
  542. while(stack->size){
  543. T = stack->buffer[--stack->size];
  544. gn = ref_genodev(gg->nodes, T.gnidx);
  545. bn = ref_bdpnodev(bnodes, T.gnidx);
  546. if(bn->y > (int)end) end = bn->y;
  547. dn = (T.dnidx == 0xFFFFFFFFU)? NULL : ref_dagnodev(dg->nodes, T.dnidx);
  548. geg_beg_iter_edges(gg, gn, 0, gf, ge);
  549. if(ge->closed) continue;
  550. be = ref_bdpedgev(bedges, offset_geedgev(gg->edges, ge));
  551. gn2 = ref_genodev(gg->nodes, gf->flg? ge->node1 : ge->node2);
  552. bn2 = ref_bdpnodev(bnodes, gf->flg? ge->node1 : ge->node2);
  553. if(gn2->bt_visit == 0){
  554. gn2->bt_visit = 1;
  555. open ++;
  556. gn2->unvisit = gn2->edges[1].cnt;
  557. }
  558. if(gn2->unvisit) gn2->unvisit --;
  559. if(gn2->unvisit == 0){
  560. open --;
  561. }
  562. if(open < 0){
  563. fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  564. }
  565. if(bn2->dnidx == MAX_VALUE_U4){
  566. if(be->cigar == 2){
  567. dn2 = dn;
  568. } else if(dn && (open || be->cigar == 1)){
  569. dn2 = ref_dagnodev(dg->nodes, prepare_node_dagcns(dg, bn2->y, be->base));
  570. dn = dn? ref_dagnodev(dg->nodes, T.dnidx) : NULL;
  571. } else if((dn || open == 0) && be->cigar == 0){ // cigar == 0 && open == 0
  572. dn2 = ref_dagnodev(dg->nodes, bn2->y);
  573. } else dn2 = NULL;
  574. bn2->dnidx = dn2? offset_dagnodev(dg->nodes, dn2) : MAX_VALUE_U4;
  575. } else {
  576. dn2 = ref_dagnodev(dg->nodes, bn2->dnidx);
  577. }
  578. if(dn && dn2 && dn != dn2){
  579. de = prepare_edge_dagcns(dg, offset_dagnodev(dg->nodes, dn), offset_dagnodev(dg->nodes, dn2), 0);
  580. de->score += be->prob;
  581. }
  582. if(gn2->unvisit == 0){
  583. push_bdplinkv(stack, (bdp_link_t){offset_genodev(gg->nodes, gn2), dn2? offset_dagnodev(dg->nodes, dn2) : 0xFFFFFFFFU});
  584. }
  585. geg_end_iter_edges();
  586. }
  587. for(j=beg;j<end;j++) dg->deps->buffer[j] ++;
  588. return;
  589. }
  590. static inline void merge_nodes_core_dagcns(DAGCNS *g, uint32_t nid, u32list *stack, u32list *cache[4], int dir){
  591. dagnode_t *n0, *n2, *n;
  592. dagedge_t *e, *e2, *e1;
  593. uint32_t base, eid, nid1, i, ret;
  594. clear_u32list(stack);
  595. push_u32list(stack, nid);
  596. ret = 0;
  597. while(pop_u32list(stack, &nid)){
  598. n0 = ref_dagnodev(g->nodes, nid);
  599. if((eid = n0->edges[dir]) == 0xFFFFFFFFU) continue;
  600. clear_u32list(cache[0]);
  601. clear_u32list(cache[1]);
  602. clear_u32list(cache[2]);
  603. clear_u32list(cache[3]);
  604. while(1){
  605. e = ref_dagedgev(g->edges, eid);
  606. n = ref_dagnodev(g->nodes, e->nodes[dir]);
  607. e2 = ref_dagedgev(g->edges, n->edges[!dir]);
  608. if(e2->links[!dir] == 0xFFFFFFFFU){ // check whether there is only one edge from n -(!dir)-> n0
  609. push_u32list(cache[n->base], eid);
  610. }
  611. if((eid = e->links[dir]) == 0xFFFFFFFFU) break;
  612. }
  613. for(base=0;base<4;base++){
  614. if(cache[base]->size < 2) continue;
  615. for(i=0;i<cache[base]->size;i++) ref_dagedgev(g->edges, cache[base]->buffer[i])->visit = 1;
  616. e1 = ref_dagedgev(g->edges, cache[base]->buffer[0]);
  617. n = ref_dagnodev(g->nodes, e1->nodes[dir]);
  618. eid = n->edges[dir];
  619. nid1 = e1->nodes[dir];
  620. while(eid != 0xFFFFFFFFU){
  621. e = ref_dagedgev(g->edges, eid);
  622. e->visit = 1;
  623. eid = e->links[dir];
  624. }
  625. for(i=1;i<cache[base]->size;i++){
  626. e2 = ref_dagedgev(g->edges, cache[base]->buffer[i]);
  627. n2 = ref_dagnodev(g->nodes, e2->nodes[dir]);
  628. e1->cov += e2->cov;
  629. e1->score += e2->score;
  630. remove_edge_dagcns(g, cache[base]->buffer[i]);
  631. eid = n2->edges[dir];
  632. while(eid != 0xFFFFFFFFU){
  633. e2 = ref_dagedgev(g->edges, eid);
  634. e = prepare_edge_dagcns(g, nid1, e2->nodes[dir], dir);
  635. {
  636. e1 = ref_dagedgev(g->edges, cache[base]->buffer[0]); // memory referred by e1 may be freed in prepare_edge_dagcns
  637. }
  638. e->cov = e->cov - 1 + e2->cov;
  639. e->score = e->score + e2->score;
  640. e->visit = 1;
  641. eid = e2->links[dir];
  642. }
  643. eid = n2->edges[dir];
  644. while(eid != 0xFFFFFFFFU){
  645. e2 = ref_dagedgev(g->edges, eid);
  646. remove_edge_dagcns(g, eid); // e2->links retain the same values after removing
  647. eid = e2->links[dir];
  648. }
  649. }
  650. //n = ref_dagnodev(g->nodes, e1->nodes[dir]);
  651. //eid = n->edges[dir];
  652. //if(eid != 0xFFFFFFFFU && g->edges->buffer[eid].links[dir] == 0xFFFFFFFFU) continue; // we had merged a bubble branch1:A->C->T, branch2:A->C->T
  653. push_u32list(stack, nid1);
  654. }
  655. }
  656. }
  657. static inline int has_non_visited_edge_dagcns(DAGCNS *g, uint32_t nid, int dir){
  658. dagnode_t *n;
  659. dagedge_t *e;
  660. uint32_t eid;
  661. n = ref_dagnodev(g->nodes, nid);
  662. eid = n->edges[dir];
  663. while(eid != 0xFFFFFFFFU){
  664. e = ref_dagedgev(g->edges, eid);
  665. if(e->visit == 0) return 1;
  666. eid = e->links[dir];
  667. }
  668. return 0;
  669. }
  670. static inline void print_local_dot_dagcns(DAGCNS *g, uint32_t nid, int distance, FILE *out){
  671. u32list *stack;
  672. u32hash *hash;
  673. dagnode_t *n, *n1, *n2;
  674. dagedge_t *e;
  675. uint32_t id1, id2, eid, *u;
  676. int lo, hi, dir, exists;
  677. n = ref_dagnodev(g->nodes, nid);
  678. stack = init_u32list(32);
  679. hash = init_u32hash(1023);
  680. lo = n->pos - distance;
  681. hi = n->pos + distance;
  682. push_u32list(stack, nid);
  683. put_u32hash(hash, nid);
  684. fprintf(out, "digraph {\nrankdir=LR\n");
  685. while(stack->size){
  686. id1 = stack->buffer[--stack->size];
  687. n1 = ref_dagnodev(g->nodes, id1);
  688. for(dir=0;dir<1;dir++){
  689. eid = n1->edges[dir];
  690. while(eid != 0xFFFFFFFFU){
  691. e = ref_dagedgev(g->edges, eid);
  692. id2 = e->nodes[dir];
  693. n2 = ref_dagnodev(g->nodes, id2);
  694. fprintf(out, "N%d_%d_%c -> N%d_%d_%c [label=\"%d:%0.6f\" color=%s]\n", id1, n1->pos, "ACGT"[n1->base], id2, n2->pos, "ACGT"[n2->base], e->cov, e->score, e->visit? "blue" : "black");
  695. if(n2->pos >= lo && n2->pos <= hi){
  696. u = prepare_u32hash(hash, id2, &exists);
  697. if(exists){
  698. } else {
  699. *u = id2;
  700. push_u32list(stack, id2);
  701. }
  702. }
  703. eid = e->links[dir];
  704. }
  705. }
  706. }
  707. fprintf(out, "}\n");
  708. fflush(out);
  709. free_u32list(stack);
  710. free_u32hash(hash);
  711. }
  712. static inline void fprint_local_dot_dagcns(DAGCNS *g, u4i nid, int level, char *prefix, char *suffix){
  713. FILE *out;
  714. out = open_file_for_write(prefix, suffix, 1);
  715. print_local_dot_dagcns(g, nid, level, out);
  716. fclose(out);
  717. }
  718. static inline void merge_nodes_dagcns(DAGCNS *g){
  719. dagnode_t *n;
  720. dagedge_t *e;
  721. u32list *stack, *cache[4];
  722. u32fifo *queue;
  723. uint32_t i, nid, eid;
  724. stack = init_u32list(1024);
  725. cache[0] = init_u32list(4);
  726. cache[1] = init_u32list(4);
  727. cache[2] = init_u32list(4);
  728. cache[3] = init_u32list(4);
  729. for(i=0;i<g->edges->size;i++) g->edges->buffer[i].visit = 0;
  730. for(i=0;i<g->nodes->size;i++) g->nodes->buffer[i].visit = 0;
  731. queue = init_u32fifo();
  732. for(i=0;i<g->nodes->size;i++){
  733. n = ref_dagnodev(g->nodes, i);
  734. if(n->edges[1] != 0xFFFFFFFFU) continue;
  735. push_u32fifo(queue, i);
  736. }
  737. //dagcns_debug = 2;
  738. while(pop_u32fifo(queue, &nid)){
  739. if(dagcns_debug > 1) fprintf(stdout, "\npop(%u) %u\n", (u4i)queue->size, nid);
  740. n = ref_dagnodev(g->nodes, nid);
  741. if(n->visit) continue;
  742. n->visit = 1;
  743. merge_nodes_core_dagcns(g, nid, stack, cache, 1);
  744. merge_nodes_core_dagcns(g, nid, stack, cache, 0);
  745. n = ref_dagnodev(g->nodes, nid);
  746. eid = n->edges[0];
  747. while(eid != 0xFFFFFFFFU){
  748. e = ref_dagedgev(g->edges, eid);
  749. e->visit = 1;
  750. eid = e->links[0];
  751. }
  752. eid = n->edges[0];
  753. while(eid != 0xFFFFFFFFU){
  754. e = ref_dagedgev(g->edges, eid);
  755. if(!has_non_visited_edge_dagcns(g, e->nodes[0], 1)){
  756. if(dagcns_debug > 1) fprintf(stdout, "push %u\n", e->nodes[0]);
  757. push_u32fifo(queue, e->nodes[0]);
  758. }
  759. eid = e->links[0];
  760. }
  761. }
  762. free_u32fifo(queue);
  763. free_u32list(stack);
  764. free_u32list(cache[0]);
  765. free_u32list(cache[1]);
  766. free_u32list(cache[2]);
  767. free_u32list(cache[3]);
  768. }
  769. static inline void print_seq_dagcns(DAGCNS *g, FILE *out){
  770. char buffer[100];
  771. uint32_t i, j;
  772. for(i=j=0;i<g->cns->size;i++){
  773. buffer[j++] = bit_base_table[g->cns->buffer[i]];
  774. if(j == 99){
  775. buffer[j] = '\0';
  776. fprintf(out, "%s", buffer);
  777. j = 0;
  778. }
  779. }
  780. buffer[j] = '\0';
  781. fprintf(out, "%s", buffer);
  782. }
  783. static inline void gen_consensus_dagcns(DAGCNS *g, u32list *map){
  784. dagnode_t *n1, *n2;
  785. dagedge_t *e;
  786. u32fifo *queue;
  787. uint32_t i, lst, nid, eid, best_e;
  788. f8i best_s, score;
  789. queue = init_u32fifo();
  790. if(queue == NULL){ // un-reachable, but is used to call fprint_local_dot_dagcns in gdb Debug
  791. fprint_local_dot_dagcns(g, 0, 10, "test.dot", NULL);
  792. }
  793. for(i=0;i<g->nodes->size;i++){
  794. n1 = ref_dagnodev(g->nodes, i);
  795. if(n1->edges[0] == 0xFFFFFFFFU && n1->edges[1] != 0xFFFFFFFFU){
  796. push_u32fifo(queue, i);
  797. n1->fw_edge = NODE_MAX_FW_EDGE;
  798. n1->aux = 0;
  799. }
  800. }
  801. for(i=0;i<g->edges->size;i++) g->edges->buffer[i].visit = 0;
  802. while(pop_u32fifo(queue, &nid)){
  803. best_s = - FLT_MAX;
  804. best_e = 0xFFFFFFFFU;
  805. n1 = ref_dagnodev(g->nodes, nid);
  806. eid = n1->edges[0];
  807. while(eid != 0xFFFFFFFFU){
  808. e = ref_dagedgev(g->edges, eid);
  809. n2 = ref_dagnodev(g->nodes, e->nodes[0]);
  810. if(e->nodes[0] < g->backbone_size){
  811. //score = n2->aux + e->cov - g->ref_penalty * g->deps->buffer[n1->pos];
  812. score = n2->aux + e->score - g->ref_penalty * g->deps->buffer[n1->pos];
  813. } else {
  814. //score = n2->aux + e->cov - g->alt_penalty * g->deps->buffer[n1->pos];
  815. score = n2->aux + e->score - g->alt_penalty * g->deps->buffer[n1->pos];
  816. }
  817. if(score > best_s){
  818. best_s = score;
  819. best_e = eid;
  820. }
  821. eid = e->links[0];
  822. }
  823. if(best_s > - FLT_MAX) n1->aux = best_s;
  824. n1->fw_edge = best_e;
  825. eid = n1->edges[1];
  826. while(eid != 0xFFFFFFFFU){
  827. e = ref_dagedgev(g->edges, eid);
  828. e->visit = 1;
  829. if(!has_non_visited_edge_dagcns(g, e->nodes[1], 0)){
  830. push_u32fifo(queue, e->nodes[1]);
  831. }
  832. eid = e->links[1];
  833. }
  834. }
  835. free_u32fifo(queue);
  836. clear_u8list(g->cns);
  837. clear_u32list(g->deps);
  838. if(map) clear_u32list(map);
  839. g->cns_head = 0;
  840. if(g->nodes->size == 0) return;
  841. n1 = ref_dagnodev(g->nodes, g->cns_head);
  842. g->cns_score = n1->aux;
  843. n1->cns = 1;
  844. lst = 0;
  845. if(map && g->cns_head < g->backbone_size){
  846. while(lst < g->cns_head){ push_u32list(map, g->cns->size); lst ++; }
  847. }
  848. push_u8list(g->cns, n1->base);
  849. push_u32list(g->deps, 0);
  850. while(n1->fw_edge != NODE_MAX_FW_EDGE){
  851. e = ref_dagedgev(g->edges, n1->fw_edge);
  852. e->cns = 1;
  853. if(map && e->nodes[0] < g->backbone_size){
  854. while(lst < e->nodes[0]){ push_u32list(map, g->cns->size); lst ++; }
  855. }
  856. n1 = ref_dagnodev(g->nodes, e->nodes[0]);
  857. n1->cns = 1;
  858. push_u8list(g->cns, n1->base);
  859. push_u32list(g->deps, 0);
  860. }
  861. if(map) while(lst <= g->backbone_size){ push_u32list(map, g->cns->size); lst ++; }
  862. }
  863. #endif