dbgcns.h 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380
  1. /*
  2. *
  3. * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef __DBGCNS_CNS_RJ_H
  20. #define __DBGCNS_CNS_RJ_H
  21. #include "dna.h"
  22. #include "list.h"
  23. #include "hashset.h"
  24. #include "thread.h"
  25. #include "ksw.h"
  26. #include "chararray.h"
  27. /*
  28. #ifndef DBGCNS_DEBUG
  29. #define DBGCNS_DEBUG 0
  30. #endif
  31. */
  32. static int DBGCNS_DEBUG = 0;
  33. #define DBGCNS_KMER_MAX_SIZE 21
  34. #define DBGCNS_KMER_MAX_NODE_COV 0x3FF
  35. #define DBGCNS_KMER_MAX_EDGE_COV 0xFF
  36. #define DBGCNS_MAX_UID 0xFFF
  37. typedef struct {
  38. uint64_t mer:42, cov:10, visit:12;
  39. uint8_t edges[2][4];
  40. } dbgcns_kmer_t;
  41. define_list(dbgcnskmerv, dbgcns_kmer_t);
  42. #define UD(E) ((dbgcnskmerv*)set->userdata)->buffer[E].mer
  43. #define kmer_hashcode(E) u64hashcode(UD(E))
  44. #define kmer_hashequals(E1, E2) UD(E1) == UD(E2)
  45. define_hashset(dbgcnskmerhash, uint32_t, kmer_hashcode, kmer_hashequals);
  46. #undef UD
  47. typedef struct {
  48. u4i ksize;
  49. u8i kmask;
  50. int hz;
  51. dbgcnskmerv *kmers;
  52. dbgcnskmerhash *khash;
  53. u1v *zseq;
  54. } DBG;
  55. #define BT_SCORE_MIN -0x0FFFFFFF
  56. typedef struct {
  57. u8i mer:42, off:21, closed:1;
  58. u2i n_in, n_visit;
  59. u4i edges, bt_node, bt_edge;
  60. int bt_score;
  61. u8i ptr; // refer to dbg->kmers
  62. } fbg_kmer_t;
  63. #define fbgkmer_hashcode(E) u64hashcode((E).mer)
  64. #define fbgkmer_equals(E1, E2) ((E1).mer == (E2).mer)
  65. define_hashset(fbgkmerh, fbg_kmer_t, fbgkmer_hashcode, fbgkmer_equals);
  66. typedef struct {
  67. u4i node, cov:28, most:1, key:2, select:1;
  68. u4i dist;
  69. u4i link;
  70. u4i next;
  71. } fbg_edge_t;
  72. define_list(fbgedgev, fbg_edge_t);
  73. typedef struct {
  74. u4i ridx, roff:15, rlen:15, key:1, select:1;
  75. u4i next;
  76. } fbg_link_t;
  77. define_list(fbglinkv, fbg_link_t);
  78. typedef struct {
  79. u8i ridx:16, kidx:30, koff:17, closed:1;
  80. } rd_kmer_t;
  81. define_list(rdkmerv, rd_kmer_t);
  82. typedef struct {
  83. u4i lidx, len, gidx;
  84. } link_grp_t;
  85. define_list(linkgrpv, link_grp_t);
  86. typedef struct {
  87. fbgkmerh *kmers;
  88. fbgedgev *edges;
  89. fbglinkv *links;
  90. linkgrpv *grps;
  91. rdkmerv *mats;
  92. u1v *starseq;
  93. } FBG;
  94. #define DBGCNS_DP_SCORE_MIN -0x7FFFFFFF
  95. #define DBGCNS_PATH_M 0
  96. #define DBGCNS_PATH_X 1
  97. #define DBGCNS_PATH_I 2
  98. #define DBGCNS_PATH_D 3
  99. #define DBGCNS_CNS_NON 0
  100. #define DBGCNS_CNS_TIP 1
  101. #define DBGCNS_CNS_CUT 2
  102. #define DBGCNS_CNS_EXT 3
  103. #define DBGCNS_CNS_HIT 4
  104. typedef struct {
  105. union {
  106. struct { uint32_t kidx:30, path:2; uint32_t qpos; };
  107. uint64_t identifier;
  108. };
  109. int score;
  110. uint32_t bt_idx;
  111. } dbgcns_dp_t;
  112. define_list(dbgcnsdpv, dbgcns_dp_t);
  113. #define DD(E) ((dbgcnsdpv*)set->userdata)->buffer[E]
  114. #define dp_hashcode(E) u64hashcode(DD(E).identifier)
  115. #define dp_hashequals(E1, E2) DD(E1).identifier == DD(E2).identifier
  116. define_hashset(dbgcnsdphash, uint32_t, dp_hashcode, dp_hashequals);
  117. #undef DD
  118. typedef struct {uint64_t off:40, len:23, solid:1;} blk_t;
  119. define_list(blkv, blk_t);
  120. typedef struct {
  121. DBG *g;
  122. FBG *fbg;
  123. int C, M, X, I, D, E, H, L;
  124. int Z, W;
  125. u1v *qseqs;
  126. blkv *qblks;
  127. uint8_t *qry;
  128. uint32_t qlen;
  129. u4i qidx;
  130. int avg_cov;
  131. dbgcnsdpv *dps;
  132. u4v *heap;
  133. dbgcnsdphash *hash;
  134. b4v *qmaxs;
  135. uint32_t qtop;
  136. int max_score;
  137. uint32_t best_idx;
  138. String *seq;
  139. u1v *cns;
  140. u1v *cigars;
  141. int alns[4];
  142. } CNS;
  143. static inline DBG* init_dbg(uint32_t ksize){
  144. DBG *g;
  145. if(ksize > DBGCNS_KMER_MAX_SIZE){
  146. fprintf(stderr, " -- ksize MUST be no greater than %d, but is %d in %s -- %s:%d --\n", DBGCNS_KMER_MAX_SIZE, ksize, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  147. exit(1);
  148. }
  149. g = malloc(sizeof(DBG));
  150. g->hz = 0;
  151. g->ksize = ksize;
  152. g->kmask = 0xFFFFFFFFFFFFFFFFLLU >> ((32 - ksize) << 1);
  153. g->kmers = init_dbgcnskmerv(32);
  154. next_ref_dbgcnskmerv(g->kmers);
  155. memset(g->kmers->buffer, 0, sizeof(dbgcns_kmer_t));
  156. g->khash = init_dbgcnskmerhash(1023);
  157. set_userdata_dbgcnskmerhash(g->khash, g->kmers);
  158. g->zseq = init_u1v(64);
  159. return g;
  160. }
  161. static inline void free_dbg(DBG *g){
  162. free_dbgcnskmerv(g->kmers);
  163. free_dbgcnskmerhash(g->khash);
  164. free_u1v(g->zseq);
  165. free(g);
  166. }
  167. static inline void clear_dbg(DBG *g){
  168. clear_dbgcnskmerv(g->kmers);
  169. next_ref_dbgcnskmerv(g->kmers);
  170. memset(g->kmers->buffer, 0, sizeof(dbgcns_kmer_t));
  171. clear_dbgcnskmerhash(g->khash);
  172. }
  173. static inline int add_kmer_dbg(DBG *g, u8i kmer, u4i ridx, uint8_t fbase, uint8_t rbase, int inc){
  174. dbgcns_kmer_t *k;
  175. u8i krev;
  176. uint32_t *u;
  177. int dir, exists;
  178. if(0){
  179. krev = dna_rev_seq(kmer, g->ksize);
  180. if(kmer < krev){ dir = 0; }
  181. else { kmer = krev; dir = 1; }
  182. } else {
  183. dir = 0;
  184. }
  185. g->kmers->buffer[0].mer = kmer;
  186. u = prepare_dbgcnskmerhash(g->khash, 0, &exists);
  187. if(exists){
  188. k = ref_dbgcnskmerv(g->kmers, *u);
  189. } else {
  190. *u = g->kmers->size;
  191. k = next_ref_dbgcnskmerv(g->kmers);
  192. memset(k, 0, sizeof(dbgcns_kmer_t));
  193. k->mer = kmer;
  194. k->cov = 0;
  195. k->visit = 0;
  196. }
  197. if(k->visit != ridx + 1 && k->cov < DBGCNS_KMER_MAX_NODE_COV && (inc || k->cov == 0)){
  198. k->cov ++;
  199. }
  200. k->visit = ridx + 1;
  201. if(dir){
  202. if(fbase < 4){
  203. fbase = (~fbase) & 0x03;
  204. if(k->edges[1][fbase] < DBGCNS_KMER_MAX_EDGE_COV && (inc || k->edges[1][fbase] == 0)) k->edges[1][fbase] ++;
  205. }
  206. if(rbase < 4){
  207. if(k->edges[0][rbase] < DBGCNS_KMER_MAX_EDGE_COV && (inc || k->edges[0][rbase] == 0)) k->edges[0][rbase] ++;
  208. }
  209. } else {
  210. if(fbase < 4){
  211. if(k->edges[0][fbase] < DBGCNS_KMER_MAX_EDGE_COV && (inc || k->edges[0][fbase] == 0)) k->edges[0][fbase] ++;
  212. }
  213. if(rbase < 4){
  214. rbase = (~rbase) & 0x03;
  215. if(k->edges[1][rbase] < DBGCNS_KMER_MAX_EDGE_COV && (inc || k->edges[1][rbase] == 0)) k->edges[1][rbase] ++;
  216. }
  217. }
  218. return exists;
  219. }
  220. static inline void homopolymer_compress_dbg(DBG *g, u1i *seq, u4i len){
  221. u4i i;
  222. u1i b;
  223. clear_u1v(g->zseq);
  224. b = 4;
  225. for(i=0;i<len;i++){
  226. if(seq[i] == b) continue;
  227. b = seq[i];
  228. push_u1v(g->zseq, b);
  229. }
  230. }
  231. static inline void add_seq_dbg(DBG *g, u4i ridx, uint8_t *seq, uint32_t len, int cov_inc){
  232. u8i kmer;
  233. uint32_t i;
  234. uint8_t b, f, r;
  235. if(g->hz){
  236. homopolymer_compress_dbg(g, seq, len);
  237. seq = g->zseq->buffer;
  238. len = g->zseq->size;
  239. }
  240. kmer = 0;
  241. for(i=0;i<len;){
  242. b = seq[i];
  243. kmer = ((kmer << 2) | b) & g->kmask;
  244. i ++;
  245. if(i < g->ksize) continue;
  246. f = (i < len)? seq[i] : 4;
  247. r = (i > g->ksize)? seq[i - g->ksize] : 4;
  248. add_kmer_dbg(g, kmer, ridx, f, r, cov_inc);
  249. }
  250. }
  251. static inline int kmer_cov_seq_dbg(DBG *g, u1i *seq, u4i len, u4i uid){
  252. dbgcns_kmer_t *k;
  253. u8i kmer;
  254. uint32_t i, *u;
  255. uint8_t b;
  256. int cov;
  257. uid = uid % DBGCNS_MAX_UID;
  258. if(g->hz){
  259. homopolymer_compress_dbg(g, seq, len);
  260. seq = g->zseq->buffer;
  261. len = g->zseq->size;
  262. }
  263. cov = 0;
  264. kmer = 0;
  265. for(i=0;i<len;){
  266. b = seq[i];
  267. kmer = ((kmer << 2) | b) & g->kmask;
  268. i ++;
  269. if(i < g->ksize) continue;
  270. g->kmers->buffer[0].mer = kmer;
  271. u = get_dbgcnskmerhash(g->khash, 0);
  272. if(u){
  273. k = ref_dbgcnskmerv(g->kmers, *u);
  274. if(k->visit == uid){ cov --; continue; }
  275. k->visit = uid;
  276. cov += k->cov > 1? k->cov + 1 : - 1;
  277. }
  278. }
  279. return cov;
  280. }
  281. static inline void print_kmers_dbg(DBG *g, FILE *out){
  282. dbgcns_kmer_t *k;
  283. uint64_t i;
  284. char seq[DBGCNS_KMER_MAX_SIZE + 1];
  285. for(i=1;i<g->kmers->size;i++){
  286. k = ref_dbgcnskmerv(g->kmers, i);
  287. kmer2seq(seq, k->mer, g->ksize);
  288. fprintf(out, "%s\t%d\t%d,%d,%d,%d\t%d,%d,%d,%d\n", seq, k->cov,
  289. k->edges[0][0], k->edges[0][1], k->edges[0][2], k->edges[0][3],
  290. k->edges[1][0], k->edges[1][1], k->edges[1][2], k->edges[1][3]);
  291. }
  292. }
  293. static inline CNS* init_cns(uint32_t ksize, int Z, int W, int M, int X, int I, int D, int E, int H, int L){
  294. CNS *cns;
  295. cns = malloc(sizeof(CNS));
  296. cns->g = init_dbg(ksize);
  297. cns->fbg = malloc(sizeof(FBG));
  298. cns->fbg->kmers = init_fbgkmerh(1023);
  299. cns->fbg->edges = init_fbgedgev(32);
  300. memset(next_ref_fbgedgev(cns->fbg->edges), 0, sizeof(fbg_edge_t));
  301. cns->fbg->links = init_fbglinkv(32);
  302. memset(next_ref_fbglinkv(cns->fbg->links), 0, sizeof(fbg_link_t));
  303. cns->fbg->grps = init_linkgrpv(32);
  304. cns->fbg->mats = init_rdkmerv(32);
  305. cns->fbg->starseq = init_u1v(32);
  306. cns->qseqs = init_u1v(32);
  307. cns->qblks = init_blkv(32);
  308. cns->Z = Z;
  309. cns->W = W;
  310. cns->C = 1;
  311. cns->M = M;
  312. cns->X = X;
  313. cns->I = I;
  314. cns->D = D;
  315. cns->E = E;
  316. cns->H = H;
  317. cns->L = L;
  318. cns->qlen = 0;
  319. cns->qidx = 0;
  320. cns->avg_cov = 0;
  321. cns->dps = init_dbgcnsdpv(32);
  322. next_ref_dbgcnsdpv(cns->dps);
  323. memset(cns->dps->buffer, 0, sizeof(dbgcns_dp_t)); // no need, it is always zero-filled
  324. cns->heap = init_u4v(32);
  325. cns->hash = init_dbgcnsdphash(1023);
  326. set_userdata_dbgcnsdphash(cns->hash, cns->dps);
  327. cns->qmaxs = init_b4v(32);
  328. cns->qtop = 0;
  329. cns->max_score = DBGCNS_DP_SCORE_MIN;
  330. cns->best_idx = 0;
  331. cns->seq = init_string(32);
  332. cns->cns = init_u1v(32);
  333. cns->cigars = init_u1v(32);
  334. return cns;
  335. }
  336. static inline void free_cns(CNS *cns){
  337. free_dbg(cns->g);
  338. free_fbgkmerh(cns->fbg->kmers);
  339. free_fbgedgev(cns->fbg->edges);
  340. free_fbglinkv(cns->fbg->links);
  341. free_linkgrpv(cns->fbg->grps);
  342. free_rdkmerv(cns->fbg->mats);
  343. free_u1v(cns->fbg->starseq);
  344. free(cns->fbg);
  345. free_u1v(cns->qseqs);
  346. free_blkv(cns->qblks);
  347. free_dbgcnsdpv(cns->dps);
  348. free_u4v(cns->heap);
  349. free_dbgcnsdphash(cns->hash);
  350. free_b4v(cns->qmaxs);
  351. free_string(cns->seq);
  352. free_u1v(cns->cns);
  353. free_u1v(cns->cigars);
  354. free(cns);
  355. }
  356. static inline void reset_cns(CNS *cns){
  357. clear_dbg(cns->g);
  358. clear_u1v(cns->qseqs);
  359. clear_blkv(cns->qblks);
  360. clear_fbgkmerh(cns->fbg->kmers);
  361. clear_fbgedgev(cns->fbg->edges);
  362. memset(next_ref_fbgedgev(cns->fbg->edges), 0, sizeof(fbg_edge_t));
  363. clear_fbglinkv(cns->fbg->links);
  364. memset(next_ref_fbglinkv(cns->fbg->links), 0, sizeof(fbg_link_t));
  365. cns->qry = NULL;
  366. cns->qlen = 0;
  367. }
  368. static inline void add_seq_cns(CNS *cns, char *seq, int len, int solid){
  369. blk_t *b;
  370. int i;
  371. b = next_ref_blkv(cns->qblks);
  372. b->off = cns->qseqs->size;
  373. b->len = len;
  374. b->solid = solid;
  375. for(i=0;i<len;i++) push_u1v(cns->qseqs, base_bit_table[(int)seq[i]]);
  376. }
  377. static inline void ready_cns(CNS *cns){
  378. UNUSED(cns);
  379. //blk_t *b;
  380. //u4i i;
  381. //for(i=0;i<cns->qblks->size;i++){
  382. //b = ref_blkv(cns->qblks, i);
  383. //add_seq_dbg(cns->g, cns->qseqs->buffer + b->off, b->len);
  384. //}
  385. }
  386. static inline int dbg_cns_core(CNS *cns){
  387. dbgcns_dp_t *dp, *dp2;
  388. dbgcns_kmer_t *k;
  389. u8i kmer, knew;
  390. uint32_t i, dp_idx, kidx, *u;
  391. int sum, cov, cut;
  392. uint8_t b, q;
  393. int exists, score, nadd, mat_only;
  394. if(cns->heap->size == 0) return DBGCNS_CNS_NON;
  395. dp_idx = cns->heap->buffer[0]; //array_heap_pop(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  396. encap_dbgcnsdpv(cns->dps, 9);
  397. dp = ref_dbgcnsdpv(cns->dps, dp_idx);
  398. mat_only = 0;
  399. if(dp->qpos >= cns->qlen) return DBGCNS_CNS_HIT;
  400. if(dp->qpos + cns->W < cns->qtop){
  401. mat_only = 1;
  402. //array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  403. //return DBGCNS_CNS_CUT;
  404. } else if(dp->qpos > cns->qtop){
  405. cns->qtop = dp->qpos;
  406. for(i=cns->heap->size;i>0;i--){
  407. if(cns->dps->buffer[cns->heap->buffer[i-1]].qpos + cns->W < cns->qtop){
  408. array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, i - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  409. }
  410. }
  411. }
  412. if(dp->score < cns->qmaxs->buffer[dp->qpos]){
  413. mat_only = 1;
  414. //array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  415. //return DBGCNS_CNS_CUT;
  416. } else if(dp->score + cns->Z * cns->X > cns->qmaxs->buffer[dp->qpos]){
  417. cns->qmaxs->buffer[dp->qpos] = dp->score + cns->Z * cns->X;
  418. }
  419. u = prepare_dbgcnsdphash(cns->hash, dp_idx, &exists);
  420. if(exists){
  421. dp2 = ref_dbgcnsdpv(cns->dps, *u);
  422. if(dp->score > dp2->score){
  423. *u = dp_idx;
  424. } else {
  425. array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  426. return DBGCNS_CNS_CUT;
  427. }
  428. } else {
  429. *u = dp_idx;
  430. }
  431. array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  432. k = ref_dbgcnskmerv(cns->g->kmers, dp->kidx);
  433. kmer = k->mer;
  434. q = cns->qry[dp->qpos];
  435. sum = k->edges[0][0] + k->edges[0][1] + k->edges[0][2] + k->edges[0][3];
  436. if(sum == 0) return DBGCNS_CNS_TIP;
  437. cut = num_max((cns->avg_cov + cns->L - 1) / cns->L, 1);
  438. //cut = num_max((cns->avg_cov + 2) / 3, 1);
  439. nadd = 0;
  440. for(b=0;b<4;b++){
  441. if((cov = k->edges[0][b]) == 0) continue;
  442. if(b == q){
  443. score = dp->score + cns->M;
  444. } else if(mat_only){
  445. continue;
  446. } else {
  447. score = dp->score + cns->X;
  448. }
  449. //score += (cov > 1)? (cov >= cut? 1 : 0) : -1;
  450. score += (cov > cut)? (cns->H + (b == q? 1 : 0)) : cov - cut;
  451. //score += (cov > cut)? cns->H : cov - cut;
  452. knew = ((kmer << 2) | b) & cns->g->kmask;
  453. cns->g->kmers->buffer[0].mer = knew;
  454. u = get_dbgcnskmerhash(cns->g->khash, 0);
  455. kidx = *u;
  456. dp2 = next_ref_dbgcnsdpv(cns->dps);
  457. dp2->kidx = kidx;
  458. dp2->path = (b == q)? DBGCNS_PATH_M : DBGCNS_PATH_X;
  459. dp2->qpos = dp->qpos + 1;
  460. dp2->score = score;
  461. dp2->bt_idx = dp_idx;
  462. array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  463. nadd ++;
  464. // deletion
  465. if(dp->path != DBGCNS_PATH_I){
  466. dp2 = next_ref_dbgcnsdpv(cns->dps);
  467. score = dp->score + (cns->E + (dp->path == DBGCNS_PATH_D? 0 : cns->D));
  468. if(dp->path != DBGCNS_PATH_D) score += (cov > cut)? 0 : cov - cut;
  469. else score += (cov > cut)? cns->H : cov - cut;
  470. dp2->kidx = kidx;
  471. dp2->path = DBGCNS_PATH_D;
  472. dp2->qpos = dp->qpos;
  473. dp2->score = score;
  474. dp2->bt_idx = dp_idx;
  475. array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  476. nadd ++;
  477. }
  478. }
  479. // insertion
  480. if(mat_only == 0 && dp->path != DBGCNS_PATH_D){
  481. dp2 = next_ref_dbgcnsdpv(cns->dps);
  482. score = dp->score + (cns->E + (dp->path == DBGCNS_PATH_I? 0 : cns->I));
  483. //if(dp->qpos && cns->qry[dp->qpos] == cns->qry[dp->qpos - 1]) score += 1; // homopolymer merge
  484. score += 1 - cut;
  485. dp2->kidx = dp->kidx;
  486. dp2->path = DBGCNS_PATH_I;
  487. dp2->qpos = dp->qpos + 1;
  488. dp2->score = score;
  489. dp2->bt_idx = dp_idx;
  490. array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  491. nadd ++;
  492. }
  493. return nadd? DBGCNS_CNS_EXT : DBGCNS_CNS_CUT;
  494. }
  495. static inline void ready_core_cns(CNS *cns, int candidate_mode, int reflen){
  496. blk_t *b;
  497. u4i i;
  498. int tot;
  499. if(candidate_mode == 3) i = 1;
  500. else i = 0;
  501. tot = 0;
  502. for(;i<cns->qblks->size;i++){
  503. b = ref_blkv(cns->qblks, i);
  504. add_seq_dbg(cns->g, i, cns->qseqs->buffer + b->off, b->len, 1);
  505. tot += b->len;
  506. }
  507. if(candidate_mode == 3){
  508. b = ref_blkv(cns->qblks, 0);
  509. add_seq_dbg(cns->g, i, cns->qseqs->buffer + b->off, b->len, 0);
  510. tot += b->len;
  511. }
  512. cns->avg_cov = (tot + reflen - 1) / reflen;
  513. }
  514. static inline int run_core_cns(CNS *cns, uint8_t *qry, uint32_t qlen){
  515. dbgcns_dp_t *dp;
  516. dbgcns_kmer_t *k;
  517. uint32_t i, kmer, kidx, dp_idx, *u;
  518. int status;
  519. {
  520. cns->qry = qry;
  521. cns->qlen = qlen;
  522. // reset auxiliaries
  523. clear_dbgcnsdpv(cns->dps);
  524. next_ref_dbgcnsdpv(cns->dps);
  525. memset(cns->dps->buffer, 0, sizeof(dbgcns_dp_t));
  526. clear_u4v(cns->heap);
  527. clear_dbgcnsdphash(cns->hash);
  528. clear_b4v(cns->qmaxs);
  529. for(i=0;i<qlen;i++) push_b4v(cns->qmaxs, DBGCNS_DP_SCORE_MIN);
  530. cns->qtop = 0;
  531. cns->max_score= DBGCNS_DP_SCORE_MIN;
  532. cns->best_idx = 0;
  533. clear_u1v(cns->cns);
  534. clear_string(cns->seq);
  535. clear_u1v(cns->cigars);
  536. cns->alns[0] = cns->alns[1] = cns->alns[2] = cns->alns[3] = 0;
  537. }
  538. // set first kmer
  539. kmer = 0;
  540. for(cns->qtop=0;cns->qtop<cns->g->ksize;cns->qtop++){
  541. kmer = (kmer << 2) | qry[cns->qtop];
  542. }
  543. cns->g->kmers->buffer[0].mer = kmer;
  544. u = get_dbgcnskmerhash(cns->g->khash, 0);
  545. if(u == NULL) return 0;
  546. kidx = *u;
  547. dp = next_ref_dbgcnsdpv(cns->dps);
  548. dp->kidx = kidx;
  549. dp->path = DBGCNS_PATH_M;
  550. dp->qpos = cns->qtop;
  551. dp->score = 0;
  552. dp->bt_idx = 0;
  553. array_heap_push(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, cns->dps->size - 1, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  554. // dbg traversing
  555. while(cns->heap->size){
  556. status = dbg_cns_core(cns);
  557. if(status == DBGCNS_CNS_HIT){
  558. dp_idx = cns->heap->buffer[0];
  559. array_heap_remove(cns->heap->buffer, cns->heap->size, cns->heap->cap, uint32_t, 0, num_cmp(cns->dps->buffer[b].score, cns->dps->buffer[a].score));
  560. dp = ref_dbgcnsdpv(cns->dps, dp_idx);
  561. if(dp->score > cns->max_score){
  562. cns->max_score = dp->score;
  563. cns->best_idx = dp_idx;
  564. }
  565. }
  566. }
  567. if(cns->best_idx == 0) return 0;
  568. // traceback to get cns seq
  569. dp_idx = cns->best_idx;
  570. while(dp_idx){
  571. dp = ref_dbgcnsdpv(cns->dps, dp_idx);
  572. push_u1v(cns->cigars, dp->path);
  573. cns->alns[dp->path] ++;
  574. if(dp->path != DBGCNS_PATH_I){
  575. k = ref_dbgcnskmerv(cns->g->kmers, dp->kidx);
  576. push_u1v(cns->cns, k->mer & 0x03);
  577. if(k->cov == 1){
  578. add_char_string(cns->seq, "acgt"[k->mer & 0x03]);
  579. } else {
  580. add_char_string(cns->seq, "ACGT"[k->mer & 0x03]);
  581. }
  582. }
  583. dp_idx = dp->bt_idx;
  584. }
  585. // first ksize - 1 bases may be not corrected, truncated
  586. reverse_string(cns->seq);
  587. reverse_u1v(cns->cns);
  588. reverse_u1v(cns->cigars);
  589. return cns->seq->size;
  590. }
  591. static inline int hierarchical_clustering_edge_links(FBG *fbg, fbg_edge_t *e, linkgrpv *grps, double max_var){
  592. fbg_link_t *lnk;
  593. u4i lidx, gidx;
  594. u4i i, b;
  595. double sum, avg;
  596. clear_linkgrpv(grps);
  597. lidx = e->link;
  598. while(lidx){
  599. lnk = ref_fbglinkv(fbg->links, lidx);
  600. push_linkgrpv(grps, (link_grp_t){lidx, lnk->rlen, 0});
  601. lidx = lnk->next;
  602. }
  603. if(grps->size == 0) return 0;
  604. sort_array(grps->buffer, grps->size, link_grp_t, num_cmpgt(a.len, b.len));
  605. gidx = 0;
  606. b = 0;
  607. sum = avg = grps->buffer[0].len;
  608. for(i=1;i<grps->size;i++){
  609. if(grps->buffer[i].len - avg > avg * max_var){
  610. gidx ++;
  611. sum = avg = grps->buffer[i].len;
  612. b = i;
  613. } else {
  614. sum += grps->buffer[i].len;
  615. avg = sum / (i - b + 1);
  616. if(avg - grps->buffer[b].len > avg * max_var || grps->buffer[i].len - avg > avg * max_var){
  617. gidx ++;
  618. sum = avg = grps->buffer[i].len;
  619. b = i;
  620. }
  621. }
  622. grps->buffer[i].gidx = gidx;
  623. }
  624. return gidx + 1;
  625. }
  626. static inline int revise_edge_fbg(FBG *fbg, u4i kidx, u4i eidx, linkgrpv *grps, double max_var){
  627. fbg_kmer_t *k;
  628. fbg_edge_t *e, *p;
  629. fbg_link_t *lnk;
  630. u4i eidx2, eidx3;
  631. u4i i, b, j, ng, avg, gidx;
  632. u4i max_cov, max_eidx, key;
  633. double sum;
  634. e = ref_fbgedgev(fbg->edges, eidx);
  635. ng = hierarchical_clustering_edge_links(fbg, e, grps, max_var);
  636. encap_fbgedgev(fbg->edges, ng - 1);
  637. k = ref_fbgkmerh(fbg->kmers, kidx);
  638. e = ref_fbgedgev(fbg->edges, eidx);
  639. e->key = key = e->key? 2 : 0;
  640. e->most = 0;
  641. if(DBGCNS_DEBUG){
  642. if(ng > 1){
  643. fprintf(stderr, "REVISE K%d -> K%d cov = %d into %d edges --\n", k->off, ref_fbgkmerh(fbg->kmers, e->node)->off, e->cov, ng); fflush(stderr);
  644. }
  645. }
  646. ref_fbgkmerh(fbg->kmers, e->node)->n_in += ng - 1;
  647. p = e;
  648. p->link = 0;
  649. eidx3 = e->next;
  650. gidx = 0;
  651. sum = 0;
  652. max_cov = 0;
  653. max_eidx = 0;
  654. for(i=b=0;i<=grps->size;i++){
  655. if(i == grps->size || grps->buffer[i].gidx != gidx){
  656. avg = sum / (i - b) + 0.5;
  657. if(p == NULL){
  658. eidx2 = fbg->edges->size;
  659. p = next_ref_fbgedgev(fbg->edges);
  660. p->node = e->node;
  661. e->next = eidx2;
  662. p->next = eidx3;
  663. }
  664. p->link = 0;
  665. p->cov = i - b;
  666. p->dist = avg;
  667. p->key = key;
  668. p->select = 0;
  669. p->most = 0;
  670. if(p->cov > max_cov){
  671. max_cov = p->cov;
  672. max_eidx = offset_fbgedgev(fbg->edges, p);
  673. } else if(p->cov == max_cov){
  674. max_eidx = 0;
  675. }
  676. if(DBGCNS_DEBUG){
  677. if(ng > 1){
  678. fprintf(stderr, "+ %d %d --\n", p->cov, p->dist); fflush(stderr);
  679. }
  680. }
  681. for(j=b;j<i;j++){
  682. lnk = ref_fbglinkv(fbg->links, grps->buffer[j].lidx);
  683. lnk->next = p->link;
  684. p->link = grps->buffer[j].lidx;
  685. if(lnk->key) p->key = 1;
  686. }
  687. e = p;
  688. p = NULL;
  689. b = i;
  690. gidx ++;
  691. sum = 0;
  692. }
  693. if(i < grps->size) sum += grps->buffer[i].len;
  694. }
  695. if(max_eidx){
  696. ref_fbgedgev(fbg->edges, max_eidx)->most = 1;
  697. }
  698. return gidx;
  699. }
  700. static inline void revise_edge_cov_fbg(FBG *fbg, fbg_edge_t *e){
  701. fbg_link_t *lnk;
  702. u4i lidx, cnt;
  703. double sum, var, max, avg, std;
  704. sum = 0;
  705. var = 0;
  706. max = 0;
  707. cnt = 0;
  708. lidx = e->link;
  709. while(lidx){
  710. lnk = ref_fbglinkv(fbg->links, lidx);
  711. if((double)lnk->rlen > max) max = lnk->rlen;
  712. sum += lnk->rlen;
  713. var += lnk->rlen * lnk->rlen;
  714. cnt ++;
  715. lidx = lnk->next;
  716. }
  717. if(cnt == 0) return;
  718. avg = sum / cnt;
  719. std = sqrt(var / cnt - avg * avg);
  720. if(std > 10 && std > 0.2 * avg) std = 0.2 * max;
  721. if(std < 1) std = 1;
  722. cnt = 0;
  723. sum = 0;
  724. lidx = e->link;
  725. while(lidx){
  726. lnk = ref_fbglinkv(fbg->links, lidx);
  727. if(num_diff((double)lnk->rlen, avg) <= std){
  728. sum += lnk->rlen;
  729. cnt ++;
  730. }
  731. lidx = lnk->next;
  732. }
  733. if(cnt == 0) cnt = 1;
  734. avg = sum / cnt;
  735. e->cov = cnt;
  736. e->dist = avg + 0.5;
  737. }
  738. static inline void build_DirectFuzzyBruijnGraph(CNS *cns, u4i ridx, u4i cov_cutoff, double max_dist_var){
  739. DBG *g;
  740. dbgcns_kmer_t *k;
  741. fbg_kmer_t A, *a;
  742. fbg_edge_t *e;
  743. fbg_link_t *l;
  744. rd_kmer_t *rk;
  745. u8i kmer;
  746. u4i r, rr, i, j, beg, end, c, len, idx1, off1, idx2, off2, *u;
  747. u4i eidx;
  748. u1i b, *seq;
  749. int exists;
  750. g = cns->g;
  751. // select high cov kmers
  752. if(cov_cutoff){
  753. seq = cns->qseqs->buffer + cns->qblks->buffer[ridx].off;
  754. len = cns->qblks->buffer[ridx].len;
  755. memset(&A, 0, sizeof(fbg_kmer_t));
  756. kmer = 0;
  757. j = 0x0000FFFFU;
  758. for(i=0;i<len;){
  759. b = seq[i];
  760. kmer = ((kmer << 2) | b) & g->kmask;
  761. i ++;
  762. if(i < g->ksize) continue;
  763. g->kmers->buffer[0].mer = kmer;
  764. u = get_dbgcnskmerhash(g->khash, 0);
  765. if(u == NULL) continue;
  766. k = ref_dbgcnskmerv(g->kmers, *u);
  767. if(k->cov < cov_cutoff) continue;
  768. //if(j + 1 == i){ j = i; continue; }
  769. A.mer = kmer;
  770. a = prepare_fbgkmerh(cns->fbg->kmers, A, &exists);
  771. if(exists){
  772. a->closed = 1;
  773. } else {
  774. a->mer = kmer;
  775. a->off = i - g->ksize;
  776. a->closed = 0;
  777. a->n_in = 0;
  778. a->n_visit = 0;
  779. a->edges = 0;
  780. a->ptr = *u;
  781. j = i;
  782. }
  783. }
  784. } else {
  785. // select best cov per 10 bp, but cov >= 4
  786. clear_rdkmerv(cns->fbg->mats);
  787. seq = cns->qseqs->buffer + cns->qblks->buffer[ridx].off;
  788. len = cns->qblks->buffer[ridx].len;
  789. memset(&A, 0, sizeof(fbg_kmer_t));
  790. kmer = 0;
  791. for(i=0;i<len;){
  792. b = seq[i];
  793. kmer = ((kmer << 2) | b) & g->kmask;
  794. i ++;
  795. if(i < g->ksize) continue;
  796. g->kmers->buffer[0].mer = kmer;
  797. u = get_dbgcnskmerhash(g->khash, 0);
  798. if(u == NULL) continue;
  799. if(g->kmers->buffer[*u].cov < 4) continue;
  800. push_rdkmerv(cns->fbg->mats, (rd_kmer_t){ridx, *u, i - g->ksize, 0});
  801. }
  802. sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(g->kmers->buffer[b.kidx].cov, g->kmers->buffer[a.kidx].cov, a.kidx, b.kidx));
  803. for(i=1;i<cns->fbg->mats->size;i++){
  804. rk = ref_rdkmerv(cns->fbg->mats, i);
  805. if(rk->kidx == ref_rdkmerv(cns->fbg->mats, i - 1)->kidx){
  806. rk->closed = 1;
  807. ref_rdkmerv(cns->fbg->mats, i - 1)->closed = 1;
  808. }
  809. }
  810. sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgt(a.koff, b.koff));
  811. beg = 0;
  812. j = 0xFFFFFFFFU;
  813. for(i=0;i<cns->fbg->mats->size;i++){
  814. rk = ref_rdkmerv(cns->fbg->mats, i);
  815. if(rk->closed) continue;
  816. if(j == 0xFFFFFFFFU) j = i;
  817. if(rk->koff - beg < 10){
  818. if(g->kmers->buffer[rk->kidx].cov > g->kmers->buffer[cns->fbg->mats->buffer[j].kidx].cov){
  819. j = i;
  820. }
  821. continue;
  822. }
  823. rk = ref_rdkmerv(cns->fbg->mats, j);
  824. beg = rk->koff;
  825. j = i;
  826. i --;
  827. k = ref_dbgcnskmerv(g->kmers, rk->kidx);
  828. kmer = k->mer;
  829. A.mer = kmer;
  830. a = prepare_fbgkmerh(cns->fbg->kmers, A, &exists);
  831. if(exists){
  832. a->closed = 1;
  833. } else {
  834. a->mer = kmer;
  835. a->off = rk->koff;
  836. a->closed = 0;
  837. a->n_in = 0;
  838. a->n_visit = 0;
  839. a->edges = 0;
  840. a->ptr = rk->kidx;
  841. }
  842. }
  843. }
  844. clear_rdkmerv(cns->fbg->mats);
  845. for(r=0;r<cns->qblks->size;r++){
  846. seq = cns->qseqs->buffer + cns->qblks->buffer[r].off;
  847. len = cns->qblks->buffer[r].len;
  848. memset(&A, 0, sizeof(fbg_kmer_t));
  849. kmer = 0;
  850. for(i=0;i<len;){
  851. b = seq[i];
  852. kmer = ((kmer << 2) | b) & g->kmask;
  853. i ++;
  854. if(i < g->ksize) continue;
  855. A.mer = kmer;
  856. a = get_fbgkmerh(cns->fbg->kmers, A);
  857. if(a == NULL) continue;
  858. if(a->closed) continue;
  859. push_rdkmerv(cns->fbg->mats, (rd_kmer_t){r, offset_fbgkmerh(cns->fbg->kmers, a), i - g->ksize, 0});
  860. }
  861. }
  862. sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(a.ridx, b.ridx, a.kidx, b.kidx));
  863. for(i=1;i<cns->fbg->mats->size;i++){
  864. if(cns->fbg->mats->buffer[i-1].ridx == cns->fbg->mats->buffer[i].ridx && cns->fbg->mats->buffer[i-1].kidx == cns->fbg->mats->buffer[i].kidx){
  865. ref_fbgkmerh(cns->fbg->kmers, cns->fbg->mats->buffer[i].kidx)->closed = 1;
  866. }
  867. }
  868. for(i=0;i<cns->fbg->mats->size;i++){
  869. if(ref_fbgkmerh(cns->fbg->kmers, cns->fbg->mats->buffer[i].kidx)->closed){
  870. cns->fbg->mats->buffer[i].closed = 1;
  871. }
  872. }
  873. sort_array(cns->fbg->mats->buffer, cns->fbg->mats->size, rd_kmer_t, num_cmpgtx(a.ridx, b.ridx, a.koff, b.koff));
  874. // add edges
  875. for(i=0;i+1<cns->fbg->mats->size;i++){
  876. if(cns->fbg->mats->buffer[i].closed) continue;
  877. idx1 = cns->fbg->mats->buffer[i].kidx;
  878. off1 = cns->fbg->mats->buffer[i].koff;
  879. a = ref_fbgkmerh(cns->fbg->kmers, idx1);
  880. for(j=i+1,c=0;j<cns->fbg->mats->size&&c<1;j++){
  881. if(cns->fbg->mats->buffer[j].ridx != cns->fbg->mats->buffer[i].ridx) break;
  882. if(cns->fbg->mats->buffer[j].closed) continue;
  883. idx2 = cns->fbg->mats->buffer[j].kidx;
  884. off2 = cns->fbg->mats->buffer[j].koff;
  885. if(a->off >= ref_fbgkmerh(cns->fbg->kmers, idx2)->off) continue;
  886. c ++;
  887. eidx = a->edges;
  888. while(eidx){
  889. e = ref_fbgedgev(cns->fbg->edges, eidx);
  890. if(e->node == idx2) break;
  891. eidx = e->next;
  892. }
  893. if(eidx == 0){
  894. eidx = cns->fbg->edges->size;
  895. e = next_ref_fbgedgev(cns->fbg->edges);
  896. e->node = idx2;
  897. e->link = 0;
  898. e->next = a->edges;
  899. e->cov = 0;
  900. e->dist = 0;
  901. e->key = 0;
  902. e->select = 0;
  903. a->edges = eidx;
  904. ref_fbgkmerh(cns->fbg->kmers, idx2)->n_in ++;
  905. }
  906. e = ref_fbgedgev(cns->fbg->edges, eidx);
  907. if(cns->fbg->mats->buffer[i].ridx == ridx) e->key = 1;
  908. e->cov ++;
  909. l = next_ref_fbglinkv(cns->fbg->links);
  910. l->ridx = cns->fbg->mats->buffer[i].ridx;
  911. l->roff = off1;
  912. l->rlen = off2 - off1;
  913. l->key = (cns->fbg->mats->buffer[i].ridx == ridx);
  914. l->select = 0;
  915. l->next = e->link;
  916. e->link = cns->fbg->links->size - 1;
  917. }
  918. }
  919. rr = 0xFFFFFFFFU;
  920. for(beg=end=0;beg<cns->fbg->mats->size;beg=end){
  921. r = rr;
  922. for(;end<cns->fbg->mats->size;end++){
  923. if(cns->fbg->mats->buffer[end].closed) continue;
  924. if(r == 0xFFFFFFFFU){
  925. r = cns->fbg->mats->buffer[end].ridx;
  926. } else if(cns->fbg->mats->buffer[end].ridx == r) continue;
  927. else { rr = cns->fbg->mats->buffer[end].ridx; break; }
  928. }
  929. if(r == ridx) continue; // reference seq
  930. for(i=beg;i+2<end;i++){
  931. if(cns->fbg->mats->buffer[i].closed) continue;
  932. idx1 = cns->fbg->mats->buffer[i].kidx;
  933. off1 = cns->fbg->mats->buffer[i].koff;
  934. a = ref_fbgkmerh(cns->fbg->kmers, idx1);
  935. c = 0;
  936. for(j=i+1;j<end;j++){
  937. if(cns->fbg->mats->buffer[j].closed) continue;
  938. idx2 = cns->fbg->mats->buffer[j].kidx;
  939. off2 = cns->fbg->mats->buffer[j].koff;
  940. if(a->off >= ref_fbgkmerh(cns->fbg->kmers, idx2)->off) continue;
  941. c ++;
  942. if(c < 2) continue;
  943. if(c > 5) break;
  944. eidx = a->edges;
  945. while(eidx){
  946. e = ref_fbgedgev(cns->fbg->edges, eidx);
  947. if(e->node == idx2) break;
  948. eidx = e->next;
  949. }
  950. if(eidx == 0) continue;
  951. e = ref_fbgedgev(cns->fbg->edges, eidx);
  952. e->cov ++;
  953. l = next_ref_fbglinkv(cns->fbg->links);
  954. l->ridx = r;
  955. l->roff = off1;
  956. l->rlen = off2 - off1;
  957. l->key = 0;
  958. l->select = 0;
  959. l->next = e->link;
  960. e->link = cns->fbg->links->size - 1;
  961. }
  962. }
  963. }
  964. if(0){
  965. for(i=1;i<cns->fbg->edges->size;i++){
  966. revise_edge_cov_fbg(cns->fbg, ref_fbgedgev(cns->fbg->edges, i));
  967. }
  968. } else {
  969. reset_iter_fbgkmerh(cns->fbg->kmers);
  970. while((a = ref_iter_fbgkmerh(cns->fbg->kmers))){
  971. eidx = a->edges;
  972. while(eidx){
  973. e = ref_fbgedgev(cns->fbg->edges, eidx);
  974. eidx = e->next;
  975. revise_edge_fbg(cns->fbg, offset_fbgkmerh(cns->fbg->kmers, a), offset_fbgedgev(cns->fbg->edges, e), cns->fbg->grps, max_dist_var);
  976. }
  977. }
  978. }
  979. }
  980. static inline void print_dot_DirectFuzzyBruijnGraph(CNS *cns, FILE *out){
  981. fbg_kmer_t *k, *n;
  982. fbg_edge_t *e;
  983. fbg_link_t *l;
  984. u4i eidx, lidx;
  985. if(out == NULL) return;
  986. fprintf(out, "digraph {\n\trankdir=LR\n");
  987. reset_iter_fbgkmerh(cns->fbg->kmers);
  988. while((k = ref_iter_fbgkmerh(cns->fbg->kmers))){
  989. if(k->closed) continue;
  990. eidx = k->edges;
  991. while(eidx){
  992. e = ref_fbgedgev(cns->fbg->edges, eidx);
  993. if(e->cov >= 3 || e->key || e->select){
  994. n = ref_fbgkmerh(cns->fbg->kmers, e->node);
  995. lidx = e->link;
  996. while(lidx){
  997. l = ref_fbglinkv(cns->fbg->links, lidx);
  998. fprintf(out, "\tK%d -> K%d [label=\"R%04d_%d_%d(%d:%d)\" color=%s%s]\n", k->off, n->off, l->ridx, l->roff, l->rlen, e->cov, e->dist, l->key? "blue" : (e->most? "green" : "black"), l->select? " style=dashed" : "");
  999. lidx = l->next;
  1000. }
  1001. }
  1002. eidx = e->next;
  1003. }
  1004. }
  1005. fprintf(out, "}\n");
  1006. }
  1007. static inline void DP_best_path_DirectFuzzyBruijnGraph(CNS *cns){
  1008. FBG *fbg;
  1009. fbg_kmer_t *k, *n;
  1010. fbg_edge_t *e;
  1011. u4v *heap;
  1012. u4i kidx, nb, ne, nboff, neoff;
  1013. u4i eidx, etmp;
  1014. int ref_score, ref_one, alt_one, most_score, max_dist_var, var, score;
  1015. float dist_var;
  1016. ref_score = 1;
  1017. ref_one = -50;
  1018. alt_one = -1000;
  1019. most_score = 2;
  1020. max_dist_var = 100;
  1021. dist_var = -0.5;
  1022. fbg = cns->fbg;
  1023. reset_iter_fbgkmerh(fbg->kmers);
  1024. nb = ne = 0xFFFFFFFFU;
  1025. nboff = 0xFFFFFFFFU;
  1026. neoff = 0;
  1027. while((k = ref_iter_fbgkmerh(fbg->kmers))){
  1028. k->n_visit = 0;
  1029. k->bt_node = 0xFFFFFFFFU;
  1030. k->bt_edge = 0;
  1031. k->bt_score = BT_SCORE_MIN;
  1032. if(k->closed) continue;
  1033. if(k->off < nboff){ nb = offset_fbgkmerh(fbg->kmers, k); nboff = k->off; }
  1034. if(k->off >= neoff){ ne = offset_fbgkmerh(fbg->kmers, k); neoff = k->off; }
  1035. }
  1036. if(nb == 0xFFFFFFFFU) return;
  1037. heap = init_u4v(32);
  1038. k = ref_fbgkmerh(fbg->kmers, nb);
  1039. k->bt_score = 0;
  1040. push_u4v(heap, nb);
  1041. while(heap->size){
  1042. kidx = heap->buffer[--heap->size];
  1043. k = ref_fbgkmerh(fbg->kmers, kidx);
  1044. eidx = k->edges;
  1045. while(eidx){
  1046. e = ref_fbgedgev(fbg->edges, eidx);
  1047. n = ref_fbgkmerh(fbg->kmers, e->node);
  1048. score = k->bt_score + e->most * most_score;
  1049. var = n->off - k->off;
  1050. var = num_diff(var, (int)e->dist);
  1051. if(e->key == 1){
  1052. score += ref_score + (e->cov <= 1? ref_one : 0);
  1053. } else if(e->key == 2){
  1054. score += (e->cov <= 1? alt_one : 0);
  1055. } else {
  1056. if(var > max_dist_var) score = -1000000;
  1057. else score += var * dist_var + (e->cov <= 1? alt_one : 0);
  1058. }
  1059. if(score > n->bt_score){
  1060. n->bt_score = score;
  1061. n->bt_node = kidx;
  1062. n->bt_edge = eidx;
  1063. }
  1064. n->n_visit ++;
  1065. if(n->n_visit >= n->n_in){
  1066. push_u4v(heap, e->node);
  1067. }
  1068. eidx = e->next;
  1069. }
  1070. }
  1071. free_u4v(heap);
  1072. k = ref_fbgkmerh(fbg->kmers, ne);
  1073. if(ne != nb && k->bt_edge == 0){
  1074. FILE *out = open_file_for_write("debug.dot", NULL, 1);
  1075. print_dot_DirectFuzzyBruijnGraph(cns, out);
  1076. fclose(out);
  1077. fprintf(stderr, " -- something wrong, nb = %d(K%04d), ne = %d(K%04d) in %s -- %s:%d --\n", nb, fbg->kmers->array[nb].off, ne, fbg->kmers->array[ne].off, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1078. exit(1);
  1079. }
  1080. kidx = 0xFFFFFFFFU;
  1081. eidx = 0;
  1082. while(kidx != nb){
  1083. k = ref_fbgkmerh(fbg->kmers, ne);
  1084. ne = k->bt_node;
  1085. swap_tmp(eidx, k->bt_edge, etmp);
  1086. k->bt_node = kidx;
  1087. kidx = offset_fbgkmerh(fbg->kmers, k);
  1088. }
  1089. }
  1090. static inline int correct_struct_DirectFuzzyBruijnGraph(CNS *cns, u4i ridx){
  1091. FBG *fbg;
  1092. fbg_kmer_t *k, *n;
  1093. fbg_edge_t *e;
  1094. fbg_link_t *lnk;
  1095. int chg;
  1096. u4i off, kidx, koff, eidx, lidx, key, upd;
  1097. fbg = cns->fbg;
  1098. reset_iter_fbgkmerh(fbg->kmers);
  1099. kidx = 0xFFFFFFFFU;
  1100. koff = 0xFFFFFFFFU;
  1101. while((k = ref_iter_fbgkmerh(fbg->kmers))){
  1102. if(k->closed) continue;
  1103. if(k->off < koff){ kidx = offset_fbgkmerh(fbg->kmers, k); koff = k->off; }
  1104. }
  1105. clear_u1v(fbg->starseq);
  1106. if(DBGCNS_DEBUG){
  1107. fprintf(stderr, " -- select seq[%d] len=%d in %s -- %s:%d --\n", ridx, cns->qblks->buffer[ridx].len, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1108. }
  1109. if(kidx == 0xFFFFFFFFU){
  1110. append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off, cns->qblks->buffer[ridx].len);
  1111. return 0;
  1112. }
  1113. chg = 0;
  1114. off = 0;
  1115. while(1){
  1116. k = ref_fbgkmerh(fbg->kmers, kidx);
  1117. if(off < k->off){
  1118. if(DBGCNS_DEBUG){
  1119. fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d -> %d\n", (int)fbg->starseq->size, k->off - off, ridx, off, k->off);
  1120. }
  1121. append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off + off, k->off - off);
  1122. off = k->off;
  1123. }
  1124. if(k->bt_edge == 0) break;
  1125. eidx = k->bt_edge;
  1126. e = ref_fbgedgev(fbg->edges, eidx);
  1127. e->select = 1;
  1128. key = 0;
  1129. double sum, var, cnt, avg, std, min;
  1130. sum = 0;
  1131. var = 0;
  1132. cnt = 0;
  1133. lidx = e->link;
  1134. while(lidx){
  1135. lnk = ref_fbglinkv(fbg->links, lidx);
  1136. sum += lnk->rlen;
  1137. var += lnk->rlen * lnk->rlen;
  1138. cnt ++;
  1139. if(lnk->key) key = lidx;
  1140. lidx = lnk->next;
  1141. }
  1142. if(key){
  1143. upd = key;
  1144. } else {
  1145. avg = sum / cnt;
  1146. std = sqrt(var / cnt - avg * avg);
  1147. if(std < 1) std = 1;
  1148. lidx = e->link;
  1149. min = 100000;
  1150. upd = key;
  1151. while(lidx){
  1152. lnk = ref_fbglinkv(fbg->links, lidx);
  1153. if(num_diff((double)lnk->rlen, avg) < min){
  1154. upd = lidx;
  1155. min = num_diff((double)lnk->rlen, avg);
  1156. }
  1157. lidx = lnk->next;
  1158. }
  1159. }
  1160. lnk = ref_fbglinkv(fbg->links, upd);
  1161. lnk->select = 1;
  1162. kidx = k->bt_node;
  1163. n = ref_fbgkmerh(fbg->kmers, kidx);
  1164. if(DBGCNS_DEBUG){
  1165. fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d + %d -> %d\n", (int)fbg->starseq->size, lnk->rlen, lnk->ridx, off, n->off - off, n->off);
  1166. }
  1167. append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[lnk->ridx].off + lnk->roff, lnk->rlen);
  1168. off = n->off;
  1169. }
  1170. k = NULL;
  1171. if(off < cns->qblks->buffer[ridx].len){
  1172. if(DBGCNS_DEBUG){
  1173. fprintf(stderr, "-- %d + %d bases from seq[%d], offset %d -> %d\n", (int)fbg->starseq->size, cns->qblks->buffer[ridx].len - off, ridx, off, cns->qblks->buffer[ridx].len);
  1174. }
  1175. append_array_u1v(fbg->starseq, cns->qseqs->buffer + cns->qblks->buffer[ridx].off + off, cns->qblks->buffer[ridx].len - off);
  1176. }
  1177. return chg;
  1178. }
  1179. static inline int homopolymer_analysis_cns(CNS *cns){
  1180. u8i kmer, xmask[2];
  1181. u4i chg, i, j, l, r, c, brun, *u, kcnts[3];
  1182. u1i b;
  1183. char kstr[64];
  1184. UNUSED(kstr); // only for compile warning
  1185. xmask[1] = 0xFFFFFFFFFFFFFFFFLLU >> (64 - (cns->g->ksize / 2 * 2));
  1186. xmask[0] = (~xmask[1]) & cns->g->kmask;
  1187. b = 4; brun = 0;
  1188. clear_u1v(cns->g->zseq);
  1189. chg = 0;
  1190. for(i=0;i<cns->g->ksize;i++) push_u1v(cns->g->zseq, cns->cns->buffer[i]);
  1191. for(;i+cns->g->ksize<cns->cns->size;i++){
  1192. if(cns->cns->buffer[i] == b){
  1193. brun ++;
  1194. } else {
  1195. if(brun >= 3 && brun + 2 <= cns->g->ksize){
  1196. if(DBGCNS_DEBUG){
  1197. fprintf(stderr, "POLY %c(%d) at pos %d\n", "ACGT"[b], brun, i);
  1198. }
  1199. r = i - brun;
  1200. c = (cns->g->ksize - brun) / 2;
  1201. l = r - c;
  1202. kmer = 0;
  1203. for(j=l;j<r;j++) kmer = (kmer << 2) | cns->cns->buffer[j];
  1204. for(j=1;j<brun;j++) kmer = (kmer << 2) | b;
  1205. l = i;
  1206. r = l + (cns->g->ksize - brun - c) + 1;
  1207. for(j=l;j<r;j++) kmer = (kmer << 2) | cns->cns->buffer[j];
  1208. cns->g->kmers->buffer[0].mer = kmer;
  1209. u = get_dbgcnskmerhash(cns->g->khash, 0);
  1210. kcnts[0] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
  1211. if(DBGCNS_DEBUG){
  1212. kmer2seq(kstr, kmer, cns->g->ksize);
  1213. fprintf(stderr, "%s\t%d\n", kstr, kcnts[0]);
  1214. }
  1215. kmer = (kmer & xmask[0]) | (((u8i)b) << (cns->g->ksize / 2 * 2 - 2)) | ((kmer & xmask[1]) >> 2);
  1216. cns->g->kmers->buffer[0].mer = kmer;
  1217. u = get_dbgcnskmerhash(cns->g->khash, 0);
  1218. kcnts[1] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
  1219. if(DBGCNS_DEBUG){
  1220. kmer2seq(kstr, kmer, cns->g->ksize);
  1221. fprintf(stderr, "%s\t%d\n", kstr, kcnts[1]);
  1222. }
  1223. kmer = (kmer & xmask[0]) | (((u8i)b) << (cns->g->ksize / 2 * 2 - 2)) | ((kmer & xmask[1]) >> 2);
  1224. cns->g->kmers->buffer[0].mer = kmer;
  1225. u = get_dbgcnskmerhash(cns->g->khash, 0);
  1226. kcnts[2] = u? ref_dbgcnskmerv(cns->g->kmers, *u)->cov : 0;
  1227. if(DBGCNS_DEBUG){
  1228. kmer2seq(kstr, kmer, cns->g->ksize);
  1229. fprintf(stderr, "%s\t%d\n", kstr, kcnts[2]);
  1230. }
  1231. if(kcnts[0] > kcnts[1] && kcnts[1] >= kcnts[2]){ // there is a insertion base
  1232. brun --;
  1233. chg ++;
  1234. if(DBGCNS_DEBUG){
  1235. fprintf(stderr, "#HOMO ins\n");
  1236. }
  1237. } else if(kcnts[2] > kcnts[1] && kcnts[1] >= kcnts[0]){ // deletion
  1238. brun ++;
  1239. chg ++;
  1240. if(DBGCNS_DEBUG){
  1241. fprintf(stderr, "#HOMO del\n");
  1242. }
  1243. }
  1244. }
  1245. for(j=0;j<brun;j++) push_u1v(cns->g->zseq, b);
  1246. b = cns->cns->buffer[i];
  1247. brun = 1;
  1248. }
  1249. }
  1250. for(j=0;j<brun;j++) push_u1v(cns->g->zseq, b);
  1251. for(;i<cns->cns->size;i++) push_u1v(cns->g->zseq, cns->cns->buffer[i]);
  1252. if(chg){
  1253. clear_u1v(cns->cns);
  1254. append_u1v(cns->cns, cns->g->zseq);
  1255. clear_string(cns->seq);
  1256. encap_string(cns->seq, cns->g->zseq->size);
  1257. for(i=0;i<cns->g->zseq->size;i++){
  1258. cns->seq->string[i] = bit_base_table[cns->g->zseq->buffer[i]];
  1259. }
  1260. cns->seq->size = i;
  1261. cns->seq->string[i] = '\0';
  1262. }
  1263. return chg;
  1264. }
  1265. static inline int run_cns(CNS *cns, int candidate_mode, int corr_struct){
  1266. u4i i;
  1267. if(cns->qblks->size == 0) return 0;
  1268. cns->qidx = 0;
  1269. if(candidate_mode == 4){ // longest
  1270. for(i=0;i<cns->qblks->size;i++){
  1271. if(cns->qblks->buffer[i].solid == 0) continue;
  1272. if(cns->qblks->buffer[i].len > cns->qblks->buffer[cns->qidx].len) cns->qidx = i;
  1273. }
  1274. } else if(candidate_mode == 5){ // shortest
  1275. for(i=0;i<cns->qblks->size;i++){
  1276. if(cns->qblks->buffer[i].solid == 0) continue;
  1277. if(cns->qblks->buffer[i].len < cns->qblks->buffer[cns->qidx].len) cns->qidx = i;
  1278. }
  1279. } else if(candidate_mode == 3){ // first and but not increase coverage
  1280. cns->qidx = 0;
  1281. } else if(candidate_mode == 0){ // kmer coverage
  1282. cns->qidx = 0;
  1283. } else if(candidate_mode == 2){ // first and include
  1284. cns->qidx = 0;
  1285. } else if(candidate_mode < 0){
  1286. cns->qidx = num_min(-1 - candidate_mode, (int)(cns->qblks->size - 1));
  1287. } else if(candidate_mode == 1){ // median
  1288. u2v *idxs;
  1289. idxs = init_u2v(cns->qblks->size);
  1290. for(i=0;i<cns->qblks->size;i++){
  1291. if(cns->qblks->buffer[i].solid == 0) continue;
  1292. push_u2v(idxs, i);
  1293. }
  1294. cns->qidx = quick_median_array(idxs->buffer, idxs->size, u2i, num_cmpgt(cns->qblks->buffer[a].len, cns->qblks->buffer[b].len));
  1295. } else {
  1296. fprintf(stderr, " -- Unknown candidate mode %d in %s -- %s:%d --\n", candidate_mode, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1297. }
  1298. ready_core_cns(cns, candidate_mode, cns->qblks->buffer[cns->qidx].len);
  1299. if(candidate_mode == 0){
  1300. double max, cov;
  1301. max = - 1000000;
  1302. for(i=0;i<cns->g->kmers->size;i++) cns->g->kmers->buffer[i].visit = 0;
  1303. for(i=cns->qblks->size;i>0;i--){
  1304. if(cns->qblks->buffer[i - 1].solid == 0) continue;
  1305. cov = kmer_cov_seq_dbg(cns->g, cns->qseqs->buffer + cns->qblks->buffer[i - 1].off, cns->qblks->buffer[i - 1].len, (i - 1 + 1) % 255) * 1.0 / cns->qblks->buffer[i - 1].len;
  1306. if(cov > max){
  1307. max = cov;
  1308. cns->qidx = i - 1;
  1309. }
  1310. }
  1311. // revise cns->avg_cov
  1312. cns->avg_cov = cns->avg_cov * cns->qblks->buffer[0].len / cns->qblks->buffer[cns->qidx].len;
  1313. }
  1314. if(corr_struct){
  1315. build_DirectFuzzyBruijnGraph(cns, cns->qidx, 5, 0.2);
  1316. //build_DirectFuzzyBruijnGraph(cns, cns->qidx, 0, 0.2);
  1317. DP_best_path_DirectFuzzyBruijnGraph(cns);
  1318. correct_struct_DirectFuzzyBruijnGraph(cns, cns->qidx);
  1319. if(DBGCNS_DEBUG){
  1320. FILE *dotf = open_file_for_write("debug.dot", NULL, 1);
  1321. print_dot_DirectFuzzyBruijnGraph(cns, dotf);
  1322. fclose(dotf);
  1323. }
  1324. run_core_cns(cns, cns->fbg->starseq->buffer, cns->fbg->starseq->size);
  1325. } else {
  1326. run_core_cns(cns, cns->qseqs->buffer + cns->qblks->buffer[cns->qidx].off, cns->qblks->buffer[cns->qidx].len);
  1327. }
  1328. homopolymer_analysis_cns(cns);
  1329. return cns->seq->size;
  1330. }
  1331. #endif