/* * * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com> * * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __WTDBG_H_RJ #define __WTDBG_H_RJ #include "kbm.h" #include "kbmpoa.h" #include "filewriter.h" #include "pgzf.h" #include <getopt.h> #include <regex.h> #define WT_MAX_RD 0x3FFFFFFF // 1 G #define WT_MAX_RDLEN 0x00FFFFFF // 16 Mb #define WT_MAX_RDBIN 0x0000FFFF // 64 K bins #define WT_MAX_NODE 0x000000FFFFFFFFFFLLU #define WT_MAX_EDGE 0x000000FFFFFFFFFFLLU #define WT_MAX_NODE_EDGES 0xFFFF #define WT_MAX_EDGE_COV 0xFFF typedef struct { u8i rid:30, dir:1, beg:16, end:16, closed:1; } rd_frg_t; define_list(rdfrgv, rd_frg_t); typedef struct { u8i node; u8i rid:30, dir:1, beg:16, end:16, closed:1; } rd_reg_t; define_list(rdregv, rd_reg_t); typedef struct { u8i rid:30, dir:1, beg:16, end:16, closed:1; u8i read_link; } rd_rep_t; define_list(rdrepv, rd_rep_t); typedef struct { u8i node; u8i rid:30, dir:1, beg:16, end:16, closed:1; u8i read_link; } reg_t; define_list(regv, reg_t); typedef struct { uint32_t x, y; } xy_t; define_list(xyv, xy_t); typedef struct { uint64_t idx:46, cnt:18; } vec_ref_t; typedef struct { uint64_t idx:46, cnt:18; } ptr_ref_t; typedef struct { uint64_t idx:46, cnt:18, fix:1, rank:45, score:18; } rnk_ref_t; static const vec_ref_t VEC_REF_NULL = (vec_ref_t){0, 0}; static const ptr_ref_t PTR_REF_NULL = (ptr_ref_t){0, 0}; static const rnk_ref_t RNK_REF_NULL = (rnk_ref_t){0, 0, 0, 0, 0}; define_list(vecrefv, vec_ref_t); define_list(ptrrefv, ptr_ref_t); define_list(rnkrefv, rnk_ref_t); #define ptrref_hashcode(E) u64hashcode((E).idx) #define ptrref_hashequals(E1, E2) ((E1).idx == (E2).idx) define_hashset(ptrrefhash, ptr_ref_t, ptrref_hashcode, ptrref_hashequals); #define WT_EDGE_CLOSED_NULL 0 #define WT_EDGE_CLOSED_MASK 1 #define WT_EDGE_CLOSED_LESS 2 #define WT_EDGE_CLOSED_HARD 3 typedef struct { uint64_t node1:45, dir1:1, dir2:1, status:1, closed:2, flag:2, cov:12; uint64_t node2:45; int64_t off:19; } edge_t; define_list(edgev, edge_t); static inline uint64_t _edge_hashcode(edge_t e){ const uint64_t m = 0xc6a4a7935bd1e995LLU; const int r = 47; uint64_t h = 1023 ^ (16 * m); uint64_t k = (e.node1 << 1) | e.dir1; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; k = (e.node2 << 1) | e.dir2; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; h ^= h >> r; h *= m; h ^= h >> r; return h; } #define EDGEHASH(idx) ((edgev *)set->userdata)->buffer[idx] #define edge_hashcode(E) _edge_hashcode(EDGEHASH(E)) #define edge_hashequals(E1, E2) (EDGEHASH(E1).node1 == EDGEHASH(E2).node1 && EDGEHASH(E1).node2 == EDGEHASH(E2).node2 \ && EDGEHASH(E1).dir1 == EDGEHASH(E2).dir1 && EDGEHASH(E1).dir2 == EDGEHASH(E2).dir2) define_hashset(edgehash, uint64_t, edge_hashcode, edge_hashequals); typedef struct { uint64_t idx:63, flg:1; uint64_t next; } edge_ref_t; static const edge_ref_t EDGE_REF_NULL = (edge_ref_t){0x7FFFFFFFFFFFFFFLLU, 1, 0}; define_list(edgerefv, edge_ref_t); #define MAX_REP_IDX MAX_U4 typedef struct { u8i rep_idx:32; u4i unvisit:16, cov:16; u8i closed:1, single_in:1, bt_visit:45, rep_dir:1, bt_idx:16, init_end:1; vec_ref_t regs; ptr_ref_t edges[2]; } node_t; define_list(nodev, node_t); typedef struct { u8i idx:39, flg:1, cnt:24; } hit_lnk_t; define_list(hitlnkv, hit_lnk_t); typedef struct { rd_frg_t frgs[2]; hit_lnk_t lnks[2]; // idx+flg: link, cnt[0]: mat, cnt[1]:cglen } rd_hit_t; define_list(rdhitv, rd_hit_t); typedef struct { u8i visit:63, flag:1; hit_lnk_t hits; // point to the g->rdhits int clps[2]; ptr_ref_t regs; //u2i corr_bincnt; } read_t; define_list(readv, read_t); typedef struct { u8i node; //node_t *n; edge_ref_t edges[2]; u4i dir:2, cov:30; int off; } trace_t; define_list(tracev, trace_t); #define WT_TRACE_MSG_ZERO 0 #define WT_TRACE_MSG_ONE 1 #define WT_TRACE_MSG_MORE 2 #define WT_TRACE_MSG_VISITED 3 #define WT_TRACE_MSG_UNDEF 4 typedef struct { union { struct { u4i frg1:31, dir1:1; u4i frg2:31, dir2:1; }; u8i key; }; u4i cov:13, flag:2, tidx1:8, tidx2:8, weak:1, closed:2; b4i off; } lnk_t; define_list(lnkv, lnk_t); #define lnk_hashcode(E) (E).key #define lnk_hashequals(E1, E2) (E1).key == (E2).key define_hashset(lnkhash, lnk_t, lnk_hashcode, lnk_hashequals); typedef struct { u8i toff:46, tcnt:18; // extended traces and core traces u4i tx, ty; // core traces ptr_ref_t lnks[2]; u4i len, length; u8i rep_idx:48, unvisit:16; u8i closed:1, single_in:1, bt_visit:46, rep_dir:1, bt_idx:16; } frg_t; define_list(frgv, frg_t); typedef struct { edge_ref_t lnks[2]; u4i frg; u4i dir:2, cov:30; int off; u4i tx, ty; } path_t; define_list(pathv, path_t); #define WT_PATH_MSG_ZERO 0 #define WT_PATH_MSG_ONE 1 #define WT_PATH_MSG_MORE 2 #define WT_PATH_MSG_VISITED 3 #define WT_PATH_MSG_UNDEF 4 typedef struct { u8i node1:63, dir1:1; u8i node2:63, dir2:1; b8i off:42, len:22; } seqlet_t; define_list(seqletv, seqlet_t); typedef struct { KBM *kbm; KBMPar *par, *rpar; regv *regs; readv *reads; rdhitv *rdhits; BitsVec *cigars; nodev *nodes; edgev *edges; edgehash *ehash; edgerefv *erefs; frgv *frgs; lnkv *lnks; edgerefv *lrefs; tracev *traces; u8i genome_size; u4i num_index; //u4i corr_mode, corr_min, corr_max; //u4i corr_bsize, corr_bstep; //float corr_cov, corr_gcov; int node_order, mem_stingy; u4i n_fix, only_fix; // first n sequences are accurate contigs; only_fix means whether to include other pacbio sequenes u4i reglen, regovl, bestn; int min_node_mats; int max_overhang, chainning_hits, uniq_hit; float node_max_conflict; // 0.25 float node_merge_cutoff; uint32_t max_node_cov, min_node_cov, exp_node_cov, min_edge_cov, max_edge_span; u4i max_node_cov_sg, max_sg_end; int cut_tip; int store_low_cov_edge; int rep_filter, rep_detach; uint32_t bub_step, tip_step, rep_step; int min_ctg_len, min_ctg_nds, minimal_output; vplist *utgs; u4i major_nctg; vplist *ctgs; } Graph; static const char *colors[2][2] = {{"blue", "green"}, {"red", "gray"}}; static inline Graph* init_graph(KBM *kbm){ Graph *g; u4i rid; g = malloc(sizeof(Graph)); g->kbm = kbm; g->par = kbm->par; g->rpar = NULL; g->regs = init_regv(32); g->reads = init_readv(kbm->reads->size); g->reads->size = kbm->reads->size; for(rid=0;rid<g->reads->size;rid++){ g->reads->buffer[rid].clps[0] = 0; g->reads->buffer[rid].clps[1] = g->kbm->reads->buffer[rid].bincnt; } g->nodes = init_nodev(32); g->rdhits = init_rdhitv(1024); g->cigars = init_bitsvec(1024, 3); g->edges = init_edgev(32); g->ehash = init_edgehash(1023); set_userdata_edgehash(g->ehash, g->edges); g->erefs = init_edgerefv(32); g->frgs = init_frgv(32); g->lnks = init_lnkv(32); g->lrefs = init_edgerefv(32); g->traces = init_tracev(32); g->genome_size = 1024 * 1024 * 1024LLU; g->num_index = 1; //g->corr_mode = 0; //g->corr_min = 5; //g->corr_max = 10; //g->corr_cov = 0.75; //g->corr_gcov = 5.0; //g->corr_bsize = 2048; //g->corr_bstep = 2048 - 512; g->node_order = 0; g->n_fix = 0; g->only_fix = 0; g->reglen = 1024; g->regovl = 256; g->node_max_conflict = 0.25; g->node_merge_cutoff = 0.8; g->max_overhang = -1; g->bestn = 0; g->min_node_mats = 1; g->chainning_hits = 0; g->uniq_hit = 0; g->min_node_cov = 4; g->max_node_cov = 60; g->exp_node_cov = 40; // expected node cov g->min_edge_cov = 4; g->max_edge_span = 1024; // 1024 * 256 = 256 kb g->max_node_cov_sg = 2; g->max_sg_end = 5; g->store_low_cov_edge = 1; g->rep_filter = 1; g->rep_detach = 1; g->bub_step = 10; g->tip_step = 5; g->rep_step = 20; g->cut_tip = 1; g->min_ctg_len = 10000; g->min_ctg_nds = 5; g->minimal_output = 0; g->utgs = init_vplist(32); g->ctgs = init_vplist(32); g->major_nctg = 0; return g; } static inline void free_graph(Graph *g){ u8i i; free_regv(g->regs); free_readv(g->reads); free_nodev(g->nodes); free_rdhitv(g->rdhits); free_bitsvec(g->cigars); free_edgev(g->edges); free_edgehash(g->ehash); free_edgerefv(g->erefs); free_frgv(g->frgs); free_lnkv(g->lnks); free_edgerefv(g->lrefs); free_tracev(g->traces); for(i=0;i<g->utgs->size;i++) free_tracev(g->utgs->buffer[i]); free_vplist(g->utgs); for(i=0;i<g->ctgs->size;i++) free_tracev(g->ctgs->buffer[i]); free_vplist(g->ctgs); free(g); } #define KBM_MAP2RDHIT_QUICK static inline int map2rdhits_graph(Graph *g, kbm_map_t *hit){ u4i k, f, d, p, n, add; rd_hit_t *rh, *hp, *hn; read_t *rd; if(hit->mat == 0) return 0; rh = next_ref_rdhitv(g->rdhits); rh->frgs[0] = (rd_frg_t){hit->qidx, hit->qdir, hit->qb, hit->qe, 0}; rh->frgs[1] = (rd_frg_t){hit->tidx, hit->tdir, hit->tb, hit->te, 0}; rh->lnks[0].cnt = hit->mat; rh->lnks[1].cnt = hit->cglen; add = 0; for(k=0;k<2;k++){ rd = ref_readv(g->reads, rh->frgs[k].rid); #ifdef KBM_MAP2RDHIT_QUICK { // Just add it rh->lnks[k].idx = rd->hits.idx; rh->lnks[k].flg = rd->hits.flg; rd->hits.idx = offset_rdhitv(g->rdhits, rh); rd->hits.flg = k; rd->hits.cnt ++; add ++; continue; } #endif hn = ref_rdhitv(g->rdhits, rd->hits.idx); f = rd->hits.flg; hp = NULL; p = 0; n = 1; while(1){ if(hn->lnks[0].cnt <= rh->lnks[0].cnt){ // hit->mat break; } p = f; hp = hn; if(hn->lnks[f].idx == 0) break; f = hn->lnks[p].flg; hn = ref_rdhitv(g->rdhits, hn->lnks[p].idx); n ++; } if(g->bestn && n > g->bestn){ continue; } if(hp == NULL){ rh->lnks[k].idx = rd->hits.idx; rh->lnks[k].flg = rd->hits.flg; rd->hits.idx = offset_rdhitv(g->rdhits, rh); rd->hits.flg = k; } else { rh->lnks[k].idx = hp->lnks[p].idx; rh->lnks[k].flg = hp->lnks[p].flg; hp->lnks[p].idx = offset_rdhitv(g->rdhits, rh); hp->lnks[p].flg = k; } rd->hits.cnt ++; if(g->bestn && rd->hits.cnt > g->bestn){ hn = rh; p = k; while(n < g->bestn){ f = hn->lnks[p].flg; hn = ref_rdhitv(g->rdhits, hn->lnks[p].idx); p = f; n ++; } hp = hn; f = p; while(hn->lnks[f].idx){ d = hn->lnks[f].flg; hn = ref_rdhitv(g->rdhits, hn->lnks[f].idx); hn->frgs[f].closed = 1; f = d; } hp->lnks[p].idx = 0; hp->lnks[p].flg = 0; rd->hits.cnt = n; } add ++; } if(add == 0){ trunc_rdhitv(g->rdhits, 1); g->cigars->size -= hit->cglen; } return add; } static inline int is_dovetail_overlap(Graph *g, kbm_map_t *hit){ read_t *q, *t; int overhangs[2][2]; q = ref_readv(g->reads, hit->qidx); t = ref_readv(g->reads, hit->tidx); overhangs[1][0] = hit->tb - t->clps[0]; overhangs[1][1] = (t->clps[1] - t->clps[0]) - hit->te; if(hit->qdir){ overhangs[0][0] = (q->clps[1] - q->clps[0]) - hit->qe; overhangs[0][1] = hit->qb; } else { overhangs[0][0] = hit->qb; overhangs[0][1] = (q->clps[1] - q->clps[0]) - hit->qe; } if(overhangs[0][0] > g->max_overhang && overhangs[1][0] > g->max_overhang){ return 0; } if(overhangs[0][1] > g->max_overhang && overhangs[1][1] > g->max_overhang){ return 0; } return 1; } // qlen is measured by KBM_BIN_SIZE static inline int hit2rdregs_graph(Graph *g, rdregv *regs, int qlen, kbm_map_t *hit, BitsVec *cigars, u4v *maps[3]){ KBM *kbm; u8i ndoff; u4i bpos[2][2], npos[2][2], clen, ndbeg, qn, j, qbincnt; int tmp, bt, tlen, x, y, mat, beg, end, min_node_len, max_node_len; int mask, closed; kbm = g->kbm; mask = 0; if(g->max_overhang >= 0){ if(!is_dovetail_overlap(g, hit)){ mask = 1; } } qn = g->reglen; min_node_len = (qn - 1); max_node_len = (qn + 1); // translate into reverse sequence order if(qlen == 0){ qbincnt = kbm->reads->buffer[hit->qidx].bincnt; qlen = qbincnt; } else if(qlen > kbm->reads->buffer[hit->qidx].bincnt){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort(); } else { qbincnt = qlen; } tlen = kbm->reads->buffer[hit->tidx].bincnt; if(hit->qdir){ tmp = qlen - hit->qb; hit->qb = qlen - hit->qe; hit->qe = tmp; } { clear_u4v(maps[0]); clear_u4v(maps[1]); clear_u4v(maps[2]); clen = hit->cglen; x = -1; y = -1; while(clen){ bt = get_bitsvec(cigars, hit->cgoff + clen - 1); push_u4v(maps[2], !(bt >> 2)); bt = bt & 0x03; switch(bt){ case 0: x ++; y ++; break; case 1: x ++; break; case 2: y ++; break; } push_u4v(maps[0], (x < 0)? 0 : x); push_u4v(maps[1], (y < 0)? 0 : y); clen --; } push_u4v(maps[0], x + 1); push_u4v(maps[1], y + 1); push_u4v(maps[2], 0); #if DEBUG if(x + 1 + hit->qb != hit->qe || y + 1 + hit->tb != hit->te){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); print_hit_kbm(g->kbm, g->kbm->reads->buffer[hit->qidx].tag, g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE, hit, cigars, NULL, stderr); abort(); } #endif } bpos[0][0] = hit->qb; bpos[0][1] = hit->qe; bpos[1][0] = hit->tb; bpos[1][1] = hit->te; #if 0 fprintf(stdout, "BPOS\t%d\t%d\t%d\t%d\n", bpos[0][0], bpos[0][1], bpos[1][0], bpos[1][1]); for(j=0;j<maps[0]->size;j++){ fprintf(stdout, "%d,%d\t", maps[0]->buffer[j], maps[1]->buffer[j]); if((j % 10) == 9) fprintf(stdout, "\n"); } fprintf(stdout, "\n"); #endif { ndoff = kbm->reads->buffer[hit->qidx].binoff; if(hit->qdir){ ndbeg = qbincnt - bpos[0][0]; ndbeg = ndbeg % qn; ndoff = ndoff + ((qbincnt - (ndbeg + bpos[0][0])) / qn) - 1; } else { ndbeg = (bpos[0][0] % qn)? qn - (bpos[0][0] % qn) : 0; ndoff = ndoff + ((ndbeg + bpos[0][0]) / qn); } x = 0; for(j=ndbeg+bpos[0][0];j+qn<=bpos[0][1];j+=qn){ mat = 0; while(maps[0]->buffer[x] < j - bpos[0][0]){ x ++; } npos[0][0] = maps[1]->buffer[x]; while(maps[0]->buffer[x] == j - bpos[0][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[0][1] = maps[1]->buffer[x]; while(maps[0]->buffer[x] < j + qn - 1 - bpos[0][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[1][0] = maps[1]->buffer[x]; while(maps[0]->buffer[x + 1] == j + qn - 1 - bpos[0][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[1][1] = maps[1]->buffer[x]; // TODO: arbitrarily use outer boundary as matched region, need to test its effect // TODO: whether to remove regs outside clps if(mat >= g->min_node_mats){ beg = (npos[0][0] + bpos[1][0]); end = (npos[1][1] + bpos[1][0] + 1); if(end - beg >= min_node_len && end - beg <= max_node_len){ closed = 0; } else { closed = 1; } #if DEBUG if(beg >= end || beg < 0){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort(); } #endif push_rdregv(regs, (rd_reg_t){ndoff, hit->tidx, hit->qdir, beg, end, mask | closed}); } if(hit->qdir){ ndoff --; } else { ndoff ++; } #if 0 rd_reg_t *rg = ref_rdregv(regs, regs->size - 1); fprintf(stdout, "NPOS\tX=%d,%d\tY=%d,%d\t%d\t%d\t%d\t%d\n", j, j - bpos[0][0], j + qn - 1, j + qn - 1 - bpos[0][0], npos[0][0], npos[0][1], npos[1][0], npos[1][1]); fprintf(stdout, "REG\t%llu\t%s\t%c\t%d\t%d\t%d\n", rg->node, kbm->reads->buffer[rg->rid].tag, "+-"[rg->dir], rg->beg, rg->end, rg->end - rg->beg); #endif } } //if(!g->corr_mode){ { ndoff = kbm->reads->buffer[hit->tidx].binoff; ndbeg = (bpos[1][0] % qn)? qn - (bpos[1][0] % qn): 0; ndoff = ndoff + ((ndbeg + bpos[1][0]) / qn); x = 0; for(j=ndbeg+bpos[1][0];j+qn<=bpos[1][1];j+=qn){ mat = 0; while(maps[1]->buffer[x] < j - bpos[1][0]){ x ++; } npos[0][0] = maps[0]->buffer[x]; while(maps[1]->buffer[x] == j - bpos[1][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[0][1] = maps[0]->buffer[x]; while(maps[1]->buffer[x] < j + qn - 1 - bpos[1][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[1][0] = maps[0]->buffer[x]; while(maps[1]->buffer[x + 1] == j + qn - 1 - bpos[1][0]){ if(maps[2]->buffer[x]) mat ++; x ++; } npos[1][1] = maps[0]->buffer[x]; // TODO: arbitrarily use outer boundary as matched region, need to test its effect if(mat >= g->min_node_mats){ if(hit->qdir){ beg = qlen - (npos[1][1] + bpos[0][0] + 1); end = qlen - (npos[0][0] + bpos[0][0]); } else { beg = (npos[0][0] + bpos[0][0]); end = (npos[1][1] + bpos[0][0] + 1); } if(end - beg >= min_node_len && end - beg <= max_node_len){ closed = 0; } else { closed = 1; } #if DEBUG if(beg >= end || beg < 0){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort(); } #endif push_rdregv(regs, (rd_reg_t){ndoff, hit->qidx, hit->qdir, beg, end, mask | closed}); #if 0 fprintf(stdout, "NPOS\tX=%d,%d\tY=%d,%d\t%d\t%d\t%d\t%d\n", j, j - bpos[1][0], j + qn - 1, j + qn - 1 - bpos[1][0], npos[0][0], npos[0][1], npos[1][0], npos[1][1]); rd_reg_t *rg = ref_rdregv(regs, regs->size - 1); fprintf(stdout, "REG\t%llu\t%s\t%c\t%d\t%d\t%d\n", rg->node, kbm->reads->buffer[rg->rid].tag, "+-"[rg->dir], rg->beg, rg->end, rg->end - rg->beg); #endif } ndoff ++; } } if(hit->qdir){ tmp = qlen - hit->qb; hit->qb = qlen - hit->qe; hit->qe = tmp; } return !mask; } thread_beg_def(mdbg); Graph *g; KBMAux *aux, *raux; CTGCNS *cc; reg_t reg; rdregv *regs; BitVec *rdflags; u4i beg, end; int raw; FILE *alno; int task; thread_end_def(mdbg); thread_beg_func(mdbg); Graph *g; KBM *kbm; KBMAux *aux, *raux; rdregv *regs; BitVec *rdflags; u4v *maps[3], *tidxs; volatile reg_t *reg; g = mdbg->g; kbm = g->kbm; reg = (reg_t*)&mdbg->reg; aux = mdbg->aux; raux = mdbg->raux; regs = mdbg->regs; rdflags = mdbg->rdflags; maps[0] = init_u4v(32); maps[1] = init_u4v(32); maps[2] = init_u4v(32); tidxs = init_u4v(16); thread_beg_loop(mdbg); if(mdbg->task == 1){ if(reg->closed) continue; //if(g->corr_mode){ if(0){ } else { query_index_kbm(aux, NULL, reg->rid, kbm->rdseqs, (kbm->reads->buffer[reg->rid].seqoff + UInt(reg->beg)) * KBM_BIN_SIZE, UInt(reg->end - reg->beg) * KBM_BIN_SIZE); map_kbm(aux); } if(raux && aux->hits->size){ // refine kbm_read_t *rd; u4i i, j, tidx; clear_kbm(raux->kbm); bitpush_kbm(raux->kbm, NULL, 0, kbm->rdseqs->bits, (kbm->reads->buffer[reg->rid].seqoff + UInt(reg->beg)) * KBM_BIN_SIZE, UInt(reg->end - reg->beg) * KBM_BIN_SIZE); ready_kbm(raux->kbm); simple_index_kbm(raux->kbm, 0, raux->kbm->bins->size); clear_u4v(tidxs); sort_array(aux->hits->buffer, aux->hits->size, kbm_map_t, num_cmpgt(a.tidx, b.tidx)); for(i=0;i<aux->hits->size;i++){ if(i && tidxs->buffer[tidxs->size - 1] == aux->hits->buffer[i].tidx) continue; push_u4v(tidxs, aux->hits->buffer[i].tidx); } clear_kbmmapv(aux->hits); clear_bitsvec(aux->cigars); for(i=0;i<tidxs->size;i++){ tidx = get_u4v(tidxs, i); rd = ref_kbmreadv(aux->kbm->reads, tidx); query_index_kbm(raux, rd->tag, tidx, aux->kbm->rdseqs, rd->seqoff * KBM_BIN_SIZE, rd->bincnt * KBM_BIN_SIZE); map_kbm(raux); for(j=0;j<raux->hits->size;j++){ flip_hit_kbmaux(aux, raux, j); } } } sort_array(aux->hits->buffer, aux->hits->size, kbm_map_t, num_cmpgt(b.mat, a.mat)); } thread_end_loop(mdbg); free_u4v(maps[0]); free_u4v(maps[1]); free_u4v(maps[2]); free_u4v(tidxs); thread_end_func(mdbg); typedef struct { int pos:19; u4i dir:1, spur:1, dep:11; } rd_clp_t; define_list(rdclpv, rd_clp_t); static inline void clip_read_algo(int clps[2], rdclpv *brks, rdclpv *chis, int avg_dep, int min_dep){ rd_clp_t *c; u4i i; int dep, x, y, max, mx, my; sort_array(brks->buffer, brks->size, rd_clp_t, num_cmpgt((a.pos << 1) | a.dir, (b.pos << 1) | b.dir)); x = y = 0; max = 0; dep = 0; mx = my = 0; for(i=0;i<brks->size;i++){ c = ref_rdclpv(brks, i); if(dep >= min_dep){ y = c->pos; if(y - x > max){ max = y - x; mx = x; my = y; } } if(c->dir){ // end of overlap c->dep = dep; dep --; } else { dep ++; c->dep = dep; if(dep == min_dep){ x = c->pos; } } if(c->spur){ push_rdclpv(chis, (rd_clp_t){c->pos - KBM_BIN_SIZE, 0, 0, c->dep}); push_rdclpv(chis, (rd_clp_t){c->pos - 1, 1, 0, c->dep}); push_rdclpv(chis, (rd_clp_t){c->pos, 0, 1, c->dep}); push_rdclpv(chis, (rd_clp_t){c->pos + KBM_BIN_SIZE, 1, 0, c->dep}); } } clps[0] = mx / KBM_BIN_SIZE; clps[1] = my / KBM_BIN_SIZE; if((int)chis->size < avg_dep){ return; } sort_array(chis->buffer, chis->size, rd_clp_t, num_cmpgtx(a.pos, b.pos, b.dir, a.dir)); dep = 0; max = 0; mx = 0; for(i=0;i<chis->size;i++){ c = ref_rdclpv(chis, i); if(c->dir){ if(c->spur){ if(dep >= max){ max = dep; mx = i; } } dep --; } else { dep ++; if(c->spur){ if(dep >= max){ max = dep; mx = i; } } } } if(max * 2 < avg_dep){ return; } c = ref_rdclpv(chis, mx); if(c->dep >= avg_dep){ return; } if(2 * c->dep > max + 1){ return; } if(c->pos <= clps[0] * KBM_BIN_SIZE || c->pos >= clps[1] * KBM_BIN_SIZE){ return; } if(c->pos - clps[0] > clps[1] - c->pos){ clps[1] = (c->pos + 1) / KBM_BIN_SIZE; } else { clps[0] = (c->pos + 1) / KBM_BIN_SIZE; } } static inline void clip_read_core(Graph *g, u4i rid, hitlnkv *lnks, rdclpv *brks, rdclpv *chis){ read_t *rd; hit_lnk_t *lnk; rd_hit_t *hit; rd_frg_t *f1, *f2; u4i i; int rlen, dlen, margin, min_dep, dep, avg, x, y; rd = ref_readv(g->reads, rid); rlen = g->kbm->reads->buffer[rid].bincnt; if(rlen == 0){ return; } lnk = &rd->hits; clear_rdclpv(brks); margin = g->max_overhang > 0? g->max_overhang : 4; min_dep = 2; dep = 0; clear_hitlnkv(lnks); while(lnk->idx){ hit = ref_rdhitv(g->rdhits, lnk->idx); f1 = hit->frgs + lnk->flg; if(f1->closed == 0) push_hitlnkv(lnks, *lnk); lnk = hit->lnks + lnk->flg; } for(i=0;i<lnks->size;i++){ lnk = ref_hitlnkv(lnks, i); hit = ref_rdhitv(g->rdhits, lnk->idx); f1 = hit->frgs + lnk->flg; if(f1->closed == 0){ // whether be contained if(f1->beg <= margin && f1->end + margin >= rlen){ rd->clps[0] = 0; rd->clps[1] = rlen; return; } f2 = hit->frgs + !lnk->flg; dlen = g->kbm->reads->buffer[f2->rid].bincnt; if(f1->dir ^ f2->dir){ x = (f1->beg > margin && f2->end + margin < dlen); y = (f1->end + margin < rlen && f2->beg > margin); } else { x = (f1->beg > margin && f2->beg > margin); y = (f1->end + margin < rlen && f2->end + margin < dlen); } if(x && y){ //push_rdclpv(brks, (rd_clp_t){f1->beg, 0, 1, 0}); //push_rdclpv(brks, (rd_clp_t){f1->end, 1, 1, 0}); } else if(x){ push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 1, 0}); push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 0, 0}); } else if(y){ push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 0, 0}); push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 1, 0}); } else { dep += f1->end - f1->beg; push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 0, 0}); push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 0, 0}); } } } clear_rdclpv(chis); avg = (dep + rlen - 1) / rlen; clip_read_algo(rd->clps, brks, chis, avg, min_dep); } thread_beg_def(mclp); Graph *g; int task; thread_end_def(mclp); thread_beg_func(mclp); Graph *g; rdclpv *brks, *chis; hitlnkv *lnks; u4i rid; g = mclp->g; brks = init_rdclpv(32); chis = init_rdclpv(32); lnks = init_hitlnkv(32); thread_beg_loop(mclp); if(mclp->task == 1){ for(rid=mclp->t_idx;rid<mclp->g->reads->size;rid+=mclp->n_cpu){ clip_read_core(mclp->g, rid, lnks, brks, chis); } } else if(mclp->task == 2){ hitlnkv *lnks, *chks; read_t *rd; hit_lnk_t *lnk; u4i i; lnks = init_hitlnkv(1024); chks = init_hitlnkv(1024); for(rid=mclp->t_idx;rid<mclp->g->reads->size;rid+=mclp->n_cpu){ rd = ref_readv(g->reads, rid); if(rd->hits.idx == 0) continue; clear_hitlnkv(lnks); lnk = &rd->hits; while(1){ if(lnk->idx == 0) break; push_hitlnkv(lnks, *lnk); lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg; } if(lnks->size < 2) continue; sort_array(lnks->buffer, lnks->size, hit_lnk_t, num_cmpgt(g->rdhits->buffer[b.idx].lnks[0].cnt, g->rdhits->buffer[a.idx].lnks[0].cnt)); lnk = &rd->hits; for(i=0;i<lnks->size;i++){ lnk->idx = lnks->buffer[i].idx; lnk->flg = lnks->buffer[i].flg; lnk = g->rdhits->buffer[lnks->buffer[i].idx].lnks + lnks->buffer[i].flg; } lnk->idx = 0; lnk->flg = 0; { clear_hitlnkv(chks); lnk = &rd->hits; while(1){ if(lnk->idx == 0) break; push_hitlnkv(chks, *lnk); lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg; } if(lnks->size != chks->size){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort(); } for(i=0;i<lnks->size;i++){ if(lnks->buffer[i].idx != chks->buffer[i].idx || lnks->buffer[i].flg != chks->buffer[i].flg){ fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr); abort(); } } } if(g->bestn && lnks->size > mclp->g->bestn){ lnk = lnks->buffer + g->bestn - 1; lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg; lnk->idx = 0; lnk->flg = 0; for(i=g->bestn;i<lnks->size;i++){ lnk = lnks->buffer + i; g->rdhits->buffer[lnk->idx].frgs[lnk->flg].closed = 1; } rd->hits.cnt = g->bestn; } } free_hitlnkv(lnks); free_hitlnkv(chks); } thread_end_loop(mclp); free_rdclpv(brks); free_rdclpv(chis); free_hitlnkv(lnks); thread_end_func(mclp); static inline u8i check_read_reg_conflict_core(Graph *g, rd_reg_t *hit, int *conflict){ read_t *rd; reg_t *reg; u8i idx, pre; int x, y; rd = ref_readv(g->reads, hit->rid); idx = rd->regs.idx; pre = 0; *conflict = 0; while(idx){ reg = ref_regv(g->regs, idx); if(hit->beg >= reg->beg) pre = idx; idx = reg->read_link; if(hit->end <= reg->beg) break; if(hit->beg >= reg->end) continue; if(reg->closed) continue; if(hit->beg <= reg->beg){ x = reg->beg; if(hit->end >= reg->end){ y = reg->end; } else y = hit->end; } else { x = hit->beg; if(hit->end <= reg->end){ y = hit->end; } else y = reg->end; } if(x < y){ if(x + (int)g->regovl < y) *conflict = 1; } } return pre; } thread_beg_def(mupd); Graph *g; rdregv *regs; rnk_ref_t *nd; u8v *ins; u8i vt; thread_end_def(mupd); thread_beg_func(mupd); Graph *g; rdregv *regs; rd_reg_t *hit; u8i pre; u4i i; int conflict; g = mupd->g; regs = mupd->regs; thread_beg_loop(mupd); if(mupd->nd == NULL) continue; clear_u8v(mupd->ins); if(1){ for(i=1;i<mupd->nd->cnt;i++){ if(regs->buffer[mupd->nd->idx + i].rid == regs->buffer[mupd->nd->idx + i - 1].rid){ regs->buffer[mupd->nd->idx + i].closed = 1; regs->buffer[mupd->nd->idx + i - 1].closed = 1; } } } for(i=0;i<mupd->nd->cnt;i++){ hit = ref_rdregv(regs, mupd->nd->idx + i); conflict = 0; pre = check_read_reg_conflict_core(g, hit, &conflict); if(conflict){ hit->closed = 1; } push_u8v(mupd->ins, pre); } thread_end_loop(mupd); thread_end_func(mupd); //TODO: reg_t to store regs, sort them by rank, and sort them by rid in another u4i/u8i array //TODO: fast generate nodes and read_link, then select important intervals static inline void mul_update_regs_graph(Graph *g, rdregv *regs, rnkrefv *nds, u4i ncpu, u8i upds[3]){ u8i idx, vt; u4i i, j, pass; read_t *rd; node_t *n; reg_t *reg, *r; rd_reg_t *hit; int conflict, closed; thread_prepare(mupd); for(i=0;i<g->reads->size;i++){ g->reads->buffer[i].visit = 0; // in which round did the read be touched } encap_regv(g->regs, regs->size + 1); // make sure next_ref_regv in this function is thread-safe thread_beg_init(mupd, ncpu); mupd->g = g; mupd->regs = regs; mupd->nd = NULL; mupd->ins = init_u8v(64); mupd->vt = 0; thread_end_init(mupd); vt = 1; upds[0] = upds[1] = upds[2] = 0; for(idx=0;idx<nds->size+ncpu;idx++){ thread_wait_next(mupd); if(mupd->nd){ pass = 0; for(j=0;j<mupd->nd->cnt;j++){ hit = ref_rdregv(regs, mupd->nd->idx + j); rd = ref_readv(g->reads, hit->rid); if(rd->visit >= mupd->vt){ // previous node had changed the layout of this read, need to check conflict again mupd->ins->buffer[j] = check_read_reg_conflict_core(g, hit, &conflict); if(conflict){ hit->closed = 1; } } rd->visit = idx; if(hit->closed == 0) pass ++; } do { closed = 0; if(0 && mupd->nd->cnt > g->max_node_cov){ // Repetitive nodes should be in graph to avoid mis-assembling upds[1] ++; closed = 1; continue; } if(mupd->nd->fix){ if(pass == 0){ upds[1] ++; closed = 1; continue; } else { upds[0] ++; } } else { if(pass < g->min_node_cov || pass < (u4i)(mupd->nd->cnt * (1 - g->node_max_conflict))){ upds[1] ++; closed = 1; continue; } else { upds[0] ++; } } n = next_ref_nodev(g->nodes); n->rep_idx = MAX_REP_IDX; n->unvisit = 0; n->closed = 0; n->single_in = 0; n->bt_visit = 0; n->bt_idx = 0; n->init_end = 0; n->regs.idx = g->regs->size; n->regs.cnt = 0; n->cov = 0; n->edges[0] = PTR_REF_NULL; n->edges[1] = PTR_REF_NULL; for(j=0;j<mupd->nd->cnt;j++){ hit = ref_rdregv(regs, mupd->nd->idx + j); rd = ref_readv(g->reads, hit->rid); n->regs.cnt ++; reg = next_ref_regv(g->regs); // Now, it is thread-safe reg->node = g->nodes->size - 1; reg->rid = hit->rid; reg->dir = hit->dir; reg->beg = hit->beg; reg->end = hit->end; reg->closed = hit->closed | closed; if(reg->closed == 0) n->cov ++; reg->read_link = 0; if(mupd->ins->buffer[j]){ r = ref_regv(g->regs, mupd->ins->buffer[j]); reg->read_link = r->read_link; r->read_link = g->regs->size - 1; } else { reg->read_link = rd->regs.idx; rd->regs.idx = g->regs->size - 1; } rd->regs.cnt ++; } } while(0); } if(idx < nds->size){ mupd->nd = ref_rnkrefv(nds, idx); mupd->vt = idx; thread_wake(mupd); } else { mupd->nd = NULL; } } thread_beg_close(mupd); free_u8v(mupd->ins); thread_end_close(mupd); } static inline u8i chainning_hits_core(kbmmapv *hits, BitsVec *cigars, int uniq_hit, float aln_var){ kbm_map_t HIT; u8i idx, bst, ite, lst, lsx, nrm; u4i state; sort_array(hits->buffer, hits->size, kbm_map_t, num_cmpgtxx(a.qidx, b.qidx, a.tidx, b.tidx, a.qdir, b.qdir)); nrm = 0; for(idx=lst=lsx=0;idx<=hits->size;idx++){ state = 0; if(idx == hits->size){ state = 1; } else if(hits->buffer[lst].qidx == hits->buffer[idx].qidx && hits->buffer[lst].tidx == hits->buffer[idx].tidx){ if(hits->buffer[lst].qdir == hits->buffer[idx].qdir){ state = 0; } else { state = 2; } } else { state = 1; } if(state){ if(idx > lst + 1){ if(simple_chain_all_maps_kbm(hits->buffer + lst, idx - lst, cigars, &HIT, cigars, aln_var)){ hits->buffer[lst++] = HIT; while(lst < idx){ hits->buffer[lst++].mat = 0; // closed = 1 nrm ++; } } } if(state == 1){ // Choose the best hit if(uniq_hit && idx > lsx + 1){ bst = lsx; for(bst=lsx;bst<idx;bst++){ if(hits->buffer[bst].mat) break; } for(ite=bst+1;ite<idx;ite++){ if(hits->buffer[ite].mat == 0) continue; if((hits->buffer[ite].aln > hits->buffer[bst].aln) || (hits->buffer[ite].aln == hits->buffer[bst].aln && hits->buffer[ite].mat > hits->buffer[bst].mat)){ bst = ite; } } for(ite=lsx;ite<bst;ite++){ hits->buffer[ite].mat = 0; } for(ite=bst+1;ite<idx;ite++){ hits->buffer[ite].mat = 0; } nrm += idx - lsx - 1; } lsx = idx; } lst = idx; } } return nrm; } static inline u8i load_alignments_core(Graph *g, FileReader *pws, int raw, rdregv *regs, u4v *maps[3]){ kbmmapv *hits; BitsVec *cigars; u4v *cgs; kbm_map_t *hit, HIT, *h; cuhash_t *cu; u8i nhit; u4i i, rid; int qlen, val, flg, nwarn, mwarn, ncol; char *cgstr, *qtag; mwarn = 20; nwarn = 0; cgs = init_u4v(4); hits = init_kbmmapv(32); cigars = g->chainning_hits? init_bitsvec(1024, 3) : g->cigars; memset(&HIT, 0, sizeof(kbm_map_t)); hit = &HIT; nhit = 0; while((ncol = readtable_filereader(pws)) != -1){ if((pws->n_line % 100000) == 0){ fprintf(KBM_LOGF, "\r%llu", pws->n_line); fflush(KBM_LOGF); } if(pws->line->buffer[0] == '#'){ //if(strncasecmp(pws->line->buffer, "#corr_mode=1", 12) == 0){ //g->corr_mode = 1; //} continue; } if(ncol < 15) continue; if((cu = get_cuhash(g->kbm->tag2idx, get_col_str(pws, 0))) == NULL){ if(nwarn < mwarn){ fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(pws, 0), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); nwarn ++; } continue; } else rid = cu->val; hit->qidx = rid; qtag = get_col_str(pws, 0); hit->qdir = (get_col_str(pws, 1)[0] == '-'); qlen = atoi(get_col_str(pws, 2)); //if(g->corr_mode){ if(0){ //g->reads->buffer[hit->qidx].corr_bincnt = qlen / KBM_BIN_SIZE; } else if(qlen != (int)g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE){ if(nwarn < mwarn){ fprintf(stderr, " -- inconsisitent read length \"%s\" %d != %d in %s -- %s:%d --\n", qtag, qlen, g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); nwarn ++; } } hit->qb = atoi(get_col_str(pws, 3)) / KBM_BIN_SIZE; hit->qe = atoi(get_col_str(pws, 4)) / KBM_BIN_SIZE; if((cu = get_cuhash(g->kbm->tag2idx, get_col_str(pws, 5))) == NULL){ if(nwarn < mwarn){ fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(pws, 5), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); nwarn ++; } continue; } else rid = cu->val; if(hit->qidx >= rid) continue; hit->tidx = rid; hit->tdir = 0; // col 6 always eq '+' // skil col 7 hit->tb = atoi(get_col_str(pws, 8)) / KBM_BIN_SIZE; hit->te = atoi(get_col_str(pws, 9)) / KBM_BIN_SIZE; // skip col 10-13 hit->mat = atoi(get_col_str(pws, 10)); if(hit->mat < g->par->min_mat) continue; hit->aln = atoi(get_col_str(pws, 11)) / KBM_BIN_SIZE; if(hit->aln < g->par->min_aln) continue; if(hit->mat < hit->aln * KBM_BIN_SIZE * g->par->min_sim) continue; if(num_diff(hit->qe - hit->qb, hit->te - hit->tb) > (int)num_max(g->par->aln_var * hit->aln, 1.0)) continue; hit->cnt = atoi(get_col_str(pws, 12)); hit->gap = atoi(get_col_str(pws, 13)); if(g->chainning_hits){ if(hits->size && peer_kbmmapv(hits)->qidx != hit->qidx){ chainning_hits_core(hits, cigars, g->uniq_hit, g->kbm->par->aln_var); for(i=0;i<hits->size;i++){ h = ref_kbmmapv(hits, i); if(h->mat == 0) continue; append_bitsvec(g->cigars, cigars, h->cgoff, h->cglen); h->cgoff = g->cigars->size - h->cglen; nhit ++; map2rdhits_graph(g, h); } clear_kbmmapv(hits); clear_bitsvec(cigars); } } hit->cgoff = cigars->size; clear_u4v(cgs); cgstr = get_col_str(pws, 14); if(cgstr[0] == '*'){ // no cigar continue; } val = 0; while(cgstr[0]){ if(cgstr[0] >= '0' && cgstr[0] <= '9'){ val = val * 10 + (cgstr[0] - '0'); } else { flg = -1; switch(cgstr[0]){ case 'M': flg = 0; break; case 'I': flg = 1; break; case 'D': flg = 2; break; case 'm': flg = 4; break; case 'i': flg = 5; break; case 'd': flg = 6; break; default: fprintf(stderr, " -- Bad cigar '%c' \"%s\" in LINE:%llu in %s -- %s:%d --\n", cgstr[0], get_col_str(pws, 14), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); exit(1); } if(val == 0) val = 1; push_u4v(cgs, (val << 8) | flg); val = 0; } cgstr ++; } for(i=cgs->size;i>0;i--){ val = cgs->buffer[i - 1] >> 8; flg = cgs->buffer[i - 1] & 0xFF; pushs_bitsvec(cigars, flg, val); } hit->cglen = cigars->size - hit->cgoff; if(raw){ hit2rdregs_graph(g, regs, qlen / KBM_BIN_SIZE, hit, cigars, maps); clear_bitsvec(cigars); } else if(g->chainning_hits){ push_kbmmapv(hits, *hit); } else { map2rdhits_graph(g, hit); } } if(g->chainning_hits){ if(hits->size){ chainning_hits_core(hits, cigars, g->uniq_hit, g->kbm->par->aln_var); for(i=0;i<hits->size;i++){ h = ref_kbmmapv(hits, i); if(h->mat == 0) continue; append_bitsvec(g->cigars, cigars, h->cgoff, h->cglen); h->cgoff = g->cigars->size - h->cglen; nhit ++; map2rdhits_graph(g, h); } clear_kbmmapv(hits); clear_bitsvec(cigars); } } fprintf(KBM_LOGF, "\r%llu lines, %llu hits\n", pws->n_line, nhit); print_proc_stat_info(g_mpi_comm_rank); //每个分布节点比对完成后消耗时间 ningwenmo@outlook.com free_kbmmapv(hits); if(g->chainning_hits) free_bitsvec(cigars); free_u4v(cgs); return nhit; } static inline u8i proc_alignments_core(Graph *g, int ncpu, int raw, rdregv *regs, u4v *maps[3], char *prefix, char *dump_kbm){ kbm_map_t *hit; kbm_read_t *pb; BitVec *rdflags; BufferedWriter *bw; FILE *alno, *kmlog; u8i nbp, mbp, nhit; u8i i, ib, ie, ic; u4i rid, qb, qe, ii, in; int reset_kbm, n_cpu; thread_prepare(mdbg); if(KBM_LOG) n_cpu = 1; else n_cpu = ncpu; ic = (g->kbm->bins->size + g->num_index - 1) / g->num_index; ie = 0; UNUSED(mbp); #if 0 if(g->corr_mode){ mbp = g->genome_size * g->corr_gcov; qb = qe = g->kbm->reads->size / 2; nbp = g->kbm->reads->buffer[qb].bincnt * KBM_BIN_SIZE; while(nbp < mbp && qb && qe + 1 < g->kbm->reads->size){ qb --; qe ++; nbp += g->kbm->reads->buffer[qb].bincnt * KBM_BIN_SIZE; nbp += g->kbm->reads->buffer[qe].bincnt * KBM_BIN_SIZE; } if(qe < g->kbm->reads->size) qe ++; fprintf(KBM_LOGF, "[%s] turn correct-mode on, reads[%u ~ %u = %u] (%llu bp), genome-size=%llu, corr-gcov=%0.2f, corr-dep=[%d,%d,%0.2f]\n", date(), qb, qe, qe - qb, nbp, g->genome_size, g->corr_gcov, g->corr_min, g->corr_max, g->corr_cov); fflush(KBM_LOGF); } else { #else { #endif qb = 0; qe = g->reads->size; } // alno = open_file_for_write(prefix, ".alignments.gz", 1); // 每个分布节点输出相应计算的alignments ningwenmo@outlook.com char* alignments_file_name[64]; sprintf(alignments_file_name, ".%u_%u.alignments", g_mpi_comm_rank, g_mpi_comm_size); alno = open_file_for_write(prefix, alignments_file_name, 1); // bw = zopen_bufferedwriter(alno, 1024 * 1024, ncpu, 0); #if 0 if(g->corr_mode){ beg_bufferedwriter(bw); fprintf(bw->out, "#corr_mode=1\n"); end_bufferedwriter(bw); } rdflags = (!g->corr_mode && g->par->skip_contained)? init_bitvec(g->kbm->reads->size) : NULL; in = g->corr_mode? 1 : g->num_index; #else rdflags = (g->par->skip_contained)? init_bitvec(g->kbm->reads->size) : NULL; in = g->num_index; #endif if(g->kbm->seeds->size){ reset_kbm = 0; if(in > 1){ fprintf(KBM_LOGF, " ** WARNNING: change number of kbm index to 1 **\n"); fflush(KBM_LOGF); in = 1; } } else { reset_kbm = 1; } //fix_node = 0; nhit = 0; for(ii=0;ii<in;ii++){ ib = ie; ie = num_min(ib + ic, g->kbm->bins->size); while(ie > 0 && ie < g->kbm->bins->size && g->kbm->bins->buffer[ie - 1].ridx == g->kbm->bins->buffer[ie].ridx) ie ++; qb = ib ? g->kbm->bins->buffer[ib - 1].ridx : 0; qe = ie ? g->kbm->bins->buffer[ie - 1].ridx : 0; nbp = ((u8i)(ie - ib)) * KBM_BIN_SIZE; //每个分布节点计算分别计算相应数据 ningwenmo@outlook.com if (g_mpi_comm_size >= 2 && ii != (g_mpi_comm_rank - 1)) continue; if(reset_kbm){ reset_index_kbm(g->kbm); fprintf(KBM_LOGF, "[%s] indexing bins[(%llu,%llu)/%llu] (%llu/%llu bp), %d threads\n", date(), ib, ie, (u8i)g->kbm->bins->size, nbp, (u8i)g->kbm->rdseqs->size, ncpu); fflush(KBM_LOGF); kmlog = (in > 1)? NULL : open_file_for_write(prefix, ".kmerdep", 1); index_kbm(g->kbm, ib, ie, ncpu, kmlog); if(kmlog){ fclose(kmlog); kmlog = NULL; } fprintf(KBM_LOGF, "[%s] Done\n", date()); fflush(KBM_LOGF); if(in == 1 && dump_kbm){ FILE *dump; fprintf(KBM_LOGF, "[%s] dump kbm-index to %s ...", date(), dump_kbm); fflush(KBM_LOGF); dump = open_file_for_write(dump_kbm, NULL, 1); mem_dump_obj_file(g->kbm, 1, &kbm_obj_desc, 1, 0, dump); fclose(dump); fprintf(KBM_LOGF, " Done\n"); fflush(KBM_LOGF); } if(in == 1){ u4i *deps; u8i hidx; kmlog = open_file_for_write(prefix, ".binkmer", 1); deps = calloc(KBM_BIN_SIZE + 1, 4); for(hidx=0;hidx<g->kbm->bins->size;hidx++){ deps[g->kbm->bins->buffer[hidx].degree] ++; } for(hidx=0;hidx<KBM_BIN_SIZE;hidx++){ fprintf(kmlog, "%u\n", deps[hidx]); } fclose(kmlog); free(deps); if(!g->minimal_output && 0){ kbm_bin_t *bn; kmlog = open_file_for_write(prefix, ".closed_bins", 1); for(hidx=0;hidx<g->kbm->bins->size;hidx++){ bn = ref_kbmbinv(g->kbm->bins, hidx); if(bn->closed){ fprintf(kmlog, "%s_F_%d_%d\t%d\n", g->kbm->reads->buffer[bn->ridx].tag, bn->off * KBM_BIN_SIZE, KBM_BIN_SIZE, bn->degree); } } fclose(kmlog); } } } { { thread_beg_init(mdbg, n_cpu); mdbg->g = g; memset((void*)&mdbg->reg, 0, sizeof(reg_t)); mdbg->reg.closed = 1; mdbg->aux = init_kbmaux(g->kbm); if(g->rpar){ mdbg->raux = init_kbmaux(init_kbm(g->rpar)); } else { mdbg->raux = NULL; } #if 0 if(g->corr_mode){ KBMBlock *kb; POGPar par; kb = init_kbmblock(g->corr_bsize, g->corr_bstep); //mdbg->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, 1, g->corr_max, 200, 100, 1, 96, 2, -5, -2, -4, -1, 16, 3, 0.5, g->corr_bsize - g->corr_bstep + KBM_BIN_SIZE); par = DEFAULT_POG_PAR; par.refmode = 1; mdbg->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, g->corr_max, 200, 100, 1, g->corr_bsize - g->corr_bstep + KBM_BIN_SIZE, &par); } else { #else { #endif mdbg->cc = NULL; } mdbg->aux->par = (KBMPar*)malloc(sizeof(KBMPar)); memcpy(mdbg->aux->par, g->par, sizeof(KBMPar)); mdbg->regs = regs; mdbg->rdflags = rdflags; mdbg->beg = 0; mdbg->end = 0; mdbg->raw = raw; mdbg->alno = alno; thread_end_init(mdbg); } thread_beg_iter(mdbg); mdbg->task = 1; thread_end_iter(mdbg); for(rid=qb;rid<=qe+ncpu;rid++){ if(rid < qe){ if(!KBM_LOG && ((rid - qb) % 2000) == 0){ fprintf(KBM_LOGF, "\r%u|%llu", rid - qb, nhit); fflush(KBM_LOGF); } thread_wait_one(mdbg); } else { thread_wait_next(mdbg); pb = NULL; } if(mdbg->reg.closed == 0){ KBMAux *aux = mdbg->aux; //if(g->corr_mode && mdbg->cc->cns->size){ //g->reads->buffer[mdbg->reg.rid].corr_bincnt = mdbg->cc->cns->size / KBM_BIN_SIZE; //} if(alno){ // beg_bufferedwriter(bw); //if(g->corr_mode && mdbg->aux->cns->size){ //fprintf(bw->out, "#corrected\t%s\t%u\t", mdbg->cc->tag->string, (u4i)mdbg->cc->cns->size); //println_fwdseq_basebank(mdbg->cc->cns, 0, mdbg->cc->cns->size, bw->out); //} // for(i=0;i<mdbg->aux->hits->size;i++){ // hit = ref_kbmmapv(mdbg->aux->hits, i); // fprint_hit_kbm(mdbg->aux, i, bw->out); // } // end_bufferedwriter(bw); for (i = 0; i < mdbg->aux->hits->size; i++) { hit = ref_kbmmapv(mdbg->aux->hits, i); fprint_hit_kbm(mdbg->aux, i, alno); } } for(i=0;i<mdbg->aux->hits->size;i++){ hit = ref_kbmmapv(mdbg->aux->hits, i); if(hit->mat == 0) continue; if(rdflags && g->kbm->reads->buffer[hit->tidx].bincnt < g->kbm->reads->buffer[hit->qidx].bincnt && (hit->tb <= 1 && hit->te + 1 >= (int)(g->kbm->reads->buffer[hit->tidx].bincnt)) && (hit->qb > 1 || hit->qe + 1 < (int)(g->kbm->reads->buffer[hit->qidx].bincnt)) ){ one_bitvec(rdflags, hit->tidx); } } if(g->chainning_hits){ chainning_hits_core(aux->hits, aux->cigars, g->uniq_hit, g->kbm->par->aln_var); } for(i=0;i<aux->hits->size;i++){ hit = ref_kbmmapv(aux->hits, i); if(hit->mat == 0) continue; //hit->qb /= KBM_BIN_SIZE; //hit->qe /= KBM_BIN_SIZE; //hit->tb /= KBM_BIN_SIZE; //hit->te /= KBM_BIN_SIZE; //hit->aln /= KBM_BIN_SIZE; nhit ++; append_bitsvec(g->cigars, aux->cigars, hit->cgoff, hit->cglen); hit->cgoff = g->cigars->size - hit->cglen; if(raw){ //hit2rdregs_graph(g, regs, g->corr_mode? mdbg->cc->cns->size / KBM_BIN_SIZE : 0, hit, mdbg->aux->cigars, maps); hit2rdregs_graph(g, regs, 0, hit, mdbg->aux->cigars, maps); } else { map2rdhits_graph(g, hit); } } if(KBM_LOG){ fprintf(KBM_LOGF, "QUERY: %s\t+\t%d\t%d\n", g->kbm->reads->buffer[mdbg->reg.rid].tag, mdbg->reg.beg, mdbg->reg.end); for(i=0;i<mdbg->aux->hits->size;i++){ hit = ref_kbmmapv(mdbg->aux->hits, i); fprintf(KBM_LOGF, "\t%s\t%c\t%d\t%d\t%d\t%d\t%d\n", g->kbm->reads->buffer[hit->tidx].tag, "+-"[hit->qdir], g->kbm->reads->buffer[hit->tidx].bincnt * KBM_BIN_SIZE, hit->tb * KBM_BIN_SIZE, hit->te * KBM_BIN_SIZE, hit->aln * KBM_BIN_SIZE, hit->mat); } } mdbg->reg.closed = 1; } if(rid < qe && (rdflags == NULL || get_bitvec(rdflags, rid) == 0)){ pb = ref_kbmreadv(g->kbm->reads, rid); mdbg->reg = (reg_t){0, rid, 0, 0, pb->bincnt, 0, 0}; thread_wake(mdbg); } } { thread_beg_close(mdbg); free(mdbg->aux->par); free_kbmaux(mdbg->aux); //if(g->corr_mode){ //free_kbmblock((KBMBlock*)mdbg->cc->obj); //free_ctgcns(mdbg->cc); //} if(mdbg->raux){ free_kbm(mdbg->raux->kbm); free_kbmaux(mdbg->raux); } thread_end_close(mdbg); } } if(!KBM_LOG) fprintf(KBM_LOGF, "\r%u reads|total hits %llu\n", qe - qb, nhit); if(reset_kbm){ reset_index_kbm(g->kbm); } print_proc_stat_info(g_mpi_comm_rank); } // if(bw) close_bufferedwriter(bw); if(alno) fclose(alno); if(rdflags) free_bitvec(rdflags); return nhit; } static inline void build_nodes_graph(Graph *g, u8i maxbp, int ncpu, FileReader *pws, int rdclip, char *prefix, char *dump_kbm){ kbm_map_t *hit, HIT; BitVec *rks; u4v *maps[3]; FILE *clplog; u8i idx, rank, kcnts[256], upds[3], fix_node, nhit; u4i rid, i, dep; int raw; rdregv *regs; rnkrefv *nds; rnk_ref_t *nd; thread_preprocess(mclp); regs = init_rdregv(1024); nds = init_rnkrefv(1024); renew_rdhitv(g->rdhits, 1024); maps[0] = init_u4v(4); maps[1] = init_u4v(4); maps[2] = init_u4v(4); clear_regv(g->regs); next_ref_regv(g->regs); clear_rdhitv(g->rdhits); ZEROS(next_ref_rdhitv(g->rdhits)); //clear_kbmmapv(g->pwalns); clear_bitsvec(g->cigars); raw = !((g->bestn > 0) || rdclip); //if(g->corr_mode){ //raw = 1; //} fix_node = 0; // TODO: new code hasn't coped with contigs+longreads mode if(pws){ nhit = load_alignments_core(g, pws, raw, regs, maps); } else { UNUSED(maxbp); nhit = proc_alignments_core(g, ncpu, raw, regs, maps, prefix, dump_kbm); } // 每个节点分布计算完成后返回 ningwenmo@outlook.com if (g_mpi_comm_size >= 2 && g_mpi_comm_rank != 0){ // int message = 0; // MPI_Send(&message, 1, MPI_INT, 0, 1, MPI_COMM_WORLD); // fprintf(KBM_LOGF, "[%s] send %u/%u\n", date(), g_mpi_comm_rank, g_mpi_comm_size); // fflush(KBM_LOGF); return; } // print_proc_stat_info(g_mpi_comm_rank); if(raw){ fprintf(KBM_LOGF, "[%s] generated %llu regs\n", date(), (u8i)regs->size); } else { #ifdef KBM_MAP2RDHIT_QUICK // sort rdhits and pick bestn { fprintf(KBM_LOGF, "[%s] sorting rdhits ... ", date()); fflush(KBM_LOGF); thread_beg_init(mclp, ncpu); mclp->g = g; mclp->task = 2; thread_end_init(mclp); thread_wake_all(mclp); thread_beg_close(mclp); thread_end_close(mclp); fprintf(KBM_LOGF, "Done\n"); fflush(KBM_LOGF); } #endif // clip reads if(rdclip){ fprintf(KBM_LOGF, "[%s] clipping ... ", date()); fflush(KBM_LOGF); clplog = open_file_for_write(prefix, ".clps", 1); thread_beg_init(mclp, ncpu); mclp->g = g; mclp->task = 1; thread_end_init(mclp); thread_wake_all(mclp); thread_beg_close(mclp); thread_end_close(mclp); u8i tot, clp; tot = clp = 0; for(rid=0;rid<g->reads->size;rid++){ tot += g->kbm->reads->buffer[rid].bincnt * KBM_BIN_SIZE; clp += (g->reads->buffer[rid].clps[1] - g->reads->buffer[rid].clps[0]) * KBM_BIN_SIZE; fprintf(clplog, "%s\t%d\t%d\t%d\n", g->kbm->reads->buffer[rid].tag, g->kbm->reads->buffer[rid].bincnt * KBM_BIN_SIZE, g->reads->buffer[rid].clps[0] * KBM_BIN_SIZE, g->reads->buffer[rid].clps[1] * KBM_BIN_SIZE); } fclose(clplog); fprintf(KBM_LOGF, "%.2f%% bases\n", ((tot - clp) * 100.0) / tot); fflush(KBM_LOGF); } fprintf(KBM_LOGF, "[%s] generating regs ... ", date()); fflush(KBM_LOGF); rd_hit_t *rh; u8i cgoff; cgoff = 0; hit = &HIT; hit->cnt = 0; hit->gap = 0; for(idx=0;idx<g->rdhits->size;idx++){ rh = ref_rdhitv(g->rdhits, idx); hit->cgoff = cgoff; hit->cglen = rh->lnks[1].cnt; cgoff += hit->cglen; if(rh->frgs[0].closed && rh->frgs[1].closed){ continue; } hit->qidx = rh->frgs[0].rid; hit->tidx = rh->frgs[1].rid; hit->qdir = rh->frgs[0].dir; hit->tdir = rh->frgs[1].dir; hit->qb = rh->frgs[0].beg; hit->qe = rh->frgs[0].end; hit->tb = rh->frgs[1].beg; hit->te = rh->frgs[1].end; hit->mat = rh->lnks[0].cnt; hit->aln = num_min(hit->qe - hit->qb, hit->te - hit->tb); //hit2rdregs_graph(g, regs, g->reads->buffer[hit->qidx].corr_bincnt, hit, g->cigars, maps); hit2rdregs_graph(g, regs, g->kbm->reads->buffer[hit->qidx].bincnt, hit, g->cigars, maps); } free_bitsvec(g->cigars); g->cigars = init_bitsvec(1024, 3); fprintf(KBM_LOGF, "%llu\n", (u8i)regs->size); fflush(KBM_LOGF); } free_u4v(maps[0]); free_u4v(maps[1]); free_u4v(maps[2]); // add node itself //if(!g->corr_mode){ { rks = init_bitvec(g->kbm->bins->size); for(idx=0;idx<regs->size;idx++){ one_bitvec(rks, regs->buffer[idx].node); } for(idx=0;idx<g->kbm->bins->size;idx++){ if(get_bitvec(rks, idx)){ rid = g->kbm->bins->buffer[idx].ridx; kbm_read_t *rd = ref_kbmreadv(g->kbm->reads, rid); push_rdregv(regs, (rd_reg_t){idx, rid, 0, (idx - rd->binoff) * g->reglen, (idx + 1 - rd->binoff) * g->reglen, 0}); } } free_bitvec(rks); } // generating nodes fprintf(KBM_LOGF, "[%s] sorting regs ... ", date()); fflush(KBM_LOGF); psort_array(regs->buffer, regs->size, rd_reg_t, ncpu, num_cmpgtxx((a.node << 30) | a.rid, (b.node << 30) | b.rid, a.beg, b.beg, b.end, a.end)); fprintf(KBM_LOGF, " Done\n"); fflush(KBM_LOGF); u4v *kbcnts; kbcnts = init_u4v(1024); rank = 0xFFFFFFFFFFFFFFFFLLU; nd = NULL; fprintf(KBM_LOGF, "[%s] generating intervals ... ", date()); fflush(KBM_LOGF); for(idx=0;idx<regs->size;idx++){ if(0){ kbm_read_t *rd1, *rd2; rd_reg_t *r; r = ref_rdregv(regs, idx); rd1 = ref_kbmreadv(g->kbm->reads, g->kbm->bins->buffer[r->node].ridx); rd2 = ref_kbmreadv(g->kbm->reads, r->rid); fprintf(stdout, "%s\t%c\t%d\t%d\t%s\t%c\t%d\t%d\n", rd1->tag, '+', (int)((r->node) - rd1->binoff) * g->reglen, (int)(r->node - rd1->binoff + 1) * g->reglen, rd2->tag, "+-"[r->dir], r->beg, r->end); } if(regs->buffer[idx].node != rank){ if(nd){ while(nd->cnt >= kbcnts->size) push_u4v(kbcnts, 0); kbcnts->buffer[nd->cnt] ++; } nd = next_ref_rnkrefv(nds); nd->idx = idx; nd->rank = regs->buffer[idx].node; nd->fix = regs->buffer[idx].node < fix_node; nd->cnt = 1; //nd->score = (regs->buffer[idx].beg >= g->reads->buffer[regs->buffer[idx].rid].clps[0] && regs->buffer[idx].end <= g->reads->buffer[regs->buffer[idx].rid].clps[1]); rank = regs->buffer[idx].node; } else { nd->cnt ++; //nd->score += (regs->buffer[idx].beg >= g->reads->buffer[regs->buffer[idx].rid].clps[0] && regs->buffer[idx].end <= g->reads->buffer[regs->buffer[idx].rid].clps[1]); } } if(nd){ while(nd->cnt >= kbcnts->size) push_u4v(kbcnts, 0); kbcnts->buffer[nd->cnt] ++; } // find medean k-bin depth u4i gcov; for(gcov=0,rank=0;gcov<kbcnts->size&&rank<nds->size;gcov++){ rank += kbcnts->buffer[gcov]; } //psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(b.cnt, a.cnt, a.rank, b.rank)); psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(num_diff(a.cnt, gcov), num_diff(b.cnt, gcov), a.rank, b.rank)); fprintf(KBM_LOGF, " %llu intervals\n", (u8i)nds->size); fflush(KBM_LOGF); //psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(b.score, a.score, a.rank, b.rank)); if(0){ memset(kcnts, 0, sizeof(uint64_t) * 256); for(idx=0;idx<nds->size;idx++){ dep = num_min(nds->buffer[idx].cnt, 255); kcnts[dep] ++; } for(i=1;i<51;i++){ fprintf(KBM_LOGF, "%10llu", (long long unsigned int)kcnts[i]); if(((i - 1) % 10) == 9) fprintf(KBM_LOGF, "\n"); } if(((i - 1) % 10) != 0) fprintf(KBM_LOGF, "\n"); } fprintf(KBM_LOGF, "[%s] selecting important intervals from %llu intervals\n", date(), (u8i)nds->size); mul_update_regs_graph(g, regs, nds, ncpu, upds); free_rdregv(regs); free_rnkrefv(nds); encap_regv(g->regs, 1); memset(g->regs->buffer + g->regs->size, 0xFF, sizeof(reg_t)); if(!KBM_LOG) fprintf(KBM_LOGF, "[%s] Intervals: kept %llu, discarded %llu\n", date(), upds[0], upds[1]); print_proc_stat_info(g_mpi_comm_rank); } uint32_t estimate_genome_size(Graph *g, unsigned long long tot_bp, FILE *out){ uint64_t kcnts[256]; node_t *n; uint64_t nid, sum, ncnt, pmax; uint32_t i, dep, peak, mid; float avg; ncnt = g->nodes->size; memset(kcnts, 0, sizeof(uint64_t) * 256); sum = 0; for(nid=0;nid<ncnt;nid++){ n = ref_nodev(g->nodes, nid); dep = num_min(n->regs.cnt, 255); sum += n->regs.cnt; kcnts[dep] ++; } mid = pmax = 0; while(mid < 255){ pmax += kcnts[mid]; if(pmax >= ncnt / 2) break; mid ++; } avg = 1.0 * sum / (ncnt + 1); fprintf(out, "[%s] median node depth = %d\n", date(), mid); return mid; //TODO: calculate the genome coverage for(i=1;i<51;i++){ fprintf(out, "%10llu", (long long unsigned int)kcnts[i]); if(((i - 1) % 10) == 9) fprintf(out, "\n"); } if(((i - 1) % 10) != 0) fprintf(out, "\n"); return avg; pmax = 0; peak = avg; for(i=g->min_node_cov+1;i<256;i++){ if(kcnts[i] > pmax){ peak = i; pmax = kcnts[i]; } else if(i > avg && kcnts[i] < 0.8 * pmax) break; } fprintf(out, "[%s] depth peak = %d\n", date(), peak); fprintf(out, "[%s] genome size = %llu\n", date(), tot_bp / peak); return peak; } // MUST be called before build_edges static u8i mask_nodes_by_cov_graph(Graph *g, FILE *out){ node_t *n; u8i ret, i; ret = 0; for(i=0;i<g->nodes->size;i++){ n = ref_nodev(g->nodes, i); if(n->regs.cnt > g->max_node_cov || n->regs.cnt < g->min_node_cov){ n->closed = 1; ret ++; if(out) fprintf(out, "MASK_COV\tN%llu\t%u\t%u\n", (u8i)i, (u4i)n->regs.cnt, n->cov); } } return ret; } static inline void remove_all_edges_graph(Graph *g){ node_t *n; uint64_t nid; free_edgev(g->edges); g->edges = init_edgev(32); free_edgehash(g->ehash); g->ehash = init_edgehash(1023); set_userdata_edgehash(g->ehash, g->edges); free_edgerefv(g->erefs); g->erefs = init_edgerefv(32); for(nid=0;nid<g->nodes->size;nid++){ n = ref_nodev(g->nodes, nid); n->edges[0] = n->edges[1] = PTR_REF_NULL; } } typedef struct { uint64_t idx:46; int off:18; } edge_off_t; define_list(edgeoffv, edge_off_t); static inline int estimate_edge_length(edge_off_t *ps, uint32_t size, uint32_t idxs[2]){ int64_t tot, var; uint32_t i, b, e, mi; int v, max, len, avg, std; idxs[0] = 0; idxs[1] = size; if(size == 0){ return 0; } if(size <= 2){ return ps[size/2].off; } b = 0; e = size; for(i=b,tot=0;i<e;i++) tot += ps[i].off; len = tot / (e - b); //fprintf(stdout, "b=%d\te=%d\tlen=%d\n", b, e, len); while(b + 2 < e){ max = 0; mi = 0; for(i=b+1;i<e;i++){ if(ps[i].off - ps[i-1].off > max){ max = ps[i].off - ps[i-1].off; mi = i; } } if(max < len * 0.5) break; else if(max < 100) break; if(mi - b > e - mi) e = mi; else b = mi; for(i=b,tot=0;i<e;i++) tot += ps[i].off; avg = tot / (e - b); if(num_diff(avg, len) < num_max(avg * 0.2, 50)) break; len = avg; } //fprintf(stdout, "b=%d\te=%d\tlen=%d\n", b, e, len); if(0){ if(b + 1 < e){ for(i=b,var=0;i<e;i++){ v = ((int)ps[i].off) - ((int)len); var += v * v; } std = sqrt(var / (e - b)); //fprintf(stdout, "std=%d\n", std); b = 0; e = size; while(b < e && num_diff(ps[b].off, len) > 3 * std) b ++; while(b < e && num_diff(ps[e - 1].off, len) > 3 * std) e --; } idxs[0] = b; idxs[1] = e; } return len; } static inline int calculate_edge_cov_off_graph(Graph *g, edge_t *e, edgeoffv *offs){ node_t *n1, *n2; reg_t *r1, *r2; uint32_t dir1, dir2, cov, idxs[2]; int off; n1 = ref_nodev(g->nodes, e->node1); n2 = ref_nodev(g->nodes, e->node2); r1 = ref_regv(g->regs, n1->regs.idx); r2 = ref_regv(g->regs, n2->regs.idx); cov = e->cov; clear_edgeoffv(offs); while(1){ if(r1->rid > r2->rid){ r2 ++; if(r2->node != e->node2) break; } else if(r1->rid < r2->rid){ r1 ++; if(r1->node != e->node1) break; } else { if(r1->beg < r2->beg){ if(r1->node < r2->node){ dir1 = r1->dir; dir2 = r2->dir; } else { dir1 = !r2->dir; dir2 = !r1->dir; } off = r2->beg - r1->end; } else { if(r2->node < r1->node){ dir1 = r2->dir; dir2 = r1->dir; } else { dir1 = !r1->dir; dir2 = !r2->dir; } off = ((int)r1->beg) - r2->end; } if(dir1 == e->dir1 && dir2 == e->dir2){ push_edgeoffv(offs, (edge_off_t){e - g->edges->buffer, off}); } r1 ++; if(r1->node != e->node1) break; r2 ++; if(r2->node != e->node2) break; } } e->off = estimate_edge_length(offs->buffer, offs->size, idxs); e->cov = idxs[1] - idxs[0]; if(cov != e->cov) return 1; else return 0; } static inline void build_edges_graph(Graph *g, int ncpu, FILE *log){ read_t *rd; node_t *n; edge_t *E, *e; vplist *regs; reg_t *r1, *r2; edge_ref_t f1, f2; edgeoffv *offs; uint64_t idx, lst, cnt, x, *u; uint32_t rid, i, idxs[2]; int exists; UNUSED(log); clear_edgev(g->edges); E = next_ref_edgev(g->edges); memset(E, 0, sizeof(edge_t)); E->closed = WT_EDGE_CLOSED_MASK; E->cov = 1; //E->status = 0; clear_edgehash(g->ehash); offs = init_edgeoffv(32); regs = init_vplist(32); for(rid=0;rid<g->kbm->reads->size;rid++){ rd = ref_readv(g->reads, rid); if(rd->regs.cnt < 2) continue; clear_vplist(regs); idx = rd->regs.idx; while(idx){ r2 = ref_regv(g->regs, idx); idx = r2->read_link; if(g->nodes->buffer[r2->node].closed) continue; if(r2->closed) continue; if(!(r2->beg >= rd->clps[0] && r2->end <= rd->clps[1])) continue; for(i=regs->size;i>0;i--){ r1 = (reg_t*)get_vplist(regs, i - 1); if(r1->end + g->max_edge_span < r2->beg) break; E = ref_edgev(g->edges, 0); if(r1->node < r2->node){ E->node1 = r1->node; E->node2 = r2->node; E->dir1 = r1->dir; E->dir2 = r2->dir; } else { E->node1 = r2->node; E->node2 = r1->node; E->dir1 = !r2->dir; E->dir2 = !r1->dir; } E->off = ((int)r2->beg) - r1->end; u = prepare_edgehash(g->ehash, 0, &exists); if(exists){ //if(g->edges->buffer[*u].cov < WT_MAX_EDGE_COV) g->edges->buffer[*u].cov ++; } else { *u = g->edges->size; push_edgev(g->edges, *E); } if(i == regs->size){ e = ref_edgev(g->edges, *u); //if(e->cov < WT_MAX_EDGE_COV) e->cov ++; //if(e->cov >= 1){ e->closed = 0; //} } push_edgeoffv(offs, (edge_off_t){*u, ((int)r2->beg) - r1->end}); } push_vplist(regs, r2); } } free_vplist(regs); if(g->edges->size == 0){ free_edgeoffv(offs); return; } psort_array(offs->buffer, offs->size, edge_off_t, ncpu, num_cmpgtx(a.idx, b.idx, a.off, b.off)); lst = 0; for(idx=1;idx<=offs->size;idx++){ if(idx < offs->size && offs->buffer[idx].idx == offs->buffer[lst].idx) continue; if(1){ g->edges->buffer[offs->buffer[lst].idx].off = offs->buffer[(lst+idx)/2].off; g->edges->buffer[offs->buffer[lst].idx].cov = idx - lst; } else { g->edges->buffer[offs->buffer[lst].idx].off = estimate_edge_length(offs->buffer + lst, idx - lst, idxs); g->edges->buffer[offs->buffer[lst].idx].cov = idxs[1] - idxs[0]; } if(0){ uint64_t m; for(m=lst;m<idx;m++) fprintf(stdout, "%u\t", offs->buffer[m].off); fprintf(stdout, "\n"); for(m=lst+idxs[0];m<lst+idxs[1];m++) fprintf(stdout, "%u\t", offs->buffer[m].off); fprintf(stdout, "\n"); } lst = idx; } free_edgeoffv(offs); clear_edgerefv(g->erefs); push_edgerefv(g->erefs, EDGE_REF_NULL); for(idx=1;idx<g->edges->size;idx++){ if(g->edges->buffer[idx].cov < g->min_edge_cov){ if(g->store_low_cov_edge) g->edges->buffer[idx].closed = WT_EDGE_CLOSED_LESS; else continue; } if(g->nodes->buffer[g->edges->buffer[idx].node1].closed || g->nodes->buffer[g->edges->buffer[idx].node2].closed){ g->edges->buffer[idx].closed = WT_EDGE_CLOSED_HARD; } else if(g->edges->buffer[idx].closed == WT_EDGE_CLOSED_MASK){ g->edges->buffer[idx].closed = WT_EDGE_CLOSED_LESS; //} else { //g->edges->buffer[idx].closed = WT_EDGE_CLOSED_NULL; } push_edgerefv(g->erefs, (edge_ref_t){idx, 0, 0}); push_edgerefv(g->erefs, (edge_ref_t){idx, 1, 0}); } psort_array(g->erefs->buffer + 1, g->erefs->size - 1, edge_ref_t, ncpu, num_cmpgtx( (a.flg? ((g->edges->buffer[a.idx].node2 << 1) | !g->edges->buffer[a.idx].dir2) : ((g->edges->buffer[a.idx].node1 << 1) | g->edges->buffer[a.idx].dir1)), (b.flg? ((g->edges->buffer[b.idx].node2 << 1) | !g->edges->buffer[b.idx].dir2) : ((g->edges->buffer[b.idx].node1 << 1) | g->edges->buffer[b.idx].dir1)), g->edges->buffer[a.idx].off, g->edges->buffer[b.idx].off)); f1.idx = g->nodes->size; f1.flg = 0; cnt = 0; for(lst=idx=1;idx<g->erefs->size;idx++){ if(g->erefs->buffer[idx].flg){ f2.idx = g->edges->buffer[g->erefs->buffer[idx].idx].node2; f2.flg = !g->edges->buffer[g->erefs->buffer[idx].idx].dir2; } else { f2.idx = g->edges->buffer[g->erefs->buffer[idx].idx].node1; f2.flg = g->edges->buffer[g->erefs->buffer[idx].idx].dir1; } if(f1.idx == f2.idx && f1.flg == f2.flg) continue; if(lst < idx){ n = ref_nodev(g->nodes, f1.idx); n->edges[f1.flg].idx = lst; n->edges[f1.flg].cnt = 0; for(x=lst;x+1<idx;x++){ g->erefs->buffer[x].next = x + 1; if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++; } if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++; } lst = idx; f1 = f2; } if(lst < idx){ n = ref_nodev(g->nodes, f1.idx); n->edges[f1.flg].idx = lst; n->edges[f1.flg].cnt = 0; for(x=lst;x+1<idx;x++){ g->erefs->buffer[x].next = x + 1; if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++; } if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++; } } static inline void load_nodes_graph(Graph *g, FileReader *clp, FileReader *nds){ read_t *rd; node_t *n; reg_t *reg, *r; uint64_t nid; uint32_t i, nreg, rid; char *str, *tok; int ncol, closed, nwarn; clear_nodev(g->nodes); clear_regv(g->regs); next_ref_regv(g->regs); nwarn = 0; while((ncol = readtable_filereader(clp)) != -1){ if(clp->line->string[0] == '#') continue; if(ncol < 4) continue; if((rid = getval_cuhash(g->kbm->tag2idx, get_col_str(clp, 0))) == MAX_U4){ if(nwarn < 10){ fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(clp, 0), clp->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); nwarn ++; } continue; } rd = ref_readv(g->reads, rid); rd->clps[0] = atoi(get_col_str(clp, 2)) / KBM_BIN_SIZE; rd->clps[1] = atoi(get_col_str(clp, 3)) / KBM_BIN_SIZE; } nwarn = 0; while((ncol = readtable_filereader(nds)) != -1){ if(nds->line->string[0] == '#') continue; if(ncol < 2) continue; nreg = atoi(get_col_str(nds, 1)); //if(nreg == 0) continue; //if(nreg < g->min_node_cov) continue; //node_id ~ N(\d+)(\*?) nid = atol(get_col_str(nds, 0) + 1); if(get_col_str(nds, 0)[get_col_len(nds, 0) - 1] == '*'){ // assert get_col_len(nds, 0) > 0 closed = 1; } else { closed = 0; } while(g->nodes->size < nid){ n = next_ref_nodev(g->nodes); ZEROS(n); n->closed = 1; } n = next_ref_nodev(g->nodes); ZEROS(n); n->closed = closed; n->regs.idx = g->regs->size; n->regs.cnt = nreg; for(i=0;i<nreg;i++){ reg = next_ref_regv(g->regs); reg->node = nid; reg->closed = 0; reg->read_link = 0; str = get_col_str(nds, 2 + i); if(0){ if(str[get_col_len(nds, 2 + i)-1] == '*'){ reg->closed = 1; } tok = index(str, '_'); *tok = '\0'; reg->rid = getval_cuhash(g->kbm->tag2idx, str); str = tok + 1; *tok = '_'; tok = index(str, '_'); *tok = '\0'; reg->dir = (str[0] == 'R'); str = tok + 1; *tok = '_'; tok = index(str, '_'); *tok = '\0'; reg->beg = atoi(str); str = tok + 1; reg->end = atoi(str); reg->end += reg->beg; } else { if(str[get_col_len(nds, 2 + i)-1] == '*'){ reg->closed = 1; } tok = rindex(str, '_'); *tok = '\0'; tok ++; reg->end = atoi(tok); tok = rindex(str, '_'); *tok = '\0'; tok ++; reg->beg = atoi(tok); reg->end += reg->beg; tok = rindex(str, '_'); *tok = '\0'; tok ++; reg->dir = (tok[0] == 'R'); if((reg->rid = getval_cuhash(g->kbm->tag2idx, str)) == MAX_U4){ g->regs->size --; if(nwarn < 10){ fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", str, nds->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr); nwarn ++; } continue; } } rd = ref_readv(g->reads, reg->rid); if(rd->regs.idx){ r = ref_regv(g->regs, rd->regs.idx); if(r->beg > reg->beg){ reg->read_link = rd->regs.idx; rd->regs.idx = g->regs->size - 1; } else { while(1){ if(r->read_link == 0) break; if(g->regs->buffer[r->read_link].beg > reg->beg) break; r = ref_regv(g->regs, r->read_link); } reg->read_link = r->read_link; r->read_link = g->regs->size - 1; } } else { rd->regs.idx = g->regs->size - 1; } rd->regs.cnt ++; } } encap_regv(g->regs, 1); g->regs->buffer[g->regs->size].node = WT_MAX_NODE; } #endif