12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291 |
- /*
- *
- * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
- *
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- #ifndef __WTDBG_H_RJ
- #define __WTDBG_H_RJ
- #include "kbm.h"
- #include "kbmpoa.h"
- #include "filewriter.h"
- #include "pgzf.h"
- #include <getopt.h>
- #include <regex.h>
- #define WT_MAX_RD 0x3FFFFFFF // 1 G
- #define WT_MAX_RDLEN 0x00FFFFFF // 16 Mb
- #define WT_MAX_RDBIN 0x0000FFFF // 64 K bins
- #define WT_MAX_NODE 0x000000FFFFFFFFFFLLU
- #define WT_MAX_EDGE 0x000000FFFFFFFFFFLLU
- #define WT_MAX_NODE_EDGES 0xFFFF
- #define WT_MAX_EDGE_COV 0xFFF
- typedef struct {
- u8i rid:30, dir:1, beg:16, end:16, closed:1;
- } rd_frg_t;
- define_list(rdfrgv, rd_frg_t);
- typedef struct {
- u8i node;
- u8i rid:30, dir:1, beg:16, end:16, closed:1;
- } rd_reg_t;
- define_list(rdregv, rd_reg_t);
- typedef struct {
- u8i rid:30, dir:1, beg:16, end:16, closed:1;
- u8i read_link;
- } rd_rep_t;
- define_list(rdrepv, rd_rep_t);
- typedef struct {
- u8i node;
- u8i rid:30, dir:1, beg:16, end:16, closed:1;
- u8i read_link;
- } reg_t;
- define_list(regv, reg_t);
- typedef struct { uint32_t x, y; } xy_t;
- define_list(xyv, xy_t);
- typedef struct { uint64_t idx:46, cnt:18; } vec_ref_t;
- typedef struct { uint64_t idx:46, cnt:18; } ptr_ref_t;
- typedef struct { uint64_t idx:46, cnt:18, fix:1, rank:45, score:18; } rnk_ref_t;
- static const vec_ref_t VEC_REF_NULL = (vec_ref_t){0, 0};
- static const ptr_ref_t PTR_REF_NULL = (ptr_ref_t){0, 0};
- static const rnk_ref_t RNK_REF_NULL = (rnk_ref_t){0, 0, 0, 0, 0};
- define_list(vecrefv, vec_ref_t);
- define_list(ptrrefv, ptr_ref_t);
- define_list(rnkrefv, rnk_ref_t);
- #define ptrref_hashcode(E) u64hashcode((E).idx)
- #define ptrref_hashequals(E1, E2) ((E1).idx == (E2).idx)
- define_hashset(ptrrefhash, ptr_ref_t, ptrref_hashcode, ptrref_hashequals);
- #define WT_EDGE_CLOSED_NULL 0
- #define WT_EDGE_CLOSED_MASK 1
- #define WT_EDGE_CLOSED_LESS 2
- #define WT_EDGE_CLOSED_HARD 3
- typedef struct {
- uint64_t node1:45, dir1:1, dir2:1, status:1, closed:2, flag:2, cov:12;
- uint64_t node2:45; int64_t off:19;
- } edge_t;
- define_list(edgev, edge_t);
- static inline uint64_t _edge_hashcode(edge_t e){
- const uint64_t m = 0xc6a4a7935bd1e995LLU;
- const int r = 47;
- uint64_t h = 1023 ^ (16 * m);
- uint64_t k = (e.node1 << 1) | e.dir1;
- k *= m;
- k ^= k >> r;
- k *= m;
- h ^= k;
- h *= m;
- k = (e.node2 << 1) | e.dir2;
- k *= m;
- k ^= k >> r;
- k *= m;
- h ^= k;
- h *= m;
- h ^= h >> r;
- h *= m;
- h ^= h >> r;
- return h;
- }
- #define EDGEHASH(idx) ((edgev *)set->userdata)->buffer[idx]
- #define edge_hashcode(E) _edge_hashcode(EDGEHASH(E))
- #define edge_hashequals(E1, E2) (EDGEHASH(E1).node1 == EDGEHASH(E2).node1 && EDGEHASH(E1).node2 == EDGEHASH(E2).node2 \
- && EDGEHASH(E1).dir1 == EDGEHASH(E2).dir1 && EDGEHASH(E1).dir2 == EDGEHASH(E2).dir2)
- define_hashset(edgehash, uint64_t, edge_hashcode, edge_hashequals);
- typedef struct { uint64_t idx:63, flg:1; uint64_t next; } edge_ref_t;
- static const edge_ref_t EDGE_REF_NULL = (edge_ref_t){0x7FFFFFFFFFFFFFFLLU, 1, 0};
- define_list(edgerefv, edge_ref_t);
- #define MAX_REP_IDX MAX_U4
- typedef struct {
- u8i rep_idx:32;
- u4i unvisit:16, cov:16;
- u8i closed:1, single_in:1, bt_visit:45, rep_dir:1, bt_idx:16, init_end:1;
- vec_ref_t regs;
- ptr_ref_t edges[2];
- } node_t;
- define_list(nodev, node_t);
- typedef struct {
- u8i idx:39, flg:1, cnt:24;
- } hit_lnk_t;
- define_list(hitlnkv, hit_lnk_t);
- typedef struct {
- rd_frg_t frgs[2];
- hit_lnk_t lnks[2]; // idx+flg: link, cnt[0]: mat, cnt[1]:cglen
- } rd_hit_t;
- define_list(rdhitv, rd_hit_t);
- typedef struct {
- u8i visit:63, flag:1;
- hit_lnk_t hits; // point to the g->rdhits
- int clps[2];
- ptr_ref_t regs;
- //u2i corr_bincnt;
- } read_t;
- define_list(readv, read_t);
- typedef struct {
- u8i node;
- //node_t *n;
- edge_ref_t edges[2];
- u4i dir:2, cov:30;
- int off;
- } trace_t;
- define_list(tracev, trace_t);
- #define WT_TRACE_MSG_ZERO 0
- #define WT_TRACE_MSG_ONE 1
- #define WT_TRACE_MSG_MORE 2
- #define WT_TRACE_MSG_VISITED 3
- #define WT_TRACE_MSG_UNDEF 4
- typedef struct {
- union {
- struct {
- u4i frg1:31, dir1:1;
- u4i frg2:31, dir2:1;
- };
- u8i key;
- };
- u4i cov:13, flag:2, tidx1:8, tidx2:8, weak:1, closed:2;
- b4i off;
- } lnk_t;
- define_list(lnkv, lnk_t);
- #define lnk_hashcode(E) (E).key
- #define lnk_hashequals(E1, E2) (E1).key == (E2).key
- define_hashset(lnkhash, lnk_t, lnk_hashcode, lnk_hashequals);
- typedef struct {
- u8i toff:46, tcnt:18; // extended traces and core traces
- u4i tx, ty; // core traces
- ptr_ref_t lnks[2];
- u4i len, length;
- u8i rep_idx:48, unvisit:16;
- u8i closed:1, single_in:1, bt_visit:46, rep_dir:1, bt_idx:16;
- } frg_t;
- define_list(frgv, frg_t);
- typedef struct {
- edge_ref_t lnks[2];
- u4i frg;
- u4i dir:2, cov:30;
- int off;
- u4i tx, ty;
- } path_t;
- define_list(pathv, path_t);
- #define WT_PATH_MSG_ZERO 0
- #define WT_PATH_MSG_ONE 1
- #define WT_PATH_MSG_MORE 2
- #define WT_PATH_MSG_VISITED 3
- #define WT_PATH_MSG_UNDEF 4
- typedef struct {
- u8i node1:63, dir1:1;
- u8i node2:63, dir2:1;
- b8i off:42, len:22;
- } seqlet_t;
- define_list(seqletv, seqlet_t);
- typedef struct {
- KBM *kbm;
- KBMPar *par, *rpar;
- regv *regs;
- readv *reads;
- rdhitv *rdhits;
- BitsVec *cigars;
- nodev *nodes;
- edgev *edges;
- edgehash *ehash;
- edgerefv *erefs;
- frgv *frgs;
- lnkv *lnks;
- edgerefv *lrefs;
- tracev *traces;
- u8i genome_size;
- u4i num_index;
- //u4i corr_mode, corr_min, corr_max;
- //u4i corr_bsize, corr_bstep;
- //float corr_cov, corr_gcov;
- int node_order, mem_stingy;
- u4i n_fix, only_fix; // first n sequences are accurate contigs; only_fix means whether to include other pacbio sequenes
- u4i reglen, regovl, bestn;
- int min_node_mats;
- int max_overhang, chainning_hits, uniq_hit;
- float node_max_conflict; // 0.25
- float node_merge_cutoff;
- uint32_t max_node_cov, min_node_cov, exp_node_cov, min_edge_cov, max_edge_span;
- u4i max_node_cov_sg, max_sg_end;
- int cut_tip;
- int store_low_cov_edge;
- int rep_filter, rep_detach;
- uint32_t bub_step, tip_step, rep_step;
- int min_ctg_len, min_ctg_nds, minimal_output;
- vplist *utgs;
- u4i major_nctg;
- vplist *ctgs;
- } Graph;
- static const char *colors[2][2] = {{"blue", "green"}, {"red", "gray"}};
- static inline Graph* init_graph(KBM *kbm){
- Graph *g;
- u4i rid;
- g = malloc(sizeof(Graph));
- g->kbm = kbm;
- g->par = kbm->par;
- g->rpar = NULL;
- g->regs = init_regv(32);
- g->reads = init_readv(kbm->reads->size);
- g->reads->size = kbm->reads->size;
- for(rid=0;rid<g->reads->size;rid++){
- g->reads->buffer[rid].clps[0] = 0;
- g->reads->buffer[rid].clps[1] = g->kbm->reads->buffer[rid].bincnt;
- }
- g->nodes = init_nodev(32);
- g->rdhits = init_rdhitv(1024);
- g->cigars = init_bitsvec(1024, 3);
- g->edges = init_edgev(32);
- g->ehash = init_edgehash(1023);
- set_userdata_edgehash(g->ehash, g->edges);
- g->erefs = init_edgerefv(32);
- g->frgs = init_frgv(32);
- g->lnks = init_lnkv(32);
- g->lrefs = init_edgerefv(32);
- g->traces = init_tracev(32);
- g->genome_size = 1024 * 1024 * 1024LLU;
- g->num_index = 1;
- //g->corr_mode = 0;
- //g->corr_min = 5;
- //g->corr_max = 10;
- //g->corr_cov = 0.75;
- //g->corr_gcov = 5.0;
- //g->corr_bsize = 2048;
- //g->corr_bstep = 2048 - 512;
- g->node_order = 0;
- g->n_fix = 0;
- g->only_fix = 0;
- g->reglen = 1024;
- g->regovl = 256;
- g->node_max_conflict = 0.25;
- g->node_merge_cutoff = 0.8;
- g->max_overhang = -1;
- g->bestn = 0;
- g->min_node_mats = 1;
- g->chainning_hits = 0;
- g->uniq_hit = 0;
- g->min_node_cov = 4;
- g->max_node_cov = 60;
- g->exp_node_cov = 40; // expected node cov
- g->min_edge_cov = 4;
- g->max_edge_span = 1024; // 1024 * 256 = 256 kb
- g->max_node_cov_sg = 2;
- g->max_sg_end = 5;
- g->store_low_cov_edge = 1;
- g->rep_filter = 1;
- g->rep_detach = 1;
- g->bub_step = 10;
- g->tip_step = 5;
- g->rep_step = 20;
- g->cut_tip = 1;
- g->min_ctg_len = 10000;
- g->min_ctg_nds = 5;
- g->minimal_output = 0;
- g->utgs = init_vplist(32);
- g->ctgs = init_vplist(32);
- g->major_nctg = 0;
- return g;
- }
- static inline void free_graph(Graph *g){
- u8i i;
- free_regv(g->regs);
- free_readv(g->reads);
- free_nodev(g->nodes);
- free_rdhitv(g->rdhits);
- free_bitsvec(g->cigars);
- free_edgev(g->edges);
- free_edgehash(g->ehash);
- free_edgerefv(g->erefs);
- free_frgv(g->frgs);
- free_lnkv(g->lnks);
- free_edgerefv(g->lrefs);
- free_tracev(g->traces);
- for(i=0;i<g->utgs->size;i++) free_tracev(g->utgs->buffer[i]);
- free_vplist(g->utgs);
- for(i=0;i<g->ctgs->size;i++) free_tracev(g->ctgs->buffer[i]);
- free_vplist(g->ctgs);
- free(g);
- }
- #define KBM_MAP2RDHIT_QUICK
- static inline int map2rdhits_graph(Graph *g, kbm_map_t *hit){
- u4i k, f, d, p, n, add;
- rd_hit_t *rh, *hp, *hn;
- read_t *rd;
- if(hit->mat == 0) return 0;
- rh = next_ref_rdhitv(g->rdhits);
- rh->frgs[0] = (rd_frg_t){hit->qidx, hit->qdir, hit->qb, hit->qe, 0};
- rh->frgs[1] = (rd_frg_t){hit->tidx, hit->tdir, hit->tb, hit->te, 0};
- rh->lnks[0].cnt = hit->mat;
- rh->lnks[1].cnt = hit->cglen;
- add = 0;
- for(k=0;k<2;k++){
- rd = ref_readv(g->reads, rh->frgs[k].rid);
- #ifdef KBM_MAP2RDHIT_QUICK
- { // Just add it
- rh->lnks[k].idx = rd->hits.idx;
- rh->lnks[k].flg = rd->hits.flg;
- rd->hits.idx = offset_rdhitv(g->rdhits, rh);
- rd->hits.flg = k;
- rd->hits.cnt ++;
- add ++;
- continue;
- }
- #endif
- hn = ref_rdhitv(g->rdhits, rd->hits.idx);
- f = rd->hits.flg;
- hp = NULL;
- p = 0;
- n = 1;
- while(1){
- if(hn->lnks[0].cnt <= rh->lnks[0].cnt){ // hit->mat
- break;
- }
- p = f;
- hp = hn;
- if(hn->lnks[f].idx == 0) break;
- f = hn->lnks[p].flg;
- hn = ref_rdhitv(g->rdhits, hn->lnks[p].idx);
- n ++;
- }
- if(g->bestn && n > g->bestn){
- continue;
- }
- if(hp == NULL){
- rh->lnks[k].idx = rd->hits.idx;
- rh->lnks[k].flg = rd->hits.flg;
- rd->hits.idx = offset_rdhitv(g->rdhits, rh);
- rd->hits.flg = k;
- } else {
- rh->lnks[k].idx = hp->lnks[p].idx;
- rh->lnks[k].flg = hp->lnks[p].flg;
- hp->lnks[p].idx = offset_rdhitv(g->rdhits, rh);
- hp->lnks[p].flg = k;
- }
- rd->hits.cnt ++;
- if(g->bestn && rd->hits.cnt > g->bestn){
- hn = rh;
- p = k;
- while(n < g->bestn){
- f = hn->lnks[p].flg;
- hn = ref_rdhitv(g->rdhits, hn->lnks[p].idx);
- p = f;
- n ++;
- }
- hp = hn;
- f = p;
- while(hn->lnks[f].idx){
- d = hn->lnks[f].flg;
- hn = ref_rdhitv(g->rdhits, hn->lnks[f].idx);
- hn->frgs[f].closed = 1;
- f = d;
- }
- hp->lnks[p].idx = 0;
- hp->lnks[p].flg = 0;
- rd->hits.cnt = n;
- }
- add ++;
- }
- if(add == 0){
- trunc_rdhitv(g->rdhits, 1);
- g->cigars->size -= hit->cglen;
- }
- return add;
- }
- static inline int is_dovetail_overlap(Graph *g, kbm_map_t *hit){
- read_t *q, *t;
- int overhangs[2][2];
- q = ref_readv(g->reads, hit->qidx);
- t = ref_readv(g->reads, hit->tidx);
- overhangs[1][0] = hit->tb - t->clps[0];
- overhangs[1][1] = (t->clps[1] - t->clps[0]) - hit->te;
- if(hit->qdir){
- overhangs[0][0] = (q->clps[1] - q->clps[0]) - hit->qe;
- overhangs[0][1] = hit->qb;
- } else {
- overhangs[0][0] = hit->qb;
- overhangs[0][1] = (q->clps[1] - q->clps[0]) - hit->qe;
- }
- if(overhangs[0][0] > g->max_overhang && overhangs[1][0] > g->max_overhang){
- return 0;
- }
- if(overhangs[0][1] > g->max_overhang && overhangs[1][1] > g->max_overhang){
- return 0;
- }
- return 1;
- }
- // qlen is measured by KBM_BIN_SIZE
- static inline int hit2rdregs_graph(Graph *g, rdregv *regs, int qlen, kbm_map_t *hit, BitsVec *cigars, u4v *maps[3]){
- KBM *kbm;
- u8i ndoff;
- u4i bpos[2][2], npos[2][2], clen, ndbeg, qn, j, qbincnt;
- int tmp, bt, tlen, x, y, mat, beg, end, min_node_len, max_node_len;
- int mask, closed;
- kbm = g->kbm;
- mask = 0;
- if(g->max_overhang >= 0){
- if(!is_dovetail_overlap(g, hit)){
- mask = 1;
- }
- }
- qn = g->reglen;
- min_node_len = (qn - 1);
- max_node_len = (qn + 1);
- // translate into reverse sequence order
- if(qlen == 0){
- qbincnt = kbm->reads->buffer[hit->qidx].bincnt;
- qlen = qbincnt;
- } else if(qlen > kbm->reads->buffer[hit->qidx].bincnt){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- abort();
- } else {
- qbincnt = qlen;
- }
- tlen = kbm->reads->buffer[hit->tidx].bincnt;
- if(hit->qdir){
- tmp = qlen - hit->qb;
- hit->qb = qlen - hit->qe;
- hit->qe = tmp;
- }
- {
- clear_u4v(maps[0]);
- clear_u4v(maps[1]);
- clear_u4v(maps[2]);
- clen = hit->cglen;
- x = -1; y = -1;
- while(clen){
- bt = get_bitsvec(cigars, hit->cgoff + clen - 1);
- push_u4v(maps[2], !(bt >> 2));
- bt = bt & 0x03;
- switch(bt){
- case 0: x ++; y ++; break;
- case 1: x ++; break;
- case 2: y ++; break;
- }
- push_u4v(maps[0], (x < 0)? 0 : x);
- push_u4v(maps[1], (y < 0)? 0 : y);
- clen --;
- }
- push_u4v(maps[0], x + 1);
- push_u4v(maps[1], y + 1);
- push_u4v(maps[2], 0);
- #if DEBUG
- if(x + 1 + hit->qb != hit->qe || y + 1 + hit->tb != hit->te){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- print_hit_kbm(g->kbm, g->kbm->reads->buffer[hit->qidx].tag, g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE, hit, cigars, NULL, stderr);
- abort();
- }
- #endif
- }
- bpos[0][0] = hit->qb;
- bpos[0][1] = hit->qe;
- bpos[1][0] = hit->tb;
- bpos[1][1] = hit->te;
- #if 0
- fprintf(stdout, "BPOS\t%d\t%d\t%d\t%d\n", bpos[0][0], bpos[0][1], bpos[1][0], bpos[1][1]);
- for(j=0;j<maps[0]->size;j++){
- fprintf(stdout, "%d,%d\t", maps[0]->buffer[j], maps[1]->buffer[j]);
- if((j % 10) == 9) fprintf(stdout, "\n");
- }
- fprintf(stdout, "\n");
- #endif
- {
- ndoff = kbm->reads->buffer[hit->qidx].binoff;
- if(hit->qdir){
- ndbeg = qbincnt - bpos[0][0];
- ndbeg = ndbeg % qn;
- ndoff = ndoff + ((qbincnt - (ndbeg + bpos[0][0])) / qn) - 1;
- } else {
- ndbeg = (bpos[0][0] % qn)? qn - (bpos[0][0] % qn) : 0;
- ndoff = ndoff + ((ndbeg + bpos[0][0]) / qn);
- }
- x = 0;
- for(j=ndbeg+bpos[0][0];j+qn<=bpos[0][1];j+=qn){
- mat = 0;
- while(maps[0]->buffer[x] < j - bpos[0][0]){
- x ++;
- }
- npos[0][0] = maps[1]->buffer[x];
- while(maps[0]->buffer[x] == j - bpos[0][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[0][1] = maps[1]->buffer[x];
- while(maps[0]->buffer[x] < j + qn - 1 - bpos[0][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[1][0] = maps[1]->buffer[x];
- while(maps[0]->buffer[x + 1] == j + qn - 1 - bpos[0][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[1][1] = maps[1]->buffer[x];
- // TODO: arbitrarily use outer boundary as matched region, need to test its effect
- // TODO: whether to remove regs outside clps
- if(mat >= g->min_node_mats){
- beg = (npos[0][0] + bpos[1][0]);
- end = (npos[1][1] + bpos[1][0] + 1);
- if(end - beg >= min_node_len && end - beg <= max_node_len){
- closed = 0;
- } else {
- closed = 1;
- }
- #if DEBUG
- if(beg >= end || beg < 0){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- abort();
- }
- #endif
- push_rdregv(regs, (rd_reg_t){ndoff, hit->tidx, hit->qdir, beg, end, mask | closed});
- }
- if(hit->qdir){
- ndoff --;
- } else {
- ndoff ++;
- }
- #if 0
- rd_reg_t *rg = ref_rdregv(regs, regs->size - 1);
- fprintf(stdout, "NPOS\tX=%d,%d\tY=%d,%d\t%d\t%d\t%d\t%d\n", j, j - bpos[0][0], j + qn - 1, j + qn - 1 - bpos[0][0], npos[0][0], npos[0][1], npos[1][0], npos[1][1]);
- fprintf(stdout, "REG\t%llu\t%s\t%c\t%d\t%d\t%d\n", rg->node, kbm->reads->buffer[rg->rid].tag, "+-"[rg->dir], rg->beg, rg->end, rg->end - rg->beg);
- #endif
- }
- }
- //if(!g->corr_mode){
- {
- ndoff = kbm->reads->buffer[hit->tidx].binoff;
- ndbeg = (bpos[1][0] % qn)? qn - (bpos[1][0] % qn): 0;
- ndoff = ndoff + ((ndbeg + bpos[1][0]) / qn);
- x = 0;
- for(j=ndbeg+bpos[1][0];j+qn<=bpos[1][1];j+=qn){
- mat = 0;
- while(maps[1]->buffer[x] < j - bpos[1][0]){
- x ++;
- }
- npos[0][0] = maps[0]->buffer[x];
- while(maps[1]->buffer[x] == j - bpos[1][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[0][1] = maps[0]->buffer[x];
- while(maps[1]->buffer[x] < j + qn - 1 - bpos[1][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[1][0] = maps[0]->buffer[x];
- while(maps[1]->buffer[x + 1] == j + qn - 1 - bpos[1][0]){
- if(maps[2]->buffer[x]) mat ++;
- x ++;
- }
- npos[1][1] = maps[0]->buffer[x];
- // TODO: arbitrarily use outer boundary as matched region, need to test its effect
- if(mat >= g->min_node_mats){
- if(hit->qdir){
- beg = qlen - (npos[1][1] + bpos[0][0] + 1);
- end = qlen - (npos[0][0] + bpos[0][0]);
- } else {
- beg = (npos[0][0] + bpos[0][0]);
- end = (npos[1][1] + bpos[0][0] + 1);
- }
- if(end - beg >= min_node_len && end - beg <= max_node_len){
- closed = 0;
- } else {
- closed = 1;
- }
- #if DEBUG
- if(beg >= end || beg < 0){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- abort();
- }
- #endif
- push_rdregv(regs, (rd_reg_t){ndoff, hit->qidx, hit->qdir, beg, end, mask | closed});
- #if 0
- fprintf(stdout, "NPOS\tX=%d,%d\tY=%d,%d\t%d\t%d\t%d\t%d\n", j, j - bpos[1][0], j + qn - 1, j + qn - 1 - bpos[1][0], npos[0][0], npos[0][1], npos[1][0], npos[1][1]);
- rd_reg_t *rg = ref_rdregv(regs, regs->size - 1);
- fprintf(stdout, "REG\t%llu\t%s\t%c\t%d\t%d\t%d\n", rg->node, kbm->reads->buffer[rg->rid].tag, "+-"[rg->dir], rg->beg, rg->end, rg->end - rg->beg);
- #endif
- }
- ndoff ++;
- }
- }
- if(hit->qdir){
- tmp = qlen - hit->qb;
- hit->qb = qlen - hit->qe;
- hit->qe = tmp;
- }
- return !mask;
- }
- thread_beg_def(mdbg);
- Graph *g;
- KBMAux *aux, *raux;
- CTGCNS *cc;
- reg_t reg;
- rdregv *regs;
- BitVec *rdflags;
- u4i beg, end;
- int raw;
- FILE *alno;
- int task;
- thread_end_def(mdbg);
- thread_beg_func(mdbg);
- Graph *g;
- KBM *kbm;
- KBMAux *aux, *raux;
- rdregv *regs;
- BitVec *rdflags;
- u4v *maps[3], *tidxs;
- volatile reg_t *reg;
- g = mdbg->g;
- kbm = g->kbm;
- reg = (reg_t*)&mdbg->reg;
- aux = mdbg->aux;
- raux = mdbg->raux;
- regs = mdbg->regs;
- rdflags = mdbg->rdflags;
- maps[0] = init_u4v(32);
- maps[1] = init_u4v(32);
- maps[2] = init_u4v(32);
- tidxs = init_u4v(16);
- thread_beg_loop(mdbg);
- if(mdbg->task == 1){
- if(reg->closed) continue;
- //if(g->corr_mode){
- if(0){
- } else {
- query_index_kbm(aux, NULL, reg->rid, kbm->rdseqs, (kbm->reads->buffer[reg->rid].seqoff + UInt(reg->beg)) * KBM_BIN_SIZE, UInt(reg->end - reg->beg) * KBM_BIN_SIZE);
- map_kbm(aux);
- }
- if(raux && aux->hits->size){ // refine
- kbm_read_t *rd;
- u4i i, j, tidx;
- clear_kbm(raux->kbm);
- bitpush_kbm(raux->kbm, NULL, 0, kbm->rdseqs->bits, (kbm->reads->buffer[reg->rid].seqoff + UInt(reg->beg)) * KBM_BIN_SIZE, UInt(reg->end - reg->beg) * KBM_BIN_SIZE);
- ready_kbm(raux->kbm);
- simple_index_kbm(raux->kbm, 0, raux->kbm->bins->size);
- clear_u4v(tidxs);
- sort_array(aux->hits->buffer, aux->hits->size, kbm_map_t, num_cmpgt(a.tidx, b.tidx));
- for(i=0;i<aux->hits->size;i++){
- if(i && tidxs->buffer[tidxs->size - 1] == aux->hits->buffer[i].tidx) continue;
- push_u4v(tidxs, aux->hits->buffer[i].tidx);
- }
- clear_kbmmapv(aux->hits);
- clear_bitsvec(aux->cigars);
- for(i=0;i<tidxs->size;i++){
- tidx = get_u4v(tidxs, i);
- rd = ref_kbmreadv(aux->kbm->reads, tidx);
- query_index_kbm(raux, rd->tag, tidx, aux->kbm->rdseqs, rd->seqoff * KBM_BIN_SIZE, rd->bincnt * KBM_BIN_SIZE);
- map_kbm(raux);
- for(j=0;j<raux->hits->size;j++){
- flip_hit_kbmaux(aux, raux, j);
- }
- }
- }
- sort_array(aux->hits->buffer, aux->hits->size, kbm_map_t, num_cmpgt(b.mat, a.mat));
- }
- thread_end_loop(mdbg);
- free_u4v(maps[0]);
- free_u4v(maps[1]);
- free_u4v(maps[2]);
- free_u4v(tidxs);
- thread_end_func(mdbg);
- typedef struct {
- int pos:19;
- u4i dir:1, spur:1, dep:11;
- } rd_clp_t;
- define_list(rdclpv, rd_clp_t);
- static inline void clip_read_algo(int clps[2], rdclpv *brks, rdclpv *chis, int avg_dep, int min_dep){
- rd_clp_t *c;
- u4i i;
- int dep, x, y, max, mx, my;
- sort_array(brks->buffer, brks->size, rd_clp_t, num_cmpgt((a.pos << 1) | a.dir, (b.pos << 1) | b.dir));
- x = y = 0; max = 0; dep = 0; mx = my = 0;
- for(i=0;i<brks->size;i++){
- c = ref_rdclpv(brks, i);
- if(dep >= min_dep){
- y = c->pos;
- if(y - x > max){
- max = y - x;
- mx = x;
- my = y;
- }
- }
- if(c->dir){ // end of overlap
- c->dep = dep;
- dep --;
- } else {
- dep ++;
- c->dep = dep;
- if(dep == min_dep){
- x = c->pos;
- }
- }
- if(c->spur){
- push_rdclpv(chis, (rd_clp_t){c->pos - KBM_BIN_SIZE, 0, 0, c->dep});
- push_rdclpv(chis, (rd_clp_t){c->pos - 1, 1, 0, c->dep});
- push_rdclpv(chis, (rd_clp_t){c->pos, 0, 1, c->dep});
- push_rdclpv(chis, (rd_clp_t){c->pos + KBM_BIN_SIZE, 1, 0, c->dep});
- }
- }
- clps[0] = mx / KBM_BIN_SIZE;
- clps[1] = my / KBM_BIN_SIZE;
- if((int)chis->size < avg_dep){
- return;
- }
- sort_array(chis->buffer, chis->size, rd_clp_t, num_cmpgtx(a.pos, b.pos, b.dir, a.dir));
- dep = 0; max = 0; mx = 0;
- for(i=0;i<chis->size;i++){
- c = ref_rdclpv(chis, i);
- if(c->dir){
- if(c->spur){
- if(dep >= max){
- max = dep;
- mx = i;
- }
- }
- dep --;
- } else {
- dep ++;
- if(c->spur){
- if(dep >= max){
- max = dep;
- mx = i;
- }
- }
- }
- }
- if(max * 2 < avg_dep){
- return;
- }
- c = ref_rdclpv(chis, mx);
- if(c->dep >= avg_dep){
- return;
- }
- if(2 * c->dep > max + 1){
- return;
- }
- if(c->pos <= clps[0] * KBM_BIN_SIZE || c->pos >= clps[1] * KBM_BIN_SIZE){
- return;
- }
- if(c->pos - clps[0] > clps[1] - c->pos){
- clps[1] = (c->pos + 1) / KBM_BIN_SIZE;
- } else {
- clps[0] = (c->pos + 1) / KBM_BIN_SIZE;
- }
- }
- static inline void clip_read_core(Graph *g, u4i rid, hitlnkv *lnks, rdclpv *brks, rdclpv *chis){
- read_t *rd;
- hit_lnk_t *lnk;
- rd_hit_t *hit;
- rd_frg_t *f1, *f2;
- u4i i;
- int rlen, dlen, margin, min_dep, dep, avg, x, y;
- rd = ref_readv(g->reads, rid);
- rlen = g->kbm->reads->buffer[rid].bincnt;
- if(rlen == 0){
- return;
- }
- lnk = &rd->hits;
- clear_rdclpv(brks);
- margin = g->max_overhang > 0? g->max_overhang : 4;
- min_dep = 2;
- dep = 0;
- clear_hitlnkv(lnks);
- while(lnk->idx){
- hit = ref_rdhitv(g->rdhits, lnk->idx);
- f1 = hit->frgs + lnk->flg;
- if(f1->closed == 0) push_hitlnkv(lnks, *lnk);
- lnk = hit->lnks + lnk->flg;
- }
- for(i=0;i<lnks->size;i++){
- lnk = ref_hitlnkv(lnks, i);
- hit = ref_rdhitv(g->rdhits, lnk->idx);
- f1 = hit->frgs + lnk->flg;
- if(f1->closed == 0){
- // whether be contained
- if(f1->beg <= margin && f1->end + margin >= rlen){
- rd->clps[0] = 0;
- rd->clps[1] = rlen;
- return;
- }
- f2 = hit->frgs + !lnk->flg;
- dlen = g->kbm->reads->buffer[f2->rid].bincnt;
- if(f1->dir ^ f2->dir){
- x = (f1->beg > margin && f2->end + margin < dlen);
- y = (f1->end + margin < rlen && f2->beg > margin);
- } else {
- x = (f1->beg > margin && f2->beg > margin);
- y = (f1->end + margin < rlen && f2->end + margin < dlen);
- }
- if(x && y){
- //push_rdclpv(brks, (rd_clp_t){f1->beg, 0, 1, 0});
- //push_rdclpv(brks, (rd_clp_t){f1->end, 1, 1, 0});
- } else if(x){
- push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 1, 0});
- push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 0, 0});
- } else if(y){
- push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 0, 0});
- push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 1, 0});
- } else {
- dep += f1->end - f1->beg;
- push_rdclpv(brks, (rd_clp_t){f1->beg * KBM_BIN_SIZE, 0, 0, 0});
- push_rdclpv(brks, (rd_clp_t){f1->end * KBM_BIN_SIZE, 1, 0, 0});
- }
- }
- }
- clear_rdclpv(chis);
- avg = (dep + rlen - 1) / rlen;
- clip_read_algo(rd->clps, brks, chis, avg, min_dep);
- }
- thread_beg_def(mclp);
- Graph *g;
- int task;
- thread_end_def(mclp);
- thread_beg_func(mclp);
- Graph *g;
- rdclpv *brks, *chis;
- hitlnkv *lnks;
- u4i rid;
- g = mclp->g;
- brks = init_rdclpv(32);
- chis = init_rdclpv(32);
- lnks = init_hitlnkv(32);
- thread_beg_loop(mclp);
- if(mclp->task == 1){
- for(rid=mclp->t_idx;rid<mclp->g->reads->size;rid+=mclp->n_cpu){
- clip_read_core(mclp->g, rid, lnks, brks, chis);
- }
- } else if(mclp->task == 2){
- hitlnkv *lnks, *chks;
- read_t *rd;
- hit_lnk_t *lnk;
- u4i i;
- lnks = init_hitlnkv(1024);
- chks = init_hitlnkv(1024);
- for(rid=mclp->t_idx;rid<mclp->g->reads->size;rid+=mclp->n_cpu){
- rd = ref_readv(g->reads, rid);
- if(rd->hits.idx == 0) continue;
- clear_hitlnkv(lnks);
- lnk = &rd->hits;
- while(1){
- if(lnk->idx == 0) break;
- push_hitlnkv(lnks, *lnk);
- lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg;
- }
- if(lnks->size < 2) continue;
- sort_array(lnks->buffer, lnks->size, hit_lnk_t, num_cmpgt(g->rdhits->buffer[b.idx].lnks[0].cnt, g->rdhits->buffer[a.idx].lnks[0].cnt));
- lnk = &rd->hits;
- for(i=0;i<lnks->size;i++){
- lnk->idx = lnks->buffer[i].idx;
- lnk->flg = lnks->buffer[i].flg;
- lnk = g->rdhits->buffer[lnks->buffer[i].idx].lnks + lnks->buffer[i].flg;
- }
- lnk->idx = 0;
- lnk->flg = 0;
- {
- clear_hitlnkv(chks);
- lnk = &rd->hits;
- while(1){
- if(lnk->idx == 0) break;
- push_hitlnkv(chks, *lnk);
- lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg;
- }
- if(lnks->size != chks->size){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- abort();
- }
- for(i=0;i<lnks->size;i++){
- if(lnks->buffer[i].idx != chks->buffer[i].idx || lnks->buffer[i].flg != chks->buffer[i].flg){
- fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- abort();
- }
- }
- }
- if(g->bestn && lnks->size > mclp->g->bestn){
- lnk = lnks->buffer + g->bestn - 1;
- lnk = g->rdhits->buffer[lnk->idx].lnks + lnk->flg;
- lnk->idx = 0;
- lnk->flg = 0;
- for(i=g->bestn;i<lnks->size;i++){
- lnk = lnks->buffer + i;
- g->rdhits->buffer[lnk->idx].frgs[lnk->flg].closed = 1;
- }
- rd->hits.cnt = g->bestn;
- }
- }
- free_hitlnkv(lnks);
- free_hitlnkv(chks);
- }
- thread_end_loop(mclp);
- free_rdclpv(brks);
- free_rdclpv(chis);
- free_hitlnkv(lnks);
- thread_end_func(mclp);
- static inline u8i check_read_reg_conflict_core(Graph *g, rd_reg_t *hit, int *conflict){
- read_t *rd;
- reg_t *reg;
- u8i idx, pre;
- int x, y;
- rd = ref_readv(g->reads, hit->rid);
- idx = rd->regs.idx;
- pre = 0;
- *conflict = 0;
- while(idx){
- reg = ref_regv(g->regs, idx);
- if(hit->beg >= reg->beg) pre = idx;
- idx = reg->read_link;
- if(hit->end <= reg->beg) break;
- if(hit->beg >= reg->end) continue;
- if(reg->closed) continue;
- if(hit->beg <= reg->beg){
- x = reg->beg;
- if(hit->end >= reg->end){
- y = reg->end;
- } else y = hit->end;
- } else {
- x = hit->beg;
- if(hit->end <= reg->end){
- y = hit->end;
- } else y = reg->end;
- }
- if(x < y){
- if(x + (int)g->regovl < y) *conflict = 1;
- }
- }
- return pre;
- }
- thread_beg_def(mupd);
- Graph *g;
- rdregv *regs;
- rnk_ref_t *nd;
- u8v *ins;
- u8i vt;
- thread_end_def(mupd);
- thread_beg_func(mupd);
- Graph *g;
- rdregv *regs;
- rd_reg_t *hit;
- u8i pre;
- u4i i;
- int conflict;
- g = mupd->g;
- regs = mupd->regs;
- thread_beg_loop(mupd);
- if(mupd->nd == NULL) continue;
- clear_u8v(mupd->ins);
- if(1){
- for(i=1;i<mupd->nd->cnt;i++){
- if(regs->buffer[mupd->nd->idx + i].rid == regs->buffer[mupd->nd->idx + i - 1].rid){
- regs->buffer[mupd->nd->idx + i].closed = 1;
- regs->buffer[mupd->nd->idx + i - 1].closed = 1;
- }
- }
- }
- for(i=0;i<mupd->nd->cnt;i++){
- hit = ref_rdregv(regs, mupd->nd->idx + i);
- conflict = 0;
- pre = check_read_reg_conflict_core(g, hit, &conflict);
- if(conflict){
- hit->closed = 1;
- }
- push_u8v(mupd->ins, pre);
- }
- thread_end_loop(mupd);
- thread_end_func(mupd);
- //TODO: reg_t to store regs, sort them by rank, and sort them by rid in another u4i/u8i array
- //TODO: fast generate nodes and read_link, then select important intervals
- static inline void mul_update_regs_graph(Graph *g, rdregv *regs, rnkrefv *nds, u4i ncpu, u8i upds[3]){
- u8i idx, vt;
- u4i i, j, pass;
- read_t *rd;
- node_t *n;
- reg_t *reg, *r;
- rd_reg_t *hit;
- int conflict, closed;
- thread_prepare(mupd);
- for(i=0;i<g->reads->size;i++){
- g->reads->buffer[i].visit = 0; // in which round did the read be touched
- }
- encap_regv(g->regs, regs->size + 1); // make sure next_ref_regv in this function is thread-safe
- thread_beg_init(mupd, ncpu);
- mupd->g = g;
- mupd->regs = regs;
- mupd->nd = NULL;
- mupd->ins = init_u8v(64);
- mupd->vt = 0;
- thread_end_init(mupd);
- vt = 1;
- upds[0] = upds[1] = upds[2] = 0;
- for(idx=0;idx<nds->size+ncpu;idx++){
- thread_wait_next(mupd);
- if(mupd->nd){
- pass = 0;
- for(j=0;j<mupd->nd->cnt;j++){
- hit = ref_rdregv(regs, mupd->nd->idx + j);
- rd = ref_readv(g->reads, hit->rid);
- if(rd->visit >= mupd->vt){ // previous node had changed the layout of this read, need to check conflict again
- mupd->ins->buffer[j] = check_read_reg_conflict_core(g, hit, &conflict);
- if(conflict){
- hit->closed = 1;
- }
- }
- rd->visit = idx;
- if(hit->closed == 0) pass ++;
- }
- do {
- closed = 0;
- if(0 && mupd->nd->cnt > g->max_node_cov){
- // Repetitive nodes should be in graph to avoid mis-assembling
- upds[1] ++;
- closed = 1;
- continue;
- }
- if(mupd->nd->fix){
- if(pass == 0){
- upds[1] ++;
- closed = 1;
- continue;
- } else {
- upds[0] ++;
- }
- } else {
- if(pass < g->min_node_cov || pass < (u4i)(mupd->nd->cnt * (1 - g->node_max_conflict))){
- upds[1] ++;
- closed = 1;
- continue;
- } else {
- upds[0] ++;
- }
- }
- n = next_ref_nodev(g->nodes);
- n->rep_idx = MAX_REP_IDX;
- n->unvisit = 0;
- n->closed = 0;
- n->single_in = 0;
- n->bt_visit = 0;
- n->bt_idx = 0;
- n->init_end = 0;
- n->regs.idx = g->regs->size;
- n->regs.cnt = 0;
- n->cov = 0;
- n->edges[0] = PTR_REF_NULL;
- n->edges[1] = PTR_REF_NULL;
- for(j=0;j<mupd->nd->cnt;j++){
- hit = ref_rdregv(regs, mupd->nd->idx + j);
- rd = ref_readv(g->reads, hit->rid);
- n->regs.cnt ++;
- reg = next_ref_regv(g->regs); // Now, it is thread-safe
- reg->node = g->nodes->size - 1;
- reg->rid = hit->rid;
- reg->dir = hit->dir;
- reg->beg = hit->beg;
- reg->end = hit->end;
- reg->closed = hit->closed | closed;
- if(reg->closed == 0) n->cov ++;
- reg->read_link = 0;
- if(mupd->ins->buffer[j]){
- r = ref_regv(g->regs, mupd->ins->buffer[j]);
- reg->read_link = r->read_link;
- r->read_link = g->regs->size - 1;
- } else {
- reg->read_link = rd->regs.idx;
- rd->regs.idx = g->regs->size - 1;
- }
- rd->regs.cnt ++;
- }
- } while(0);
- }
- if(idx < nds->size){
- mupd->nd = ref_rnkrefv(nds, idx);
- mupd->vt = idx;
- thread_wake(mupd);
- } else {
- mupd->nd = NULL;
- }
- }
- thread_beg_close(mupd);
- free_u8v(mupd->ins);
- thread_end_close(mupd);
- }
- static inline u8i chainning_hits_core(kbmmapv *hits, BitsVec *cigars, int uniq_hit, float aln_var){
- kbm_map_t HIT;
- u8i idx, bst, ite, lst, lsx, nrm;
- u4i state;
- sort_array(hits->buffer, hits->size, kbm_map_t, num_cmpgtxx(a.qidx, b.qidx, a.tidx, b.tidx, a.qdir, b.qdir));
- nrm = 0;
- for(idx=lst=lsx=0;idx<=hits->size;idx++){
- state = 0;
- if(idx == hits->size){
- state = 1;
- } else if(hits->buffer[lst].qidx == hits->buffer[idx].qidx && hits->buffer[lst].tidx == hits->buffer[idx].tidx){
- if(hits->buffer[lst].qdir == hits->buffer[idx].qdir){
- state = 0;
- } else {
- state = 2;
- }
- } else {
- state = 1;
- }
- if(state){
- if(idx > lst + 1){
- if(simple_chain_all_maps_kbm(hits->buffer + lst, idx - lst, cigars, &HIT, cigars, aln_var)){
- hits->buffer[lst++] = HIT;
- while(lst < idx){
- hits->buffer[lst++].mat = 0; // closed = 1
- nrm ++;
- }
- }
- }
- if(state == 1){
- // Choose the best hit
- if(uniq_hit && idx > lsx + 1){
- bst = lsx;
- for(bst=lsx;bst<idx;bst++){
- if(hits->buffer[bst].mat) break;
- }
- for(ite=bst+1;ite<idx;ite++){
- if(hits->buffer[ite].mat == 0) continue;
- if((hits->buffer[ite].aln > hits->buffer[bst].aln) || (hits->buffer[ite].aln == hits->buffer[bst].aln && hits->buffer[ite].mat > hits->buffer[bst].mat)){
- bst = ite;
- }
- }
- for(ite=lsx;ite<bst;ite++){
- hits->buffer[ite].mat = 0;
- }
- for(ite=bst+1;ite<idx;ite++){
- hits->buffer[ite].mat = 0;
- }
- nrm += idx - lsx - 1;
- }
- lsx = idx;
- }
- lst = idx;
- }
- }
- return nrm;
- }
- static inline u8i load_alignments_core(Graph *g, FileReader *pws, int raw, rdregv *regs, u4v *maps[3]){
- kbmmapv *hits;
- BitsVec *cigars;
- u4v *cgs;
- kbm_map_t *hit, HIT, *h;
- cuhash_t *cu;
- u8i nhit;
- u4i i, rid;
- int qlen, val, flg, nwarn, mwarn, ncol;
- char *cgstr, *qtag;
- mwarn = 20;
- nwarn = 0;
- cgs = init_u4v(4);
- hits = init_kbmmapv(32);
- cigars = g->chainning_hits? init_bitsvec(1024, 3) : g->cigars;
- memset(&HIT, 0, sizeof(kbm_map_t));
- hit = &HIT;
- nhit = 0;
- while((ncol = readtable_filereader(pws)) != -1){
- if((pws->n_line % 100000) == 0){
- fprintf(KBM_LOGF, "\r%llu", pws->n_line); fflush(KBM_LOGF);
- }
- if(pws->line->buffer[0] == '#'){
- //if(strncasecmp(pws->line->buffer, "#corr_mode=1", 12) == 0){
- //g->corr_mode = 1;
- //}
- continue;
- }
- if(ncol < 15) continue;
- if((cu = get_cuhash(g->kbm->tag2idx, get_col_str(pws, 0))) == NULL){
- if(nwarn < mwarn){
- fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(pws, 0), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- nwarn ++;
- }
- continue;
- } else rid = cu->val;
- hit->qidx = rid;
- qtag = get_col_str(pws, 0);
- hit->qdir = (get_col_str(pws, 1)[0] == '-');
- qlen = atoi(get_col_str(pws, 2));
- //if(g->corr_mode){
- if(0){
- //g->reads->buffer[hit->qidx].corr_bincnt = qlen / KBM_BIN_SIZE;
- } else if(qlen != (int)g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE){
- if(nwarn < mwarn){
- fprintf(stderr, " -- inconsisitent read length \"%s\" %d != %d in %s -- %s:%d --\n", qtag, qlen, g->kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- nwarn ++;
- }
- }
- hit->qb = atoi(get_col_str(pws, 3)) / KBM_BIN_SIZE;
- hit->qe = atoi(get_col_str(pws, 4)) / KBM_BIN_SIZE;
- if((cu = get_cuhash(g->kbm->tag2idx, get_col_str(pws, 5))) == NULL){
- if(nwarn < mwarn){
- fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(pws, 5), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- nwarn ++;
- }
- continue;
- } else rid = cu->val;
- if(hit->qidx >= rid) continue;
- hit->tidx = rid;
- hit->tdir = 0; // col 6 always eq '+'
- // skil col 7
- hit->tb = atoi(get_col_str(pws, 8)) / KBM_BIN_SIZE;
- hit->te = atoi(get_col_str(pws, 9)) / KBM_BIN_SIZE;
- // skip col 10-13
- hit->mat = atoi(get_col_str(pws, 10));
- if(hit->mat < g->par->min_mat) continue;
- hit->aln = atoi(get_col_str(pws, 11)) / KBM_BIN_SIZE;
- if(hit->aln < g->par->min_aln) continue;
- if(hit->mat < hit->aln * KBM_BIN_SIZE * g->par->min_sim) continue;
- if(num_diff(hit->qe - hit->qb, hit->te - hit->tb) > (int)num_max(g->par->aln_var * hit->aln, 1.0)) continue;
- hit->cnt = atoi(get_col_str(pws, 12));
- hit->gap = atoi(get_col_str(pws, 13));
- if(g->chainning_hits){
- if(hits->size && peer_kbmmapv(hits)->qidx != hit->qidx){
- chainning_hits_core(hits, cigars, g->uniq_hit, g->kbm->par->aln_var);
- for(i=0;i<hits->size;i++){
- h = ref_kbmmapv(hits, i);
- if(h->mat == 0) continue;
- append_bitsvec(g->cigars, cigars, h->cgoff, h->cglen);
- h->cgoff = g->cigars->size - h->cglen;
- nhit ++;
- map2rdhits_graph(g, h);
- }
- clear_kbmmapv(hits);
- clear_bitsvec(cigars);
- }
- }
- hit->cgoff = cigars->size;
- clear_u4v(cgs);
- cgstr = get_col_str(pws, 14);
- if(cgstr[0] == '*'){ // no cigar
- continue;
- }
- val = 0;
- while(cgstr[0]){
- if(cgstr[0] >= '0' && cgstr[0] <= '9'){
- val = val * 10 + (cgstr[0] - '0');
- } else {
- flg = -1;
- switch(cgstr[0]){
- case 'M': flg = 0; break;
- case 'I': flg = 1; break;
- case 'D': flg = 2; break;
- case 'm': flg = 4; break;
- case 'i': flg = 5; break;
- case 'd': flg = 6; break;
- default:
- fprintf(stderr, " -- Bad cigar '%c' \"%s\" in LINE:%llu in %s -- %s:%d --\n", cgstr[0], get_col_str(pws, 14), pws->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- exit(1);
- }
- if(val == 0) val = 1;
- push_u4v(cgs, (val << 8) | flg);
- val = 0;
- }
- cgstr ++;
- }
- for(i=cgs->size;i>0;i--){
- val = cgs->buffer[i - 1] >> 8;
- flg = cgs->buffer[i - 1] & 0xFF;
- pushs_bitsvec(cigars, flg, val);
- }
- hit->cglen = cigars->size - hit->cgoff;
- if(raw){
- hit2rdregs_graph(g, regs, qlen / KBM_BIN_SIZE, hit, cigars, maps);
- clear_bitsvec(cigars);
- } else if(g->chainning_hits){
- push_kbmmapv(hits, *hit);
- } else {
- map2rdhits_graph(g, hit);
- }
- }
- if(g->chainning_hits){
- if(hits->size){
- chainning_hits_core(hits, cigars, g->uniq_hit, g->kbm->par->aln_var);
- for(i=0;i<hits->size;i++){
- h = ref_kbmmapv(hits, i);
- if(h->mat == 0) continue;
- append_bitsvec(g->cigars, cigars, h->cgoff, h->cglen);
- h->cgoff = g->cigars->size - h->cglen;
- nhit ++;
- map2rdhits_graph(g, h);
- }
- clear_kbmmapv(hits);
- clear_bitsvec(cigars);
- }
- }
- fprintf(KBM_LOGF, "\r%llu lines, %llu hits\n", pws->n_line, nhit);
- print_proc_stat_info(g_mpi_comm_rank); //每个分布节点比对完成后消耗时间 ningwenmo@outlook.com
- free_kbmmapv(hits);
- if(g->chainning_hits) free_bitsvec(cigars);
- free_u4v(cgs);
- return nhit;
- }
- static inline u8i proc_alignments_core(Graph *g, int ncpu, int raw, rdregv *regs, u4v *maps[3], char *prefix, char *dump_kbm){
- kbm_map_t *hit;
- kbm_read_t *pb;
- BitVec *rdflags;
- BufferedWriter *bw;
- FILE *alno, *kmlog;
- u8i nbp, mbp, nhit;
- u8i i, ib, ie, ic;
- u4i rid, qb, qe, ii, in;
- int reset_kbm, n_cpu;
- thread_prepare(mdbg);
- if(KBM_LOG) n_cpu = 1;
- else n_cpu = ncpu;
- ic = (g->kbm->bins->size + g->num_index - 1) / g->num_index;
- ie = 0;
- UNUSED(mbp);
- #if 0
- if(g->corr_mode){
- mbp = g->genome_size * g->corr_gcov;
- qb = qe = g->kbm->reads->size / 2;
- nbp = g->kbm->reads->buffer[qb].bincnt * KBM_BIN_SIZE;
- while(nbp < mbp && qb && qe + 1 < g->kbm->reads->size){
- qb --;
- qe ++;
- nbp += g->kbm->reads->buffer[qb].bincnt * KBM_BIN_SIZE;
- nbp += g->kbm->reads->buffer[qe].bincnt * KBM_BIN_SIZE;
- }
- if(qe < g->kbm->reads->size) qe ++;
- fprintf(KBM_LOGF, "[%s] turn correct-mode on, reads[%u ~ %u = %u] (%llu bp), genome-size=%llu, corr-gcov=%0.2f, corr-dep=[%d,%d,%0.2f]\n", date(), qb, qe, qe - qb, nbp, g->genome_size, g->corr_gcov, g->corr_min, g->corr_max, g->corr_cov); fflush(KBM_LOGF);
- } else {
- #else
- {
- #endif
- qb = 0;
- qe = g->reads->size;
- }
- // alno = open_file_for_write(prefix, ".alignments.gz", 1);
- // 每个分布节点输出相应计算的alignments ningwenmo@outlook.com
- char* alignments_file_name[64];
- sprintf(alignments_file_name, ".%u_%u.alignments", g_mpi_comm_rank, g_mpi_comm_size);
- alno = open_file_for_write(prefix, alignments_file_name, 1);
- // bw = zopen_bufferedwriter(alno, 1024 * 1024, ncpu, 0);
- #if 0
- if(g->corr_mode){
- beg_bufferedwriter(bw);
- fprintf(bw->out, "#corr_mode=1\n");
- end_bufferedwriter(bw);
- }
- rdflags = (!g->corr_mode && g->par->skip_contained)? init_bitvec(g->kbm->reads->size) : NULL;
- in = g->corr_mode? 1 : g->num_index;
- #else
- rdflags = (g->par->skip_contained)? init_bitvec(g->kbm->reads->size) : NULL;
- in = g->num_index;
- #endif
- if(g->kbm->seeds->size){
- reset_kbm = 0;
- if(in > 1){
- fprintf(KBM_LOGF, " ** WARNNING: change number of kbm index to 1 **\n"); fflush(KBM_LOGF);
- in = 1;
- }
- } else {
- reset_kbm = 1;
- }
- //fix_node = 0;
- nhit = 0;
- for(ii=0;ii<in;ii++){
- ib = ie;
- ie = num_min(ib + ic, g->kbm->bins->size);
- while(ie > 0 && ie < g->kbm->bins->size && g->kbm->bins->buffer[ie - 1].ridx == g->kbm->bins->buffer[ie].ridx) ie ++;
- qb = ib ? g->kbm->bins->buffer[ib - 1].ridx : 0;
- qe = ie ? g->kbm->bins->buffer[ie - 1].ridx : 0;
- nbp = ((u8i)(ie - ib)) * KBM_BIN_SIZE;
- //每个分布节点计算分别计算相应数据 ningwenmo@outlook.com
- if (g_mpi_comm_size >= 2 && ii != (g_mpi_comm_rank - 1))
- continue;
- if(reset_kbm){
- reset_index_kbm(g->kbm);
- fprintf(KBM_LOGF, "[%s] indexing bins[(%llu,%llu)/%llu] (%llu/%llu bp), %d threads\n", date(), ib, ie, (u8i)g->kbm->bins->size, nbp, (u8i)g->kbm->rdseqs->size, ncpu); fflush(KBM_LOGF);
- kmlog = (in > 1)? NULL : open_file_for_write(prefix, ".kmerdep", 1);
- index_kbm(g->kbm, ib, ie, ncpu, kmlog);
- if(kmlog){
- fclose(kmlog);
- kmlog = NULL;
- }
- fprintf(KBM_LOGF, "[%s] Done\n", date()); fflush(KBM_LOGF);
- if(in == 1 && dump_kbm){
- FILE *dump;
- fprintf(KBM_LOGF, "[%s] dump kbm-index to %s ...", date(), dump_kbm); fflush(KBM_LOGF);
- dump = open_file_for_write(dump_kbm, NULL, 1);
- mem_dump_obj_file(g->kbm, 1, &kbm_obj_desc, 1, 0, dump);
- fclose(dump);
- fprintf(KBM_LOGF, " Done\n"); fflush(KBM_LOGF);
- }
- if(in == 1){
- u4i *deps;
- u8i hidx;
- kmlog = open_file_for_write(prefix, ".binkmer", 1);
- deps = calloc(KBM_BIN_SIZE + 1, 4);
- for(hidx=0;hidx<g->kbm->bins->size;hidx++){
- deps[g->kbm->bins->buffer[hidx].degree] ++;
- }
- for(hidx=0;hidx<KBM_BIN_SIZE;hidx++){
- fprintf(kmlog, "%u\n", deps[hidx]);
- }
- fclose(kmlog);
- free(deps);
- if(!g->minimal_output && 0){
- kbm_bin_t *bn;
- kmlog = open_file_for_write(prefix, ".closed_bins", 1);
- for(hidx=0;hidx<g->kbm->bins->size;hidx++){
- bn = ref_kbmbinv(g->kbm->bins, hidx);
- if(bn->closed){
- fprintf(kmlog, "%s_F_%d_%d\t%d\n", g->kbm->reads->buffer[bn->ridx].tag, bn->off * KBM_BIN_SIZE, KBM_BIN_SIZE, bn->degree);
- }
- }
- fclose(kmlog);
- }
- }
- }
- {
- {
- thread_beg_init(mdbg, n_cpu);
- mdbg->g = g;
- memset((void*)&mdbg->reg, 0, sizeof(reg_t));
- mdbg->reg.closed = 1;
- mdbg->aux = init_kbmaux(g->kbm);
- if(g->rpar){
- mdbg->raux = init_kbmaux(init_kbm(g->rpar));
- } else {
- mdbg->raux = NULL;
- }
- #if 0
- if(g->corr_mode){
- KBMBlock *kb;
- POGPar par;
- kb = init_kbmblock(g->corr_bsize, g->corr_bstep);
- //mdbg->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, 1, g->corr_max, 200, 100, 1, 96, 2, -5, -2, -4, -1, 16, 3, 0.5, g->corr_bsize - g->corr_bstep + KBM_BIN_SIZE);
- par = DEFAULT_POG_PAR;
- par.refmode = 1;
- mdbg->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, g->corr_max, 200, 100, 1, g->corr_bsize - g->corr_bstep + KBM_BIN_SIZE, &par);
- } else {
- #else
- {
- #endif
- mdbg->cc = NULL;
- }
- mdbg->aux->par = (KBMPar*)malloc(sizeof(KBMPar));
- memcpy(mdbg->aux->par, g->par, sizeof(KBMPar));
- mdbg->regs = regs;
- mdbg->rdflags = rdflags;
- mdbg->beg = 0;
- mdbg->end = 0;
- mdbg->raw = raw;
- mdbg->alno = alno;
- thread_end_init(mdbg);
- }
- thread_beg_iter(mdbg);
- mdbg->task = 1;
- thread_end_iter(mdbg);
- for(rid=qb;rid<=qe+ncpu;rid++){
- if(rid < qe){
- if(!KBM_LOG && ((rid - qb) % 2000) == 0){ fprintf(KBM_LOGF, "\r%u|%llu", rid - qb, nhit); fflush(KBM_LOGF); }
- thread_wait_one(mdbg);
- } else {
- thread_wait_next(mdbg);
- pb = NULL;
- }
- if(mdbg->reg.closed == 0){
- KBMAux *aux = mdbg->aux;
- //if(g->corr_mode && mdbg->cc->cns->size){
- //g->reads->buffer[mdbg->reg.rid].corr_bincnt = mdbg->cc->cns->size / KBM_BIN_SIZE;
- //}
- if(alno){
- // beg_bufferedwriter(bw);
- //if(g->corr_mode && mdbg->aux->cns->size){
- //fprintf(bw->out, "#corrected\t%s\t%u\t", mdbg->cc->tag->string, (u4i)mdbg->cc->cns->size);
- //println_fwdseq_basebank(mdbg->cc->cns, 0, mdbg->cc->cns->size, bw->out);
- //}
- // for(i=0;i<mdbg->aux->hits->size;i++){
- // hit = ref_kbmmapv(mdbg->aux->hits, i);
- // fprint_hit_kbm(mdbg->aux, i, bw->out);
- // }
- // end_bufferedwriter(bw);
- for (i = 0; i < mdbg->aux->hits->size; i++)
- {
- hit = ref_kbmmapv(mdbg->aux->hits, i);
- fprint_hit_kbm(mdbg->aux, i, alno);
- }
- }
- for(i=0;i<mdbg->aux->hits->size;i++){
- hit = ref_kbmmapv(mdbg->aux->hits, i);
- if(hit->mat == 0) continue;
- if(rdflags
- && g->kbm->reads->buffer[hit->tidx].bincnt < g->kbm->reads->buffer[hit->qidx].bincnt
- && (hit->tb <= 1 && hit->te + 1 >= (int)(g->kbm->reads->buffer[hit->tidx].bincnt))
- && (hit->qb > 1 || hit->qe + 1 < (int)(g->kbm->reads->buffer[hit->qidx].bincnt))
- ){
- one_bitvec(rdflags, hit->tidx);
- }
- }
- if(g->chainning_hits){
- chainning_hits_core(aux->hits, aux->cigars, g->uniq_hit, g->kbm->par->aln_var);
- }
- for(i=0;i<aux->hits->size;i++){
- hit = ref_kbmmapv(aux->hits, i);
- if(hit->mat == 0) continue;
- //hit->qb /= KBM_BIN_SIZE;
- //hit->qe /= KBM_BIN_SIZE;
- //hit->tb /= KBM_BIN_SIZE;
- //hit->te /= KBM_BIN_SIZE;
- //hit->aln /= KBM_BIN_SIZE;
- nhit ++;
- append_bitsvec(g->cigars, aux->cigars, hit->cgoff, hit->cglen);
- hit->cgoff = g->cigars->size - hit->cglen;
- if(raw){
- //hit2rdregs_graph(g, regs, g->corr_mode? mdbg->cc->cns->size / KBM_BIN_SIZE : 0, hit, mdbg->aux->cigars, maps);
- hit2rdregs_graph(g, regs, 0, hit, mdbg->aux->cigars, maps);
- } else {
- map2rdhits_graph(g, hit);
- }
- }
- if(KBM_LOG){
- fprintf(KBM_LOGF, "QUERY: %s\t+\t%d\t%d\n", g->kbm->reads->buffer[mdbg->reg.rid].tag, mdbg->reg.beg, mdbg->reg.end);
- for(i=0;i<mdbg->aux->hits->size;i++){
- hit = ref_kbmmapv(mdbg->aux->hits, i);
- fprintf(KBM_LOGF, "\t%s\t%c\t%d\t%d\t%d\t%d\t%d\n", g->kbm->reads->buffer[hit->tidx].tag, "+-"[hit->qdir], g->kbm->reads->buffer[hit->tidx].bincnt * KBM_BIN_SIZE, hit->tb * KBM_BIN_SIZE, hit->te * KBM_BIN_SIZE, hit->aln * KBM_BIN_SIZE, hit->mat);
- }
- }
- mdbg->reg.closed = 1;
- }
- if(rid < qe && (rdflags == NULL || get_bitvec(rdflags, rid) == 0)){
- pb = ref_kbmreadv(g->kbm->reads, rid);
- mdbg->reg = (reg_t){0, rid, 0, 0, pb->bincnt, 0, 0};
- thread_wake(mdbg);
- }
- }
- {
- thread_beg_close(mdbg);
- free(mdbg->aux->par);
- free_kbmaux(mdbg->aux);
- //if(g->corr_mode){
- //free_kbmblock((KBMBlock*)mdbg->cc->obj);
- //free_ctgcns(mdbg->cc);
- //}
- if(mdbg->raux){
- free_kbm(mdbg->raux->kbm);
- free_kbmaux(mdbg->raux);
- }
- thread_end_close(mdbg);
- }
- }
- if(!KBM_LOG) fprintf(KBM_LOGF, "\r%u reads|total hits %llu\n", qe - qb, nhit);
- if(reset_kbm){
- reset_index_kbm(g->kbm);
- }
- print_proc_stat_info(g_mpi_comm_rank);
- }
- // if(bw) close_bufferedwriter(bw);
- if(alno) fclose(alno);
- if(rdflags) free_bitvec(rdflags);
- return nhit;
- }
- static inline void build_nodes_graph(Graph *g, u8i maxbp, int ncpu, FileReader *pws, int rdclip, char *prefix, char *dump_kbm){
- kbm_map_t *hit, HIT;
- BitVec *rks;
- u4v *maps[3];
- FILE *clplog;
- u8i idx, rank, kcnts[256], upds[3], fix_node, nhit;
- u4i rid, i, dep;
- int raw;
- rdregv *regs;
- rnkrefv *nds;
- rnk_ref_t *nd;
- thread_preprocess(mclp);
- regs = init_rdregv(1024);
- nds = init_rnkrefv(1024);
- renew_rdhitv(g->rdhits, 1024);
- maps[0] = init_u4v(4);
- maps[1] = init_u4v(4);
- maps[2] = init_u4v(4);
- clear_regv(g->regs);
- next_ref_regv(g->regs);
- clear_rdhitv(g->rdhits);
- ZEROS(next_ref_rdhitv(g->rdhits));
- //clear_kbmmapv(g->pwalns);
- clear_bitsvec(g->cigars);
- raw = !((g->bestn > 0) || rdclip);
- //if(g->corr_mode){
- //raw = 1;
- //}
- fix_node = 0; // TODO: new code hasn't coped with contigs+longreads mode
- if(pws){
- nhit = load_alignments_core(g, pws, raw, regs, maps);
- } else {
- UNUSED(maxbp);
- nhit = proc_alignments_core(g, ncpu, raw, regs, maps, prefix, dump_kbm);
- }
- // 每个节点分布计算完成后返回 ningwenmo@outlook.com
- if (g_mpi_comm_size >= 2 && g_mpi_comm_rank != 0){
- // int message = 0;
- // MPI_Send(&message, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
- // fprintf(KBM_LOGF, "[%s] send %u/%u\n", date(), g_mpi_comm_rank, g_mpi_comm_size);
- // fflush(KBM_LOGF);
- return;
- }
- // print_proc_stat_info(g_mpi_comm_rank);
- if(raw){
- fprintf(KBM_LOGF, "[%s] generated %llu regs\n", date(), (u8i)regs->size);
- } else {
- #ifdef KBM_MAP2RDHIT_QUICK
- // sort rdhits and pick bestn
- {
- fprintf(KBM_LOGF, "[%s] sorting rdhits ... ", date()); fflush(KBM_LOGF);
- thread_beg_init(mclp, ncpu);
- mclp->g = g;
- mclp->task = 2;
- thread_end_init(mclp);
- thread_wake_all(mclp);
- thread_beg_close(mclp);
- thread_end_close(mclp);
- fprintf(KBM_LOGF, "Done\n"); fflush(KBM_LOGF);
- }
- #endif
- // clip reads
- if(rdclip){
- fprintf(KBM_LOGF, "[%s] clipping ... ", date()); fflush(KBM_LOGF);
- clplog = open_file_for_write(prefix, ".clps", 1);
- thread_beg_init(mclp, ncpu);
- mclp->g = g;
- mclp->task = 1;
- thread_end_init(mclp);
- thread_wake_all(mclp);
- thread_beg_close(mclp);
- thread_end_close(mclp);
- u8i tot, clp;
- tot = clp = 0;
- for(rid=0;rid<g->reads->size;rid++){
- tot += g->kbm->reads->buffer[rid].bincnt * KBM_BIN_SIZE;
- clp += (g->reads->buffer[rid].clps[1] - g->reads->buffer[rid].clps[0]) * KBM_BIN_SIZE;
- fprintf(clplog, "%s\t%d\t%d\t%d\n", g->kbm->reads->buffer[rid].tag, g->kbm->reads->buffer[rid].bincnt * KBM_BIN_SIZE, g->reads->buffer[rid].clps[0] * KBM_BIN_SIZE, g->reads->buffer[rid].clps[1] * KBM_BIN_SIZE);
- }
- fclose(clplog);
- fprintf(KBM_LOGF, "%.2f%% bases\n", ((tot - clp) * 100.0) / tot); fflush(KBM_LOGF);
- }
- fprintf(KBM_LOGF, "[%s] generating regs ... ", date()); fflush(KBM_LOGF);
- rd_hit_t *rh;
- u8i cgoff;
- cgoff = 0;
- hit = &HIT;
- hit->cnt = 0;
- hit->gap = 0;
- for(idx=0;idx<g->rdhits->size;idx++){
- rh = ref_rdhitv(g->rdhits, idx);
- hit->cgoff = cgoff;
- hit->cglen = rh->lnks[1].cnt;
- cgoff += hit->cglen;
- if(rh->frgs[0].closed && rh->frgs[1].closed){
- continue;
- }
- hit->qidx = rh->frgs[0].rid;
- hit->tidx = rh->frgs[1].rid;
- hit->qdir = rh->frgs[0].dir;
- hit->tdir = rh->frgs[1].dir;
- hit->qb = rh->frgs[0].beg;
- hit->qe = rh->frgs[0].end;
- hit->tb = rh->frgs[1].beg;
- hit->te = rh->frgs[1].end;
- hit->mat = rh->lnks[0].cnt;
- hit->aln = num_min(hit->qe - hit->qb, hit->te - hit->tb);
- //hit2rdregs_graph(g, regs, g->reads->buffer[hit->qidx].corr_bincnt, hit, g->cigars, maps);
- hit2rdregs_graph(g, regs, g->kbm->reads->buffer[hit->qidx].bincnt, hit, g->cigars, maps);
- }
- free_bitsvec(g->cigars); g->cigars = init_bitsvec(1024, 3);
- fprintf(KBM_LOGF, "%llu\n", (u8i)regs->size); fflush(KBM_LOGF);
- }
- free_u4v(maps[0]);
- free_u4v(maps[1]);
- free_u4v(maps[2]);
- // add node itself
- //if(!g->corr_mode){
- {
- rks = init_bitvec(g->kbm->bins->size);
- for(idx=0;idx<regs->size;idx++){
- one_bitvec(rks, regs->buffer[idx].node);
- }
- for(idx=0;idx<g->kbm->bins->size;idx++){
- if(get_bitvec(rks, idx)){
- rid = g->kbm->bins->buffer[idx].ridx;
- kbm_read_t *rd = ref_kbmreadv(g->kbm->reads, rid);
- push_rdregv(regs, (rd_reg_t){idx, rid, 0, (idx - rd->binoff) * g->reglen, (idx + 1 - rd->binoff) * g->reglen, 0});
- }
- }
- free_bitvec(rks);
- }
- // generating nodes
- fprintf(KBM_LOGF, "[%s] sorting regs ... ", date()); fflush(KBM_LOGF);
- psort_array(regs->buffer, regs->size, rd_reg_t, ncpu, num_cmpgtxx((a.node << 30) | a.rid, (b.node << 30) | b.rid, a.beg, b.beg, b.end, a.end));
- fprintf(KBM_LOGF, " Done\n"); fflush(KBM_LOGF);
- u4v *kbcnts;
- kbcnts = init_u4v(1024);
- rank = 0xFFFFFFFFFFFFFFFFLLU;
- nd = NULL;
- fprintf(KBM_LOGF, "[%s] generating intervals ... ", date()); fflush(KBM_LOGF);
- for(idx=0;idx<regs->size;idx++){
- if(0){
- kbm_read_t *rd1, *rd2;
- rd_reg_t *r;
- r = ref_rdregv(regs, idx);
- rd1 = ref_kbmreadv(g->kbm->reads, g->kbm->bins->buffer[r->node].ridx);
- rd2 = ref_kbmreadv(g->kbm->reads, r->rid);
- fprintf(stdout, "%s\t%c\t%d\t%d\t%s\t%c\t%d\t%d\n", rd1->tag, '+', (int)((r->node) - rd1->binoff) * g->reglen,
- (int)(r->node - rd1->binoff + 1) * g->reglen,
- rd2->tag, "+-"[r->dir], r->beg, r->end);
- }
- if(regs->buffer[idx].node != rank){
- if(nd){
- while(nd->cnt >= kbcnts->size) push_u4v(kbcnts, 0);
- kbcnts->buffer[nd->cnt] ++;
- }
- nd = next_ref_rnkrefv(nds);
- nd->idx = idx;
- nd->rank = regs->buffer[idx].node;
- nd->fix = regs->buffer[idx].node < fix_node;
- nd->cnt = 1;
- //nd->score = (regs->buffer[idx].beg >= g->reads->buffer[regs->buffer[idx].rid].clps[0] && regs->buffer[idx].end <= g->reads->buffer[regs->buffer[idx].rid].clps[1]);
- rank = regs->buffer[idx].node;
- } else {
- nd->cnt ++;
- //nd->score += (regs->buffer[idx].beg >= g->reads->buffer[regs->buffer[idx].rid].clps[0] && regs->buffer[idx].end <= g->reads->buffer[regs->buffer[idx].rid].clps[1]);
- }
- }
- if(nd){
- while(nd->cnt >= kbcnts->size) push_u4v(kbcnts, 0);
- kbcnts->buffer[nd->cnt] ++;
- }
- // find medean k-bin depth
- u4i gcov;
- for(gcov=0,rank=0;gcov<kbcnts->size&&rank<nds->size;gcov++){
- rank += kbcnts->buffer[gcov];
- }
- //psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(b.cnt, a.cnt, a.rank, b.rank));
- psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(num_diff(a.cnt, gcov), num_diff(b.cnt, gcov), a.rank, b.rank));
- fprintf(KBM_LOGF, " %llu intervals\n", (u8i)nds->size); fflush(KBM_LOGF);
- //psort_array(nds->buffer, nds->size, rnk_ref_t, ncpu, num_cmpgtx(b.score, a.score, a.rank, b.rank));
- if(0){
- memset(kcnts, 0, sizeof(uint64_t) * 256);
- for(idx=0;idx<nds->size;idx++){
- dep = num_min(nds->buffer[idx].cnt, 255);
- kcnts[dep] ++;
- }
- for(i=1;i<51;i++){
- fprintf(KBM_LOGF, "%10llu", (long long unsigned int)kcnts[i]);
- if(((i - 1) % 10) == 9) fprintf(KBM_LOGF, "\n");
- }
- if(((i - 1) % 10) != 0) fprintf(KBM_LOGF, "\n");
- }
- fprintf(KBM_LOGF, "[%s] selecting important intervals from %llu intervals\n", date(), (u8i)nds->size);
- mul_update_regs_graph(g, regs, nds, ncpu, upds);
- free_rdregv(regs);
- free_rnkrefv(nds);
- encap_regv(g->regs, 1);
- memset(g->regs->buffer + g->regs->size, 0xFF, sizeof(reg_t));
- if(!KBM_LOG) fprintf(KBM_LOGF, "[%s] Intervals: kept %llu, discarded %llu\n", date(), upds[0], upds[1]);
- print_proc_stat_info(g_mpi_comm_rank);
- }
- uint32_t estimate_genome_size(Graph *g, unsigned long long tot_bp, FILE *out){
- uint64_t kcnts[256];
- node_t *n;
- uint64_t nid, sum, ncnt, pmax;
- uint32_t i, dep, peak, mid;
- float avg;
- ncnt = g->nodes->size;
- memset(kcnts, 0, sizeof(uint64_t) * 256);
- sum = 0;
- for(nid=0;nid<ncnt;nid++){
- n = ref_nodev(g->nodes, nid);
- dep = num_min(n->regs.cnt, 255);
- sum += n->regs.cnt;
- kcnts[dep] ++;
- }
- mid = pmax = 0;
- while(mid < 255){
- pmax += kcnts[mid];
- if(pmax >= ncnt / 2) break;
- mid ++;
- }
- avg = 1.0 * sum / (ncnt + 1);
- fprintf(out, "[%s] median node depth = %d\n", date(), mid);
- return mid;
- //TODO: calculate the genome coverage
- for(i=1;i<51;i++){
- fprintf(out, "%10llu", (long long unsigned int)kcnts[i]);
- if(((i - 1) % 10) == 9) fprintf(out, "\n");
- }
- if(((i - 1) % 10) != 0) fprintf(out, "\n");
- return avg;
- pmax = 0; peak = avg;
- for(i=g->min_node_cov+1;i<256;i++){
- if(kcnts[i] > pmax){ peak = i; pmax = kcnts[i]; }
- else if(i > avg && kcnts[i] < 0.8 * pmax) break;
- }
- fprintf(out, "[%s] depth peak = %d\n", date(), peak);
- fprintf(out, "[%s] genome size = %llu\n", date(), tot_bp / peak);
- return peak;
- }
- // MUST be called before build_edges
- static u8i mask_nodes_by_cov_graph(Graph *g, FILE *out){
- node_t *n;
- u8i ret, i;
- ret = 0;
- for(i=0;i<g->nodes->size;i++){
- n = ref_nodev(g->nodes, i);
- if(n->regs.cnt > g->max_node_cov || n->regs.cnt < g->min_node_cov){
- n->closed = 1;
- ret ++;
- if(out) fprintf(out, "MASK_COV\tN%llu\t%u\t%u\n", (u8i)i, (u4i)n->regs.cnt, n->cov);
- }
- }
- return ret;
- }
- static inline void remove_all_edges_graph(Graph *g){
- node_t *n;
- uint64_t nid;
- free_edgev(g->edges);
- g->edges = init_edgev(32);
- free_edgehash(g->ehash);
- g->ehash = init_edgehash(1023);
- set_userdata_edgehash(g->ehash, g->edges);
- free_edgerefv(g->erefs);
- g->erefs = init_edgerefv(32);
- for(nid=0;nid<g->nodes->size;nid++){
- n = ref_nodev(g->nodes, nid);
- n->edges[0] = n->edges[1] = PTR_REF_NULL;
- }
- }
- typedef struct {
- uint64_t idx:46; int off:18;
- } edge_off_t;
- define_list(edgeoffv, edge_off_t);
- static inline int estimate_edge_length(edge_off_t *ps, uint32_t size, uint32_t idxs[2]){
- int64_t tot, var;
- uint32_t i, b, e, mi;
- int v, max, len, avg, std;
- idxs[0] = 0; idxs[1] = size;
- if(size == 0){ return 0; }
- if(size <= 2){ return ps[size/2].off; }
- b = 0; e = size;
- for(i=b,tot=0;i<e;i++) tot += ps[i].off;
- len = tot / (e - b);
- //fprintf(stdout, "b=%d\te=%d\tlen=%d\n", b, e, len);
- while(b + 2 < e){
- max = 0; mi = 0;
- for(i=b+1;i<e;i++){
- if(ps[i].off - ps[i-1].off > max){
- max = ps[i].off - ps[i-1].off;
- mi = i;
- }
- }
- if(max < len * 0.5) break;
- else if(max < 100) break;
- if(mi - b > e - mi) e = mi;
- else b = mi;
- for(i=b,tot=0;i<e;i++) tot += ps[i].off; avg = tot / (e - b);
- if(num_diff(avg, len) < num_max(avg * 0.2, 50)) break;
- len = avg;
- }
- //fprintf(stdout, "b=%d\te=%d\tlen=%d\n", b, e, len);
- if(0){
- if(b + 1 < e){
- for(i=b,var=0;i<e;i++){
- v = ((int)ps[i].off) - ((int)len);
- var += v * v;
- }
- std = sqrt(var / (e - b));
- //fprintf(stdout, "std=%d\n", std);
- b = 0; e = size;
- while(b < e && num_diff(ps[b].off, len) > 3 * std) b ++;
- while(b < e && num_diff(ps[e - 1].off, len) > 3 * std) e --;
- }
- idxs[0] = b;
- idxs[1] = e;
- }
- return len;
- }
- static inline int calculate_edge_cov_off_graph(Graph *g, edge_t *e, edgeoffv *offs){
- node_t *n1, *n2;
- reg_t *r1, *r2;
- uint32_t dir1, dir2, cov, idxs[2];
- int off;
- n1 = ref_nodev(g->nodes, e->node1);
- n2 = ref_nodev(g->nodes, e->node2);
- r1 = ref_regv(g->regs, n1->regs.idx);
- r2 = ref_regv(g->regs, n2->regs.idx);
- cov = e->cov;
- clear_edgeoffv(offs);
- while(1){
- if(r1->rid > r2->rid){
- r2 ++;
- if(r2->node != e->node2) break;
- } else if(r1->rid < r2->rid){
- r1 ++;
- if(r1->node != e->node1) break;
- } else {
- if(r1->beg < r2->beg){
- if(r1->node < r2->node){
- dir1 = r1->dir;
- dir2 = r2->dir;
- } else {
- dir1 = !r2->dir;
- dir2 = !r1->dir;
- }
- off = r2->beg - r1->end;
- } else {
- if(r2->node < r1->node){
- dir1 = r2->dir;
- dir2 = r1->dir;
- } else {
- dir1 = !r1->dir;
- dir2 = !r2->dir;
- }
- off = ((int)r1->beg) - r2->end;
- }
- if(dir1 == e->dir1 && dir2 == e->dir2){
- push_edgeoffv(offs, (edge_off_t){e - g->edges->buffer, off});
- }
- r1 ++;
- if(r1->node != e->node1) break;
- r2 ++;
- if(r2->node != e->node2) break;
- }
- }
- e->off = estimate_edge_length(offs->buffer, offs->size, idxs);
- e->cov = idxs[1] - idxs[0];
- if(cov != e->cov) return 1;
- else return 0;
- }
- static inline void build_edges_graph(Graph *g, int ncpu, FILE *log){
- read_t *rd;
- node_t *n;
- edge_t *E, *e;
- vplist *regs;
- reg_t *r1, *r2;
- edge_ref_t f1, f2;
- edgeoffv *offs;
- uint64_t idx, lst, cnt, x, *u;
- uint32_t rid, i, idxs[2];
- int exists;
- UNUSED(log);
- clear_edgev(g->edges);
- E = next_ref_edgev(g->edges);
- memset(E, 0, sizeof(edge_t));
- E->closed = WT_EDGE_CLOSED_MASK;
- E->cov = 1;
- //E->status = 0;
- clear_edgehash(g->ehash);
- offs = init_edgeoffv(32);
- regs = init_vplist(32);
- for(rid=0;rid<g->kbm->reads->size;rid++){
- rd = ref_readv(g->reads, rid);
- if(rd->regs.cnt < 2) continue;
- clear_vplist(regs);
- idx = rd->regs.idx;
- while(idx){
- r2 = ref_regv(g->regs, idx);
- idx = r2->read_link;
- if(g->nodes->buffer[r2->node].closed) continue;
- if(r2->closed) continue;
- if(!(r2->beg >= rd->clps[0] && r2->end <= rd->clps[1])) continue;
- for(i=regs->size;i>0;i--){
- r1 = (reg_t*)get_vplist(regs, i - 1);
- if(r1->end + g->max_edge_span < r2->beg) break;
- E = ref_edgev(g->edges, 0);
- if(r1->node < r2->node){
- E->node1 = r1->node;
- E->node2 = r2->node;
- E->dir1 = r1->dir;
- E->dir2 = r2->dir;
- } else {
- E->node1 = r2->node;
- E->node2 = r1->node;
- E->dir1 = !r2->dir;
- E->dir2 = !r1->dir;
- }
- E->off = ((int)r2->beg) - r1->end;
- u = prepare_edgehash(g->ehash, 0, &exists);
- if(exists){
- //if(g->edges->buffer[*u].cov < WT_MAX_EDGE_COV) g->edges->buffer[*u].cov ++;
- } else {
- *u = g->edges->size;
- push_edgev(g->edges, *E);
- }
- if(i == regs->size){
- e = ref_edgev(g->edges, *u);
- //if(e->cov < WT_MAX_EDGE_COV) e->cov ++;
- //if(e->cov >= 1){
- e->closed = 0;
- //}
- }
- push_edgeoffv(offs, (edge_off_t){*u, ((int)r2->beg) - r1->end});
- }
- push_vplist(regs, r2);
- }
- }
- free_vplist(regs);
- if(g->edges->size == 0){
- free_edgeoffv(offs);
- return;
- }
- psort_array(offs->buffer, offs->size, edge_off_t, ncpu, num_cmpgtx(a.idx, b.idx, a.off, b.off));
- lst = 0;
- for(idx=1;idx<=offs->size;idx++){
- if(idx < offs->size && offs->buffer[idx].idx == offs->buffer[lst].idx) continue;
- if(1){
- g->edges->buffer[offs->buffer[lst].idx].off = offs->buffer[(lst+idx)/2].off;
- g->edges->buffer[offs->buffer[lst].idx].cov = idx - lst;
- } else {
- g->edges->buffer[offs->buffer[lst].idx].off = estimate_edge_length(offs->buffer + lst, idx - lst, idxs);
- g->edges->buffer[offs->buffer[lst].idx].cov = idxs[1] - idxs[0];
- }
- if(0){
- uint64_t m;
- for(m=lst;m<idx;m++) fprintf(stdout, "%u\t", offs->buffer[m].off);
- fprintf(stdout, "\n");
- for(m=lst+idxs[0];m<lst+idxs[1];m++) fprintf(stdout, "%u\t", offs->buffer[m].off);
- fprintf(stdout, "\n");
- }
- lst = idx;
- }
- free_edgeoffv(offs);
- clear_edgerefv(g->erefs);
- push_edgerefv(g->erefs, EDGE_REF_NULL);
- for(idx=1;idx<g->edges->size;idx++){
- if(g->edges->buffer[idx].cov < g->min_edge_cov){
- if(g->store_low_cov_edge) g->edges->buffer[idx].closed = WT_EDGE_CLOSED_LESS;
- else continue;
- }
- if(g->nodes->buffer[g->edges->buffer[idx].node1].closed || g->nodes->buffer[g->edges->buffer[idx].node2].closed){
- g->edges->buffer[idx].closed = WT_EDGE_CLOSED_HARD;
- } else if(g->edges->buffer[idx].closed == WT_EDGE_CLOSED_MASK){
- g->edges->buffer[idx].closed = WT_EDGE_CLOSED_LESS;
- //} else {
- //g->edges->buffer[idx].closed = WT_EDGE_CLOSED_NULL;
- }
- push_edgerefv(g->erefs, (edge_ref_t){idx, 0, 0});
- push_edgerefv(g->erefs, (edge_ref_t){idx, 1, 0});
- }
- psort_array(g->erefs->buffer + 1, g->erefs->size - 1, edge_ref_t, ncpu, num_cmpgtx(
- (a.flg? ((g->edges->buffer[a.idx].node2 << 1) | !g->edges->buffer[a.idx].dir2) : ((g->edges->buffer[a.idx].node1 << 1) | g->edges->buffer[a.idx].dir1)),
- (b.flg? ((g->edges->buffer[b.idx].node2 << 1) | !g->edges->buffer[b.idx].dir2) : ((g->edges->buffer[b.idx].node1 << 1) | g->edges->buffer[b.idx].dir1)),
- g->edges->buffer[a.idx].off, g->edges->buffer[b.idx].off));
- f1.idx = g->nodes->size; f1.flg = 0;
- cnt = 0;
- for(lst=idx=1;idx<g->erefs->size;idx++){
- if(g->erefs->buffer[idx].flg){
- f2.idx = g->edges->buffer[g->erefs->buffer[idx].idx].node2;
- f2.flg = !g->edges->buffer[g->erefs->buffer[idx].idx].dir2;
- } else {
- f2.idx = g->edges->buffer[g->erefs->buffer[idx].idx].node1;
- f2.flg = g->edges->buffer[g->erefs->buffer[idx].idx].dir1;
- }
- if(f1.idx == f2.idx && f1.flg == f2.flg) continue;
- if(lst < idx){
- n = ref_nodev(g->nodes, f1.idx);
- n->edges[f1.flg].idx = lst;
- n->edges[f1.flg].cnt = 0;
- for(x=lst;x+1<idx;x++){
- g->erefs->buffer[x].next = x + 1;
- if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++;
- }
- if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++;
- }
- lst = idx;
- f1 = f2;
- }
- if(lst < idx){
- n = ref_nodev(g->nodes, f1.idx);
- n->edges[f1.flg].idx = lst;
- n->edges[f1.flg].cnt = 0;
- for(x=lst;x+1<idx;x++){
- g->erefs->buffer[x].next = x + 1;
- if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++;
- }
- if(g->edges->buffer[g->erefs->buffer[x].idx].closed == WT_EDGE_CLOSED_NULL) n->edges[f1.flg].cnt ++;
- }
- }
- static inline void load_nodes_graph(Graph *g, FileReader *clp, FileReader *nds){
- read_t *rd;
- node_t *n;
- reg_t *reg, *r;
- uint64_t nid;
- uint32_t i, nreg, rid;
- char *str, *tok;
- int ncol, closed, nwarn;
- clear_nodev(g->nodes);
- clear_regv(g->regs);
- next_ref_regv(g->regs);
- nwarn = 0;
- while((ncol = readtable_filereader(clp)) != -1){
- if(clp->line->string[0] == '#') continue;
- if(ncol < 4) continue;
- if((rid = getval_cuhash(g->kbm->tag2idx, get_col_str(clp, 0))) == MAX_U4){
- if(nwarn < 10){
- fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", get_col_str(clp, 0), clp->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- nwarn ++;
- }
- continue;
- }
- rd = ref_readv(g->reads, rid);
- rd->clps[0] = atoi(get_col_str(clp, 2)) / KBM_BIN_SIZE;
- rd->clps[1] = atoi(get_col_str(clp, 3)) / KBM_BIN_SIZE;
- }
- nwarn = 0;
- while((ncol = readtable_filereader(nds)) != -1){
- if(nds->line->string[0] == '#') continue;
- if(ncol < 2) continue;
- nreg = atoi(get_col_str(nds, 1));
- //if(nreg == 0) continue;
- //if(nreg < g->min_node_cov) continue;
- //node_id ~ N(\d+)(\*?)
- nid = atol(get_col_str(nds, 0) + 1);
- if(get_col_str(nds, 0)[get_col_len(nds, 0) - 1] == '*'){ // assert get_col_len(nds, 0) > 0
- closed = 1;
- } else {
- closed = 0;
- }
- while(g->nodes->size < nid){
- n = next_ref_nodev(g->nodes);
- ZEROS(n);
- n->closed = 1;
- }
- n = next_ref_nodev(g->nodes);
- ZEROS(n);
- n->closed = closed;
- n->regs.idx = g->regs->size;
- n->regs.cnt = nreg;
- for(i=0;i<nreg;i++){
- reg = next_ref_regv(g->regs);
- reg->node = nid;
- reg->closed = 0;
- reg->read_link = 0;
- str = get_col_str(nds, 2 + i);
- if(0){
- if(str[get_col_len(nds, 2 + i)-1] == '*'){
- reg->closed = 1;
- }
- tok = index(str, '_'); *tok = '\0';
- reg->rid = getval_cuhash(g->kbm->tag2idx, str);
- str = tok + 1; *tok = '_'; tok = index(str, '_'); *tok = '\0';
- reg->dir = (str[0] == 'R');
- str = tok + 1; *tok = '_'; tok = index(str, '_'); *tok = '\0';
- reg->beg = atoi(str);
- str = tok + 1;
- reg->end = atoi(str);
- reg->end += reg->beg;
- } else {
- if(str[get_col_len(nds, 2 + i)-1] == '*'){
- reg->closed = 1;
- }
- tok = rindex(str, '_'); *tok = '\0'; tok ++;
- reg->end = atoi(tok);
- tok = rindex(str, '_'); *tok = '\0'; tok ++;
- reg->beg = atoi(tok);
- reg->end += reg->beg;
- tok = rindex(str, '_'); *tok = '\0'; tok ++;
- reg->dir = (tok[0] == 'R');
- if((reg->rid = getval_cuhash(g->kbm->tag2idx, str)) == MAX_U4){
- g->regs->size --;
- if(nwarn < 10){
- fprintf(stderr, " -- Cannot find read \"%s\" in LINE:%llu in %s -- %s:%d --\n", str, nds->n_line, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
- nwarn ++;
- }
- continue;
- }
- }
- rd = ref_readv(g->reads, reg->rid);
- if(rd->regs.idx){
- r = ref_regv(g->regs, rd->regs.idx);
- if(r->beg > reg->beg){
- reg->read_link = rd->regs.idx;
- rd->regs.idx = g->regs->size - 1;
- } else {
- while(1){
- if(r->read_link == 0) break;
- if(g->regs->buffer[r->read_link].beg > reg->beg) break;
- r = ref_regv(g->regs, r->read_link);
- }
- reg->read_link = r->read_link;
- r->read_link = g->regs->size - 1;
- }
- } else {
- rd->regs.idx = g->regs->size - 1;
- }
- rd->regs.cnt ++;
- }
- }
- encap_regv(g->regs, 1);
- g->regs->buffer[g->regs->size].node = WT_MAX_NODE;
- }
- #endif
|