  1. /*
  2. *
  3. * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef __KMER_BINMAP_RJ_H
  20. #define __KMER_BINMAP_RJ_H
  21. #include "list.h"
  22. #include "hashset.h"
  23. #include "dna.h"
  24. #include "filereader.h"
  25. #include "bitvec.h"
  26. #include "bitsvec.h"
  27. #include "bit2vec.h"
  28. #include "thread.h"
  29. #include "txtplot.h"
  30. #include "mpi.h" // openmpi go go go. ningwenmo@outlook.com, 20200511
  31. //#define __DEBUG__ 1
  32. #define TEST_MODE
  33. #define KBM_BIN_BITS 8
  34. #define KBM_BSIZE 256
  35. #define KBM_BIN_SIZE KBM_BSIZE
  36. #define KBM_MAX_BINCNT 0xFFFFFFFFFFLLU // 40 bits, 1024 G
  37. #define KBM_MAX_RDCNT 0x3FFFFFFF // 30 bits, 1 G
  38. #define KBM_MAX_RDBINCNT 0x7FFFFF // 23 bits
  39. // able to index reference sequences
  40. #define KBM_MAX_RDLEN 0x7FFFFFFFU // 31 bits, 2 G bp
  41. #define KBM_MAX_KSIZE 23
  42. #define KBM_MAX_KCNT 0xFFFF // 16 bits, 65535
  43. #define KBM_N_HASH 4096
  44. #define KBM_LOGF stderr
  45. #define KBM_LOGFNO STDERR_FILENO
  46. static int KBM_LOG = 0;
  47. #define KBM_LOG_LOW 1
  48. #define KBM_LOG_MID 2
  49. #define KBM_LOG_HIG 3
  50. #define KBM_LOG_ALL 4
  51. #define KBM_MAX_RDGRP1 0x7FFFFF
  52. #define KBM_MAX_RDGRP2 0xFF
  53. typedef struct {
u8i seqoff:40, flag:24; // seqoff is counted in bins of KBM_BIN_SIZE (256 bp)
  55. u8i binoff:40, bincnt:23, closed:1;
  56. char *tag;
  57. } kbm_read_t;
  58. define_list(kbmreadv, kbm_read_t);
  59. const obj_desc_t kbm_read_t_obj_desc = {"kbm_read_t_obj_desc", sizeof(kbm_read_t), 1, {1}, {offsetof(kbm_read_t, tag)}, {&OBJ_DESC_CHAR_ARRAY}, NULL, NULL};
  60. static inline size_t kbmreadv_deep_obj_desc_cnt(void *list, int idx){ if(idx == 0) return ((kbmreadv*)list)->size; else return 0; }
  61. static const obj_desc_t kbmreadv_deep_obj_desc = {.tag = "kbmreadv_deep_obj_desc", .size = sizeof(kbmreadv), .n_child = 1, .mem_type = {1}, .addr = {offsetof(kbmreadv, buffer)}, .desc = {&kbm_read_t_obj_desc}, .cnt = kbmreadv_deep_obj_desc_cnt, .post = NULL};
  62. #define kbm_read_seqoff(rd) ((rd)->seqoff << KBM_BIN_BITS)
  63. #define kbm_read_seqlen(rd) ((rd)->bincnt << KBM_BIN_BITS)
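// Editorial worked example: seqoff and bincnt are stored in bin units, so a
// read whose sequence starts at bp offset 5120 of rdseqs has seqoff == 20
// (5120 >> KBM_BIN_BITS); with bincnt == 4, kbm_read_seqoff(rd) yields 5120
// and kbm_read_seqlen(rd) yields 1024 bp.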
  64. #define KBM_MAX_BIN_DEGREE 0x1FFU
  65. // each BIN takes KBM_BIN_SIZE bp in uncompressed reads
  66. typedef struct {
  67. u4i ridx:30, off:24, closed:1, degree:9; // off * KBM_BIN_SIZE is the real position
  68. } kbm_bin_t;
  69. define_list(kbmbinv, kbm_bin_t);
  70. typedef struct {
  71. u4i bidx;
  72. } kbm_bmer_t;
  73. define_list(kbmbmerv, kbm_bmer_t);
  74. typedef struct {
  75. u1i bidx; // bidx = (kbm_baux_t->bidx << 32 | kbm_bmer_t->bidx)
u1i dir:1, koff:7; // koff is the real (offset >> 1); here offset is +0 or +1
  77. } kbm_baux_t;
  78. define_list(kbmbauxv, kbm_baux_t);
  79. #define getval_bidx(kbm, offset) ((((u8i)((kbm)->sauxs->buffer[offset].bidx)) << 32) | (kbm)->seeds->buffer[offset].bidx)
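// Editorial note: a seed's bin index can need up to 40 bits (KBM_MAX_BINCNT),
// so it is split across two parallel arrays: kbm_bmer_t holds the low 32 bits
// and kbm_baux_t holds the high 8 bits. getval_bidx() reassembles the full
// index for the seed stored at a given offset.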
  80. //#define kbm_kmer_smear(K) ((K) ^ ((K) >> 4) ^ ((K) >> 7) ^ ((K) >> 12))
  81. #define kbm_kmer_smear(K) ((K) ^ ((K) >> 4) ^ ((K) >> 7))
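// Illustrative sketch (editorial addition; kbm_demo_route_kmer is a
// hypothetical helper, not used by this library). It mirrors the partitioning
// rule applied in split_FIXP_kmers_kbm() below: smear the kmer, reduce it
// modulo kmer_mod (default 4 * KBM_N_HASH), keep the kmer only when the
// remainder falls below KBM_N_HASH (roughly 1-in-4 subsampling), and use the
// remainder as the subtable index.
static inline int kbm_demo_route_kmer(u8i kmer, u4i kmer_mod, u4i *kidx){
	u4i ki;
	ki = kbm_kmer_smear(kmer) % kmer_mod; // hash, then partition
	if(ki >= KBM_N_HASH) return 0;        // subsampled away
	*kidx = ki;                           // subtable index into hashs[]
	return 1;
}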
  82. typedef struct {
  83. u8i mer:46, tot:17, flt:1;
  84. } kbm_kmer_t;
  85. define_list(kbmkmerv, kbm_kmer_t);
  86. #define KBM_KMERCODE(E) ((E).mer)
  87. #define KBM_KMEREQUALS(E1, E2) ((E1).mer == (E2).mer)
  88. #define KBM_KEYEQUALS(K, E) ((K) == (E).mer)
  89. define_hashtable(kbmhash, kbm_kmer_t, KBM_KMERCODE, KBM_KMEREQUALS, u8i, ITSELF, KBM_KEYEQUALS, kbm_kmer_t*, ITSELF);
  90. typedef struct {
  91. u8i off:40, cnt:24;
  92. } kbm_kaux_t;
  93. define_list(kbmkauxv, kbm_kaux_t);
  94. typedef struct {
  95. kbm_kmer_t *mer;
  96. kbm_kaux_t *aux;
  97. u4i kidx;
  98. u4i off:24, dir:1, pdir:1, fine:1, closed:1, extra_bits1:4;
  99. u4i qbidx;
  100. u4i poffs[2];
  101. u8i bidx, boff, bend;
  102. //kbm_bmer_t *b, *end;
  103. } kbm_ref_t;
  104. define_list(kbmrefv, kbm_ref_t);
  105. #if 0
  106. #define heap_cmp_kbm_bmer(refs, a, b) num_cmpx(refs[a].b->bidx, refs[b].b->bidx, refs[a].poffs[refs[a].pdir], refs[b].poffs[refs[b].pdir])
  107. #endif
#define heap_cmp_kbm_bmer(refs, a, b) num_cmpx(refs[a].bidx, refs[b].bidx, refs[a].poffs[refs[a].pdir], refs[b].poffs[refs[b].pdir])
  109. typedef struct {
  110. u4i koff;
  111. u4i kcnt:8, kmat:9, boff:15; // offset from the start bin_idx
  112. } kbm_cmer_t;
  113. define_list(kbmcmerv, kbm_cmer_t);
  114. static const kbm_cmer_t KBM_CMER_NULL = {0, 0, 0, 0};
  115. typedef struct {
  116. u8i beg:46, mat:16, bt:2;
  117. b2i var;
  118. u2i gap;
  119. b4i score;
  120. } kbm_cell_t;
  121. static const kbm_cell_t KBM_CELL_NULL = {0, 0, 0, 0, 0, 0};
  122. define_list(kbmcellv, kbm_cell_t);
  123. typedef struct {
  124. u8i beg, end;
  125. u4i mat;
  126. int score;
  127. } kbm_path_t;
  128. define_list(kbmpathv, kbm_path_t);
  129. #define kbmpath_hashcode(E) E.beg
  130. #define kbmpath_hashequals(E1, E2) (E1).beg == (E2).beg
  131. define_hashset(kbmphash, kbm_path_t, kbmpath_hashcode, kbmpath_hashequals);
  132. typedef struct {
  133. u4i qidx:31, qdir:1;
  134. u4i tidx:31, tdir:1;
  135. u8i cgoff:40, cglen:24;
  136. int qb, qe, tb, te;
  137. int mat, cnt, aln, gap; // gap is counted in BINs
  138. } kbm_map_t;
  139. define_list(kbmmapv, kbm_map_t);
  140. typedef struct {
  141. int rd_len_order; // 0
  142. int min_bin_degree; // 2
  143. u4i ksize, psize; // 0, 21
  144. u4i kmax, kmin, kmer_mod, ksampling; // 1000, 1, 4.0 * KBM_N_HASH, KBM_BSIZE
  145. float ktop; // 0.05
  146. // runtime
  147. u4i strand_mask; // 3. 1: forward; 2: reverse; 3: both
  148. int self_aln; // 0. 0: map to all; 1: only map to greater read_id; 2: itself but reverse complementary
  149. int skip_contained; // 1
  150. u2i max_bgap; // 4
  151. u2i max_bvar; // 4
  152. float max_gap; // 0.6
  153. u8i max_bcnt; // 0xFFFF
  154. int pgap, pvar; // -7, -21
  155. u4i max_hit; // 1000
  156. int min_aln, min_mat; // 2048/256, 200
  157. float aln_var; // 0.25
  158. float min_sim; // kmer similarity: 0.05
  159. int test_mode; // see codes
  160. } KBMPar;
  161. static const obj_desc_t kbmpar_obj_desc = {"kbmpar_obj_desc", sizeof(KBMPar), 0, {}, {}, {}, NULL, NULL};
  162. typedef struct {
u8i flags; // 64 bits. bit 0: mem_load all; bit 1: mem_load rdseqs+reads; bit 2: single hash mode; bits 3-63: unused
  164. KBMPar *par;
  165. BaseBank *rdseqs;
  166. u8i avail_bases;
  167. u4i avail_reads;
  168. kbmreadv *reads;
  169. cuhash *tag2idx;
  170. kbmbinv *bins;
  171. BitVec *binmarks;
  172. kbmbmerv *seeds;
  173. kbmbauxv *sauxs;
  174. kbmhash *hashs[KBM_N_HASH];
  175. kbmkauxv *kauxs[KBM_N_HASH];
  176. } KBM;
  177. int g_mpi_comm_rank, g_mpi_comm_size = 0;
  178. static inline size_t kbm_obj_desc_cnt(void *kbm, int idx){
  179. UNUSED(kbm);
  180. if(idx == 8 || idx == 9) return KBM_N_HASH;
  181. else return 1;
  182. }
  183. static inline void rebuild_tag2idx_kbm(void *kbm, size_t aux);
  184. static const obj_desc_t kbm_obj_desc = {.tag = "kbm_obj_desc", .size = sizeof(KBM), .n_child = 10,
  185. .mem_type = {1, 1, 1, 1, 1, 1, 1, 1, 2, 2},
  186. .addr = {offsetof(KBM, par), offsetof(KBM, rdseqs), offsetof(KBM, reads), offsetof(KBM, tag2idx), offsetof(KBM, bins), offsetof(KBM, binmarks), offsetof(KBM, seeds), offsetof(KBM, sauxs), offsetof(KBM, hashs), offsetof(KBM, kauxs)},
  187. .desc = {&kbmpar_obj_desc, &basebank_obj_desc, &kbmreadv_deep_obj_desc, &cuhash_obj_desc, &kbmbinv_obj_desc, &bitvec_obj_desc, &kbmbmerv_obj_desc, &kbmbauxv_obj_desc, &kbmhash_obj_desc, &kbmkauxv_obj_desc},
  188. kbm_obj_desc_cnt, rebuild_tag2idx_kbm
  189. };
// Please note that kbm->tag2idx is not functional after mem_load, because we use cuhash_obj_desc instead of cuhash_deep_obj_desc
  191. typedef struct {
  192. #if 0
  193. u4i poff, bidx;
  194. u4i refidx:26, koff:6;
  195. #endif
  196. u4i poff;
  197. u4i refidx;
  198. u8i bidx:40, koff:24;
  199. } kbm_dpe_t;
  200. define_list(kbmdpev, kbm_dpe_t);
  201. typedef struct {
  202. kbmdpev *kms; // kmer offset in query and bidx
  203. u4i km_len;
  204. BitVec *cmask; // bit for kbm_cmer_t
  205. kbmcmerv *cms;
  206. u4v *coffs; // kbm_cmer_t offset for each bin
  207. BitVec *rmask[2];
  208. kbmcellv *cells[2];
  209. Bit2Vec *bts; // back trace flag: 0: diagonal, 1: horizontal, 2: vertical
kbmphash *paths; // stores the best unique paths found so far
  211. u8i boff;
  212. u8i last_bidx;
  213. } KBMDP;
  214. typedef struct {
  215. u8i kmer;
  216. u4i off;
  217. u4i kidx:30, dir:1, closed:1;
  218. } kmer_off_t;
  219. define_list(kmeroffv, kmer_off_t);
  220. typedef struct {
  221. KBM *kbm;
  222. KBMPar *par; // can diff from kbm->par
  223. char *qtag;
  224. BaseBank *qseqs;
  225. u8i qsoff;
  226. u4i qlen, slen, qnbin, qnbit;
  227. u4i qidx;
  228. u8i bmin, bmax;
  229. kmeroffv *koffs[2];
  230. kbmrefv *refs;
  231. u4v *rank, **heaps;
  232. u4i nheap;
  233. u4i hptr;
  234. u2i *binmap;
  235. u4i bmlen, bmoff, bmcnt;
  236. kbmdpev *caches[2];
  237. KBMDP *dps[2];
  238. kbmmapv *hits;
  239. BitsVec *cigars;
  240. BitVec *solids;
  241. String *str;
  242. } KBMAux;
  243. static inline KBMPar* init_kbmpar(){
  244. KBMPar *par;
  245. par = malloc(sizeof(KBMPar));
  246. par->rd_len_order = 0;
  247. par->min_bin_degree = 2;
  248. par->ksize = 0;
  249. par->psize = 21;
  250. par->kmax = 1000;
  251. par->kmin = 1;
  252. par->kmer_mod = 4 * KBM_N_HASH;
  253. par->ksampling = KBM_BSIZE;
  254. par->ktop = 0.05;
  255. par->strand_mask = 3;
  256. par->self_aln = 0;
  257. par->skip_contained = 1;
  258. par->max_bgap = 4; // 4 * KBM_BIN_SIZE
  259. par->max_bvar = 4;
  260. par->max_bcnt = 0xFFFF;
  261. par->max_gap = 0.6;
  262. par->pgap = -7;
  263. par->pvar = -21;
  264. par->max_hit = 1000;
  265. par->min_aln = 2048 / KBM_BIN_SIZE;
  266. par->min_mat = 200;
  267. par->min_sim = 0.05;
  268. par->aln_var = 0.25;
  269. par->test_mode = 0;
  270. return par;
  271. }
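// Usage sketch (editorial addition; the values shown are arbitrary examples,
// not recommendations). The defaults set above match the per-field comments
// in KBMPar; a caller typically overrides only a few of them before building
// the index, e.g.
//   KBMPar *par = init_kbmpar();
//   par->kmin = 2;    // drop kmers occurring in fewer than 2 bins
//   par->kmax = 500;  // drop kmers occurring in more than 500 bins
//   ... pass par to init_kbm(), call free_kbmpar(par) after free_kbm() ...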
  272. static inline void free_kbmpar(KBMPar *par){ free(par); }
  273. static inline KBM* init_kbm(KBMPar *par){
  274. KBM *kbm;
  275. u4i i;
  276. kbm = malloc(sizeof(KBM));
  277. kbm->flags = 0;
  278. kbm->par = par;
  279. kbm->rdseqs = init_basebank();
  280. kbm->avail_bases = 0;
  281. kbm->avail_reads = 0;
  282. kbm->reads = init_kbmreadv(64);
  283. kbm->tag2idx = init_cuhash(1023);
  284. kbm->bins = init_kbmbinv(64);
  285. kbm->binmarks = init_bitvec(1024);
  286. //kbm->kfs = NULL;
  287. kbm->seeds = init_kbmbmerv(64);
  288. kbm->sauxs = init_kbmbauxv(64);
  289. for(i=0;i<KBM_N_HASH;i++) kbm->hashs[i] = init_kbmhash(1023);
  290. for(i=0;i<KBM_N_HASH;i++) kbm->kauxs[i] = init_kbmkauxv(64);
  291. return kbm;
  292. }
  293. static inline void free_kbm(KBM *kbm){
  294. u4i i;
  295. if(kbm->flags & 0x1) return;
  296. if(kbm->flags & 0x2){
  297. } else {
  298. free_basebank(kbm->rdseqs);
  299. for(i=0;i<kbm->reads->size;i++){
  300. if(kbm->reads->buffer[i].tag) free(kbm->reads->buffer[i].tag);
  301. }
  302. free_kbmreadv(kbm->reads);
  303. }
  304. free_cuhash(kbm->tag2idx);
  305. free_kbmbinv(kbm->bins);
  306. free_bitvec(kbm->binmarks);
  307. //if(kbm->kfs) free(kbm->kfs);
  308. free_kbmbmerv(kbm->seeds);
  309. free_kbmbauxv(kbm->sauxs);
  310. for(i=0;i<KBM_N_HASH;i++) free_kbmhash(kbm->hashs[i]);
  311. for(i=0;i<KBM_N_HASH;i++) free_kbmkauxv(kbm->kauxs[i]);
  312. free(kbm);
  313. }
  314. static inline void reset_index_kbm(KBM *kbm){
  315. u4i i;
  316. if(kbm->flags & 0x04){
  317. free_kbmhash(kbm->hashs[0]);
  318. kbm->hashs[0] = init_kbmhash(1023);
  319. free_kbmkauxv(kbm->kauxs[0]);
  320. kbm->kauxs[0] = init_kbmkauxv(64);
  321. } else {
  322. for(i=0;i<KBM_N_HASH;i++){
  323. free_kbmhash(kbm->hashs[i]);
  324. kbm->hashs[i] = init_kbmhash(1023);
  325. free_kbmkauxv(kbm->kauxs[i]);
  326. kbm->kauxs[i] = init_kbmkauxv(64);
  327. }
  328. }
  329. free_kbmbmerv(kbm->seeds);
  330. kbm->seeds = init_kbmbmerv(64);
  331. free_kbmbauxv(kbm->sauxs);
  332. kbm->sauxs = init_kbmbauxv(64);
  333. //zeros_bitvec(kbm->binmarks);
  334. }
  335. static inline void clear_kbm(KBM *kbm){
  336. u4i i;
  337. if(kbm->flags & 0x1) return;
  338. if(kbm->flags & 0x2){
  339. } else {
  340. clear_basebank(kbm->rdseqs);
  341. for(i=0;i<kbm->reads->size;i++){
  342. if(kbm->reads->buffer[i].tag) free(kbm->reads->buffer[i].tag);
  343. }
  344. clear_kbmreadv(kbm->reads);
  345. clear_cuhash(kbm->tag2idx);
  346. }
  347. clear_kbmbinv(kbm->bins);
  348. clear_bitvec(kbm->binmarks);
  349. for(i=0;i<KBM_N_HASH;i++){
  350. free_kbmhash(kbm->hashs[i]);
  351. kbm->hashs[i] = init_kbmhash(1023);
  352. free_kbmkauxv(kbm->kauxs[i]);
  353. kbm->kauxs[i] = init_kbmkauxv(64);
  354. if(kbm->flags & 0x04) break;
  355. }
  356. clear_kbmbmerv(kbm->seeds);
  357. clear_kbmbauxv(kbm->sauxs);
  358. }
  359. #define kbm_cvt_length(seqlen) ((seqlen) & (MAX_U8 << KBM_BIN_BITS))
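// Editorial worked example: with KBM_BIN_BITS == 8, kbm_cvt_length(1000)
// == 768, i.e. read lengths are rounded down to whole 256 bp bins and the
// trailing partial bin is discarded by push_kbm()/bitpush_kbm() below.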
  360. static inline void push_kbm(KBM *kbm, char *tag, int taglen, char *seq, u4i seqlen){
  361. kbm_read_t *rd;
  362. char *ptr;
  363. if(taglen){
  364. ptr = malloc(taglen + 1);
  365. memcpy(ptr, tag, taglen);
  366. ptr[taglen] = 0;
  367. } else {
  368. ptr = NULL;
  369. }
  370. if(seqlen > KBM_MAX_RDLEN) seqlen = KBM_MAX_RDLEN;
  371. seqlen = kbm_cvt_length(seqlen);
  372. rd = next_ref_kbmreadv(kbm->reads);
  373. rd->seqoff = (kbm->rdseqs->size >> KBM_BIN_BITS);
  374. rd->flag = 0;
  375. rd->binoff = 0;
  376. rd->bincnt = seqlen / KBM_BIN_SIZE;
  377. rd->closed = 0;
  378. rd->tag = ptr;
  379. seq2basebank(kbm->rdseqs, seq, seqlen);
  380. }
  381. static inline void bitpush_kbm(KBM *kbm, char *tag, int taglen, u8i *seqs, u8i seqoff, u4i seqlen){
  382. kbm_read_t *rd;
  383. char *ptr;
  384. if(taglen){
  385. ptr = malloc(taglen + 1);
  386. memcpy(ptr, tag, taglen);
  387. ptr[taglen] = 0;
  388. } else {
  389. ptr = NULL;
  390. }
  391. if(seqlen > KBM_MAX_RDLEN) seqlen = KBM_MAX_RDLEN;
  392. seqlen = kbm_cvt_length(seqlen);
  393. rd = next_ref_kbmreadv(kbm->reads);
  394. rd->seqoff = (kbm->rdseqs->size >> KBM_BIN_BITS);
  395. rd->flag = 0;
  396. rd->binoff = 0;
  397. rd->bincnt = seqlen / KBM_BIN_SIZE;
  398. rd->closed = 0;
  399. rd->tag = ptr;
  400. fast_fwdbits2basebank(kbm->rdseqs, seqs, seqoff, seqlen);
  401. }
// Call this at most once
  403. static inline u8i filter_reads_kbm(KBM *kbm, u8i retain_size, int strategy){
  404. u8i m, b, e, len;
  405. if(kbm->reads->size == 0) return 0;
  406. if(retain_size == 0 || retain_size >= kbm->rdseqs->size) return kbm->rdseqs->size;
  407. if((kbm->flags & 0x2) == 0){
  408. if(kbm->par->rd_len_order){
  409. sort_array(kbm->reads->buffer, kbm->reads->size, kbm_read_t, num_cmpgt(b.bincnt, a.bincnt));
  410. if(strategy == 0){ // longest
  411. len = 0;
  412. for(e=0;e<kbm->reads->size;e++){
  413. len += kbm->reads->buffer[e].bincnt * KBM_BIN_SIZE;
  414. if(len >= retain_size) break;
  415. }
  416. kbm->reads->size = e;
  417. } else if(strategy == 1){ // median
  418. m = kbm->reads->size / 2;
  419. len = kbm->reads->buffer[m].bincnt * KBM_BIN_SIZE;
  420. e = m;
  421. for(b=0;b<=m&&len<retain_size;b++){
  422. len += kbm->reads->buffer[m - b].bincnt * KBM_BIN_SIZE;
  423. len += kbm->reads->buffer[m + b].bincnt * KBM_BIN_SIZE;
  424. }
  425. e = b * 2;
  426. b = m - b;
  427. if(b){
  428. remove_array_kbmreadv(kbm->reads, 0, b);
  429. }
  430. kbm->reads->size = e;
  431. } else {
  432. return kbm->rdseqs->size;
  433. }
  434. return len;
  435. } else {
  436. return kbm->rdseqs->size;
  437. }
  438. } else {
  439. return kbm->rdseqs->size;
  440. }
  441. }
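// Editorial note: with strategy 0 the longest reads are kept until
// retain_size bases are accumulated; with strategy 1 reads around the median
// length are kept. Both require par->rd_len_order to be set, otherwise the
// function changes nothing and simply returns kbm->rdseqs->size.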
  442. static inline void ready_kbm(KBM *kbm){
  443. kbm_read_t *rd;
  444. u4i i, j;
  445. if((kbm->flags & 0x2) == 0){
  446. if(kbm->par->rd_len_order){
  447. sort_array(kbm->reads->buffer, kbm->reads->size, kbm_read_t, num_cmpgt(b.bincnt, a.bincnt));
  448. }
  449. encap_basebank(kbm->rdseqs, KBM_BSIZE);
  450. }
  451. clear_kbmbinv(kbm->bins);
  452. for(i=0;i<kbm->reads->size;i++){
  453. rd = ref_kbmreadv(kbm->reads, i);
  454. if(rd->tag) put_cuhash(kbm->tag2idx, (cuhash_t){rd->tag, i});
  455. if((kbm->flags & 0x2) == 0) rd->binoff = kbm->bins->size;
  456. for(j=0;j<rd->bincnt;j++){
  457. push_kbmbinv(kbm->bins, (kbm_bin_t){i, j, 0, 0});
  458. }
  459. if((kbm->flags & 0x2) == 0) rd->bincnt = j;
  460. }
  461. clear_bitvec(kbm->binmarks);
  462. encap_bitvec(kbm->binmarks, kbm->bins->size);
  463. kbm->binmarks->n_bit = kbm->bins->size;
  464. zeros_bitvec(kbm->binmarks);
  465. }
  466. // Share seqs, reads
  467. static inline KBM* clone_seqs_kbm(KBM *src, KBMPar *par){
  468. KBM *dst;
  469. dst = init_kbm(par);
  470. free_basebank(dst->rdseqs);
  471. free_kbmreadv(dst->reads);
  472. dst->rdseqs = src->rdseqs;
  473. dst->reads = src->reads;
  474. dst->flags = 1LLU << 1;
  475. ready_kbm(dst); // Notice encap_basebank in ready_kbm
  476. return dst;
  477. }
// rs[0]->n_head MUST be >= 1
  479. static inline void split_FIXP_kmers_kbm(BaseBank *rdseqs, u8i offset, u4i length, u1i ksize, u1i psize, u4i kmod, kmeroffv *rs[2]){
  480. kmer_off_t *kp;
  481. u8i kmer, krev, hv, npz, kmask, p, pmask;
  482. u4i ki, npl;
  483. int i, j;
  484. u1i c, b, kshift, pshift, kpshf;
  485. clear_kmeroffv(rs[0]);
  486. clear_kmeroffv(rs[1]);
  487. kshift = (ksize - 1) << 1;
  488. kmask = (1LLU << (ksize << 1)) - 1LLU;
  489. pshift = (psize - 1) << 1;
  490. pmask = (1LLU << (psize << 1)) - 1LLU;
  491. kpshf = psize << 1;
  492. if(ksize){
  493. // scan F-part of kmers
  494. kmer = krev = 0;
  495. for(i=0;i+1<ksize;i++){
  496. c = bits2bit(rdseqs->bits, offset + i);
  497. kmer = (kmer << 2) | c;
  498. krev = (krev >> 2) | (((u8i)((~c) & 0x3)) << kshift);
  499. }
  500. for(;i<(int)length;i++){
  501. c = bits2bit(rdseqs->bits, offset + i);
  502. kmer = ((kmer << 2) | c) & kmask;
  503. krev = (krev >> 2) | (((u8i)((~c) & 0x3)) << kshift);
  504. if(kmer < krev){
  505. push_kmeroffv(rs[0], (kmer_off_t){kmer, i - (ksize - 1), 0, 0, 1});
  506. } else if(krev < kmer){
  507. push_kmeroffv(rs[1], (kmer_off_t){krev, i - (ksize - 1), 0, 1, 1});
  508. }
  509. }
  510. if(psize){
  511. // scan P-part of forward kmers
  512. assert(rs[0]->n_head > 0);
  513. memset(rs[0]->buffer - 1, 0, sizeof(kmer_off_t)); // rs[0]->n_head > 0
  514. rs[0]->buffer[-1].off = length;
  515. npz = 0;
  516. npl = 0;
  517. kp = rs[0]->buffer + rs[0]->size - 1;
  518. b = 4;
  519. for(i=length-1;i>=0;i--){
  520. if(kp->off + (ksize - 1) == (u4i)i){
  521. if(npl >= psize){
  522. p = npz & pmask;
  523. kp->closed = 0;
  524. kp->kmer = (kp->kmer << kpshf) | p;
  525. }
  526. kp --;
  527. }
  528. c = bits2revbit(rdseqs->bits, offset + i);
  529. if(c == b){
  530. } else {
  531. npz = (npz << 2) | c;
  532. npl ++;
  533. b = c;
  534. }
  535. }
  536. // scan P-part of reverse kmers
  537. encap_kmeroffv(rs[1], 1); memset(rs[1]->buffer + rs[1]->size, 0xFF, sizeof(kmer_off_t));
  538. npz = 0;
  539. npl = 0;
  540. kp = rs[1]->buffer;
  541. b = 4;
  542. for(i=0;i<(int)length;i++){
  543. if(kp->off == (u4i)(i)){
  544. if(npl >= psize){
  545. p = npz & pmask;
  546. kp->closed = 0;
  547. kp->kmer = (kp->kmer << kpshf) | p;
  548. kp->off = kp->off - psize;
  549. }
  550. kp ++;
  551. }
  552. c = bits2bit(rdseqs->bits, offset + i);
  553. if(c == b){
  554. } else {
  555. npz = (npz << 2) | c;
  556. npl ++;
  557. b = c;
  558. }
  559. }
  560. } else {
  561. for(i=0;(u4i)i<rs[0]->size;i++) rs[0]->buffer[i].closed = 0;
  562. for(i=0;(u4i)i<rs[1]->size;i++) rs[1]->buffer[i].closed = 0;
  563. }
  564. } else if(psize){
  565. kmer = krev = 0;
  566. b = 4;
  567. for(i=j=0;i<(int)length;i++){
  568. c = bits2bit(rdseqs->bits, offset + i);
  569. if(b == c) continue;
  570. b = c;
  571. kmer = ((kmer << 2) | c) & pmask;
  572. krev = (krev >> 2) | (((u8i)((~c) & 0x3)) << pshift);
  573. j ++;
  574. if(j < psize) continue;
  575. if(kmer < krev){
  576. push_kmeroffv(rs[0], (kmer_off_t){kmer, i - (psize - 1), 0, 0, 0});
  577. } else if(krev < kmer){
  578. push_kmeroffv(rs[1], (kmer_off_t){krev, i - (psize - 1), 0, 1, 0});
  579. }
  580. }
  581. }
  582. for(b=0;b<2;b++){
  583. for(i=0;(u4i)i<rs[b]->size;i++){
  584. if(rs[b]->buffer[i].closed) continue;
  585. hv = kbm_kmer_smear(rs[b]->buffer[i].kmer);
  586. ki = hv % kmod;
  587. if(ki >= KBM_N_HASH) rs[b]->buffer[i].closed = 1;
  588. rs[b]->buffer[i].kidx = ki;
  589. }
  590. }
  591. }
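// Editorial note on the FIXP seed layout, as implemented above: each seed is
// an F-part of ksize exact bases, optionally extended by a P-part of psize
// bases drawn from the homopolymer-compressed sequence adjacent to it. With
// ksize == 0 the seeds are purely homopolymer-compressed psize-mers; with
// psize == 0 they are plain ksize-mers. rs[0] collects seeds whose forward
// form is canonical and rs[1] those whose reverse complement is canonical;
// the final loop assigns each seed a hash partition (kidx) and subsamples by
// kmer_mod, as sketched near kbm_kmer_smear() above.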
  592. static inline u8i seed2solid_idx_kbm(KBM *kbm, kbm_dpe_t *p){
  593. kbm_bin_t *b;
  594. kbm_read_t *rd;
  595. u8i seqoff;
  596. b = kbm->bins->buffer + p->bidx;
  597. rd = kbm->reads->buffer + b->ridx;
  598. seqoff = (((rd->seqoff + b->off) * KBM_BSIZE) >> 1) + p->koff;
  599. return seqoff;
  600. }
  601. static inline u8i rdoff2solid_idx_kbm(KBM *kbm, u4i ridx, u4i roff){
  602. kbm_read_t *rd;
  603. u8i seqoff;
  604. rd = kbm->reads->buffer + ridx;
  605. seqoff = (rd->seqoff * KBM_BIN_SIZE + roff) >> 1;
  606. return seqoff;
  607. }
  608. #define binoff2solid_koff_kbm(kbm, bidx, boff) ((boff) >> 1)
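// Editorial note: positions in the optional "solids" bitmap are counted per
// 2 bp (hence the >> 1 in the helpers above), so one bit covers a pair of
// bases of the concatenated read sequence.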
  609. typedef struct {
  610. u8i mer:50, kidx:14;
  611. u8i bidx;
  612. u4i cnt:22, koff:8, dir:1, used:1;
  613. } kbm_midx_t;
  614. define_list(kbmmidxv, kbm_midx_t);
  615. typedef struct {
  616. u8i bidx;
  617. kbm_baux_t aux;
  618. } kbm_tmp_bmer_t;
  619. define_list(tmpbmerv, kbm_tmp_bmer_t);
  620. thread_beg_def(midx);
  621. KBM *kbm;
u8i beg, end; // (end - beg) * KBM_BSIZE MUST be <= KBM_KMEROFF_MAX
  623. u8i ktot, nrem, Nrem, none, nflt, offset;
  624. u8i srem, Srem;
  625. u8i *cnts, n_cnt;
  626. int task;
  627. int cal_degree;
  628. pthread_mutex_t *locks;
  629. thread_end_def(midx);
  630. thread_beg_func(midx);
  631. KBM *kbm;
  632. kbm_bin_t *bin;
  633. kbmmidxv **kidxs;
  634. kbm_midx_t *mx;
  635. kmer_off_t *f;
  636. kbm_kmer_t *u;
  637. kbm_kaux_t *x;
  638. kmeroffv *kmers[2];
  639. tmpbmerv *bms;
  640. u8i bidx, off;
  641. u4i i, j, k, len, ncpu, tidx, kidx;
  642. int exists;
  643. kbm = midx->kbm;
  644. ncpu = midx->n_cpu;
  645. tidx = midx->t_idx;
  646. kmers[0] = adv_init_kmeroffv(64, 0, 1);
  647. kmers[1] = adv_init_kmeroffv(64, 0, 1);
  648. kidxs = malloc(KBM_N_HASH * sizeof(kbmmidxv*));
  649. for(i=0;i<KBM_N_HASH;i++) kidxs[i] = init_kbmmidxv(64);
  650. bms = init_tmpbmerv(KBM_MAX_KCNT);
  651. thread_beg_loop(midx);
  652. if(midx->task == 1){
  653. // counting kmers
  654. for(i=0;i<KBM_N_HASH;i++) clear_kbmmidxv(kidxs[i]);
  655. for(bidx=midx->beg+tidx;bidx<midx->end;bidx+=ncpu){
  656. if(KBM_LOG == 0 && tidx == 0 && ((bidx - midx->beg) % 100000) == 0){ fprintf(KBM_LOGF, "\r%llu", bidx - midx->beg); fflush(KBM_LOGF); }
  657. bin = ref_kbmbinv(kbm->bins, bidx);
  658. if(bin->closed) continue;
  659. off = (kbm->reads->buffer[bin->ridx].seqoff + bin->off) * KBM_BIN_SIZE;
  660. len = KBM_BIN_SIZE;
  661. split_FIXP_kmers_kbm(kbm->rdseqs, off, len, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod, kmers);
  662. for(i=0;i<2;i++){
  663. for(j=0;j<kmers[i]->size;j++){
  664. f = ref_kmeroffv(kmers[i], j);
  665. if(f->closed) continue;
  666. mx = next_ref_kbmmidxv(kidxs[f->kidx]);
  667. mx->mer = f->kmer;
  668. mx->kidx = f->kidx;
  669. mx->bidx = bidx;
  670. mx->dir = i;
  671. mx->koff = f->off;
  672. if(kidxs[f->kidx]->size >= 64){
  673. kidx = f->kidx;
  674. // lock hashs[kidx]
  675. pthread_mutex_lock(midx->locks + kidx);
  676. // hash adding
  677. for(k=0;k<kidxs[kidx]->size;k++){
  678. mx = ref_kbmmidxv(kidxs[kidx], k);
  679. u = prepare_kbmhash(kbm->hashs[kidx], mx->mer, &exists);
  680. if(exists){
  681. if(u->tot < KBM_MAX_KCNT) u->tot ++;
  682. } else {
  683. u->mer = mx->mer;
  684. u->tot = 1;
  685. u->flt = 0;
  686. }
  687. }
  688. // free hashs[f->kidx]
  689. pthread_mutex_unlock(midx->locks + kidx);
  690. clear_kbmmidxv(kidxs[kidx]);
  691. }
  692. }
  693. }
  694. }
  695. for(kidx=0;kidx<KBM_N_HASH;kidx++){
  696. if(kidxs[kidx]->size){
  697. // lock hashs[kidx]
  698. pthread_mutex_lock(midx->locks + kidx);
  699. // hash adding
  700. for(k=0;k<kidxs[kidx]->size;k++){
  701. mx = ref_kbmmidxv(kidxs[kidx], k);
  702. u = prepare_kbmhash(kbm->hashs[kidx], mx->mer, &exists);
  703. if(exists){
  704. if(u->tot < KBM_MAX_KCNT) u->tot ++;
  705. } else {
  706. u->mer = mx->mer;
  707. u->tot = 1;
  708. u->flt = 0;
  709. }
  710. }
  711. // free hashs[f->kidx]
  712. pthread_mutex_unlock(midx->locks + kidx);
  713. clear_kbmmidxv(kidxs[kidx]);
  714. }
  715. }
  716. } else if(midx->task == 2){
  717. // delete low freq kmers
  718. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  719. reset_iter_kbmhash(kbm->hashs[i]);
  720. while((u = ref_iter_kbmhash(kbm->hashs[i]))){
  721. if(u->tot < kbm->par->kmin){
  722. delete_kbmhash(kbm->hashs[i], u);
  723. }
  724. }
  725. }
  726. } else if(midx->task == 3){
  727. // stat kmer counts
  728. memset(midx->cnts, 0, midx->n_cnt * sizeof(u8i));
  729. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  730. reset_iter_kbmhash(kbm->hashs[i]);
  731. while((u = ref_iter_kbmhash(kbm->hashs[i]))){
  732. midx->cnts[num_min(midx->n_cnt, u->tot) - 1] ++;
  733. }
  734. }
  735. } else if(midx->task == 4){
  736. // stat counts
  737. midx->offset = 0;
  738. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  739. reset_iter_kbmhash(kbm->hashs[i]);
  740. while((u = ref_iter_kbmhash(kbm->hashs[i]))){
  741. x = ref_kbmkauxv(kbm->kauxs[i], offset_kbmhash(kbm->hashs[i], u));
  742. x->off = midx->offset;
  743. x->cnt = 0;
  744. midx->ktot += u->tot;
  745. if(u->tot < kbm->par->kmin){
  746. u->flt = 1;
  747. } else if(u->tot > kbm->par->kmax){
  748. u->flt = 1;
  749. } else {
  750. midx->offset += u->tot;
  751. }
  752. }
  753. }
  754. } else if(midx->task == 5){
  755. // revise offset
  756. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  757. for(off=0;off<kbm->kauxs[i]->size;off++) kbm->kauxs[i]->buffer[off].off += midx->offset;
  758. }
  759. } else if(midx->task == 6){
  760. // fill seeds
  761. for(i=0;i<KBM_N_HASH;i++) clear_kbmmidxv(kidxs[i]);
  762. u4v *chgs;
  763. chgs = init_u4v(KBM_BSIZE);
  764. for(bidx=midx->beg+tidx;bidx<midx->end;bidx+=ncpu){
  765. if(KBM_LOG == 0 && tidx == 0 && ((bidx - midx->beg) % 100000) == 0){ fprintf(KBM_LOGF, "\r%llu", bidx - midx->beg); fflush(KBM_LOGF); }
  766. bin = ref_kbmbinv(kbm->bins, bidx);
  767. if(bin->closed) continue;
  768. bin->degree = 0;
  769. off = (kbm->reads->buffer[bin->ridx].seqoff + bin->off) * KBM_BIN_SIZE;
  770. len = KBM_BIN_SIZE;
  771. split_FIXP_kmers_kbm(kbm->rdseqs, off, len, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod, kmers);
  772. clear_u4v(chgs);
  773. for(i=0;i<2;i++){
  774. for(j=0;j<kmers[i]->size;j++){
  775. f = ref_kmeroffv(kmers[i], j);
  776. if(f->closed) continue;
  777. mx = next_ref_kbmmidxv(kidxs[f->kidx]);
  778. mx->mer = f->kmer;
  779. mx->kidx = f->kidx;
  780. mx->bidx = bidx;
  781. mx->dir = i;
  782. mx->koff = f->off;
  783. if(kidxs[f->kidx]->size == 64){
  784. push_u4v(chgs, f->kidx);
  785. }
  786. }
  787. }
  788. for(i=0;i<chgs->size;i++){
  789. kidx = chgs->buffer[i];
  790. if(kidxs[kidx]->size == 0) continue;
  791. pthread_mutex_lock(midx->locks + kidx);
  792. for(k=0;k<kidxs[kidx]->size;k++){
  793. mx = ref_kbmmidxv(kidxs[kidx], k);
  794. u = get_kbmhash(kbm->hashs[kidx], mx->mer);
  795. if(u && u->flt == 0){
  796. x = ref_kbmkauxv(kbm->kauxs[kidx], offset_kbmhash(kbm->hashs[kidx], u));
  797. kbm->bins->buffer[mx->bidx].degree ++;
  798. if(x->cnt < u->tot){
  799. if(x->cnt && getval_bidx(kbm, x->off + x->cnt - 1) == mx->bidx && kbm->sauxs->buffer[x->off + x->cnt - 1].dir == mx->dir){
  800. // repeated kmer within one bin
  801. } else {
  802. kbm->seeds->buffer[x->off + x->cnt].bidx = mx->bidx & MAX_U4;
  803. kbm->sauxs->buffer[x->off + x->cnt].bidx = mx->bidx >> 32;
  804. kbm->sauxs->buffer[x->off + x->cnt].dir = mx->dir;
  805. kbm->sauxs->buffer[x->off + x->cnt].koff = mx->koff >> 1;
  806. x->cnt ++;
  807. }
  808. }
  809. }
  810. }
  811. pthread_mutex_unlock(midx->locks + kidx);
  812. clear_kbmmidxv(kidxs[kidx]);
  813. }
  814. }
  815. for(kidx=0;kidx<KBM_N_HASH;kidx++){
  816. if(kidxs[kidx]->size){
  817. // lock hashs[kidx]
  818. pthread_mutex_lock(midx->locks + kidx);
  819. // hash adding
  820. for(k=0;k<kidxs[kidx]->size;k++){
  821. mx = ref_kbmmidxv(kidxs[kidx], k);
  822. u = get_kbmhash(kbm->hashs[kidx], mx->mer);
  823. if(u && u->flt == 0){
  824. x = ref_kbmkauxv(kbm->kauxs[kidx], offset_kbmhash(kbm->hashs[kidx], u));
  825. kbm->bins->buffer[mx->bidx].degree ++;
  826. if(x->cnt < u->tot){
  827. if(x->cnt && getval_bidx(kbm, x->off + x->cnt - 1) == mx->bidx && kbm->sauxs->buffer[x->off + x->cnt - 1].dir == mx->dir){
  828. // repeated kmer within one bin
  829. } else {
  830. kbm->seeds->buffer[x->off + x->cnt].bidx = mx->bidx & MAX_U4;
  831. kbm->sauxs->buffer[x->off + x->cnt].bidx = mx->bidx >> 32;
  832. kbm->sauxs->buffer[x->off + x->cnt].dir = mx->dir;
  833. kbm->sauxs->buffer[x->off + x->cnt].koff = mx->koff >> 1;
  834. x->cnt ++;
  835. }
  836. }
  837. }
  838. }
  839. // free hashs[f->kidx]
  840. pthread_mutex_unlock(midx->locks + kidx);
  841. clear_kbmmidxv(kidxs[kidx]);
  842. }
  843. }
  844. free_u4v(chgs);
  845. } else if(midx->task == 7){
  846. // count added kmers
  847. midx->srem = midx->Srem = 0;
  848. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  849. reset_iter_kbmhash(kbm->hashs[i]);
  850. while((u = ref_iter_kbmhash(kbm->hashs[i]))){
  851. x = ref_kbmkauxv(kbm->kauxs[i], offset_kbmhash(kbm->hashs[i], u));
  852. if(x->cnt){
  853. midx->srem ++;
  854. midx->Srem += x->cnt;
  855. }
  856. }
  857. }
  858. } else if(midx->task == 8){
  859. // sort seeds within a kmer
  860. for(i=tidx;i<KBM_N_HASH;i+=ncpu){
  861. reset_iter_kbmhash(kbm->hashs[i]);
  862. while((u = ref_iter_kbmhash(kbm->hashs[i]))){
  863. x = ref_kbmkauxv(kbm->kauxs[i], offset_kbmhash(kbm->hashs[i], u));
  864. if(x->cnt < 2) continue;
  865. clear_tmpbmerv(bms);
  866. for(j=0;j<x->cnt;j++){
  867. push_tmpbmerv(bms, (kbm_tmp_bmer_t){getval_bidx(kbm, x->off + j), kbm->sauxs->buffer[x->off + j]});
  868. }
  869. sort_array(bms->buffer, bms->size, kbm_tmp_bmer_t, num_cmpgt(a.bidx, b.bidx));
  870. kbm->seeds->buffer[x->off + 0].bidx = bms->buffer[0].bidx & MAX_U4;
  871. kbm->sauxs->buffer[x->off + 0].bidx = bms->buffer[0].bidx >> 32;
  872. kbm->sauxs->buffer[x->off + 0] = bms->buffer[0].aux;
  873. len = 1;
  874. for(j=1;j<x->cnt;j++){
  875. if(bms->buffer[j].bidx < bms->buffer[j - 1].bidx){
  876. continue;
  877. }
  878. kbm->seeds->buffer[x->off + len].bidx = bms->buffer[j].bidx & MAX_U4;
  879. kbm->sauxs->buffer[x->off + len].bidx = bms->buffer[0].bidx >> 32;
  880. kbm->sauxs->buffer[x->off + len] = bms->buffer[j].aux;
  881. len ++;
  882. }
  883. x->cnt = len;
  884. }
  885. }
  886. }
  887. thread_end_loop(midx);
  888. free_kmeroffv(kmers[0]);
  889. free_kmeroffv(kmers[1]);
  890. for(i=0;i<KBM_N_HASH;i++) free_kbmmidxv(kidxs[i]);
  891. free(kidxs);
  892. free_tmpbmerv(bms);
  893. thread_end_func(midx);
  894. static inline void index_kbm(KBM *kbm, u8i beg, u8i end, u4i ncpu, FILE *kmstat){
  895. u8i ktyp, nflt, nrem, Nrem, none, ktot, srem, Srem, off, cnt, *kcnts, MAX;
  896. u8i i, b, e, n;
  897. u4i j, kavg, batch_size;
  898. pthread_mutex_t *hash_locks;
  899. thread_preprocess(midx);
  900. batch_size = 10000;
  901. clear_kbmbmerv(kbm->seeds);
  902. for(i=0;i<KBM_N_HASH;i++) clear_kbmhash(kbm->hashs[i]);
  903. kcnts = NULL;
  904. MAX = KBM_MAX_KCNT;
  905. hash_locks = calloc(KBM_N_HASH, sizeof(pthread_mutex_t));
  906. thread_beg_init(midx, ncpu);
  907. midx->kbm = kbm;
  908. midx->beg = beg;
  909. midx->end = end;
  910. midx->cnts = NULL;
  911. midx->n_cnt = MAX;
  912. midx->task = 0;
  913. midx->cal_degree = 0;
  914. midx->locks = hash_locks;
  915. thread_end_init(midx);
  916. fprintf(KBM_LOGF, "[%s] - scanning kmers (K%dP%dS%0.2f) from %llu bins\n", date(), kbm->par->ksize, kbm->par->psize, 1.0 * kbm->par->kmer_mod / KBM_N_HASH, end - beg);
  917. b = e = beg;
  918. thread_apply_all(midx, midx->task = 1);
  919. if(KBM_LOG == 0){ fprintf(KBM_LOGF, "\r%llu bins\n", end - beg); fflush(KBM_LOGF); }
  920. kcnts = calloc(MAX, sizeof(u8i));
  921. thread_beg_iter(midx);
  922. midx->cnts = calloc(MAX, sizeof(u8i));
  923. midx->task = 3; // counting raw kmers
  924. thread_wake(midx);
  925. thread_end_iter(midx);
  926. thread_beg_iter(midx);
  927. thread_wait(midx);
  928. for(i=0;i<MAX;i++) kcnts[i] += midx->cnts[i];
  929. thread_end_iter(midx);
  930. if(kmstat){
  931. fprintf(kmstat, "#Reads: %llu\n", (u8i)kbm->reads->size);
  932. fprintf(kmstat, "#Bases: %llu bp\n", (u8i)kbm->rdseqs->size);
  933. fprintf(kmstat, "#K%dP%dS%0.2f\n", kbm->par->ksize, kbm->par->psize, 1.0 * kbm->par->kmer_mod / KBM_N_HASH);
  934. for(i=0;i+1<MAX;i++){
  935. fprintf(kmstat, "%llu\t%llu\t%llu\n", i + 1, kcnts[i], (i + 1) * kcnts[i]);
  936. }
  937. fflush(kmstat);
  938. }
  939. if(kmstat){
  940. u8i *_kcnts;
  941. _kcnts = malloc(200 * sizeof(u8i));
  942. for(i=0;i<200;i++){
  943. _kcnts[i] = (i + 1) * kcnts[i];
  944. }
  945. char *txt = barplot_txt_u8_simple(100, 20, _kcnts, 200, 0);
  946. fprintf(KBM_LOGF, "********************** Kmer Frequency **********************\n");
  947. fputs(txt, KBM_LOGF);
  948. fprintf(KBM_LOGF, "********************** 1 - 201 **********************\n");
  949. ktyp = 0;
  950. for(i=0;i<MAX;i++){
  951. ktyp += (i + 1) * kcnts[i];
  952. }
  953. _kcnts[0] = 0.10 * ktyp;
  954. _kcnts[1] = 0.20 * ktyp;
  955. _kcnts[2] = 0.30 * ktyp;
  956. _kcnts[3] = 0.40 * ktyp;
  957. _kcnts[4] = 0.50 * ktyp;
  958. _kcnts[5] = 0.60 * ktyp;
  959. _kcnts[6] = 0.70 * ktyp;
  960. _kcnts[7] = 0.80 * ktyp;
  961. _kcnts[8] = 0.90 * ktyp;
  962. _kcnts[9] = 0.95 * ktyp;
fprintf(KBM_LOGF, "Quantiles:\n");
fprintf(KBM_LOGF, "   10%%   20%%   30%%   40%%   50%%   60%%   70%%   80%%   90%%   95%%\n");
  965. off = 0;
  966. for(i=j=0;j<10;j++){
  967. while(off < _kcnts[j] && i < MAX){
  968. off += kcnts[i] * (i + 1);
  969. i ++;
  970. }
  971. fprintf(KBM_LOGF, "%6llu", i);
  972. }
  973. fprintf(KBM_LOGF, "\n");
  974. //fprintf(KBM_LOGF,
  975. //"# If the kmer distribution is not good, please kill me and adjust -k, -p, and -K\n"
  976. //"# Cannot get a good distribution anyway, should adjust -S -s, also -A -e in assembly\n"
  977. //);
  978. free(_kcnts);
  979. free(txt);
  980. }
  981. // delete low freq kmer from hash
  982. thread_apply_all(midx, midx->task = 2);
  983. // freeze hash to save memory and speed up the query
  984. for(i=0;i<KBM_N_HASH;i++){
  985. if(0){
  986. fprintf(KBM_LOGF, "%12llu ", (u8i)kbm->hashs[i]->count); fflush(KBM_LOGF);
  987. if((i % 8) == 7){
  988. fprintf(KBM_LOGF, "\n"); fflush(KBM_LOGF);
  989. }
  990. }
  991. freeze_kbmhash(kbm->hashs[i], 1.0 / 16);
  992. free_kbmkauxv(kbm->kauxs[i]);
  993. kbm->kauxs[i] = init_kbmkauxv(kbm->hashs[i]->count);
  994. kbm->kauxs[i]->size = kbm->hashs[i]->count;
  995. }
  996. print_proc_stat_info(g_mpi_comm_rank);
  997. ktot = nrem = Nrem = none = nflt = ktyp = 0;
  998. for(i=0;i+1<MAX;i++){
  999. ktot += kcnts[i];
  1000. if(i + 1 < kbm->par->kmin){
  1001. none += kcnts[i];
  1002. } else {
  1003. ktyp += kcnts[i] * (i + 1);
  1004. }
  1005. }
  1006. if(kbm->par->ktop){
  1007. off = 0;
  1008. for(i=MAX;i>kbm->par->kmin;i--){
  1009. off += kcnts[i-1] * i;
  1010. if(off >= (ktyp * kbm->par->ktop)) break;
  1011. }
  1012. if(i > kbm->par->kmax){
  1013. kbm->par->kmax = i;
  1014. }
  1015. fprintf(KBM_LOGF, "[%s] - high frequency kmer depth is set to %d\n", date(), kbm->par->kmax);
  1016. }
  1017. for(i=kbm->par->kmin? kbm->par->kmin - 1 : 0;i<kbm->par->kmax;i++){
  1018. nrem += kcnts[i];
  1019. Nrem += kcnts[i] * (i + 1);
  1020. }
  1021. for(i=kbm->par->kmax;i+1<MAX;i++){
  1022. nflt += kcnts[i];
  1023. }
  1024. off = 0;
  1025. thread_apply_all(midx, midx->task = 4);
  1026. thread_beg_iter(midx);
  1027. cnt = midx->offset;
  1028. midx->offset = off;
  1029. off += cnt;
  1030. midx->task = 5;
  1031. thread_wake(midx);
  1032. thread_end_iter(midx);
  1033. thread_wait_all(midx);
  1034. clear_and_encap_kbmbmerv(kbm->seeds, off + 1);
  1035. kbm->seeds->size = off;
  1036. free_kbmbauxv(kbm->sauxs);
  1037. kbm->sauxs = init_kbmbauxv(off + 1);
  1038. kbm->sauxs->size = off;
  1039. kavg = Nrem / (nrem + 1);
  1040. fprintf(KBM_LOGF, "[%s] - Total kmers = %llu\n", date(), ktot);
  1041. fprintf(KBM_LOGF, "[%s] - average kmer depth = %d\n", date(), kavg);
  1042. fprintf(KBM_LOGF, "[%s] - %llu low frequency kmers (<%d)\n", date(), none, kbm->par->kmin);
  1043. fprintf(KBM_LOGF, "[%s] - %llu high frequency kmers (>%d)\n", date(), nflt, kbm->par->kmax);
  1044. fprintf(KBM_LOGF, "[%s] - indexing %llu kmers, %llu instances (at most)\n", date(), nrem, Nrem);
  1045. thread_apply_all(midx, midx->task = 6);
  1046. if(KBM_LOG == 0){ fprintf(KBM_LOGF, "\r%llu bins\n", end - beg); fflush(KBM_LOGF); }
  1047. thread_apply_all(midx, midx->task = 7);
  1048. srem = Srem = 0;
  1049. thread_beg_iter(midx);
  1050. srem += midx->srem;
  1051. Srem += midx->Srem;
  1052. thread_end_iter(midx);
  1053. fprintf(KBM_LOGF, "[%s] - indexed %llu kmers, %llu instances\n", date(), srem, Srem);
  1054. {
  1055. n = 0;
  1056. for(i=beg;i<end;i++){
  1057. if(kbm->bins->buffer[i].degree < kbm->par->min_bin_degree){
  1058. kbm->bins->buffer[i].closed = 1;
  1059. one_bitvec(kbm->binmarks, i);
  1060. n ++;
  1061. }
  1062. }
  1063. index_bitvec(kbm->binmarks);
  1064. fprintf(KBM_LOGF, "[%s] - masked %llu bins as closed\n", date(), n);
  1065. }
  1066. fprintf(KBM_LOGF, "[%s] - sorting\n", date());
  1067. thread_apply_all(midx, midx->task = 8);
  1068. if(kcnts) free(kcnts);
  1069. thread_beg_close(midx);
  1070. if(midx->cnts) free(midx->cnts);
  1071. thread_end_close(midx);
  1072. free(hash_locks);
  1073. print_proc_stat_info(g_mpi_comm_rank);
  1074. }
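// End-to-end indexing sketch (editorial addition; the read names, sequences
// and thread count are placeholders). A plausible call order for the API in
// this header:
//   KBMPar *par = init_kbmpar();
//   KBM *kbm = init_kbm(par);
//   push_kbm(kbm, "read1", 5, seq1, len1);       // seq1/len1 supplied by caller
//   push_kbm(kbm, "read2", 5, seq2, len2);
//   ready_kbm(kbm);                              // sort reads and cut into bins
//   index_kbm(kbm, 0, kbm->bins->size, 4, NULL); // 4 threads, no kmer-stat file
//   ... query/map ...
//   free_kbm(kbm); free_kbmpar(par);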
  1075. static inline void simple_index_kbm(KBM *kbm, u8i beg, u8i end){
  1076. kbm_bin_t *bin;
  1077. kbm_kmer_t *u;
  1078. kbm_kaux_t *x;
  1079. kmeroffv *kmers[2];
  1080. tmpbmerv *bms;
  1081. kmer_off_t *f;
  1082. u8i bidx, off;
  1083. u4i i, j, len;
  1084. int exists;
  1085. kbm->flags |= 1LLU << 2;
  1086. kmers[0] = adv_init_kmeroffv(64, 0, 1);
  1087. kmers[1] = adv_init_kmeroffv(64, 0, 1);
  1088. bms = init_tmpbmerv(KBM_MAX_KCNT);
  1089. // count kmers
  1090. {
  1091. for(bidx=beg;bidx<end;bidx++){
  1092. bin = ref_kbmbinv(kbm->bins, bidx);
  1093. if(bin->closed) continue;
  1094. off = (kbm->reads->buffer[bin->ridx].seqoff + bin->off) * KBM_BIN_SIZE;
  1095. split_FIXP_kmers_kbm(kbm->rdseqs, off, KBM_BIN_SIZE, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod, kmers);
  1096. for(i=0;i<2;i++){
  1097. for(j=0;j<kmers[i]->size;j++){
  1098. f = ref_kmeroffv(kmers[i], j);
  1099. if(f->closed) continue;
  1100. u = prepare_kbmhash(kbm->hashs[0], f->kmer, &exists);
  1101. if(exists){
  1102. if(u->tot < KBM_MAX_KCNT) u->tot ++;
  1103. } else {
  1104. u->mer = f->kmer;
  1105. u->tot = 1;
  1106. u->flt = 0;
  1107. }
  1108. }
  1109. }
  1110. }
  1111. }
  1112. // delete low freq kmers
  1113. if(kbm->par->kmin){
  1114. reset_iter_kbmhash(kbm->hashs[0]);
  1115. while((u = ref_iter_kbmhash(kbm->hashs[0]))){
  1116. if(u->tot < kbm->par->kmin){
  1117. delete_kbmhash(kbm->hashs[0], u);
  1118. }
  1119. }
  1120. }
  1121. // freeze hash
  1122. {
  1123. freeze_kbmhash(kbm->hashs[0], 1.0 / 16);
  1124. free_kbmkauxv(kbm->kauxs[0]);
  1125. kbm->kauxs[0] = init_kbmkauxv(kbm->hashs[0]->count);
  1126. kbm->kauxs[0]->size = kbm->hashs[0]->count;
  1127. }
  1128. // encap seeds
  1129. {
  1130. off = 0;
  1131. reset_iter_kbmhash(kbm->hashs[0]);
  1132. while((u = ref_iter_kbmhash(kbm->hashs[0]))){
  1133. x = ref_kbmkauxv(kbm->kauxs[0], offset_kbmhash(kbm->hashs[0], u));
  1134. x->off = off;
  1135. x->cnt = 0;
  1136. if(u->tot < kbm->par->kmin){ // Never happens
  1137. u->flt = 1;
  1138. } else if(kbm->par->kmax && u->tot > kbm->par->kmax){
  1139. u->flt = 1;
  1140. } else {
  1141. off += u->tot;
  1142. }
  1143. }
  1144. clear_and_encap_kbmbmerv(kbm->seeds, off + 1);
  1145. kbm->seeds->size = off;
  1146. clear_and_encap_kbmbauxv(kbm->sauxs, off + 1);
  1147. kbm->sauxs->size = off;
  1148. }
  1149. // fill seeds
  1150. {
  1151. for(bidx=beg;bidx<end;bidx++){
  1152. bin = ref_kbmbinv(kbm->bins, bidx);
  1153. if(bin->closed) continue;
  1154. off = (kbm->reads->buffer[bin->ridx].seqoff + bin->off) * KBM_BIN_SIZE;
  1155. split_FIXP_kmers_kbm(kbm->rdseqs, off, KBM_BIN_SIZE, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod, kmers);
  1156. for(i=0;i<2;i++){
  1157. for(j=0;j<kmers[i]->size;j++){
  1158. f = ref_kmeroffv(kmers[i], j);
  1159. if(f->closed) continue;
  1160. u = get_kbmhash(kbm->hashs[0], f->kmer);
  1161. if(u == NULL || u->flt) continue;
  1162. x = ref_kbmkauxv(kbm->kauxs[0], offset_kbmhash(kbm->hashs[0], u));
  1163. kbm->bins->buffer[bidx].degree ++;
  1164. if(x->cnt < u->tot){
  1165. if(x->cnt && getval_bidx(kbm, x->off + x->cnt - 1) == bidx && kbm->sauxs->buffer[x->off + x->cnt - 1].dir == i){
  1166. // repeated kmer within one bin
  1167. } else {
  1168. kbm->seeds->buffer[x->off + x->cnt].bidx = bidx & MAX_U4;
  1169. kbm->sauxs->buffer[x->off + x->cnt].bidx = bidx >> 32;
  1170. kbm->sauxs->buffer[x->off + x->cnt].dir = i;
  1171. kbm->sauxs->buffer[x->off + x->cnt].koff = f->off >> 1;
  1172. x->cnt ++;
  1173. }
  1174. }
  1175. }
  1176. }
  1177. }
  1178. }
  1179. // sort seeds within a kmer
  1180. {
  1181. reset_iter_kbmhash(kbm->hashs[0]);
  1182. while((u = ref_iter_kbmhash(kbm->hashs[0]))){
  1183. x = ref_kbmkauxv(kbm->kauxs[0], offset_kbmhash(kbm->hashs[0], u));
  1184. clear_tmpbmerv(bms);
  1185. for(j=0;j<x->cnt;j++){
  1186. push_tmpbmerv(bms, (kbm_tmp_bmer_t){getval_bidx(kbm, x->off + j), kbm->sauxs->buffer[x->off + j]});
  1187. }
  1188. sort_array(bms->buffer, bms->size, kbm_tmp_bmer_t, num_cmpgt(a.bidx, b.bidx));
  1189. kbm->seeds->buffer[x->off + 0].bidx = bms->buffer[0].bidx & MAX_U4;
  1190. kbm->sauxs->buffer[x->off + 0].bidx = bms->buffer[0].bidx >> 32;
  1191. kbm->sauxs->buffer[x->off + 0] = bms->buffer[0].aux;
  1192. len = 1;
  1193. for(j=1;j<x->cnt;j++){
  1194. if(bms->buffer[j].bidx < bms->buffer[j - 1].bidx){
  1195. continue;
  1196. }
  1197. kbm->seeds->buffer[x->off + len].bidx = bms->buffer[j].bidx & MAX_U4;
  1198. kbm->sauxs->buffer[x->off + len].bidx = bms->buffer[0].bidx >> 32;
  1199. kbm->sauxs->buffer[x->off + len] = bms->buffer[j].aux;
  1200. len ++;
  1201. }
  1202. x->cnt = len;
  1203. }
  1204. }
  1205. free_kmeroffv(kmers[0]);
  1206. free_kmeroffv(kmers[1]);
  1207. free_tmpbmerv(bms);
  1208. }
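// Editorial note: simple_index_kbm() is the single-threaded, single-hash
// variant of index_kbm(): it sets flag bit 2 and keeps every kmer in
// kbm->hashs[0] instead of spreading them over KBM_N_HASH subtables.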
  1209. static inline KBMDP* init_kbmdp(){
  1210. KBMDP *dp;
  1211. dp = malloc(sizeof(KBMDP));
  1212. dp->kms = init_kbmdpev(1024);
  1213. dp->km_len = 0;
  1214. dp->cmask = init_bitvec(1024);
  1215. dp->cms = init_kbmcmerv(64);
  1216. dp->coffs = init_u4v(32);
  1217. dp->rmask[0] = init_bitvec(256);
  1218. dp->rmask[1] = init_bitvec(256);
  1219. dp->cells[0] = init_kbmcellv(16);
  1220. dp->cells[1] = init_kbmcellv(16);
  1221. dp->bts = init_bit2vec(1024);
  1222. dp->paths = init_kbmphash(13);
  1223. dp->boff = 0;
  1224. dp->last_bidx = 0;
  1225. return dp;
  1226. }
  1227. static inline void reset_kbmdp(KBMDP *dp, KBMAux *aux, u8i bidx){
  1228. //clear_kbmdpev(dp->kms);
  1229. //dp->km_len = 0;
  1230. clear_bitvec(dp->cmask);
  1231. clear_kbmcmerv(dp->cms);
  1232. clear_u4v(dp->coffs);
  1233. recap_bitvec(dp->rmask[0], aux->qnbit);
  1234. zeros_bitvec(dp->rmask[0]);
  1235. recap_bitvec(dp->rmask[1], aux->qnbit);
  1236. zeros_bitvec(dp->rmask[1]);
  1237. recap_kbmcellv(dp->cells[0], aux->qnbin);
  1238. recap_kbmcellv(dp->cells[1], aux->qnbin);
  1239. clear_bit2vec(dp->bts);
  1240. if(dp->paths->size > 1024 * 10){
  1241. free_kbmphash(dp->paths);
  1242. dp->paths = init_kbmphash(1023);
  1243. } else {
  1244. clear_kbmphash(dp->paths);
  1245. }
  1246. dp->boff = bidx;
  1247. dp->last_bidx = bidx;
  1248. }
  1249. static inline void clear_kbmdp(KBMDP *dp, KBMAux *aux, u8i bidx){
  1250. reset_kbmdp(dp, aux , bidx);
  1251. clear_kbmdpev(dp->kms);
  1252. dp->km_len = 0;
  1253. }
  1254. static inline void free_kbmdp(KBMDP *dp){
  1255. free_kbmdpev(dp->kms);
  1256. free_bitvec(dp->cmask);
  1257. free_kbmcmerv(dp->cms);
  1258. free_u4v(dp->coffs);
  1259. free_bitvec(dp->rmask[0]);
  1260. free_bitvec(dp->rmask[1]);
  1261. free_kbmcellv(dp->cells[0]);
  1262. free_kbmcellv(dp->cells[1]);
  1263. free_bit2vec(dp->bts);
  1264. free_kbmphash(dp->paths);
  1265. free(dp);
  1266. }
  1267. static inline KBMAux* init_kbmaux(KBM *kbm){
  1268. KBMAux *aux;
  1269. aux = malloc(sizeof(KBMAux));
  1270. aux->kbm = kbm;
  1271. aux->par = kbm->par;
  1272. aux->qtag = NULL;
  1273. aux->qseqs = NULL;
  1274. aux->qsoff = 0;
  1275. aux->qlen = 0;
  1276. aux->slen = 0;
  1277. aux->qidx = 0;
  1278. aux->qnbin = 0;
  1279. aux->qnbit = (aux->qnbin + 63) & 0xFFFFFFC0U;
  1280. aux->bmin = 0;
  1281. aux->bmax = MAX_U8;
  1282. aux->koffs[0] = adv_init_kmeroffv(32, 0, 1);
  1283. aux->koffs[1] = adv_init_kmeroffv(32, 0, 1);
  1284. aux->refs = init_kbmrefv(64);
  1285. aux->rank = init_u4v(64);
  1286. aux->nheap = 1024;
  1287. aux->heaps = malloc((aux->nheap + 1) * sizeof(u4v*));
  1288. {
  1289. u4i i;
  1290. for(i=0;i<=aux->nheap;i++){
  1291. aux->heaps[i] = init_u4v(8);
  1292. }
  1293. }
  1294. aux->bmlen = 1;
  1295. aux->bmoff = 0;
  1296. aux->bmcnt = aux->kbm->bins->size;
  1297. aux->binmap = NULL;
  1298. aux->caches[0] = init_kbmdpev(64);
  1299. aux->caches[1] = init_kbmdpev(64);
  1300. aux->dps[0] = init_kbmdp();
  1301. aux->dps[1] = init_kbmdp();
  1302. aux->hits = init_kbmmapv(16);
  1303. aux->cigars = init_bitsvec(1024, 3);
  1304. aux->solids = NULL;
  1305. aux->str = init_string(1024);
  1306. return aux;
  1307. }
  1308. static inline void free_kbmaux(KBMAux *aux){
  1309. free_kmeroffv(aux->koffs[0]);
  1310. free_kmeroffv(aux->koffs[1]);
  1311. free_kbmrefv(aux->refs);
  1312. free_u4v(aux->rank);
  1313. if(aux->heaps){
  1314. u4i i;
  1315. for(i=0;i<=aux->nheap;i++){
  1316. if(aux->heaps[i]) free_u4v(aux->heaps[i]);
  1317. }
  1318. free(aux->heaps);
  1319. }
  1320. if(aux->binmap) free(aux->binmap);
  1321. free_kbmdpev(aux->caches[0]);
  1322. free_kbmdpev(aux->caches[1]);
  1323. free_kbmdp(aux->dps[0]);
  1324. free_kbmdp(aux->dps[1]);
  1325. free_kbmmapv(aux->hits);
  1326. free_bitsvec(aux->cigars);
  1327. free_string(aux->str);
  1328. free(aux);
  1329. }
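// Query-side sketch (editorial addition): a KBMAux carries all per-query
// state, so one aux per thread can be reused across queries, e.g.
//   KBMAux *aux = init_kbmaux(kbm);
//   query_index_kbm(aux, "qry", qidx, qseqs, qoff, qlen); // collect seed hits
//   ... run the mapping routine defined later in this header, then read
//       alignments from aux->hits / aux->cigars ...
//   free_kbmaux(aux);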
  1330. static inline void query_index_kbm(KBMAux *aux, char *qtag, u4i qidx, BaseBank *rdseqs, u8i seqoff, u4i seqlen){
  1331. KBM *kbm;
  1332. KBMPar *par;
  1333. kbm_kmer_t *u;
  1334. kbm_kaux_t *x;
  1335. kmer_off_t *f;
  1336. kbm_ref_t *ref;
  1337. u8i sidx, bmin, bmax;
  1338. u4i hidx, next;
  1339. u4i i, j, l, tot, mr, pdir;
  1340. kbm = aux->kbm;
  1341. par = aux->par;
  1342. aux->qtag = qtag? qtag : kbm->reads->buffer[qidx].tag;
  1343. aux->qseqs = rdseqs;
  1344. aux->qsoff = seqoff;
  1345. aux->qlen = seqlen;
  1346. aux->slen = (seqlen / KBM_BIN_SIZE) * KBM_BIN_SIZE;
  1347. aux->qidx = qidx;
  1348. aux->qnbin = aux->slen / KBM_BIN_SIZE;
  1349. aux->qnbit = (aux->qnbin + 63) & 0xFFFFFFC0U;
  1350. clear_kbmdp(aux->dps[0], aux, 0);
  1351. clear_kbmdp(aux->dps[1], aux, 0);
  1352. clear_kbmrefv(aux->refs);
  1353. clear_u4v(aux->rank);
  1354. clear_kbmdpev(aux->caches[0]);
  1355. clear_kbmdpev(aux->caches[1]);
  1356. clear_kbmmapv(aux->hits);
  1357. clear_bitsvec(aux->cigars);
  1358. #ifdef TEST_MODE
  1359. if(par->test_mode >= 7) return;
  1360. #endif
  1361. bmin = par->self_aln? kbm->reads->buffer[qidx].binoff + kbm->reads->buffer[qidx].bincnt : 0;
  1362. if(bmin < aux->bmin) bmin = aux->bmin;
  1363. bmax = aux->bmax;
  1364. if(par->self_aln == 2){
  1365. par->strand_mask = 2; // 1: forward; 2: reverse; 3: both
  1366. }
  1367. split_FIXP_kmers_kbm(rdseqs, seqoff, aux->slen, par->ksize, par->psize, par->kmer_mod, aux->koffs);
  1368. #ifdef TEST_MODE
  1369. if(par->test_mode >= 6) return;
  1370. #endif
  1371. tot = 0;
  1372. for(i=0;i<2;i++){
  1373. next = 0;
  1374. for(j=0;j<aux->koffs[i]->size;j++){
  1375. f = ref_kmeroffv(aux->koffs[i], j);
  1376. if(f->closed) continue;
  1377. if(kbm->flags & (1LLU << 2)) f->kidx = 0;
  1378. u = get_kbmhash(kbm->hashs[f->kidx], f->kmer);
  1379. if(u == NULL || u->flt || u->tot < par->kmin){
  1380. continue;
  1381. }
  1382. x = ref_kbmkauxv(kbm->kauxs[f->kidx], offset_kbmhash(kbm->hashs[f->kidx], u));
  1383. ref = next_ref_kbmrefv(aux->refs);
  1384. ref->mer = u;
  1385. ref->aux = x;
  1386. ref->kidx = f->kidx;
  1387. ref->off = f->off;
  1388. ref->dir = i;
  1389. if(par->self_aln && aux->solids){
  1390. sidx = (kbm->reads->buffer[qidx].seqoff * KBM_BIN_SIZE + ref->off) >> 1;
  1391. ref->fine = get_bitvec(aux->solids, sidx);
  1392. } else {
  1393. ref->fine = 0;
  1394. }
  1395. ref->qbidx = ref->off / KBM_BIN_SIZE;
  1396. ref->poffs[0] = ref->off;
  1397. ref->poffs[1] = aux->slen - (ref->off + (aux->par->ksize + aux->par->psize));
  1398. ref->boff = x->off;
  1399. ref->bend = x->off + x->cnt;
  1400. ref->bidx = getval_bidx(kbm, x->off);
  1401. ref->closed = 0;
  1402. {
  1403. // Refine boundary
  1404. if(par->self_aln == 2){ // reverse complement only
  1405. while(ref->boff < ref->bend && getval_bidx(kbm, ref->bend - 1) > bmin){
  1406. ref->bend --;
  1407. }
  1408. }
  1409. while(ref->boff < ref->bend){
  1410. if(ref->bidx < bmin){
  1411. ref->boff ++;
  1412. ref->bidx = getval_bidx(kbm, ref->boff);
  1413. continue;
  1414. //} else if(ref->b->bidx >= bmax){
  1415. //break;
  1416. }
  1417. break;
  1418. }
  1419. while(ref->bend > ref->boff){
  1420. if(getval_bidx(kbm, ref->bend - 1) > bmax){
  1421. ref->bend --;
  1422. } else {
  1423. break;
  1424. }
  1425. }
  1426. if(ref->boff >= ref->bend){
  1427. ref->closed = 1;
  1428. }
  1429. }
  1430. if(ref->closed){
  1431. aux->refs->size --;
  1432. continue;
  1433. }
  1434. tot += x->cnt;
  1435. }
  1436. }
  1437. if(0){
  1438. //sort_array(aux->refs->buffer, aux->refs->size, kbm_ref_t, num_cmpgt(a.off, b.off));
  1439. for(i=0;i<aux->refs->size;i++){
  1440. ref = ref_kbmrefv(aux->refs, i);
  1441. fprintf(KBM_LOGF, "%s\t%d\t%c\t%d\n", aux->qtag, ref->off, "+-"[ref->dir], (int)ref->aux->cnt);
  1442. }
  1443. }
  1444. if(par->self_aln && aux->solids){
  1445. // Obsolete
  1446. sort_array(aux->refs->buffer, aux->refs->size, kbm_ref_t, num_cmpgt(a.off, b.off));
  1447. tot = 0;
  1448. next = 0;
  1449. for(i=0;i<aux->refs->size;i++){
  1450. ref = ref_kbmrefv(aux->refs, i);
  1451. if(ref->closed){
  1452. continue;
  1453. } else if(ref->fine){
  1454. tot += ref->bend - ref->boff;
  1455. next = ref->off + (aux->par->ksize + aux->par->psize) / 2 + 1;
  1456. } else if(ref->off >= next){
  1457. tot += ref->bend - ref->boff;
  1458. } else {
  1459. ref->boff = ref->bend;
  1460. ref->closed = 1;
  1461. }
  1462. }
  1463. } else if(aux->par->ksampling < KBM_BIN_SIZE && aux->refs->size){
  1464. sort_array(aux->refs->buffer, aux->refs->size, kbm_ref_t, num_cmpgtx(a.qbidx, b.qbidx, b.bend - b.boff, a.bend - a.boff));
  1465. tot = 0;
  1466. for(i=j=0;i<aux->refs->size;i++){
  1467. if(aux->refs->buffer[i].qbidx != aux->refs->buffer[j].qbidx){
  1468. if(aux->refs->buffer[j].qbidx){ // skip the first and last bin
  1469. if((i - j) > aux->par->ksampling){
  1470. l = j + aux->par->ksampling;
  1471. for(;j<l;j++){
  1472. tot += aux->refs->buffer[j].bend - aux->refs->buffer[j].boff;
  1473. }
  1474. for(;j<i;j++){
  1475. aux->refs->buffer[j].boff = aux->refs->buffer[j].bend;
  1476. aux->refs->buffer[j].bidx = getval_bidx(kbm, aux->refs->buffer[j].boff);
  1477. aux->refs->buffer[j].closed = 1;
  1478. }
  1479. }
  1480. }
  1481. j = i;
  1482. }
  1483. }
  1484. //sort_array(aux->refs->buffer, aux->refs->size, kbm_ref_t, num_cmpgt(a.off, b.off));
  1485. }
  1486. sort_array(aux->refs->buffer, aux->refs->size, kbm_ref_t, num_cmpgt(a.off, b.off));
  1487. // estimate binmap
  1488. aux->bmoff = 0;
  1489. if(aux->refs->size){
  1490. mr = aux->par->min_mat / (aux->par->ksize + aux->par->psize);
  1491. if(mr < 512) mr = 512;
  1492. aux->bmlen = tot / mr;
  1493. if(aux->bmlen == 0) aux->bmlen = 1;
  1494. aux->bmcnt = (aux->kbm->bins->size + aux->bmlen - 1) / aux->bmlen;
  1495. if(aux->bmcnt < aux->qnbin * 50){
  1496. aux->bmcnt = aux->qnbin * 50;
  1497. aux->bmlen = (aux->kbm->bins->size + aux->bmcnt - 1) / aux->bmcnt;
  1498. }
  1499. } else {
  1500. aux->bmlen = 1;
  1501. aux->bmcnt = aux->kbm->bins->size;
  1502. }
  1503. if(0 && aux->bmlen > aux->nheap){
  1504. aux->bmlen = aux->nheap;
  1505. aux->bmcnt = (aux->kbm->bins->size + aux->bmlen - 1) / aux->bmlen;
  1506. }
  1507. //fprintf(stderr, " -- %s tot=%d bmlen=%d bmcnt=%d mr=%d in %s -- %s:%d --\n", aux->qtag, tot, aux->bmlen, aux->bmcnt, tot / aux->bmlen, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1508. for(i=0;i<aux->nheap&&i<aux->bmlen;i++){
  1509. clear_u4v(aux->heaps[i]);
  1510. }
  1511. //fprintf(stderr, " -- %s tot=%d avg=%d bmlen=%d bmcnt=%d mr=%d qnbin=%d in %s -- %s:%d --\n", aux->qtag, tot, tot / aux->bmlen, aux->bmlen, aux->bmcnt, mr, aux->qnbin, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1512. #ifdef TEST_MODE
  1513. if(par->test_mode >= 5) return;
  1514. #endif
  1515. // init heaps
  1516. for(i=0;i<aux->refs->size;i++){
  1517. ref = ref_kbmrefv(aux->refs, i);
  1518. while(ref->boff < ref->bend){
  1519. if(0){
  1520. pdir = (ref->dir ^ kbm->sauxs->buffer[ref->boff].dir);
  1521. if(((aux->par->strand_mask >> pdir) & 0x01) == 0){
  1522. ref->boff ++;
  1523. ref->bidx = getval_bidx(aux->kbm, ref->boff);
  1524. continue;
  1525. }
  1526. }
  1527. hidx = ref->bidx / aux->bmcnt;
  1528. if(hidx - aux->bmoff < aux->nheap){
  1529. push_u4v(aux->heaps[hidx - aux->bmoff], i);
  1530. }
  1531. break;
  1532. }
  1533. }
  1534. aux->hptr = 0;
  1535. }
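// Debug helper: re-splits a read into seeds and, for every seed present in the index,
// prints its offset, strand, k-mer value, stored occurrence counts and filter flag.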
  1536. static inline void print_exists_index_kbm(KBM *kbm, char *qtag, BaseBank *rdseqs, u8i seqoff, u4i seqlen, kmeroffv *kmers[2], FILE *out){
  1537. KBMPar *par;
  1538. kbm_kmer_t *u;
  1539. kbm_kaux_t *x;
  1540. kmer_off_t *f;
  1541. u4i i, j;
  1542. par = kbm->par;
  1543. split_FIXP_kmers_kbm(rdseqs, seqoff, seqlen, par->ksize, par->psize, par->kmer_mod, kmers);
  1544. for(i=0;i<2;i++){
  1545. for(j=0;j<kmers[i]->size;j++){
  1546. f = ref_kmeroffv(kmers[i], j);
  1547. if(f->closed) continue;
  1548. if(kbm->flags & 0x04) f->kidx = 0;
  1549. u = get_kbmhash(kbm->hashs[f->kidx], f->kmer);
  1550. if(u == NULL){
  1551. continue;
  1552. }
  1553. x = ref_kbmkauxv(kbm->kauxs[f->kidx], offset_kbmhash(kbm->hashs[f->kidx], u));
  1554. fprintf(out, "%s\t%d\t%c\t0x%llx\t%u\t%u\t%c\n", qtag, f->off, "+-"[f->dir], f->kmer, x->cnt, u->tot, "YN"[u->flt]);
  1555. }
  1556. }
  1557. }
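// Records the best-scoring DP cell per path start. Paths are keyed by their begin
// coordinate in dp->paths; an existing entry is only overwritten when the new cell
// scores higher. Returns 1 if the path table was updated.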
  1558. static inline int _update_dp_path_kbm(KBMDP *dp, u8i end, kbm_cell_t *c){
  1559. int exists;
  1560. kbm_path_t *p, P;
  1561. P.beg = c->beg;
  1562. p = prepare_kbmphash(dp->paths, P, &exists);
  1563. if(exists){
  1564. if(p->score < c->score){
  1565. p->end = end;
  1566. p->mat = c->mat;
  1567. p->score = c->score;
  1568. } else {
  1569. return 0;
  1570. }
  1571. } else {
  1572. p->beg = c->beg;
  1573. p->end = end;
  1574. p->mat = c->mat;
  1575. p->score = c->score;
  1576. }
  1577. return 1;
  1578. }
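// Computes one sparse DP row over the query bins for the newest target bin. Only cells
// flagged in the row masks are visited; each cell takes the best of a diagonal,
// horizontal or vertical move, scoring matched k-mers (kbm_cmer_t) against gap (pgap)
// and band-variance (pvar) penalties, and records a 2-bit backtrace in dp->bts.
// Cells whose gap or variance exceeds max_bgap/max_bvar are dropped from the band.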
  1579. static inline void _dp_cal_spare_row_kbm(KBMAux *aux, int dir){
  1580. KBMDP *dp;
  1581. kbmcellv *cells[2];
  1582. BitVec *masks[2];
  1583. kbm_cmer_t *m;
  1584. kbm_cell_t D, H, V;
  1585. u4i i, n, ni, rowoff;
  1586. u8i bitoff, celoff;
  1587. int flg, is_gap, score;
  1588. dp = aux->dps[dir];
  1589. flg = (dp->last_bidx - dp->boff) & 0x01;
  1590. cells[0] = dp->cells[flg];
  1591. cells[1] = dp->cells[!flg];
  1592. masks[0] = dp->rmask[flg];
  1593. masks[1] = dp->rmask[!flg];
  1594. rowoff = dp->coffs->buffer[dp->last_bidx - dp->boff];
  1595. bitoff = (dp->last_bidx - dp->boff) * aux->qnbit;
  1596. celoff = (dp->last_bidx - dp->boff) * aux->qnbin;
  1597. // update the mask of cells to be computed from the newly arriving kbm_cmer_t entries
  1598. for(i=0;i<aux->qnbit;i+=64){
  1599. masks[1]->bits[i >> 6] = masks[0]->bits[i >> 6] | dp->cmask->bits[(i + bitoff) >> 6];
  1600. }
  1601. // dp core
  1602. H = KBM_CELL_NULL;
  1603. i = next_one_bitvec(masks[1], 0);
  1604. D = (i && get_bitvec(masks[0], i - 1))? cells[0]->buffer[i - 1] : KBM_CELL_NULL;
  1605. reg_zeros_bitvec(masks[0], 0, i);
  1606. encap_bit2vec(dp->bts, aux->qnbin);
  1607. n = 0;
  1608. while(i < aux->qnbin){
  1609. if(get_bitvec(dp->cmask, i + bitoff)){
  1610. is_gap = 0;
  1611. m = ref_kbmcmerv(dp->cms, rowoff + n);
  1612. score = m->kmat + m->kcnt;
  1613. n ++;
  1614. } else {
  1615. is_gap = 1;
  1616. m = (kbm_cmer_t*)&KBM_CMER_NULL;
  1617. score = aux->par->pgap;
  1618. }
  1619. // horizontal
  1620. {
  1621. H.score += score;
  1622. H.mat += m->kmat;
  1623. //H.gap = 0;
  1624. }
  1625. if(H.var < 0){
  1626. H.score += - aux->par->pvar; // score increases as abs(var) decreases
  1627. } else {
  1628. H.score += aux->par->pvar;
  1629. }
  1630. H.var ++;
  1631. H.bt = 1; // horizontal backtrace: insertion in the query sequence
  1632. // diagonal
  1633. {
  1634. D.score += score;
  1635. D.mat += m->kmat;
  1636. //D.gap = 0;
  1637. }
  1638. D.var = 0; // reset var on the diagonal move
  1639. D.bt = 0; // diagonal backtrace
  1640. if(D.score >= H.score){
  1641. H = D;
  1642. }
  1643. // vertical
  1644. V = D = get_bitvec(masks[0], i)? cells[0]->buffer[i] : KBM_CELL_NULL;
  1645. {
  1646. V.score += score;
  1647. V.mat += m->kmat;
  1648. //V.gap = 0;
  1649. }
  1650. if(V.var > 0){
  1651. V.score += - aux->par->pvar; // score increases as abs(var) decreases
  1652. } else {
  1653. V.score += aux->par->pvar;
  1654. }
  1655. V.var --;
  1656. V.bt = 2; // vertical backtrace
  1657. if(V.score > H.score){
  1658. H = V;
  1659. }
  1660. if(is_gap){
  1661. H.gap ++;
  1662. } else {
  1663. H.gap = 0;
  1664. }
  1665. set_bit2vec(dp->bts, dp->bts->size + i, H.bt);
  1666. // initialize a new path ID when the cell has no progenitor
  1667. if(H.beg == 0){
  1668. H.beg = 1 + i + celoff;
  1669. }
  1670. #if __DEBUG__
  1671. if(KBM_LOG >= KBM_LOG_ALL){
  1672. fprintf(KBM_LOGF, "KBMLOG%d [x=%d, y=%llu, beg=%llu, score=%d, gap=%d, var=%d, bt=%d]\n", __LINE__, i, dp->last_bidx, (u8i)H.beg, H.score, H.gap, H.var, H.bt);
  1673. }
  1674. #endif
  1675. if(H.score > 0 && H.gap <= aux->par->max_bgap && num_abs(H.var) <= aux->par->max_bvar){
  1676. // set cell, move next
  1677. one_bitvec(masks[1], i);
  1678. cells[1]->buffer[i] = H;
  1679. if(is_gap == 0 && H.mat >= aux->par->min_mat) _update_dp_path_kbm(dp, 1 + i + celoff, &H);
  1680. i ++;
  1681. } else if(D.score > 0){
  1682. // move next, may have diagonal hit
  1683. zero_bitvec(masks[1], i);
  1684. H = KBM_CELL_NULL;
  1685. i ++;
  1686. } else {
  1687. zero_bitvec(masks[1], i);
  1688. ni = next_one_bitvec(masks[1], i + 1);
  1689. if(i + 1 < ni){
  1690. reg_zeros_bitvec(masks[1], i + 1, ni);
  1691. D = H = KBM_CELL_NULL;
  1692. }
  1693. i = ni;
  1694. }
  1695. }
  1696. // prepare next row
  1697. //NEXT_ROW:
  1698. push_u4v(dp->coffs, dp->cms->size);
  1699. zero_bitvec(dp->cmask, dp->cmask->n_bit);
  1700. dp->cmask->n_bit += aux->qnbit;
  1701. one_bitvec(dp->cmask, dp->cmask->n_bit);
  1702. dp->bts->size += aux->qnbin;
  1703. dp->last_bidx ++;
  1704. }
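// Walks the 2-bit backtrace matrix from the end of a path back to its start, emitting
// a kbm_map_t hit plus a 3-bit cigar (low 2 bits: 0=M, 1=I, 2=D; the 0x4 bit marks a
// bin with no seed support). Hits failing the min_mat/min_sim/max_gap/aln_var filters
// are rolled back, and query coordinates are flipped for reverse-strand mappings.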
  1705. static inline int _backtrace_map_kbm(KBMAux *aux, int dir, kbm_path_t *p){
  1706. KBMDP *dp;
  1707. kbm_map_t *hit;
  1708. kbm_cmer_t *c;
  1709. u8i cgoff, sidx;
  1710. u4i i, mat, cnt, gap, cglen;
  1711. int tmp, x, y, bt, lst;
  1712. dp = aux->dps[dir];
  1713. hit = next_ref_kbmmapv(aux->hits);
  1714. hit->qidx = aux->qidx;
  1715. hit->qdir = dir;
  1716. hit->tidx = aux->kbm->bins->buffer[dp->boff + p->beg / aux->qnbin].ridx;
  1717. hit->tdir = 0;
  1718. hit->qb = p->beg % aux->qnbin;
  1719. hit->qe = p->end % aux->qnbin;
  1720. hit->tb = p->beg / aux->qnbin;
  1721. hit->te = p->end / aux->qnbin;
  1722. hit->aln = num_min(hit->qe - hit->qb, hit->te - hit->tb);
  1723. hit->aln ++;
  1724. if(hit->aln < aux->par->min_aln){
  1725. aux->hits->size --;
  1726. return 0;
  1727. }
  1728. cgoff = aux->cigars->size;
  1729. cglen = 0;
  1730. cnt = 0;
  1731. mat = 0;
  1732. gap = 0;
  1733. x = hit->qe;
  1734. y = hit->te;
  1735. lst = 0;
  1736. while(x >= hit->qb && y >= hit->tb){
  1737. bt = get_bit2vec(dp->bts, x + y * aux->qnbin);
  1738. if(get_bitvec(dp->cmask, x + y * aux->qnbit)){
  1739. c = ref_kbmcmerv(dp->cms, rank_bitvec(dp->cmask, x + y * aux->qnbit));
  1740. cnt += c->kcnt;
  1741. mat += c->kmat;
  1742. // try to merge 'ID' or 'DI' into 'M'
  1743. if(lst == 0){
  1744. if(bt == 0){
  1745. push_bitsvec(aux->cigars, 0);
  1746. }
  1747. lst = bt;
  1748. } else if(bt == 0){
  1749. push_bitsvec(aux->cigars, lst);
  1750. push_bitsvec(aux->cigars, 0);
  1751. lst = 0;
  1752. } else if(bt == lst){
  1753. push_bitsvec(aux->cigars, lst);
  1754. } else {
  1755. push_bitsvec(aux->cigars, 0);
  1756. lst = 0;
  1757. }
  1758. } else {
  1759. gap ++;
  1760. if(lst){
  1761. push_bitsvec(aux->cigars, lst);
  1762. }
  1763. lst = 0;
  1764. push_bitsvec(aux->cigars, 0x4 | bt);
  1765. }
  1766. switch(bt){
  1767. case 0: x --; y --; break;
  1768. case 1: x --; break;
  1769. default: y --; break;
  1770. }
  1771. }
  1772. if(lst){
  1773. push_bitsvec(aux->cigars, lst);
  1774. }
  1775. cglen = aux->cigars->size - cgoff;
  1776. if(mat < (u4i)aux->par->min_mat || mat < UInt(hit->aln * KBM_BSIZE * aux->par->min_sim)
  1777. || gap > (u4i)(hit->aln * aux->par->max_gap)
  1778. || hit->aln < (int)aux->par->min_aln
  1779. || num_diff(hit->qe - hit->qb, hit->te - hit->tb) > (int)num_max(aux->par->aln_var * hit->aln, 1.0)){
  1780. aux->hits->size --;
  1781. aux->cigars->size = cgoff;
  1782. return 0;
  1783. }
  1784. if(aux->par->self_aln && aux->solids){
  1785. // Obsolete
  1786. x = hit->qe;
  1787. y = hit->te;
  1788. while(x >= hit->qb && y >= hit->tb){
  1789. bt = get_bit2vec(dp->bts, x + y * aux->qnbin);
  1790. if(get_bitvec(dp->cmask, x + y * aux->qnbit)){
  1791. c = ref_kbmcmerv(dp->cms, rank_bitvec(dp->cmask, x + y * aux->qnbit));
  1792. for(i=0;i<c->kcnt;i++){
  1793. sidx = seed2solid_idx_kbm(aux->kbm, dp->kms->buffer + c->koff + i);
  1794. one_bitvec(aux->solids, sidx); // not thread-safe, but harmless
  1795. }
  1796. } else {
  1797. }
  1798. switch(bt){
  1799. case 0: x --; y --; break;
  1800. case 1: x --; break;
  1801. default: y --; break;
  1802. }
  1803. }
  1804. }
  1805. hit->qe = (hit->qe + 1);
  1806. hit->tb = (dp->boff + hit->tb - aux->kbm->reads->buffer[hit->tidx].binoff);
  1807. hit->te = (dp->boff + hit->te - aux->kbm->reads->buffer[hit->tidx].binoff + 1);
  1808. hit->mat = mat;
  1809. hit->cnt = cnt;
  1810. hit->gap = gap;
  1811. hit->cgoff = cgoff;
  1812. hit->cglen = cglen;
  1813. if(dir){
  1814. tmp = aux->qnbin - hit->qb;
  1815. hit->qb = aux->qnbin - hit->qe;
  1816. hit->qe = tmp;
  1817. }
  1818. #if __DEBUG__
  1819. if(KBM_LOG){
  1820. fprintf(KBM_LOGF, "HIT\tQ[%d]\t%c\t%d\t%d", hit->qidx, "+-"[hit->qdir], hit->qb, hit->qe);
  1821. fprintf(KBM_LOGF, "\tT[%d]\t%c\t%d\t%d", hit->tidx, "+-"[hit->tdir], hit->tb, hit->te);
  1822. fprintf(KBM_LOGF, "\t%d\t%d\t%d\t%d\n", hit->mat, hit->aln, hit->cnt, hit->gap);
  1823. }
  1824. #endif
  1825. return 1;
  1826. }
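// Sanity check: replays the cigar and verifies that the summed M/I/D advances match
// the hit's query and target spans. The shift tricks decode each op: (0b0011 >> bt) & 1
// is set for M(0) and I(1), which advance the query; (0b0101 >> bt) & 1 is set for M(0)
// and D(2), which advance the target. Returns non-zero when the cigar is inconsistent.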
  1827. static inline int check_hit_cigar_kbm(kbm_map_t *hit, BitsVec *cigars){
  1828. u4i i, bt;
  1829. int x, y;
  1830. x = y = -1;
  1831. i = hit->cglen;
  1832. while(i){
  1833. bt = get_bitsvec(cigars, hit->cgoff + i - 1);
  1834. bt = bt & 0x03;
  1835. x += (0b0011 >> bt) & 0x01;
  1836. y += (0b0101 >> bt) & 0x01;
  1837. i --;
  1838. }
  1839. return !(x + 1 + hit->qb == hit->qe && y + 1 + hit->tb == hit->te);
  1840. }
  1841. static inline void print_hit_kbm(KBM *kbm, char *qtag, u4i qlen, kbm_map_t *hit, BitsVec *cigars, String *_str, FILE *out){
  1842. String *str;
  1843. u8i coff;
  1844. u4i clen, len, bt, _bt;
  1845. if(hit->mat == 0) return;
  1846. fprintf(out, "%s\t%c\t%d\t%d\t%d", qtag, "+-"[hit->qdir], qlen, hit->qb * KBM_BIN_SIZE, hit->qe * KBM_BIN_SIZE);
  1847. fprintf(out, "\t%s\t%c\t%d\t%d\t%d", kbm->reads->buffer[hit->tidx].tag, "+-"[hit->tdir], kbm->reads->buffer[hit->tidx].bincnt * KBM_BIN_SIZE, hit->tb * KBM_BIN_SIZE, hit->te * KBM_BIN_SIZE);
  1848. fprintf(out, "\t%d\t%d\t%d\t%d\t", hit->mat, hit->aln * KBM_BIN_SIZE, hit->cnt, hit->gap);
  1849. if(cigars){
  1850. str = _str? _str : init_string(64);
  1851. bt = len = 0;
  1852. coff = hit->cgoff;
  1853. clen = hit->cglen;
  1854. clear_string(str);
  1855. while(clen){
  1856. _bt = get_bitsvec(cigars, coff + clen - 1);
  1857. if(_bt == bt){
  1858. len ++;
  1859. } else {
  1860. if(len > 1){
  1861. add_int_string(str, len);
  1862. add_char_string(str, "MID?mid?"[bt]);
  1863. } else if(len == 1){
  1864. add_char_string(str, "MID?mid?"[bt]);
  1865. }
  1866. bt = _bt;
  1867. len = 1;
  1868. }
  1869. clen --;
  1870. }
  1871. if(len > 1){
  1872. add_int_string(str, len);
  1873. add_char_string(str, "MID?mid?"[bt]);
  1874. } else if(len == 1){
  1875. add_char_string(str, "MID?mid?"[bt]);
  1876. }
  1877. fputs(str->string, out);
  1878. if(_str == NULL) free_string(str);
  1879. } else {
  1880. fputc('*', out);
  1881. }
  1882. fprintf(out, "\n");
  1883. }
  1884. static inline void fprint_hit_kbm(KBMAux *aux, u4i hidx, FILE *out){
  1885. kbm_map_t *hit;
  1886. hit = ref_kbmmapv(aux->hits, hidx);
  1887. print_hit_kbm(aux->kbm, aux->qtag, aux->slen, hit, aux->cigars, aux->str, out);
  1888. }
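// Copies a hit from src into dst with query and target roles swapped. The cigar is
// appended to dst->cigars; when the original query direction was reversed the ops are
// read back-to-front, and I/D ops are exchanged (the 2-bit op is complemented) while
// the 0x4 gap flag is preserved.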
  1889. static inline void flip_hit_kbmaux(KBMAux *dst, KBMAux *src, u4i hidx){
  1890. kbm_map_t *h1, *h2;
  1891. u4i t, i;
  1892. h2 = next_ref_kbmmapv(dst->hits);
  1893. h1 = ref_kbmmapv(src->hits, hidx);
  1894. //h2->qidx = h1->tidx;
  1895. h2->qidx = dst->qidx;
  1896. h2->qdir = h1->qdir;
  1897. h2->tidx = h1->qidx;
  1898. h2->tdir = h1->tdir;
  1899. h2->cgoff = dst->cigars->size;
  1900. h2->cglen = h1->cglen;
  1901. h2->qb = h1->tb;
  1902. h2->qe = h1->te;
  1903. h2->tb = h1->qb;
  1904. h2->te = h1->qe;
  1905. h2->mat = h1->mat;
  1906. h2->cnt = h1->cnt;
  1907. h2->aln = h1->aln;
  1908. h2->gap = h1->gap;
  1909. if(h1->qdir){
  1910. for(i=0;i<h1->cglen;i++){
  1911. t = get_bitsvec(src->cigars, h1->cgoff + h1->cglen - i - 1);
  1912. if((t & 0x03)){
  1913. t = ((~(t & 0x03)) & 0x03) | (t & 0x4);
  1914. }
  1915. push_bitsvec(dst->cigars, t);
  1916. }
  1917. } else {
  1918. for(i=0;i<h1->cglen;i++){
  1919. t = get_bitsvec(src->cigars, h1->cgoff + i);
  1920. if((t & 0x03)){
  1921. t = ((~(t & 0x03)) & 0x03) | (t & 0x4);
  1922. }
  1923. push_bitsvec(dst->cigars, t);
  1924. }
  1925. }
  1926. }
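// Converts every recorded DP path into a hit: builds the rank index over the cell mask
// so the backtrace can locate kbm_cmer_t entries, then backtracks each path and counts
// how many survive the filters.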
  1927. static inline int _dp_path2map_kbm(KBMAux *aux, int dir){
  1928. KBMDP *dp;
  1929. kbm_path_t *p;
  1930. u4i ret;
  1931. dp = aux->dps[dir];
  1932. ret = 0;
  1933. index_bitvec_core(dp->cmask, roundup_times(dp->cmask->n_bit, 64 * 8));
  1934. reset_iter_kbmphash(dp->paths);
  1935. while((p = ref_iter_kbmphash(dp->paths))){
  1936. p->beg --; // 1-based to 0-based
  1937. p->end --;
  1938. #if __DEBUG__
  1939. if(KBM_LOG >= KBM_LOG_HIG){
  1940. fprintf(KBM_LOGF, "KBMLOG%d\t%d\t%c\tkbm_path_t[%llu(%d:%d),%llu(%d:%d),%d]\n", __LINE__, aux->qidx, "+-"[dir], (u8i)p->beg, (u4i)(p->beg % aux->qnbin), (u4i)((p->beg / aux->qnbin) + dp->boff), (u8i)p->end, (u4i)(p->end % aux->qnbin), (u4i)((p->end / aux->qnbin) + dp->boff), p->score);
  1941. }
  1942. #endif
  1943. if(_backtrace_map_kbm(aux, dir, p)){
  1944. ret ++;
  1945. }
  1946. }
  1947. //clear_kbmphash(dp->paths);
  1948. return ret;
  1949. }
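// Streams seed matches (kbm_dpe_t, sorted by target bin) for one query strand into
// dp->kms. Matches accumulate while they stay on the same target read, within max_bgap
// bins of the previous match and under max_bcnt bins overall; when a break occurs (or
// p == NULL flushes the stream), the accumulated seeds are collapsed into per-bin
// kbm_cmer_t cells, the sparse DP rows are computed, and paths are turned into hits.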
  1950. static inline void push_kmer_match_kbm(KBMAux *aux, int dir, kbm_dpe_t *p){
  1951. KBM *kbm;
  1952. KBMDP *dp;
  1953. kbm_cmer_t *c;
  1954. kbm_dpe_t *e, E;
  1955. u8i bb;
  1956. u4i i, qb, kmat, kcnt, blen;
  1957. kbm = aux->kbm;
  1958. dp = aux->dps[dir];
  1959. if(p == NULL){
  1960. if(dp->kms->size == 0){
  1961. dp->km_len = 0;
  1962. return;
  1963. }
  1964. e = ref_kbmdpev(dp->kms, dp->kms->size - 1);
  1965. if((int)dp->km_len < aux->par->min_mat || e->bidx + 1 < dp->kms->buffer[0].bidx + aux->par->min_aln){
  1966. clear_kbmdpev(dp->kms);
  1967. dp->km_len = aux->par->ksize + aux->par->psize;
  1968. return;
  1969. }
  1970. } else {
  1971. if(dp->kms->size == 0){
  1972. dp->km_len = (aux->par->ksize + aux->par->psize);
  1973. push_kbmdpev(dp->kms, *p);
  1974. return;
  1975. }
  1976. e = ref_kbmdpev(dp->kms, dp->kms->size - 1);
  1977. if(e->bidx == p->bidx){
  1978. if(p->poff <= e->poff + aux->par->ksize + aux->par->psize){
  1979. dp->km_len += p->poff - e->poff;
  1980. } else {
  1981. dp->km_len += aux->par->ksize + aux->par->psize;
  1982. }
  1983. push_kbmdpev(dp->kms, *p);
  1984. return;
  1985. #if __DEBUG__
  1986. } else if(p->bidx < e->bidx){
  1987. fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  1988. abort();
  1989. #endif
  1990. }
  1991. if(e->bidx + aux->par->max_bgap + 1 < p->bidx || dp->kms->buffer[0].bidx + aux->par->max_bcnt < p->bidx || aux->kbm->bins->buffer[e->bidx].ridx != aux->kbm->bins->buffer[p->bidx].ridx){
  1992. if(Int(dp->km_len) < aux->par->min_mat || e->bidx + 1 < dp->kms->buffer[0].bidx + aux->par->min_aln){
  1993. clear_kbmdpev(dp->kms);
  1994. push_kbmdpev(dp->kms, *p);
  1995. dp->km_len = aux->par->ksize + aux->par->psize;
  1996. return;
  1997. }
  1998. } else {
  1999. dp->km_len += aux->par->ksize + aux->par->psize;
  2000. push_kbmdpev(dp->kms, *p);
  2001. return;
  2002. }
  2003. }
  2004. reset_kbmdp(dp, aux, dp->kms->buffer[0].bidx);
  2005. #ifdef TEST_MODE
  2006. if(aux->par->test_mode >= 1){
  2007. clear_kbmdpev(dp->kms);
  2008. if(p){
  2009. dp->km_len = aux->par->ksize + aux->par->psize;
  2010. push_kbmdpev(dp->kms, *p);
  2011. }
  2012. return;
  2013. }
  2014. #endif
  2015. blen = dp->kms->buffer[dp->kms->size - 1].bidx + 1 - dp->kms->buffer[0].bidx + 2;
  2016. {
  2017. push_u4v(dp->coffs, 0);
  2018. clear_bitvec(dp->cmask);
  2019. encap_bitvec(dp->cmask, aux->qnbit * blen);
  2020. reg_zeros_bitvec(dp->cmask, 0, aux->qnbit * blen);
  2021. dp->cmask->n_bit = aux->qnbit;
  2022. one_bitvec(dp->cmask, dp->cmask->n_bit);
  2023. }
  2024. qb = dp->kms->buffer[0].poff;
  2025. bb = dp->kms->buffer[0].bidx;
  2026. kcnt = 0;
  2027. kmat = aux->par->ksize + aux->par->psize;
  2028. E.bidx = dp->kms->buffer[dp->kms->size - 1].bidx + 1;
  2029. E.poff = 0;
  2030. for(i=0;i<=dp->kms->size;i++){
  2031. e = (i < dp->kms->size)? ref_kbmdpev(dp->kms, i) : &E;
  2032. if(e->bidx == bb && e->poff / KBM_BIN_SIZE == qb / KBM_BIN_SIZE){
  2033. kcnt ++;
  2034. if(qb + aux->par->ksize + aux->par->psize >= e->poff){
  2035. kmat += e->poff - qb;
  2036. } else {
  2037. kmat += aux->par->ksize + aux->par->psize;
  2038. }
  2039. } else {
  2040. one_bitvec(dp->cmask, (bb - dp->boff) * aux->qnbit + qb / KBM_BIN_SIZE);
  2041. c = next_ref_kbmcmerv(dp->cms);
  2042. c->koff = i - kcnt;
  2043. c->kcnt = kcnt;
  2044. c->kmat = kmat;
  2045. c->boff = bb - dp->boff;
  2046. #if __DEBUG__
  2047. if(KBM_LOG >= KBM_LOG_ALL){
  2048. fprintf(KBM_LOGF, "KBMLOG%d [x=%d, y=%d(%s:%d), off=%d cnt=%d, mat=%d]\n", __LINE__, qb / KBM_BIN_SIZE, bb,
  2049. aux->kbm->reads->buffer[aux->kbm->bins->buffer[bb].ridx].tag, bb - aux->kbm->reads->buffer[aux->kbm->bins->buffer[bb].ridx].binoff, c->koff, c->kcnt, c->kmat);
  2050. }
  2051. #endif
  2052. while(bb < e->bidx){
  2053. _dp_cal_spare_row_kbm(aux, dir);
  2054. bb ++;
  2055. }
  2056. kcnt = 1;
  2057. kmat = aux->par->ksize + aux->par->psize;
  2058. }
  2059. qb = e->poff;
  2060. }
  2061. // flush last row
  2062. _dp_cal_spare_row_kbm(aux, dir);
  2063. // collect maps
  2064. _dp_path2map_kbm(aux, dir);
  2065. clear_kbmdpev(dp->kms);
  2066. if(p) push_kbmdpev(dp->kms, *p);
  2067. dp->km_len = aux->par->ksize + aux->par->psize;
  2068. }
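// Main mapping driver. Target bins are processed bucket by bucket (aux->bmcnt bins per
// bucket, aux->bmlen buckets); heaps[h] lists the seeds whose next occurrence falls in
// bucket h. For each bucket the seed occurrences are gathered per strand, sorted by
// target bin, and fed to push_kmer_match_kbm(), stopping early once max_hit hits are
// collected; a final NULL push flushes the last candidate region on each enabled strand.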
  2069. static inline void map_kbm(KBMAux *aux){
  2070. KBM *kbm;
  2071. kbm_ref_t *ref;
  2072. kbm_baux_t *saux;
  2073. u4v *heap;
  2074. u4i idx, hidx, i, j, pdir;
  2075. kbm = aux->kbm;
  2076. #ifdef TEST_MODE
  2077. if(aux->par->test_mode >= 4) return;
  2078. #endif
  2079. while(aux->hptr < aux->bmlen){
  2080. if(aux->hptr - aux->bmoff >= aux->nheap){
  2081. aux->bmoff += aux->nheap;
  2082. for(i=0;i<aux->nheap;i++){
  2083. clear_u4v(aux->heaps[i]);
  2084. }
  2085. for(i=0;i<aux->refs->size;i++){
  2086. ref = ref_kbmrefv(aux->refs, i);
  2087. while(ref->boff < ref->bend){
  2088. hidx = ref->bidx / aux->bmcnt;
  2089. if(hidx - aux->bmoff < aux->nheap){
  2090. push_u4v(aux->heaps[hidx - aux->bmoff], i);
  2091. }
  2092. break;
  2093. }
  2094. }
  2095. }
  2096. heap = aux->heaps[aux->hptr - aux->bmoff];
  2097. if(heap->size){
  2098. clear_kbmdpev(aux->caches[0]);
  2099. clear_kbmdpev(aux->caches[1]);
  2100. for(i=0;i<heap->size;i++){
  2101. idx = heap->buffer[i];
  2102. ref = ref_kbmrefv(aux->refs, idx);
  2103. while(1){
  2104. saux = ref_kbmbauxv(kbm->sauxs, ref->boff);
  2105. pdir = (ref->dir ^ saux->dir);
  2106. if(((aux->par->strand_mask >> pdir) & 0x01)){
  2107. push_kbmdpev(aux->caches[pdir], (kbm_dpe_t){ref->poffs[pdir], idx, ref->bidx, saux->koff});
  2108. }
  2109. ref->boff ++;
  2110. ref->bidx = getval_bidx(aux->kbm, ref->boff);
  2111. if(ref->boff >= ref->bend) break;
  2112. #if __DEBUG__
  2113. if(ref->bidx < getval_bidx(aux->kbm, ref->boff - 1)){
  2114. fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  2115. abort();
  2116. }
  2117. #endif
  2118. hidx = ref->bidx / aux->bmcnt;
  2119. if(hidx > aux->hptr){
  2120. if(hidx - aux->bmoff < aux->nheap){
  2121. push_u4v(aux->heaps[hidx - aux->bmoff], idx);
  2122. }
  2123. break;
  2124. }
  2125. }
  2126. }
  2127. {
  2128. #ifdef TEST_MODE
  2129. if(aux->par->test_mode <= 2){
  2130. #endif
  2131. if(aux->caches[0]->size * (aux->par->ksize + aux->par->psize) < UInt(aux->par->min_mat)){
  2132. aux->caches[0]->size = 0;
  2133. } else {
  2134. sort_array(aux->caches[0]->buffer, aux->caches[0]->size, kbm_dpe_t, num_cmpgtx(a.bidx, b.bidx, a.poff, b.poff));
  2135. }
  2136. if(aux->caches[1]->size * (aux->par->ksize + aux->par->psize) < UInt(aux->par->min_mat)){
  2137. aux->caches[1]->size = 0;
  2138. } else {
  2139. sort_array(aux->caches[1]->buffer, aux->caches[1]->size, kbm_dpe_t, num_cmpgtx(a.bidx, b.bidx, a.poff, b.poff));
  2140. }
  2141. //sort_array(aux->caches[0]->buffer, aux->caches[0]->size, kbm_dpe_t, num_cmpgtx(a.bidx, b.bidx, a.poff, b.poff));
  2142. //sort_array(aux->caches[1]->buffer, aux->caches[1]->size, kbm_dpe_t, num_cmpgtx(a.bidx, b.bidx, a.poff, b.poff));
  2143. // TODO: sorting by bidx+koff would be more reasonable; push_kmer_match_kbm would need to be modified accordingly
  2144. #ifdef TEST_MODE
  2145. }
  2146. #endif
  2147. #ifdef TEST_MODE
  2148. if(aux->par->test_mode <= 1){
  2149. #endif
  2150. for(i=0;i<2;i++){
  2151. for(j=0;j<aux->caches[i]->size;j++){
  2152. #if __DEBUG__
  2153. if(KBM_LOG >= KBM_LOG_ALL){
  2154. //fprintf(KBM_LOGF, "KBMLOG%d\t%d\t%d\t%c\t%d\t%llu[%d,%d]\t%llu[%d,%d]\n", __LINE__, aux->qidx, ref->poffs[ref->pdir], "+-"[ref->pdir], aux->hptr, ref->bidx, aux->kbm->bins->buffer[ref->bidx].ridx, aux->kbm->bins->buffer[ref->bidx].off * KBM_BIN_SIZE, (u8i)e->bidx, aux->kbm->bins->buffer[e->bidx].ridx, e->poff);
  2155. }
  2156. #endif
  2157. push_kmer_match_kbm(aux, i, aux->caches[i]->buffer + j);
  2158. }
  2159. }
  2160. if(aux->hits->size >= aux->par->max_hit) return;
  2161. #ifdef TEST_MODE
  2162. }
  2163. #endif
  2164. }
  2165. }
  2166. aux->hptr ++;
  2167. }
  2168. if(aux->par->strand_mask & 0x01) push_kmer_match_kbm(aux, 0, NULL);
  2169. if(aux->par->strand_mask & 0x02) push_kmer_match_kbm(aux, 1, NULL);
  2170. }
  2171. // KBM's tag2idx is invalid after loading and needs to be rebuilt
  2172. static inline void rebuild_tag2idx_kbm(void *_kbm, size_t aux){
  2173. KBM *kbm;
  2174. kbm_read_t *rd;
  2175. u4i i;
  2176. UNUSED(aux);
  2177. kbm = (KBM*)_kbm;
  2178. clear_cuhash(kbm->tag2idx); // hash size is unchanged, so no rehash will occur
  2179. for(i=0;i<kbm->reads->size;i++){
  2180. rd = ref_kbmreadv(kbm->reads, i);
  2181. if(rd->tag) put_cuhash(kbm->tag2idx, (cuhash_t){rd->tag, i});
  2182. }
  2183. kbm->flags |= 1LLU << 0;
  2184. }
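// Attempts to chain two or more colinear hits (assumed to come from the same
// query/target pair) into a single map: hits are sorted by target begin, spliced
// together with synthetic gap cigar ops (0x4 | op), and the merged hit is kept only if
// its query/target length difference stays within max_aln_var of the alignment length.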
  2185. static inline int simple_chain_all_maps_kbm(kbm_map_t *srcs, u4i size, BitsVec *src_cigars, kbm_map_t *dst, BitsVec *dst_cigars, float max_aln_var){
  2186. kbm_map_t *hit;
  2187. u4i i, x, y, z, f;
  2188. if(size < 2) return 0;
  2189. sort_array(srcs, size, kbm_map_t, num_cmpgt(a.tb, b.tb));
  2190. *dst = srcs[size - 1];
  2191. dst->cgoff = dst_cigars->size;
  2192. append_bitsvec(dst_cigars, src_cigars, srcs[size - 1].cgoff, srcs[size - 1].cglen);
  2193. for(i=size-1;i>0;i--){
  2194. hit = srcs + i - 1;
  2195. if(dst->tb < hit->te){
  2196. goto FAILED;
  2197. } else {
  2198. y = dst->tb - hit->te;
  2199. dst->tb = hit->tb;
  2200. }
  2201. if(dst->qdir){
  2202. if(hit->qe > dst->qb){
  2203. goto FAILED;
  2204. } else {
  2205. x = dst->qb - hit->qe;
  2206. dst->qb = hit->qb;
  2207. }
  2208. } else {
  2209. if(dst->qb < hit->qe){
  2210. goto FAILED;
  2211. } else {
  2212. x = dst->qb - hit->qe;
  2213. dst->qb = hit->qb;
  2214. }
  2215. }
  2216. dst->mat += hit->mat;
  2217. dst->cnt += hit->cnt;
  2218. dst->gap += hit->gap;
  2219. dst->gap += num_max(x, y);
  2220. z = num_min(x, y);
  2221. f = 0x4 | 0; // diagonal GAP
  2222. pushs_bitsvec(dst_cigars, f, z);
  2223. x -= z;
  2224. y -= z;
  2225. if(x > y){
  2226. z = x;
  2227. f = 0x4 | 1;
  2228. } else {
  2229. z = y;
  2230. f = 0x4 | 2;
  2231. }
  2232. pushs_bitsvec(dst_cigars, f, z);
  2233. append_bitsvec(dst_cigars, src_cigars, hit->cgoff, hit->cglen);
  2234. }
  2235. dst->aln = num_min(dst->qe - dst->qb, dst->te - dst->tb);
  2236. if(dst->aln * max_aln_var < num_diff(dst->qe - dst->qb, dst->te - dst->tb)){
  2237. goto FAILED;
  2238. }
  2239. dst->cglen = dst_cigars->size - dst->cgoff;
  2240. return 1;
  2241. FAILED:
  2242. dst_cigars->size = dst->cgoff;
  2243. return 0;
  2244. }
  2245. #endif