kbm.c 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722
  1. /*
  2. *
  3. * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "kbm.h"
  20. #include "kbmpoa.h"
  21. #include <regex.h>
  22. #ifndef VERSION
  23. #define VERSION 0.0
  24. #endif
  25. #ifndef RELEASE
  26. #define RELEASE 19830203
  27. #endif
  28. int kbm_usage(){
  29. fprintf(stdout, "Program: kbm is a simple instance which implemented kmer-binmap\n");
  30. fprintf(stdout, " it maps query sequence against reference by kmer matching\n");
  31. fprintf(stdout, " matched kmer-pairs are bined (256bp) and counted in a matrix\n");
  32. fprintf(stdout, " dynamic programming is used to search the best path\n");
  33. fprintf(stdout, "Version: %s (%s)\n", TOSTR(VERSION), TOSTR(RELEASE));
  34. fprintf(stdout, "Author: Jue Ruan <ruanjue@gmail.com>\n");
  35. fprintf(stdout, "Usage: kbm <options> [start|list|stop]\n");
  36. fprintf(stdout, "Options:\n");
  37. fprintf(stdout, " -i <string> File(s) of query sequences, +, [STDIN]\n");
  38. fprintf(stdout, " -d <string> File(s) of reference sequences, +, [<-i>]\n");
  39. fprintf(stdout, " -L <int> Choose the longest subread and drop reads shorter than <int> (5000 recommended for PacBio) [0]\n");
  40. fprintf(stdout, " Negative integer indicate keeping read names, e.g. -5000.\n");
  41. fprintf(stdout, " -o <string> Output file, [STDOUT]\n");
  42. fprintf(stdout, " -I Interactive mode\n");
  43. fprintf(stdout, " e.g. `mkfifo pipe` then `while true; do cat pipe && sleep 1; done | kbm -t 8 -I -d ref.fa -i - -Hk 21 -S 4`\n");
  44. fprintf(stdout, " then `cat 1.fq >pipe; cat 2.fq >pipe`, fastq format is better in interaction\n");
  45. fprintf(stdout, " -f Force overwrite\n");
  46. fprintf(stdout, " -t <int> Number of threads, 0: all cores, [1]\n");
  47. fprintf(stdout, " -k <int> Kmer-f size, <= %d, [0]\n", KBM_MAX_KSIZE);
  48. fprintf(stdout, " -p <int> Kmer-p size, <= %d, [21]\n", KBM_MAX_KSIZE);
  49. fprintf(stdout, " -K <float> Filter high frequency kmers, maybe repetitive, [1000]\n");
  50. fprintf(stdout, " if K >= 1, take the integer value as cutoff, MUST <= 65535\n");
  51. fprintf(stdout, " else, mask the top fraction part high frequency kmers\n");
  52. fprintf(stdout, " -E <int> Min kmer frequency, [1]\n");
  53. fprintf(stdout, " -O <int> Filter low complexity bins (#indexed_kmer less than <-O>), [2]\n");
  54. fprintf(stdout, " -S <float> Subsampling kmers, 1/(<-S>) kmers are indexed, [4.00]\n");
  55. fprintf(stdout, " -S is very useful in saving memeory and speeding up\n");
  56. fprintf(stdout, " please note that subsampling kmers will have less matched length\n");
  57. fprintf(stdout, " -B <int> Select no more than n seeds in a query bin, [256]\n");
  58. // Obsolete
  59. //fprintf(stdout, " -G <int> Recognize error kmers in a bin when be aligned >= <-G> times, [0]\n");
  60. fprintf(stdout, " If you are using shared kbmidx by other process using -D too, it will bring wrong behavior\n");
  61. fprintf(stdout, " -D <int> Strand of alignment, 1: forward, 2: reverse, 3: both, [3]\n");
  62. fprintf(stdout, " -X <int> Max number of bin(256bp) in one gap, [4]\n");
  63. fprintf(stdout, " -Y <int> Max number of bin(256bp) in one deviation, [4]\n");
  64. fprintf(stdout, " -Z <float> Max fraction of gapped BINs / aligned BINs, [0.6]\n");
  65. fprintf(stdout, " -x <int> penalty for BIN gap, [-7]\n");
  66. fprintf(stdout, " -y <int> penalty for BIN deviation, [-21]\n");
  67. fprintf(stdout, " -z <int> Enable refine alignment with -p <-z> [0]\n");
  68. fprintf(stdout, " -l <int> Min alignment length, [2048]\n");
  69. fprintf(stdout, " -m <int> Min matched length, [200]\n");
  70. fprintf(stdout, " -s <float> Min similarity, calculated by kmer matched length / aligned length, [0.05]\n");
  71. fprintf(stdout, " -r <float> Max length variation of two aligned fragments, [0.25]\n");
  72. fprintf(stdout, " -c Insist to query contained reads against all\n");
  73. fprintf(stdout, " -C Chainning alignments\n");
  74. fprintf(stdout, " -n <int> Max hits per query, [1000]\n");
  75. #ifdef TEST_MODE
  76. fprintf(stdout, " -T <int> For debug, [0]\n");
  77. #endif
  78. fprintf(stdout, " -W <string> Dump kbm index to file, [NULL]\n");
  79. fprintf(stdout, " -R <string> Load kbm index from file, [NULL]\n");
  80. fprintf(stdout, " -q Quiet\n");
  81. fprintf(stdout, " -V Print version information and then exit\n");
  82. #if __DEBUG__
  83. fprintf(stdout, " -v Verbose, +\n");
  84. #endif
  85. fprintf(stdout, "Server start: {kbm -R <wt.fa.kbmidx> start}, will mmap wt.fa.kbmidx into mmeory\n");
  86. fprintf(stdout, "Server list: {kbm -R <wt.fa.kbmidx> list [10]}, will list the object tree in file\n");
  87. fprintf(stdout, "Server stop: {kbm -R <wt.fa.kbmidx> stop}, will remove the mmap object\n");
  88. return 1;
  89. }
  90. thread_beg_def(maln);
  91. CTGCNS *cc;
  92. KBMAux *aux;
  93. String *rdtag;
  94. BaseBank *rdseqs;
  95. u4i qidx;
  96. u8i rdoff;
  97. u4i rdlen;
  98. int corr_mode;
  99. float corr_cov;
  100. u4i corr_min, corr_max;
  101. FILE *out, *lay;
  102. int chainning;
  103. int interactive;
  104. int refine;
  105. thread_end_def(maln);
  106. thread_beg_func(maln);
  107. KBMPar *rpar;
  108. KBM *rkbm;
  109. KBMAux *raux;
  110. kbm_map_t HIT;
  111. u4v *tidxs;
  112. {
  113. rpar = init_kbmpar();
  114. rpar->ksize = 0;
  115. rpar->psize = maln->refine;
  116. rpar->min_bin_degree = 0;
  117. rpar->kmin = 1;
  118. rpar->kmax = 1000;
  119. rpar->kmer_mod = KBM_N_HASH;
  120. rkbm = init_kbm(rpar);
  121. raux = init_kbmaux(rkbm);
  122. }
  123. tidxs = init_u4v(16);
  124. thread_beg_loop(maln);
  125. if(maln->rdlen == 0) break;
  126. if(maln->corr_mode){
  127. if(map_kbmpoa(maln->cc, maln->aux, maln->rdtag->size? maln->rdtag->string : NULL, maln->qidx, maln->rdseqs, maln->rdoff, maln->rdlen, maln->corr_min, maln->corr_max, maln->corr_cov, maln->lay) == 0){
  128. clear_kbmmapv(maln->aux->hits);
  129. break;
  130. }
  131. } else {
  132. query_index_kbm(maln->aux, maln->rdtag->size? maln->rdtag->string : NULL, maln->qidx, maln->rdseqs, maln->rdoff, maln->rdlen);
  133. map_kbm(maln->aux);
  134. if(maln->refine && maln->aux->hits->size){
  135. kbm_read_t *rd;
  136. kbm_map_t *hit;
  137. u4i i, j, tidx;
  138. clear_kbm(rkbm);
  139. bitpush_kbm(rkbm, maln->rdtag->size? maln->rdtag->string : NULL, maln->rdtag->size, maln->rdseqs->bits, 0, maln->rdlen);
  140. ready_kbm(rkbm);
  141. simple_index_kbm(rkbm, 0, rkbm->bins->size);
  142. clear_u4v(tidxs);
  143. for(i=0;i<maln->aux->hits->size;i++){
  144. hit = ref_kbmmapv(maln->aux->hits, i);
  145. if(tidxs->size == 0 || hit->tidx != tidxs->buffer[tidxs->size - 1]){
  146. push_u4v(tidxs, hit->tidx);
  147. }
  148. if(KBM_LOG){
  149. fprintf(maln->out, "#");
  150. fprint_hit_kbm(maln->aux, i, maln->out);
  151. }
  152. }
  153. clear_kbmmapv(maln->aux->hits);
  154. clear_bitsvec(maln->aux->cigars);
  155. for(i=0;i<tidxs->size;i++){
  156. tidx = get_u4v(tidxs, i);
  157. rd = ref_kbmreadv(maln->aux->kbm->reads, tidx);
  158. query_index_kbm(raux, rd->tag, tidx, maln->aux->kbm->rdseqs, rd->seqoff * KBM_BIN_SIZE, rd->bincnt * KBM_BIN_SIZE);
  159. map_kbm(raux);
  160. for(j=0;j<raux->hits->size;j++){
  161. flip_hit_kbmaux(maln->aux, raux, j);
  162. }
  163. }
  164. }
  165. }
  166. if(maln->chainning){
  167. u4i idx, lst;
  168. for(idx=lst=0;idx<=maln->aux->hits->size;idx++){
  169. if(idx == maln->aux->hits->size || maln->aux->hits->buffer[lst].tidx != maln->aux->hits->buffer[idx].tidx || maln->aux->hits->buffer[idx].qdir != maln->aux->hits->buffer[lst].qdir){
  170. if(idx > lst + 1){
  171. if(simple_chain_all_maps_kbm(maln->aux->hits->buffer + lst, idx - lst, maln->aux->cigars, &HIT, maln->aux->cigars, maln->aux->par->aln_var)){
  172. maln->aux->hits->buffer[lst++] = HIT;
  173. while(lst < idx){
  174. maln->aux->hits->buffer[lst++].mat = 0;
  175. }
  176. }
  177. }
  178. lst = idx;
  179. }
  180. }
  181. }
  182. if(maln->aux->par->max_hit){
  183. sort_array(maln->aux->hits->buffer, maln->aux->hits->size, kbm_map_t, num_cmpgt(b.mat, a.mat));
  184. if(maln->aux->hits->size > maln->aux->par->max_hit) maln->aux->hits->size = maln->aux->par->max_hit;
  185. }
  186. if(maln->interactive){
  187. u4i i;
  188. thread_beg_syn(maln);
  189. for(i=0;i<maln->aux->hits->size;i++){
  190. fprint_hit_kbm(maln->aux, i, maln->out);
  191. }
  192. fflush(maln->out);
  193. thread_end_syn(maln);
  194. }
  195. thread_end_loop(maln);
  196. {
  197. free_kbmaux(raux);
  198. free_kbm(rkbm);
  199. free_kbmpar(rpar);
  200. }
  201. free_u4v(tidxs);
  202. thread_end_func(maln);
  203. int kbm_main(int argc, char **argv){
  204. cplist *qrys, *refs;
  205. char *outf, *loadf, *dumpf;
  206. FILE *out, *dump;
  207. KBM *kbm;
  208. KBMPar *par;
  209. KBMAux *aux;
  210. BitVec *solids, *rdflags;
  211. FileReader *fr;
  212. BioSequence *seqs[2], *seq;
  213. char regtag[14];
  214. u8i tot_bp, max_bp, opt_flags, nhit;
  215. u4i qidx, i;
  216. int c, ncpu, buffered_read, overwrite, quiet, tidy_reads, tidy_rdtag, skip_ctn, chainning, interactive, server, tree_maxcnt;
  217. int solid_kmer, refine;
  218. float fval;
  219. thread_preprocess(maln);
  220. par = init_kbmpar();
  221. par->rd_len_order = 1;
  222. chainning = 0;
  223. KBM_LOG = 0;
  224. buffered_read = 1;
  225. skip_ctn = 1;
  226. interactive = 0;
  227. solid_kmer = 0;
  228. refine = 0;
  229. solids = NULL;
  230. rdflags = NULL;
  231. ncpu = 1;
  232. quiet = 0;
  233. tidy_reads = 0;
  234. tidy_rdtag = -1;
  235. max_bp = MAX_U8;
  236. qrys = init_cplist(4);
  237. refs = init_cplist(4);
  238. outf = NULL;
  239. loadf = NULL;
  240. dumpf = NULL;
  241. overwrite = 0;
  242. server = 0;
  243. tree_maxcnt = 10;
  244. opt_flags = 0;
  245. while((c = getopt(argc, argv, "hi:d:o:fIt:k:p:K:E:O:S:B:G:D:X:Y:Z:x:y:z:l:m:n:s:cr:CT:W:R:qvV")) != -1){
  246. switch(c){
  247. case 'h': return kbm_usage();
  248. case 'i': push_cplist(qrys, optarg); break;
  249. case 'd': push_cplist(refs, optarg); break;
  250. case 'L': tidy_reads = atoi(optarg); break;
  251. case 'o': outf = optarg; break;
  252. case 'f': overwrite = 1; break;
  253. case 'I': interactive = 1; break;
  254. case 't': ncpu = atoi(optarg); break;
  255. case 'k': par->ksize = atoi(optarg); opt_flags |= (1 << 1); break;
  256. case 'p': par->psize = atoi(optarg); opt_flags |= (1 << 0); break;
  257. case 'K': fval = atof(optarg); par->kmax = fval; par->ktop = fval - par->kmax; break;
  258. case 'E': par->kmin = atoi(optarg); break;
  259. case 'O': par->min_bin_degree = atoi(optarg); break;
  260. case 'S': par->kmer_mod = UInt(atof(optarg) * KBM_N_HASH); opt_flags |= (1 << 2); break;
  261. case 'B': par->ksampling = atoi(optarg); break;
  262. case 'G': solid_kmer = atoi(optarg); break;
  263. case 'D': par->strand_mask = atoi(optarg); break;
  264. case 'X': par->max_bgap = atoi(optarg); break;
  265. case 'Y': par->max_bvar = atoi(optarg); break;
  266. case 'Z': par->max_gap = atof(optarg); break;
  267. case 'x': par->pgap = atoi(optarg); break;
  268. case 'y': par->pvar = atoi(optarg); break;
  269. case 'z': refine = atoi(optarg); break;
  270. case 'l': par->min_aln = atoi(optarg) / KBM_BIN_SIZE; break;
  271. case 'm': par->min_mat = atoi(optarg); break;
  272. case 'n': par->max_hit = atoi(optarg); break;
  273. case 's': par->min_sim = atof(optarg); break;
  274. case 'r': par->aln_var = atof(optarg); break;
  275. case 'c': skip_ctn = 0; break;
  276. case 'C': chainning = 1; break;
  277. #ifdef TEST_MODE
  278. case 'T': par->test_mode = atoi(optarg); break;
  279. #endif
  280. case 'W': dumpf = optarg; break;
  281. case 'R': loadf = optarg; break;
  282. case 'q': quiet = 1; break;
  283. case 'v': KBM_LOG ++; break;
  284. case 'V': fprintf(stdout, "kbm2 %s\n", TOSTR(VERSION)); return 0;
  285. default: return kbm_usage();
  286. }
  287. }
  288. if(quiet){
  289. int devnull;
  290. devnull = open("/dev/null", O_WRONLY);
  291. dup2(devnull, KBM_LOGFNO);
  292. }
  293. if(tidy_rdtag == -1){
  294. tidy_rdtag = (tidy_reads >= 0);
  295. }
  296. if(tidy_reads < 0) tidy_reads = - tidy_reads;
  297. BEG_STAT_PROC_INFO(KBM_LOGF, argc, argv);
  298. if(par->ksize + par->psize > KBM_MAX_KSIZE){
  299. fprintf(stderr, " -- Invalid kmer size %d+%d=%d > %d in %s -- %s:%d --\n", par->ksize, par->psize, par->ksize + par->psize, KBM_MAX_KSIZE, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  300. return 1;
  301. }
  302. out = open_file_for_write(outf, NULL, overwrite);
  303. if(refs->size == 0){
  304. if(loadf){
  305. } else if(qrys->size){
  306. append_cplist(refs, qrys);
  307. clear_cplist(qrys);
  308. } else {
  309. fprintf(KBM_LOGF, " ** -h to show document\n"); fflush(KBM_LOGF);
  310. push_cplist(refs, "-");
  311. }
  312. }
  313. if(qrys->size == 0){
  314. par->self_aln = 1;
  315. interactive = 0;
  316. if(par->kmin < 2) par->kmin = 2;
  317. }
  318. if(interactive) buffered_read = 0;
  319. if(ncpu <= 0){
  320. ncpu = _proc_deamon->ncpu;
  321. if(ncpu == 0) ncpu = 1; // failed to get number of cores
  322. }
  323. if(KBM_LOG > 0){
  324. fprintf(KBM_LOGF, "KBM_LOG_LEVEL = %d\n", KBM_LOG);
  325. }
  326. if(optind < argc){
  327. server = 0;
  328. if(strcasecmp("start", argv[optind]) == 0) server = 1;
  329. else if(strcasecmp("stop", argv[optind]) == 0) server = 2;
  330. else if(strcasecmp("list", argv[optind]) == 0){
  331. server = 3;
  332. if(optind + 1 < argc) tree_maxcnt = atoi(argv[optind + 1]);
  333. }
  334. if(loadf == NULL) server = 0;
  335. }
  336. if(loadf){
  337. if(server == 1){
  338. fprintf(KBM_LOGF, "[%s] loading kbm index from %s\n", date(), loadf);
  339. kbm = mem_load_obj_file(&kbm_obj_desc, loadf, NULL, NULL, NULL, NULL);
  340. fprintf(KBM_LOGF, "[%s] Done. %u sequences, %llu bp, parameter('-k %d -p %d -S %d')\n", date(), (u4i)kbm->reads->size, (u8i)kbm->rdseqs->size, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod / KBM_N_HASH);
  341. fprintf(KBM_LOGF, "[%s] kbm-index server start\n", date());
  342. return 0;
  343. } else if(server == 2){
  344. if(mem_stop_obj_file(loadf)){
  345. fprintf(KBM_LOGF, "[%s] kbm-index server for '%s' stop\n", date(), loadf);
  346. } else {
  347. fprintf(KBM_LOGF, "[%s] unable to find kbm-index server for '%s'\n", date(), loadf);
  348. }
  349. return 0;
  350. } else if(server == 3){
  351. print_tree_obj_file(stdout, &kbm_obj_desc, loadf, tree_maxcnt, 0);
  352. return 0;
  353. } else {
  354. fprintf(KBM_LOGF, "[%s] loading kbm index from %s\n", date(), loadf);
  355. if((kbm = mem_find_obj_file(&kbm_obj_desc, loadf, NULL, NULL, NULL, NULL, 1)) == NULL){
  356. fprintf(KBM_LOGF, " -- cannot find mmap object %s --\n", loadf);
  357. fprintf(KBM_LOGF, " -- try read from file --\n");
  358. kbm = mem_read_obj_file(&kbm_obj_desc, loadf, NULL, NULL, NULL, NULL);
  359. }
  360. }
  361. fprintf(KBM_LOGF, "[%s] Done. %u sequences, %llu bp, parameter('-k %d -p %d -S %d')\n", date(), (u4i)kbm->reads->size, (u8i)kbm->rdseqs->size, kbm->par->ksize, kbm->par->psize, kbm->par->kmer_mod / KBM_N_HASH);
  362. // Please note that, kbm->tag2idx is not functional after mem_load
  363. // check KBMPar
  364. if((opt_flags >> 0) & 0x01){
  365. if(kbm->par->psize != par->psize){
  366. fprintf(KBM_LOGF, " ** -p is different, %d != %d\n", kbm->par->psize, par->psize); exit(1);
  367. }
  368. } else {
  369. par->psize = kbm->par->psize;
  370. }
  371. if((opt_flags >> 1) & 0x01){
  372. if(kbm->par->ksize != par->ksize){
  373. fprintf(KBM_LOGF, " ** -k is different, %d != %d\n", kbm->par->ksize, par->ksize); exit(1);
  374. }
  375. } else {
  376. par->ksize = kbm->par->ksize;
  377. }
  378. if((opt_flags >> 2) & 0x01){
  379. if(kbm->par->kmer_mod != par->kmer_mod){
  380. fprintf(KBM_LOGF, " ** -S is different, %d != %d\n", kbm->par->kmer_mod / KBM_N_HASH, par->kmer_mod / KBM_N_HASH); exit(1);
  381. }
  382. } else {
  383. par->kmer_mod = kbm->par->kmer_mod;
  384. }
  385. if((opt_flags >> 3) & 0x01){
  386. if(kbm->par->rd_len_order != par->rd_len_order){
  387. fprintf(KBM_LOGF, " ** par->rd_len_order is different, %d != %d\n", kbm->par->rd_len_order, par->rd_len_order); exit(1);
  388. }
  389. } else {
  390. par->rd_len_order = kbm->par->rd_len_order;
  391. }
  392. } else {
  393. kbm = init_kbm(par);
  394. fprintf(KBM_LOGF, "[%s] loading sequences\n", date()); fflush(KBM_LOGF);
  395. fr = open_all_filereader(refs->size, refs->buffer, buffered_read);
  396. tot_bp = 0;
  397. seqs[0] = init_biosequence();
  398. seqs[1] = init_biosequence();
  399. regex_t reg;
  400. regmatch_t mats[3];
  401. int z, tag_size, len;
  402. z = regcomp(&reg, "^(.+?)/[0-9]+_[0-9]+$", REG_EXTENDED);
  403. if(z){
  404. regerror(z, &reg, regtag, 13);
  405. fprintf(stderr, " -- REGCOMP: %s --\n", regtag); fflush(stderr);
  406. return 1;
  407. }
  408. {
  409. int k = 0;
  410. reset_biosequence(seqs[0]);
  411. reset_biosequence(seqs[1]);
  412. while(1){
  413. int has = readseq_filereader(fr, seqs[k]);
  414. if(tidy_reads){
  415. if(has){
  416. if((z = regexec(&reg, seqs[k]->tag->string, 3, mats, 0)) == 0){
  417. trunc_string(seqs[k]->tag, mats[1].rm_eo);
  418. } else if(z != REG_NOMATCH){
  419. regerror(z, &reg, regtag, 13);
  420. fprintf(stderr, " -- REGEXEC: %s --\n", regtag); fflush(stderr);
  421. }
  422. //fprintf(stderr, "1: %s len=%d\n", seqs[k]->tag->string, seqs[k]->seq->size); fflush(stderr);
  423. //fprintf(stderr, "2: %s len=%d\n", seqs[!k]->tag->string, seqs[!k]->seq->size); fflush(stderr);
  424. if(seqs[k]->tag->size == seqs[!k]->tag->size && strcmp(seqs[k]->tag->string, seqs[!k]->tag->string) == 0){
  425. if(seqs[k]->seq->size > seqs[!k]->seq->size){
  426. k = !k;
  427. }
  428. continue;
  429. } else {
  430. seq = seqs[!k];
  431. k = !k;
  432. }
  433. } else {
  434. seq = seqs[!k];
  435. }
  436. if(seq->seq->size < tidy_reads){
  437. if(has) continue;
  438. else break;
  439. }
  440. if(tidy_rdtag){
  441. sprintf(regtag, "S%010llu", (u8i)kbm->reads->size);
  442. clear_string(seq->tag);
  443. append_string(seq->tag, regtag, 11);
  444. }
  445. } else {
  446. if(has == 0) break;
  447. seq = seqs[k];
  448. }
  449. tag_size = seq->tag->size;
  450. for(i=0;i<UInt(seq->seq->size);i+=KBM_MAX_RDLEN){
  451. len = num_min(seq->seq->size - i, KBM_MAX_RDLEN);
  452. if(i){
  453. append_string(seq->tag, "_V", 2);
  454. add_int_string(seq->tag, i / KBM_MAX_RDLEN);
  455. }
  456. if(!KBM_LOG && (kbm->reads->size % 10000) == 0){ fprintf(KBM_LOGF, "\r%u", (u4i)kbm->reads->size); fflush(KBM_LOGF); }
  457. //fprintf(stderr, " -- %s len=%d in %s -- %s:%d --\n", seq->tag->string, seq->seq->size, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  458. if(kbm->reads->size >= KBM_MAX_RDCNT){
  459. fprintf(stderr, " -- Read Number Out of Range: %u --\n", (u4i)kbm->reads->size); fflush(stderr);
  460. break;
  461. }
  462. push_kbm(kbm, seq->tag->string, seq->tag->size, seq->seq->string + i, len);
  463. if(i){ seq->tag->size = tag_size; seq->tag->string[tag_size] = '\0'; }
  464. }
  465. tot_bp += seq->seq->size;
  466. if(max_bp && tot_bp >= max_bp){ break; }
  467. if(has == 0) break;
  468. if(kbm->reads->size >= KBM_MAX_RDCNT){
  469. fprintf(stderr, " -- Read Number Out of Range: %u --\n", (u4i)kbm->reads->size); fflush(stderr);
  470. break;
  471. }
  472. }
  473. }
  474. close_filereader(fr);
  475. regfree(&reg);
  476. free_biosequence(seqs[0]);
  477. free_biosequence(seqs[1]);
  478. if(!KBM_LOG){ fprintf(KBM_LOGF, "\r%u reads", (unsigned)kbm->reads->size); fflush(KBM_LOGF); }
  479. ready_kbm(kbm);
  480. fprintf(KBM_LOGF, "\n[%s] Done, %u reads, %llu bp, %u bins\n", date(), (u4i)kbm->reads->size, tot_bp, (u4i)kbm->bins->size); fflush(KBM_LOGF);
  481. fprintf(KBM_LOGF, "[%s] indexing, %d threads\n", date(), ncpu);
  482. index_kbm(kbm, 0, kbm->bins->size, ncpu, NULL);
  483. if(dumpf){
  484. u8i size;
  485. dump = open_file_for_write(dumpf, NULL, 1);
  486. size = mem_dump_obj_file(kbm, 1, &kbm_obj_desc, 1, 0, dump);
  487. fclose(dump);
  488. fprintf(KBM_LOGF, "[%s] kbm index dumped to %s\n", date(), dumpf);
  489. }
  490. }
  491. fprintf(KBM_LOGF, "[%s] mapping\n", date());
  492. if(solid_kmer && par->self_aln){
  493. solids = init_bitvec((kbm->rdseqs->size >> 1) + 1);
  494. }
  495. thread_beg_init(maln, ncpu);
  496. maln->aux = init_kbmaux(kbm);
  497. maln->aux->par = par; // par might be different from kbm->par
  498. maln->aux->solids = solids;
  499. //maln->aux->bmin = 611077674;
  500. //maln->aux->bmax = 611077674 + 172;
  501. maln->rdtag = init_string(64);
  502. maln->rdseqs = qrys->size? init_basebank() : kbm->rdseqs;
  503. maln->rdoff = 0;
  504. maln->rdlen = 0;
  505. maln->cc = NULL;
  506. maln->corr_mode = 0;
  507. maln->corr_cov = 0.75;
  508. maln->corr_min = 5;
  509. maln->corr_max = 10;
  510. maln->out = out;
  511. maln->lay = NULL;
  512. maln->chainning = chainning;
  513. maln->interactive = interactive;
  514. maln->refine = refine;
  515. thread_end_init(maln);
  516. if(qrys->size){
  517. int run_mode;
  518. fr = open_all_filereader(qrys->size, qrys->buffer, buffered_read);
  519. run_mode = 0;
  520. if(readline_filereader(fr) > 0){
  521. if(strcasecmp(fr->line->string, "#pairwise_test") == 0){
  522. run_mode = 1;
  523. } else if(strcasecmp(fr->line->string, "#print_exists") == 0){
  524. run_mode = 2;
  525. } else if(strcasecmp(fr->line->string, "#correct_align") == 0){
  526. run_mode = 3;
  527. cns_debug = KBM_LOG;
  528. thread_beg_iter(maln);
  529. maln->corr_mode = 1;
  530. KBMBlock *kb;
  531. POGPar par;
  532. kb = init_kbmblock(2048, 2048 - 512);
  533. par = DEFAULT_POG_PAR;
  534. //maln->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, 1, maln->corr_max, 200, 100, 1, 96, 2, -5, -2, -4, -1, 16, 3, 0.5, 512);
  535. par.refmode = 1;
  536. maln->cc = init_ctgcns(kb, iter_kbmblock, info_kbmblock, 1, 1, maln->corr_max, 200, 100, 1, 512, &par);
  537. maln->lay = maln->out;
  538. thread_end_iter(maln);
  539. } else if(strcasecmp(fr->line->string, "#print_read") == 0){
  540. run_mode = 4;
  541. } else {
  542. rollback_filereader(fr);
  543. }
  544. }
  545. seq = init_biosequence();
  546. qidx = 0;
  547. nhit = 0;
  548. if(run_mode == 0 || run_mode == 3){
  549. while(readseq_filereader(fr, seq)){
  550. if((qidx % 100) == 0){
  551. fprintf(KBM_LOGF, "\r%u\t%llu", qidx, nhit); fflush(KBM_LOGF);
  552. }
  553. thread_wait_one(maln);
  554. if(maln->rdlen && !maln->interactive){
  555. {
  556. aux = maln->aux;
  557. for(i=0;i<aux->hits->size;i++){
  558. fprint_hit_kbm(aux, i, out);
  559. }
  560. if(run_mode == 3) fflush(out);
  561. nhit += aux->hits->size;
  562. }
  563. }
  564. trunc_string(seq->seq, kbm_cvt_length(seq->seq->size));
  565. clear_basebank(maln->rdseqs);
  566. seq2basebank(maln->rdseqs, seq->seq->string, seq->seq->size);
  567. clear_string(maln->rdtag);
  568. append_string(maln->rdtag, seq->tag->string, seq->tag->size);
  569. maln->qidx = qidx ++;
  570. maln->rdoff = 0;
  571. maln->rdlen = seq->seq->size;
  572. thread_wake(maln);
  573. }
  574. } else if(run_mode == 1){
  575. int nc;
  576. u4i tidx;
  577. thread_beg_operate(maln, 0);
  578. aux = maln->aux;
  579. free_basebank(maln->rdseqs);
  580. maln->rdseqs = kbm->rdseqs;
  581. while((nc = readtable_filereader(fr)) >= 0){
  582. if(nc < 2) continue;
  583. qidx = getval_cuhash(kbm->tag2idx, get_col_str(fr, 0));
  584. tidx = getval_cuhash(kbm->tag2idx, get_col_str(fr, 1));
  585. if(qidx == MAX_U4 || tidx == MAX_U4) continue;
  586. if(qidx > tidx){ swap_var(qidx, tidx); }
  587. maln->qidx = qidx;
  588. maln->rdoff = kbm->reads->buffer[qidx].seqoff * KBM_BIN_SIZE;
  589. maln->rdlen = kbm->reads->buffer[qidx].bincnt * KBM_BIN_SIZE;
  590. aux->bmin = kbm->reads->buffer[tidx].binoff;
  591. aux->bmax = kbm->reads->buffer[tidx].binoff + kbm->reads->buffer[tidx].bincnt;
  592. fprintf(out, "%s <-> %s\n", kbm->reads->buffer[qidx].tag, kbm->reads->buffer[tidx].tag);
  593. thread_wake(maln);
  594. thread_wait(maln);
  595. if(maln->rdlen && !maln->interactive){
  596. for(i=0;i<aux->hits->size;i++){
  597. fprint_hit_kbm(aux, i, out);
  598. }
  599. nhit += aux->hits->size;
  600. }
  601. fprintf(out, "//\n");
  602. maln->rdlen = 0;
  603. }
  604. } else if(run_mode == 2){
  605. kmeroffv *kmers[2];
  606. int nc;
  607. kmers[0] = adv_init_kmeroffv(32, 0, 1);
  608. kmers[1] = adv_init_kmeroffv(32, 0, 1);
  609. while((nc = readtable_filereader(fr)) >= 0){
  610. //fprintf(stdout, "%s\n", fr->line->string);
  611. if(nc < 1) continue;
  612. qidx = getval_cuhash(kbm->tag2idx, get_col_str(fr, 0));
  613. if(qidx == MAX_U4) continue;
  614. print_exists_index_kbm(kbm, kbm->reads->buffer[qidx].tag, kbm->rdseqs, kbm->reads->buffer[qidx].seqoff * KBM_BIN_SIZE,
  615. kbm->reads->buffer[qidx].bincnt * KBM_BIN_SIZE, kmers, stdout);
  616. }
  617. free_kmeroffv(kmers[0]);
  618. free_kmeroffv(kmers[1]);
  619. } else if(run_mode == 4){
  620. int nc;
  621. while((nc = readtable_filereader(fr)) >= 0){
  622. if(nc < 1) continue;
  623. if(fr->line->string[0] == '#') continue;
  624. qidx = getval_cuhash(kbm->tag2idx, get_col_str(fr, 0));
  625. if(qidx == MAX_U4){
  626. fprintf(out, "#%s NOT FOUND\n", get_col_str(fr, 0));
  627. continue;
  628. }
  629. fprintf(out, ">%s\n", kbm->reads->buffer[qidx].tag);
  630. println_fwdseq_basebank(kbm->rdseqs, kbm->reads->buffer[qidx].seqoff * KBM_BIN_SIZE, kbm->reads->buffer[qidx].bincnt * KBM_BIN_SIZE, out);
  631. fflush(out);
  632. }
  633. }
  634. free_filereader(fr);
  635. free_biosequence(seq);
  636. thread_beg_iter(maln);
  637. thread_wait(maln);
  638. if(maln->rdlen && !maln->interactive){
  639. aux = maln->aux;
  640. for(i=0;i<aux->hits->size;i++){
  641. fprint_hit_kbm(aux, i, out);
  642. }
  643. if(run_mode == 3) fflush(out);
  644. nhit += aux->hits->size;
  645. maln->rdlen = 0;
  646. }
  647. thread_end_iter(maln);
  648. fprintf(KBM_LOGF, "\r%u\t%llu\n", qidx, (u8i)nhit); fflush(KBM_LOGF);
  649. } else {
  650. nhit = 0;
  651. rdflags = init_bitvec(kbm->reads->size);
  652. for(qidx=0;qidx<kbm->reads->size+ncpu;qidx++){
  653. if(qidx < kbm->reads->size){
  654. thread_wait_one(maln);
  655. } else {
  656. thread_wait_next(maln);
  657. }
  658. if(maln->rdlen){
  659. aux = maln->aux;
  660. for(i=0;i<aux->hits->size;i++){
  661. if(par->test_mode == 0 && skip_ctn){
  662. // whether reads[tidx] is contained by reads[qidx]
  663. kbm_map_t *hit;
  664. int margin = KBM_BIN_SIZE;
  665. hit = ref_kbmmapv(aux->hits, i);
  666. if((hit->tb <= margin && hit->te + margin >= (int)kbm->reads->buffer[hit->tidx].bincnt * KBM_BIN_SIZE)
  667. && (hit->qb > margin || hit->qe + margin < (int)kbm->reads->buffer[hit->qidx].bincnt * KBM_BIN_SIZE)){
  668. one_bitvec(rdflags, hit->tidx);
  669. }
  670. }
  671. fprint_hit_kbm(aux, i, out);
  672. }
  673. nhit += aux->hits->size;
  674. }
  675. if(qidx >= kbm->reads->size || get_bitvec(rdflags, qidx)){
  676. maln->rdlen = 0;
  677. continue;
  678. } else if((qidx % 100) == 0){
  679. fprintf(KBM_LOGF, "\r%u\t%llu", qidx, nhit); fflush(KBM_LOGF);
  680. }
  681. maln->qidx = qidx;
  682. maln->rdoff = kbm->reads->buffer[qidx].seqoff * KBM_BIN_SIZE;
  683. maln->rdlen = kbm->reads->buffer[qidx].bincnt * KBM_BIN_SIZE;
  684. thread_wake(maln);
  685. }
  686. fprintf(KBM_LOGF, "\r%u\t%llu\n", qidx, nhit); fflush(KBM_LOGF);
  687. free_bitvec(rdflags);
  688. }
  689. thread_beg_close(maln);
  690. free_kbmaux(maln->aux);
  691. free_string(maln->rdtag);
  692. if(maln->rdseqs && maln->rdseqs != kbm->rdseqs) free_basebank(maln->rdseqs);
  693. if(maln->cc){
  694. free_kbmblock((KBMBlock*)maln->cc->obj);
  695. free_ctgcns(maln->cc);
  696. }
  697. thread_end_close(maln);
  698. if(solids) free_bitvec(solids);
  699. fprintf(KBM_LOGF, "[%s] Done\n", date());
  700. if(outf) fclose(out);
  701. free_cplist(qrys);
  702. free_cplist(refs);
  703. free_kbmpar(par);
  704. if(loadf){
  705. // DON't free kbm, for it is mem_load object
  706. } else {
  707. free_kbm(kbm);
  708. }
  709. END_STAT_PROC_INFO(KBM_LOGF);
  710. return 0;
  711. }
  712. int main(int argc, char **argv){
  713. return kbm_main(argc, argv);
  714. }