wtpoa-cns.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. /*
  2. *
  3. * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
  4. *
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include "wtpoa.h"
  20. #ifndef VERSION
  21. #define VERSION 0.0
  22. #endif
  23. #ifndef RELEASE
  24. #define RELEASE 19830203
  25. #endif
  26. int usage(){
  27. printf(
  28. "WTPOA-CNS: Consensuser for wtdbg using PO-MSA\n"
  29. "Author: Jue Ruan <ruanjue@gmail.com>\n"
  30. "Version: %s (%s)\n"
  31. "Usage: wtpoa-cns [options]\n"
  32. "Options:\n"
  33. " -t <int> Number of threads, [4]\n"
  34. " -d <string> Reference sequences for SAM input, will invoke sorted-SAM input mode\n"
  35. " -u <int> XORed flags to handle SAM input. [0]\n"
  36. " 0x1: Only process reference regions present in/between SAM alignments\n"
  37. " 0x2: Don't fileter secondary/supplementary SAM records with flag (0x100 | 0x800)\n"
  38. " -r Force to use reference mode\n"
  39. " -p <string> Similar with -d, but translate SAM into wtdbg layout file\n"
  40. " -i <string> Input file(s) *.ctg.lay from wtdbg, +, [STDIN]\n"
  41. " Or sorted SAM files when having -d/-p\n"
  42. " -o <string> Output files, [STDOUT]\n"
  43. " -f Force overwrite\n"
  44. " -j <int> Expected max length of node, or say the overlap length of two adjacent units in layout file, [1500] bp\n"
  45. " overlap will be default to 1500(or 150 for sam-sr) when having -d/-p, block size will be 2.5 * overlap\n"
  46. " -b <int> Bonus for tri-bases match, [0]\n"
  47. " -M <int> Match score, [2]\n"
  48. " -X <int> Mismatch score, [-5]\n"
  49. " -I <int> Insertion score, [-2]\n"
  50. " -D <int> Deletion score, [-4]\n"
  51. " -H <float> Homopolymer merge score used in dp-call-cns mode, [-3]\n"
  52. " -B <expr> Bandwidth in POA, [Wmin[,Wmax[,mat_rate]]], mat_rate = matched_bases/total_bases [64,1024,0.92]\n"
  53. " Program will double bandwidth from Wmin to Wmax when mat_rate is lower than setting\n"
  54. " -W <int> Window size in the middle of the first read for fast align remaining reads, [200]\n"
  55. " If $W is negative, will disable fast align, but use the abs($W) as Band align score cutoff\n"
  56. " -w <int> Min size of aligned size in window, [$W * 0.5]\n"
  57. " -A Abort TriPOA when any read cannot be fast aligned, then try POA\n"
  58. " -S <int> Shuffle mode, 0: don't shuffle reads, 1: by shared kmers, 2: subsampling. [1]\n"
  59. " -R <int> Realignment bandwidth, 0: disable, [16]\n"
  60. " -c <int> Consensus mode: 0, run-length; 1, dp-call-cns, [0]\n"
  61. " -C <int> Min count of bases to call a consensus base, [3]\n"
  62. " -F <float> Min frequency of non-gap bases to call a consensus base, [0.5]\n"
  63. " -N <int> Max number of reads in PO-MSA [20]\n"
  64. " Keep in mind that I am not going to generate high accurate consensus sequences here\n"
  65. " -x <string> Presets, []\n"
  66. " sam-sr: polishs contigs from short reads mapping, accepts sorted SAM files\n"
  67. " shorted for '-j 50 -W 0 -R 0 -b 1 -c 1 -N 50 -rS 2'\n"
  68. " -q Quiet\n"
  69. " -v Verbose\n"
  70. " -V Print version information and then exit\n"
  71. "\n", TOSTR(VERSION), TOSTR(RELEASE));
  72. return 1;
  73. }
  74. int main(int argc, char **argv){
  75. POGPar par;
  76. CTGCNS *cc;
  77. ctg_cns_t *ctg;
  78. SeqBank *refs;
  79. FileReader *fr, *db;
  80. cplist *infs, *dbfs;
  81. FILE *out;
  82. char *outf;
  83. u4i i;
  84. int reglen, shuffle, winlen, winmin, fail_skip;
  85. int seqmax, wsize, print_lay, flags;
  86. int c, ncpu, overwrite, quiet;
  87. par = DEFAULT_POG_PAR;
  88. ncpu = 4;
  89. quiet = 0;
  90. shuffle = 1;
  91. seqmax = 20;
  92. winlen = 200;
  93. winmin = 0;
  94. fail_skip = 1;
  95. reglen = 0;
  96. infs = init_cplist(4);
  97. dbfs = init_cplist(4);
  98. outf = NULL;
  99. overwrite = 0;
  100. print_lay = 0;
  101. flags = 0;
  102. while((c = getopt(argc, argv, "hqvVt:d:rp:u:i:o:fj:S:B:W:w:Ab:M:X:I:D:E:H:R:c:C:F:N:x:")) != -1){
  103. switch(c){
  104. case 'h': return usage();
  105. case 't': ncpu = atoi(optarg); break;
  106. case 'p': print_lay = 1;
  107. case 'd': push_cplist(dbfs, optarg); break;
  108. case 'u': flags = atoi(optarg); break;
  109. case 'r': par.refmode = 1; break;
  110. case 'i': push_cplist(infs, optarg); break;
  111. case 'o': outf = optarg; break;
  112. case 'f': overwrite = 1; break;
  113. case 'j': reglen = atoi(optarg); break;
  114. case 'S': shuffle = atoi(optarg); break;
  115. case 'B':
  116. {
  117. char *ptr;
  118. par.W = strtol(optarg, &ptr, 10);
  119. if(ptr && ptr[0] == ','){
  120. par.Wmax = strtol(ptr + 1, &ptr, 10);
  121. if(ptr && ptr[0] == ','){
  122. par.W_mat_rate = atof(ptr + 1);
  123. }
  124. }
  125. }
  126. break;
  127. case 'W': winlen = atoi(optarg); break;
  128. case 'w': winmin = atoi(optarg); break;
  129. case 'A': fail_skip = 0; break;
  130. case 'b': par.tribase = atoi(optarg); break;
  131. case 'M': par.M = atoi(optarg); break;
  132. case 'X': par.X = atoi(optarg); break;
  133. case 'I': par.I = atoi(optarg); break;
  134. case 'D': par.D = atoi(optarg); break;
  135. case 'E': par.E = atoi(optarg); break;
  136. case 'H': par.H = atof(optarg); break;
  137. case 'R': par.rW = atoi(optarg); break;
  138. case 'c': par.cnsmode = atoi(optarg); break;
  139. case 'C': par.msa_min_cnt = atoi(optarg); break;
  140. case 'F': par.msa_min_freq = atof(optarg); break;
  141. case 'N': seqmax = atoi(optarg); break;
  142. case 'x':
  143. if(strcasecmp(optarg, "sam-sr") == 0){
  144. winlen = 0;
  145. reglen = 150;
  146. par.rW = 0;
  147. par.tribase = 1;
  148. par.cnsmode = 1;
  149. seqmax = 50;
  150. par.refmode = 1;
  151. shuffle = 2;
  152. } else {
  153. fprintf(stderr, "Unknown preset[%s]\n", optarg);
  154. return 1;
  155. }
  156. break;
  157. case 'q': quiet = 1; break;
  158. case 'v': cns_debug ++; break;
  159. case 'V': fprintf(stdout, "wtpoa-cns %s\n", TOSTR(VERSION)); return 0;
  160. default: return usage();
  161. }
  162. }
  163. if(quiet){
  164. int devnull;
  165. devnull = open("/dev/null", O_WRONLY);
  166. dup2(devnull, STDERR_FILENO);
  167. }
  168. BEG_STAT_PROC_INFO(stderr, argc, argv);
  169. //SET_PROC_LIMIT(10 * 1024 * 1024 * 1024ULL, 0); // TODO: remove it after debug
  170. if(reglen == 0){
  171. reglen = 1500;
  172. }
  173. wsize = 2.5 * reglen;
  174. if(winlen < 0){
  175. par.W_score = - winlen;
  176. winlen = 0;
  177. }
  178. if(winmin <= 0){
  179. winmin = winlen * 0.5;
  180. }
  181. if(seqmax <= 0 || seqmax >= POG_RDCNT_MAX){
  182. seqmax = POG_RDCNT_MAX;
  183. }
  184. if(ncpu <= 0 && _sig_proc_deamon) ncpu = _sig_proc_deamon->ncpu;
  185. if(ncpu <= 0){
  186. fprintf(stderr, " -- Invalid cpu number '%d' in %s -- %s:%d --\n", ncpu, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  187. return 1;
  188. }
  189. if(outf && !overwrite && file_exists(outf)){
  190. fprintf(stderr, "File exists! '%s'\n\n", outf);
  191. return usage();
  192. }
  193. if(infs->size) fr = open_all_filereader(infs->size, infs->buffer, 1);
  194. else fr = open_filereader(NULL, 1);
  195. if(outf){
  196. out = open_file_for_write(outf, NULL, 1);
  197. } else out = stdout;
  198. if(shuffle == 2){
  199. srand48(13);
  200. }
  201. if(dbfs->size == 0){
  202. WTLAYBlock *wb;
  203. wb = init_wtlayblock(fr);
  204. cc = init_ctgcns(wb, iter_wtlayblock, info_wtlayblock, ncpu, shuffle, seqmax, winlen, winmin, fail_skip, reglen, &par);
  205. cc->print_progress = 100;
  206. if(print_lay){
  207. print_lays_ctgcns(cc, out);
  208. } else {
  209. while((ctg = iter_ctgcns(cc))){
  210. fprintf(out, ">%s len=%d\n", ctg->tag->string, (u4i)ctg->cns->size);
  211. for(i=0;i<ctg->cns->size;i+=100){
  212. if(i + 100 <= ctg->cns->size){
  213. println_seq_basebank(ctg->cns, i, 100, out);
  214. } else {
  215. println_seq_basebank(ctg->cns, i, ctg->cns->size - i, out);
  216. }
  217. }
  218. fflush(out);
  219. if(cc->cycs->size){ // keep only one for reuse
  220. free_ctg(ctg);
  221. } else {
  222. repay_ctgcns(cc, ctg);
  223. }
  224. }
  225. }
  226. free_ctgcns(cc);
  227. free_wtlayblock(wb);
  228. } else {
  229. SAMBlock *sb;
  230. BioSequence *seq;
  231. if(2 * reglen >= wsize){
  232. fprintf(stderr, " -- SAM Input mode: -w wsize(%d), -j reglen(%d), MUST has 2 * reglen >= wsize in %s -- %s:%d --\n", wsize, reglen, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
  233. return 1;
  234. }
  235. refs = init_seqbank();
  236. db = open_all_filereader(dbfs->size, dbfs->buffer, 1);
  237. seq = init_biosequence();
  238. while(readseq_filereader(db, seq)){
  239. push_seqbank(refs, seq->tag->string, seq->tag->size, seq->seq->string, seq->seq->size);
  240. }
  241. free_biosequence(seq);
  242. close_filereader(db);
  243. sb = init_samblock(refs, fr, wsize, reglen * 2 / 3, flags);
  244. cc = init_ctgcns(sb, iter_samblock, info_samblock, ncpu, shuffle, seqmax, winlen, winmin, fail_skip, reglen, &par);
  245. cc->print_progress = 100;
  246. if(print_lay){
  247. print_lays_ctgcns(cc, out);
  248. } else {
  249. while((ctg = iter_ctgcns(cc))){
  250. fprintf(out, ">%s len=%d\n", ctg->tag->string, (u4i)ctg->cns->size);
  251. for(i=0;i<ctg->cns->size;i+=100){
  252. if(i + 100 <= ctg->cns->size){
  253. println_seq_basebank(ctg->cns, i, 100, out);
  254. } else {
  255. println_seq_basebank(ctg->cns, i, ctg->cns->size - i, out);
  256. }
  257. }
  258. fflush(out);
  259. if(cc->cycs->size){ // keep only one for reuse
  260. free_ctg(ctg);
  261. } else {
  262. repay_ctgcns(cc, ctg);
  263. }
  264. }
  265. }
  266. free_ctgcns(cc);
  267. free_samblock(sb);
  268. free_seqbank(refs);
  269. }
  270. close_filereader(fr);
  271. if(outf) fclose(out);
  272. free_cplist(infs);
  273. free_cplist(dbfs);
  274. END_STAT_PROC_INFO(stderr);
  275. return 0;
  276. }