best_sam_hits4longreads.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #include "mem_share.h"
  2. #include "filereader.h"
  3. #include "list.h"
  4. #include "hashset.h"
  5. typedef struct {
  6. u4i stroff, strlen;
  7. u4i taglen, flag;
  8. u4i qlen, qb, qe;
  9. u4i refidx, reflen;
  10. } lr_hit_t;
  11. define_list(lrhitv, lr_hit_t);
  12. int select_best_hit(lrhitv *hits, String *lines, u4i minlen, float mincov, FILE *out){
  13. lr_hit_t *h1, *h2;
  14. u4i i, j, pass;
  15. int x, y, ret;
  16. sort_array(hits->buffer, hits->size, lr_hit_t, num_cmpgt(b.qe - b.qb, a.qe - a.qb));
  17. ret = 0;
  18. for(i=0;i<hits->size;i++){
  19. h1 = ref_lrhitv(hits, i);
  20. if(h1->qe - h1->qb < minlen) break;
  21. if(h1->qe - h1->qb < UInt(mincov * h1->qlen)) break;
  22. pass = 1;
  23. for(j=0;j<i;j++){
  24. h2 = ref_lrhitv(hits, j);
  25. x = num_max(h1->qb, h2->qb);
  26. y = num_min(h1->qe, h2->qe);
  27. if(y - x >= (1 - mincov) * (h1->qe - h1->qb)){
  28. pass = 0;
  29. break;
  30. }
  31. }
  32. if(pass){
  33. fprintf(out, "%s\n", lines->string + h1->stroff);
  34. ret ++;
  35. }
  36. }
  37. clear_lrhitv(hits);
  38. clear_string(lines);
  39. return ret;
  40. }
  41. int usage(char *prog){
  42. printf(
  43. "Usage: %s [-h] [-v] [-B:retain secondary aligment] [-l min_map_len:100] [-f min_map_cov:0.70]\n"
  44. , prog
  45. );
  46. return 1;
  47. }
  48. int main(int argc, char **argv){
  49. FileReader *fr;
  50. FILE *out;
  51. cuhash *refs;
  52. cplist *reftags;
  53. u4v *reflens;
  54. String *lines;
  55. lrhitv *hits;
  56. lr_hit_t *hit, HIT;
  57. char *str, *reftag;
  58. u4i minlen, i, reflen;
  59. float mincov;
  60. int c, primary_hit, verbose;
  61. u1i movs[256];
  62. minlen = 100;
  63. mincov = 0.70;
  64. primary_hit = 1;
  65. verbose = 0;
  66. out = stdout;
  67. while((c = getopt(argc, argv, "hvBl:f:")) != -1){
  68. switch(c){
  69. case 'l': minlen = atoi(optarg); break;
  70. case 'f': mincov = atof(optarg); break;
  71. case 'B': primary_hit = 0; break;
  72. case 'v': verbose = 1; break;
  73. default: return usage(argv[0]);
  74. }
  75. }
  76. fr = open_filereader(NULL, 1);
  77. refs = init_cuhash(13);
  78. reftags = init_cplist(8);
  79. reflens = init_u4v(8);
  80. hits = init_lrhitv(4);
  81. lines = init_string(1024);
  82. memset(movs, 0, 256);
  83. movs[(int)'M'] = 0b11;
  84. movs[(int)'I'] = 0b10;
  85. movs[(int)'D'] = 0b01;
  86. movs[(int)'N'] = 0b01;
  87. movs[(int)'S'] = 0b10;
  88. movs[(int)'H'] = 0b10;
  89. movs[(int)'P'] = 0b00;
  90. movs[(int)'='] = 0b11;
  91. movs[(int)'X'] = 0b11;
  92. while((c = readline_filereader(fr))){
  93. if(fr->line->string[0] == '@'){
  94. fprintf(out, "%s\n", fr->line->string);
  95. if(fr->line->string[1] == 'S' && fr->line->string[2] == 'Q'){
  96. if((c = split_line_filereader(fr, '\t')) > 2){
  97. reftag = NULL;
  98. reflen = 0;
  99. for(i=1;i<3;i++){
  100. if(get_col_len(fr, i) <= 3){
  101. continue;
  102. }
  103. str = get_col_str(fr, i);
  104. if(str[0] == 'S' && str[1] == 'N' && str[2] == ':'){
  105. reftag = strdup(str + 3);
  106. } else if(str[0] == 'L' && str[1] == 'N' && str[2] == ':'){
  107. reflen = atol(str + 3);
  108. }
  109. }
  110. if(strlen(reftag) && reflen){
  111. push_cplist(reftags, reftag);
  112. push_u4v(reflens, reflen);
  113. put_cuhash(refs, (cuhash_t){reftag, reftags->size - 1});
  114. }
  115. }
  116. }
  117. } else {
  118. hit = &HIT;
  119. str = index(fr->line->string, '\t');
  120. if(str == NULL){
  121. fprintf(stderr, "[WARNNING:too_few_column] %s\n", fr->line->string);
  122. continue;
  123. }
  124. hit->taglen = str - fr->line->string;
  125. if(hits->size && (hits->buffer[0].taglen != hit->taglen || strncmp(lines->string + hits->buffer[0].stroff, fr->line->string, hit->taglen))){
  126. select_best_hit(hits, lines, minlen, mincov, out);
  127. }
  128. hit->stroff = lines->size;
  129. hit->strlen = fr->line->size;
  130. append_string(lines, fr->line->string, fr->line->size);
  131. add_char_string(lines, '\0');
  132. if((c = split_line_filereader(fr, '\t')) < 11){
  133. fprintf(stderr, "[WARNNING:too_few_columns] %s\n", lines->string + hit->stroff);
  134. continue;
  135. }
  136. hit->taglen = get_col_len(fr, 0);
  137. hit->flag = atol(get_col_str(fr, 1));
  138. if(primary_hit && (hit->flag & 0x900)){
  139. continue;
  140. }
  141. if(get_col_str(fr, 2)[0] == '*'){
  142. continue;
  143. }
  144. hit->refidx = getval_cuhash(refs, get_col_str(fr, 2));
  145. if(hit->refidx == MAX_U4){
  146. fprintf(stderr, "[WARNNING:unknown_refname] %s\n", lines->string + hit->stroff);
  147. continue;
  148. }
  149. hit->reflen = reflens->buffer[hit->refidx];
  150. hit->qlen = 0;
  151. u4i tb, te, qb, qe, len, cnt, tmp;
  152. u4i ln;
  153. char op;
  154. tb = atoi(get_col_str(fr, 3));
  155. te = tb;
  156. qb = qe = 0;
  157. len = 0;
  158. tmp = cnt = 0;
  159. str = get_col_str(fr, 5); // CIGAR
  160. ln = 0;
  161. op = 0;
  162. while(str[0]){
  163. if(str[0] >= '0' && str[0] <= '9'){
  164. ln = ln * 10 + str[0] - '0';
  165. } else {
  166. op = movs[(int)str[0]];
  167. if(op & 0b01){
  168. te += ln;
  169. qe += tmp;
  170. tmp = 0;
  171. if(cnt == 0){
  172. qb = qe;
  173. cnt = 1;
  174. }
  175. if(op & 0b10){
  176. qe += ln;
  177. len += ln;
  178. }
  179. } else if(op & 0b10){
  180. tmp += ln;
  181. len += ln;
  182. }
  183. ln = 0;
  184. }
  185. str ++;
  186. }
  187. if(hit->flag & 0x10){
  188. tmp = len - qb;
  189. qb = len - qe;
  190. qe = tmp;
  191. }
  192. hit->qlen = len;
  193. hit->qb = qb;
  194. hit->qe = qe;
  195. if(verbose){
  196. fprintf(out, "#%s\t%u\t+\t%u\t%u\t%s\t%c\t%u\t%u\n", get_col_str(fr, 0), len, qb, qe, get_col_str(fr, 2), "+-"[(hit->flag & 0x10) >> 4], tb, te);
  197. }
  198. push_lrhitv(hits, HIT);
  199. }
  200. }
  201. select_best_hit(hits, lines, minlen, mincov, out);
  202. free_string(lines);
  203. free_lrhitv(hits);
  204. free_cuhash(refs);
  205. for(i=0;i<reftags->size;i++){
  206. free(reftags->buffer[i]);
  207. }
  208. free_cplist(reftags);
  209. free_u4v(reflens);
  210. close_filereader(fr);
  211. return 0;
  212. }