best_sam_hits4longreads.pl 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/perl -w
  2. #
  3. # Author: Jue Ruan
  4. #
  5. use Getopt::Std;
  6. use strict;
  7. our ($opt_h, $opt_l, $opt_f);
  8. getopts("hl:f:");
  9. &usage if($opt_h);
  10. my $map_len = $opt_l || 100;
  11. my $map_cov = $opt_f || 0.75;
  12. my %refs = ();
  13. my %cigar_hash = (M=>[1,1], I=>[1,0], D=>[0,1], N=>[0,1], S=>[1,0], H=>[1,0], P=>[0,0], '='=>[1,1], X=>[1,1]);
  14. my @hits = ();
  15. while(<>){
  16. if(/^@/){
  17. print;
  18. if(/^\@SQ\sSN:(\S+)\sLN:(\d+)/){
  19. $refs{$1} = $2;
  20. }
  21. next;
  22. }
  23. my $str = $_;
  24. my @ts = split;
  25. my $tag = $ts[0];
  26. my $flg = $ts[1];
  27. my $ref = $ts[2];
  28. next if($ref eq '*');
  29. my $rlen = $refs{$ref};
  30. if(not defined $rlen){
  31. print;
  32. die("Cannot find '$ref' in SAM Header");
  33. }
  34. my $tb = $ts[3];
  35. my $te = $tb;
  36. my $qb = 0;
  37. my $qe = 0;
  38. my $len = 0;
  39. my $tmp = 0;
  40. my $cnt = 0;
  41. my $cgr = $ts[5];
  42. while($cgr=~/(\d+)(\D)/g){
  43. my $mov = $cigar_hash{$2};
  44. if($mov->[1]){
  45. $te += $1;
  46. $qe += $tmp;
  47. $tmp = 0;
  48. if($cnt == 0){
  49. $qb = $qe;
  50. $cnt = 1;
  51. }
  52. if($mov->[0]){
  53. $qe += $1;
  54. $len += $1;
  55. }
  56. } elsif($mov->[0]){
  57. $tmp += $1;
  58. $len += $1;
  59. }
  60. }
  61. if($flg & 0x10){
  62. $tmp = $len - $qb;
  63. $qb = $len - $qe;
  64. $qe = $tmp;
  65. }
  66. next unless($qe - $qb >= $map_len and $te - $tb >= $map_len);
  67. next unless($qe - $qb >= $map_cov * $len or $te - $tb >= $map_cov * $rlen);
  68. my $hit = [$tag, $len, $qb, $qe, $ref, $rlen, $tb, $te, $str];
  69. &select_best_hit($hit);
  70. }
  71. &select_best_hit(undef);
  72. 1;
  73. sub usage {
  74. print STDERR qq{Usage: $0 [-l 100:map_len] [-f 0.75:map_cov] <sam_file_in_query_order_with_header>\n};
  75. exit 1;
  76. }
  77. sub select_best_hit {
  78. my $hit = shift;
  79. if(@hits == 0){
  80. push(@hits, $hit) if($hit);
  81. } elsif(not defined $hit or $hit->[0] ne $hits[-1][0]){
  82. @hits = sort {$b->[3] - $b->[2] <=> $a->[3] - $a->[2]} @hits;
  83. for(my $i=0;$i<@hits;$i++){
  84. my $pass = 1;
  85. for(my $j=0;$j<$i;$j++){
  86. my $x = $hits[$i][2] < $hits[$j][2]? $hits[$j][2] : $hits[$i][2];
  87. my $y = $hits[$i][3] > $hits[$j][3]? $hits[$j][3] : $hits[$i][3];
  88. if($y - $x >= (1 - $map_cov) * ($hits[$i][3] - $hits[$i][2])){
  89. $pass = 0;
  90. last;
  91. }
  92. }
  93. print $hits[$i][8] if($pass);
  94. }
  95. @hits = ();
  96. push(@hits, $hit) if($hit);
  97. } else {
  98. push(@hits, $hit);
  99. }
  100. }