sam2dbgcns.pl 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #!/usr/bin/perl -w
  2. #
  3. # Author: Jue Ruan <ruanjue@gmail.com>
  4. #
  5. use strict;
  6. use Getopt::Std;
  7. our ($opt_h, $opt_w, $opt_s, $opt_d, $opt_q, $opt_t);
  8. getopts("hw:s:dq:t");
  9. &usage if($opt_h);
  10. $opt_w = 2000 unless($opt_w);
  11. $opt_s = 1000 unless($opt_s);
  12. $opt_d = 0 unless($opt_d);
  13. $opt_q = 0 unless($opt_q);
  14. $opt_t = 0 unless($opt_t);
  15. &usage if($opt_w <= $opt_s);
  16. my $ctgf = shift or &usage;
  17. my %refs = ();
  18. my $ctag = "";
  19. my $cdes = "";
  20. my $cseq = "";
  21. open(IN, "<", $ctgf) or die;
  22. while(<IN>){
  23. chomp;
  24. if(/^>(\S+)/){
  25. $refs{$ctag} = [$cseq, $cdes] if(length $cseq);
  26. $ctag = $1;
  27. $cdes = substr($_, 1 + length($1));
  28. $cseq = '';
  29. } else {
  30. $cseq .= $_;
  31. }
  32. }
  33. $refs{$ctag} = [$cseq, $cdes] if(length $cseq);
  34. close IN;
  35. $ctag = "";
  36. my $coff = 0;
  37. my $wlen = $opt_w;
  38. my $slen = $opt_s;
  39. my @rseqs = ();
  40. my $ref = undef;
  41. my $rsize = 0;
  42. my $nid = 0;
  43. my $line = 0;
  44. while(<>){
  45. $line ++;
  46. next if(/^@/);
  47. my @ts= split;
  48. next if($ts[2] eq '*');
  49. next if($opt_q and $ts[4] < $opt_q);
  50. my $rtag = $ts[0];
  51. my $rdir = (($ts[1] >> 4) & 0x01)? "-" : "+";
  52. my $gtag = $ts[2];
  53. my $gpos = $ts[3];
  54. my $rseq = $ts[9];
  55. if($opt_t){
  56. my $l = 0;
  57. my $r = 0;
  58. if($ts[5]=~/^((\d+)S)/){
  59. $l = $2;
  60. }
  61. if($ts[5]=~/((\d+)S)$/){
  62. $r = $2;
  63. }
  64. $rseq = substr($rseq, $l, length($rseq) - $r - $l) if($l or $r);
  65. }
  66. if($gtag ne $ctag){
  67. while(length $ctag and $coff < $rsize){
  68. &print_frag;
  69. }
  70. $ctag = $gtag;
  71. #print STDERR "$ctag begin at SAM line $line\n";
  72. $ref = $refs{$ctag};
  73. $rsize = length $ref->[0];
  74. $coff = 0;
  75. @rseqs = ();
  76. &print_header;
  77. } else {
  78. while($coff + $wlen < $gpos){
  79. &print_frag;
  80. }
  81. }
  82. my $dup = 0;
  83. if($opt_d){
  84. for(my $i=@rseqs-1;$i>=0;$i--){
  85. last if($rseqs[$i][0] != $gpos);
  86. if($rseqs[$i][3] eq $rseq){
  87. $dup = 1;
  88. last;
  89. }
  90. }
  91. }
  92. push(@rseqs, [$gpos, $rtag, $rdir, $rseq]) unless($dup);
  93. }
  94. while(length $ctag and $coff < $rsize){
  95. &print_frag;
  96. }
  97. 1;
  98. sub print_header {
  99. return unless(length $ctag and exists $refs{$ctag});
  100. print ">$ctag$refs{$ctag}->[1]\n";
  101. }
  102. sub print_frag {
  103. my $cend = $coff + $wlen;
  104. my $cnxt = $coff + $slen;
  105. $cend = $rsize if($cend > $rsize);
  106. $cnxt = $rsize if($cnxt > $rsize);
  107. print "E\t$coff\tN$nid\t+\t";
  108. $nid ++;
  109. print "N$nid\t+\n";
  110. $nid ++;
  111. print "S\t$ctag\_F_$coff\_", ($cend - $coff), "\t+\t$coff\t", ($cend - $coff),"\t", substr($ref->[0], $coff, $cend - $coff), "\n";
  112. my @nseqs = ();
  113. foreach my $r (@rseqs){
  114. next if($r->[0] + length($r->[3]) < $coff);
  115. if($r->[0] < $cend){
  116. print "S\t$r->[1]\t$r->[2]\t0\t", length($r->[3]), "\t$r->[3]\n";
  117. if($r->[0] >= $cnxt){
  118. push(@nseqs, $r);
  119. }
  120. }
  121. }
  122. @rseqs = @nseqs;
  123. $coff = $cnxt;
  124. }
  125. sub usage {
  126. die(qq{Usage: $0 [options] <ctg_fa_file> [srt_sam_file]
  127. Option:
  128. -w <int> window size, [2000]
  129. -s <int> silding size, [1000]
  130. -d remove duplication
  131. -q <int> filter by mapQ, [0]
  132. -t <int> trim tailing clip
  133. });
  134. }