dbm_read_fa.pl 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. #!/usr/bin/perl -w
  2. #
  3. #Author: Ruan Jue
  4. #
  5. use strict;
  6. use DB_File;
  7. my $dbf = shift or die("Usage: $0 <dbm_file> [read_name1 ...]\n");
  8. my $faf = undef;
  9. if($dbf!~/\.dbm$/){
  10. $dbf .= ".dbm" if(-e "$dbf.dbm");
  11. }
  12. my @names = @ARGV;
  13. my $list_all = 0;
  14. if(@names == 1 and $names[0] eq '#LIST'){
  15. $list_all = 1;
  16. @names = ();
  17. } elsif(@names == 0){
  18. while(<>){
  19. chomp;
  20. push(@names, $_);
  21. }
  22. }
  23. my @tags = ();
  24. foreach my $tag (@names){
  25. if($tag=~/^(.+?)(\[([+-])(:(-?\d+),(-?\d+))?\])$/){
  26. push(@tags, [$1, $3 eq '+'? 1:2, (defined $5)? $5:1, (defined $6)? $6:-1, 1]);
  27. } elsif($tag=~/^(.+?)_([FR])(_(\d+)(_(\d+))?)?$/){
  28. push(@tags, [$1, $2 eq 'F'? 1:2, (defined $4)? $4:1, (defined $6)? $4 + $6 - 1 : 0, 1]);
  29. } else {
  30. push(@tags, [$tag, 1, 1, -1, 0]);
  31. }
  32. }
  33. foreach my $tag (@tags){
  34. $tag->[2] = 1 if($tag->[2] < 1);
  35. }
  36. my %seqs;
  37. tie %seqs, 'DB_File', $dbf, O_RDONLY or die "Cannot open $dbf: $!";
  38. if($dbf=~/^(.+?)\.dbm$/){
  39. $faf = $1;
  40. }
  41. my $fa_file = undef;
  42. foreach my $tag (@tags){
  43. if(exists $seqs{$tag->[0]}){
  44. #my $seq = $seqs{$tag->[0]};
  45. my $seq = &read_fasta($tag->[0], $tag->[4]);
  46. if($tag->[4]){
  47. $tag->[3] = length($seq) if($tag->[3] <= $tag->[2]);
  48. if($tag->[4]){
  49. print ">", join("_", $tag->[0], $tag->[1] == 1? "F":"R", $tag->[2], $tag->[3] + 1 - $tag->[2]), "\n";
  50. } else {
  51. print ">$tag->[0]\n";
  52. }
  53. if($tag->[2] < $tag->[3]){
  54. my $ss = substr($seq, $tag->[2] - 1, $tag->[3] - $tag->[2] + 1);
  55. if($tag->[1] == 2){
  56. $ss =~tr/ACGTacgt/TGCAtgca/;
  57. $ss = reverse $ss;
  58. }
  59. while($ss=~/(.{1,100})/g){ print "$1\n"; }
  60. }
  61. } else {
  62. print $seq;
  63. }
  64. } else {
  65. warn("'$tag->[0]' was not found\n");
  66. }
  67. }
  68. if($list_all){
  69. &list_all_seqs;
  70. }
  71. untie %seqs;
  72. if($fa_file){
  73. close $fa_file;
  74. }
  75. 1;
  76. sub read_fasta {
  77. my $tag = shift;
  78. my $tidy = shift || 0;
  79. my $obj = $seqs{$tag};
  80. if($obj!~/^[0-9]/){
  81. return $obj;
  82. }
  83. my $off = $obj;
  84. if(not defined $fa_file){
  85. if(not defined $faf){
  86. die("Cannot find fasta file");
  87. } else {
  88. open($fa_file, "<", $faf) or die $!;
  89. }
  90. }
  91. seek($fa_file, $off, 0);
  92. my $nam = '';
  93. my $seq = '';
  94. while(<$fa_file>){
  95. if(/^>(\S+)/){
  96. last if(length $nam);
  97. $nam = $1;
  98. if(!$tidy){
  99. $seq .= $_;
  100. }
  101. } else {
  102. if($tidy){
  103. chomp;
  104. }
  105. $seq .= $_;
  106. }
  107. }
  108. if($nam ne $tag){
  109. die("Broken dbm index, \"$nam\" ne \"$tag\", offset = $off");
  110. }
  111. return $seq;
  112. }
  113. sub list_all_seqs {
  114. while(my ($tag, $seq) = each %seqs){
  115. print "$tag\t$seq\n";
  116. }
  117. }