create_profile_from_hhm.pl 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #!/usr/bin/env perl
  2. #
  3. # create_profile_from_hhm.pl
  4. # Create a profile (.prf) from a given HHM file
  5. # HHsuite version 3.0.0 (15-03-2015)
  6. #
  7. # Reference:
  8. # Remmert M., Biegert A., Hauser A., and Soding J.
  9. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  10. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  11. # (C) Michael Remmert and Johannes Soeding, 2012
  12. # This program is free software: you can redistribute it and/or modify
  13. # it under the terms of the GNU General Public License as published by
  14. # the Free Software Foundation, either version 3 of the License, or
  15. # (at your option) any later version.
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. # You should have received a copy of the GNU General Public License
  21. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. # We are very grateful for bug reports! Please contact us at [email protected]
  23. use lib $ENV{"HHLIB"}."/scripts";
  24. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  25. use strict;
  26. $|= 1; # Activate autoflushing on STDOUT
  27. # Default values:
  28. our $v=2; # verbose mode
  29. my $help="
  30. create_profile_from_hhm.pl from HHsuite $VERSION
  31. Create a profile (.prf) from a given HHM file
  32. Usage: perl create_profile_from_hhm.pl -i <infile> [-o <outfile>]
  33. Options:
  34. -i <infile> Input file in HHM format
  35. -o <outfile> Output file in prf-format (default: infile.prf)
  36. -v [0-5] verbose mode (default: $v)
  37. \n";
  38. # Variable declarations
  39. my $line;
  40. my $infile;
  41. my $outfile;
  42. my $i;
  43. my $a;
  44. my @counts; # count profile
  45. my @neffs;
  46. my $name; # name of HHM profile
  47. my $len; # length of HHM profile
  48. # A C D E F G H I K L M N P Q R S T V W Y
  49. my @hhmaa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
  50. my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
  51. my %aa2i;
  52. for ($a = 0; $a < 20; $a++) {
  53. $aa2i{$aminoacids[$a]} = $a;
  54. }
  55. ###############################################################################################
  56. # Processing command line input
  57. ###############################################################################################
  58. if (@ARGV<1) {die ($help);}
  59. my $options="";
  60. for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
  61. if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
  62. if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
  63. if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
  64. if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
  65. if (!$outfile) {
  66. $infile =~ /^(\S+)\.\S+?$/;
  67. $outfile = "$1.prf";
  68. }
  69. ##############################################################################################
  70. # Main part
  71. ##############################################################################################
  72. ######################################
  73. # Read HHM profile
  74. ######################################
  75. open (IN, $infile);
  76. while ($line = <IN>) {
  77. if ($line =~ /^NAME\s+(\S+)/) {
  78. $name = $1;
  79. } elsif ($line =~ /^LENG\s+(\d+)/) {
  80. $len = $1;
  81. } elsif ($line =~ /^HMM/) {
  82. last;
  83. }
  84. }
  85. $i = 0;
  86. while ($line = <IN>) {
  87. if ($line =~ /^\/\//) { last; }
  88. if ($line =~ s/^\S \d+ //) {
  89. for ($a = 0; $a < 20; $a++) {
  90. $line =~ s/^\s*(\S+)\s/ /;
  91. $counts[$i][$hhmaa2csaa[$a]] = $1;
  92. if ($counts[$i][$hhmaa2csaa[$a]] !~ /\*/ && $counts[$i][$hhmaa2csaa[$a]] == 0) {
  93. $counts[$i][$hhmaa2csaa[$a]] = 1;
  94. }
  95. }
  96. $line = <IN>;
  97. $line =~ /^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+/;
  98. $neffs[$i] = $1;
  99. $i++;
  100. }
  101. }
  102. ######################################
  103. # write count_profile
  104. ######################################
  105. open (OUT, ">$outfile");
  106. # Write header
  107. printf(OUT "CountProfile\n");
  108. printf(OUT "NAME\t%s\n", $name);
  109. printf(OUT "LENG\t%i\n", $len);
  110. printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet
  111. printf(OUT "COUNTS");
  112. for ($a = 0; $a < 20; $a++) {
  113. printf(OUT "\t%s", $aminoacids[$a]);
  114. }
  115. printf(OUT "\tNEFF\n");
  116. # Write profile
  117. for ($i = 0; $i < $len; $i++) {
  118. printf(OUT "%i", $i+1);
  119. for ($a = 0; $a < 20; $a++) {
  120. if ($counts[$i][$a] == '*') {
  121. printf(OUT "\t*");
  122. } else {
  123. printf(OUT "\t%i", $counts[$i][$a]);
  124. }
  125. }
  126. printf(OUT "\t%i\n", $neffs[$i]);
  127. }
  128. printf(OUT "//\n");
  129. close OUT;
  130. exit;