create_profile_from_hmmer.pl 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. #!/usr/bin/perl
  2. #
  3. # create_profile_from_hmmer.pl
  4. # Create a profile (.prf) from a given HMMER/HMMER3 file
  5. # HHsuite version 3.0.0 (15-03-2015)
  6. #
  7. # Reference:
  8. # Remmert M., Biegert A., Hauser A., and Soding J.
  9. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  10. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  11. # (C) Michael Remmert and Johannes Soeding, 2012
  12. # This program is free software: you can redistribute it and/or modify
  13. # it under the terms of the GNU General Public License as published by
  14. # the Free Software Foundation, either version 3 of the License, or
  15. # (at your option) any later version.
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. # You should have received a copy of the GNU General Public License
  21. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. # We are very grateful for bug reports! Please contact us at [email protected]
  23. use lib $ENV{"HHLIB"}."/scripts";
  24. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  25. use strict;
  26. $|= 1; # Activate autoflushing on STDOUT
  27. # Default values:
  28. our $v=2; # verbose mode
  29. my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER))
  30. my $scale = 1000;
  31. my $log2 = log(2);
  32. my $help="
  33. create_profile_from_hmmer.pl from HHsuite $VERSION
  34. Create a profile (.prf) from a given HMMER/HMMER3 file
  35. Usage: perl create_profile_from_hmmer.pl -i <infile> [-o <outfile>]
  36. Options:
  37. -i <infile> Input file in HMMER/HMMER3 format
  38. -o <outfile> Output file in prf-format (default: infile.prf)
  39. -v [0-5] verbose mode (default: $v)
  40. \n";
  41. # Variable declarations
  42. my $line;
  43. my $infile;
  44. my $outfile;
  45. my $i;
  46. my $a;
  47. my @counts; # count profile (normalised to 1)
  48. my $name; # name of HMMER profile
  49. my $len; # length of HMMER profile
  50. my @hmmer_prof; # HMMER profile
  51. # A C D E F G H I K L M N P Q R S T V W Y
  52. my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
  53. my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
  54. my %aa2i;
  55. for ($a = 0; $a < 20; $a++) {
  56. $aa2i{$aminoacids[$a]} = $a;
  57. }
  58. ###############################################################################################
  59. # Processing command line input
  60. ###############################################################################################
  61. if (@ARGV<1) {die ($help);}
  62. my $options="";
  63. for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
  64. if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
  65. if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
  66. if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;}
  67. if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
  68. if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
  69. if (!$outfile) {
  70. $infile =~ /^(\S+)\.\S+?$/;
  71. $outfile = "$1.prf";
  72. }
  73. ##############################################################################################
  74. # Main part
  75. ##############################################################################################
  76. ######################################
  77. # Read HMMER sequence and profile
  78. ######################################
  79. open (IN, $infile);
  80. $line = <IN>;
  81. if ($line =~ /^HMMER3/) {
  82. while ($line = <IN>) {
  83. if ($line =~ /^NAME\s+(\S+)/) {
  84. $name = $1;
  85. } elsif ($line =~ /^LENG\s+(\d+)/) {
  86. $len = $1;
  87. } elsif ($line =~ /^HMM/) {
  88. last;
  89. }
  90. }
  91. $line = <IN>; $line = <IN>; # Skip header lines
  92. if ($line =~ /^\s*COMPO/) {
  93. $line = <IN>; $line = <IN>; # Skip null model lines
  94. }
  95. # Read profiles and query seq
  96. $i = 0;
  97. while ($line = <IN>) {
  98. if ($line =~ /^\/\//) { last; }
  99. $line =~ s/^\s*\d+//; # sequence position
  100. for ($a = 0; $a < 20; $a++) {
  101. $line =~ s/^\s*(\S+)\s/ /;
  102. $hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1);
  103. }
  104. # Read query char in count profile
  105. for ($a = 0; $a < 20; $a++) {
  106. $counts[$i][$a] = 0;
  107. }
  108. $line =~ /^\s*\d+\s+(\S)/;
  109. $counts[$i][$aa2i{$1}] = 1;
  110. $line = <IN>; $line = <IN>;
  111. $i++;
  112. }
  113. } elsif ($line =~ /^HMMER/) {
  114. my @pb;
  115. while ($line = <IN>) {
  116. if ($line =~ /^NAME\s+(\S+)/) {
  117. $name = $1;
  118. } elsif ($line =~ /^LENG\s+(\d+)/) {
  119. $len = $1;
  120. } elsif ($line =~ /^NULE/) {
  121. $line =~ s/^NULE//; # sequence position
  122. for ($a = 0; $a < 20; $a++) {
  123. $line =~ s/^\s*(\S+)\s/ /;
  124. # pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE);
  125. $pb[$a] = 0.05 * (2**($1/1000));
  126. }
  127. } elsif ($line =~ /^HMM/) {
  128. last;
  129. }
  130. }
  131. $line = <IN>; $line = <IN>; # Skip header lines
  132. # Read profiles and query seq
  133. $i = 0;
  134. while ($line = <IN>) {
  135. if ($line =~ /^\/\//) { last; }
  136. $line =~ s/^\s*\d+//; # sequence position
  137. for ($a = 0; $a < 20; $a++) {
  138. $line =~ s/^\s*(\S+)\s/ /;
  139. # prob = pb[a]*fpow2(float(ptr)/HMMSCALE);
  140. $hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000));
  141. }
  142. # Read query char in count profile
  143. $line = <IN>;
  144. for ($a = 0; $a < 20; $a++) {
  145. $counts[$i][$a] = 0;
  146. }
  147. $line =~ /^\s*(\S)/;
  148. $counts[$i][$aa2i{$1}] = 1;
  149. $line = <IN>;
  150. $i++;
  151. }
  152. } else {
  153. print($help);
  154. print "ERROR! Unknown input format!\n";
  155. exit(1);
  156. }
  157. ######################################
  158. # build count_profile (mix orig sequence and HMMER-profile)
  159. ######################################
  160. for ($i = 0; $i < $len; $i++) {
  161. my $sum = 0;
  162. for ($a = 0; $a < 20; $a++) {
  163. $counts[$i][$a] += $factor * $hmmer_prof[$i][$a];
  164. $sum += $counts[$i][$a];
  165. }
  166. # Normalize to one
  167. my $fac = 1 / $sum;
  168. for ($a = 0; $a < 20; $a++) {
  169. $counts[$i][$a] *= $fac;
  170. }
  171. }
  172. ######################################
  173. # write count_profile
  174. ######################################
  175. open (OUT, ">$outfile");
  176. # Write header
  177. printf(OUT "CountProfile\n");
  178. printf(OUT "NAME\t%s\n", $name);
  179. printf(OUT "LENG\t%i\n", $len);
  180. printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet
  181. printf(OUT "COUNTS");
  182. for ($a = 0; $a < 20; $a++) {
  183. printf(OUT "\t%s", $aminoacids[$a]);
  184. }
  185. printf(OUT "\tNEFF\n");
  186. # Write profile
  187. for ($i = 0; $i < $len; $i++) {
  188. printf(OUT "%i", $i+1);
  189. for ($a = 0; $a < 20; $a++) {
  190. if ($counts[$i][$a] == 0) {
  191. printf(OUT "\t*");
  192. } else {
  193. printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5));
  194. }
  195. }
  196. printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1
  197. }
  198. printf(OUT "//\n");
  199. close OUT;
  200. exit;
  201. sub log2()
  202. {
  203. my $n = shift;
  204. return (log($n)/$log2);
  205. }