#!/usr/bin/perl # # create_profile_from_hmmer.pl # Create a profile (.prf) from a given HMMER/HMMER3 file # HHsuite version 3.0.0 (15-03-2015) # # Reference: # Remmert M., Biegert A., Hauser A., and Soding J. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). # (C) Michael Remmert and Johannes Soeding, 2012 # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de use lib $ENV{"HHLIB"}."/scripts"; use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. use strict; $|= 1; # Activate autoflushing on STDOUT # Default values: our $v=2; # verbose mode my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER)) my $scale = 1000; my $log2 = log(2); my $help=" create_profile_from_hmmer.pl from HHsuite $VERSION Create a profile (.prf) from a given HMMER/HMMER3 file Usage: perl create_profile_from_hmmer.pl -i [-o ] Options: -i Input file in HMMER/HMMER3 format -o Output file in prf-format (default: infile.prf) -v [0-5] verbose mode (default: $v) \n"; # Variable declarations my $line; my $infile; my $outfile; my $i; my $a; my @counts; # count profile (normalised to 1) my $name; # name of HMMER profile my $len; # length of HMMER profile my @hmmer_prof; # HMMER profile # A C D E F G H I K L M N P Q R S T V W Y my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18); my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'); my %aa2i; for ($a = 0; $a < 20; $a++) { $aa2i{$aminoacids[$a]} = $a; } ############################################################################################### # Processing command line input ############################################################################################### if (@ARGV<1) {die ($help);} my $options=""; for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";} if ($options=~s/ -i\s+(\S+) //) {$infile=$1;} if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;} if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;} if ($options=~s/ -v\s+(\S+) //) {$v=$1;} if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);} if (!$outfile) { $infile =~ /^(\S+)\.\S+?$/; $outfile = "$1.prf"; } ############################################################################################## # Main part ############################################################################################## ###################################### # Read HMMER sequence and profile ###################################### open (IN, $infile); $line = ; if ($line =~ /^HMMER3/) { while ($line = ) { if ($line =~ /^NAME\s+(\S+)/) { $name = $1; } elsif ($line =~ /^LENG\s+(\d+)/) { $len = $1; } elsif ($line =~ /^HMM/) { last; } } $line = ; $line = ; # Skip header lines if ($line =~ /^\s*COMPO/) { $line = ; $line = ; # Skip null model lines } # Read profiles and query seq $i = 0; while ($line = ) { if ($line =~ /^\/\//) { last; } $line =~ s/^\s*\d+//; # sequence position for ($a = 0; $a < 20; $a++) { $line =~ s/^\s*(\S+)\s/ /; $hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1); } # Read query char in count profile for ($a = 0; $a < 20; $a++) { $counts[$i][$a] = 0; } $line =~ /^\s*\d+\s+(\S)/; $counts[$i][$aa2i{$1}] = 1; $line = ; $line = ; $i++; } } elsif ($line =~ /^HMMER/) { my @pb; while ($line = ) { if ($line =~ /^NAME\s+(\S+)/) { $name = $1; } elsif ($line =~ /^LENG\s+(\d+)/) { $len = $1; } elsif ($line =~ /^NULE/) { $line =~ s/^NULE//; # sequence position for ($a = 0; $a < 20; $a++) { $line =~ s/^\s*(\S+)\s/ /; # pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE); $pb[$a] = 0.05 * (2**($1/1000)); } } elsif ($line =~ /^HMM/) { last; } } $line = ; $line = ; # Skip header lines # Read profiles and query seq $i = 0; while ($line = ) { if ($line =~ /^\/\//) { last; } $line =~ s/^\s*\d+//; # sequence position for ($a = 0; $a < 20; $a++) { $line =~ s/^\s*(\S+)\s/ /; # prob = pb[a]*fpow2(float(ptr)/HMMSCALE); $hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000)); } # Read query char in count profile $line = ; for ($a = 0; $a < 20; $a++) { $counts[$i][$a] = 0; } $line =~ /^\s*(\S)/; $counts[$i][$aa2i{$1}] = 1; $line = ; $i++; } } else { print($help); print "ERROR! Unknown input format!\n"; exit(1); } ###################################### # build count_profile (mix orig sequence and HMMER-profile) ###################################### for ($i = 0; $i < $len; $i++) { my $sum = 0; for ($a = 0; $a < 20; $a++) { $counts[$i][$a] += $factor * $hmmer_prof[$i][$a]; $sum += $counts[$i][$a]; } # Normalize to one my $fac = 1 / $sum; for ($a = 0; $a < 20; $a++) { $counts[$i][$a] *= $fac; } } ###################################### # write count_profile ###################################### open (OUT, ">$outfile"); # Write header printf(OUT "CountProfile\n"); printf(OUT "NAME\t%s\n", $name); printf(OUT "LENG\t%i\n", $len); printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet printf(OUT "COUNTS"); for ($a = 0; $a < 20; $a++) { printf(OUT "\t%s", $aminoacids[$a]); } printf(OUT "\tNEFF\n"); # Write profile for ($i = 0; $i < $len; $i++) { printf(OUT "%i", $i+1); for ($a = 0; $a < 20; $a++) { if ($counts[$i][$a] == 0) { printf(OUT "\t*"); } else { printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5)); } } printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1 } printf(OUT "//\n"); close OUT; exit; sub log2() { my $n = shift; return (log($n)/$log2); }