#!/usr/bin/perl
#
# create_profile_from_hmmer.pl
# Create a profile (.prf) from a given HMMER/HMMER3 file
# HHsuite version 3.0.0 (15-03-2015)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
# (C) Michael Remmert and Johannes Soeding, 2012
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
# We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
use lib $ENV{"HHLIB"}."/scripts";
use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
$|= 1; # Activate autoflushing on STDOUT
# Default values:
our $v=2; # verbose mode
my $factor = 1; # mix orig sequence and HMMER profile with this factor (1 = 50/50, 2 = 33/66, 0.5 = 66/33 (orig/HMMER))
my $scale = 1000;
my $log2 = log(2);
my $help="
create_profile_from_hmmer.pl from HHsuite $VERSION
Create a profile (.prf) from a given HMMER/HMMER3 file
Usage: perl create_profile_from_hmmer.pl -i [-o ]
Options:
-i Input file in HMMER/HMMER3 format
-o Output file in prf-format (default: infile.prf)
-v [0-5] verbose mode (default: $v)
\n";
# Variable declarations
my $line;
my $infile;
my $outfile;
my $i;
my $a;
my @counts; # count profile (normalised to 1)
my $name; # name of HMMER profile
my $len; # length of HMMER profile
my @hmmer_prof; # HMMER profile
# A C D E F G H I K L M N P Q R S T V W Y
my @hmmeraa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
my @aminoacids = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
my %aa2i;
for ($a = 0; $a < 20; $a++) {
$aa2i{$aminoacids[$a]} = $a;
}
###############################################################################################
# Processing command line input
###############################################################################################
if (@ARGV<1) {die ($help);}
my $options="";
for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
if ($options=~s/ -factor\s+(\S+) //) {$factor=$1;}
if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
if (!$outfile) {
$infile =~ /^(\S+)\.\S+?$/;
$outfile = "$1.prf";
}
##############################################################################################
# Main part
##############################################################################################
######################################
# Read HMMER sequence and profile
######################################
open (IN, $infile);
$line = ;
if ($line =~ /^HMMER3/) {
while ($line = ) {
if ($line =~ /^NAME\s+(\S+)/) {
$name = $1;
} elsif ($line =~ /^LENG\s+(\d+)/) {
$len = $1;
} elsif ($line =~ /^HMM/) {
last;
}
}
$line = ; $line = ; # Skip header lines
if ($line =~ /^\s*COMPO/) {
$line = ; $line = ; # Skip null model lines
}
# Read profiles and query seq
$i = 0;
while ($line = ) {
if ($line =~ /^\/\//) { last; }
$line =~ s/^\s*\d+//; # sequence position
for ($a = 0; $a < 20; $a++) {
$line =~ s/^\s*(\S+)\s/ /;
$hmmer_prof[$i][$hmmeraa2csaa[$a]] = exp(-1.0*$1);
}
# Read query char in count profile
for ($a = 0; $a < 20; $a++) {
$counts[$i][$a] = 0;
}
$line =~ /^\s*\d+\s+(\S)/;
$counts[$i][$aa2i{$1}] = 1;
$line = ; $line = ;
$i++;
}
} elsif ($line =~ /^HMMER/) {
my @pb;
while ($line = ) {
if ($line =~ /^NAME\s+(\S+)/) {
$name = $1;
} elsif ($line =~ /^LENG\s+(\d+)/) {
$len = $1;
} elsif ($line =~ /^NULE/) {
$line =~ s/^NULE//; # sequence position
for ($a = 0; $a < 20; $a++) {
$line =~ s/^\s*(\S+)\s/ /;
# pb[a] = (float) 0.05 * fpow2(float(ptr)/HMMSCALE);
$pb[$a] = 0.05 * (2**($1/1000));
}
} elsif ($line =~ /^HMM/) {
last;
}
}
$line = ; $line = ; # Skip header lines
# Read profiles and query seq
$i = 0;
while ($line = ) {
if ($line =~ /^\/\//) { last; }
$line =~ s/^\s*\d+//; # sequence position
for ($a = 0; $a < 20; $a++) {
$line =~ s/^\s*(\S+)\s/ /;
# prob = pb[a]*fpow2(float(ptr)/HMMSCALE);
$hmmer_prof[$i][$hmmeraa2csaa[$a]] = $pb[$a] * (2**($1/1000));
}
# Read query char in count profile
$line = ;
for ($a = 0; $a < 20; $a++) {
$counts[$i][$a] = 0;
}
$line =~ /^\s*(\S)/;
$counts[$i][$aa2i{$1}] = 1;
$line = ;
$i++;
}
} else {
print($help);
print "ERROR! Unknown input format!\n";
exit(1);
}
######################################
# build count_profile (mix orig sequence and HMMER-profile)
######################################
for ($i = 0; $i < $len; $i++) {
my $sum = 0;
for ($a = 0; $a < 20; $a++) {
$counts[$i][$a] += $factor * $hmmer_prof[$i][$a];
$sum += $counts[$i][$a];
}
# Normalize to one
my $fac = 1 / $sum;
for ($a = 0; $a < 20; $a++) {
$counts[$i][$a] *= $fac;
}
}
######################################
# write count_profile
######################################
open (OUT, ">$outfile");
# Write header
printf(OUT "CountProfile\n");
printf(OUT "NAME\t%s\n", $name);
printf(OUT "LENG\t%i\n", $len);
printf(OUT "ALPH\t20\n"); # 20 amino acid alphabet
printf(OUT "COUNTS");
for ($a = 0; $a < 20; $a++) {
printf(OUT "\t%s", $aminoacids[$a]);
}
printf(OUT "\tNEFF\n");
# Write profile
for ($i = 0; $i < $len; $i++) {
printf(OUT "%i", $i+1);
for ($a = 0; $a < 20; $a++) {
if ($counts[$i][$a] == 0) {
printf(OUT "\t*");
} else {
printf(OUT "\t%i", int(-(log2($counts[$i][$a]) * $scale)+0.5));
}
}
printf(OUT "\t%i\n", 1 * $scale); # set Neff to 1
}
printf(OUT "//\n");
close OUT;
exit;
sub log2()
{
my $n = shift;
return (log($n)/$log2);
}