hungerzs
/
hh-suite


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
							#!/usr/bin/perl -w
package utilities;

use strict;
use config;

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(System normalizeToOne sigmoid normalizeToOneHash Three2OneLetter One2ThreeLetter getTMscoreFrom BuildTabFiles BuildSingleTabFile max min KMP match_all_positions remove_ranges mean sd sample euklid_dist verbose get_dirname get_basename sumProbLen get_PDB_chain get_seq_len get_HMM_len trim ltrim rtrim TMalignBetween TMscoreBetween TMalignIDBetween hashToStr get_neff_from_hhm getSSPredFromHHM getCoverageApprox randBetween printMatrix printHash symmetricMinMatrix logistic scalarProduct PosteriorsFromTabFile getRandomString);

my $config = HHpredConfig->instance();

## Draw a random integer from the closed interval [lower, upper].
##
## lower: smallest value that may be returned
## upper: largest value that may be returned
## returns: the drawn number
sub randBetween {
    my $lower = shift;
    my $upper = shift;

    my $raw  = $lower + rand($upper - $lower + 1);
    my $rnum = int($raw);

    ## int() truncates towards zero, which would fold (-1, 0) onto 0 and
    ## almost never yield a negative lower bound; step down to the floor
    ## so that every integer of the interval is equally likely.
    $rnum-- if ($rnum > $raw);

    return $rnum;
}
    
## Tell whether number lies inside the closed interval [lower, upper].
## Returns 1 if it does, 0 otherwise.
sub isInRange {
    my ($number, $lower, $upper) = @_;

    return ($number >= $lower && $number <= $upper) ? 1 : 0;
}


## Logistic function 1 / (1 + e^(-x)).
sub logistic {
    my ($x) = @_;

    my $denominator = 1.0 + exp(-$x);
    return 1.0/$denominator;
}


## Scalar (dot) product of two vectors given as array references.
## Returns -1 if the two vectors differ in length.
sub scalarProduct {
    my ($xPtr, $yPtr) = @_;

    my $dim = scalar(@$xPtr);
    return -1 unless ($dim == scalar(@$yPtr));

    my $total = 0;
    foreach my $k (0 .. $dim - 1) {
        $total += $xPtr->[$k] * $yPtr->[$k];
    }
    return $total;
}

## Scale the given list of numbers so that they sum up to one.
## Assumes that all entries are non-negative.
## If the entries sum up to zero a warning is printed and the
## list is returned unchanged.
sub normalizeToOne {
    my @values = @_;

    my $total = 0;
    $total += $_ foreach (@values);

    if ($total == 0) {
        print "normalizeToOne: Warning: normalizer equals zero!\n";
        return @values;
    }

    $_ /= $total foreach (@values);

    return @values;
}


## Print a matrix (reference to an array of array references) to STDOUT,
## one row per line, every entry followed by a blank.
## If a non-empty matrixName is given, "matrixName:" is printed first.
sub printMatrix {
    my ($matrixPtr, $matrixName) = @_;
    $matrixName ||= "";

    print "$matrixName:\n" if ($matrixName ne "");

    foreach my $row (@{$matrixPtr}) {
        my $width = scalar(@{$row});
        foreach my $col (0 .. $width - 1) {
            print "$row->[$col] ";
        }
        print "\n";
    }
}


## Print all entries of a hash as "key => value" to STDOUT, sorted by key.
##
## hashPtr: reference to the hash to print
## inRow:   optional; if a defined value is passed, the entries are printed
##          in one row separated by blanks, otherwise one entry per line
sub printHash {
    my $hashPtr = shift;

    ## After the shift the optional flag is the first remaining argument.
    ## Any defined extra argument switches to row mode, so callers passing
    ## the flag in a later position keep working.
    my $inRow = (grep { defined($_) } @_) ? 1 : 0;
    my $separator = ($inRow == 0) ? "\n" : " ";

    foreach my $key (sort keys %{$hashPtr}) {
        print "$key => $hashPtr->{$key}$separator";
    }
}


## Make a matrix symmetric in place: for every pair of mirrored entries
## (i,j) and (j,i) both are set to the smaller of the two values.
## matrixPtr is a reference to an array of array references.
sub symmetricMinMatrix {
    my ($matrixPtr) = @_;

    my $i = 0;
    while ($i < scalar(@{$matrixPtr})) {
        my $j = $i + 1;
        ## only the part above the diagonal has to be visited
        while ($j < @{$matrixPtr->[$i]}) {
            if ($matrixPtr->[$i][$j] < $matrixPtr->[$j][$i]) {
                $matrixPtr->[$j][$i] = $matrixPtr->[$i][$j];
            }
            elsif ($matrixPtr->[$j][$i] < $matrixPtr->[$i][$j]) {
                $matrixPtr->[$i][$j] = $matrixPtr->[$j][$i];
            }
            $j++;
        }
        $i++;
    }
}

## Return a copy of the referenced hash whose values are scaled so that
## they sum up to one. The hash passed in is not modified.
## If the values sum up to zero a warning is printed and the
## copy is returned unscaled.
sub normalizeToOneHash {
    my ($hashref) = @_;

    my %scaled = %{$hashref};

    my $total = 0;
    $total += $_ foreach (values %scaled);

    if ($total == 0) {
        print "normalizeToOneHash: Warning: normalizer equals zero!\n";
        return %scaled;
    }

    ## values() yields aliases, so this rescales the entries of the copy
    $_ /= $total foreach (values %scaled);

    return %scaled;
}


## Sigmoid 1 / (1 + e^(-val)), clamped to exactly 0.0 for val < -15
## and exactly 1.0 for val > 15.
sub sigmoid {
    my ($val) = @_;

    return 0.0 if ($val < -15.0);
    return 1.0 if ($val > 15.0);

    return (1.0 / (1.0 + exp(-$val)));
}


##################################################################################
# Convert three-letter amino acid code into one-letter code
# (case-insensitive). Unknown codes are mapped to "X".
##################################################################################
sub Three2OneLetter {
    my $res = uc($_[0]);

    my %oneLetterOf = (
        GLY => "G", ALA => "A", VAL => "V", LEU => "L", ILE => "I",
        MET => "M", PHE => "F", TYR => "Y", TRP => "W", ASN => "N",
        ASP => "D", GLN => "Q", GLU => "E", CYS => "C", PRO => "P",
        SER => "S", THR => "T", LYS => "K", HIS => "H", ARG => "R",
        ASX => "D",
        GLX => "E",
        MSE => "M",    # SELENOMETHIONINE
        SEP => "S",    # PHOSPHOSERINE
        SEC => "C",    # SELENOCYSTEINE
        TPO => "T",    # PHOSPHOTHREONINE
        TYS => "Y",    # SULFONATED TYROSINE
        KCX => "K",    # LYSINE NZ-CARBOXYLIC ACID
    );

    return exists($oneLetterOf{$res}) ? $oneLetterOf{$res} : "X";
}

    
## Convert one-letter amino acid code into three-letter code
## (case-insensitive). Unknown letters are mapped to "UNK".
sub One2ThreeLetter {
    my $res = uc($_[0]);

    my %threeLetterOf = (
        G => "GLY", A => "ALA", V => "VAL", L => "LEU", I => "ILE",
        M => "MET", F => "PHE", Y => "TYR", W => "TRP", N => "ASN",
        D => "ASP", Q => "GLN", E => "GLU", C => "CYS", P => "PRO",
        S => "SER", T => "THR", K => "LYS", H => "HIS", R => "ARG",
        U => "SEC", B => "ASX", Z => "GLX",
    );

    return exists($threeLetterOf{$res}) ? $threeLetterOf{$res} : "UNK";
}


## Approximate coverage of the query by the template at index idx in tlist:
## (Qend - Qstart) of that template divided by the query length.
## Gaps inside the aligned region are not considered => approximation.
## Returns -1 if idx is negative or not smaller than the query length.
## NOTE(review): idx is checked against get_queryLength(), not against the
## number of templates in tlist - confirm that this bound is intended.
sub getCoverageApprox {
    my ($tlist, $idx) = @_;

    return -1 if ($idx < 0 || $idx >= $tlist->get_queryLength());

    my $alignedRange = $tlist->get($idx)->get_Qend() - $tlist->get($idx)->get_Qstart();

    return $alignedRange / $tlist->get_queryLength();
}


## Extract the TM-score from a TMscore/TMalign output file.
##
## file: path of the output file to scan
## returns: the number following the first "TM-score =" in the file,
##          or -1 if no such line exists
## dies if the file cannot be opened
sub getTMscoreFrom {
    my $file = shift;

    my $tmscore = -1;

    ## three-argument open with a lexical handle: the file name is taken
    ## literally and no package-global handle is clobbered
    open(my $fh, '<', $file) or die "Cant open $file: $!";
    while (my $line = <$fh>) {
        if ($line =~ /TM-score\s*=\s*(\d+(\.\d+)?)/) {
            $tmscore = $1;
            last;
        }
    }
    close($fh);

    return $tmscore;
}
 

## Run TMalign on two structure files and return the TM-score.
##
## strucFileOne, strucFileTwo: the two structure files handed to TMalign
## options: optional string of further command line options (default " ")
## TMalign: optional path of the executable (default: taken from config)
## returns: the text between "TM-score=" and the following comma in the
##          TMalign output, or -1 (after printing a warning) if not found
sub TMalignBetween {
    my ($strucFileOne, $strucFileTwo, $options, $TMalign) = @_;
    $options ||= " ";
    $TMalign ||= $config->get_TMalign();

    ## the command line is run through the shell
    my $TMalignOutput = `$TMalign $strucFileOne $strucFileTwo $options`;

    my $TMscore = ($TMalignOutput =~ /TM-score\s*=\s*(\S+),/) ? $1 : -1;

    print "WARNING: TMalignBetween could not find TMscore in TMalign output!\n" if ($TMscore == -1);

    return $TMscore;
}


## Run TMalign on two structure files and return TM-score and sequence
## identity.
##
## strucFileOne, strucFileTwo: the two structure files handed to TMalign
## options: optional string of further command line options (default " ")
## TMalign: optional path of the executable (default: taken from config)
## returns: list (TMscore, TMID) parsed from "TM-score=..., ID=..." in the
##          TMalign output; both are -1 (and a warning is printed) if the
##          pattern is not found
sub TMalignIDBetween {
    my $strucFileOne = shift;
    my $strucFileTwo = shift;
    my $options = shift || " ";
    my $TMalign = shift || $config->get_TMalign();

    ## the command line is run through the shell
    my $TMalignOutput = `$TMalign $strucFileOne $strucFileTwo $options`;
    my $TMID = -1;
    my $TMscore = -1;
    if ($TMalignOutput =~ /TM-score\s*=\s*(\S+),\s*ID\s*=\s*(\S+)/) {
        $TMscore = $1;
        $TMID = $2;
    }
    if ($TMscore == -1) {
        ## name this subroutine in the warning (it used to blame TMalignBetween)
        print "WARNING: TMalignIDBetween could not find TMscore in TMalign output!\n";
    }

    return ($TMscore, $TMID);
}


## Run TMscore on a model and a native structure and return the TM-score.
##
## modelFile, nativeFile: the two structure files handed to TMscore
## options: optional string of further command line options (default " ")
## TMscore: optional path of the executable (default: taken from config)
## returns: the number following the first "TM-score =" in the TMscore
##          output, or -1 (after printing a warning) if not found
sub TMscoreBetween {
    my ($modelFile, $nativeFile, $options, $TMscore) = @_;
    $options ||= " ";
    $TMscore ||= $config->get_TMscore();

    ## the command line is run through the shell
    my $TMscoreOutput = `$TMscore $modelFile $nativeFile $options`;

    my $score = -1;
    $score = $1 if ($TMscoreOutput =~ /TM-score\s*=\s*(\d+(\.\d+)?)/);

    print "WARNING: TMscoreBetween could not find TMscore in TMscore output!\n" if ($score == -1);

    return $score;
}

##################################################################################
## Split a combined tab file into one tab file per template.
##
## tabfile: tab file containing all tab results (created by hhsearch
##          option -atab)
## outbase: where to write the separate tab files
## maxHits: how many hits to write (default: 1000)
##
## The input tabfile contains a ">template" line for each template of the
## initial hhsearch, followed by its tab entries (i j score ss probab [dssp]).
## Column header lines ("i j ...") are skipped.
##
## The entries of each template are written to outbase.<template>.tab.
## If a template occurs more than once in the tabfile, the file written for
## an earlier occurrence is overwritten by the later one.
## Dies if a file cannot be opened or an output file cannot be closed.
##################################################################################
sub BuildTabFiles {
    my $tabfile = shift;
    my $outbase = shift;
    my $maxHits = shift;

    $maxHits = defined($maxHits) ? $maxHits : 1000;

    open(my $in, '<', $tabfile) or die "Cant open $tabfile: $!\n";

    my $hitnr = 1;
    my $out;    ## handle of the tab file currently being written (if any)

    while (my $line = <$in>) {

        next if ($line =~ /^\s*i\s+j/);

        ## new template
        if ($line =~ />(\S+)/) {
            my $singleTabFile = "$outbase.$1.tab";

            ## finish the file of the previous template
            if (defined($out)) {
                close($out) or die "Cant close tab file: $!\n";
                undef($out);
            }

            if ($hitnr > $maxHits) { last; }
            $hitnr++;

            ## write a new tabfile
            open($out, '>', $singleTabFile) or die "Cant open $singleTabFile: $!\n";
            next;
        }

        ## i j score ss probab [dssp]
        ## entries in front of the first template line have no file to go to
        if (defined($out) && $line =~ /^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+/) {
            print {$out} $line;
        }
    }

    ## the file of the last template must be closed as well, otherwise its
    ## buffered content is not on disk when this subroutine returns
    if (defined($out)) {
        close($out) or die "Cant close tab file: $!\n";
    }

    close($in);
}


############################################################################
## see subroutine BuildTabFiles
## this one creates only a single tab-file - the hitnr-th in the tabfile
##
## tabfile: tab file containing the tab results of all templates
## hitnr:   number (starting with 1) of the ">template" section to extract
## outbase: the entries are written to outbase.<template>.HIT<hitnr>.tab
## returns: 1 if the hitnr-th section was found (and written), 0 otherwise
## dies if a file cannot be opened or the output file cannot be closed
############################################################################
sub BuildSingleTabFile {
    my $tabfile = shift;
    my $hitnr = shift;
    my $outbase = shift;

    open(my $in, '<', $tabfile) or die "Cant open $tabfile: $!\n";

    my $hit = 1;
    my $found = 0;
    my $out;    ## only opened once the requested section has been reached

    while (my $line = <$in>) {

        next if ($line =~ /^\s*i\s+j/);

        ## begin of a new template
        if ($line =~ />(\S+)/) {
            my $template = $1;

            ## already found => the requested section is complete
            last if ($found == 1);

            ## found
            if ($hitnr == $hit) {
                $found = 1;
                my $singleTabFile = "$outbase.$template.HIT$hitnr.tab";
                open($out, '>', $singleTabFile) or die "Cant open $singleTabFile: $!\n";
                next;
            }

            $hit++;
        }

        ## not yet found
        next if ($found == 0);

        ## i j score ss probab [dssp]
        if ($line =~ /^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+/) {
            print {$out} $line;
        }
    }

    ## close the output exactly once, and only if it was opened at all
    if (defined($out)) {
        close($out) or die "Cant close tab file: $!\n";
    }
    close($in);

    return $found;
}


############################################################################
## see subroutine BuildTabFiles
## returns hash for one hit with query-residue => pp entries
##
## tabfile: tab file containing the tab results of all templates
## hitnr:   number (starting with 1) of the ">template" section to read
## returns: hash mapping the first column (i) of every entry of that
##          section to its fifth column (probab); empty if there is no
##          such section
## dies if the tabfile cannot be opened
############################################################################
sub PosteriorsFromTabFile {
    my $tabfile = shift;
    my $hitnr = shift;

    open(my $in, '<', $tabfile) or die "Cant open $tabfile: $!\n";

    my $hit = 1;
    my $found = 0;
    my %QidxToPP;

    while (my $line = <$in>) {
        next if ($line =~ /^\s*i\s+j/);

        ## begin of a new template
        if ($line =~ />\S+/) {
            ## already found => the requested section is complete
            last if ($found == 1);

            ## found
            if ($hitnr == $hit) {
                $found = 1;
                next;
            }
            $hit++;
        }

        ## not yet found
        next if ($found == 0);

        ## i j score ss probab [dssp]
        if ($line =~ /^\s*(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)/) {
            $QidxToPP{$1} = $2;
        }
    }
    close($in);

    return %QidxToPP;
}


## Numerically largest of the given values (undef for an empty list).
sub max {
    my ($best, @others) = @_;

    for my $candidate (@others) {
        $best = $candidate if ($candidate > $best);
    }

    return $best;
}


## Numerically smallest of the given values (undef for an empty list).
sub min {
    my ($best, @others) = @_;

    for my $candidate (@others) {
        $best = $candidate if ($candidate < $best);
    }

    return $best;
}


## Arithmetic mean of the given values.
## Dies with a division by zero if called with an empty list.
sub mean {
    my @values = @_;

    my $total = 0;
    $total += $_ foreach (@values);

    return $total/scalar(@values);
}


## Standard deviation of the given values, i.e. the square root of the
## mean squared deviation from the mean (normalised by N, not by N-1).
## Dies with a division by zero if called with an empty list.
sub sd {
    my @values = @_;

    my $center = mean(@values);
    my $N = scalar(@values);

    my $squares = 0;
    foreach my $value (@values) {
        my $deviation = $value - $center;
        $squares += $deviation * $deviation;
    }
    $squares *= 1.0/$N;

    return sqrt($squares);
}

## Sample a bin, where the probability of each bin is given in the
## argument list; the probabilities must sum up to 1.
## Returns the index of the first bin at which the cumulated probability
## reaches the random number drawn, or 0 if that never happens.
sub sample {
    my @probs = @_;

    my $threshold = rand();

    my $cumulated = 0;
    my $bin = 0;
    foreach my $prob (@probs) {
        $cumulated += $prob;
        return $bin if ($threshold <= $cumulated);
        $bin++;
    }
    return 0;
}


## Render the referenced hash as a string with one "key=value" line per
## entry, sorted by key.
sub hashToStr {
    my ($hashPtr) = @_;

    my %entries = %{$hashPtr};

    return join("", map { "$_=$entries{$_}\n" } sort(keys(%entries)));
}


## Calculate the number of residues on which the calculation of sumProbs
## is based: the positions where ss_dssp is not '-' and conf is not a blank.
## If the two strings differ in length a warning is printed and only the
## common prefix is inspected.
sub sumProbLen {
    my ($ss_dssp, $conf) = @_;

    my $len = length($ss_dssp);
    if ($len != length($conf)) {
        print "WARNING: sumProbLen length(ss_dssp) != length(conf)!\n";
        $len = min(length($ss_dssp), length($conf));
    }

    my $counted = 0;
    foreach my $pos (0 .. $len - 1) {
        my $dsspState = substr($ss_dssp, $pos, 1);
        my $confidence = substr($conf, $pos, 1);
        $counted++ unless ($dsspState eq '-' || $confidence eq " ");
    }
    return $counted;
}


## Euclidean distance between two vectors given as array references.
## The third argument v (default 2) is a verbosity level: for v >= 2 an
## error message is printed if the vectors differ in length. The distance
## is computed over the indices of the first vector in any case.
sub euklid_dist {
    my ($ref1, $ref2, $v) = @_;
    $v ||= 2;

    my @first = @$ref1;
    my @second = @$ref2;

    if ($v >= 2 && $#first != $#second) {
        print "ERROR: euklid_dist: vec1 and vec2 differ in length!\n";
    }

    my $squares = 0;
    foreach my $k (0 .. $#first) {
        my $delta = $first[$k] - $second[$k];
        $squares += $delta * $delta;
    }

    return (sqrt($squares));
}


## Knuth-Morris-Pratt algorithm
##
## st: text to search in
## ss: search string
## cs: optional flag (default 0); if it is non-zero both strings are
##     upper-cased first, i.e. the search ignores case
## returns: index of the first occurrence of ss in st (0 for an empty
##          search string), or -1 if ss does not occur in st
sub KMP {
    my $st = shift; ## text
    my $ss = shift; ## search string
    my $cs = shift || 0;

    if ($cs != 0) {
        $st = uc($st);
        $ss = uc($ss);
    }

    my @t = split(//, $st);
    my @s = split(//, $ss);

    my $n = scalar(@t);
    my $m = scalar(@s);

    ## An empty search string occurs at position 0. Without this guard the
    ## search loop below compares undefined characters and never ends.
    return 0 if ($m == 0);
    return -1 if ($m > $n);

    ## compute borders: $borders[$k] is the length of the longest proper
    ## border (prefix that is also a suffix) of the first $k characters
    my @borders = (-1, 0);
    my $i = 0;

    for (my $j=2; $j<=$m; $j++) {
        while (($i>=0) && ($s[$i] ne $s[$j-1])) {
            $i = $borders[$i];
        }
        $i++;
        $borders[$j] = $i;
    }

    ## search routine: $i is the start of the current alignment in the
    ## text, $j the number of characters already matched
    $i = 0;
    my $j = 0;

    while ($i <= $n - $m) {
        while ($t[$i+$j] eq $s[$j]) {
            $j++;
            if ($j == $m) {
                return $i;
            }
        }
        ## shift the search string so that its border lines up with the
        ## matched part and continue behind the border
        my $border = $borders[$j];
        $i = $i + ($j - $border);
        $j = ($border > 0) ? $border : 0;
    }
    return -1;
}


## Given a regex and a string, give back all positions where the regex
## matches the string: a list of [start, end] array references, where
## start is the offset of the match and end the offset just behind it.
sub match_all_positions {
    my ($regex, $string) = @_;

    my @spans;
    while ($string =~ /$regex/g) {
        my ($from, $to) = ($-[0], $+[0]);
        push(@spans, [ $from, $to ]);
    }
    return @spans;
}


# Trim function to remove leading and trailing whitespace
sub trim($) {
    my ($string) = @_;
    for ($string) {
        s/^\s+//;
        s/\s+$//;
    }
    return $string;
}

# Left trim function to remove leading whitespace
sub ltrim($) {
    my ($string) = @_;
    s/^\s+// for ($string);
    return $string;
}

# Right trim function to remove trailing whitespace
sub rtrim($) {
    my ($string) = @_;
    s/\s+$// for ($string);
    return $string;
}


## Find the range that contains a given index.
##
## idx:     the index to look up
## setsPtr: ptr to an array as generated by match_all_positions, i.e. an
##          array of arrays with 2 elements (start, end)
## returns: the list (start, end) of the first range with
##          start <= idx < end, or (-1,-1) if no such range is found
sub set_of_idx {
    my $idx = shift;
    my $setsPtr = shift;

    foreach my $set (@$setsPtr) {
        my $start = $set->[0];
        my $end = $set->[1];

        ## end is exclusive (it is the offset just behind the match)
        return ($start, $end) if ($start <= $idx && $idx < $end);
    }
    return (-1,-1);
}
	   

## Remove the ranges given in splits (generated by e.g. match_all_positions)
## from str and give back the remaining string.
## Every range is an array reference [start, end] with end exclusive; the
## ranges are expected in ascending order.
sub remove_ranges {
    my ($str, @splits) = @_;

    my $kept = "";
    my $cursor = 0;    ## offset behind the last removed range

    foreach my $range (@splits) {
        my ($gStart, $gEnd) = ($range->[0], $range->[1]);

        $kept .= substr($str, $cursor, $gStart - $cursor);
        $cursor = $gEnd;
    }
    $kept .= substr($str, $cursor);

    return $kept;
}


## Return the file name without directory and without extension, where the
## extension starts at the first dot of the file name
## (e.g. "/some/dir/query.a3m" => "query").
## A name without any directory part is handled as well.
## Returns undef if the name cannot be parsed (e.g. contains whitespace).
sub get_basename {
    my $dirbasename = shift;

    my $basename;
    ## Only use the capture if the match succeeded; otherwise $1 would
    ## still hold the result of some earlier, unrelated match.
    if ($dirbasename =~ /^(?:.*\/)?(\S+?)(\.\S+)?$/) {
        $basename = $1;
    }
    return $basename;
}


## Return the directory part of a path, i.e. everything in front of the
## last slash (e.g. "/some/dir/query.a3m" => "/some/dir").
## A path without any slash lies in the current directory => ".".
sub get_dirname {
    my $dirbasename = shift;

    ## Only use the capture if the match succeeded; otherwise $1 would
    ## still hold the result of some earlier, unrelated match.
    if ($dirbasename =~ /^(.*)\//) {
        return $1;
    }
    return ".";
}


## Very simple: assumes name to look like 1a7j_A and returns the part
## behind the last underscore ("A").
## Prints a warning and returns "" if the name does not look like that.
sub get_PDB_chain {
    my ($name) = @_;

    my ($chain) = ($name =~ /\S+\_(\S+)$/);
    return $chain if (defined($chain));

    print "WARNING utitlities.pm get_PDB_chain strange format!\n";
    return "";
}


## Length of a sequence line, not counting a trailing newline and
## the characters '*' and '-'.
sub get_seq_len {
    my ($seq) = @_;

    chomp($seq);
    my $ignored = ($seq =~ tr/*-//);

    return length($seq) - $ignored;
}


## Read the Neff value from an hhm file.
##
## hhmFile: path of the hhm file
## returns: second field of the first line starting with "Neff"
##          (case-insensitive), or -1 if there is no such line
## dies if the file cannot be opened
sub get_neff_from_hhm {
    my $hhmFile = shift;
    my $neff = -1;

    ## three-argument open with a lexical handle: the file name is taken
    ## literally and no package-global handle is clobbered
    open(my $hhm, '<', $hhmFile) or die ("Cant open $hhmFile: $!\n");
    while (my $line = <$hhm>) {
        if ($line =~ /^Neff\s+(\S+)/i) {
            $neff = $1;
            last;
        }
    }
    close($hhm);

    return $neff;
}


## Read the HMM length from an hhm file.
##
## hhmFile: path of the hhm file
## returns: second field of the first line starting with "LENG"
##          (case-insensitive), or -1 if there is no such line
## dies if the file cannot be opened
sub get_HMM_len {
    my $hhmFile = shift;
    my $len = -1;

    ## three-argument open with a lexical handle: the file name is taken
    ## literally and no package-global handle is clobbered
    open(my $hhm, '<', $hhmFile) or die ("Cant open $hhmFile: $!\n");
    while (my $line = <$hhm>) {
        if ($line =~ /^LENG\s+(\S+)/i) {
            $len = $1;
            last;
        }
    }
    close($hhm);

    return $len;
}


## Print message (followed by a newline) to STDOUT if the actual
## verbosity level actLevel is at least level.
sub verbose {
    my ($level, $actLevel, $message) = @_;

    print "$message\n" if ($actLevel >= $level);
}


## Read the predicted secondary structure from an hhm file.
##
## hhmFile: path of the hhm file
## returns: all lines between the ">ss_pred" line and the next line
##          starting with ">" (or the end of the file), joined without
##          newlines; "" if there is no ">ss_pred" line
## dies if the file cannot be opened
sub getSSPredFromHHM {
    my $hhmFile = shift;

    ## three-argument open with a lexical handle: the file name is taken
    ## literally and no package-global handle is clobbered
    open(my $hhm, '<', $hhmFile) or die "Cant open $hhmFile: $!";
    my $ssFound = 0;
    my $sspred = "";

    while (my $line = <$hhm>) {
        chomp($line);
        if ($line =~ /^>ss\_pred/) {
            $ssFound = 1;
            next;
        }
        ## the next header ends the ss_pred section
        if ($ssFound && $line =~ /^>/) {
            last;
        }
        next if ($ssFound == 0);
        $sspred .= $line;
    }
    close($hhm);
    return $sspred;
}


## Build a random string of len characters drawn from A-Z and a-z.
sub getRandomString {
    my ($len) = @_;

    my @alphabet = ("A".."Z", "a".."z");

    return join("", map { $alphabet[rand @alphabet] } 1..$len);
}


## Print a command line to STDOUT and execute it with system().
## Returns whatever system() returns.
sub System {
    my ($cmd) = @_;

    print "$cmd\n";
    return system("$cmd");
}

1;