123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674 |
- # Package Align.pl
- # (c) Johannes Soeding, 2006
- # Perl functions for Smith-Waterman and Needleman-Wunsch sequence alignment
- # HHsuite version 3.0.0 (15-03-2015)
- #
- # Reference:
- # Remmert M., Biegert A., Hauser A., and Soding J.
- # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
- # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
- # (C) Johannes Soeding and Michael Remmert, 2012
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- # We are very grateful for bug reports! Please contact us at [email protected]
- #############################################################################
- # Subroutine AlignSW
- # Smith-Waterman local alignment
- # usage:
- # 1. Use global variables of package Align.pm:
- # $score = &AlignSW();
- # printf(" XSEQ: $Align::xseq\n");
- # printf(" MATCH: $Align::Sstr\n");
- # printf(" YSEQ: $Align::yseq\n");
- # etc.
- #
- # 2. Use references and/or global variables
- # $score = &AlignSW(\$xseq,\$yseq);
- # $score = &AlignSW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S);
- # printf(" XSEQ: $xseq\n");
- # printf(" MATCH: $Sstr\n");
- # printf(" YSEQ: $yseq\n");
- #
- # Input: $xseq, $yseq : sequences x and y as strings
- # Param: $main::d : gap opening penalty
- # $main::e : gap extension penalty
- # Output: return value : bit score
- # $xseq, $yseq : aligned residues of x and y (with - as gap)
- # @i : $i[$col],$j[$col] are aligned residues in column $col
- # @j : (first is 1 (NOT 0!), 0 means gap)
- # $imin : first aligned residue of sequence x
- # $imax : last aligned residue of sequence x
- # $jmin : first aligned residue of sequence y
- # $jmax : last aligned residue of sequence y
- # $Sstr : string belonging to $xseq and $yseq showing quality of alignment
- # $S[$col] : match score for aligning positions $i[$col] and $j[$col]
- #############################################################################
- #############################################################################
- # Subroutine AlignNW
- # Needleman-Wunsch global alignment
- # usage: $score = &AlignNW();
- # $score = &AlignNW(\$xseq,\$yseq);
- # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j);
- # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S);
- #
- # Input: $xseq, $yseq : sequences x and y as strings
- # Param: $main::d : gap opening penalty
- # $main::e : gap extension penalty
- # $main::g : end gap penalty
- # Output: return value : bit score
- # $xseq, $yseq : aligned residues of x and y (with - as gap)
- # @i : $i[$col],$j[$col] are aligned residues in column $col
- # @j : (first is 1 (NOT 0!), 0 means gap)
- # $imin : first aligned residue of sequence x
- # $imax : last aligned residue of sequence x
- # $jmin : first aligned residue of sequence y
- # $jmax : last aligned residue of sequence y
- # $Sstr : string belonging to $xseq and $yseq showing quality of alignment
- # $S[$col] : match score for aligning positions $i[$col] and $j[$col]
- #############################################################################
package Align;

use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
use Exporter;

# Both alignment routines and the matrix selector are exported by default.
our @ISA    = qw(Exporter);
our @EXPORT = qw(&AlignSW &AlignNW $matrix);

# Argument slots shared by AlignSW and AlignNW. Each routine overwrites a slot
# with the corresponding positional argument when one is passed and otherwise
# keeps the value left by an earlier call. The routines dereference every slot
# ($$xseq, @$ri, $$imin, ...), so the slots hold references.
our (
    $xseq,    # sequence x (input residues, replaced by the aligned row)
    $yseq,    # sequence y (input residues, replaced by the aligned row)
    $ri,      # array: $ri->[$col] is the residue of x in column $col (0 = gap)
    $rj,      # array: $rj->[$col] is the residue of y in column $col (0 = gap)
    $imin,    # first aligned residue of sequence x
    $imax,    # last aligned residue of sequence x
    $jmax,    # last aligned residue of sequence y
    $jmin,    # first aligned residue of sequence y
    $Sstr,    # annotation string describing the match quality per column
    $rS,      # array: $rS->[$col] is the score contributed by column $col
);

# Name of the substitution matrix: "Gonnet" or "Blosum62"; any other value
# (including undef) selects the simple built-in default.
our $matrix;

my $firstcall = 1;    # true until SetSubstitutionMatrix has filled @Sab
my @Sab;              # substitution matrix actually used by the alignment routines

# Maps an upper-case letter (index = ord(letter) - 65) to its row/column in the
# substitution matrices. B->D, Z->E, U->C; J, O and X share code 20 (unknown).
#            A   B   C   D   E   F   G   H   I   J   K   L   M
my @ch2i = ( 0,  3,  4,  3,  6, 13,  7,  8,  9, 20, 11, 10, 12,
#            N   O   P   Q   R   S   T   U   V   W   X   Y   Z
             2, 20, 14,  5,  1, 15, 16,  4, 19, 17, 20, 18,  6 );
# Gonnet substitution matrix in units of 10*log10().
# Rows and columns follow the order
#   A R N D C Q E G H I L K M F P S T W Y V X ~
# i.e. indices 0..19 are the amino acids, 20 is X (unknown) and 21 is the
# code used for gap / invalid symbols.
my @Gonnet = (
    [ 2.4,-0.6,-0.3,-0.3, 0.5,-0.2, 0.0, 0.5,-0.8,-0.8,-1.2,-0.4,-0.7,-2.3, 0.3, 1.1, 0.6,-3.6,-2.2, 0.1,-1.0,-9.9], # A
    [-0.6, 4.7, 0.3,-0.3,-2.2, 1.5, 0.4,-1.0, 0.6,-2.4,-2.2, 2.7,-1.7,-3.2,-0.9,-0.2,-0.2,-1.6,-1.8,-2.0,-1.0,-9.9], # R
    [-0.3, 0.3, 3.8, 2.2,-1.8, 0.7, 0.9, 0.4, 1.2,-2.8,-3.0, 0.8,-2.2,-3.1,-0.9, 0.9, 0.5,-3.6,-1.4,-2.2,-1.0,-9.9], # N
    [-0.3,-0.3, 2.2, 4.7,-3.2, 0.9, 2.7, 0.1, 0.4,-3.8,-4.0, 0.5,-3.0,-4.5,-0.7, 0.5, 0.0,-5.2,-2.8,-2.9,-1.0,-9.9], # D
    [ 0.5,-2.2,-1.8,-3.2,11.5,-2.4,-3.0,-2.0,-1.3,-1.1,-1.5,-2.8,-0.9,-0.8,-3.1, 0.1,-0.5,-1.0,-0.5, 0.0,-1.0,-9.9], # C
    [-0.2, 1.5, 0.7, 0.9,-2.4, 2.7, 1.7,-1.0, 1.2,-1.9,-1.6, 1.5,-1.0,-2.6,-0.2, 0.2, 0.0,-2.7,-1.7,-1.5,-1.0,-9.9], # Q
    [ 0.0, 0.4, 0.9, 2.7,-3.0, 1.7, 3.6,-0.8, 0.4,-2.7,-2.8, 1.2,-2.0,-3.9,-0.5, 0.2,-0.1,-4.3,-2.7,-1.9,-1.0,-9.9], # E
    [ 0.5,-1.0, 0.4, 0.1,-2.0,-1.0,-0.8, 6.6,-1.4,-4.5,-4.4,-1.1,-3.5,-5.2,-1.6, 0.4,-1.1,-4.0,-4.0,-3.3,-1.0,-9.9], # G
    [-0.8, 0.6, 1.2, 0.4,-1.3, 1.2, 0.4,-1.4, 6.0,-2.2,-1.9, 0.6,-1.3,-0.1,-1.1,-0.2,-0.3,-0.8,-2.2,-2.0,-1.0,-9.9], # H
    [-0.8,-2.4,-2.8,-3.8,-1.1,-1.9,-2.7,-4.5,-2.2, 4.0, 2.8,-2.1, 2.5, 1.0,-2.6,-1.8,-0.6,-1.8,-0.7, 3.1,-1.0,-9.9], # I
    [-1.2,-2.2,-3.0,-4.0,-1.5,-1.6,-2.8,-4.4,-1.9, 2.8, 4.0,-2.1, 2.8, 2.0,-2.3,-2.1,-1.3,-0.7, 0.0, 1.8,-1.0,-9.9], # L
    [-0.4, 2.7, 0.8, 0.5,-2.8, 1.5, 1.2,-1.1, 0.6,-2.1,-2.1, 3.2,-1.4,-3.3,-0.6, 0.1, 0.1,-3.5,-2.1,-1.7,-1.0,-9.9], # K
    [-0.7,-1.7,-2.2,-3.0,-0.9,-1.0,-2.0,-3.5,-1.3, 2.5, 2.8,-1.4, 4.3, 1.6,-2.4,-1.4,-0.6,-1.0,-0.2, 1.6,-1.0,-9.9], # M
    [-2.3,-3.2,-3.1,-4.5,-0.8,-2.6,-3.9,-5.2,-0.1, 1.0, 2.0,-3.3, 1.6, 7.0,-3.8,-2.8,-2.2, 3.6, 5.1, 0.1,-1.0,-9.9], # F
    [ 0.3,-0.9,-0.9,-0.7,-3.1,-0.2,-0.5,-1.6,-1.1,-2.6,-2.3,-0.6,-2.4,-3.8, 7.6, 0.4, 0.1,-5.0,-3.1,-1.8,-1.0,-9.9], # P
    [ 1.1,-0.2, 0.9, 0.5, 0.1, 0.2, 0.2, 0.4,-0.2,-1.8,-2.1, 0.1,-1.4,-2.8, 0.4, 2.2, 1.5,-3.3,-1.9,-1.0,-1.0,-9.9], # S
    [ 0.6,-0.2, 0.5, 0.0,-0.5, 0.0,-0.1,-1.1,-0.3,-0.6,-1.3, 0.1,-0.6,-2.2, 0.1, 1.5, 2.5,-3.5,-1.9, 0.0,-1.0,-9.9], # T
    [-3.6,-1.6,-3.6,-5.2,-1.0,-2.7,-4.3,-4.0,-0.8,-1.8,-0.7,-3.5,-1.0, 3.6,-5.0,-3.3,-3.5,14.2, 4.1,-2.6,-1.0,-9.9], # W
    [-2.2,-1.8,-1.4,-2.8,-0.5,-1.7,-2.7,-4.0,-2.2,-0.7, 0.0,-2.1,-0.2, 5.1,-3.1,-1.9,-1.9, 4.1, 7.8,-1.1,-1.0,-9.9], # Y
    [ 0.1,-2.0,-2.2,-2.9, 0.0,-1.5,-1.9,-3.3,-2.0, 3.1, 1.8,-1.7, 1.6, 0.1,-1.8,-1.0, 0.0,-2.6,-1.1, 3.4,-1.0,-9.9], # V
    [-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,+1.0,-9.9], # X
    [-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9]  # ~
);
# BLOSUM62 substitution matrix.
# Rows and columns follow the order
#   A R N D C Q E G H I L K M F P S T W Y V X ~
# i.e. indices 0..19 are the amino acids, 20 is X (unknown) and 21 is the
# code used for gap / invalid symbols.
my @BLOSUM62 = (
    [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0, 0,-9], # A
    [-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1,-9], # R
    [-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-1,-9], # N
    [-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-1,-9], # D
    [ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-2,-9], # C
    [-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-1,-9], # Q
    [-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-1,-9], # E
    [ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-9], # G
    [-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-1,-9], # H
    [-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-1,-9], # I
    [-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-1,-9], # L
    [-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-1,-9], # K
    [-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-1,-9], # M
    [-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-1,-9], # F
    [-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-9], # P
    [ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0,-9], # S
    [ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0, 0,-9], # T
    [-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-2,-9], # W
    [-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-1,-9], # Y
    [ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-1,-9], # V
    [ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,+1,-9], # X
    [-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9]  # ~
);
- # print("Substitution matrix:\n");
- # for ($a=0; $a<=20; $a++) {
- # for ($b=0; $b<=20; $b++) {
- # printf("%6.1f ",$Sab[$a][$b]);
- # }
- # printf("\n");
- # }
# Fill @Sab, the substitution matrix used by AlignSW and AlignNW.
#
# Only the first call does any work; later calls return immediately, so the
# value of $matrix at the time of the first alignment decides the matrix for
# the rest of the run.
#   $matrix eq "Gonnet"   : @Gonnet converted from 10*log10 units to bits
#   $matrix eq "Blosum62" : @BLOSUM62 copied unscaled (prints a notice to STDOUT)
#   anything else / undef : +2 for identical residues, -1 for mismatches,
#                           0 against X, +1 for X/X, -10 against code 21
# All 22 rows and columns (0..19 amino acids, 20 = X, 21 = gap/invalid symbol)
# are filled, because the alignment routines look up code 21 for the dummy
# 0'th residue and for every '-', '.', '~' or other non-letter in the input.
# Takes no arguments and returns nothing useful.
sub SetSubstitutionMatrix {
    return unless $firstcall;
    $firstcall = 0;

    if (defined($matrix) && $matrix eq "Gonnet") {
        # 0.3322 = log2(10)/10 converts units of 10*log10() to bits
        for my $row (0 .. 21) {
            for my $col (0 .. 21) {
                $Sab[$row][$col] = $Gonnet[$row][$col]*0.3322;
            }
        }
    }
    elsif (defined($matrix) && $matrix eq "Blosum62") {
        printf("Using Blosum62 matrix...\n");
        for my $row (0 .. 21) {
            for my $col (0 .. 21) {
                $Sab[$row][$col] = $BLOSUM62[$row][$col];
            }
        }
    }
    else {
        # identity-style default for the 20 amino acids
        for my $row (0 .. 19) {
            for my $col (0 .. 19) {
                $Sab[$row][$col] = -1;
            }
            $Sab[$row][$row] = 2;
        }
        # X (code 20) is neutral against everything ...
        for my $k (0 .. 20) {
            $Sab[20][$k] = $Sab[$k][20] = 0;
        }
        $Sab[20][20] = +1;    # ... but if in doubt, match X with X
        # code 21 (gap / invalid symbol) is penalized against everything, itself included
        for my $k (0 .. 21) {
            $Sab[21][$k] = $Sab[$k][21] = -10;
        }
    }
    return;
}
# maxbt(val1,...,valx,\$bt) returns the largest of val1..valx and stores the
# zero-based position of that value in $bt. When several values are equal to
# the maximum, the position of the first one is stored.
sub maxbt {
    my $bt_ref = pop @_;           # the trailing argument is the reference to $bt
    my ($best, @others) = @_;
    ${$bt_ref} = 0;
    for my $k (0 .. $#others) {
        next unless $others[$k] > $best;
        $best = $others[$k];
        ${$bt_ref} = $k + 1;       # @others starts at position 1 of the argument list
    }
    return $best;
}
# max3bt(val1,val2,val3,\$bt) returns the largest of the three values and
# stores its zero-based position in $bt. A later value replaces the current
# best whenever it is not smaller, so ties go to the later position.
sub max3bt {
    my ($v0, $v1, $v2, $bt_ref) = @_;
    my ($best, $where) = ($v0, 0);
    unless ($v1 < $best) {
        ($best, $where) = ($v1, 1);
    }
    unless ($v2 < $best) {
        ($best, $where) = ($v2, 2);
    }
    ${$bt_ref} = $where;
    return $best;
}
# max2bt(val1,val2,\$bt) returns the larger of the two values and stores its
# zero-based position in $bt. On a tie the second value (position 1) wins.
sub max2bt {
    my ($first, $second, $bt_ref) = @_;
    my $first_wins = ($second < $first);
    ${$bt_ref} = $first_wins ? 0 : 1;
    return $first_wins ? $first : $second;
}
#############################################################################
# Subroutine AlignSW
# Smith-Waterman local alignment with affine gap penalties
#
# Arguments are optional, positional references; each one that is passed
# replaces the package variable of the same name, the rest keep the value of
# an earlier call:
#   \$xseq, \$yseq   in: sequences as strings, out: aligned rows ('-' = gap)
#   \@i, \@j         out: residue number per alignment column (1-based, 0 = gap)
#   \$imin, \$imax   out: first / last aligned residue of x
#   \$jmin, \$jmax   out: first / last aligned residue of y
#   \$Sstr           out: annotation string (letter = same residue code,
#                    '+' = positive score, '.' = everything else)
#   \@S              out: score contributed by each column
# Gap penalties are read from package main: $main::d (gap opening, overridden
# by $main::dx for gaps in x and $main::dy for gaps in y when those are
# defined) and $main::e (gap extension).
# Returns the score of the best local alignment, or 0 after a warning when one
# of the sequences is empty.
#############################################################################
sub AlignSW {
    # Bind the positional arguments to the package variables, in this order.
    my @slots = (\$xseq, \$yseq, \$ri, \$rj, \$imin, \$imax, \$jmin, \$jmax, \$Sstr, \$rS);
    for my $k (0 .. $#slots) {
        last if $k > $#_;
        ${ $slots[$k] } = $_[$k];
    }

    # Strip whitespace before testing for emptiness, so that a sequence made
    # of blanks only is rejected instead of producing an empty DP matrix.
    $$xseq =~ s/\s//g;
    if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
    $$yseq =~ s/\s//g;
    if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence y is empty\n"); return 0;}

    my @xchr = split(//,$$xseq);    # characters of sequence x
    my @ychr = split(//,$$yseq);    # characters of sequence y
    my $Lx = @xchr;                 # length of sequence x
    my $Ly = @ychr;                 # length of sequence y

    # Translate characters into substitution-matrix indices. Letters go
    # through @ch2i; everything else becomes 21, with a warning on STDERR
    # unless it is one of the gap symbols '.', '-' or '~'.
    my $encode = sub {
        my ($chars, $which) = @_;
        my @codes;
        for my $pos (0 .. $#{$chars}) {
            my $ascii = ord(uc($chars->[$pos]));
            if ($ascii >= 65 && $ascii <= 90) {
                push(@codes, $ch2i[$ascii-65]);
                next;
            }
            if ($ascii!=ord(".") && $ascii!=ord("-") && $ascii!=ord("~")) {
                printf(STDERR "\nWARNING: invalid symbol '%s' in pos $pos of $which sequence to be aligned\n",$chars->[$pos]);
            }
            push(@codes, 21);
        }
        return @codes;
    };
    my @xres = (21, $encode->(\@xchr, "first"));     # leading 21 = dummy 0'th element
    my @yres = (21, $encode->(\@ychr, "second"));    # leading 21 = dummy 0'th element
    unshift(@xchr, " ");                             # keep characters 1-based as well
    unshift(@ychr, " ");

    SetSubstitutionMatrix();

    my $dx = defined $main::dx ? $main::dx : $main::d;    # opening a gap in x
    my $dy = defined $main::dy ? $main::dy : $main::d;    # opening a gap in y
    my $e  = $main::e;                                    # extending a gap

    my @M;      # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
    my @A;      # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
    my @B;      # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
    my @Mbt;    # $Mbt[a][b] = 0:STOP 1:M 2:A 3:B
    my @Abt;    # $Abt[a][b] = 0:A 1:M
    my @Bbt;    # $Bbt[a][b] = 0:B 1:M

    # Initialization: the borders can never be part of an alignment
    for my $i (0 .. $Lx) {
        $M[$i][0] = $A[$i][0] = $B[$i][0] = -999;
    }
    for my $j (1 .. $Ly) {
        $M[0][$j] = $A[0][$j] = $B[0][$j] = -999;
    }

    # Iteration
    for my $i (1 .. $Lx) {
        my ($Mi, $Mi1) = ($M[$i], $M[$i-1]);
        my ($Ai, $Ai1) = ($A[$i], $A[$i-1]);
        my ($Bi, $Bi1) = ($B[$i], $B[$i-1]);
        # An unfilled matrix row must not abort the run; it scores as 0.
        my $Sabx = $Sab[$xres[$i]] || [];
        for my $j (1 .. $Ly) {
            my $j1 = $j-1;
            # Match state: a local alignment may start here (predecessor score
            # 0, backtrace code 0 = STOP) or continue from M, A or B. The codes
            # written to @Mbt are the state numbers the backtrace expects.
            my ($best, $from) = (0, 0);
            if ($Mi1->[$j1] > $best) { $best = $Mi1->[$j1]; $from = 1; }
            if ($Ai1->[$j1] > $best) { $best = $Ai1->[$j1]; $from = 2; }
            if ($Bi1->[$j1] > $best) { $best = $Bi1->[$j1]; $from = 3; }
            $Mbt[$i][$j] = $from;
            $Mi->[$j] = $best + $Sabx->[$yres[$j]];
            $Ai->[$j] = max2bt($Ai->[$j1]-$e, $Mi->[$j1]-$dx, \$Abt[$i][$j]);
            $Bi->[$j] = max2bt($Bi1->[$j]-$e, $Mi1->[$j]-$dy, \$Bbt[$i][$j]);
        }
    }

    # Finding maximum: a local alignment ends in the best-scoring match cell
    my $score = -1000;
    for my $i (1 .. $Lx) {
        my $Mi = $M[$i];
        for my $j (1 .. $Ly) {
            if ($Mi->[$j] > $score) {$score = $Mi->[$j]; $$imax = $i; $$jmax = $j;}
        }
    }

    # Backtracing, from the last aligned pair back to the STOP code
    @$ri = ();
    @$rj = ();
    @$rS = ();
    my $state = 1;                     # STOP:0 M:1 A:2 B:3; last state is M
    my ($i, $j) = ($$imax, $$jmax);
    $$xseq = ""; $$yseq = "";
    while ($state) {
        $i >= 0 or die("Error: \$i < 0 on line 370 in module Align.pm");
        $j >= 0 or die("Error: \$j < 0 on line 371 in module Align.pm");
        if ($state == 1) {
            # current state is M (match-match)
            unshift(@$ri, $i);
            unshift(@$rj, $j);
            unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
            $$xseq = $xchr[$i].$$xseq;
            $$yseq = $ychr[$j].$$yseq;
            $$imin = $i; $$jmin = $j;
            $state = $Mbt[$i][$j];     # previous state, 0 ends the alignment
            $i--; $j--;
        } elsif ($state == 2) {
            # current state is A (gap in x)
            unshift(@$ri, 0);
            unshift(@$rj, $j);
            $$xseq = "-".$$xseq;
            $$yseq = $ychr[$j].$$yseq;
            if ($Abt[$i][$j]) {
                # previous state was M
                unshift(@$rS, -$dx);
                $state = 1;
            } else {
                # previous state was A
                unshift(@$rS, -$e);
            }
            $j--;
        } else {
            # current state is B (gap in y)
            unshift(@$ri, $i);
            unshift(@$rj, 0);
            $$xseq = $xchr[$i].$$xseq;
            $$yseq = "-".$$yseq;
            if ($Bbt[$i][$j]) {
                # previous state was M
                unshift(@$rS, -$dy);
                $state = 1;
            } else {
                # previous state was B
                unshift(@$rS, -$e);
            }
            $i--;
        }
    }

    # Set annotation string representing match quality
    $$Sstr = "";
    for my $col (0 .. $#{$ri}) {
        if ($xres[$ri->[$col]] == $yres[$rj->[$col]]) {
            $$Sstr .= uc($xchr[$ri->[$col]]);
        } elsif ($rS->[$col] > 0) {
            $$Sstr .= "+";
        } else {
            $$Sstr .= ".";
        }
    }
    return $score;
}
#############################################################################
# Subroutine AlignNW
# Needleman-Wunsch global alignment with affine gap penalties and separately
# penalized end gaps
#
# Arguments are optional, positional references; each one that is passed
# replaces the package variable of the same name, the rest keep the value of
# an earlier call:
#   \$xseq, \$yseq   in: sequences as strings, out: aligned rows ('-' = gap)
#   \@i, \@j         out: residue number per alignment column (1-based, 0 = gap)
#   \$imin, \$imax   out: first / last aligned residue of x
#   \$jmin, \$jmax   out: first / last aligned residue of y
#   \$Sstr           out: annotation string (letter = same residue code,
#                    '+' = positive score, '.' = everything else)
#   \@S              out: score contributed by each column
# Gap penalties are read from package main: $main::d (gap opening, overridden
# by $main::dx for gaps in x and $main::dy for gaps in y when those are
# defined), $main::e (gap extension) and $main::g (per-residue end gap).
# Returns the score of the best global alignment, or 0 after a warning when
# one of the sequences is empty.
#############################################################################
sub AlignNW {
    # Bind the positional arguments to the package variables, in this order.
    my @slots = (\$xseq, \$yseq, \$ri, \$rj, \$imin, \$imax, \$jmin, \$jmax, \$Sstr, \$rS);
    for my $k (0 .. $#slots) {
        last if $k > $#_;
        ${ $slots[$k] } = $_[$k];
    }

    # Strip whitespace before testing for emptiness, so that a sequence made
    # of blanks only is rejected instead of producing an empty DP matrix.
    $$xseq =~ s/\s//g;
    if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
    $$yseq =~ s/\s//g;
    if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence y is empty\n"); return 0;}

    my @xchr = split(//,$$xseq);    # characters of sequence x
    my @ychr = split(//,$$yseq);    # characters of sequence y
    my $Lx = @xchr;                 # length of sequence x
    my $Ly = @ychr;                 # length of sequence y

    # Translate characters into substitution-matrix indices. Letters go
    # through @ch2i; everything else becomes 21, with a warning on STDERR
    # unless it is one of the gap symbols '.', '-' or '~'.
    my $encode = sub {
        my ($chars, $which) = @_;
        my @codes;
        for my $pos (0 .. $#{$chars}) {
            my $ascii = ord(uc($chars->[$pos]));
            if ($ascii >= 65 && $ascii <= 90) {
                push(@codes, $ch2i[$ascii-65]);
                next;
            }
            if ($ascii!=ord(".") && $ascii!=ord("-") && $ascii!=ord("~")) {
                printf(STDERR "\nWARNING: invalid symbol '%s' in pos $pos of $which sequence to be aligned\n",$chars->[$pos]);
            }
            push(@codes, 21);
        }
        return @codes;
    };
    my @xres = (21, $encode->(\@xchr, "first"));     # leading 21 = dummy 0'th element
    my @yres = (21, $encode->(\@ychr, "second"));    # leading 21 = dummy 0'th element
    unshift(@xchr, " ");                             # keep characters 1-based as well
    unshift(@ychr, " ");

    SetSubstitutionMatrix();

    my $dx = defined $main::dx ? $main::dx : $main::d;    # opening a gap in x
    my $dy = defined $main::dy ? $main::dy : $main::d;    # opening a gap in y
    my $e  = $main::e;                                    # extending a gap
    my $g  = $main::g;                                    # end gap, per residue

    # Cells outside the allowed region get a penalty large enough that the
    # backtrace can never be drawn through them (so $i and $j stay >= 0).
    my $maxlen = $Lx > $Ly ? $Lx : $Ly;
    my $Maxpenalty = 9;    # absolute of minimum value from BLOSUM62
    if ($g > $Maxpenalty) { $Maxpenalty = $g; }
    my $OutsidePenalty = ($maxlen + 1) * $Maxpenalty;
    if ($OutsidePenalty < 999) { $OutsidePenalty = 999; }

    my @M;      # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
    my @A;      # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
    my @B;      # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
    my @Mbt;    # $Mbt[a][b] = 0:M 1:A 2:B (previous state minus one, as set by max3bt)
    my @Abt;    # $Abt[a][b] = 0:A 1:M, or a state number 1..3 in seeded end-gap cells
    my @Bbt;    # $Bbt[a][b] = 0:B 1:M, or a state number 1..3 in seeded end-gap cells

    # Initialization: leading end gaps cost $g per residue
    $M[0][0] = $A[0][0] = $B[0][0] = 0;
    for my $i (1 .. $Lx) {
        $M[$i][0] = -1.0 * $OutsidePenalty;
        $A[$i][0] = -1.0 * $OutsidePenalty;
        $B[$i][0] = -$i*$g;
        $Bbt[$i][0] = 0;    # previous state was B as well (gap in y)
    }
    for my $j (1 .. $Ly) {
        $M[0][$j] = -1.0 * $OutsidePenalty;
        $A[0][$j] = -$j*$g;
        $B[0][$j] = -1.0 * $OutsidePenalty;
        $Abt[0][$j] = 0;    # previous state was A as well (gap in x)
    }

    # Iteration
    for my $i (1 .. $Lx) {
        my ($Mi, $Mi1) = ($M[$i], $M[$i-1]);
        my ($Ai, $Ai1) = ($A[$i], $A[$i-1]);
        my ($Bi, $Bi1) = ($B[$i], $B[$i-1]);
        # An unfilled matrix row must not abort the run; it scores as 0.
        my $Sabx = $Sab[$xres[$i]] || [];
        for my $j (1 .. $Ly) {
            my $j1 = $j-1;
            $Mi->[$j] = max3bt($Mi1->[$j1], $Ai1->[$j1], $Bi1->[$j1], \$Mbt[$i][$j]) + $Sabx->[$yres[$j]];
            $Ai->[$j] = max2bt($Ai->[$j1]-$e, $Mi->[$j1]-$dx, \$Abt[$i][$j]);
            $Bi->[$j] = max2bt($Bi1->[$j]-$e, $Mi1->[$j]-$dy, \$Bbt[$i][$j]);
        }
    }

    # Finding maximum: the alignment proper ends somewhere on the last column
    # or the last row; the residues left over are charged as end gaps.
    my $score = -1.0 * $OutsidePenalty;
    my $state = 0;    # M:1 A:2 B:3, state of the best end cell (0 = none found yet)
    my $consider = sub {
        my ($value, $iend, $jend, $endstate) = @_;
        return unless $value > $score;
        ($score, $$imax, $$jmax, $state) = ($value, $iend, $jend, $endstate);
        return;
    };
    for my $i (1 .. $Lx) {
        my $endgappenalty = ($Lx-$i)*$g;
        $consider->($M[$i][$Ly]-$endgappenalty, $i, $Ly, 1);
        $consider->($A[$i][$Ly]-$endgappenalty, $i, $Ly, 2);
        $consider->($B[$i][$Ly]-$endgappenalty, $i, $Ly, 3);
    }
    for my $j (1 .. $Ly-1) {
        my $endgappenalty = ($Ly-$j)*$g;
        $consider->($M[$Lx][$j]-$endgappenalty, $Lx, $j, 1);
        $consider->($A[$Lx][$j]-$endgappenalty, $Lx, $j, 2);
        $consider->($B[$Lx][$j]-$endgappenalty, $Lx, $j, 3);
    }
    if (!$state) {
        # no cell beat the initial score: fall back to a match in the last cell
        ($$imax, $$jmax, $state) = ($Lx, $Ly, 1);
    }

    # Make sure the end gapped regions are also backtraced. The cell next to
    # the best end cell is seeded with the state of that cell, so that the
    # backtrace resumes in the state that actually produced $score.
    if ($$jmax < $Ly) {
        $Abt[$Lx][$$jmax+1] = $state;
        for my $jj ($$jmax+2 .. $Ly) { $Abt[$Lx][$jj] = 0; }
        $state = 2;
    } elsif ($$imax < $Lx) {
        $Bbt[$$imax+1][$Ly] = $state;
        for my $ii ($$imax+2 .. $Lx) { $Bbt[$ii][$Ly] = 0; }
        $state = 3;
    }
    # otherwise the best cell is ($Lx,$Ly) itself and $state already names its state

    # Backtracing, from the last column back to the origin
    @$ri = ();
    @$rj = ();
    @$rS = ();
    my ($i, $j) = ($Lx, $Ly);
    $$xseq = ""; $$yseq = "";
    while ($i || $j) {
        $i >= 0 or die("Error: \$i < 0 on line 595 in module Align.pm");
        $j >= 0 or die("Error: \$j < 0 on line 596 in module Align.pm");
        if ($state == 1) {
            # current state is M (match-match)
            unshift(@$ri, $i);
            unshift(@$rj, $j);
            unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
            $$xseq = $xchr[$i].$$xseq;
            $$yseq = $ychr[$j].$$yseq;
            $$imin = $i; $$jmin = $j;
            $state = $Mbt[$i][$j]+1;    # previous state
            $i--; $j--;
        } elsif ($state == 2) {
            # current state is A (gap in x)
            unshift(@$ri, 0);           # $ri->[$col]=0 for gap in $x
            unshift(@$rj, $j);
            $$xseq = "-".$$xseq;
            $$yseq = $ychr[$j].$$yseq;
            my $bt = $Abt[$i][$j];
            my $endgap = ($i == $Lx || $i == 0);
            if ($bt) {
                # the gap starts here: 1 = previous state was M; seeded cells
                # may also name A (2) or B (3) as the state to resume in
                unshift(@$rS, $endgap ? -$g : -$dx);    # end gap or gap opening
                $state = $bt;
            } else {
                # previous state was A
                unshift(@$rS, $endgap ? -$g : -$e);     # end gap or gap extension
            }
            $j--;
        } else {
            # current state is B (gap in y)
            unshift(@$ri, $i);
            unshift(@$rj, 0);           # $rj->[$col]=0 for gap in $y
            $$xseq = $xchr[$i].$$xseq;
            $$yseq = "-".$$yseq;
            my $bt = $Bbt[$i][$j];
            my $endgap = ($j == $Ly || $j == 0);
            if ($bt) {
                # the gap starts here: 1 = previous state was M; seeded cells
                # may also name A (2) or B (3) as the state to resume in
                unshift(@$rS, $endgap ? -$g : -$dy);    # end gap or gap opening
                $state = $bt;
            } else {
                # previous state was B
                unshift(@$rS, $endgap ? -$g : -$e);     # end gap or gap extension
            }
            $i--;
        }
    }

    # Set annotation string representing match quality
    $$Sstr = "";
    for my $col (0 .. $#{$ri}) {
        if ($xres[$ri->[$col]] == $yres[$rj->[$col]]) {
            $$Sstr .= uc($xchr[$ri->[$col]]);
        } elsif ($rS->[$col] > 0) {
            $$Sstr .= "+";
        } else {
            $$Sstr .= ".";
        }
    }
    return $score;
}
- 1;
|