Align.pm 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674
  1. # Package Align.pl
  2. # (c) Johannes Soeding, 2006
  3. # Perl functions for Smith-Waterman and Needleman-Wunsch sequence alignment
  4. # HHsuite version 3.0.0 (15-03-2015)
  5. #
  6. # Reference:
  7. # Remmert M., Biegert A., Hauser A., and Soding J.
  8. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  9. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  10. # (C) Johannes Soeding and Michael Remmert, 2012
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. # This program is distributed in the hope that it will be useful,
  16. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. # GNU General Public License for more details.
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. # We are very grateful for bug reports! Please contact us at [email protected]
  22. #############################################################################
  23. # Subroutine AlignSW
  24. # Smith-Waterman local alignment
  25. # usage:
  26. # 1. Use global variables of package Align.pm:
  27. # $score = &AlignSW();
  28. # printf(" XSEQ: $Align::xseq\n");
  29. # printf(" MATCH: $Align::Sstr\n");
  30. # printf(" YSEQ: $Align::yseq\n");
  31. # etc.
  32. #
  33. # 2. Use references and/or global variables
  34. # $score = &AlignSW(\$xseq,\$yseq);
  35. # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S);
  36. # printf(" XSEQ: $xseq\n");
  37. # printf(" MATCH: $Sstr\n");
  38. # printf(" YSEQ: $yseq\n");
  39. #
  40. # Input: $xseq, $yseq : sequences x and y as strings
  41. # Param: $main::d : gap opening penalty
  42. # $main::e : gap extension penalty
  43. # Output: return value : bit score
  44. # $xseq, $yseq : aligned residues of x and y (with - as gap)
  45. # @i : $i[$col],$j[$col] are aligned residues in column $col
  46. # @j : (first is 1 (NOT 0!), 0 means gap)
  47. # $imin : first aligned residue of sequence x
  48. # $imax : last aligned residue of sequence x
  49. # $jmin : first aligned residue of sequence y
  50. # $jmax : last aligned residue of sequence y
  51. # $Sstr : string belonging to $xseq and $yseq showing quality of alignment
  52. # $S[$col] : match score for aligning positions $i[$col] and $j[$col]
  53. #############################################################################
  54. #############################################################################
  55. # Subroutine AlignNW
  56. # Needleman-Wunsch global alignment
  57. # usage: $score = &AlignNW();
  58. # $score = &AlignNW(\$xseq,\$yseq);
  59. # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j);
  60. # $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr,\@S);
  61. #
  62. # Input: $xseq, $yseq : sequences x and y as strings
  63. # Param: $main::d : gap opening penalty
  64. # $main::e : gap extension penalty
  65. # $main::g : end gap penalty
  66. # Output: return value : bit score
  67. # $xseq, $yseq : aligned residues of x and y (with - as gap)
  68. # @i : $i[$col],$j[$col] are aligned residues in column $col
  69. # @j : (first is 1 (NOT 0!), 0 means gap)
  70. # $imin : first aligned residue of sequence x
  71. # $imax : last aligned residue of sequence x
  72. # $jmin : first aligned residue of sequence y
  73. # $jmax : last aligned residue of sequence y
  74. # $Sstr : string belonging to $xseq and $yseq showing quality of alingment
  75. # $S[$col] : match score for aligning positions $i[$col] and $j[$col]
  76. #############################################################################
  77. package Align;
  78. use strict;
  79. use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
  80. use Exporter;
  81. our @ISA = qw(Exporter);
  82. our @EXPORT = qw(&AlignSW &AlignNW $matrix);
  83. our $xseq; # first sequence
  84. our $yseq; # second sequence
  85. our $ri; # reference to input array: $i[$col] -> $ri->[$col]
  86. our $rj; # reference to input array: $j[$col] -> $rj->[$col]
  87. our $imin; # first aligned residue of sequence x
  88. our $imax; # last aligned residue of sequence x
  89. our $jmax; # first aligned residue of sequence y
  90. our $jmin; # last aligned residue of sequence y
  91. our $Sstr; # $Sstr annotates the match quality
  92. our $rS; # reference $rS->[$col] -> $S[$col] = match score for aligning positions $i[$col] and $j[$col]
  93. our $matrix;
  94. my $firstcall=1;
  95. my @Sab; # Substitution matrix in bit
  96. # A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
  97. my @ch2i=( 0, 3, 4, 3, 6,13, 7, 8, 9,20,11,10,12, 2,20,14, 5, 1,15,16, 4,19,17,20,18, 6);
  98. my @Gonnet = (
  99. # A R N D C Q E G H I L K M F P S T W Y V X
  100. # The Gonnet matrix is in units of 10*log10()
  101. [ 2.4,-0.6,-0.3,-0.3, 0.5,-0.2, 0.0, 0.5,-0.8,-0.8,-1.2,-0.4,-0.7,-2.3, 0.3, 1.1, 0.6,-3.6,-2.2, 0.1,-1.0,-9.9], # A
  102. [-0.6, 4.7, 0.3,-0.3,-2.2, 1.5, 0.4,-1.0, 0.6,-2.4,-2.2, 2.7,-1.7,-3.2,-0.9,-0.2,-0.2,-1.6,-1.8,-2.0,-1.0,-9.9], # R
  103. [-0.3, 0.3, 3.8, 2.2,-1.8, 0.7, 0.9, 0.4, 1.2,-2.8,-3.0, 0.8,-2.2,-3.1,-0.9, 0.9, 0.5,-3.6,-1.4,-2.2,-1.0,-9.9], # N
  104. [-0.3,-0.3, 2.2, 4.7,-3.2, 0.9, 2.7, 0.1, 0.4,-3.8,-4.0, 0.5,-3.0,-4.5,-0.7, 0.5, 0.0,-5.2,-2.8,-2.9,-1.0,-9.9], # D
  105. [ 0.5,-2.2,-1.8,-3.2,11.5,-2.4,-3.0,-2.0,-1.3,-1.1,-1.5,-2.8,-0.9,-0.8,-3.1, 0.1,-0.5,-1.0,-0.5, 0.0,-1.0,-9.9], # C
  106. [-0.2, 1.5, 0.7, 0.9,-2.4, 2.7, 1.7,-1.0, 1.2,-1.9,-1.6, 1.5,-1.0,-2.6,-0.2, 0.2, 0.0,-2.7,-1.7,-1.5,-1.0,-9.9], # Q
  107. [ 0.0, 0.4, 0.9, 2.7,-3.0, 1.7, 3.6,-0.8, 0.4,-2.7,-2.8, 1.2,-2.0,-3.9,-0.5, 0.2,-0.1,-4.3,-2.7,-1.9,-1.0,-9.9], # E
  108. [ 0.5,-1.0, 0.4, 0.1,-2.0,-1.0,-0.8, 6.6,-1.4,-4.5,-4.4,-1.1,-3.5,-5.2,-1.6, 0.4,-1.1,-4.0,-4.0,-3.3,-1.0,-9.9], # G
  109. [-0.8, 0.6, 1.2, 0.4,-1.3, 1.2, 0.4,-1.4, 6.0,-2.2,-1.9, 0.6,-1.3,-0.1,-1.1,-0.2,-0.3,-0.8,-2.2,-2.0,-1.0,-9.9], # H
  110. [-0.8,-2.4,-2.8,-3.8,-1.1,-1.9,-2.7,-4.5,-2.2, 4.0, 2.8,-2.1, 2.5, 1.0,-2.6,-1.8,-0.6,-1.8,-0.7, 3.1,-1.0,-9.9], # I
  111. [-1.2,-2.2,-3.0,-4.0,-1.5,-1.6,-2.8,-4.4,-1.9, 2.8, 4.0,-2.1, 2.8, 2.0,-2.3,-2.1,-1.3,-0.7, 0.0, 1.8,-1.0,-9.9], # L
  112. [-0.4, 2.7, 0.8, 0.5,-2.8, 1.5, 1.2,-1.1, 0.6,-2.1,-2.1, 3.2,-1.4,-3.3,-0.6, 0.1, 0.1,-3.5,-2.1,-1.7,-1.0,-9.9], # K
  113. [-0.7,-1.7,-2.2,-3.0,-0.9,-1.0,-2.0,-3.5,-1.3, 2.5, 2.8,-1.4, 4.3, 1.6,-2.4,-1.4,-0.6,-1.0,-0.2, 1.6,-1.0,-9.9], # M
  114. [-2.3,-3.2,-3.1,-4.5,-0.8,-2.6,-3.9,-5.2,-0.1, 1.0, 2.0,-3.3, 1.6, 7.0,-3.8,-2.8,-2.2, 3.6, 5.1, 0.1,-1.0,-9.9], # F
  115. [ 0.3,-0.9,-0.9,-0.7,-3.1,-0.2,-0.5,-1.6,-1.1,-2.6,-2.3,-0.6,-2.4,-3.8, 7.6, 0.4, 0.1,-5.0,-3.1,-1.8,-1.0,-9.9], # P
  116. [ 1.1,-0.2, 0.9, 0.5, 0.1, 0.2, 0.2, 0.4,-0.2,-1.8,-2.1, 0.1,-1.4,-2.8, 0.4, 2.2, 1.5,-3.3,-1.9,-1.0,-1.0,-9.9], # S
  117. [ 0.6,-0.2, 0.5, 0.0,-0.5, 0.0,-0.1,-1.1,-0.3,-0.6,-1.3, 0.1,-0.6,-2.2, 0.1, 1.5, 2.5,-3.5,-1.9, 0.0,-1.0,-9.9], # T
  118. [-3.6,-1.6,-3.6,-5.2,-1.0,-2.7,-4.3,-4.0,-0.8,-1.8,-0.7,-3.5,-1.0, 3.6,-5.0,-3.3,-3.5,14.2, 4.1,-2.6,-1.0,-9.9], # W
  119. [-2.2,-1.8,-1.4,-2.8,-0.5,-1.7,-2.7,-4.0,-2.2,-0.7, 0.0,-2.1,-0.2, 5.1,-3.1,-1.9,-1.9, 4.1, 7.8,-1.1,-1.0,-9.9], # Y
  120. [ 0.1,-2.0,-2.2,-2.9, 0.0,-1.5,-1.9,-3.3,-2.0, 3.1, 1.8,-1.7, 1.6, 0.1,-1.8,-1.0, 0.0,-2.6,-1.1, 3.4,-1.0,-9.9], # V
  121. [-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,+1.0,-9.9], # X
  122. [-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9,-9.9] # ~
  123. );
  124. # A R N D C Q E G H I L K M F P S T W Y V X
  125. my @BLOSUM62 = (
  126. [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0, 0,-9],
  127. [-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1,-9],
  128. [-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-1,-9],
  129. [-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-1,-9],
  130. [ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-2,-9],
  131. [-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-1,-9],
  132. [-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-1,-9],
  133. [ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-9],
  134. [-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-1,-9],
  135. [-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-1,-9],
  136. [-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-1,-9],
  137. [-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-1,-9],
  138. [-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-1,-9],
  139. [-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-1,-9],
  140. [-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-9],
  141. [ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0,-9],
  142. [ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0, 0,-9],
  143. [-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-2,-9],
  144. [-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-1,-9],
  145. [ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-1,-9],
  146. [ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,+1,-9],
  147. [-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9]
  148. );
  149. # print("Substitution matrix:\n");
  150. # for ($a=0; $a<=20; $a++) {
  151. # for ($b=0; $b<=20; $b++) {
  152. # printf("%6.1f ",$Sab[$a][$b]);
  153. # }
  154. # printf("\n");
  155. # }
  156. # Set substitution matrix in bits (do only at first call of one of the alignment routines)
  157. sub SetSubstitutionMatrix {
  158. if ($firstcall) {
  159. # Transform to bits;
  160. if (defined($matrix) && $matrix eq "Gonnet") {
  161. for (my $a=0; $a<=20; ++$a) {
  162. for (my $b=0; $b<=20; ++$b) {
  163. $Sab[$a][$b] = $Gonnet[$a][$b]*0.3322; # 1*log(10)/log(2);
  164. }
  165. }
  166. } elsif (defined($matrix) && $matrix eq "Blosum62") {
  167. {printf("Using Blosum62 matrix...\n");}
  168. for (my $a=0; $a<=20; $a++) {
  169. for (my $b=0; $b<=20; $b++) {
  170. $Sab[$a][$b] = $BLOSUM62[$a][$b];
  171. }
  172. }
  173. } else {
  174. for (my $a=0; $a<20; ++$a) {
  175. for (my $b=0; $b<20; ++$b) {
  176. $Sab[$a][$b] = -1;
  177. }
  178. $Sab[$a][$a] = 2;
  179. }
  180. for (my $b=0; $b<=20; ++$b) {
  181. $Sab[20][$b] = $Sab[$b][20] = 0;
  182. $Sab[21][$b] = $Sab[$b][21] = -10;
  183. }
  184. $Sab[20][20] = $Sab[20][20] = +1;# if in doubt, match X with X
  185. }
  186. $firstcall=0;
  187. }
  188. }
  189. # maxbt(val1,...,valx,\$bt) finds maximum of values and puts index of maximum into $bt
  190. sub maxbt {
  191. my $rbt=pop @_; # last element of @_ is address of $bt
  192. my $max = shift;
  193. my $i=0;
  194. $$rbt = 0;
  195. foreach $_ (@_) {
  196. $i++;
  197. if ($_>$max) {$max=$_; $$rbt=$i;}
  198. }
  199. return $max;
  200. }
  201. # max3bt(val1,val2,val3,\$bt) finds maximum of values and puts index of maximum into $bt
  202. sub max3bt {
  203. if ($_[1] < $_[0]) {
  204. if ($_[2] < $_[0]) {
  205. ${$_[3]}=0;
  206. return $_[0];
  207. } else {
  208. ${$_[3]}=2;
  209. return $_[2];
  210. }
  211. } else {
  212. if ($_[2] < $_[1]) {
  213. ${$_[3]}=1;
  214. return $_[1];
  215. } else {
  216. ${$_[3]}=2;
  217. return $_[2];
  218. }
  219. }
  220. }
  221. # max2bt(val1,val2,\$bt) finds maximum of values and puts index of maximum into $bt
  222. sub max2bt {
  223. if ($_[1] < $_[0]) {
  224. ${$_[2]}=0;
  225. return $_[0];
  226. } else {
  227. ${$_[2]}=1;
  228. return $_[1];
  229. }
  230. }
  231. #############################################################################
  232. # Subroutien AlignSW
  233. # Smith-Waterman local alignment
  234. #############################################################################
  235. sub AlignSW {
  236. if (@_>=1) {$xseq=$_[0];}
  237. if (@_>=2) {$yseq=$_[1];}
  238. if (@_>=3) {$ri=$_[2];}
  239. if (@_>=4) {$rj=$_[3];}
  240. if (@_>=5) {$imin=$_[4];}
  241. if (@_>=6) {$imax=$_[5];}
  242. if (@_>=7) {$jmin=$_[6];}
  243. if (@_>=8) {$jmax=$_[7];}
  244. if (@_>=9) {$Sstr=$_[8];}
  245. if (@_>=10) {$rS=$_[9];}
  246. if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
  247. if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
  248. my @xchr; # ASCII characters of $xseq
  249. my @ychr; # ASCII characters of $yseq
  250. my @xres; # internal integer representation of residues of x
  251. my @yres; # internal integer representation of residues of y
  252. $$xseq =~ s/\s//g;
  253. $$yseq =~ s/\s//g;
  254. @xchr = split(//,$$xseq);
  255. @ychr = split(//,$$yseq);
  256. my $Lx=@xchr; # length of sequence x
  257. my $Ly=@ychr; # length of sequence y
  258. my @M; # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
  259. my @A; # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
  260. my @B; # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
  261. my @Mbt; # $Mbt[a][b] = 0:STOP 1:M 2:A 3:B
  262. my @Abt; # $Abt[a][b] = 0:A 1:M
  263. my @Bbt; # $Bbt[a][b] = 0:B 1:M
  264. my $score; # bit score of alignment
  265. my $bt; # backtracing variable set by &maxbt: which argument was largest? (first=0)
  266. my $state; # STOP:0 M:1 A:2 B:3
  267. my ($i, $j); # indices for sequence x and y, respectively
  268. my $dx = $main::dx;
  269. my $dy = $main::dy;
  270. if (! defined $dx) {$dx = $main::d;}
  271. if (! defined $dy) {$dy = $main::d;}
  272. # Transform @xres and @yres to integer
  273. for ($i=0; $i<@xchr; $i++) {
  274. my $a=ord(uc($xchr[$i]));
  275. if ($a<65 || $a>90) {
  276. if ($a!=ord(".") && $a!=ord("-") && $a!=ord("~")) {
  277. printf(STDERR "\nWARNING: invalid symbol '%s' in pos $i of first sequence to be aligned\n",$xchr[$i]);
  278. }
  279. $xres[$i]=21;
  280. } else {
  281. $xres[$i]=$ch2i[$a-65];
  282. }
  283. }
  284. for ($j=0; $j<@ychr; $j++) {
  285. my $a=ord(uc($ychr[$j]));
  286. if ($a<65 || $a>90) {
  287. if ($a!=ord(".") && $a!=ord("-") && $a!=ord("~")) {
  288. printf(STDERR "\nWARNING: invalid symbol '%s' in pos $j of second sequence to be aligned\n",$ychr[$j]);
  289. }
  290. $yres[$j]=21;
  291. } else {
  292. $yres[$j]=$ch2i[$a-65];
  293. }
  294. }
  295. unshift (@xres,21); unshift (@xchr," "); # insert dummy 0'th element
  296. unshift (@yres,21); unshift (@ychr," "); # insert dummy 0'th element
  297. &SetSubstitutionMatrix;
  298. # Initialization
  299. for ($i=0; $i<=$Lx; $i++) {
  300. $M[$i][0]=-999; $A[$i][0]=-999; $B[$i][0]=-999;
  301. }
  302. for ($j=1; $j<=$Ly; $j++) {
  303. $M[0][$j]=-999; $A[0][$j]=-999; $B[0][$j]=-999;
  304. }
  305. # Iteration
  306. for ($i=1; $i<=$Lx; ++$i) {
  307. my $Mi =$M[$i];
  308. my $Mi1=$M[$i-1];
  309. my $Ai =$A[$i];
  310. my $Ai1=$A[$i-1];
  311. my $Bi =$B[$i];
  312. my $Bi1=$B[$i-1];
  313. my $Sabx=$Sab[$xres[$i]];
  314. my $j1=0;
  315. for ($j=1; $j<=$Ly; ++$j, ++$j1) {
  316. ${$Mi}[$j] = max3bt(${$Mi1}[$j1], ${$Ai1}[$j1], ${$Bi1}[$j1], \$Mbt[$i][$j]) + ${$Sabx}[$yres[$j]];
  317. ${$Ai}[$j] = max2bt(${$Ai}[$j1]-$main::e, ${$Mi}[$j1]-$dx, \$Abt[$i][$j]);
  318. ${$Bi}[$j] = max2bt(${$Bi1}[$j]-$main::e, ${$Mi1}[$j]-$dy, \$Bbt[$i][$j]);
  319. }
  320. }
  321. # Finding maximum
  322. $score = -1000;
  323. for ($i=1; $i<=$Lx; $i++) {
  324. my $Mi =$M[$i];
  325. for ($j=1; $j<=$Ly; $j++) {
  326. if (${$Mi}[$j]>$score) {$score=${$Mi}[$j]; $$imax=$i; $$jmax=$j;}
  327. }
  328. }
  329. # Backtracing
  330. @$ri=();
  331. @$rj=();
  332. @$rS=();
  333. $state=1; # last state is M
  334. $i=$$imax; $j=$$jmax;
  335. $$xseq=""; $$yseq="";
  336. while ($state) {
  337. $i >= 0 or die("Error: \$i < 0 on line 370 in module Align.pm");
  338. $j >= 0 or die("Error: \$j < 0 on line 371 in module Align.pm");
  339. if ($state==1) {
  340. # current state is M (match-match)
  341. unshift(@$ri,$i);
  342. unshift(@$rj,$j);
  343. $state = $Mbt[$i][$j];
  344. $$xseq=$xchr[$i].$$xseq;
  345. $$yseq=$ychr[$j].$$yseq;
  346. unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
  347. $$imin=$i; $$jmin=$j;
  348. $i--; $j--;
  349. } elsif ($state==2) {
  350. # current state is A (gap in x)
  351. unshift(@$ri,0);
  352. unshift(@$rj,$j);
  353. $$xseq="-".$$xseq;
  354. $$yseq=$ychr[$j].$$yseq;
  355. $bt = $Abt[$i][$j--];
  356. if ($bt) {
  357. # previous state was M
  358. unshift(@$rS,-$dx);
  359. $state = 1;
  360. } else {
  361. # previous state was A
  362. unshift(@$rS,-$main::e);
  363. }
  364. } else {
  365. # current state is B (gap in y)
  366. unshift(@$ri,$i);
  367. unshift(@$rj,0);
  368. $$xseq=$xchr[$i].$$xseq;
  369. $$yseq="-".$$yseq;
  370. $bt = $Bbt[$i--][$j];
  371. if ($bt) {
  372. # previous state was M
  373. unshift(@$rS,-$dy);
  374. $state = 1;
  375. } else {
  376. # previous state was B
  377. unshift(@$rS,-$main::e);
  378. }
  379. }
  380. }
  381. # Set annotation string representing match quality
  382. $$Sstr="";
  383. for (my $col=0; $col<@$ri; $col++) {
  384. if ($xres[$ri->[$col]] eq $yres[$rj->[$col]]) {
  385. $$Sstr.=uc($xchr[$ri->[$col]]);
  386. } elsif ($rS->[$col] > 0 ) {
  387. $$Sstr.="+";
  388. } else {
  389. $$Sstr.=".";
  390. }
  391. }
  392. return $score;
  393. }
  394. #############################################################################
  395. # Subroutien AlignNW
  396. # Needleman-Wunsch global alignment
  397. #############################################################################
  398. sub AlignNW {
  399. if (@_>=1) {$xseq=$_[0];}
  400. if (@_>=2) {$yseq=$_[1];}
  401. if (@_>=3) {$ri=$_[2];}
  402. if (@_>=4) {$rj=$_[3];}
  403. if (@_>=5) {$imin=$_[4];}
  404. if (@_>=6) {$imax=$_[5];}
  405. if (@_>=7) {$jmin=$_[6];}
  406. if (@_>=8) {$jmax=$_[7];}
  407. if (@_>=9) {$Sstr=$_[8];}
  408. if (@_>=10) {$rS=$_[9];}
  409. if (length($$xseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
  410. if (length($$yseq)<1) {warn ("ERROR in Align.pm: sequence x is empty\n"); return 0;}
  411. my @xchr; # ASCII characters of $xseq
  412. my @ychr; # ASCII characters of $yseq
  413. my @xres; # internal integer representation of residues of x
  414. my @yres; # internal integer representation of residues of y
  415. $$xseq =~ s/\s//g;
  416. $$yseq =~ s/\s//g;
  417. @xchr = split(//,$$xseq);
  418. @ychr = split(//,$$yseq);
  419. my $Lx=@xchr; # length of sequence x
  420. my $Ly=@ychr; # length of sequence y
  421. my @M; # $M[a][b] = score of best alignment of x[1..a] and y[1..b] ending in match state
  422. my @A; # $A[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in x
  423. my @B; # $B[a][b] = score of best alignment of x[1..a] and y[1..b] ending in gap in y
  424. my @Mbt; # $Mbt[a][b] = 0:STOP 1:M 2:A 3:B
  425. my @Abt; # $Abt[a][b] = 0:A 1:M
  426. my @Bbt; # $Bbt[a][b] = 0:B 1:M
  427. my $score; # bit score of alignment
  428. my $bt; # backtracing variable set by &maxbt: which argument was largest? (first=0)
  429. my $state; # STOP:0 M:1 A:2 B:3
  430. my ($i, $j); # indices for sequence x and y, respectively
  431. my $dx = $main::dx;
  432. my $dy = $main::dy;
  433. if (! defined $dx) {$dx = $main::d;}
  434. if (! defined $dy) {$dy = $main::d;}
  435. # Transform @xres and @yres to integer
  436. for ($i=0; $i<@xchr; $i++) {
  437. my $a=ord(uc($xchr[$i]));
  438. if ($a<65 || $a>90) {
  439. if ($a!=ord(".") && $a!=ord("-") && $a!=ord("~")) {
  440. printf(STDERR "\nWARNING: invalid symbol '%s' in pos $i of first sequence to be aligned\n",$xchr[$i]);
  441. }
  442. $xres[$i]=21;
  443. } else {
  444. $xres[$i]=$ch2i[$a-65];
  445. }
  446. }
  447. for ($j=0; $j<@ychr; $j++) {
  448. my $a=ord(uc($ychr[$j]));
  449. if ($a<65 || $a>90) {
  450. if ($a!=ord(".") && $a!=ord("-") && $a!=ord("~")) {
  451. printf(STDERR "\nWARNING: invalid symbol '%s' in pos $j of second sequence to be aligned\n",$ychr[$j]);
  452. }
  453. $yres[$j]=21;
  454. } else {
  455. $yres[$j]=$ch2i[$a-65];
  456. }
  457. }
  458. unshift (@xres,21); unshift (@xchr," "); # insert dummy 0'th element
  459. unshift (@yres,21); unshift (@ychr," "); # insert dummy 0'th element
  460. &SetSubstitutionMatrix;
  461. my $OutsidePenalty = 999; # original value
  462. # To avoid that i or j gets negative, set OutsidePenalty high enough
  463. # It might be more efficient, to stop if $i=0 or $j=0 and add the gaps
  464. my $maxlen = $Lx;
  465. if ($Ly > $maxlen) { $maxlen = $Ly; }
  466. my $Maxpenalty = 9; # absolute of minimum value from BLOSUM62
  467. if ($main::g > $Maxpenalty) { $Maxpenalty = $main::g; }
  468. $OutsidePenalty = ($maxlen + 1) * $Maxpenalty;
  469. if ($OutsidePenalty < 999) { $OutsidePenalty = 999; }
  470. # Initialization
  471. $M[0][0]=$A[0][0]=$B[0][0]=0;
  472. for ($i=1; $i<=$Lx; $i++) {
  473. $M[$i][0] = -1.0 * $OutsidePenalty;
  474. $A[$i][0] = -1.0 * $OutsidePenalty;
  475. $B[$i][0] = -$i*$main::g;
  476. $Bbt[$i][0] = 0; # previous state was B as well (gap in y)
  477. }
  478. for ($j=1; $j<=$Ly; $j++) {
  479. $M[0][$j] = -1.0 * $OutsidePenalty;
  480. $A[0][$j] = -$j*$main::g;
  481. $B[0][$j] = -1.0 * $OutsidePenalty;
  482. $Abt[0][$j] = 0; # previous state was A as well (gap in x)
  483. }
  484. # Iteration
  485. for ($i=1; $i<=$Lx; ++$i) {
  486. my $Mi =$M[$i];
  487. my $Mi1=$M[$i-1];
  488. my $Ai =$A[$i];
  489. my $Ai1=$A[$i-1];
  490. my $Bi =$B[$i];
  491. my $Bi1=$B[$i-1];
  492. my $Sabx=$Sab[$xres[$i]];
  493. my $j1=0;
  494. for ($j=1; $j<=$Ly; ++$j, ++$j1) {
  495. ${$Mi}[$j] = max3bt(${$Mi1}[$j1], ${$Ai1}[$j1], ${$Bi1}[$j1], \$Mbt[$i][$j]) + ${$Sabx}[$yres[$j]];
  496. ${$Ai}[$j] = max2bt(${$Ai}[$j1]-$main::e, ${$Mi}[$j1]-$dx, \$Abt[$i][$j]);
  497. ${$Bi}[$j] = max2bt(${$Bi1}[$j]-$main::e, ${$Mi1}[$j]-$dy, \$Bbt[$i][$j]);
  498. }
  499. }
  500. # Finding maximum
  501. $score = -1.0 * $OutsidePenalty;
  502. for ($i=1; $i<=$Lx; $i++) {
  503. my $endgappenalty = ($Lx-$i)*$main::g;
  504. if ($M[$i][$Ly]-$endgappenalty > $score) {
  505. $score=$M[$i][$Ly]-$endgappenalty; $$imax=$i; $$jmax=$Ly; $state = 1;
  506. }
  507. if ($A[$i][$Ly]-$endgappenalty > $score) {
  508. $score=$A[$i][$Ly]-$endgappenalty; $$imax=$i; $$jmax=$Ly; $state = 2;
  509. }
  510. if ($B[$i][$Ly]-$endgappenalty > $score) {
  511. $score=$B[$i][$Ly]-$endgappenalty; $$imax=$i; $$jmax=$Ly; $state = 3;
  512. }
  513. }
  514. for ($j=1; $j<$Ly; $j++) {
  515. my $endgappenalty = ($Ly-$j)*$main::g;
  516. if ($M[$Lx][$j]-$endgappenalty > $score) {
  517. $score=$M[$Lx][$j]-$endgappenalty; $$imax=$Lx; $$jmax=$j; $state = 1;
  518. }
  519. if ($A[$Lx][$j]-$endgappenalty > $score) {
  520. $score=$A[$Lx][$j]-$endgappenalty; $$imax=$Lx; $$jmax=$j; $state = 2;
  521. }
  522. if ($B[$Lx][$j]-$endgappenalty > $score) {
  523. $score=$B[$Lx][$j]-$endgappenalty; $$imax=$Lx; $$jmax=$j; $state = 3;
  524. }
  525. }
  526. # Make sure the end gapped regions are also backtraced
  527. if ($$jmax<$Ly) {
  528. $Abt[$Lx][$$jmax+1] = $state;
  529. for ($j=$$jmax+2; $j<=$Ly; $j++) {$Abt[$Lx][$j] = 0;}
  530. $state = 2;
  531. } elsif ($$imax<$Lx) {
  532. $Bbt[$$imax+1][$Ly] = $state;
  533. for ($i=$$imax+2; $i<=$Lx; $i++) {$Bbt[$i][$Ly] = 0;}
  534. $state = 3;
  535. } else {
  536. $state = 1;
  537. }
  538. # Backtracing
  539. @$ri=();
  540. @$rj=();
  541. @$rS=();
  542. $i=$Lx; $j=$Ly;
  543. $$xseq=""; $$yseq="";
  544. while ($i || $j) {
  545. $i >= 0 or die("Error: \$i < 0 on line 595 in module Align.pm");
  546. $j >= 0 or die("Error: \$j < 0 on line 596 in module Align.pm");
  547. if ($state==1) {
  548. # current state is M (match-match)
  549. unshift(@$ri,$i);
  550. unshift(@$rj,$j);
  551. $state = $Mbt[$i][$j]+1; # previous state
  552. $$xseq=$xchr[$i].$$xseq;
  553. $$yseq=$ychr[$j].$$yseq;
  554. unshift(@$rS, $Sab[$xres[$i]][$yres[$j]]);
  555. $$imin=$i; $$jmin=$j;
  556. $i--; $j--;
  557. } elsif ($state==2) {
  558. # current state is A (gap in x)
  559. unshift(@$ri,0); # $ri->[$col]=0 for gap in $x
  560. unshift(@$rj,$j);
  561. $$xseq="-".$$xseq;
  562. $$yseq=$ychr[$j].$$yseq;
  563. $bt = $Abt[$i][$j--];
  564. if ($bt) {
  565. # previous state was M
  566. if ($i==$Lx || $i==0) {
  567. unshift(@$rS,-$main::g); # end gap
  568. } else {
  569. unshift(@$rS,-$dx); # gap opening
  570. }
  571. $state = 1;
  572. } else {
  573. # previous state was A
  574. if ($i==$Lx || $i==0) {
  575. unshift(@$rS,-$main::g); # end gap
  576. } else {
  577. unshift(@$rS,-$main::e); # gap extension
  578. }
  579. }
  580. } else {
  581. # current state is B (gap in y)
  582. unshift(@$ri,$i);
  583. unshift(@$rj,0); # $j[$col]=0 for gap in $y
  584. $$xseq=$xchr[$i].$$xseq;
  585. $$yseq="-".$$yseq;
  586. $bt = $Bbt[$i--][$j];
  587. if ($bt) {
  588. # previous state was M
  589. if ($j==$Ly || $j==0) {
  590. unshift(@$rS,-$main::g); # end gap
  591. } else {
  592. unshift(@$rS,-$dy); # gap opening
  593. }
  594. $state = 1;
  595. } else {
  596. # previous state was B
  597. if ($j==$Ly || $j==0) {
  598. unshift(@$rS,-$main::g); # end gap
  599. } else {
  600. unshift(@$rS,-$main::e); # gap extension
  601. }
  602. }
  603. }
  604. }
  605. # Set annotation string representing match quality
  606. $$Sstr="";
  607. for (my $col=0; $col<@$ri; $col++) {
  608. if ($xres[$ri->[$col]] eq $yres[$rj->[$col]]) {
  609. $$Sstr.=uc($xchr[$ri->[$col]]);
  610. } elsif ($rS->[$col] > 0 ) {
  611. $$Sstr.="+";
  612. } else {
  613. $$Sstr.=".";
  614. }
  615. }
  616. return $score;
  617. }
  618. 1;