mergeali.pl 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. #! /usr/bin/perl
  2. # mergeali.pl- Merge multiple alignments in A3M format via a multiple alignment of their seed sequences.
  3. # Usage: mergeali.pl [-i] infile.fas [-o] outfile.a3m [options]
  4. # (C) Johannes Soeding, 2012
  5. # HHsuite version 3.0.0 (15-03-2015)
  6. #
  7. # Reference:
  8. # Remmert M., Biegert A., Hauser A., and Soding J.
  9. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  10. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. # This program is distributed in the hope that it will be useful,
  16. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. # GNU General Public License for more details.
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. # We are very grateful for bug reports! Please contact us at [email protected]
  22. use lib $ENV{"HHLIB"}."/scripts";
  23. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  24. use Align;
  25. use File::Temp "tempfile";
  26. use strict;
  27. $|= 1; # Activate autoflushing on STDOUT
  28. # Default parameters
  29. our $d=1; # gap opening penatlty for Align.pm
  30. our $e=0.1; # gap extension penalty for Align.pm
  31. our $g=0.0; # endgap penalty for Align.pm
  32. my $program="mergeali.pl";
  33. my $v=2; # 3: DEBUG
  34. my $help="
  35. mergeali.pl - Merge multiple sequence alignments (MSAs) in a3m format via a FASTA-formatted
  36. master MSA of (parts of) their seed sequences.
  37. The file names of the 'slave' MSAs must be of the form <seqname>.a3m with <seqname> the name of
  38. the corresponding sequence in the master MSA. For example, if the name line in the master MSA
  39. is '>F6Y4V2_MONDO Uncharacterized protein...', the file containing the MSA for this sequence
  40. must be called F6Y4V2_MONDO.a3m and be found in the working directory (or some other
  41. directories specified using '-d <dirs>'.
  42. The sequences in the master MSA must be subsequences of the seed sequences in the A3M-formatted
  43. slave MSAs. They also must have a match state assignment by their seed (=first) sequence (as
  44. produced by hhblits or by reformat.pl with option -M first).
  45. DSSP state sequences will be included by default, and a consensus DSSP state sequence is
  46. appended at the top. The output file is generated in A3M format.
  47. Usage: $program <infile_master.fas> <outfile.a3m> [options]
  48. Options:
  49. -all match states are all columns with a residue in either of the seed sequences (default)
  50. -first match states are all columns with a residue in the first of the seed sequences
  51. -d <dirs> directories (separated by spaces) where to find the *.a3m alignments (default='.')
  52. -mark mark the seed sequences of slave MSAs in the master alignment by inserting a \@
  53. before their name: '>\@name'
  54. -diff <int> use only the <int> most different sequences from each alignment for merging
  55. (calls hhfilter)
  56. -name <name> write a name line '#name' as first line into the output file
  57. -full for the first seed sequence in the master alignment, use the full-length sequence from the alignment
  58. (including leading and trailing residues)
  59. -v <int> verbose mode (0: no messages; 1: only warnings; 2: verbose)
  60. Example: $program 1mkaA_templ.fas 1mkaA_full.a3m -d '. /home/soeding/pdb_8Jun05 /home/soeding/scop70_1.67'
  61. \n";
  62. # Processing command line options
  63. if (@ARGV<1) {die $help;}
  64. # Variable declarations
  65. my $infile="";
  66. my $outfile="";
  67. my @indirs=(".",""); # array containing all input directories where alignment files can be found
  68. # (default: current directory or absolute path given)
  69. my $indir; # variable looping through @indirs
  70. my $tmpfile; # temporary file used for filtering
  71. my $baseout; # outfile without extension
  72. my $alifile; # one of the alignment files to be merged
  73. my $nseed=0; # counts seed sequences in infile
  74. my $seed_in; # sequence read from input file
  75. my @seed_in; # residues of seed sequenences in $infile
  76. my @names; # seed names
  77. my $seed_ali; # sequence read from alignment files and to be reformatted by inserting the gaps from $seed_in
  78. my $i; # counts match states of $seed_in from $infile
  79. my $j; # counts match states of $seed from $alifile
  80. my $col; # columns in alignment of $seed_in (from $infile) against $seed (from $alifile)
  81. my ($imin,$imax,$jmin,$jmax);
  82. my @jj; # $jj[$i] is residue from $seed_ali aligned with residue $i of $seed_in
  83. my $mark=0; # do not mark seed sequences by introducing a @ before their name
  84. my $diff=""; # use only the X most different sequences per alignment?
  85. my $nseq=0; # total number of sequences in mega-alignment
  86. my $all=1; # 0: match states according to first sequence
  87. my @match=(); # index array for match states of first seed sequence
  88. my $options="";
  89. my (@H, @E, @C, @G, @B, @S, @T); # counts with DSSP states for consensus calculation
  90. my @seqs=(); # sequence records to print into outfile
  91. my $aliname; # name of alignment specified by -name option
  92. my $full=0; # don't add leading and trailing residues from seed_ali
  93. # Set options
  94. for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
  95. if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
  96. if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
  97. if ($options=~/ -d\s+(([^-]\S*\s+)+)/) {@indirs=split(/\s+/,$1); $options=~s/ -d\s+(([^-]\S*\s+)+)//g;}
  98. if ($options=~s/ -mark //) {$mark=1;}
  99. if ($options=~s/ -first //) {$all=0;}
  100. if ($options=~s/ -all //) {$all=1;}
  101. if ($options=~s/ -diff\s+(\S+) //) {$diff=$1;}
  102. if ($options=~s/ -name\s+(.*?) -/-/) {$aliname=$1;}
  103. elsif ($options=~s/ -name\s+(.*?)\s*$//) {$aliname=$1;}
  104. if ($options=~s/ -full //) {$full=1;}
  105. if ($options=~s/ -v\s+(\d+) //) {$v=$1;}
  106. elsif ($options=~s/ -v //) {$v=1;}
  107. # Read infile and outfile
  108. if (!$infile && $options=~s/^\s*([^-]\S+)\s*//) {$infile=$1;}
  109. if (!$outfile && $options=~s/^\s*([^-]\S+)\s*//) {$outfile=$1;}
  110. # Warn if unknown options found or no infile/outfile
  111. if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
  112. if ($infile eq "") {die("$help\nError in $program: input file missing: $!\n");}
  113. if ($outfile eq "") {die("$help\nError in $program: output file missing: $!\n");}
  114. # Determine output format (A3M or FASTA) and basename of outfile
  115. if ($outfile=~/(\S*)\.(\S*?)$/) {
  116. $baseout=$1;
  117. } else {
  118. $baseout=$outfile;
  119. }
  120. if ($diff ne "") {
  121. if (!-d "/tmp/$ENV{USER}") {mkdir("/tmp/$ENV{USER}",0777);}
  122. my $handle;
  123. ($handle,$tmpfile)=tempfile("XXXXXXXXXX",SUFFIX => ".a3m",DIR=>"/tmp/$ENV{USER}");
  124. close $handle or die "Error: unable to close $tmpfile\n";
  125. }
  126. $aliname=~s/\s+/ /g;
  127. $aliname=~s/\s+/ /g;
  128. $aliname=~s/\s+$//g;
  129. # Read input file into @names, @seqs
  130. $/=">"; # set input field seperator
  131. open (INFILE,"<$infile") || die ("Error: could not open $infile for reading: $!\n");
  132. $seed_in=<INFILE>; # skip first line
  133. while ($seed_in=~s/(.)>$/$1/) {$seed_in.=<INFILE>;} # skip first line
  134. while ($seed_in=<INFILE>) {
  135. if ($seed_in eq ">") {next;}
  136. while ($seed_in=~s/(.)>$/$1/) {$seed_in.=<INFILE>;} # if nameline contains a '>'
  137. $seed_in=~s/(\S+)[^\n]*//;
  138. $names[$nseed]=$1;
  139. $seed_in=~tr/a-zA-Z.-//cd; # remove all invalid symbols from sequence, including ">" at the end
  140. $seed_in=~tr/a-z./A-Z-/; # transform to upper case
  141. # $seed_in=~tr/a-z.-/A-Z~~/; # transform to upper case
  142. $seed_in[$nseed]=$seed_in;
  143. $nseed++;
  144. }
  145. close INFILE;
  146. # Add leading and trailing residues from first seed sequence?
  147. if ($full) {
  148. $nseed=0;
  149. $seed_in=$seed_in[$nseed];
  150. $seed_in=~tr/.-//d; # remove all gaps
  151. # Open alignment file
  152. foreach $indir (@indirs) {
  153. $alifile=$indir."/".$names[$nseed].".a3m";
  154. my $alifilefas=$indir."/".$names[$nseed].".fas";
  155. if (-e $alifile) {last;}
  156. if (-e $alifilefas) {
  157. &HHPaths::System("perl $hhscripts/reformat.pl -M first $alifilefas $alifile");
  158. last;
  159. }
  160. $alifilefas="";
  161. }
  162. if ($alifile eq "") {die("Error: could not find $alifile in input directory/-ies.\n");}
  163. if ($diff ne "") {
  164. &HHPaths::System("hhfilter -v $v -i $alifile -o $tmpfile -diff $diff -cov 5");
  165. $alifile=$tmpfile;
  166. }
  167. # Read seed sequence in alifile
  168. open (ALIFILE,"<$alifile") || die ("Error: could not open $alifile for reading: $!\n");
  169. $seed_ali=<ALIFILE>; # skip first line
  170. while ($seed_ali=~s/(.)>$/$1/) {$seed_ali.=<ALIFILE>;} # skip first line
  171. while ($seed_ali=<ALIFILE>) {
  172. if ($seed_ali eq ">") {next;}
  173. while ($seed_ali=~s/(.)>$/$1/) {$seed_ali.=<ALIFILE>;} # if nameline contains a '>'
  174. $seed_ali=~s/^([^\n]+)//;
  175. my $nameline=$1;
  176. if ($nameline=~/^(ss_|aa_|sa_)/) {next;} # skip secondary structure sequences
  177. $seed_ali=~s/[^a-zA-Z.-]//g; # remove all invalid symbols from sequence, including ">" at the end
  178. if(($seed_ali=~tr/.-/.-/)>0) {die("Error: seed sequence in $alifile must not contain gaps!\n");}
  179. last;
  180. }
  181. close (ALIFILE);
  182. # Align $seed_in to $seed_ali
  183. my $xseq=$seed_in;
  184. my $yseq=uc($seed_ali);
  185. my @i;
  186. my @j;
  187. my $Sstr;
  188. my $score;
  189. # The aligned characters are returned in $i[$col] and $j[$col]
  190. $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr);
  191. # DEBUG
  192. if ($v>=3) {
  193. printf("SEED_IN: %s\n",$xseq);
  194. printf("MATCH: %s\n",$Sstr);
  195. printf("SEED_ALI: %s\n",$yseq);
  196. printf("score=%6.2f imax=%i imin=%i jmax=%i jmin=%i\n",$score,$imax,$imin,$jmax,$jmin);
  197. printf("\n");
  198. if ($v>=4) {
  199. for ($col=0; $col<@j && $col<200; $col++) {
  200. printf("%3i %3i %s %3i %s %3i %3i\n",$col,$i[$col],substr($seed_in,$i[$col],1),$j[$col],substr($seed_ali,$j[$col],1),$j[$col],$jj[$i[$col]]);
  201. }
  202. }
  203. }
  204. if ($jmin>1) {
  205. $seed_in[0] = substr($seed_ali,0,$jmin-1).$seed_in[0];
  206. my $leftgaps = ('-' x ($jmin-1));
  207. for ($nseed=1; $nseed<scalar(@seed_in); $nseed++) {
  208. $seed_in[$nseed] = $leftgaps.$seed_in[$nseed];
  209. }
  210. }
  211. my $L=length($seed_ali);
  212. if ($L-$jmax>0) {
  213. $seed_in[0] .= substr($seed_ali,$jmax,$L-$jmax);
  214. my $rightgaps = ('-') x ($L-$jmax);
  215. for ($nseed=1; $nseed<scalar(@seed_in); $nseed++) {
  216. $seed_in[$nseed] .= $rightgaps;
  217. }
  218. }
  219. }
  220. #########################################################################################################################
  221. # For each seed sequence from infile: open alignment file and reformat its sequences by inserting gaps read from infile
  222. for ($nseed=0; $nseed<scalar(@seed_in); $nseed++) {
  223. $seed_in=$seed_in[$nseed];
  224. my @seed_gaps=split(/[A-Z]/,$seed_in,-1); # $seed_gaps[$i] contains the gaps after the $i'th residue in $seed_in
  225. $seed_in=~tr/.-//d; # remove all gaps
  226. # Open alignment file
  227. foreach $indir (@indirs) {
  228. $alifile=$indir."/".$names[$nseed].".a3m";
  229. my $alifilefas=$indir."/".$names[$nseed].".fas";
  230. if (-e $alifile) {last;}
  231. if (-e $alifilefas) {
  232. &HHPaths::System("perl $hhscripts/reformat.pl -M first $alifilefas $alifile");
  233. last;
  234. }
  235. $alifilefas="";
  236. }
  237. if ($alifile eq "") {die("Error: could not find $alifile in input directory/-ies.\n");}
  238. if ($diff ne "") {
  239. &HHPaths::System("hhfilter -v $v -i $alifile -o $tmpfile -diff $diff -cov 5");
  240. $alifile=$tmpfile;
  241. }
  242. open (ALIFILE,"<$alifile") || die ("Error: could not open $alifile for reading: $!\n");
  243. if ($v>=3) { #DEBUG
  244. printf("seed_in=$seed_in\n");
  245. printf("seed_gaps=");
  246. for ($i=0; $i<@seed_gaps; $i++) {printf("%i%s ",$i,$seed_gaps[$i]);}
  247. print("\n");
  248. print("Name=".$names[$nseed]."\n");
  249. }
  250. # Read seed sequence in alifile
  251. $seed_ali=<ALIFILE>; # skip first line
  252. while ($seed_ali=~s/(.)>$/$1/) {$seed_ali.=<ALIFILE>;} # skip first line
  253. while ($seed_ali=<ALIFILE>) {
  254. if ($seed_ali eq ">") {next;}
  255. while ($seed_ali=~s/(.)>$/$1/) {$seed_ali.=<ALIFILE>;} # if nameline contains a '>'
  256. $seed_ali=~s/^([^\n]+)//;
  257. my $nameline=$1;
  258. if ($nameline=~/^(ss_|aa_|sa_)/) {next;} # skip secondary structure sequences
  259. $seed_ali=~s/[^a-zA-Z.-]//g; # remove all invalid symbols from sequence, including ">" at the end
  260. if(($seed_ali=~tr/.-/.-/)>0) {die("Error: seed sequence in $alifile must not contain gaps!\n");}
  261. last;
  262. }
  263. close (ALIFILE);
  264. # Align $seed_in to $seed_ali
  265. my $xseq=$seed_in;
  266. my $yseq=uc($seed_ali);
  267. my @i;
  268. my @j;
  269. my $Sstr;
  270. my $score;
  271. # The aligned characters are returned in $i[$col] and $j[$col]
  272. $score = &AlignNW(\$xseq,\$yseq,\@i,\@j,\$imin,\$imax,\$jmin,\$jmax,\$Sstr);
  273. # DEBUG
  274. if ($v>=3) {
  275. printf("SEED_IN: %s\n",$xseq);
  276. printf("MATCH: %s\n",$Sstr);
  277. printf("SEED_ALI: %s\n",$yseq);
  278. printf("score=%6.2f imax=%i imin=%i jmax=%i jmin=%i\n",$score,$imax,$imin,$jmax,$jmin);
  279. printf("\n");
  280. if ($v>=4) {
  281. for ($col=0; $col<@j && $col<200; $col++) {
  282. printf("%3i %3i %s %3i %s %3i %3i\n",$col,$i[$col],substr($seed_in,$i[$col],1),$j[$col],substr($seed_ali,$j[$col],1),$j[$col],$jj[$i[$col]]);
  283. }
  284. }
  285. }
  286. # Does alignment look reasonable?
  287. my $sc=$score/&max(1,&max($imax-$imin,$jmax-$jmin));
  288. if ($sc<0.5) {
  289. printf("\nWARNING: alignment for %s has score per column of only %6.2f Skipping alignment\n",$names[$nseed],$sc,);
  290. if ($v>=2) {
  291. printf("SEED_IN: %s\n",$xseq);
  292. printf("MATCH: %s\n",$Sstr);
  293. printf("SEED_ALI: %s\n",$yseq);
  294. printf("score=%6.2f imax=%i imin=%i jmax=%i jmin=%i\n",$score,$imax,$imin,$jmax,$jmin);
  295. }
  296. next;
  297. }
  298. # Calculate $jj[$i]: residue number from $seed_ali aligned with residue $i of $seed_in (first=1)
  299. @jj=(); $jj[0]=0;
  300. for ($col=0; $col<@i; $col++) {
  301. $i=$i[$col];
  302. $j=$j[$col];
  303. if ($i>0) {
  304. if ($j>0) {
  305. $jj[$i]=$j;
  306. } else {
  307. $jj[$i]=0;
  308. }
  309. }
  310. }
  311. # For each sequence in alifile: reformat sequence by inserting gaps read from infile after each match state
  312. my $first=1; # seed sequences not yet marked?
  313. my $res; # sequence from alignment file
  314. open (ALIFILE,"<$alifile") || die ("Error: could not open $alifile for reading: $!\n");
  315. $res=<ALIFILE>; # skip first line
  316. while (1) {if($res=~s/(.)>/$1/) {$res.=<ALIFILE>;} else {last;}} # skip first line
  317. while ($res=<ALIFILE>) {
  318. if ($res eq ">") {next;}
  319. while (1) {if($res=~s/(.)>/$1/) {$res.=<ALIFILE>;} else {last;}} # if nameline contains a '>'
  320. $res=~s/^([^\n]+)//;
  321. my $nameline=$1;
  322. # Skip secondary structure sequences (ss_dssp is needed for consensus determination)
  323. if ($nameline=~/^(ss_|sa_|aa_)/ && $nameline!~/^ss_dssp/) {next;}
  324. $res=~s/[^a-zA-Z.-]//g; # remove all invalid symbols from sequence, including ">" at the end
  325. # Split next $res sequence from alignment file into 'Match plus inserts' strings
  326. my @res=split(/(?=[A-Z-][a-z.]*)/,"X".$res); # $res[$j] contains $j'th match state plus inserts
  327. # printf("alifile: %-30.30s seq = \n%s\n",$alifile,$res);
  328. if (@res<$jmax) {
  329. printf("Error: sequence %-.20s in %s has fewer match states (%i) than there are residues \n",$nameline,scalar(@res),$alifile);
  330. printf("in the seed sequence of this alignment in $infile (%i).\n",$jmax);
  331. print("Remember alignment files must have match states exactly in those columns where the seed has a residue.\n");
  332. die();
  333. }
  334. # THIS IS WHERE THE MERGING REALLY HAPPENS:
  335. # For each residue in $seed_in: append corresponding Match+inserts string(s) from $res,
  336. # plus the gaps between residues in $seed_in
  337. $res=$seed_gaps[0];
  338. $j=$jmin;
  339. for ($i=1; $i<=$#jj; $i++) {
  340. if ($jj[$i]>0) {
  341. # Add Match+insert strings of $res that are NOT aligned to residues in $seed_in -> inserts
  342. for (; $j<$jj[$i]; $j++) {
  343. $res[$j]=~tr/A-Z-/a-z/d; # transform to inserts
  344. $res.=$res[$j];
  345. }
  346. $res.=$res[$j++].$seed_gaps[$i]; # add gaps from input alignment found after $i'th residue in $seed_in
  347. } else {
  348. # Residue in $seed_in aligned to gap in $seed_ali => add '-' to represent missing match state
  349. $res.="-".$seed_gaps[$i];
  350. }
  351. }
  352. if ($nameline=~/^ss_dssp/) {
  353. $res=~tr/a-z//d; # remove inserts from dssp sequence
  354. @res=unpack("C*",$res);
  355. for ($i=0; $i<@res; $i++) {
  356. if (! defined $H[$i]) {$H[$i]=$E[$i]=$C[$i]=$G[$i]=$B[$i]=$S[$i]=$T[$i]=0;}
  357. if ($res[$i]==72) {$H[$i]++;} # H
  358. elsif ($res[$i]==69) {$E[$i]++;} # E
  359. elsif ($res[$i]==67) {$C[$i]++;} # C
  360. elsif ($res[$i]==71) {$G[$i]++; $H[$i]+=0.35; $C[$i]+=0.4;} # G
  361. elsif ($res[$i]==66) {$B[$i]++; $E[$i]+=0.35; $C[$i]+=0.4;} # B
  362. elsif ($res[$i]==83) {$S[$i]++; $C[$i]+=0.4;} # S
  363. elsif ($res[$i]==84) {$T[$i]++; $C[$i]+=0.4;} # T
  364. elsif ($res[$i]==73) {$C[$i]++;} # I
  365. }
  366. next; # Skip dssp sequence
  367. } elsif ($first) {
  368. # If seed sequences are to be marked, place a @ before the name of the seed sequence
  369. if ($mark) {$nameline="\@"."$nameline";}
  370. $first=0;
  371. if ($all==0 && $nseed==0) {
  372. # Determine match state index array
  373. @res=unpack("C*",$res);
  374. for ($i=$j=0; $i<@res; $i++) {
  375. my $c=$res[$i];
  376. if ($c>=65 && $c<=90) {
  377. $match[$j++]=1;
  378. } elsif ($c==45) {
  379. $match[$j++]=0;
  380. }
  381. }
  382. }
  383. }
  384. # Turn all match states to inserts that don't have a residue in the first seed sequence
  385. &AssignMatchStates(\$res);
  386. push(@seqs,">$nameline\n$res\n");
  387. $nseq++;
  388. # DEBUG
  389. if ($v>=3) {
  390. printf(">%-80.80s\n%s\n",$nameline,$res);
  391. if ($v>=4) {
  392. printf("res= ");
  393. for ($i=$imin; $i<=$imax; $i++) {
  394. printf("i=%-2i jj=%-2i res[jj]=%-4.4s seed_gaps[i]=%s\n",$i,$jj[$i],$res[$jj[$i]],$seed_gaps[$i]);
  395. }
  396. }
  397. print("\n");
  398. }
  399. }
  400. close ALIFILE;
  401. }
  402. # For each seed sequence from infile: open alignment file and reformat its sequences by inserting gaps read from infile
  403. #########################################################################################################################
  404. # Add DSSP consensus sequence
  405. if (@H>10) {
  406. my $res="";
  407. my $c;
  408. my $max;
  409. for ($i=0; $i<@H; $i++) {
  410. $max=0; $c="-";
  411. if ($C[$i]>$max) {$max=$C[$i]; $c="C";}
  412. if ($H[$i]>$max) {$max=$H[$i]; $c="H";}
  413. if ($E[$i]>$max) {$max=$E[$i]; $c="E";}
  414. if ($G[$i]>$max) {$max=$G[$i]; $c="G";}
  415. if ($B[$i]>$max) {$max=$B[$i]; $c="B";}
  416. if ($S[$i]>$max) {$max=$S[$i]; $c="S";}
  417. if ($T[$i]>$max) {$max=$B[$i]; $c="T";}
  418. $res.=$c;
  419. }
  420. &AssignMatchStates(\$res);
  421. $res=~tr/a-z.//d;
  422. unshift(@seqs,">ss_dssp consensus\n$res\n");
  423. }
  424. # Print outfile
  425. open (OUTFILE,">$outfile") || die ("Error: could not open $outfile for writing: $!\n");
  426. if ($aliname) {print(OUTFILE "#$aliname\n");}
  427. foreach my $seq (@seqs) {
  428. print(OUTFILE "$seq");
  429. }
  430. close(OUTFILE);
  431. if ($diff ne "") {unlink($tmpfile);}
  432. if ($v>=2) {
  433. print("Created alignment $outfile with $nseq sequences\n");
  434. }
  435. #&HHPaths::System("perl $hhscripts/reformat.pl $baseout.a3m $baseout.fas -v 3");
  436. # Reformat $baseout.a3m such that match columns are defined by first seed sequence?
  437. #&HHPaths::System("perl $hhscripts/reformat.pl -M first $baseout.fas $baseout"."_first.a3m");
  438. exit;
  439. # Turn all match states to inserts that don't have a residue in the first seed sequence
  440. sub AssignMatchStates()
  441. {
  442. my $pres=$_[0];
  443. if ($all==0) {
  444. my @res=unpack("C*",${$pres});
  445. for ($i=$j=0; $i<@res; $i++) {
  446. my $c=$res[$i];
  447. if ($c>=65 && $c<=90) {
  448. if (!$match[$j++]) {$res[$i]=$c+32;} # make character lower-case
  449. } elsif ($c==45) {
  450. if (!$match[$j++]) {$res[$i]=46;} # transform '-' to '.'
  451. }
  452. }
  453. ${$pres} = pack("C*",@res);
  454. ${$pres} =~tr/.//d;
  455. }
  456. }
  457. # Maximum
  458. sub max {
  459. my $max = shift @_;
  460. foreach (@_) {
  461. if ($_>$max) {$max=$_}
  462. }
  463. return $max;
  464. }
  465. # Minimum
  466. sub min {
  467. my $min = shift @_;
  468. foreach (@_) {
  469. if ($_<$min) {$min=$_}
  470. }
  471. return $min;
  472. }