123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825 |
- #! /usr/bin/perl
- # reformat.pl
- # Reformat a multiple alignment file
- #
- # HHsuite version 3.0.0 (15-03-2015)
- #
- # Reference:
- # Remmert M., Biegert A., Hauser A., and Soding J.
- # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
- # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
- # (C) Johannes Soeding, 2012
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- # We are very grateful for bug reports! Please contact us at [email protected]
- use strict;
- use warnings;
- my $numres=100; # number of residues per line
- my $desclen=1000; # maximum number of characters in nameline
- my $ARGC=scalar(@ARGV);
- if ($ARGC<2) {
- die("
- reformat.pl from HHsuite3
- Read a multiple alignment in one format and write it in another format
- Usage: reformat.pl [informat] [outformat] infile outfile [options]
- or reformat.pl [informat] [outformat] 'fileglob' .ext [options]
- Available input formats:
- fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
- a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
- gaps aligned to inserts: '.'
- a3m: like a2m, but gaps aligned to inserts MAY be omitted
- sto: Stockholm format; sequences in several blocks with sequence name at
- beginning of line (hmmer output)
- psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r)
- clu: Clustal format; sequences in several blocks with sequence name at beginning
- of line
- Available output formats:
- fas: aligned fasta; all gaps '-'
- a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
- aligned to inserts: '.'
- a3m: like a2m, but gaps aligned to inserts are omitted
- sto: Stockholm format; sequences in just one block, one line per sequence
- psi: format as read by PSI-BLAST using the -B option
- clu: Clustal format
- If no input or output format is given the file extension is interpreted as format
- specification ('aln' as 'clu')
- Options:
- -v int verbose mode (0:off, 1:on)
- -num add number prefix to sequence names: 'name', '1:name' '2:name' etc
- -noss remove secondary structure sequences (beginning with >ss_)
- -sa do not remove solvent accessibility sequences (beginning with >sa_)
- -M first make all columns with residue in first sequence match columns
- (default for output format a2m or a3m)
- -M int make all columns with less than X% gaps match columns
- (for output format a2m or a3m)
- -r remove all lower case residues (insert states)
- (AFTER -M option has been processed)
- -r int remove all lower case columns with more than X% gaps
- -g '' suppress all gaps
- -g '-' write all gaps as '-'
- -uc write all residues in upper case (AFTER all other options have been processed)
- -lc write all residues in lower case (AFTER all other options have been processed)
- -l number of residues per line (for Clustal, FASTA, A2M, A3M formats)
- (default=$numres)
- -d maximum number of characers in nameline (default=$desclen)
- Examples: reformat.pl 1hjra.a3m 1hjra.a2m
- (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
- reformat.pl test.a3m test.fas -num -r 90
- reformat.pl fas sto '*.fasta' .stockholm
- \n");
- # clu: clustal format (hmmer output)
- # sel: Selex format
- # phy: Phylip format
- }
- my $informat="";
- my $outformat="";
- my $infile="";
- my $outfile="";
- my $num=0; # don't use sequence number as prefix: '>n|name'
- my $noss=0; # don't remove secondary structure
- my $nosa=1; # remove solvent accessibility sequences
- my $line;
- my $options="";
- my @names; # names of sequences read in
- my @seqs; # residues of sequences read in
- my $n; # number of sequences read in
- my $k; # counts sequences for output
- my $remove_inserts=0;
- my $remove_gapped=0;
- my $matchmode=""; # do not change capitalization
- my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns
- my $v=2;
- my $update=0;
- my $nss=-1; # index of secondary structure sequence
- my $lname; # length of sequence name in clustal, stockholm, psi format
- my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files
- my @informats= ("fas","a2m","a3m","sto","psi","clu");
- my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
- my $found;
- my $element;
- my $gap="default";
- my $case="default";
- # Process optionsz
- for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
- if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
- if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
- if ($options=~s/ -num / /) {$num=1; $desclen=505;}
- if ($options=~s/ -noss / /) {$noss=1;}
- if ($options=~s/ -sa / /) {$nosa=0;}
- if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
- if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
- if ($options=~s/ -r / /) {$remove_inserts=1;}
- if ($options=~s/ -lc / /) {$case="lc";}
- if ($options=~s/ -uc / /) {$case="uc";}
- if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
- if ($options=~s/ -v / /) {$v=2;}
- if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
- if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;}
- if ($options=~s/ -u / /) {$update=1;}
- if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
- if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
- if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}
- # Assign informat, outformat, infile, and outfile
- if ($outfile eq "") {
- if ($options=~s/(\S+)\s*$//) {
- $outfile=$1;
- } else {
- die("Error: no output file given: '$options'\n");
- }
- }
- if ($infile eq "") {
- if ($options=~s/(\S+)\s*$//) {
- $infile=$1;
- } else {
- die("Error: no input file given: '$options'\n");
- }
- }
- if ($options=~s/(\S+)\s*$//) {
- $outformat=$1;
- } else {
- if ($outfile=~/\S*\.(\S+?)$/) {
- $outformat=lc($1);
- if ($outformat eq "aln") {$outformat="clu";}
- elsif ($outformat eq "fa") {$outformat="fas";}
- elsif ($outformat eq "fasta") {$outformat="fas";}
- elsif ($outformat eq "afa") {$outformat="fas";}
- elsif ($outformat eq "afas") {$outformat="fas";}
- elsif ($outformat eq "afasta") {$outformat="fas";}
- } else {
- print ("Using FASTA output format: '$options'\n"); $outformat="fas";
- }
- }
- if ($options=~s/(\S+)\s*$//) {
- $informat=$1;
- } else {
- if ($infile=~/\S*\.(\S+?)$/) {
- $informat=lc($1);
- if ($informat eq "aln") {$informat="clu";}
- elsif ($informat eq "fa") {$informat="fas";}
- elsif ($informat eq "fasta") {$informat="fas";}
- } else {
- print ("Using FASTA input format: '$options'\n"); $informat="fas";
- }
- }
- # Warn if unknown options found
- if ($options!~/^\s*$/) {
- $options=~s/^\s*(.*?)\s*$/$1/g;
- print("\nWARNING: unknown options '$options'\n");
- }
- # Check if input and output formats are valid
- $found=0;
- foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
- if(!$found) {die("\nError: $informat is not a valid input format option\n");}
- $found=0;
- foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
- if(!$found) {die("\nError: $outformat is not a valid output format option\n");}
- #if($outformat eq "psi") {
- # $remove_inserts=1;
- #}
- if($outformat eq "ufas") {$gap="";}
- if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
- {
- $outfile=~/.*\.(\S*)$/;
- my $outext=$1;
- my @infiles=glob($infile);
- printf("%i files to reformat\n",scalar(@infiles));
- foreach $infile (@infiles)
- {
- if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
- $outfile="$1.$outext";
- if ($update && -e $outfile) {next;}
- if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
- &reformat($infile,$outfile);
- }
- exit 0;
- }
- else
- {
- if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
- &reformat($infile,$outfile);
- exit 0;
- }
- ################################################################################################
- # Reformat a single file
- ################################################################################################
- sub reformat()
- {
- $infile=$_[0];
- $nss=-1;
- $titleline="";
- ################################################################################################
- # Input part
- ################################################################################################
- my $skip=0;
- open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");
- # Read a2m, a3m, fas format
- if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
- {
- $/=">"; # set input field seperator
- $n=0;
- my $seq=<INFILE>;
- if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file
- $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file
-
- # If first line has no sequence name, use >root_name
- if ($seq ne ">") {
- $infile="/$infile."; # determine root name 1
- $infile=~/^.*\/(.*?)\..*/; # determine root name 2
- $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
- $seq=~tr/\n //d; # remove newlines and blanks
- $seqs[$n]=$seq;
- $n++;
- }
-
- while ($seq=<INFILE>) {
- $seq=~s/\n\#.*//g; # remove commentary lines
- while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n'
- if ($seq=~/^aa_/) {next;}
- if ($seq=~/^sa_/ && $nosa) {next;}
- if ($seq=~/^ss_/) {
- if ($noss) {next;} # do not read in >ss_ and >sa_ sequences
- # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
- # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
- $nss=$n;
- }
- $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n'
- if (defined $1 && $1 ne "") {
- $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
- } else {
- $names[$n]=$n;
- }
- $seqs[$n]=$seq;
- $n++;
- }
-
- $/="\n"; # reset input field seperator
- }
- # Read Stockholm format
- elsif ($informat eq "sto")
- {
- my %seqhash;
- my $name;
- my $first_block=1;
-
- $n=0;
- while ($line = <INFILE>)
- {
- $line=~tr/\r//d;
- $line=~s/\s+/ /g;
- if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line
- if ($line=~/^\#/) {next;} # skip commentary and blank line
- if ($line=~/^\/\//) {last;} # reached end of file
- if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts
- if ($line!~/^\s*(\S+)\s+(\S+)/) {
- die ("\nERROR found in stockholm format: $!");
- }
- if (!(exists $seqhash{$1}))
- {
- if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences
- if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
- if ($line=~/^ss_/) {
- if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
- # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
- # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
- $nss=$n;
- }
- $line=~/^\s*(\S+)\s+(\S+)/;
- $names[$n]=$1;
- $seqs[$n]=$2;
- $seqhash{$1}=$n++;
- $first_block=1;
- }
- else
- {
- if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
- $seqs[$seqhash{$1}].=$2;
- }
- # printf("%3i %s\n",$n-1,$seqs[$n-1]);
- }
- }
- elsif ($informat eq "clu")
- {
- my $residues_per_line=50; # default number of characters for namefield residues
- # (only needed if no gap between name and residues in first sequence -> SMART)
- my $block=1; # number of block currently read
- my $name;
- my $residues;
- $n=0; # sequence number of first block
- $k=0; # sequence index to zero for first block
- while ($line = <INFILE>)
- {
- # print($namefieldlen.$line);
- $line=~tr/\r//d;
- if ($line=~/CLUSTAL/i) {next;}
- if ($line=~/^\#/) {next;} # skip commentary and blank line
- if ($line=~/^\/\//) {last;} # reached end of file
- if ($line=~/^\s*$/){ # new sequence block starts
- if ($k) {
- if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
- $block++;
- $n=$k;
- $k=0; # set sequence index to zero for next block to read
- }
- next;
- }
- if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
- if ($line=~/^[*.: ]*$/) {next;}
- if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
- chomp($line);
- if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
- die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
- }
- $name=$1;
- $residues=$2;
- print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
- } else {
- if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
- if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
- if ($line=~/^ss_/) {
- # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
- # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
- $nss=$n;
- }
- $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
- $name=$1;
- $residues=$2;
- $residues=~tr/ //d;
- $residues_per_line=length($residues);
- }
- if ($block==1) {
- $names[$k]=$name;
- $seqs[$k]=$residues;
- } else {
- $seqs[$k].=$residues;
- if ($names[$k] ne $name) {
- print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
- }
- }
- # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
- $k++;
- }
- if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
- if (!$n) {$n=$k;}
- }
- # Read psi format
- elsif ($informat eq "psi")
- {
- my $block=1; # number of block currently read
- my $name;
- my $residues;
- $n=0; # sequence number of first block
- $k=0; # sequence index to zero for first block
-
- while ($line = <INFILE>)
- {
- # print($namefieldlen.$line);
- $line=~tr/\r//d;
- if ($line=~/^\s*$/){ # new sequence block starts
- if ($k) {
- if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
- $block++;
- $n=$k;
- $k=0; # set sequence index to zero for next block to read
- }
- next;
- }
- if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
- if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
- if ($line=~/^ss_/) {
- # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
- # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
- $nss=$n;
- }
- $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
- $name=$1;
- $residues=$2;
- $residues=~tr/ //d;
-
- if ($block==1) {
- $names[$k]=$name;
- $seqs[$k]=$residues;
- } else {
- $seqs[$k].=$residues;
- if ($names[$k] ne $name) {
- print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
- }
- }
- # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
- $k++;
- }
- if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
- if (!$n) {$n=$k;}
- }
-
- close INFILE;
-
-
- # Empty input file?
- if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}
-
-
- ################################################################################################
- # Transforming to upper-case
- ################################################################################################
- if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
- }
- ################################################################################################
- # Removing non-alphanumeric symbols
- ################################################################################################
- for ($k=0; $k<$n; $k++) {
- $seqs[$k]=~tr/A-Za-z0-9.~-//cd;
- $seqs[$k]=~tr/~/-/;
- }
-
-
- ################################################################################################
- # Filling up with gaps '.' or deleting gaps
- ################################################################################################
-
- # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option
- if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
- {
- print("inserting gaps...\n");
- my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
- my $j; # counts match states
- my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
- my $insert;
-
- # Determine length of longest insert
- for ($k=0; $k<$n; $k++)
- {
- # split into list of single match states and variable-length inserts
- # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
- # The '#' symbol is prepended to get rid of a perl bug in split
- @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
- $j=0;
- # printf("Sequence $k: $seqs[$k]\n");
- # printf("Sequence $k: @inserts\n");
-
- # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
- foreach $insert (@inserts)
- {
- if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
- $j++;
- # printf("$insert|");
- }
- # printf("\n");
- }
- my $ngap;
-
- # Fill up inserts with gaps '.' up to length $len_ins[$j]
- for ($k=0; $k<$n; $k++)
- {
- # split into list of single match states and variable-length inserts
- @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
- $j=0;
-
- # append the missing number of gaps after each match state
- foreach $insert (@inserts)
- {
- for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
- $j++;
- }
- $seqs[$k] = join("",@inserts);
- $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
- }
- }
-
-
- ################################################################################################
- # Match state assignment
- ################################################################################################
- # Default match state rule for a3m is -M first
- if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}
- # Use gap rule for match state assignment?
- if ($matchmode eq "gaprule") {
- my @gaps=();
- my $residues;
- my @residues;
-
- # Determine number of gaps per column
- for ($k=0; $k<$n; $k++) {
- @residues=unpack("C*",$seqs[$k]);
- for (my $l=0; $l<@residues; $l++) {
- if ($residues[$l]==46 || $residues[$l]==45) {
- if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
- }
- }
- }
-
- # Set columns with more gaps than $match_gaprule to lower case,
- for ($k=0; $k<$n; $k++) {
- @residues=unpack("C*",$seqs[$k]);
- $residues="";
- for (my $l=0; $l<@residues; $l++) {
- if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
- if ($residues[$l]==46) {
- $residues .= "-";
- } else {
- $residues .= uc(chr($residues[$l]));
- }
- } else {
- if ($residues[$l]==45) {
- $residues .= ".";
- } else {
- $residues .= lc(chr($residues[$l]));
- }
- }
- $seqs[$k]=$residues;
- }
- }
- }
-
- # Use first sequence for match state assignment?
- if ($matchmode eq "first") {
-
- my @match=();
- my $residues;
- my @residues;
-
- # Determine which columns have a gap in first sequence
- for ($k=0; $k<scalar(@names); $k++) { #find seed sequence
- if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
- }
- @residues=unpack("C*",$seqs[$k]);
- for (my $l=0; $l<@residues; $l++) {
- if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
- }
- # Set columns without residue in first sequence to upper case,
- for ($k=0; $k<$n; $k++) {
- @residues=unpack("C*",$seqs[$k]);
- $residues="";
- for (my $l=0; $l<@residues; $l++) {
- if ($match[$l]) {
- if ($residues[$l]==46) {
- $residues .= "-";
- } else {
- $residues .= uc(chr($residues[$l]));
- }
- } else {
- if ($residues[$l]==45) {
- $residues .= ".";
- } else {
- $residues .= lc(chr($residues[$l]));
- }
- }
- $seqs[$k]=$residues;
- }
- }
- }
-
- ################################################################################################
- # Remove gaps etc.
- ################################################################################################
- # Remove columns with too many gaps?
- if ($remove_gapped) {
- my @gaps=();
- my $residues;
- my @residues;
-
- # Determine number of gaps '.' or '-' per column
- for ($k=0; $k<$n; $k++) {
- @residues=unpack("C*",$seqs[$k]);
- for (my $l=0; $l<@residues; $l++) {
- if ($residues[$l]==45 || $residues[$l]==46) {
- if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
- }
- }
- }
-
- # Remove columns with too many gaps
- for ($k=0; $k<$n; $k++) {
- @residues=unpack("C*",$seqs[$k]);
- $residues="";
- for (my $l=0; $l<@residues; $l++) {
- if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
- $residues .= chr($residues[$l])
- }
- $seqs[$k]=$residues;
- }
- }
- }
-
- # Remove/transliterate gaps?
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
- if ($remove_inserts) {
- for ($k=0; $k<$n; $k++) {
- $seqs[$k]=~tr/a-z.//d;
- # printf("%s\n",$seqs[$k]);
- }
- }
-
- # Remove sequences which contain only gaps
- my $nin=$n;
- for ($k=0; $k<$n; $k++) {
- if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
- if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
- splice(@seqs,$k,1);
- splice(@names,$k,1);
- $k--; $n--;
- }
- }
-
- # Cut down sequence names to $desclen characters maximum
- for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}
- if ($outformat eq "a3m") {
- # Remove all '.' gaps
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
- } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
- # Substitute all '.' by '-'
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
- }
- if ($gap ne "default") {
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
- }
- if ($case eq "uc") {
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
- } elsif ($case eq "lc") {
- for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
- }
-
-
- ####################################################
- # Check that sequences have same length
- ####################################################
- if ($outformat ne "a3m" && $outformat ne "ufas") {
- my $len=length($seqs[0]);
- for($k=1; $k<$n; $k++) {
- if (length($seqs[$k])!=$len) {
- printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n",
- $names[0],$len,$names[$k],length($seqs[$k]));
- if ($v>=3) {
- printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
- }
- exit 1;
- }
- }
- }
- ####################################################
- # Remove html tags
- ####################################################
- for($k=0; $k<$n; $k++) {
- $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
- }
- ################################################################################################
- # Output part
- ################################################################################################
-
- my $ndssp=-1;
- my $nsa=-1;
- my $npred=-1;
- my $nconf=-1;
- my $nquery=-1;
- for ($k=0; $k<$n; $k++) {
- if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
- elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
- elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
- elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
- elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
- }
- #Write sequences into output file
- open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
- if ($outformat eq "sto" || $outformat eq "psi") {
- my $refline;
- if ($outformat eq "sto") {
- print(OUTFILE "# STOCKHOLM 1.0\n\n");
- # Write reference annotation line for match states (-M first)
- if (!$lname) {$lname=32;}
- if ($names[$nquery] =~ /^\S+\s+(.*)/) {
- printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE", $1);
- }
- $refline=$seqs[$nquery];
- $refline=~s/[a-z]/-/g;
- printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
- if ($ndssp>=0) {
- printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
- }
- }
- if ($num) {
- my $num=2;
- for ($k=0; $k<$n; $k++) {
- if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
- $names[$k]=~s/^(\S+)\#\d+/$1/;
- $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
- $num++;
- }
- }
- for ($k=0; $k<$n; $k++) {
- if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
- $names[$k] =~ /\s*(\S+)/;
- if (!$lname) {$lname=32;}
- printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
- }
- if ($outformat eq "sto") {print(OUTFILE "//\n");}
- } elsif ($outformat eq "clu") {
- printf(OUTFILE "CLUSTAL\n\n\n");
- if ($num) {
- my $num=2;
- for ($k=0; $k<$n; $k++) {
- if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
- $names[$k]=~s/^(\S+)\#\d+/$1/;
- $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
- $num++;
- }
- }
- while($seqs[0] ne "") { # While there are still residues left, write new block of sequences
- for ($k=0; $k<scalar(@names); $k++) { # Go through all sequences
- $names[$k] =~ s/\s*(\S+).*/$1/;
- $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq
- if (!$lname) {$lname=18;}
- printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
- }
- print(OUTFILE "\n"); # leave a blank line after each block of 60 residues
- }
- } else {
- if ($num) {
- my $num=2;
- for ($k=0; $k<$n; $k++) {
- if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
- $names[$k]=~s/^(\S+)\#\d+/$1/;
- $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
- # $names[$k]=~s/^(\S+)/$1\#$num/;
- $num++;
- }
- }
- if ($titleline ne "" && $outformat eq "a3m") {
- printf(OUTFILE "%s\n",$titleline);
- }
- for ($k=0; $k<$n; $k++) {
- $seqs[$k]=~s/(\S{$numres})/$1\n/g;
- printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
- }
- }
-
- close OUTFILE;
- if ($v>=2) {
- if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
- else {
- if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
- print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
- }
- }
-
- return;
- }
|