reformat.pl 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825
  1. #! /usr/bin/perl
  2. # reformat.pl
  3. # Reformat a multiple alignment file
  4. #
  5. # HHsuite version 3.0.0 (15-03-2015)
  6. #
  7. # Reference:
  8. # Remmert M., Biegert A., Hauser A., and Soding J.
  9. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  10. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  11. # (C) Johannes Soeding, 2012
  12. # This program is free software: you can redistribute it and/or modify
  13. # it under the terms of the GNU General Public License as published by
  14. # the Free Software Foundation, either version 3 of the License, or
  15. # (at your option) any later version.
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. # You should have received a copy of the GNU General Public License
  21. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. # We are very grateful for bug reports! Please contact us at [email protected]
  23. use strict;
  24. use warnings;
  25. my $numres=100; # number of residues per line
  26. my $desclen=1000; # maximum number of characters in nameline
  27. my $ARGC=scalar(@ARGV);
  28. if ($ARGC<2) {
  29. die("
  30. reformat.pl from HHsuite3
  31. Read a multiple alignment in one format and write it in another format
  32. Usage: reformat.pl [informat] [outformat] infile outfile [options]
  33. or reformat.pl [informat] [outformat] 'fileglob' .ext [options]
  34. Available input formats:
  35. fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
  36. a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
  37. gaps aligned to inserts: '.'
  38. a3m: like a2m, but gaps aligned to inserts MAY be omitted
  39. sto: Stockholm format; sequences in several blocks with sequence name at
  40. beginning of line (hmmer output)
  41. psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r)
  42. clu: Clustal format; sequences in several blocks with sequence name at beginning
  43. of line
  44. Available output formats:
  45. fas: aligned fasta; all gaps '-'
  46. a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
  47. aligned to inserts: '.'
  48. a3m: like a2m, but gaps aligned to inserts are omitted
  49. sto: Stockholm format; sequences in just one block, one line per sequence
  50. psi: format as read by PSI-BLAST using the -B option
  51. clu: Clustal format
  52. If no input or output format is given the file extension is interpreted as format
  53. specification ('aln' as 'clu')
  54. Options:
  55. -v int verbose mode (0:off, 1:on)
  56. -num add number prefix to sequence names: 'name', '1:name' '2:name' etc
  57. -noss remove secondary structure sequences (beginning with >ss_)
  58. -sa do not remove solvent accessibility sequences (beginning with >sa_)
  59. -M first make all columns with residue in first sequence match columns
  60. (default for output format a2m or a3m)
  61. -M int make all columns with less than X% gaps match columns
  62. (for output format a2m or a3m)
  63. -r remove all lower case residues (insert states)
  64. (AFTER -M option has been processed)
  65. -r int remove all lower case columns with more than X% gaps
  66. -g '' suppress all gaps
  67. -g '-' write all gaps as '-'
  68. -uc write all residues in upper case (AFTER all other options have been processed)
  69. -lc write all residues in lower case (AFTER all other options have been processed)
  70. -l number of residues per line (for Clustal, FASTA, A2M, A3M formats)
  71. (default=$numres)
  72. -d maximum number of characers in nameline (default=$desclen)
  73. Examples: reformat.pl 1hjra.a3m 1hjra.a2m
  74. (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
  75. reformat.pl test.a3m test.fas -num -r 90
  76. reformat.pl fas sto '*.fasta' .stockholm
  77. \n");
  78. # clu: clustal format (hmmer output)
  79. # sel: Selex format
  80. # phy: Phylip format
  81. }
  82. my $informat="";
  83. my $outformat="";
  84. my $infile="";
  85. my $outfile="";
  86. my $num=0; # don't use sequence number as prefix: '>n|name'
  87. my $noss=0; # don't remove secondary structure
  88. my $nosa=1; # remove solvent accessibility sequences
  89. my $line;
  90. my $options="";
  91. my @names; # names of sequences read in
  92. my @seqs; # residues of sequences read in
  93. my $n; # number of sequences read in
  94. my $k; # counts sequences for output
  95. my $remove_inserts=0;
  96. my $remove_gapped=0;
  97. my $matchmode=""; # do not change capitalization
  98. my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns
  99. my $v=2;
  100. my $update=0;
  101. my $nss=-1; # index of secondary structure sequence
  102. my $lname; # length of sequence name in clustal, stockholm, psi format
  103. my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files
  104. my @informats= ("fas","a2m","a3m","sto","psi","clu");
  105. my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
  106. my $found;
  107. my $element;
  108. my $gap="default";
  109. my $case="default";
  110. # Process optionsz
  111. for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
  112. if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
  113. if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
  114. if ($options=~s/ -num / /) {$num=1; $desclen=505;}
  115. if ($options=~s/ -noss / /) {$noss=1;}
  116. if ($options=~s/ -sa / /) {$nosa=0;}
  117. if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
  118. if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
  119. if ($options=~s/ -r / /) {$remove_inserts=1;}
  120. if ($options=~s/ -lc / /) {$case="lc";}
  121. if ($options=~s/ -uc / /) {$case="uc";}
  122. if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
  123. if ($options=~s/ -v / /) {$v=2;}
  124. if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
  125. if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;}
  126. if ($options=~s/ -u / /) {$update=1;}
  127. if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
  128. if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
  129. if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}
  130. # Assign informat, outformat, infile, and outfile
  131. if ($outfile eq "") {
  132. if ($options=~s/(\S+)\s*$//) {
  133. $outfile=$1;
  134. } else {
  135. die("Error: no output file given: '$options'\n");
  136. }
  137. }
  138. if ($infile eq "") {
  139. if ($options=~s/(\S+)\s*$//) {
  140. $infile=$1;
  141. } else {
  142. die("Error: no input file given: '$options'\n");
  143. }
  144. }
  145. if ($options=~s/(\S+)\s*$//) {
  146. $outformat=$1;
  147. } else {
  148. if ($outfile=~/\S*\.(\S+?)$/) {
  149. $outformat=lc($1);
  150. if ($outformat eq "aln") {$outformat="clu";}
  151. elsif ($outformat eq "fa") {$outformat="fas";}
  152. elsif ($outformat eq "fasta") {$outformat="fas";}
  153. elsif ($outformat eq "afa") {$outformat="fas";}
  154. elsif ($outformat eq "afas") {$outformat="fas";}
  155. elsif ($outformat eq "afasta") {$outformat="fas";}
  156. } else {
  157. print ("Using FASTA output format: '$options'\n"); $outformat="fas";
  158. }
  159. }
  160. if ($options=~s/(\S+)\s*$//) {
  161. $informat=$1;
  162. } else {
  163. if ($infile=~/\S*\.(\S+?)$/) {
  164. $informat=lc($1);
  165. if ($informat eq "aln") {$informat="clu";}
  166. elsif ($informat eq "fa") {$informat="fas";}
  167. elsif ($informat eq "fasta") {$informat="fas";}
  168. } else {
  169. print ("Using FASTA input format: '$options'\n"); $informat="fas";
  170. }
  171. }
  172. # Warn if unknown options found
  173. if ($options!~/^\s*$/) {
  174. $options=~s/^\s*(.*?)\s*$/$1/g;
  175. print("\nWARNING: unknown options '$options'\n");
  176. }
  177. # Check if input and output formats are valid
  178. $found=0;
  179. foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
  180. if(!$found) {die("\nError: $informat is not a valid input format option\n");}
  181. $found=0;
  182. foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
  183. if(!$found) {die("\nError: $outformat is not a valid output format option\n");}
  184. #if($outformat eq "psi") {
  185. # $remove_inserts=1;
  186. #}
  187. if($outformat eq "ufas") {$gap="";}
  188. if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
  189. {
  190. $outfile=~/.*\.(\S*)$/;
  191. my $outext=$1;
  192. my @infiles=glob($infile);
  193. printf("%i files to reformat\n",scalar(@infiles));
  194. foreach $infile (@infiles)
  195. {
  196. if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
  197. $outfile="$1.$outext";
  198. if ($update && -e $outfile) {next;}
  199. if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
  200. &reformat($infile,$outfile);
  201. }
  202. exit 0;
  203. }
  204. else
  205. {
  206. if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
  207. &reformat($infile,$outfile);
  208. exit 0;
  209. }
  210. ################################################################################################
  211. # Reformat a single file
  212. ################################################################################################
  213. sub reformat()
  214. {
  215. $infile=$_[0];
  216. $nss=-1;
  217. $titleline="";
  218. ################################################################################################
  219. # Input part
  220. ################################################################################################
  221. my $skip=0;
  222. open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");
  223. # Read a2m, a3m, fas format
  224. if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
  225. {
  226. $/=">"; # set input field seperator
  227. $n=0;
  228. my $seq=<INFILE>;
  229. if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file
  230. $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file
  231. # If first line has no sequence name, use >root_name
  232. if ($seq ne ">") {
  233. $infile="/$infile."; # determine root name 1
  234. $infile=~/^.*\/(.*?)\..*/; # determine root name 2
  235. $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
  236. $seq=~tr/\n //d; # remove newlines and blanks
  237. $seqs[$n]=$seq;
  238. $n++;
  239. }
  240. while ($seq=<INFILE>) {
  241. $seq=~s/\n\#.*//g; # remove commentary lines
  242. while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n'
  243. if ($seq=~/^aa_/) {next;}
  244. if ($seq=~/^sa_/ && $nosa) {next;}
  245. if ($seq=~/^ss_/) {
  246. if ($noss) {next;} # do not read in >ss_ and >sa_ sequences
  247. # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
  248. # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
  249. $nss=$n;
  250. }
  251. $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n'
  252. if (defined $1 && $1 ne "") {
  253. $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
  254. } else {
  255. $names[$n]=$n;
  256. }
  257. $seqs[$n]=$seq;
  258. $n++;
  259. }
  260. $/="\n"; # reset input field seperator
  261. }
  262. # Read Stockholm format
  263. elsif ($informat eq "sto")
  264. {
  265. my %seqhash;
  266. my $name;
  267. my $first_block=1;
  268. $n=0;
  269. while ($line = <INFILE>)
  270. {
  271. $line=~tr/\r//d;
  272. $line=~s/\s+/ /g;
  273. if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line
  274. if ($line=~/^\#/) {next;} # skip commentary and blank line
  275. if ($line=~/^\/\//) {last;} # reached end of file
  276. if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts
  277. if ($line!~/^\s*(\S+)\s+(\S+)/) {
  278. die ("\nERROR found in stockholm format: $!");
  279. }
  280. if (!(exists $seqhash{$1}))
  281. {
  282. if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences
  283. if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
  284. if ($line=~/^ss_/) {
  285. if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
  286. # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
  287. # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
  288. $nss=$n;
  289. }
  290. $line=~/^\s*(\S+)\s+(\S+)/;
  291. $names[$n]=$1;
  292. $seqs[$n]=$2;
  293. $seqhash{$1}=$n++;
  294. $first_block=1;
  295. }
  296. else
  297. {
  298. if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
  299. $seqs[$seqhash{$1}].=$2;
  300. }
  301. # printf("%3i %s\n",$n-1,$seqs[$n-1]);
  302. }
  303. }
  304. elsif ($informat eq "clu")
  305. {
  306. my $residues_per_line=50; # default number of characters for namefield residues
  307. # (only needed if no gap between name and residues in first sequence -> SMART)
  308. my $block=1; # number of block currently read
  309. my $name;
  310. my $residues;
  311. $n=0; # sequence number of first block
  312. $k=0; # sequence index to zero for first block
  313. while ($line = <INFILE>)
  314. {
  315. # print($namefieldlen.$line);
  316. $line=~tr/\r//d;
  317. if ($line=~/CLUSTAL/i) {next;}
  318. if ($line=~/^\#/) {next;} # skip commentary and blank line
  319. if ($line=~/^\/\//) {last;} # reached end of file
  320. if ($line=~/^\s*$/){ # new sequence block starts
  321. if ($k) {
  322. if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
  323. $block++;
  324. $n=$k;
  325. $k=0; # set sequence index to zero for next block to read
  326. }
  327. next;
  328. }
  329. if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
  330. if ($line=~/^[*.: ]*$/) {next;}
  331. if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
  332. chomp($line);
  333. if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
  334. die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
  335. }
  336. $name=$1;
  337. $residues=$2;
  338. print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
  339. } else {
  340. if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
  341. if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
  342. if ($line=~/^ss_/) {
  343. # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
  344. # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
  345. $nss=$n;
  346. }
  347. $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
  348. $name=$1;
  349. $residues=$2;
  350. $residues=~tr/ //d;
  351. $residues_per_line=length($residues);
  352. }
  353. if ($block==1) {
  354. $names[$k]=$name;
  355. $seqs[$k]=$residues;
  356. } else {
  357. $seqs[$k].=$residues;
  358. if ($names[$k] ne $name) {
  359. print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
  360. }
  361. }
  362. # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
  363. $k++;
  364. }
  365. if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
  366. if (!$n) {$n=$k;}
  367. }
  368. # Read psi format
  369. elsif ($informat eq "psi")
  370. {
  371. my $block=1; # number of block currently read
  372. my $name;
  373. my $residues;
  374. $n=0; # sequence number of first block
  375. $k=0; # sequence index to zero for first block
  376. while ($line = <INFILE>)
  377. {
  378. # print($namefieldlen.$line);
  379. $line=~tr/\r//d;
  380. if ($line=~/^\s*$/){ # new sequence block starts
  381. if ($k) {
  382. if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
  383. $block++;
  384. $n=$k;
  385. $k=0; # set sequence index to zero for next block to read
  386. }
  387. next;
  388. }
  389. if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
  390. if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
  391. if ($line=~/^ss_/) {
  392. # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
  393. # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
  394. $nss=$n;
  395. }
  396. $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
  397. $name=$1;
  398. $residues=$2;
  399. $residues=~tr/ //d;
  400. if ($block==1) {
  401. $names[$k]=$name;
  402. $seqs[$k]=$residues;
  403. } else {
  404. $seqs[$k].=$residues;
  405. if ($names[$k] ne $name) {
  406. print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
  407. }
  408. }
  409. # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
  410. $k++;
  411. }
  412. if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
  413. if (!$n) {$n=$k;}
  414. }
  415. close INFILE;
  416. # Empty input file?
  417. if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}
  418. ################################################################################################
  419. # Transforming to upper-case
  420. ################################################################################################
  421. if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M
  422. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
  423. }
  424. ################################################################################################
  425. # Removing non-alphanumeric symbols
  426. ################################################################################################
  427. for ($k=0; $k<$n; $k++) {
  428. $seqs[$k]=~tr/A-Za-z0-9.~-//cd;
  429. $seqs[$k]=~tr/~/-/;
  430. }
  431. ################################################################################################
  432. # Filling up with gaps '.' or deleting gaps
  433. ################################################################################################
  434. # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option
  435. if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
  436. {
  437. print("inserting gaps...\n");
  438. my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
  439. my $j; # counts match states
  440. my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
  441. my $insert;
  442. # Determine length of longest insert
  443. for ($k=0; $k<$n; $k++)
  444. {
  445. # split into list of single match states and variable-length inserts
  446. # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
  447. # The '#' symbol is prepended to get rid of a perl bug in split
  448. @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
  449. $j=0;
  450. # printf("Sequence $k: $seqs[$k]\n");
  451. # printf("Sequence $k: @inserts\n");
  452. # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
  453. foreach $insert (@inserts)
  454. {
  455. if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
  456. $j++;
  457. # printf("$insert|");
  458. }
  459. # printf("\n");
  460. }
  461. my $ngap;
  462. # Fill up inserts with gaps '.' up to length $len_ins[$j]
  463. for ($k=0; $k<$n; $k++)
  464. {
  465. # split into list of single match states and variable-length inserts
  466. @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
  467. $j=0;
  468. # append the missing number of gaps after each match state
  469. foreach $insert (@inserts)
  470. {
  471. for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
  472. $j++;
  473. }
  474. $seqs[$k] = join("",@inserts);
  475. $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
  476. }
  477. }
  478. ################################################################################################
  479. # Match state assignment
  480. ################################################################################################
  481. # Default match state rule for a3m is -M first
  482. if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}
  483. # Use gap rule for match state assignment?
  484. if ($matchmode eq "gaprule") {
  485. my @gaps=();
  486. my $residues;
  487. my @residues;
  488. # Determine number of gaps per column
  489. for ($k=0; $k<$n; $k++) {
  490. @residues=unpack("C*",$seqs[$k]);
  491. for (my $l=0; $l<@residues; $l++) {
  492. if ($residues[$l]==46 || $residues[$l]==45) {
  493. if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
  494. }
  495. }
  496. }
  497. # Set columns with more gaps than $match_gaprule to lower case,
  498. for ($k=0; $k<$n; $k++) {
  499. @residues=unpack("C*",$seqs[$k]);
  500. $residues="";
  501. for (my $l=0; $l<@residues; $l++) {
  502. if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
  503. if ($residues[$l]==46) {
  504. $residues .= "-";
  505. } else {
  506. $residues .= uc(chr($residues[$l]));
  507. }
  508. } else {
  509. if ($residues[$l]==45) {
  510. $residues .= ".";
  511. } else {
  512. $residues .= lc(chr($residues[$l]));
  513. }
  514. }
  515. $seqs[$k]=$residues;
  516. }
  517. }
  518. }
  519. # Use first sequence for match state assignment?
  520. if ($matchmode eq "first") {
  521. my @match=();
  522. my $residues;
  523. my @residues;
  524. # Determine which columns have a gap in first sequence
  525. for ($k=0; $k<scalar(@names); $k++) { #find seed sequence
  526. if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
  527. }
  528. @residues=unpack("C*",$seqs[$k]);
  529. for (my $l=0; $l<@residues; $l++) {
  530. if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
  531. }
  532. # Set columns without residue in first sequence to upper case,
  533. for ($k=0; $k<$n; $k++) {
  534. @residues=unpack("C*",$seqs[$k]);
  535. $residues="";
  536. for (my $l=0; $l<@residues; $l++) {
  537. if ($match[$l]) {
  538. if ($residues[$l]==46) {
  539. $residues .= "-";
  540. } else {
  541. $residues .= uc(chr($residues[$l]));
  542. }
  543. } else {
  544. if ($residues[$l]==45) {
  545. $residues .= ".";
  546. } else {
  547. $residues .= lc(chr($residues[$l]));
  548. }
  549. }
  550. $seqs[$k]=$residues;
  551. }
  552. }
  553. }
  554. ################################################################################################
  555. # Remove gaps etc.
  556. ################################################################################################
  557. # Remove columns with too many gaps?
  558. if ($remove_gapped) {
  559. my @gaps=();
  560. my $residues;
  561. my @residues;
  562. # Determine number of gaps '.' or '-' per column
  563. for ($k=0; $k<$n; $k++) {
  564. @residues=unpack("C*",$seqs[$k]);
  565. for (my $l=0; $l<@residues; $l++) {
  566. if ($residues[$l]==45 || $residues[$l]==46) {
  567. if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
  568. }
  569. }
  570. }
  571. # Remove columns with too many gaps
  572. for ($k=0; $k<$n; $k++) {
  573. @residues=unpack("C*",$seqs[$k]);
  574. $residues="";
  575. for (my $l=0; $l<@residues; $l++) {
  576. if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
  577. $residues .= chr($residues[$l])
  578. }
  579. $seqs[$k]=$residues;
  580. }
  581. }
  582. }
  583. # Remove/transliterate gaps?
  584. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
  585. if ($remove_inserts) {
  586. for ($k=0; $k<$n; $k++) {
  587. $seqs[$k]=~tr/a-z.//d;
  588. # printf("%s\n",$seqs[$k]);
  589. }
  590. }
  591. # Remove sequences which contain only gaps
  592. my $nin=$n;
  593. for ($k=0; $k<$n; $k++) {
  594. if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
  595. if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
  596. splice(@seqs,$k,1);
  597. splice(@names,$k,1);
  598. $k--; $n--;
  599. }
  600. }
  601. # Cut down sequence names to $desclen characters maximum
  602. for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}
  603. if ($outformat eq "a3m") {
  604. # Remove all '.' gaps
  605. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
  606. } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
  607. # Substitute all '.' by '-'
  608. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
  609. }
  610. if ($gap ne "default") {
  611. for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
  612. }
  613. if ($case eq "uc") {
  614. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
  615. } elsif ($case eq "lc") {
  616. for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
  617. }
  618. ####################################################
  619. # Check that sequences have same length
  620. ####################################################
  621. if ($outformat ne "a3m" && $outformat ne "ufas") {
  622. my $len=length($seqs[0]);
  623. for($k=1; $k<$n; $k++) {
  624. if (length($seqs[$k])!=$len) {
  625. printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n",
  626. $names[0],$len,$names[$k],length($seqs[$k]));
  627. if ($v>=3) {
  628. printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
  629. }
  630. exit 1;
  631. }
  632. }
  633. }
  634. ####################################################
  635. # Remove html tags
  636. ####################################################
  637. for($k=0; $k<$n; $k++) {
  638. $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
  639. }
  640. ################################################################################################
  641. # Output part
  642. ################################################################################################
  643. my $ndssp=-1;
  644. my $nsa=-1;
  645. my $npred=-1;
  646. my $nconf=-1;
  647. my $nquery=-1;
  648. for ($k=0; $k<$n; $k++) {
  649. if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
  650. elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
  651. elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
  652. elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
  653. elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
  654. }
  655. #Write sequences into output file
  656. open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
  657. if ($outformat eq "sto" || $outformat eq "psi") {
  658. my $refline;
  659. if ($outformat eq "sto") {
  660. print(OUTFILE "# STOCKHOLM 1.0\n\n");
  661. # Write reference annotation line for match states (-M first)
  662. if (!$lname) {$lname=32;}
  663. if ($names[$nquery] =~ /^\S+\s+(.*)/) {
  664. printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE", $1);
  665. }
  666. $refline=$seqs[$nquery];
  667. $refline=~s/[a-z]/-/g;
  668. printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
  669. if ($ndssp>=0) {
  670. printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
  671. }
  672. }
  673. if ($num) {
  674. my $num=2;
  675. for ($k=0; $k<$n; $k++) {
  676. if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
  677. $names[$k]=~s/^(\S+)\#\d+/$1/;
  678. $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
  679. $num++;
  680. }
  681. }
  682. for ($k=0; $k<$n; $k++) {
  683. if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
  684. $names[$k] =~ /\s*(\S+)/;
  685. if (!$lname) {$lname=32;}
  686. printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
  687. }
  688. if ($outformat eq "sto") {print(OUTFILE "//\n");}
  689. } elsif ($outformat eq "clu") {
  690. printf(OUTFILE "CLUSTAL\n\n\n");
  691. if ($num) {
  692. my $num=2;
  693. for ($k=0; $k<$n; $k++) {
  694. if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
  695. $names[$k]=~s/^(\S+)\#\d+/$1/;
  696. $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
  697. $num++;
  698. }
  699. }
  700. while($seqs[0] ne "") { # While there are still residues left, write new block of sequences
  701. for ($k=0; $k<scalar(@names); $k++) { # Go through all sequences
  702. $names[$k] =~ s/\s*(\S+).*/$1/;
  703. $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq
  704. if (!$lname) {$lname=18;}
  705. printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
  706. }
  707. print(OUTFILE "\n"); # leave a blank line after each block of 60 residues
  708. }
  709. } else {
  710. if ($num) {
  711. my $num=2;
  712. for ($k=0; $k<$n; $k++) {
  713. if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
  714. $names[$k]=~s/^(\S+)\#\d+/$1/;
  715. $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
  716. # $names[$k]=~s/^(\S+)/$1\#$num/;
  717. $num++;
  718. }
  719. }
  720. if ($titleline ne "" && $outformat eq "a3m") {
  721. printf(OUTFILE "%s\n",$titleline);
  722. }
  723. for ($k=0; $k<$n; $k++) {
  724. $seqs[$k]=~s/(\S{$numres})/$1\n/g;
  725. printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
  726. }
  727. }
  728. close OUTFILE;
  729. if ($v>=2) {
  730. if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
  731. else {
  732. if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
  733. print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
  734. }
  735. }
  736. return;
  737. }