splitfasta.pl 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #! /usr/bin/perl
  2. # splitfasta.pl
  3. # Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
  4. #
  5. # (C) Johannes Soeding, 2012
  6. #
  7. # HHsuite version 3.0.0 (15-03-2015)
  8. #
  9. # Reference:
  10. # Remmert M., Biegert A., Hauser A., and Soding J.
  11. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  12. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  13. # This program is free software: you can redistribute it and/or modify
  14. # it under the terms of the GNU General Public License as published by
  15. # the Free Software Foundation, either version 3 of the License, or
  16. # (at your option) any later version.
  17. # This program is distributed in the hope that it will be useful,
  18. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. # GNU General Public License for more details.
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  23. # We are very grateful for bug reports! Please contact us at [email protected]
  24. use lib $ENV{"HHLIB"}."/scripts";
  25. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  26. use strict;
  27. use warnings;
  # ---------------------------------------------------------------------------
  # Defaults, usage text and command-line handling.
  #
  # Command line: splitfasta.pl infile [-fam] [-name] [-ext <ext>]
  # The first argument is always taken as the input file; everything after it
  # is treated as options.
  # ---------------------------------------------------------------------------
  my $ext="seq";   # default extension of the generated files; -ext overrides it

  # Usage text, shown when no input file is given. It is assembled before the
  # options are parsed, so "(default=...)" always shows the built-in default.
  # $VERSION is expected to be supplied by HHPaths (imported above).
  my $usage="\n"
      ."splitfasta.pl from HHsuite $VERSION\n"
      ."Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.\n"
      ."Write files into current directory and name each file by the first word after \">\" in the name line.\n"
      ."Usage: splitfasta.pl infile [option]\n"
      ."Option:\n"
      ."-fam : use family-based name (for SCOP/ASTRAL sequences\n"
      ."-name : use sequence name as file name (default)\n"
      ."-ext <ext> : extension for sequence files (default=$ext)\n"
      ."\n";

  if (@ARGV<1) {die $usage;}

  my $line;              # current input line
  my $infile=$ARGV[0];   # multi-sequence FASTA input file
  my $outfile;           # file the current record will be written to
  my $sequence="";       # text of the current record (name line plus following lines)
  my $options="";        # all arguments after the input file, joined by blanks
  my $fam=0;             # option -fam?
  my $famid="";          # second word of the current name line (used in -fam mode)
  my %numfams=();        # counters used to number the output files in -fam mode
  my $n=0;               # number of name lines read in so far

  if (@ARGV>1) {
      $options.=join(" ",@ARGV[1..$#ARGV]);
  }

  # Parse the options. Each switch has to be a complete blank-delimited token:
  # the (?<!\S) / (?!\S) assertions keep "-fam" from matching inside another
  # word such as the value given to -ext (e.g. "-ext my-fam").
  # -name is tested after -fam, so -name wins when both are given.
  # With several -ext options the last one is used.
  if ($options=~s/(?<!\S)-fam(?!\S)//g)   {$fam=1;}
  if ($options=~s/(?<!\S)-name(?!\S)//g)  {$fam=0;}
  if ($options=~s/(?<!\S)-ext\s+(\S+)//g) {$ext=$1;}
  # ---------------------------------------------------------------------------
  # Split $infile into one file per record.
  #
  # A record starts at a name line (a line beginning with ">") and runs up to
  # the next name line. The output file name depends on the mode:
  #   -fam  : "<family>.<k>.<ext>", where <family> is the second word of the
  #           name line and <k> counts the records seen so far for that family
  #   -name : "<id>.<ext>", where <id> is the first word after ">" with every
  #           "|" and "." replaced by "_"
  # Files are written into the current directory. An existing file of the same
  # name is overwritten.
  # ---------------------------------------------------------------------------
  open (my $in_fh,"<",$infile) || die("ERROR: Can't open $infile: $!\n");

  # In -fam mode a name line must carry a second word (the family id). A ">"
  # line without one is not recognised as a name line and is appended to the
  # record being collected.
  my $name_line = $fam ? qr/^>(\S+)\s+(\S+)/ : qr/^>(\S+)/;

  my %exists=();   # ids seen so far (-name mode), to warn about duplicates

  # Write the collected record to $outfile. Three-argument open keeps
  # characters taken from the sequence id (">", "|", "&", ...) from being
  # interpreted as part of the open mode. close is checked because buffered
  # write errors are only reported there.
  my $flush = sub {
      open (my $out_fh,">",$outfile) || die("ERROR: Can't open $outfile: $!\n");
      print {$out_fh} $sequence;
      close($out_fh) || die("ERROR: Can't write $outfile: $!\n");
  };

  while ($line=<$in_fh>) {
      if (my ($id,$family) = $line=~$name_line) {
          # A new record begins: the previous one (if any) is complete.
          $flush->() if $n;

          if ($fam) {
              # Count per family id. The counter must be keyed by the id read
              # from the name line, not by the -fam flag itself, otherwise all
              # files end up being named "1.<k>".
              $famid=$family;
              $numfams{$famid}++;
              $outfile="$famid.$numfams{$famid}.$ext";
          } else {
              if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
              $exists{$id}=1;
              (my $tmp=$id)=~tr/|./_/;   # "|" and "." both become "_"
              $outfile="$tmp.$ext";
          }
          $sequence=$line;
          $n++;
      } else {
          # Continuation line; anything before the first name line is
          # collected but never written, since nothing is flushed while $n is 0.
          $sequence.=$line;
      }
  }

  # The last record is still pending when the input ends.
  $flush->() if $n;

  close($in_fh);
  printf("Created %i sequence files\n",$n);