#! /usr/bin/perl
# splitfasta.pl
# Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
#
# (C) Johannes Soeding, 2012
#
# HHsuite version 3.0.0 (15-03-2015)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
use lib $ENV{"HHLIB"}."/scripts";
use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
use warnings;
# Default extension of the generated sequence files; can be overridden with -ext.
my $ext="seq";

# Help text, printed (via die) when the script is called without arguments.
# $VERSION is not defined in this file -- presumably exported by HHPaths.
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.
Usage: splitfasta.pl infile [option]
Option:
-fam : use family-based name (for SCOP/ASTRAL sequences)
-name : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";

# The input file is mandatory: without it, show the help text and stop.
if (@ARGV<1) {die $usage;}
# Working state shared by the option parsing below and the splitting code that follows.
my $line;               # current line of the input file
my $infile=$ARGV[0];    # input file: first command-line argument
my $outfile;            # name of the file the current record will be written to
my $sequence="";        # text of the current record (name line plus following lines)
my $options="";         # all arguments after the input file, joined by single spaces
my $fam=0;              # option -fam?
my $famid="";           # second word of the current name line (set in -fam mode)
my %numfams=();         # counters used to number the output files in -fam mode
my $n=0;                # number of name lines read in so far

if (@ARGV>1) {
    $options.=join(" ",@ARGV[1..$#ARGV]);
}

# Parse the command-line options. Each recognised option is removed from
# $options. The (?<!\S) / (?!\S) guards make sure an option only matches as a
# whole whitespace-delimited word, so that e.g. "-ext my-fam" does not
# accidentally switch on -fam. -name is evaluated after -fam and therefore
# wins if both are given.
if ($options=~s/(?<!\S)-fam(?!\S)//g)       {$fam=1;}
if ($options=~s/(?<!\S)-name(?!\S)//g)      {$fam=0;}
if ($options=~s/(?<!\S)-ext\s+(\S+)//g)     {$ext=$1;}

# Anything still left over was not understood; say so instead of silently
# dropping it, but keep running as before.
if ($options=~/\S/) {
    $options=~s/^\s+|\s+$//g;
    warn("WARNING: ignoring unknown options '$options'\n");
}
# Write one complete FASTA record (name line plus all following lines) to
# the file $path, replacing any existing file of that name.
# Dies with an error message if the file cannot be opened, written or closed.
sub write_sequence_file {
    my ($path, $record) = @_;
    open(my $out, ">", $path) || die("ERROR: Can't open $path: $!\n");
    print {$out} $record or die("ERROR: Can't write to $path: $!\n");
    close($out) || die("ERROR: Can't close $path: $!\n");
    return;
}

# Read the input file line by line. Every name line (starting with ">")
# begins a new record; the record collected so far is written out when the
# next name line is seen, and the last one after the end of the input.
open(my $in, "<", $infile) || die("ERROR: Can't open $infile: $!\n");

if ($fam) {
    # -fam mode: files are named <family id>.<running number>.<ext>, where the
    # family id is the second word of the name line. Only lines that have
    # such a second word start a new record; any other line is appended to
    # the current record.
    while ($line = <$in>) {
        if ($line =~ /^>(\S+)\s+(\S+)/) {
            $famid = $2;
            write_sequence_file($outfile, $sequence) if $n;
            # Count per family id (not per the -fam flag), so that each
            # family gets its own numbering and its own file names.
            $numfams{$famid}++;
            $outfile  = "$famid." . $numfams{$famid} . ".$ext";
            $sequence = $line;
            $n++;
        } else {
            $sequence .= $line;
        }
    }
} else {
    # -name mode (default): files are named <sequence id>.<ext>, where the id
    # is the first word after ">" with "|" and "." replaced by "_".
    my %exists = ();
    while ($line = <$in>) {
        if ($line =~ /^>(\S+)/) {
            my $id = $1;
            write_sequence_file($outfile, $sequence) if $n;
            # A repeated id maps to the same file name, so the earlier file
            # will be overwritten; warn the user about it.
            if ($exists{$id}) {print("\nWarning: id $id appears more than once in $infile\n");}
            $exists{$id} = 1;
            (my $tmp = $id) =~ tr/|./_/;
            $outfile  = "$tmp.$ext";
            $sequence = $line;
            $n++;
        } else {
            $sequence .= $line;
        }
    }
}

# Flush the last record, which has no following name line to trigger it.
write_sequence_file($outfile, $sequence) if $n;

close($in);
printf("Created %i sequence files\n",$n);