#! /usr/bin/perl # pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile # # HHsuite version 3.0.0 (15-03-2015) # # Reference: # Remmert M., Biegert A., Hauser A., and Soding J. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). # (C) Johannes Soeding, 2012 # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de use lib $ENV{"HHLIB"}."/scripts"; use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. use strict; $|= 1; # Activate autoflushing on STDOUT # Default options my $idmax=90; # maximum sequence identity threshold my $Evalmin=0.01; # minimum BLAST E-value (should be <0.01 to ensure that sequences being filtered out are homologous to some representative sequence my $covmin=90; # minimum coverage threshold (should be large enough to ensure that no structural domain is lost from the filtered set just because it occurs in a sequence with, say, 2 other domains similar to some representative sequence) my $v=2; my $blastpgp="$ncbidir/blastpgp"; my $help=" pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile Compares each sequence with all others using BLAST (blastpgp). If two sequences A and B are sufficiently similar, the sequence with lower resolution will be removed. The exact criterion for removal of sequence B is: IF more than \$covmin\% residues of sequence B are aligned to sequence A AND the sequence identity of the alignment is larger than \$idmin AND the E-value is better than \$Evalmin AND sequence B has lower resolution than A. The input file must have been prepared with pdb2fasta.pl or scop2fasta.pl. Sequences with fewer than 15 residues are suppressed. Usage: pdbfilter.pl infile filtered_file [-u old_filtered_file] [-id int] [-cov int] Options -id maximum sequence identity between representative sequences (default=$idmax) -e minimum E-value between representative sequences (default=$Evalmin) -cov minimum coverage of a sequence with a similar one to get thrown out (default=$covmin) -u update the old filtered file; this saves a lot of execution time -v verbose mode Example: pdbfilter.pl pdb.fas pdb70.fas -u pdb70.fas -id 70 -cov 90\n "; if (@ARGV<2) {print($help); exit;} my $infile; my $outfile; my $oldfile; my $root=""; # $outfile without extension my $line; my $pdbid=""; # e.g. 1ahs_C my $qpdbid; # pdbid of query sequence my $seq=""; # sequence record (name+residues) in FASTA my @seqs; # sequence records as they were read my $len=0; # length of sequence to be read in my %len; # $len{$pdbid} is length of sequence my $resol; # experimental resolution in Angstroem my %resol; # experimental resolution in Angstroem my %excluded; # representative sequences are all those not in this hash my %accepted; # representative sequences accpeted so far my %similar; # $similar{$pdbid} contains list of all pdbids (SCOPIDs) that are represented by (i.e. similar to) $qpdbid my %het; # $het{$pdbid} is "*" if sequence $pdbid has a HET: record (i.e. contains HET group with >=10 atoms), "" otherwise my $id; # sequence identity to query my $cov; # coverage my $Evalue; # BLAST E-value my $nold=0; # number of sequences in oldfile my $ntot=0; # total number of sequences in oldfile and infile my $k=0; # counts sequences read in so far my $sort_by_family=1; # set to 0 if at least one non-SCOP sequence found # Read command line options my $options=""; for (my $i=0; $i<=$#ARGV; $i++) {$options.=" $ARGV[$i]";} if ($options=~s/ -id\s+(\S+)//) {$idmax=$1;} if ($options=~s/ -cov\s+(\S+)//) {$covmin=$1;} if ($options=~s/ -e\s+(\S+)//) {$Evalmin=$1;} if ($options=~s/ -u\s+(\S+)//) {$oldfile=$1;} if ($options=~s/ -v\s*(\d+)//) {$v=$1;} if ($options=~s/ -v//) {$v=2;} if (!$infile && $options=~s/^\s*([^- ]\S+)\s*//) {$infile=$1;} if (!$outfile && $options=~s/^\s*([^- ]\S+)\s*//) {$outfile=$1;} # Warn if unknown options found or no infile/outfile if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");} if (!$infile) {print($help); print("Error: no input file given\n"); exit;} if (!$outfile) {print($help); print("Error: no output file given\n"); exit;} $covmin*=0.01; if ($outfile=~/^(\S*?)\.(.*)$/) {$root=$1;} else {$root=$outfile;} # Read sequences from oldfile if ($oldfile) {$nold=&ReadSequences($oldfile,0);} # Read NEW sequences from infile (the ones that are not yet in oldfile) $ntot=&ReadSequences($infile,$nold); # Sort by resolution @seqs=sort SortByResolution @seqs; #foreach $seq (@seqs) {print("$seq");} # Record resolution and length of all sequences in hashes for ($k=0; $k<@seqs; $k++) { $seqs[$k]=~s/^(\S+?)!(\S+?)!>(\S+)(.*)/>$3$4/o; $len{$3}=$1; $resol{$3}=$2; my $pdbid=$3; if ($seqs[$k]=~s/ PDB:\s*(( \d\S{3,5}\*?)+)//) { $similar{$pdbid}=$1; } elsif ($seqs[$k]=~s/ SCOPID:\s*(( [a-z]\d\S{5}\*?)+)//) { $similar{$pdbid}=$1; } else { $similar{$pdbid}=""; } if ($seqs[$k]=~/ HET:/) { $het{$pdbid}="*"; } else { $het{$pdbid}=""; } } # Format BLAST database, initialize if ($v>=2) {printf("Doing PSI-BLAST runs\n");} &PrintSequences("$root.db",$nold); &System("$ncbidir/formatdb -i $root.db"); my $nacc=0; # number of accepted sequences = number of BLAST searches already performed my $done=0; # number of sequences already processed my $nexcl=0; # number of already excluded chains my $step=($ntot>1000)*0.05+($ntot<=1000)*0.1; my $reformat=$step; # when X% of the total number of sequences have been excluded, reformat database #################################################################################################### # For each sequence in infile, do BLAST search and exclude hits that are too similar to query foreach $seq (@seqs) { $seq=~/^>(\S+)/o; $done++; if($excluded{$1}) {next;} $qpdbid=$1; $resol=$resol{$1}; $len=$len{$1}; if ($v>=2 && !($nacc%100)) { printf("\nSearches:%-4i Done:%3i%% DB-size:%3i%% ",$nacc,100*($nexcl+$nacc)/$ntot+0.5,100*($ntot-$nexcl)/$ntot+0.5); } # When a substantial number of sequences have been excluded, reformat database WITHOUT excluded sequences if (!$oldfile && ($nexcl+$nacc)/$ntot>$reformat) { if ($v>=2) {printf("\b\b\b%2i%%",100*$reformat+0.5);} elsif ($v>=3) { printf("\nReformatting search database containing %i out of %i sequences\n",$ntot-$nexcl,$ntot);} # Write updated database with all seqs with index >=$done that have not yet been excluded # (Don't have to compare A->B and B->A, hence exclude seqs with index <$done) &PrintSequences("$root.db",$done); &System("$ncbidir/formatdb -i $root.db"); $reformat+=$step; } open (TMPFILE,">$root.tmp") || die ("ERROR: cannot open $root.tmp for writing: $!\n"); print(TMPFILE $seq); close(TMPFILE); # Find hits that are too similar if ($v>=3) {print("\n$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot");} open(BLAST,"$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|"); while ($line=){ if ($line=~/^>(\S+)/o && $1 ne $qpdbid && !$excluded{$1} && !$accepted{$1}) { $pdbid=$1; while ($line=){if ($line=~/^ Score = /o) {last;}} $line=~/ Expect =\s*(\S+)/ or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n"); $Evalue=$1; $Evalue=~s/^e/1e/; $Evalue=~s/,$//; $line=; $line=~/^ Identities =\s*\d+\/(\d+)\s+\((\d+)\%\)/o or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n"); $len=$1; $id=$2; # Coverage = (length of whole alignment (including gaps) - gaps in query or HSP) / total length of matched sequence if ($line=~/Gaps =\s*(\d+)\/\d+/) {$cov=($len-$1)/$len{$pdbid};} else {$cov=$len/$len{$pdbid};} ## Main filtering criterion: remove sequence from representative set if... if ($id>=$idmax && $Evalue<=$Evalmin && $cov>=$covmin && $resol<=$resol{$pdbid}) { $excluded{$pdbid}=1; # if ($similar{$qpdbid} ne "") {$similar{$qpdbid}.=",";} # separate pdbids with identical sequences with ',' if (!$similar{$qpdbid}) {$similar{$qpdbid}="";} $similar{$qpdbid}.=" ".$pdbid.$het{$pdbid}.$similar{$pdbid}; $nexcl++; if ($v>=4) { print($line); printf("Rep: $qpdbid excl: $pdbid (len=%3i cov=%3.0f id=$1)\n",$len{$pdbid},100*$cov); } } } } close(BLAST); $nacc++; $accepted{$qpdbid}=1; if ($v>=2) {print(".");} } if ($v>=2) {print("\n");} #################################################################################################### if ($sort_by_family) {@seqs=sort SortByFamily @seqs;} # Print out all representative sequences to outfile &PrintSequences($outfile,0); if ($v>=2) { printf("Written %i out of %i sequences to $outfile\n",$ntot-$nexcl,$ntot); } unlink("$root.tmp"); unlink(glob("$root.db*")); exit; # Read sequences in infile beginning at index $k sub ReadSequences() { my $infile=$_[0]; my $k=$_[1]; my $k0=$k; my $nres=0; # skip sequences with fewer than 15 real residues if ($v>=3) {printf("Reading $infile ... \n");} open (INFILE,"<$infile") || die ("ERROR: cannot open $infile for reading: $!\n"); while ($line=) { if ($line=~/^>/) { if ($pdbid && !$len{$pdbid} && $nres>=15) { if ($len<26) {chomp $seq; $seq.=('x'x(26-$len))."\n";} # add 'x' to make $seq at least 26 resiudes long $seqs[$k++]="$len!$resol!$seq"; } # Read pdbid (or SCOP id) and resolution from name line if ($line=~/^>(\S{4,6}) .*; (\d+\.?\d*|NMR)A? \{/o) { # PDB sequence $pdbid=$1; if ($2 eq "NMR") {$resol=10;} else {$resol=$2;} $seq=$line; $len=$nres=0; $sort_by_family=0; } elsif ($line=~/^>([a-z]\S{6}) .* (\d+\.?\d*)\s*$/) { # SCOP sequence $pdbid=$1; if ($2 eq "" or $2 eq "NMR") {$resol=10;} else {$resol=$2;} $resol=$2; $line=~s/^(>.*) (\d+\.?\d*)\s*$/$1\n/; # remove resolution at the end of the name line $seq=$line; $len=0; } else {print("WARNING: nameline format not recognized: $line");} $nres=0; } else { $seq.=$line; $len+=length($line)-1; $nres+=($line=~tr/a-wyA-WY/a-wyA-WY/); } } if ($pdbid && !$len{$pdbid}) { if ($len<26) {$seq.=('X'x(26-$len))."\n";} # add 'X' to make $seq at least 26 resiudes long $seqs[$k++]="$len!$resol!$seq"; } close(INFILE); if ($v>=2) {printf("Read %i sequences from $infile ... \n",$k-$k0);} return $k; } # Print all sequences that have not been excluded so far to file, beginning at index $k sub PrintSequences() { open (OUTFILE,">$_[0]") || die("Error: could not write to $_[0]: $!\n"); for (my $k=$_[1]; $k<@seqs; $k++) { $seqs[$k]=~/>(\S+)/o; if (!$excluded{$1}) { my $seq=$seqs[$k]; if ($similar{$1} ne "") { my $pdbs=$similar{$1}; # Limit number of represented pdbs to 30 if ($pdbs =~/^(\s+\S+){31,}/) {$pdbs =~/^((?:\s+\S+){30})/; $pdbs=$1." ...";} if ($pdbs=~/^\s*\d\S{3,5}/) { $seq=~s/^(.*)/$1 PDB:$pdbs/; } elsif ($pdbs=~/^\s*[a-z]\d\S{5}/) { $seq=~s/^(.*)/$1 SCOPIDS:$pdbs/; } } print(OUTFILE $seq); } } close(OUTFILE); } sub SortByResolution() { my $aa; my $bb; if ($a!~/^\d+!(\S+)!>/o) { $a=~/^(\S+)/o; printf("Error: sequence %s does not contain resolution in right format\n",$1); return 0; } $aa=$1; if ($b!~/^\d+!(\S+)!>/o) { $b=~/^(\S+)/o; printf("Error: sequence %s does not contain resolution in right format\n",$1); return 0; } $bb=$1; return ($aa<=>$bb); } sub SortByFamily() { my $aa; my $bb; if ($a!~/^>\S+ (\S+)/) { $a=~/^(\S+)/o; printf("Error: sequence %s does not contain resolution in right format\n",$1); return 0; } $aa=$1; if ($b!~/^>\S+ (\S+)/) { $b=~/^(\S+)/o; printf("Error: sequence %s does not contain resolution in right format\n",$1); return 0; } $bb=$1; return ($aa cmp $bb); } sub System() { $v--; &HHPaths::System($_[0]); $v++; }