pdbfilter.pl 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #! /usr/bin/perl
  2. # pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile
  3. #
  4. # HHsuite version 3.0.0 (15-03-2015)
  5. #
  6. # Reference:
  7. # Remmert M., Biegert A., Hauser A., and Soding J.
  8. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  9. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  10. # (C) Johannes Soeding, 2012
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. # This program is distributed in the hope that it will be useful,
  16. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. # GNU General Public License for more details.
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. # We are very grateful for bug reports! Please contact us at [email protected]
  22. use lib $ENV{"HHLIB"}."/scripts";
  23. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  24. use strict;
  25. $|= 1; # Activate autoflushing on STDOUT
  26. # Default options
  27. my $idmax=90; # maximum sequence identity threshold
  28. my $Evalmin=0.01; # minimum BLAST E-value (should be <0.01 to ensure that sequences being filtered out are homologous to some representative sequence
  29. my $covmin=90; # minimum coverage threshold (should be large enough to ensure that no structural domain is lost from the filtered set just because it occurs in a sequence with, say, 2 other domains similar to some representative sequence)
  30. my $v=2;
  31. my $blastpgp="$ncbidir/blastpgp";
  32. my $help="
  33. pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of
  34. sequences to outfile
  35. Compares each sequence with all others using BLAST (blastpgp). If two sequences A and B are
  36. sufficiently similar, the sequence with lower resolution will be removed.
  37. The exact criterion for removal of sequence B is:
  38. IF more than \$covmin\% residues of sequence B are aligned to sequence A
  39. AND the sequence identity of the alignment is larger than \$idmin
  40. AND the E-value is better than \$Evalmin
  41. AND sequence B has lower resolution than A.
  42. The input file must have been prepared with pdb2fasta.pl or scop2fasta.pl.
  43. Sequences with fewer than 15 residues are suppressed.
  44. Usage: pdbfilter.pl infile filtered_file [-u old_filtered_file] [-id int] [-cov int]
  45. Options
  46. -id <int> maximum sequence identity between representative sequences (default=$idmax)
  47. -e <float> minimum E-value between representative sequences (default=$Evalmin)
  48. -cov <int> minimum coverage of a sequence with a similar one to get thrown out (default=$covmin)
  49. -u <file> update the old filtered file; this saves a lot of execution time
  50. -v <int> verbose mode
  51. Example: pdbfilter.pl pdb.fas pdb70.fas -u pdb70.fas -id 70 -cov 90\n
  52. ";
  53. if (@ARGV<2) {print($help); exit;}
  54. my $infile;
  55. my $outfile;
  56. my $oldfile;
  57. my $root=""; # $outfile without extension
  58. my $line;
  59. my $pdbid=""; # e.g. 1ahs_C
  60. my $qpdbid; # pdbid of query sequence
  61. my $seq=""; # sequence record (name+residues) in FASTA
  62. my @seqs; # sequence records as they were read
  63. my $len=0; # length of sequence to be read in
  64. my %len; # $len{$pdbid} is length of sequence
  65. my $resol; # experimental resolution in Angstroem
  66. my %resol; # experimental resolution in Angstroem
  67. my %excluded; # representative sequences are all those not in this hash
  68. my %accepted; # representative sequences accpeted so far
  69. my %similar; # $similar{$pdbid} contains list of all pdbids (SCOPIDs) that are represented by (i.e. similar to) $qpdbid
  70. my %het; # $het{$pdbid} is "*" if sequence $pdbid has a HET: record (i.e. contains HET group with >=10 atoms), "" otherwise
  71. my $id; # sequence identity to query
  72. my $cov; # coverage
  73. my $Evalue; # BLAST E-value
  74. my $nold=0; # number of sequences in oldfile
  75. my $ntot=0; # total number of sequences in oldfile and infile
  76. my $k=0; # counts sequences read in so far
  77. my $sort_by_family=1; # set to 0 if at least one non-SCOP sequence found
  78. # Read command line options
  79. my $options="";
  80. for (my $i=0; $i<=$#ARGV; $i++) {$options.=" $ARGV[$i]";}
  81. if ($options=~s/ -id\s+(\S+)//) {$idmax=$1;}
  82. if ($options=~s/ -cov\s+(\S+)//) {$covmin=$1;}
  83. if ($options=~s/ -e\s+(\S+)//) {$Evalmin=$1;}
  84. if ($options=~s/ -u\s+(\S+)//) {$oldfile=$1;}
  85. if ($options=~s/ -v\s*(\d+)//) {$v=$1;}
  86. if ($options=~s/ -v//) {$v=2;}
  87. if (!$infile && $options=~s/^\s*([^- ]\S+)\s*//) {$infile=$1;}
  88. if (!$outfile && $options=~s/^\s*([^- ]\S+)\s*//) {$outfile=$1;}
  89. # Warn if unknown options found or no infile/outfile
  90. if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
  91. if (!$infile) {print($help); print("Error: no input file given\n"); exit;}
  92. if (!$outfile) {print($help); print("Error: no output file given\n"); exit;}
  93. $covmin*=0.01;
  94. if ($outfile=~/^(\S*?)\.(.*)$/) {$root=$1;} else {$root=$outfile;}
  95. # Read sequences from oldfile
  96. if ($oldfile) {$nold=&ReadSequences($oldfile,0);}
  97. # Read NEW sequences from infile (the ones that are not yet in oldfile)
  98. $ntot=&ReadSequences($infile,$nold);
  99. # Sort by resolution
  100. @seqs=sort SortByResolution @seqs;
  101. #foreach $seq (@seqs) {print("$seq");}
  102. # Record resolution and length of all sequences in hashes
  103. for ($k=0; $k<@seqs; $k++) {
  104. $seqs[$k]=~s/^(\S+?)!(\S+?)!>(\S+)(.*)/>$3$4/o;
  105. $len{$3}=$1;
  106. $resol{$3}=$2;
  107. my $pdbid=$3;
  108. if ($seqs[$k]=~s/ PDB:\s*(( \d\S{3,5}\*?)+)//) {
  109. $similar{$pdbid}=$1;
  110. } elsif ($seqs[$k]=~s/ SCOPID:\s*(( [a-z]\d\S{5}\*?)+)//) {
  111. $similar{$pdbid}=$1;
  112. } else {
  113. $similar{$pdbid}="";
  114. }
  115. if ($seqs[$k]=~/ HET:/) {
  116. $het{$pdbid}="*";
  117. } else {
  118. $het{$pdbid}="";
  119. }
  120. }
  121. # Format BLAST database, initialize
  122. if ($v>=2) {printf("Doing PSI-BLAST runs\n");}
  123. &PrintSequences("$root.db",$nold);
  124. &System("$ncbidir/formatdb -i $root.db");
  125. my $nacc=0; # number of accepted sequences = number of BLAST searches already performed
  126. my $done=0; # number of sequences already processed
  127. my $nexcl=0; # number of already excluded chains
  128. my $step=($ntot>1000)*0.05+($ntot<=1000)*0.1;
  129. my $reformat=$step; # when X% of the total number of sequences have been excluded, reformat database
  130. ####################################################################################################
  131. # For each sequence in infile, do BLAST search and exclude hits that are too similar to query
  132. foreach $seq (@seqs) {
  133. $seq=~/^>(\S+)/o;
  134. $done++;
  135. if($excluded{$1}) {next;}
  136. $qpdbid=$1;
  137. $resol=$resol{$1};
  138. $len=$len{$1};
  139. if ($v>=2 && !($nacc%100)) {
  140. printf("\nSearches:%-4i Done:%3i%% DB-size:%3i%% ",$nacc,100*($nexcl+$nacc)/$ntot+0.5,100*($ntot-$nexcl)/$ntot+0.5);
  141. }
  142. # When a substantial number of sequences have been excluded, reformat database WITHOUT excluded sequences
  143. if (!$oldfile && ($nexcl+$nacc)/$ntot>$reformat) {
  144. if ($v>=2) {printf("\b\b\b%2i%%",100*$reformat+0.5);}
  145. elsif ($v>=3) {
  146. printf("\nReformatting search database containing %i out of %i sequences\n",$ntot-$nexcl,$ntot);}
  147. # Write updated database with all seqs with index >=$done that have not yet been excluded
  148. # (Don't have to compare A->B and B->A, hence exclude seqs with index <$done)
  149. &PrintSequences("$root.db",$done);
  150. &System("$ncbidir/formatdb -i $root.db");
  151. $reformat+=$step;
  152. }
  153. open (TMPFILE,">$root.tmp") || die ("ERROR: cannot open $root.tmp for writing: $!\n");
  154. print(TMPFILE $seq);
  155. close(TMPFILE);
  156. # Find hits that are too similar
  157. if ($v>=3) {print("\n$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot");}
  158. open(BLAST,"$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|");
  159. while ($line=<BLAST>){
  160. if ($line=~/^>(\S+)/o && $1 ne $qpdbid && !$excluded{$1} && !$accepted{$1}) {
  161. $pdbid=$1;
  162. while ($line=<BLAST>){if ($line=~/^ Score = /o) {last;}}
  163. $line=~/ Expect =\s*(\S+)/ or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n");
  164. $Evalue=$1;
  165. $Evalue=~s/^e/1e/;
  166. $Evalue=~s/,$//;
  167. $line=<BLAST>;
  168. $line=~/^ Identities =\s*\d+\/(\d+)\s+\((\d+)\%\)/o or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n");
  169. $len=$1;
  170. $id=$2;
  171. # Coverage = (length of whole alignment (including gaps) - gaps in query or HSP) / total length of matched sequence
  172. if ($line=~/Gaps =\s*(\d+)\/\d+/) {$cov=($len-$1)/$len{$pdbid};} else {$cov=$len/$len{$pdbid};}
  173. ## Main filtering criterion: remove sequence from representative set if...
  174. if ($id>=$idmax && $Evalue<=$Evalmin && $cov>=$covmin && $resol<=$resol{$pdbid}) {
  175. $excluded{$pdbid}=1;
  176. # if ($similar{$qpdbid} ne "") {$similar{$qpdbid}.=",";} # separate pdbids with identical sequences with ','
  177. if (!$similar{$qpdbid}) {$similar{$qpdbid}="";}
  178. $similar{$qpdbid}.=" ".$pdbid.$het{$pdbid}.$similar{$pdbid};
  179. $nexcl++;
  180. if ($v>=4) {
  181. print($line);
  182. printf("Rep: $qpdbid excl: $pdbid (len=%3i cov=%3.0f id=$1)\n",$len{$pdbid},100*$cov);
  183. }
  184. }
  185. }
  186. }
  187. close(BLAST);
  188. $nacc++;
  189. $accepted{$qpdbid}=1;
  190. if ($v>=2) {print(".");}
  191. }
  192. if ($v>=2) {print("\n");}
  193. ####################################################################################################
  194. if ($sort_by_family) {@seqs=sort SortByFamily @seqs;}
  195. # Print out all representative sequences to outfile
  196. &PrintSequences($outfile,0);
  197. if ($v>=2) {
  198. printf("Written %i out of %i sequences to $outfile\n",$ntot-$nexcl,$ntot);
  199. }
  200. unlink("$root.tmp");
  201. unlink(glob("$root.db*"));
  202. exit;
  203. # Read sequences in infile beginning at index $k
  204. sub ReadSequences()
  205. {
  206. my $infile=$_[0];
  207. my $k=$_[1];
  208. my $k0=$k;
  209. my $nres=0; # skip sequences with fewer than 15 real residues
  210. if ($v>=3) {printf("Reading $infile ... \n");}
  211. open (INFILE,"<$infile") || die ("ERROR: cannot open $infile for reading: $!\n");
  212. while ($line=<INFILE>) {
  213. if ($line=~/^>/) {
  214. if ($pdbid && !$len{$pdbid} && $nres>=15) {
  215. if ($len<26) {chomp $seq; $seq.=('x'x(26-$len))."\n";} # add 'x' to make $seq at least 26 resiudes long
  216. $seqs[$k++]="$len!$resol!$seq";
  217. }
  218. # Read pdbid (or SCOP id) and resolution from name line
  219. if ($line=~/^>(\S{4,6}) .*; (\d+\.?\d*|NMR)A? \{/o) {
  220. # PDB sequence
  221. $pdbid=$1;
  222. if ($2 eq "NMR") {$resol=10;} else {$resol=$2;}
  223. $seq=$line;
  224. $len=$nres=0;
  225. $sort_by_family=0;
  226. } elsif ($line=~/^>([a-z]\S{6}) .* (\d+\.?\d*)\s*$/) {
  227. # SCOP sequence
  228. $pdbid=$1;
  229. if ($2 eq "" or $2 eq "NMR") {$resol=10;} else {$resol=$2;}
  230. $resol=$2;
  231. $line=~s/^(>.*) (\d+\.?\d*)\s*$/$1\n/; # remove resolution at the end of the name line
  232. $seq=$line;
  233. $len=0;
  234. } else {print("WARNING: nameline format not recognized: $line");}
  235. $nres=0;
  236. } else {
  237. $seq.=$line;
  238. $len+=length($line)-1;
  239. $nres+=($line=~tr/a-wyA-WY/a-wyA-WY/);
  240. }
  241. }
  242. if ($pdbid && !$len{$pdbid}) {
  243. if ($len<26) {$seq.=('X'x(26-$len))."\n";} # add 'X' to make $seq at least 26 resiudes long
  244. $seqs[$k++]="$len!$resol!$seq";
  245. }
  246. close(INFILE);
  247. if ($v>=2) {printf("Read %i sequences from $infile ... \n",$k-$k0);}
  248. return $k;
  249. }
  250. # Print all sequences that have not been excluded so far to file, beginning at index $k
  251. sub PrintSequences()
  252. {
  253. open (OUTFILE,">$_[0]") || die("Error: could not write to $_[0]: $!\n");
  254. for (my $k=$_[1]; $k<@seqs; $k++) {
  255. $seqs[$k]=~/>(\S+)/o;
  256. if (!$excluded{$1}) {
  257. my $seq=$seqs[$k];
  258. if ($similar{$1} ne "") {
  259. my $pdbs=$similar{$1};
  260. # Limit number of represented pdbs to 30
  261. if ($pdbs =~/^(\s+\S+){31,}/) {$pdbs =~/^((?:\s+\S+){30})/; $pdbs=$1." ...";}
  262. if ($pdbs=~/^\s*\d\S{3,5}/) {
  263. $seq=~s/^(.*)/$1 PDB:$pdbs/;
  264. } elsif ($pdbs=~/^\s*[a-z]\d\S{5}/) {
  265. $seq=~s/^(.*)/$1 SCOPIDS:$pdbs/;
  266. }
  267. }
  268. print(OUTFILE $seq);
  269. }
  270. }
  271. close(OUTFILE);
  272. }
  273. sub SortByResolution() {
  274. my $aa;
  275. my $bb;
  276. if ($a!~/^\d+!(\S+)!>/o) {
  277. $a=~/^(\S+)/o;
  278. printf("Error: sequence %s does not contain resolution in right format\n",$1);
  279. return 0;
  280. }
  281. $aa=$1;
  282. if ($b!~/^\d+!(\S+)!>/o) {
  283. $b=~/^(\S+)/o;
  284. printf("Error: sequence %s does not contain resolution in right format\n",$1);
  285. return 0;
  286. }
  287. $bb=$1;
  288. return ($aa<=>$bb);
  289. }
  290. sub SortByFamily() {
  291. my $aa;
  292. my $bb;
  293. if ($a!~/^>\S+ (\S+)/) {
  294. $a=~/^(\S+)/o;
  295. printf("Error: sequence %s does not contain resolution in right format\n",$1);
  296. return 0;
  297. }
  298. $aa=$1;
  299. if ($b!~/^>\S+ (\S+)/) {
  300. $b=~/^(\S+)/o;
  301. printf("Error: sequence %s does not contain resolution in right format\n",$1);
  302. return 0;
  303. }
  304. $bb=$1;
  305. return ($aa cmp $bb);
  306. }
  307. sub System() {
  308. $v--;
  309. &HHPaths::System($_[0]);
  310. $v++;
  311. }