pdb2fasta.pl 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944
  1. #! /usr/bin/perl
  2. #
  3. # pdb2fasta.pl - generate FASTA nonredundant sequence file from SEQRES records of globbed pdb files.
  4. # HHsuite version 3.0.0 (15-03-2015)
  5. #
  6. # Reference:
  7. # Remmert M., Biegert A., Hauser A., and Soding J.
  8. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
  9. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
  10. # (C) Johannes Soeding, 2012
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. # This program is distributed in the hope that it will be useful,
  16. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. # GNU General Public License for more details.
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. # We are very grateful for bug reports! Please contact us at [email protected]
  22. use lib $ENV{"HHLIB"}."/scripts";
  23. use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
  24. use strict;
  25. $|= 1; # Activate autoflushing on STDOUT
  26. # Default parameters
  27. my $v=2;
  28. my $help="
  29. pdb2fasta.pl - Generate FASTA nonredundant sequence file from SEQRES records of
  30. globbed pdb files.
  31. For updating purposes, you can write only those sequences to pdb_new.fas that are not
  32. already contained in an old file by giving as third argument the old pdb.fas file.
  33. Usage: pdb2fasta.pl 'pdb-fileglob' pdb_newseqs.fas [options]
  34. Options:
  35. -u oldfile update: write only those sequences to pdb_new.fas that are not contained in oldfile
  36. -scop file read dir.cla.scop.txt_1.65 and list SCOP fold(s) in sequence name
  37. -dali dir read FoldIndex.html and domain_definitions.txt in DALI directory and list DALI fold(s)
  38. in sequence name
  39. -v int verbose mode
  40. -t MTH-YR use only structures released until the given month and year, e.g. APR-12 or SEP-99
  41. -all include all sequences instead of nonredundant set
  42. Examples:
  43. pdb2fasta.pl '*.ent' /data/pdbfas/pdb_20Apr2012.fas
  44. pdb2fasta.pl '*.pdb' pdb_new.fas -u pdb.fas -dali /data/dali -scop /data/scop/dir.cla.scop.txt_1.75
  45. \n";
  46. my $TOTLEN=160; # maximum length of name, description, and keywords
  47. my $DESCLEN=80; # maximum length of description
  48. if (@ARGV<2) {die($help);}
  49. my @pdbfiles;
  50. my $newseqfile;
  51. my $oldfile="";
  52. my $dalidir="";
  53. my $scopfile="";
  54. my $date="";
  55. my %months=("JAN"=>1,"FEB"=>2,"MAR"=>3,"APR"=>4,"MAY"=>5,"JUN"=>6,"JUL"=>7,"AUG"=>8,"SEP"=>9,"OCT"=>10,"NOV"=>11,"DEC"=>12);
  56. my %oldpdbids=(); # hash contains all pdbids in $oldfile
  57. our $pdbfile;
  58. my $pdbid; # four-letter PDB identifier, e.g. 1hz4
  59. my $resolution; # experimental resolution in Angstrom
  60. my $rvalue; # R-value
  61. my $free_rvalue; # free R-value
  62. my $molid=0; # molecule id (for multichain structures)
  63. my $length; # number of residues in a chain
  64. my @seqres=(); # three-letter code of chain currently read in
  65. my $seqres; #
  66. my %descript; # $descript{"A"} contains the description for chain A
  67. my $descript;
  68. my %organism; # $organism{"A"} contains the organism for chain A
  69. my $organism;
  70. my $organism_common;
  71. my @chain; # $chain[$molid]
  72. my $chain; # either A for chain A or "" if no chain id
  73. my @chains; #
  74. my @keywds; # keywords for the structure
  75. my $keywds; # keywords for the structure
  76. my $token;
  77. my $synonym; # read from COMPND SYNONYM records of pdb files
  78. my @synonyms; # read from COMPND SYNONYM records of pdb files
  79. my $line; # line read in from file
  80. my @sequences=(); # contains all sequences of chains to be printed to outfile
  81. my @resolution=(); # $resolution[$nc] contains resolution of $nc'th sequence, where $nc=$nchains{$seqres}
  82. my %nchains; # $nchains{$seqres} is index in @sequences of sequence with these residues
  83. my $nchains=0; # number of chains written to $newseqfile
  84. my $k=0; # counts pdb files processed
  85. my @equiv_pdbs; # list of pdbids (including _chain) with identical residues (maximum one pdbid_chain per pdb file)
  86. my %dalifamids=(); # $foldids{$pdbid} contains a list of (one or more) DALI or SCOP foldids
  87. my %scopfamids=(); # $foldids{$pdbid} contains a list of (one or more) DALI or SCOP foldids
  88. my $het; # $het contains list of hetero ligands with at least 10 atoms in current pdb file (e.g. "DAC,PTR")
  89. my %words; # for debugging upper case -> lower case
  90. my $nr=1; # 1: create nonredundant set 0:do not eliminate redundant sequences
  91. my %three2one=(
  92. "ALA"=>"A","VAL"=>"V","PHE"=>"F","PRO"=>"P","MET"=>"M","ILE"=>"I","LEU"=>"L","ASP"=>"D","GLU"=>"E","LYS"=>"K",
  93. "ARG"=>"R","SER"=>"S","THR"=>"T","TYR"=>"Y","HIS"=>"H","CYS"=>"C","ASN"=>"N","GLN"=>"Q","TRP"=>"W","GLY"=>"G",
  94. "2AS"=>"D","3AH"=>"H","5HP"=>"E","ACL"=>"R","AIB"=>"A","ALM"=>"A","ALO"=>"T","ALY"=>"K","ARM"=>"R","ASA"=>"D",
  95. "ASB"=>"D","ASK"=>"D","ASL"=>"D","ASQ"=>"D","AYA"=>"A","BCS"=>"C","BHD"=>"D","BMT"=>"T","BNN"=>"A","BUC"=>"C",
  96. "BUG"=>"L","C5C"=>"C","C6C"=>"C","CCS"=>"C","CEA"=>"C","CHG"=>"A","CLE"=>"L","CME"=>"C","CSD"=>"A","CSO"=>"C",
  97. "CSP"=>"C","CSS"=>"C","CSW"=>"C","CXM"=>"M","CY1"=>"C","CY3"=>"C","CYG"=>"C","CYM"=>"C","CYQ"=>"C","DAH"=>"F",
  98. "DAL"=>"A","DAR"=>"R","DAS"=>"D","DCY"=>"C","DGL"=>"E","DGN"=>"Q","DHA"=>"A","DHI"=>"H","DIL"=>"I","DIV"=>"V",
  99. "DLE"=>"L","DLY"=>"K","DNP"=>"A","DPN"=>"F","DPR"=>"P","DSN"=>"S","DSP"=>"D","DTH"=>"T","DTR"=>"W","DTY"=>"Y",
  100. "DVA"=>"V","EFC"=>"C","FLA"=>"A","FME"=>"M","GGL"=>"E","GLZ"=>"G","GMA"=>"E","GSC"=>"G","HAC"=>"A","HAR"=>"R",
  101. "HIC"=>"H","HIP"=>"H","HMR"=>"R","HPQ"=>"F","HTR"=>"W","HYP"=>"P","IIL"=>"I","IYR"=>"Y","KCX"=>"K","LLP"=>"K",
  102. "LLY"=>"K","LTR"=>"W","LYM"=>"K","LYZ"=>"K","MAA"=>"A","MEN"=>"N","MHS"=>"H","MIS"=>"S","MLE"=>"L","MPQ"=>"G",
  103. "MSA"=>"G","MSE"=>"M","MVA"=>"V","NEM"=>"H","NEP"=>"H","NLE"=>"L","NLN"=>"L","NLP"=>"L","NMC"=>"G","OAS"=>"S",
  104. "OCS"=>"C","OMT"=>"M","PAQ"=>"Y","PCA"=>"E","PEC"=>"C","PHI"=>"F","PHL"=>"F","PR3"=>"C","PRR"=>"A","PTR"=>"Y",
  105. "SAC"=>"S","SAR"=>"G","SCH"=>"C","SCS"=>"C","SCY"=>"C","SEL"=>"S","SEP"=>"S","SET"=>"S","SHC"=>"C","SHR"=>"K",
  106. "SOC"=>"C","STY"=>"Y","SVA"=>"S","TIH"=>"A","TPL"=>"W","TPO"=>"T","TPQ"=>"A","TRG"=>"K","TRO"=>"W","TYB"=>"Y",
  107. "TYQ"=>"Y","TYS"=>"Y","TYY"=>"Y","AGM"=>"R","GL3"=>"G","SMC"=>"C","ASX"=>"B","CGU"=>"E","CSX"=>"C","GLX"=>"Z",
  108. "LED"=>"L"
  109. );
  110. # Read command line options
  111. my $options="";
  112. for (my $i=0; $i<=$#ARGV; $i++) {$options.=" $ARGV[$i]";}
  113. if ($options=~s/ -u\s+(\S+)//) {$oldfile=$1;}
  114. if ($options=~s/ -dali\s+(\S+)//) {$dalidir=$1;}
  115. if ($options=~s/ -scop\s+(\S+)//) {$scopfile=$1;}
  116. if ($options=~s/ -v\s*(\d+)//) {$v=$1;}
  117. if ($options=~s/ -v//) {$v=2;}
  118. if ($options=~s/ -all//) {$nr=0;}
  119. if ($options=~s/ -t (\w\w\w)-(\d\d)//) {$date=($months{$1}-1)/12+$2+100*($2<50);}
  120. if ($options=~s/^\s*([^- ]\S+)\s*//) {
  121. if ($v>=2) {print("Globbing...")};
  122. @pdbfiles=glob($1);
  123. if ($v>=2) {print(" found ".scalar(@pdbfiles)." files\n")};
  124. }
  125. if ($options=~s/^\s*([^- ]\S+)\s*//) {$newseqfile=$1;}
  126. # Warn if unknown options found or no infile/newseqfile
  127. if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
  128. if (!@pdbfiles) {print($help); print("Error: no input files given\n"); exit;}
  129. if (!$newseqfile) {print($help); print("Error: no output file given\n"); exit;}
  130. # Updating option?
  131. if ($oldfile) {
  132. # Reading pdb codes from $oldfile
  133. if ($v>=3) {printf("Reading pdb codes from $oldfile ... \n");}
  134. open (OLDFILE,"<$oldfile") || die ("ERROR: cannot open $oldfile for writing: $!\n");
  135. while ($line=<OLDFILE>) {
  136. if ($line=~/^>(\S\S\S\S)/o) {$oldpdbids{$1}=1;}
  137. }
  138. close(OLDFILE);
  139. }
  140. # Add fold identifiers?
  141. if ($dalidir) {&ReadDaliFiles();}
  142. if ($scopfile) {&ReadScopFile();}
  143. ############################################################################################
  144. # Read one pdb file after the other
  145. foreach $pdbfile (@pdbfiles) {
  146. $k++;
  147. if ($pdbfile=~/^.*\/(.*?)$/) {$pdbid=$1;} else {$pdbid=$pdbfile;} # remove path
  148. if ($pdbid=~/^(pdb)?(.*)\..*$/) {$pdbid=lc($2);} else {die("Error: globbed file $pdbfile has no extension\n");}
  149. if (exists $oldpdbids{$pdbid}) {next;}
  150. if ($v>=1) {
  151. print(".");
  152. if (!($k%100)) {print("$k\n");}
  153. } elsif ($v>=2) {printf("Reading %4i %s\n",$k,$pdbfile);}
  154. open (PDBFILE, "<$pdbfile") || die ("Error: couldn't open $pdbfile: $!\n");
  155. if ($v>=4) {print("Reading $pdbfile...\n");}
  156. $line=<PDBFILE>;
  157. # Initialize before reading new pdb file
  158. $resolution=0;
  159. $rvalue=0;
  160. $free_rvalue=0;
  161. $molid=0;
  162. $token="MOLECULE";
  163. $descript="";
  164. $organism="";
  165. $organism_common="";
  166. $keywds="";
  167. $chain="";
  168. %organism=();
  169. %descript=();
  170. @keywds=();
  171. @chain=();
  172. @seqres=();
  173. $synonym="";
  174. @synonyms=();
  175. $het=""; # will contain list of hetero groups (if found)
  176. # COMPND ASPARTATE AMINOTRANSFERASE (E.C.2.6.1.1) WILD TYPE COMPLEXED 1ASA 3
  177. # COMPND 2 WITH PYRIDOXAL-5'-PHOSPHATE AND MALEATE 1ASA 4
  178. # or
  179. # COMPND MOL_ID: 1;
  180. # COMPND 2 MOLECULE: ACTIN-LIKE PROTEIN 3;
  181. # COMPND 3 CHAIN: A;
  182. # COMPND 4 SYNONYM: ARP3; ACTIN-RELATED PROTEIN 3; ACTIN-2;
  183. # COMPND 5 OTHER_DETAILS: PART OF THE ARP2/3 COMPLEX;
  184. # COMPND 6 MOL_ID: 2;
  185. # COMPND 7 MOLECULE: ACTIN-LIKE PROTEIN 2;
  186. # COMPND 8 CHAIN: B;
  187. # COMPND 9 SYNONYM: ARP2; ACTIN-RELATED PROTEIN 2;
  188. # COMPND 10 OTHER_DETAILS: PART OF THE ARP2/3 COMPLEX;
  189. # or
  190. # COMPND MOL_ID: 1;
  191. # COMPND 2 MOLECULE: PHOSPHATE SYSTEM POSITIVE REGULATORY PROTEIN
  192. # COMPND 3 PHO4;
  193. # COMPND 4 CHAIN: A, B;
  194. # COMPND 5 FRAGMENT: DNA BINDING DOMAIN;
  195. # COMPND 6 SYNONYM: BHLH;
  196. # COMPND 7 ENGINEERED: YES;
  197. # COMPND 8 BIOLOGICAL_UNIT: DIMER;
  198. while ($line && $line!~/^COMPND /o && $line!~/^REMARK /o) {$line=<PDBFILE>;}
  199. if (!$line || $line!~/^COMPND /)
  200. {
  201. if ($v>=2) {print("\n\nWarning: no COMPND line found in $pdbfile; skipping pdb file\n");}
  202. next;
  203. }
  204. while ($line && $line=~/^COMPND /o && $line!~/^REMARK /o) {
  205. $line=~s/^(.{70}).*/$1/;
  206. if ($line=~/^COMPND\s+\d*\s+MOL_ID:\s*(\d+)/) {
  207. if ($molid>0) {
  208. &SetDescript();
  209. } else {$molid=$1;}
  210. $descript="";
  211. $synonym="";
  212. }
  213. elsif ($line=~/^COMPND\s+\d*\s+MOLECULE:\s*(.*\S)/) {$descript=$1; $token="MOLECULE";}
  214. elsif ($line=~/^COMPND\s+\d*\s+CHAIN:\s*(.*\S)/) {$chain=$1; $token="CHAIN";}
  215. elsif ($line=~/^COMPND\s+\d*\s+SYNONYM:\s*(.*\S)/) {$synonym=$1;}
  216. elsif ($line=~/^COMPND\s+\d*\s+FRAGMENT:/) {$token="";}
  217. elsif ($line=~/^COMPND\s+\d*\s+EC:/) {$token="";}
  218. elsif ($line=~/^COMPND\s+\d*\s+ENGINEERED:/) {$token="";}
  219. elsif ($line=~/^COMPND\s+\d*\s+MUTATION:/) {$token="";}
  220. elsif ($line=~/^COMPND\s+\d*\s+BIOLOGICAL UNIT:/) {$token="";}
  221. elsif ($line=~/^COMPND\s+\d*\s+OTHER_DETAILS:/) {$token="";}
  222. else {
  223. $line=~/^COMPND\s+\d*\s+(.*\S)/;
  224. if ($token eq "MOLECULE") {$descript.=" ".$1;}
  225. elsif ($token eq "SYNONYM") {$synonym.=" ".$1;}
  226. elsif ($token eq "CHAIN") {$chain.=" ".$1;}
  227. }
  228. $line=<PDBFILE>;
  229. }
  230. &SetDescript();
  231. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  232. # SOURCE (ESCHERICHIA COLI) 1ASA 5
  233. # or
  234. # SOURCE MOL_ID: 1;
  235. # SOURCE 2 ORGANISM_SCIENTIFIC: BOS TAURUS;
  236. # SOURCE 3 ORGANISM_COMMON: BOVINE;
  237. # SOURCE 4 ORGAN: THYMUS;
  238. # SOURCE 5 MOL_ID: 2;
  239. # SOURCE 6 ORGANISM_SCIENTIFIC: BOS TAURUS;
  240. # SOURCE 7 ORGANISM_COMMON: BOVINE;
  241. # SOURCE 8 ORGAN: THYMUS;
  242. $molid=0;
  243. $token="ORGANISM";
  244. # $organism="Synthetic?";
  245. while ($line && $line!~/^SOURCE /o && $line!~/^REMARK /o) {$line=<PDBFILE>;}
  246. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  247. if ($v>=2 && $line!~/^SOURCE /) {print("\n\nWarning: no SOURCE line found in $pdbfile\n");}
  248. while ($line=~/^SOURCE /o && $line!~/^REMARK /o) {
  249. $line=~s/^(.{70}).*/$1/;
  250. if ($line=~/^SOURCE\s+\d*\s+MOL_ID:\s*(\d+)/) {
  251. if ($molid>0) {
  252. &SetOrganism();
  253. } else {$molid=$1;}
  254. }
  255. elsif ($line=~/^SOURCE\s+\d*\s+ORGANISM_SCIENTIFIC:\s*(.*\S)/) {$organism=$1; $token="ORGANISM";}
  256. elsif ($line=~/^SOURCE\s+\d*\s+SYNTHETIC/) {if ($organism eq "") {$organism="Synthetic"; $token="";}}
  257. elsif ($line=~/^SOURCE\s+\d*\s+FRAGMENT:/) {$token="";}
  258. elsif ($line=~/^SOURCE\s+\d*\s+ORGANISM_COMMON:\s+(.*\S)/) {$organism_common=$1; $token="";}
  259. elsif ($line=~/^SOURCE\s+\d*\s+STRAIN:/) {$token="";}
  260. elsif ($line=~/^SOURCE\s+\d*\s+VARIANT:/) {$token="";}
  261. elsif ($line=~/^SOURCE\s+\d*\s+CELL_LINE:/) {$token="";}
  262. elsif ($line=~/^SOURCE\s+\d*\s+ATCC:/) {$token="";}
  263. elsif ($line=~/^SOURCE\s+\d*\s+ORGAN:/) {$token="";}
  264. elsif ($line=~/^SOURCE\s+\d*\s+TISSUE:/) {$token="";}
  265. elsif ($line=~/^SOURCE\s+\d*\s+CELL:/) {$token="";}
  266. elsif ($line=~/^SOURCE\s+\d*\s+ORGANELLE:/) {$token="";}
  267. elsif ($line=~/^SOURCE\s+\d*\s+SECRETION:/) {$token="";}
  268. elsif ($line=~/^SOURCE\s+\d*\s+CELLULAR_LOCATION:/) {$token="";}
  269. elsif ($line=~/^SOURCE\s+\d*\s+PLASMID:/) {$token="";}
  270. elsif ($line=~/^SOURCE\s+\d*\s+GENE:/) {$token="";}
  271. elsif ($line=~/^SOURCE\s+\d*\s+EXPRESSION_/) {$token="";}
  272. elsif ($line=~/^SOURCE\s+\d*\s+OTHER_DETAILS:/) {$token="";}
  273. else {
  274. $line=~/^SOURCE\s+\d*\s+(.*\S)/;
  275. if ($token eq "ORGANISM") {$organism.=$1;}
  276. }
  277. $line=<PDBFILE>;
  278. }
  279. &SetOrganism();
  280. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  281. # KEYWDS KETOLISOMERASE, XYLOSE METABOLISM, GLUCOSE-FRUCTOSE
  282. # KEYWDS 2 INTERCONVERSION, HYDRIDE TRANSFER, ALPHA-BETA BARREL,
  283. # KEYWDS 3 METALLOENZYME, THERMOPHILE
  284. while ($line && $line!~/^KEYWDS /o && $line!~/^REMARK /o) {$line=<PDBFILE>;}
  285. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  286. while ($line=~/^KEYWDS /o) {
  287. $line=~s/^(.{70}).*/$1/;
  288. if ($line=~/^KEYWDS\s+\d*\s+(.*\S)/) {$keywds.=" ".$1;}
  289. $line=<PDBFILE>;
  290. }
  291. $keywds=~s/CRYSTAL STRUCTURE,?\s*//i;
  292. $keywds=~s/X-RAY STRUCTURE,?\s*//i;
  293. $keywds=~s/THREE-DIMENSIONAL STR.CTURE,?\s*//i;
  294. $keywds=~s/NMR,?\s*//i;
  295. if ($keywds) {
  296. $keywds=~s/\s+/ /g;
  297. $keywds=~s/^\s+/ /;
  298. $keywds=~s/\s*;?$/;/;
  299. @keywds=split(/[,;]\s+/,$keywds);
  300. } else {@keywds=();}
  301. # Include keywords up to 50 chars taken
  302. my @these_keywds=@keywds;
  303. if (@these_keywds) {
  304. # Remove keywords that are substring of description or organism
  305. for (my $k=0; $k<@these_keywds; $k++) {
  306. $keywds=$these_keywds[$k];
  307. my $ddescript=$descript;
  308. my $kkeywds=$keywds;
  309. my $oorganism=$organism;
  310. $ddescript=~tr/a-zA-Z//cd;
  311. $kkeywds=~tr/a-zA-Z//cd;
  312. $oorganism=~tr/a-zA-Z//cd;
  313. if ($ddescript=~/$kkeywds/i) {splice(@these_keywds,$k,1); $k--;}
  314. elsif ($kkeywds=~/$ddescript/i) {splice(@these_keywds,$k,1); $k--;}
  315. elsif ($oorganism=~/$kkeywds/i) {splice(@these_keywds,$k,1); $k--;}
  316. }
  317. # Add keywords until length limitation is exceeded ($TOTLEN chars)
  318. if (@these_keywds) {
  319. $keywds=$these_keywds[0];
  320. for (my $k=1; $k<@these_keywds && length($descript.$keywds.", ".$these_keywds[$k].$organism)<$TOTLEN; $k++) {
  321. $keywds.=", ".$these_keywds[$k];
  322. }
  323. $keywds=~s/^(\S)/ $1/; # add space at first position
  324. } else {$keywds="";}
  325. } else {$keywds="";}
  326. foreach my $chain (keys(%descript)) {$descript{$chain}.=$keywds;}
  327. # Check date?
  328. if ($date) {
  329. while ($line && $line!~/^REVDAT 1/o && $line!~/^REMARK /o) {$line=<PDBFILE>;}
  330. if ($line=~/^REVDAT 1/) {
  331. if ($line=~/^.{16}(\w\w\w)-(\d\d)/) {
  332. my $thisdate=($months{$1}-1)/12+$2+100*($2<50);
  333. # print("This date: $thisdate date=$date\n$line");
  334. if ($thisdate>$date) {next;}
  335. } elsif ($v>=2) {
  336. print("WARNING: no valid date in header: \n$line");
  337. }
  338. }
  339. }
  340. # REMARK 2 RESOLUTION. 2.00 ANGSTROMS.
  341. while ($line && $line!~/^REMARK 2 /o && $line!~/^SEQRES /o) {$line=<PDBFILE>;}
  342. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  343. if ($v>=2 && $line!~/^REMARK 2 /) {print("\n\nWarning: no REMARK 2 line found in $pdbfile\n");}
  344. while ($line=~/^REMARK 2 /o && $line!~/^SEQRES /o) {
  345. if ($line=~/^REMARK 2\s+RESOLUTION\.\s+(\d+\.?\d*)/) {$resolution=$1; last;}
  346. $line=<PDBFILE>;
  347. }
  348. # REMARK 3 R VALUE (WORKING SET) : 0.216
  349. # REMARK 3 FREE R VALUE : 0.251
  350. while ($line && $line!~/^REMARK 3 /o && $line!~/^SEQRES /o) {$line=<PDBFILE>;}
  351. if (!$line) {if ($v>=2) {print("\nFormat error in $pdbfile. Skipping file ...\n");} next;}
  352. if ($v>=2 && $line!~/^REMARK 3 /) {print("\n\nWarning: no REMARK 3 line found in $pdbfile\n");}
  353. while ($line=~/^REMARK 3 /o && $line!~/^SEQRES /o) {
  354. if ($line=~/^REMARK 3\s+R VALUE\s+\(WORKING SET\)\s*:\s*(\d+\.?\d*)/o) {$rvalue=$1;}
  355. if ($line=~/^REMARK 3\s+FREE R VALUE\s*:\s*(\d+\.?\d*)/o) {$free_rvalue=$1;}
  356. $line=<PDBFILE>;
  357. }
  358. # Record current position in PDBFILE
  359. my $file_pos = tell(PDBFILE);
  360. # Search for hetero groups BEFORE adding seqeunces => read pdb file twice :(
  361. while ($line && $line!~/^HET /o && $line!~/^ATOM /o) {$line=<PDBFILE>;}
  362. if (defined $line && $line=~/^HET /o) {
  363. while ($line) {
  364. # ----+----1----+----2----+----3
  365. # HET DAC A 172 18
  366. if ($line=~/^HET\s+(\S+) ..........\s*(\d+)/) {
  367. if ($2>=10 ) {
  368. my $this_het=$1;
  369. if ($het!~/$this_het/) { # don't list any hetgoup twice
  370. if ($het eq "") {$het=" HET: $1";} else {$het.=" $1";}
  371. }
  372. }
  373. } else {
  374. last;
  375. }
  376. $line=<PDBFILE>;
  377. }
  378. }
  379. if ($het ne "") {$het.=";";}
  380. # Rewind the current position in PDBFILE back before SEQRES records
  381. seek (PDBFILE,$file_pos,0);
  382. # SEQRES 1 396 MET PHE GLU ASN ILE THR ALA ALA PRO ALA ASP PRO ILE 1ASA 60
  383. # SEQRES 1 A 366 SER ARG MET PRO SER PRO PRO MET PRO VAL PRO PRO ALA
  384. @seqres=();
  385. my $newchain="@"; # make sure that sequence is not printed before first chain has been read
  386. my $newlength; # compare previous to current chain to find out when one chain is finished
  387. while ($line && $line!~/^SEQRES /o) {$line=<PDBFILE>;}
  388. if (!defined $line) {
  389. if ($v>=2) {print("\n\nWarning: no SEQRES line found in $pdbfile. Skipping file ...\n");}
  390. close (PDBFILE);
  391. next;
  392. }
  393. while ($line=~/^SEQRES /o) {
  394. if (length($line) >= 20) {
  395. $chain = $newchain;
  396. $newchain = substr($line, 10, 2);
  397. $newchain =~ /\s*(\S*)/; #if newchain=" " -> ""
  398. $newchain = $1;
  399. $length = $newlength;
  400. $newlength = substr($line, 13, 4);
  401. $seqres = substr($line, 19, 51);
  402. $seqres =~ s/\s*$//;
  403. # Compare previous to current chain to find out when one chain is finished
  404. if ($chain ne $newchain && $chain ne "@") {
  405. if (scalar(@seqres)!=$length) {
  406. if ($v>=2) {printf("\nWarning: in $pdbfile, line $., sequence length=$length, counted residues = %i\n",scalar(@seqres));}
  407. }
  408. &AddSequence();
  409. @seqres=();
  410. }
  411. push(@seqres,split(/\s+/,$seqres));
  412. } else {
  413. print("\nError: found invalid SEQRES record in $pdbfile, line $. : line=$line");
  414. }
  415. $line=<PDBFILE>;
  416. }
  417. $chain=$newchain;
  418. $length=$newlength;
  419. &AddSequence();
  420. close (PDBFILE);
  421. } # end foreach $pdbfile
  422. ############################################################################################
  423. # Print all sequences
  424. open (NEWSEQFILE,">$newseqfile") || die ("ERROR: cannot open $newseqfile for writing: $!\n");
  425. for (my $nc=0; $nc<@sequences; $nc++) {
  426. if ($equiv_pdbs[$nc] ne "") {
  427. $sequences[$nc]=~s/^(.*)/$1 PDB:$equiv_pdbs[$nc]/; # Add list of equivalent pdb codes
  428. }
  429. printf(NEWSEQFILE "%s",$sequences[$nc]);
  430. }
  431. close(NEWSEQFILE);
  432. foreach my $word (keys(%words)) {print("$word ");}
  433. print("\n");
  434. print("Written $nchains chains to $newseqfile\n");
  435. exit;
  436. ##################################################################################
  437. # Set description when a new MOL_ID line is found, or at the end of COMPND records
  438. ##################################################################################
  439. sub SetDescript() {
  440. my $i;
  441. # print("chain=$chain\n");
  442. if ($chain eq "NULL;") {$chain="";}
  443. $chain[$molid]=$chain;
  444. $molid=$1;
  445. if ($descript=~/^DNA[ ;]/) {$het=" HET: DNA";}
  446. # Cut description down to max. $DESCLEN letters
  447. $descript=~s/\s*;\s*$//;
  448. if (length($descript)>$DESCLEN) {
  449. $descript=~s/(.{$DESCLEN}\S*).*/$1.../; # remove everything after first comma
  450. }
  451. # Add synonyms with a maximum of 16 letters to description
  452. if ($synonym ne "") {
  453. @synonyms=split(/;\s+/,$synonym);
  454. for ($i=0; $i<scalar(@synonyms); $i++) {
  455. if (length($synonyms[$i])>16) {
  456. splice(@synonyms,$i,1);
  457. } else {
  458. $synonyms[$i]=~s/;\s*//;
  459. $token="SYNONYM";
  460. }
  461. }
  462. }
  463. # Choose shortname (<=16 characters) from synonyms
  464. unshift(@synonyms,$descript);
  465. for ($i=0; $i<scalar(@synonyms); $i++) {
  466. if (length($synonyms[$i])<=16) {last;}
  467. }
  468. if ($i>=scalar(@synonyms)) {$i=0;}
  469. $descript=$synonyms[$i];
  470. $descript=~s/^\s*//;
  471. $descript=~s/\s*$//;
  472. splice(@synonyms,$i,1);
  473. for $synonym (@synonyms) {$descript.=", ".$synonym;}
  474. if ($descript ne "") {
  475. $descript=~s/\s+/ /g;
  476. $descript=~s/^\s+//g;
  477. $descript=~s/\s*;*\s*$//; # remove ';'
  478. }
  479. $descript.=";"; # append a semicolon ';'
  480. if ($chain ne "") {
  481. $chain=~s/\s*;\s*$//;
  482. @chains=split(/[,; ]\s*/,$chain);
  483. foreach $chain (@chains) {
  484. $descript{$chain}=$descript;
  485. # printf("chain='$chain' description='$descript'\n");
  486. }
  487. } else {
  488. $descript{$chain}=$descript;
  489. # printf("chain='$chain' description='$descript'\n");
  490. }
  491. }
  492. ##################################################################################
  493. # Set organism when a new MOL_ID line is found, or at the end of SOURCE records
  494. ##################################################################################
  495. sub SetOrganism() {
  496. if ($organism eq "" && $organism_common ne "") {$organism=$organism_common;}
  497. if (!exists $chain[$molid]) {$molid=1-$molid;}
  498. $chain=$chain[$molid];
  499. $molid=$1;
  500. $organism=~tr/$//d;
  501. $organism=~s/^\s*([^;:]*).*/$1/;
  502. if ($organism=~/^\S*\s*\(([\w ]*)/) {$organism=$1;} # bovine (Bos taurus)
  503. elsif ($organism=~/^([\w -]+)/) {$organism=$1;} # BACTERIOPHAGE T4 (MUTANT GENE DERIVED ...)
  504. elsif ($organism=~/^(\S+\s+\S+)/) {$organism=$1;} # maximum two words
  505. elsif ($organism=~/^(\S+)/) {$organism=$1;}
  506. $organism=~s/\s*$//g;
  507. if ($chain ne "") {
  508. $chain=~s/\s*;\s*$//;
  509. @chains=split(/[,; ]\s*/,$chain);
  510. foreach $chain (@chains) {
  511. $organism{$chain}=$organism;
  512. # printf("chain='%s' organism{chain}='%s'\n",$chain,$organism{$chain});
  513. }
  514. } else {
  515. $organism{$chain}=$organism;
  516. # printf("chain='%s' organism{chain}='%s'\n",$chain,$organism{$chain});
  517. }
  518. }
  519. ##################################################################################
  520. # Print out sequence of last chain read in
  521. ##################################################################################
  522. sub AddSequence() {
  523. my $seqres="";
  524. my $pdbidchain;
  525. my $nc; # $nc= either next chain number OR, if identical seq exists with better resolution, index of this seq
  526. foreach my $aa (@seqres) {$seqres.=&Three2OneLetter($aa);}
  527. if ($v>=3) {
  528. printf("CHAIN ='%s'\n",$chain);
  529. printf("DESCRP='%s'\n",$descript{$chain});
  530. printf("KEYWDS='$keywds'\n");
  531. printf("ORGANI='%s'\n",$organism{$chain});
  532. printf("SEQRES='%s'\n",$seqres);
  533. }
  534. if (length($seqres)<=20) {return 1;} # skip short protein/DNA chains (for DNA's ADGT &Three2OneLetter() returns "")
  535. if ($chain ne "") {$pdbidchain=$pdbid."_".$chain;} else {$pdbidchain=$pdbid;}
  536. # Check for nonredundancy
  537. if ($nr==1 && defined $nchains{$seqres} ) {
  538. $nc=$nchains{$seqres};
  539. # $sequences[$nchains{$seqres}]=~/^(\S+)/;
  540. # print("Sequence $pdbid"."_$chain redundant with $1. Res now: $resolution Res before: $resolution[$nc]\n");
  541. if ($resolution==0 || $resolution[$nc]<=$resolution) {
  542. # PDB identifier not yet contained in list of equivalent pdb ids?
  543. if ($equiv_pdbs[$nc]!~/$pdbid/ && $sequences[$nc]!~/^>$pdbid/) {
  544. $equiv_pdbs[$nc].=" $pdbidchain"; # Add new pdbid_chain to list $equiv_pdbs[$nc]
  545. if ($het ne "") {$equiv_pdbs[$nc].="*";}
  546. }
  547. return 1;
  548. } else {
  549. # Sequence redundant
  550. # => Throw out earlier sequence and keep this one
  551. # => Keep list $equiv_pdbs[$nc] from earlier sequence and append its pdbid
  552. $sequences[$nc]=~/>(\S+)/;
  553. $equiv_pdbs[$nc].=" $1";
  554. if ($het ne "") {$equiv_pdbs[$nc].="*";}
  555. }
  556. } else {
  557. $nc=$nchains{$seqres}=$nchains;
  558. $nchains++;
  559. $equiv_pdbs[$nc]="";
  560. }
  561. $resolution[$nc]=$resolution;
  562. # If descript{chain} does not exist, it was not specified seperately for each chain
  563. if (exists $descript{$chain}) {$descript=$descript{$chain}}
  564. $descript=~s/;*$//; # remove ; at the end
  565. # If organism{chain} does not exist, it was not specified seperately for each chain
  566. if (exists $organism{$chain}) {
  567. $organism=lc($organism{$chain});
  568. } else {
  569. if($organism=~/\((.*)\)/) {$organism=$1;}
  570. $organism=lc($organism);
  571. }
  572. if ($v>=3) {
  573. printf("Accept:\n",$chain);
  574. printf("CHAIN ='%s'\n",$chain);
  575. printf("DESCRP='%s'\n",$descript);
  576. printf("KEYWDS='$keywds'\n");
  577. printf("ORGANI='%s'\n",$organism);
  578. printf("SEQRES='%s'\n",$seqres);
  579. }
  580. # Correct upper/lower case
  581. $descript=" $descript ";
  582. $descript=~s/([a-zA-Z]{5,})/\L$1/g; # make everything longer than 5 word characters lower case
  583. $descript=~s/([\s]OF[\s])/\L$1/g;
  584. $descript=~s/([\s]OR[\s])/\L$1/g;
  585. $descript=~s/([\s]ON[\s])/\L$1/g;
  586. $descript=~s/([\s]NO[\s])/\L$1/g;
  587. $descript=~s/([\s]IN[\s])/\L$1/g;
  588. $descript=~s/([\s]IS[\s])/\L$1/g;
  589. $descript=~s/([\s]BY[\s])/\L$1/g;
  590. $descript=~s/([\s]AT[\s])/\L$1/g;
  591. $descript=~s/([\s]TO[\s])/\L$1/g;
  592. $descript=~s/([ -]ALL[ -])/\L$1/g;
  593. $descript=~s/([ -]AND[ -])/\L$1/g;
  594. $descript=~s/([ -]ARM[ -])/\L$1/g;
  595. $descript=~s/([ -]BOX[ -])/\L$1/g;
  596. $descript=~s/([ -]BOX[ -])/\L$1/g;
  597. $descript=~s/([ -]EGG[ -])/\L$1/g;
  598. $descript=~s/([ -]EYE[ -])/\L$1/g;
  599. $descript=~s/([ -]FOR[ -])/\L$1/g;
  600. $descript=~s/([ -]HAS[ -])/\L$1/g;
  601. $descript=~s/([ -]HEN[ -])/\L$1/g;
  602. $descript=~s/([ -]HOT[ -])/\L$1/g;
  603. $descript=~s/([ -]LOW[ -])/\L$1/g;
  604. $descript=~s/([ -]MOL[ -])/\L$1/g;
  605. $descript=~s/([ -]NON[ -])/\L$1/g;
  606. $descript=~s/([ -]ONE[ -])/\L$1/g;
  607. $descript=~s/([\s]OUT[\s])/\L$1/g;
  608. $descript=~s/([ -]SEX[ -])/\L$1/g;
  609. $descript=~s/([ -]SIX[ -])/\L$1/g;
  610. $descript=~s/([ -]TEN[ -])/\L$1/g;
  611. $descript=~s/([\s]THE[\s])/\L$1/g;
  612. $descript=~s/([ -]TWO[ -])/\L$1/g;
  613. $descript=~s/([ -]WAY[ -])/\L$1/g;
  614. $descript=~s/([\W]ACID[\W])/\L$1/g;
  615. $descript=~s/([\W]ACYL[\W])/\L$1/g;
  616. $descript=~s/([\W]ALDO[\W])/\L$1/g;
  617. $descript=~s/([\W]ANTI[\W])/\L$1/g;
  618. $descript=~s/([\W]AUTO[\W])/\L$1/g;
  619. $descript=~s/([\W]AXIN[\W])/\L$1/g;
  620. $descript=~s/([\W]BASE[\W])/\L$1/g;
  621. $descript=~s/([\W]BEAN[\W])/\L$1/g;
  622. $descript=~s/([\W]BETA[\W])/\L$1/g;
  623. $descript=~s/([\W]BILE[\W])/\L$1/g;
  624. $descript=~s/([\W]BLUE[\W])/\L$1/g;
  625. $descript=~s/([\W]BONE[\W])/\L$1/g;
  626. $descript=~s/([\W]BOND[\W])/\L$1/g;
  627. $descript=~s/([\W]CELL[\W])/\L$1/g;
  628. $descript=~s/([\W]COAT[\W])/\L$1/g;
  629. $descript=~s/([\W]COIL[\W])/\L$1/g;
  630. $descript=~s/([\W]COLD[\W])/\L$1/g;
  631. $descript=~s/([\W]COLI[\W])/\L$1/g;
  632. $descript=~s/([\W]CORE[\W])/\L$1/g;
  633. $descript=~s/([\W]CRYO[\W])/\L$1/g;
  634. $descript=~s/([\W]DRUG[\W])/\L$1/g;
  635. $descript=~s/([\W]DUAL[\W])/\L$1/g;
  636. $descript=~s/([\W]DUCK[\W])/\L$1/g;
  637. $descript=~s/([\W]ENDO[\W])/\L$1/g;
  638. $descript=~s/([\W]FAST[\W])/\L$1/g;
  639. $descript=~s/([\W]FIVE[\W])/\L$1/g;
  640. $descript=~s/([\W]FOLD[\W])/\L$1/g;
  641. $descript=~s/([\W]FOOT[\W])/\L$1/g;
  642. $descript=~s/([\W]FORM[\W])/\L$1/g;
  643. $descript=~s/([\W]FOUR[\W])/\L$1/g;
  644. $descript=~s/([\W]FROM[\W])/\L$1/g;
  645. $descript=~s/([\W]FLAP[\W])/\L$1/g;
  646. $descript=~s/([\W]FREE[\W])/\L$1/g;
  647. $descript=~s/([\W]GENE[\W])/\L$1/g;
  648. $descript=~s/([\W]GOOD[\W])/\L$1/g;
  649. $descript=~s/([\W]HALF[\W])/\L$1/g;
  650. $descript=~s/([\W]HAND[\W])/\L$1/g;
  651. $descript=~s/([\W]HAVE[\W])/\L$1/g;
  652. $descript=~s/([\W]HEAD[\W])/\L$1/g;
  653. $descript=~s/([\W]HEAT[\W])/\L$1/g;
  654. $descript=~s/([\W]HEME[\W])/\L$1/g;
  655. $descript=~s/([\W]HEXA[\W])/\L$1/g;
  656. $descript=~s/([\W]HIGH[\W])/\L$1/g;
  657. $descript=~s/([\W]HOLO[\W])/\L$1/g;
  658. $descript=~s/([\W]IRON[\W])/\L$1/g;
  659. $descript=~s/([\W]KETO[\W])/\L$1/g;
  660. $descript=~s/([\W]KNOT[\W])/\L$1/g;
  661. $descript=~s/([\W]LATE[\W])/\L$1/g;
  662. $descript=~s/([\W]LENS[\W])/\L$1/g;
  663. $descript=~s/([\W]LIKE[\W])/\L$1/g;
  664. $descript=~s/([\W]LONG[\W])/\L$1/g;
  665. $descript=~s/([\W]LOOP[\W])/\L$1/g;
  666. $descript=~s/([\W]MAIN[\W])/\L$1/g;
  667. $descript=~s/([\W]MEAN[\W])/\L$1/g;
  668. $descript=~s/([\W]MINI[\W])/\L$1/g;
  669. $descript=~s/([\W]MITE[\W])/\L$1/g;
  670. $descript=~s/([\W]MODE[\W])/\L$1/g;
  671. $descript=~s/([\W]MONO[\W])/\L$1/g;
  672. $descript=~s/([\W]MUCH[\W])/\L$1/g;
  673. $descript=~s/([\W]NINE[\W])/\L$1/g;
  674. $descript=~s/([\W]NULL[\W])/\L$1/g;
  675. $descript=~s/([\W]ONLY[\W])/\L$1/g;
  676. $descript=~s/([\W]OPEN[\W])/\L$1/g;
  677. $descript=~s/([\W]PARA[\W])/\L$1/g;
  678. $descript=~s/([\W]PLUS[\W])/\L$1/g;
  679. $descript=~s/([\W]POST[\W])/\L$1/g;
  680. $descript=~s/([\W]POLY[\W])/\L$1/g;
  681. $descript=~s/([\W]PORE[\W])/\L$1/g;
  682. $descript=~s/([\W]PUMP[\W])/\L$1/g;
  683. $descript=~s/([\W]RICH[\W])/\L$1/g;
  684. $descript=~s/([\W]RING[\W])/\L$1/g;
  685. $descript=~s/([\W]ROLE[\W])/\L$1/g;
  686. $descript=~s/([\W]ROLL[\W])/\L$1/g;
  687. $descript=~s/([\W]SALT[\W])/\L$1/g;
  688. $descript=~s/([\W]SEMI[\W])/\L$1/g;
  689. $descript=~s/([\W]SITE[\W])/\L$1/g;
  690. $descript=~s/([\W]STEM[\W])/\L$1/g;
  691. $descript=~s/([\W]TAIL[\W])/\L$1/g;
  692. $descript=~s/([\W]TATA[\W])/\L$1/g;
  693. $descript=~s/([\W]TRAP[\W])/\L$1/g;
  694. $descript=~s/([\W]TUBE[\W])/\L$1/g;
  695. $descript=~s/([\W]TURN[\W])/\L$1/g;
  696. $descript=~s/([\W]TWIN[\W])/\L$1/g;
  697. $descript=~s/([\W]TYPE[\W])/\L$1/g;
  698. $descript=~s/([\W]WELL[\W])/\L$1/g;
  699. $descript=~s/([\W]WILD[\W])/\L$1/g;
  700. $descript=~s/([\W]WITH[\W])/\L$1/g;
  701. $descript=~s/([\W]WROM[\W])/\L$1/g;
  702. $descript=~s/([\W]ZETA[\W])/\L$1/g;
  703. $descript=~s/([\W]ZINC[\W])/\L$1/g;
  704. $descript=~s/DE NOVO/de novo/ig;
  705. $descript=~s/(\W)KDA(\W)/$1kDa$2/g;
  706. $descript=~s/(\S+[CBDFGJKLMNPQRTVWXZ]{4,}\S+)/\U$1/ig;
  707. $descript=~s/(\W)(\S[CBDFGHJKLMNPQRSTVWXZ]{4,}\W)/$1\U$2/ig; # no vowels for at least 4 letters -> abbreviation -> upper case
  708. $descript=~s/(\W)([CBDFGHJKLMNPQRSTVWXZ]{4,}\S\W)/$1\U$2/ig; # no vowels for at least 4 letters -> abbreviation -> upper case
  709. $descript=~s/(\w+ii+\w+)/\U$1/ig;
  710. $descript=~s/([\W]rossman[\W])/\u$1/g;
  711. $descript=~s/([\W]nadph[\W])/\U$1/g;
  712. $descript=~s/([\W]gapdh[\W])/\U$1/g;
  713. $descript=~s/([\W]f[\W])/\u$1/g;
  714. $descript=~s/(\W)(\w{0,3})RNP(\W)/$1\L$2\URNP$3/ig;
  715. $descript=~s/(\W)(\w{0,3})RNA(\W)/$1\L$2\URNA$3/ig;
  716. $descript=~s/(\W)(\w{0,3})DNA(\W)/$1\L$2\UDNA$3/ig;
  717. $descript=~s/RNASE(\W)/RNAse$1/ig;
  718. $descript=~s/DNASE(\W)/DNAse$1/ig;
  719. $descript=~s/barnase/barnase/ig;
  720. $descript=~s/atpase(\W)/ATPase$1/ig;
  721. $descript=~s/gtpase(\W)/GTPase$1/ig;
  722. # Write amino acid three letter symbols with one capital letter
  723. foreach my $aa ("Ala","Cys","Asp","Glu","Phe","Gly","His","Ile","Lys","Leu","Met","Asn","Pro","Gln","Arg","Ser","Thr","Val","Trp","Tyr","MSe") {
  724. $descript=~s/$aa([ -;:.+])/$aa$1/ig;
  725. }
  726. # Write ions as chemical elements
  727. foreach my $ion ("Zn","Mg","Na","Ka","Ca","Fe","Cu","Se","Al","Mn") {
  728. $descript=~s/([ -:;])$ion([ -;:.+])/$1$ion$2/ig;
  729. $descript=~s/([ -:;])$ion([ -;:.+])/$1$ion$2/ig;
  730. }
  731. $descript=~s/^\s*//;
  732. $descript=~s/\s*$//;
  733. $descript="\u$descript"; # first letter of description upper case
  734. $organism="\u$organism"; # first letter of organism upper case
  735. $organism=~s/ ([\w\d]{0,2})$/ \U$1/; # Influenza A virus
  736. $organism=~s/ (\w+\d+\w*)$/ \U$1/; # Influenza A virus
  737. if ($v>=2 && $organism eq "") {print("\n\nWarning: no organism found for chain $chain in $pdbfile\n");}
  738. my $foldid="";
  739. if ($dalifamids{$pdbidchain}) {$foldid=" $dalifamids{$pdbidchain}".$foldid;}
  740. if ($scopfamids{$pdbidchain}) {$foldid=" $scopfamids{$pdbidchain}".$foldid;}
  741. my $res;
  742. if ($resolution>0) {$res=$resolution."A";} else {$res="NMR";}
  743. # Set sequence record
  744. $seqres=~s/(\S{1,100})/$1\n/g; # insert newlines after each 70 characters
  745. $sequences[$nc]=sprintf(">%-6.6s %s;%s %s {%s}%s\n",$pdbidchain,$descript,$het,$res,$organism,$foldid);
  746. $sequences[$nc].=sprintf("$seqres");
  747. if($v>=3) {print($sequences[$nc]);}
  748. return 0;
  749. }
  750. sub ReadScopFile()
  751. {
  752. # Read dir.cla.scop.txt_1.65
  753. if ($v>=2) {print("Reading $scopfile ... ");}
  754. open (SCOPFILE,"<$scopfile") || die ("ERROR: cannot open $scopfile: $!\n");
  755. #d1dlwa_ 1dlw A: a.1.1.1 14982 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=14982
  756. my $n=0;
  757. my $scopfamid;
  758. my $chain;
  759. my $pdbidchain;
  760. while($line=<SCOPFILE>) {
  761. if ($line=~/^\S+\s+(\S\S\S\S)\s+(\S+)\s+(\w\.\d+\.\d+\.\d+)/) {
  762. $pdbid=$1;
  763. $chain=$2;
  764. $scopfamid=$3;
  765. $pdbidchain=$1;
  766. # printf("chain=$chain\n");
  767. if ($chain!~/([A-Za-z\d]):/) {
  768. if ($scopfamids{$pdbidchain}) {
  769. $scopfamids{$pdbidchain}.=" ".$scopfamid;
  770. # printf("scopfamids($pdbidchain)=%s\n",$scopfamids{$pdbidchain});
  771. } else {
  772. $scopfamids{$pdbidchain}="SCOP: ".$scopfamid;
  773. # printf("scopfamids($pdbidchain)=%s\n",$scopfamids{$pdbidchain});
  774. }
  775. $n++;
  776. } else {
  777. my %chains=();
  778. while ($chain=~/([A-Za-z\d]):/) {
  779. $chain=~s/([A-Za-z\d]):[^A-Za-z:]*//;
  780. if ($chains{$1}) {next;}
  781. if ($1 ne "") {$pdbidchain="$pdbid"."_$1";}
  782. $chains{$1}=1;
  783. if ($scopfamids{$pdbidchain}) {
  784. $scopfamids{$pdbidchain}.=" ".$scopfamid;
  785. # printf("scopfamids($pdbidchain)=%s\n",$scopfamids{$pdbidchain});
  786. } else {
  787. $scopfamids{$pdbidchain}="SCOP: ".$scopfamid;
  788. # printf("scopfamids($pdbidchain)=%s\n",$scopfamids{$pdbidchain});
  789. }
  790. $n++;
  791. }
  792. }
  793. }
  794. }
  795. close(SCOPFILE);
  796. print(" found $n domains in SCOP file $scopfile\n");
  797. }
  798. sub ReadDaliFiles()
  799. {
  800. # Read FoldIndex.html
  801. if ($v>=2) {print("Reading $dalidir/FoldIndex.html ... ");}
  802. open (FOLDINEXFILE,"<$dalidir/FoldIndex.html") || die ("ERROR: cannot open $dalidir/FoldIndex.html: $!\n");
  803. # 1.1.1.1.1.1 1cunA_2 ...
  804. # 1.1.1.2.1.1 ___1lvfA_1 ...
  805. my $n=0;
  806. my $dalifamid;
  807. my %fold_for_repr;
  808. while($line=<FOLDINEXFILE>) {
  809. if ($line=~/(\d+\.\d+\.\d+\.\d+\.\d+\.\d+)\s+_*(\S+)(_\d+)/) {
  810. $fold_for_repr{$2.$3}=$1;
  811. if ($3 eq "_0") {
  812. $dalifamid=$1;
  813. $pdbid=$2;
  814. $pdbid=~s/^(\S\S\S\S)(\S)/$1_$2/;
  815. $dalifamids{$pdbid}="DALI: ".$dalifamid;
  816. }
  817. $n++;
  818. }
  819. }
  820. close(FOLDINEXFILE);
  821. if ($v>=1) {print(" found $n representative domains in DALI's FoldIndex.html\n");}
  822. # Read domain_definitions.txt
  823. my $domainfile="$dalidir/domain_definitions.txt";
  824. my $repr;
  825. if ($v>=2) {print("Reading $domainfile ... ");}
  826. open (DOMAINFILE,"<$domainfile") || die ("ERROR: cannot open $domainfile: $!\n");
  827. # 1cunA/1-106 1cunA_1 1 ALPHA SPECTRIN
  828. $n=0;
  829. while($line=<DOMAINFILE>) {
  830. if ($line=~/^(\S+)\/\S+\s+(\S+)/) {
  831. if (!$fold_for_repr{$2}) {
  832. if ($v>=2) {print("WARNING: no fold for DALI representative $2 in $domainfile\n");}
  833. next;
  834. }
  835. $pdbid=$1;
  836. $repr=$2;
  837. $pdbid=~s/^(\S\S\S\S)(\S)/$1_$2/;
  838. if ($dalifamids{$pdbid}) {
  839. $dalifamids{$pdbid}.=" ".$fold_for_repr{$repr};
  840. } else {
  841. $dalifamids{$pdbid}=" DALI: ".$fold_for_repr{$repr};
  842. }
  843. $n++;
  844. }
  845. }
  846. close(DOMAINFILE);
  847. print(" found $n domains in DALI file $domainfile\n");
  848. }
  849. ##################################################################################
  850. # Convert three-letter amino acid code into one-letter code
  851. ##################################################################################
  852. sub Three2OneLetter {
  853. my $res = $three2one{uc($_[0])};
  854. if (defined $res) {
  855. return $res;
  856. } elsif ($_[0] =~ /^\s+$/) {
  857. return "";
  858. } else {
  859. return "X";
  860. }
  861. }