PdbFile.pm 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. #!/usr/bin/perl -w
  2. package PdbFile;
  3. use strict;
  4. use utilities;
  5. sub new {
  6. my ($caller, $filename) = @_;
  7. my $caller_is_obj = ref($caller);
  8. my $class = $caller_is_obj || $caller;
  9. no strict "refs";
  10. my $self = bless {}, $class;
  11. $self->{residues} = {}; ## for a residue number => atom lines concatenated
  12. $self->{residOrder} = {}; ## i-th residue in "seq" is residOrder-th residue in "residues"
  13. $self->{atm2resid} = {}; ## atom i is in residue j
  14. $self->{atoms} = {};
  15. $self->{comment} = "";
  16. $self->{seq} = "";
  17. if (defined($filename)) {
  18. $self->read($filename);
  19. }
  20. return $self;
  21. }
  22. sub read {
  23. my $self = shift;
  24. my $filename = shift;
  25. $self->clear();
  26. my $seq = "";
  27. my $nresPrev = -1e4;
  28. my $internResidCount = 0;
  29. open(PDB, $filename) or die "Cant open $filename: $!\n";
  30. while (my $line = <PDB>) {
  31. my $residue;
  32. my $nres;
  33. # ATOM 1 N PRO 1 -29.477 -9.021 66.175 1.00 75.79
  34. if ($line =~ /ATOM.{2}\s*(\d+)\s.([\S|\s]{3}).(\S{3})\s.\s*(\d+)[\s|\D]\s*(-?\d+.\d+)\s*(-?\d+.\d+)\s*(-?\d+.\d+)\s*/) {
  35. my $atm = $1;
  36. $residue = $3;
  37. $nres = $4;
  38. $self->{atm2resid}->{$atm} = $nres;
  39. if (not exists($self->{residues}->{$nres})) { $self->{residues}->{$nres} = ""; }
  40. $self->{residues}->{$nres} .= $line;
  41. $self->{atoms}->{$atm} = $line;
  42. if ($nresPrev != $nres) {
  43. $seq .= &Three2OneLetter($residue);
  44. $nresPrev = $nres;
  45. $self->{residOrder}->{$internResidCount} = $nres;
  46. $internResidCount++;
  47. }
  48. }
  49. elsif ($line =~ /^COMMENT/) {
  50. $self->{comment} .= $line;
  51. }
  52. }
  53. $self->{seq} = $seq;
  54. close(PDB);
  55. }
  56. sub clear {
  57. my $self = shift;
  58. %{$self} = ();
  59. }
  60. sub residue_to_string {
  61. my $self = shift;
  62. my $nres = shift;
  63. return $self->{residues}->{$nres} if exists($self->{residues}->{$nres});
  64. return "";
  65. }
  66. sub print_residue {
  67. my $self = shift;
  68. my $nres = shift;
  69. print $self->residue_to_string($nres) . "\n";
  70. }
  71. sub print_seq {
  72. my $self = shift;
  73. print $self->{seq} . "\n";
  74. }
  75. sub get_residue_for_atom {
  76. my $self = shift;
  77. my $atm = shift;
  78. return $self->{atm2resid}->{$atm};
  79. }
  80. sub get_atom_type {
  81. my $self = shift;
  82. my $atm = shift;
  83. my $atmLine = $self->{atoms}->{$atm};
  84. $atmLine =~ /^ATOM.{2}\s*\d+\s.([\S|\s]{3})/;
  85. my $type = &trim($1);
  86. return $type;
  87. }
  88. sub get_coordinates_of {
  89. my $self = shift;
  90. my $resid = shift;
  91. my $atmType = shift;
  92. my $v = 0;
  93. my @coords;
  94. # print "resid=$resid\n";
  95. if (not exists($self->{residues}->{$resid})) {
  96. return ((-999999));
  97. }
  98. $resid = $self->{residues}->{$resid};
  99. my @atoms = split(/\n/, $resid);
  100. for (my $i=0; $i<@atoms; $i++) {
  101. if ($atoms[$i] =~ /ATOM.{2}\s*\d+\s.([\S|\s]{3}).\S{3}\s.\s*\d+[\s|\D]\s*(-?\d+.\d+)\s*(-?\d+.\d+)\s*(-?\d+.\d+)/ && &trim($1) eq $atmType) {
  102. push(@coords, $2, $3, $4);
  103. last;
  104. }
  105. }
  106. if (scalar(@coords) != 3) {
  107. if ($v >= 1) {
  108. print "atom coordinates dont exist for residue=$resid, atomType=$atmType\n";
  109. }
  110. return ((-999999));
  111. }
  112. return @coords;
  113. }
  114. sub get_num_of_residues {
  115. my $self = shift;
  116. my $numOfResidues = scalar(keys(%{$self->{residues}}));
  117. return $numOfResidues;
  118. }
  119. sub get_startIdx_of_seq {
  120. my $self = shift;
  121. my $seq = shift;
  122. my $startIdx = &KMP($self->{seq}, $seq);
  123. return $startIdx;
  124. }
  125. ## having a sequence from a pir file (aseq), build a new pdb file
  126. ## which contains only those residues in "aseq" which have "pattern" 1
  127. ## aseq must be a substring of
  128. ##
  129. ## residue numbers in pdb-file and intern
  130. ## pdb 3 4 5 8 9 ...
  131. ## intern 1 2 3 4 5 ...
  132. ##
  133. ## intern numbers the residues in self->{seq}
  134. ## when searching aseq in self->seq with KMP one gets an intern residue number as result
  135. ##
  136. ## residOrder{intern} = pdb, e.g. residOrder{2} = 4
  137. sub rebuild_pdb_file {
  138. my $self = shift;
  139. my $aseq = shift;
  140. my $apattern = shift;
  141. my $outfile = shift;
  142. my @seq = split(//, $aseq);
  143. my @pattern = split(//, $apattern);
  144. if (scalar(@seq) != scalar(@pattern)) {
  145. print "ERROR in rebuild_pdb_file: seq and pattern have different length!\n";
  146. return;
  147. }
  148. ## search for start index of seq in pdbSeq => intern idx
  149. my $startIdx = &KMP($self->{seq}, $aseq);
  150. if ($startIdx == -1) {
  151. print "ERROR in rebuild_pdb_file: KMP could not find $aseq in $self->{seq}!\n";
  152. return;
  153. }
  154. my $newPdb = "";
  155. $newPdb .= $self->{comment};
  156. $newPdb .= "COMMENT\n";
  157. $newPdb .= "COMMENT artificial pdb file for multitemplates\n";
  158. for (my $i=0; $i<@seq; $i++) {
  159. ## build new pdb file only with residues in pattern (i.e. pattern=1)
  160. next if ($pattern[$i] == 0);
  161. ## get pdb residue number from intern residue number
  162. my $internResid = $startIdx + $i;
  163. my $res = $self->{residOrder}->{$internResid};
  164. $newPdb .= $self->{residues}->{$res};
  165. }
  166. $newPdb .= "END";
  167. ## renumber atom indices (by removing residues, their order is not any longer valid)
  168. my @pdbLines = split(/\n/, $newPdb);
  169. my $atomCounter = 1;
  170. for (my $i=0; $i<@pdbLines; $i++) {
  171. if ($pdbLines[$i] =~ /^(ATOM.{2})([\s|\d]{5})(.*)/) {
  172. my $atmIdx = sprintf("%5d", $atomCounter);
  173. $pdbLines[$i] = "$1$atmIdx$3";
  174. $atomCounter++;
  175. }
  176. }
  177. $newPdb = join("\n", @pdbLines);
  178. open(OH, "> $outfile") or die ("Cant write $outfile: $!\n");
  179. print (OH $newPdb);
  180. close(OH);
  181. }
  182. ## excise residues "start" till "end" from current pdb-file (in self)
  183. ## and write new pdb into outfile
  184. sub excise_pdb_file {
  185. my $self = shift;
  186. my $start = shift;
  187. my $end = shift;
  188. my $outfile = shift;
  189. my $newPdb = "";
  190. $newPdb .= "COMMENT excised residues $start-$end\n";
  191. for (my $i=$start; $i<=$end; $i++) {
  192. if (exists($self->{residues}->{$i})) {
  193. $newPdb .= $self->{residues}->{$i};
  194. }
  195. }
  196. $newPdb .= "END";
  197. open(OH, "> $outfile") or die ("Cant write $outfile: $!\n");
  198. print (OH $newPdb);
  199. close(OH);
  200. }
  201. ## calculate distance between CA-atom of residue1 and residue2
  202. sub distance_between {
  203. my $self = shift;
  204. my $resid1 = shift;
  205. my $resid2 = shift;
  206. if (not exists($self->{residues}->{$resid1}) || not exists($self->{residues}->{$resid2})) {
  207. print "ERROR: PdbFile::distance_between cant find residue ($resid1, $resid2)!\n";
  208. return -1;
  209. }
  210. my @coord1 = $self->get_CA_coordinates($resid1);
  211. my @coord2 = $self->get_CA_coordinates($resid2);
  212. if ($coord1[0] == -999999 || $coord2[0] == -999999) {
  213. return -999999;
  214. }
  215. if ($#coord1 != $#coord2) {
  216. print "ERROR: distance_between: coord1 and coord2 differ in length (resid1=$resid1,resid2=$resid2)!\n";
  217. }
  218. my $dist = &euklid_dist(\@coord1, \@coord2);
  219. return $dist;
  220. }
  221. sub get_CA_coordinates {
  222. my $self = shift;
  223. my $resid = shift;
  224. my $v = shift || 0;
  225. if (not exists($self->{residues}->{$resid})) {
  226. if ($v >= 1) {
  227. print "CA coordinates dont exist for residue\n$resid\n";
  228. }
  229. return ((-999999));
  230. }
  231. $resid = $self->{residues}->{$resid};
  232. my @atoms = split(/\n/, $resid);
  233. my @coords;
  234. for (my $i=0; $i<@atoms; $i++) {
  235. if ($atoms[$i] =~ /ATOM.{2}\s*\d+\s.CA\s.\S{3}\s.\s*\d+[\s|\D]\s*(-?\d+.\d+)\s*(-?\d+.\d+)\s*(-?\d+.\d+)/) {
  236. push(@coords, $1, $2, $3);
  237. last;
  238. }
  239. }
  240. if (scalar(@coords) != 3) {
  241. if ($v >= 1) {
  242. print "CA coordinates dont exist for residue\n$resid\n";
  243. }
  244. return ((-999999));
  245. }
  246. return @coords;
  247. }
  248. sub radius_of_gyration {
  249. my $self = shift;
  250. my @coords;
  251. my @center = (0,0,0);
  252. my $numValidResidues = 0;
  253. my $gyrationRadius2 = 0;
  254. ## calculate center:
  255. foreach my $residue (keys(%{$self->{residues}})) {
  256. @coords = $self->get_CA_coordinates($residue);
  257. next if ($coords[0] == -999999);
  258. for (my $j=0; $j<3; $j++) {
  259. $center[$j] += $coords[$j];
  260. }
  261. $numValidResidues++;
  262. }
  263. for (my $j=0; $j<3; $j++) {
  264. $center[$j] /= $numValidResidues;
  265. }
  266. ## calculate distances to center:
  267. foreach my $residue (keys(%{$self->{residues}})) {
  268. @coords = $self->get_CA_coordinates($residue);
  269. next if ($coords[0] == -999999);
  270. my $dist = &euklid_dist(\@coords, \@center);
  271. $gyrationRadius2 += $dist*$dist;
  272. }
  273. $gyrationRadius2 /= $numValidResidues;
  274. return sqrt($gyrationRadius2);
  275. }
  276. ## calculate distances between residues
  277. ## if a pdb file is not complete, ie. there are missing residues
  278. ## then two "subsequent" (solved) residues are not subsequent in sequence
  279. ## eg. solved residues: 4 5 8 9
  280. ## => with inbetween==0: 4-5 5-8 8-9
  281. ## if only sequence-subsequent residues shall be considered use seqsub=1
  282. sub pairwise_distances {
  283. my $self = shift;
  284. ## how many residues between residue x and y
  285. my $inbetween = shift || 0;
  286. my $seqsub = shift || 0;
  287. my %distances;
  288. my @residues = sort {$a <=> $b} keys(%{$self->{residues}});
  289. for (my $i=0; $i<scalar(@residues)-$inbetween-1; $i++) {
  290. my $xResidue = $residues[$i];
  291. my $yResidue = $residues[$i+$inbetween+1];
  292. next if ($seqsub==1 && $yResidue - $xResidue != $inbetween+1);
  293. my $dist = $self->distance_between($xResidue, $yResidue);
  294. next if ($dist == -999999);
  295. $distances{"$xResidue-$yResidue"} = $dist;
  296. }
  297. return %distances;
  298. }
  299. 1;