aminoAcid.pm 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. package aminoAcid;
  2. require Exporter;
  3. use strict;
  4. our @ISA = qw(Exporter);
  5. our @EXPORT = qw(@SIM);
  6. my @GONNET = (
  7. ## A R N D C Q E G H I L K M F P S T W Y V
  8. 10227, 3430, 2875, 3869, 1625, 2393, 4590, 6500, 2352, 3225, 5819, 4172, 1435, 1579, 3728, 4610, 6264, 418, 1824, 5709, # A
  9. 3430, 7780, 2209, 2589, 584, 2369, 3368, 3080, 2173, 1493, 3093, 5701, 763, 859, 1893, 2287, 3487, 444, 1338, 2356, # R
  10. 2875, 2209, 3868, 3601, 501, 1541, 2956, 3325, 1951, 1065, 2012, 2879, 532, 688, 1480, 2304, 3204, 219, 1148, 1759, # N
  11. 3869, 2589, 3601, 8618, 488, 2172, 6021, 4176, 2184, 1139, 2151, 3616, 595, 670, 2086, 2828, 3843, 204, 1119, 2015, # D
  12. 1625, 584, 501, 488, 5034, 355, 566, 900, 516, 741, 1336, 591, 337, 549, 419, 901, 1197, 187, 664, 1373, # C
  13. 2393, 2369, 1541, 2172, 355, 1987, 2891, 1959, 1587, 1066, 2260, 2751, 570, 628, 1415, 1595, 2323, 219, 871, 1682, # Q
  14. 4590, 3368, 2956, 6021, 566, 2891, 8201, 3758, 2418, 1624, 3140, 4704, 830, 852, 2418, 2923, 4159, 278, 1268, 2809, # E
  15. 6500, 3080, 3325, 4176, 900, 1959, 3758,26066, 2016, 1354, 2741, 3496, 741, 797, 2369, 3863, 4169, 375, 1186, 2569, # G
  16. 2352, 2173, 1951, 2184, 516, 1587, 2418, 2016, 5409, 1123, 2380, 2524, 600, 1259, 1298, 1642, 2446, 383, 876, 1691, # H
  17. 3225, 1493, 1065, 1139, 741, 1066, 1624, 1354, 1123, 6417, 9630, 1858, 1975, 2225, 1260, 1558, 3131, 417, 1697, 7504, # I
  18. 5819, 3093, 2012, 2151, 1336, 2260, 3140, 2741, 2380, 9630,25113, 3677, 4187, 5540, 2670, 2876, 5272, 1063, 3945,11005, # L
  19. 4172, 5701, 2879, 3616, 591, 2751, 4704, 3496, 2524, 1858, 3677, 7430, 949, 975, 2355, 2847, 4340, 333, 1451, 2932, # K
  20. 1435, 763, 532, 595, 337, 570, 830, 741, 600, 1975, 4187, 949, 1300, 1111, 573, 743, 1361, 218, 828, 2310, # M
  21. 1579, 859, 688, 670, 549, 628, 852, 797, 1259, 2225, 5540, 975, 1111, 6126, 661, 856, 1498, 1000, 4464, 2602, # F
  22. 3728, 1893, 1480, 2086, 419, 1415, 2418, 2369, 1298, 1260, 2670, 2355, 573, 661,11834, 2320, 3300, 179, 876, 2179, # P
  23. 4610, 2287, 2304, 2828, 901, 1595, 2923, 3863, 1642, 1558, 2876, 2847, 743, 856, 2320, 3611, 4686, 272, 1188, 2695, # S
  24. 6264, 3487, 3204, 3843, 1197, 2323, 4159, 4169, 2446, 3131, 5272, 4340, 1361, 1498, 3300, 4686, 8995, 397, 1812, 5172, # T
  25. 418, 444, 219, 204, 187, 219, 278, 375, 383, 417, 1063, 333, 218, 1000, 179, 272, 397, 4101, 1266, 499, # W
  26. 1824, 1338, 1148, 1119, 664, 871, 1268, 1186, 876, 1697, 3945, 1451, 828, 4464, 876, 1188, 1812, 1266, 9380, 2227, # Y
  27. 5709, 2356, 1759, 2015, 1373, 1682, 2809, 2569, 1691, 7504,11005, 2932, 2310, 2602, 2179, 2695, 5172, 499, 2227,11569);# V
  28. my @BLOSUM62 = (
  29. 0.0222,
  30. 0.0022,0.0181,
  31. 0.0019,0.0019,0.0148,
  32. 0.0021,0.0015,0.0037,0.0225,
  33. 0.0016,0.0004,0.0004,0.0004,0.0127,
  34. 0.0018,0.0024,0.0015,0.0016,0.0003,0.0076,
  35. 0.0029,0.0025,0.0021,0.0049,0.0003,0.0034,0.0168,
  36. 0.0057,0.0016,0.0027,0.0024,0.0007,0.0013,0.0018,0.0396,
  37. 0.0010,0.0013,0.0014,0.0009,0.0002,0.0010,0.0013,0.0009,0.0096,
  38. 0.0031,0.0012,0.0009,0.0011,0.0012,0.0008,0.0012,0.0013,0.0006,0.0191,
  39. 0.0043,0.0023,0.0013,0.0014,0.0016,0.0016,0.0019,0.0020,0.0009,0.0115,0.0388,
  40. 0.0032,0.0062,0.0024,0.0024,0.0005,0.0030,0.0040,0.0024,0.0012,0.0015,0.0024,0.0166,
  41. 0.0013,0.0007,0.0005,0.0004,0.0004,0.0007,0.0006,0.0007,0.0003,0.0025,0.0052,0.0008,0.0045,
  42. 0.0016,0.0009,0.0007,0.0007,0.0005,0.0005,0.0008,0.0011,0.0008,0.0030,0.0056,0.0009,0.0012,0.0186,
  43. 0.0021,0.0009,0.0008,0.0011,0.0003,0.0008,0.0014,0.0013,0.0005,0.0009,0.0013,0.0015,0.0004,0.0005,0.0195,
  44. 0.0065,0.0022,0.0030,0.0027,0.0011,0.0018,0.0029,0.0037,0.0011,0.0017,0.0024,0.0030,0.0008,0.0012,0.0016,0.0137,
  45. 0.0037,0.0017,0.0022,0.0019,0.0009,0.0013,0.0021,0.0022,0.0007,0.0027,0.0033,0.0023,0.0010,0.0012,0.0013,0.0048,0.0133,
  46. 0.0004,0.0003,0.0002,0.0001,0.0002,0.0002,0.0002,0.0004,0.0002,0.0004,0.0007,0.0003,0.0002,0.0009,0.0001,0.0003,0.0003,0.0074,
  47. 0.0013,0.0009,0.0007,0.0006,0.0004,0.0006,0.0008,0.0008,0.0016,0.0015,0.0023,0.0010,0.0006,0.0043,0.0004,0.0010,0.0009,0.0010,0.0113,
  48. 0.0049,0.0015,0.0011,0.0012,0.0014,0.0011,0.0016,0.0017,0.0006,0.0120,0.0094,0.0019,0.0023,0.0025,0.0012,0.0023,0.0035,0.0004,0.0015,0.0206);
  49. use vars qw(@SIM);
  50. @SIM = ();
  51. sub new {
  52. my $class = shift;
  53. my $self = {};
  54. bless($self, $class);
  55. $self->setupMatrices();
  56. return $self;
  57. }
  58. sub setupMatrices {
  59. my $self = shift;
  60. $self->setupSubstitutionMatrix();
  61. }
  62. sub aa2i {
  63. my $self = shift;
  64. my $aa = shift;
  65. $aa = uc($aa);
  66. if ($aa eq 'A') { return 0; }
  67. elsif ($aa eq 'R') { return 1; }
  68. elsif ($aa eq 'N') { return 2; }
  69. elsif ($aa eq 'D') { return 3; }
  70. elsif ($aa eq 'C') { return 4; }
  71. elsif ($aa eq 'Q') { return 5; }
  72. elsif ($aa eq 'E') { return 6; }
  73. elsif ($aa eq 'G') { return 7; }
  74. elsif ($aa eq 'H') { return 8; }
  75. elsif ($aa eq 'I') { return 9; }
  76. elsif ($aa eq 'L') { return 10;}
  77. elsif ($aa eq 'K') { return 11;}
  78. elsif ($aa eq 'M') { return 12;}
  79. elsif ($aa eq 'F') { return 13;}
  80. elsif ($aa eq 'P') { return 14;}
  81. elsif ($aa eq 'S') { return 15;}
  82. elsif ($aa eq 'T') { return 16;}
  83. elsif ($aa eq 'W') { return 17;}
  84. elsif ($aa eq 'Y') { return 18;}
  85. elsif ($aa eq 'V') { return 19;}
  86. elsif ($aa eq 'X') { return 20;}
  87. elsif ($aa eq 'J') { return 20;}
  88. elsif ($aa eq 'O') { return 20;}
  89. elsif ($aa eq 'U') { return 4; }
  90. elsif ($aa eq 'B') { return 3; }
  91. elsif ($aa eq 'Z') { return 6; }
  92. elsif ($aa eq '-') { return 20;}
  93. elsif ($aa eq '_') { return 20;}
  94. elsif ($aa eq '.') { return 20;}
  95. else {return 20;}
  96. }
  97. sub printSimilarityMatrix {
  98. my $self = shift;
  99. for (my $i=0; $i<20; $i++) {
  100. for (my $j=0; $j<20; $j++) {
  101. my $me = sprintf("%.3f", $SIM[$i][$j]);
  102. print "$me ";
  103. }
  104. print "\n";
  105. }
  106. }
  107. sub setupSubstitutionMatrix {
  108. my $self = shift;
  109. my $m = shift;
  110. my $matrix = defined($m) ? $m : 0;
  111. my @pb;
  112. my @P;
  113. my @R;
  114. my $b;
  115. if ($matrix == 0) {
  116. for (my $a=0; $a<20; ++$a) {
  117. for ($pb[$a]=0.0, $b=0; $b<20; ++$b) {
  118. $P[$a][$b] = 0.000001 * $GONNET[$a*20 + $b];
  119. }
  120. }
  121. for (my $a=0; $a<20; ++$a) { $P[$a][20] = $P[20][$a] = 1.0 };
  122. } else {
  123. &setupBlosumMatrix($matrix, \@P, \@pb);
  124. }
  125. my $sumab=0.0;
  126. for (my $a=0; $a<20; $a++) {
  127. for (my $b=0; $b<20; $b++) {
  128. $sumab += $P[$a][$b];
  129. }
  130. }
  131. for (my $a=0; $a<20; $a++) {
  132. for (my $b=0; $b<20; $b++) {
  133. $P[$a][$b] /= $sumab;
  134. }
  135. }
  136. for (my $a=0; $a<20; $a++) {
  137. for ($pb[$a]=0.0, my $b=0; $b<20; $b++) {
  138. $pb[$a] += $P[$a][$b];
  139. }
  140. }
  141. for (my $a=0; $a<20; ++$a) {
  142. for (my $b=0; $b<20; ++$b) {
  143. $R[$a][$b] = $P[$a][$b]/$pb[$b];
  144. }
  145. }
  146. ## Precompute matrix R for amino acid pseudocounts:
  147. for (my $a=0; $a<20; ++$a) {
  148. for (my $b=0; $b<20; ++$b) {
  149. $SIM[$a][$b] = log($R[$a][$b]/$pb[$a])/log(2);
  150. }
  151. }
  152. }
  153. sub setupBlosumMatrix {
  154. my $self = shift;
  155. my $matrix = shift;
  156. my $Pref = shift;
  157. my $pbref = shift;
  158. my @P = @{$Pref};
  159. my @pb = @{$pbref};
  160. my $n = 0;
  161. for (my $a=0; $a<20; $a++) {
  162. for ($pb[$a]=0.0, my $b=0; $b<=$a; $b++, $n++) {
  163. $P[$a][$b] = $BLOSUM62[$n];
  164. }
  165. }
  166. for (my $a=0; $a<19; $a++) {
  167. for (my $b=$a+1; $b<20; $b++) {
  168. $P[$a][$b] = $P[$b][$a];
  169. }
  170. }
  171. for (my $a=0; $a<20; $a++) {
  172. $P[$a][20] = 1.0;
  173. $P[20][$a] = 1.0;
  174. }
  175. }
  176. 1;