distanceTree.pm 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. #!/user/bin/perl -w
  2. use strict;
  3. package distanceTree;
  4. use predTMscore;
  5. use singleRankingNet;
  6. use TemplateList;
  7. use simpleTree;
  8. use aminoAcid;
  9. use utilities;
  10. use config;
  11. my $config = HHpredConfig->instance();
  12. our @ISA = qw(Exporter);
  13. our @EXPORT = qw(upgma readDistances printDistances calculateSequenceWeights calculatePairwiseDistances iterativelyWeightTemplates getProbSeqAndProfSimilarityBetween neighborJoin);
  14. ## pairwise distances between query and sequences in templateList
  15. ##
  16. ## do this by:
  17. ## - calculating pairwise similarities
  18. ## - transforming these similarities into distances
  19. sub calculatePairwiseDistances {
  20. my $queryName = shift;
  21. my $templateList = shift;
  22. my $hhmDir = shift;
  23. my $workingDir = shift;
  24. my $v = 1;
  25. my @simSeq;
  26. my @simProf;
  27. my @simPredTMscore;
  28. my @prob;
  29. my @dist;
  30. my @seqNames;
  31. ## get sequence names, ordering is important, see upgma!
  32. push (@seqNames, $queryName);
  33. for (my $i=0; $i<$templateList->size(); $i++) {
  34. push (@seqNames, $templateList->get($i)->get_Hit());
  35. }
  36. print "\nSimilarities:\n";
  37. print "-------------\n";
  38. for (my $i=0; $i<@seqNames; $i++) {
  39. for (my $j=0; $j<@seqNames; $j++) {
  40. my $qhhmFile = "$hhmDir/$seqNames[$i].hhm";
  41. my $thhmFile = "$hhmDir/$seqNames[$j].hhm";
  42. $simPredTMscore[$i][$j] = &getPredTMscoreSimilarityBetween($i, $j, $qhhmFile, $thhmFile, $workingDir);
  43. }
  44. }
  45. ## calculate pairwise distances between sequences:
  46. for (my $i=0; $i<@seqNames; $i++) {
  47. for (my $j=0; $j<@seqNames; $j++) {
  48. ## NN TMscore based distance
  49. $dist[$i][$j] = -log( &min(0.99, &max($simPredTMscore[$i][$j], 0.01)) ) / log(2);
  50. if ($i != $j and $dist[$i][$j] <= 0.001) {
  51. $dist[$i][$j] = 0.01;
  52. }
  53. ## minimum distance between i,j and j,i (for templates only, not wrt query)
  54. if ($i > $j && $i != 0 && $j != 0) {
  55. if ($dist[$i][$j] < $dist[$j][$i]) {
  56. $dist[$j][$i] = $dist[$i][$j];
  57. } else {
  58. $dist[$i][$j] = $dist[$j][$i];
  59. }
  60. }
  61. }
  62. }
  63. if ($v >= 1) {
  64. for (my $i=0; $i<@seqNames; $i++) {
  65. for (my $j=0; $j<@seqNames; $j++) {
  66. print sprintf("%.3f ", $dist[$i][$j]);
  67. }
  68. print "\n";
  69. }
  70. print "--------------------------------------------------------\n";
  71. }
  72. return (@dist);
  73. }
  74. ## calculate sequence based similarity between to sequences
  75. ## (which are extracted from a pir alignment)
  76. sub calculateSeqSimilarityBetween {
  77. my $seq1 = shift;
  78. my $seq2 = shift;
  79. my $aaO = shift; ## amino-acid object
  80. if (length($seq1) != length($seq2)) {
  81. print "Error: calculateSimilarityBetween: sequences differ in length!\n";
  82. }
  83. my $matchCols = 1;
  84. my $sim = 0;
  85. for (my $i=0; $i<length($seq1); $i++) {
  86. my $AA1 = substr($seq1, $i, 1);
  87. my $AA2 = substr($seq2, $i, 1);
  88. my $aaI1 = $aaO->aa2i($AA1);
  89. my $aaI2 = $aaO->aa2i($AA2);
  90. ## if match-match
  91. if ($aaI1 >= 0 and $aaI1 < 20 and $aaI2 >= 0 and $aaI2 < 20) {
  92. $sim += $SIM[$aaI1][$aaI2];
  93. $matchCols++;
  94. }
  95. }
  96. return $sim/$matchCols;
  97. }
  98. ## calculate profile based similarity between sequences
  99. ## (by hhalign)
  100. sub getProbSeqAndProfSimilarityBetween {
  101. my $qhhmFile = shift;
  102. my $thhmFile = shift;
  103. my $workingDir = shift;
  104. my $hhalign = $config->get_hhalign();
  105. ## use full length sequences for profile similarity?? or only ranges which are aligned to query??
  106. ## unfortunately, $output = `cmd -o -` does not work on queue
  107. ## TODO: read hhalign parameters from config
  108. my $cmd = "$hhalign -i $qhhmFile -t $thhmFile -mact 0.05 -o $workingDir/hhalignPairwiseOutput.hhr";
  109. system("$cmd");
  110. print "calling hhalign(" . &get_basename($qhhmFile) . "," . &get_basename($thhmFile) . ")\n";
  111. my $tlist = TemplateList->new();
  112. $tlist->hhr_to_TemplateList("$workingDir/hhalignPairwiseOutput.hhr");
  113. system("rm $workingDir/hhalignPairwiseOutput.hhr");
  114. ## sequence based similarity
  115. my $seqSim = $tlist->get(0)->get_Sim();
  116. ## profile based similarity
  117. my $score = $tlist->get(0)->get_Score();
  118. my $aliCols = $tlist->get(0)->get_Cols();
  119. my $profileSim = $score / $aliCols;
  120. my $prob = $tlist->get(0)->get_Prob();
  121. return ($prob, $seqSim, $profileSim);
  122. }
  123. sub getPredTMscoreSimilarityBetween {
  124. my $i = shift;
  125. my $j = shift;
  126. my $qhhmFile = shift;
  127. my $thhmFile = shift;
  128. my $workingDir = shift;
  129. my $hhalign = $config->get_hhalign();;
  130. my $params = " -mact 0.05 -ssm 4 ";
  131. if ($i == 0 || $j == 0) { $params = " -mact 0.05 "; }
  132. ## use full length sequences for profile similarity?? or only ranges which are aligned to query??
  133. ## unfortunately, $output = `cmd -o -` does not work on queue
  134. ## TODO: read hhalign parameters from config
  135. my $cmd = "$hhalign -i $qhhmFile -t $thhmFile $params -o $workingDir/hhalignPairwiseOutput.hhr";
  136. system("$cmd");
  137. print "calling hhalign(" . &get_basename($qhhmFile) . "," . &get_basename($thhmFile) . ")\n";
  138. my $tlist = TemplateList->new();
  139. $tlist->hhr_to_TemplateList("$workingDir/hhalignPairwiseOutput.hhr");
  140. system("rm $workingDir/hhalignPairwiseOutput.hhr");
  141. my $TMscoreNet = singleRankingNet->new();
  142. my $QLen = $tlist->get_queryLength();
  143. my $QNeff = $tlist->get_neff();
  144. print "getPredTMscoreSimiliarityBetween: TMscoreNet->predict_n11)\n";
  145. my $predTMscore = $TMscoreNet->predict_TMscore($tlist->get(0), $QNeff);
  146. # my $Tstart = $tlist->get(0)->get_Tstart();
  147. # my $Tend = $tlist->get(0)->get_Tend();
  148. # my $TLen = $Tend - $Tstart + 1;
  149. #my $TLen = &get_HMM_len($thhmFile);
  150. my $s = $predTMscore; # * $QLen / &min($QLen, $TLen);
  151. return $s;
  152. }
  153. ## go through pariwise distance matrix (upper right triangle of matrix)
  154. ## and find pair with minimal distance
  155. sub searchOpt {
  156. my $hasParentPtr = shift;
  157. my $distPtr = shift;
  158. my $minI = -1;
  159. my $minJ = -1;
  160. my $minDist = 999999999;
  161. for (my $i=0; $i<@{$distPtr}; $i++) {
  162. ## nodes which already have a parent are skipped
  163. next if ($hasParentPtr->{$i});
  164. for (my $j=$i+1; $j<@{$distPtr->[$i]}; $j++) {
  165. next if ($hasParentPtr->{$j});
  166. if ($distPtr->[$i][$j] < $minDist) {
  167. $minDist = $distPtr->[$i][$j];
  168. $minI = $i;
  169. $minJ = $j;
  170. }
  171. }
  172. }
  173. return ($minI, $minJ, $minDist);
  174. }
  175. ## add a new node (parent of node i and j) to similarity matrix
  176. sub addNodeAndConnect {
  177. my $hasParentPtr = shift;
  178. my $distPtr = shift;
  179. my $clusterSizePtr = shift;
  180. my $nodeI = shift;
  181. my $nodeJ = shift;
  182. ## set new parent k of nodeI and nodeJ
  183. my $parent = scalar(@{$distPtr->[0]});
  184. my $k = $parent;
  185. $hasParentPtr->{$nodeI} = $parent;
  186. $hasParentPtr->{$nodeJ} = $parent;
  187. $clusterSizePtr->{$parent} = $clusterSizePtr->{$nodeI} + $clusterSizePtr->{$nodeJ};
  188. ## calculate distances from all available nodes
  189. ## (i.e. the ones without parent) to new node k
  190. for (my $el=0; $el<@{$distPtr}; $el++) {
  191. if ($hasParentPtr->{$el} or $el==$nodeI or $el==$nodeJ) {
  192. $distPtr->[$el][$k] = 0;
  193. $distPtr->[$k][$el] = 0;
  194. next;
  195. }
  196. # print "distPtr->[$nodeI][$el]=" . $distPtr->[$nodeI][$el] . "\n";
  197. # print "clusterSizePtr->{$nodeI}=" . $clusterSizePtr->{$nodeI} . "\n";
  198. # print "distPtr->[$nodeJ][$el]=" . $distPtr->[$nodeJ][$el] . "\n";
  199. # print "clusterSizePtr->{$nodeJ}=" . $clusterSizePtr->{$nodeJ} . "\n";
  200. $distPtr->[$el][$k] = ($distPtr->[$el][$nodeI]*$clusterSizePtr->{$nodeI} + $distPtr->[$el][$nodeJ]*$clusterSizePtr->{$nodeJ}) / ($clusterSizePtr->{$nodeI} + $clusterSizePtr->{$nodeJ});
  201. $distPtr->[$k][$el] = $distPtr->[$el][$k];
  202. }
  203. return $k;
  204. }
  205. sub connectNodes {
  206. my $hasParentPtr = shift;
  207. my $clusterSizePtr = shift;
  208. my $nodeI = shift;
  209. my $nodeJ = shift;
  210. my $parent = shift;
  211. $hasParentPtr->{$nodeI} = $parent;
  212. $hasParentPtr->{$nodeJ} = $parent;
  213. # print "clusterSize->{$nodeI} = " . $clusterSizePtr->{$nodeI} . "\n";
  214. # print "clusterSize->{$nodeJ} = " . $clusterSizePtr->{$nodeJ} . "\n";
  215. $clusterSizePtr->{$parent} = $clusterSizePtr->{$nodeI} + $clusterSizePtr->{$nodeJ};
  216. }
  217. sub printDistances {
  218. my $distPtr = shift;
  219. my @dist = @$distPtr;
  220. print "Distances\n";
  221. print "---------\n";
  222. for (my $i=0; $i<@dist; $i++) {
  223. my @row = @{$dist[$i]};
  224. for (my $j=0; $j<@row; $j++) {
  225. print sprintf("%.3f ", $row[$j]);
  226. }
  227. print "\n";
  228. }
  229. print "\n";
  230. }
  231. sub readDistances {
  232. my $file = shift;
  233. my @sim;
  234. open(SH, "< $file") or die "Cant open $file: $!\n";
  235. while(my $line = <SH>) {
  236. my @toks = split(/\s+/, $line);
  237. push(@sim, \@toks);
  238. }
  239. close(SH);
  240. return @sim;
  241. }
  242. ## build upgma tree from distance matrix
  243. ## the distance matrix is extended in every step (for every new node)
  244. ## leaves are numbered as rows (columns) in distances, ie.
  245. ## ordering is important, see calculatePairwiseDistances
  246. sub upgma {
  247. my @distances = @_;
  248. my $tree = simpleTree->new();
  249. my $v = 2;
  250. my $numLeafs = scalar(@distances);
  251. my %clusterSize;
  252. for (my $i=0; $i<@distances; $i++) {
  253. $clusterSize{$i} = 1;
  254. $tree->addLeaf($i);
  255. }
  256. my %hasParent;
  257. for (my $step=0; $step<$numLeafs-1; $step++) {
  258. &printDistances(\@distances);
  259. ## search for two nodes with minimal distance
  260. my ($nodeI, $nodeJ, $maxDist) = &searchOpt(\%hasParent, \@distances);
  261. if ($v >= 2) {
  262. print "searchOpt=(nodeI=$nodeI,nodeJ=$nodeJ,maxDist=$maxDist)\n";
  263. }
  264. ## add a new node connecting nodeI and nodeJ
  265. my $parent = &addNodeAndConnect(\%hasParent, \@distances, \%clusterSize, $nodeI, $nodeJ);
  266. if ($v >= 2) { print "parent=$parent\n"; }
  267. $tree->addNode($parent);
  268. $tree->connectAdaptWeightToHeight($nodeI, $nodeJ, $maxDist/2, $maxDist/2);
  269. }
  270. return $tree;
  271. }
  272. sub calculateMatrixQ {
  273. my $distPtr = shift;
  274. my $LPtr = shift;
  275. ## calculate r_i
  276. my @idx = sort {$a <=> $b} keys(%{$LPtr});
  277. for (my $i=0; $i<@idx; $i++) {
  278. my $sum = 0;
  279. for (my $k=0; $k<@idx; $k++) {
  280. $sum += $distPtr->[$idx[$i]][$idx[$k]];
  281. }
  282. $LPtr->{$idx[$i]} = $sum / (scalar(@idx) - 2);
  283. }
  284. ## calculate D_ij and save pair i,j which gives minimum
  285. my $minI;
  286. my $minJ;
  287. my $minD = 999999999;
  288. for (my $i=0; $i<@idx; $i++) {
  289. for (my $j=0; $j<@idx; $j++) {
  290. next if ($idx[$i] == $idx[$j]);
  291. my $D = $distPtr->[$idx[$i]][$idx[$j]] - ($LPtr->{$idx[$i]} + $LPtr->{$idx[$j]});
  292. if ($D < $minD) {
  293. $minD = $D;
  294. $minI = $idx[$i];
  295. $minJ = $idx[$j];
  296. }
  297. }
  298. }
  299. return ($minI, $minJ);
  300. }
  301. sub newNodeWithDistances {
  302. my $distPtr = shift;
  303. my $LPtr = shift;
  304. my $minI = shift;
  305. my $minJ = shift;
  306. my $k = scalar(@{$distPtr});
  307. my @idx = sort {$a <=> $b} (keys %{$LPtr});
  308. for (my $m=0; $m<@idx; $m++) {
  309. $distPtr->[$k][$idx[$m]] = 0.5*($distPtr->[$minI][$idx[$m]] + $distPtr->[$minJ][$idx[$m]] - $distPtr->[$minI][$minJ]);
  310. $distPtr->[$idx[$m]][$k] = $distPtr->[$k][$idx[$m]];
  311. }
  312. $distPtr->[$k][$k] = 0;
  313. return $k;
  314. }
  315. sub neighborJoin {
  316. my @distances = @_;
  317. &printMatrix(\@distances, "distances in neighborjoin");
  318. my $v = 2;
  319. my $tree = simpleTree->new();
  320. my %L;
  321. ## T is set of leaf nodes, L=T
  322. for (my $i=0; $i<@distances; $i++) {
  323. $tree->addLeaf($i);
  324. $L{$i} = 1;
  325. print "add leaf $i\n" if ($v >= 2);
  326. }
  327. while (scalar(keys(%L)) > 2) {
  328. ## calculate matrix Q
  329. ## and find nodes i,j to join
  330. my ($minI, $minJ) = &calculateMatrixQ(\@distances, \%L);
  331. printMatrix(\@distances, "MatrixQ") if ($v >= 2);
  332. print "minI=$minI, minJ=$minJ\n" if ($v >= 2);
  333. ## define new node k and calculate its distances
  334. my $k = &newNodeWithDistances(\@distances, \%L, $minI, $minJ);
  335. print "new node k=$k\n" if ($v >= 2);
  336. ## calculate edge lengths ik, jk
  337. my $distIK = 0.5*($distances[$minI][$minJ] + $L{$minI} - $L{$minJ});
  338. my $distJK = $distances[$minI][$minJ] - $distIK;
  339. print "distIK=$distIK, distJK=$distJK\n" if ($v >= 2);
  340. ## add node k as parent of i,j to tree
  341. $tree->connect($minI, $minJ, $distIK, $distJK, $k);
  342. print "connect($minI,$minJ,$distIK,$distJK,$k)\n" if ($v >= 2);
  343. ## remove i and j from L, add k
  344. delete $L{$minI};
  345. delete $L{$minJ};
  346. $L{$k} = 1;
  347. }
  348. if (scalar(keys(%L)) == 2) {
  349. ## only two leaves remain: add last "edge" (in fact two edges, each having half the weight since tree is rooted)
  350. my @final = keys(%L);
  351. my $i = $final[0];
  352. my $j = $final[1];
  353. print "final i=$i\n" if ($v>=2);
  354. print "final j=$j\n" if ($v>=2);
  355. my $finalWeight = $distances[$i][$j];
  356. print "finalWeight=$finalWeight\n" if ($v>=2);
  357. $tree->connect($i, $j, $finalWeight/2, $finalWeight/2);
  358. }
  359. return $tree;
  360. }
  361. sub iterativelyWeightTemplates {
  362. my $tree = shift;
  363. my $treeModify = $tree; ## copy by value
  364. my $root = $treeModify->getRoot();
  365. my @leaves = $treeModify->getAllLeavesBelow($root);
  366. my %weights = map {$_ => 1} @leaves; ## initial weights
  367. ## preprocessing of tree to ensure that getNextAllBelowVisitedNode works correctly
  368. $treeModify->visitAllLeaves();
  369. my $v = 2;
  370. my $round = 0;
  371. my $curNode = -1;
  372. ## iteratively start with "lowest" nodes and go up till root is reached
  373. while ($curNode != $root) {
  374. if ($v>= 2) {
  375. print "round=$round\n";
  376. print "---------\n";
  377. $round++;
  378. }
  379. ## find node with all its children have already been visited
  380. $curNode = $treeModify->getNextAllBelowVisitedNode();
  381. my $parent = $treeModify->{parent}->{$curNode};
  382. if ($v >= 2) {
  383. print "curNode=$curNode\n";
  384. print "parent=$parent\n";
  385. }
  386. ## get its leaves
  387. my @curLeaves = $treeModify->getAllLeavesBelow($curNode);
  388. if ($v >= 2) {
  389. print "curLeaves=@curLeaves\n";
  390. }
  391. ## update weights
  392. my $t0 = $treeModify->{weight}->{$curNode};
  393. if ($t0 < 0.00001) {
  394. $t0 = 0.0001;
  395. }
  396. if ($v >= 2) {
  397. print "t0=$t0\n";
  398. }
  399. my $normalizer = 1.0/$t0;
  400. foreach my $leaf (@curLeaves) {
  401. $normalizer += $weights{$leaf} / $tree->{weight}->{$leaf};
  402. }
  403. foreach my $leaf (@curLeaves) {
  404. my $distn = $treeModify->{weight}->{$leaf};
  405. if ($distn < 0.00001) { $distn = 0.0001; }
  406. $weights{$leaf} *= (1.0/$t0 + 1.0/$distn) / $normalizer;
  407. if ($v >= 2) {
  408. print "weight{$leaf}=$weights{$leaf}\n";
  409. }
  410. }
  411. ## update distances
  412. foreach my $leaf (@curLeaves) {
  413. $treeModify->{weight}->{$leaf} += $t0;
  414. }
  415. }
  416. foreach my $leaf (sort {$a <=> $b} keys(%weights)) {
  417. print "final weight for $leaf: $weights{$leaf}\n";
  418. }
  419. return %weights;
  420. }
  421. 1;