selectTemplatesHeuristic.pm 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. #!/user/bin/perl -w
  2. package selectTemplatesHeuristic;
  3. use strict;
  4. use config;
  5. use Template;
  6. use TemplateList;
  7. use utilities;
  8. our @ISA = qw(Exporter);
  9. our @EXPORT = qw(preselectTemplates presetAccPosteriors ChooseTemplatesScoringHeuristic SingleTemplateSelection UpdateMaxProb);
  10. #########################################################################
  11. ## input: inbase, i.e. path/name of query
  12. ## outbase, i.e. path/name of outputs for query
  13. ## bestByProp, i.e. number of best entries taken, e.g. by sim
  14. ## output: array of "best" templates wrt similarity, sumProb/len, score
  15. #########################################################################
  16. sub preselectTemplates {
  17. my $numBestByProp = shift;
  18. my $templateList = shift;
  19. my $verbose = 1;
  20. if ($verbose >= 2) {
  21. $templateList->print();
  22. }
  23. ## heuristic works as follows:
  24. ## get an hhr file and select the best X templates wrt.
  25. ## - similarity
  26. ## - sumprobs
  27. ## - probability
  28. ## and for the rest (about 15-20?) templates use the scoring procedure
  29. my $selectedTemplates = TemplateList->new();
  30. ## select most interesting 30-50 templates using a heuristic
  31. ## best numBestByProp wrt similarity
  32. $templateList->sort_by_sim();
  33. for (my $i=0; $i<$numBestByProp; $i++) {
  34. my $templ = $templateList->get($i);
  35. $selectedTemplates->check_and_add($templ);
  36. }
  37. ## best numBestByProp wrt sumOfProbs
  38. $templateList->sort_by_sumProbL();
  39. for (my $i=0; $i<$numBestByProp; $i++) {
  40. $selectedTemplates->check_and_add($templateList->get($i));
  41. }
  42. ## best numBestByProp wrt probability
  43. $templateList->sort_by_prob();
  44. for (my $i=0; $i<$numBestByProp; $i++) {
  45. $selectedTemplates->check_and_add($templateList->get($i));
  46. }
  47. return ($selectedTemplates);
  48. }
  49. ########################################################
  50. ## input: array of templates (from CreateArrayInfo)
  51. ## inbase: path/queryname to hhm file
  52. ## outbase: path/queryname, where path is the
  53. ## location to write new files; all these new
  54. ## files start with queryname
  55. ## templateList: array of preselected templates
  56. ## as created by CreateInfoArray
  57. ##
  58. ## output: hash of references to arrays containing
  59. ## posteriori probabilities for each template
  60. ########################################################
  61. sub presetAccPosteriors {
  62. print "calling presetAccPosteriors\n";
  63. my $queryLength = shift;
  64. my $outbase = shift;
  65. my $tlist = shift;
  66. my $config = shift;
  67. my $verbose = 3;
  68. my %posteriors;
  69. ## create tab files for all templates
  70. for (my $tl=0; $tl<$tlist->size(); $tl++)
  71. {
  72. my $template = $tlist->get($tl);
  73. my $querystart = $template->get_Qstart();
  74. my $queryend = $template->get_Qend();
  75. my $tempstart = $template->get_Tstart();
  76. my $tempend = $template->get_Tend();
  77. ## filename for new tab file
  78. my $tabFileForHit = "$outbase." . $template->get_Hit() . ".HIT" . $template->get_No() . ".tab";
  79. ## create tab file(s) if necessary (for posterior probabilities)
  80. if (not (-e $tabFileForHit))
  81. {
  82. if ($verbose >= 1) {
  83. print "Creating $tabFileForHit...\n";
  84. }
  85. my $success = &BuildSingleTabFile("$outbase." . $template->get_Filt() . ".tab", $template->get_No(), $outbase);
  86. if ( ! $success) {
  87. print "Warning: presetAccPosteriors could not build tab file $tabFileForHit!\n";
  88. next;
  89. }
  90. }
  91. open (ALIH, "< $tabFileForHit") or die ("Cant open $tabFileForHit! $!\n");
  92. ## save posterior probabilities for template at positions given in aligned range
  93. my @templatePosteriors;
  94. for(my $i=0; $i<$queryLength; $i++)
  95. {
  96. push(@templatePosteriors, 0);
  97. }
  98. ## go through all aligned positions
  99. while(<ALIH>)
  100. {
  101. next if (/^\s*i\s+j\s+score\s+SS\s+probab/);
  102. ## position in query, position in template, score, SS, posterior-probability, dssp
  103. if (/(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/)
  104. {
  105. my $posAli = $1 - 1;
  106. my $posteriorProbab = $5;
  107. my $dssp = $6;
  108. if ($dssp eq '-') {
  109. $templatePosteriors[$posAli] = 0;
  110. }
  111. else {
  112. $templatePosteriors[$posAli] = $posteriorProbab;
  113. }
  114. next;
  115. }
  116. ## position in query, position in template, score, SS, posterior-probability
  117. ## if case is only entered, if no dssp value is available
  118. if (/(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/)
  119. {
  120. if ($verbose >= 3) {
  121. print "warning: no dssp value for " . $template->get_Hit() . ": i=$1, j=$2\n";
  122. }
  123. my $posAli = $1 - 1;
  124. my $posteriorProbab = $5;
  125. $templatePosteriors[$posAli] = $posteriorProbab;
  126. }
  127. }
  128. close(ALIH);
  129. my $templateName = $template->get_Hit();
  130. my $hitnr = $template->get_No();
  131. ## each accepted template gets an unique key in its posterior-probability table
  132. my $accKey = $templateName . "##" . $hitnr;
  133. ## works because of reference counting
  134. $posteriors{$accKey} = \@templatePosteriors;
  135. if ($verbose >= 1) {
  136. print "(created $tabFileForHit and) saved posteriors\n";
  137. }
  138. }
  139. return %posteriors;
  140. }
  141. sub UpdateMaxProb {
  142. my $maxProbPtr = shift;
  143. my $posteriorsPtr = shift;
  144. my $probHom = shift;
  145. my $similarity = 0;
  146. print "probHom=$probHom\n";
  147. my $increased = 0;
  148. for (my $i=0; $i<@{$maxProbPtr}; $i++) {
  149. my $probAtPos = $probHom * $posteriorsPtr->[$i] * ($similarity + 1);
  150. if ($probAtPos > $maxProbPtr->[$i]) {
  151. $maxProbPtr->[$i] = $probAtPos;
  152. $increased = 1;
  153. }
  154. my $r = sprintf("%.4f", $maxProbPtr->[$i]);
  155. if ($increased) {
  156. print "$r(+) ";
  157. } else {
  158. print "$r ";
  159. }
  160. $increased = 0;
  161. }
  162. print "\n";
  163. }
  164. ## calculate score for each possible template as follows:
  165. ## S(t) = sum_{i \in A_t} { exp( alpha*(Prob_t*P_t(i) - P_{max}(i)) ) - 1}
  166. ##
  167. ## take template with greatest score as long as it is > 0
  168. ##
  169. ## TODO: split this function into 2 parts?
  170. sub ChooseTemplatesScoringHeuristic
  171. {
  172. my $queryName = shift;
  173. my $queryPath = shift;
  174. my $queryLength = shift;
  175. my $outbase = shift;
  176. ## maximum number of templates to rank with this heuristic
  177. my $maxNumTemplates = shift;
  178. my $preselect = shift; ## if preselect = 1 => preselect wrt score, sumprobs, and prob,
  179. ## if preselect = 2 => preselect first element in predictions
  180. ## otherwise do not preselect anything
  181. my $templateList = shift;
  182. my $config = shift;
  183. my $verbose = 3;
  184. ## array for finally chosen templates
  185. my $templatesChosen = TemplateList->new();
  186. ## containing posterior probabilities for all templates
  187. my %accPosteriors;
  188. ## containing max_{t T_acc} {Prob_t * P_t(i) for all positions i
  189. my @maxProb;
  190. my %templateAlreadyAccepted;
  191. my $inbase = "$queryPath/$queryName";
  192. ## heuristic parameter
  193. my $alpha = 1;
  194. my $yshift = 0.95;
  195. my $overlapThreshold = 0.5;
  196. print "selectTempaltesHeuristic:\n";
  197. print "alpha=$alpha\n";
  198. print "yshift=$yshift\n\n";
  199. ## values for current best template
  200. my $maxScore = -9999999;
  201. my $bestTemplateIdx;
  202. my @posteriorsMax;
  203. my $probHomMax;
  204. my $similarityMax;
  205. my $needMoreTemplates = 1;
  206. my $bestRankedPtr = 0;
  207. my $bestRanked = "nnn";
  208. my %accepted;
  209. ## calculate posterior probabilities for all templates in list
  210. %accPosteriors = &presetAccPosteriors($queryLength, $outbase, $templateList, $config);
  211. ## set maxProb to zero array
  212. for (my $i=0; $i<$queryLength; $i++) {
  213. push(@maxProb, 0);
  214. }
  215. ## preselect templates wrt sumprobs, score and prob. Fill up according to scoring heuristic
  216. ## TODO: check for maxNumTemplates exceeding
  217. if ($preselect == 1) {
  218. ## take best 3 wrt sumprobs, score and prob
  219. print "calling preselectTemplates...\n";
  220. my $preselectedTemplates = &preselectTemplates(3, $templateList);
  221. print "preselection wrt sumprobs, score, prob:\n";
  222. $preselectedTemplates->print();
  223. ## for each preselected template, calculate posterior probabilities
  224. print "calling presetAccPosteriors...\n";
  225. my %accPosteriorsPreset = &presetAccPosteriors($queryLength, $outbase, $preselectedTemplates, $config);
  226. ## to be filled with all not preselected templates
  227. my $tmpTemplateList = TemplateList->new();
  228. ## remove preselected templates from list and update maxProb array
  229. for (my $i=0; $i<$templateList->size(); $i++)
  230. {
  231. my $template = $templateList->get($i);
  232. my $hitnr = $template->get_No();
  233. my $templateName = $template->get_Hit();
  234. my $key = $templateName . "##" . $hitnr;
  235. ## if template has been preselected: save it as 'chosen' and update maxProb
  236. if ( exists($accPosteriorsPreset{$key}) ) {
  237. $templatesChosen->add_template($template);
  238. my @posteriors = @{$accPosteriorsPreset{$key}};
  239. my $probHom = $template->get_Prob()/100;
  240. my $similarity = $template->get_Sim();
  241. &UpdateMaxProb(\@maxProb, \@posteriors, $probHom, $similarity);
  242. }
  243. else {
  244. $tmpTemplateList->add_template($template);
  245. }
  246. }
  247. ## reset templateList
  248. $templateList = $tmpTemplateList;
  249. }
  250. ## preselect first template in list, update maxProb array, then go on with scoring heuristic
  251. if ($preselect == 2) {
  252. ## first template in list is accepted for sure:
  253. $templateAlreadyAccepted{0} = 1;
  254. push(@{$accepted{$templateList->get(0)->get_Hit()}}, 0);
  255. $templatesChosen->add_template($templateList->get(0));
  256. my $keyFirst = $templateList->get(0)->get_Hit() . "##" . $templateList->get(0)->get_No();
  257. my @posteriorsFirst = @{$accPosteriors{$keyFirst}};
  258. my $probFirst = $templateList->get(0)->get_Prob() / 100;
  259. my $similarityFirst = $templateList->get(0)->get_Sim();
  260. print "accepting template name=" . $templateList->get(0)->get_Hit() . "hitnr=" . $templateList->get(0)->get_No() . "; score=first in list\n";
  261. ## update 'maxProb' array with values from first template
  262. &UpdateMaxProb(\@maxProb, \@posteriorsFirst, $probFirst, $similarityFirst);
  263. }
  264. ## test remaining templates, whether to choose one more or not
  265. while($needMoreTemplates) {
  266. ## if new best ranked template found => test overlap
  267. ##
  268. if ($bestRankedPtr != 0) {
  269. ## template with this id already accepted - test overlap
  270. if (exists($accepted{$bestRanked}))
  271. {
  272. my $overlapTooBig = 0;
  273. ## test overlap
  274. foreach my $lineNr (@{$accepted{$bestRanked}})
  275. {
  276. my $minLast = min($templateList->get($lineNr)->get_Tend(), $bestRankedPtr->get_Tend());
  277. my $minFirst = min($templateList->get($lineNr)->get_Tstart(), $bestRankedPtr->get_Tstart());
  278. my $maxLast = max($templateList->get($lineNr)->get_Tend(), $bestRankedPtr->get_Tend());
  279. my $maxFirst = max($templateList->get($lineNr)->get_Tstart(), $bestRankedPtr->get_Tstart());
  280. if (($minLast - $maxFirst)/($maxLast - $minFirst) > $overlapThreshold)
  281. {
  282. ## do not accept
  283. $overlapTooBig = 1;
  284. if ($verbose >= 1) {
  285. print "=> $bestRanked overlaps " . $templateList->get($lineNr)->get_Hit() . "\n";
  286. }
  287. last;
  288. }
  289. }
  290. if ($overlapTooBig == 0)
  291. {
  292. print "=> accepted{$bestRanked} += $bestTemplateIdx\n";
  293. push(@{$accepted{$bestRanked}}, $bestTemplateIdx);
  294. $templatesChosen->add_template($bestRankedPtr);
  295. &UpdateMaxProb(\@maxProb, \@posteriorsMax, $probHomMax, $similarityMax);
  296. print "accepting template name=" . $templateList->get($bestTemplateIdx)->get_Hit() ."; hitnr=" . $templateList->get($bestTemplateIdx)->get_No() . "; score=$maxScore\n";
  297. }
  298. }
  299. else
  300. {
  301. ## new template
  302. if ($bestRanked ne "nnn")
  303. {
  304. $accepted{$bestRanked} = ();
  305. print "=> accepted{$bestRanked} = $bestTemplateIdx\n";
  306. push(@{$accepted{$bestRanked}}, $bestTemplateIdx);
  307. $templatesChosen->add_template($bestRankedPtr);
  308. &UpdateMaxProb(\@maxProb, \@posteriorsMax, $probHomMax, $similarityMax);
  309. print "accepting template name=" . $templateList->get($bestTemplateIdx)->get_Hit() ."; hitnr=" . $templateList->get($bestTemplateIdx)->get_No() . "; score=$maxScore\n";
  310. }
  311. }
  312. }
  313. $bestRankedPtr = 0;
  314. $bestTemplateIdx = -1;
  315. $maxScore = -999999;
  316. ## start scoring all templates
  317. for (my $tl=0; $tl<$templateList->size(); $tl++) {
  318. next if (defined $templateAlreadyAccepted{$tl});
  319. my $tscore = 0;
  320. my $template = $templateList->get($tl);
  321. ## probability of template being homologous
  322. my $probHom = $template->get_Prob()/100;
  323. my $similarity = $template->get_Sim();
  324. my $templateName = $template->get_Hit();
  325. my $hitnr = $template->get_No();
  326. ## key should exist!?! not always, when hhsearch -tab file does not have entry for each hit - dont know why this happens...: same as in presetAccPosteriors
  327. my $key = "$templateName" . "##" . "$hitnr";
  328. next if (not exists($accPosteriors{$key}));
  329. my @posteriors = @{$accPosteriors{$key}};
  330. for (my $p=0; $p<@posteriors; $p++) {
  331. next if ($posteriors[$p] == 0);
  332. #my $probTemp = $posteriors[$p] * $probHom * ($similarity + 1);
  333. my $probTemp = $posteriors[$p] * $probHom;
  334. my $exponent = $alpha * ($probTemp - $maxProb[$p]);
  335. $tscore += exp($exponent) - $yshift;
  336. }
  337. if ($tscore > $maxScore) {
  338. $maxScore = sprintf("%.3f", $tscore);
  339. $bestTemplateIdx = $tl;
  340. $probHomMax = $probHom;
  341. $similarityMax = $similarity;
  342. @posteriorsMax = @posteriors;
  343. }
  344. } ## all templates in list are scored now.
  345. ## test how to proceed:
  346. ##
  347. if ($bestTemplateIdx == -1) {
  348. print "NO new best template could be found! stopping.\n";
  349. $needMoreTemplates = 0;
  350. next;
  351. }
  352. ## found a "valid" best template for "final" template selection
  353. if ($preselect == 2 && $maxScore > 0) {
  354. $bestRanked = $templateList->get($bestTemplateIdx)->get_Hit();
  355. $bestRankedPtr = $templateList->get($bestTemplateIdx);
  356. # push(@templatesChosen, $templateList[$maxTemplateIdx]);
  357. $templateAlreadyAccepted{$bestTemplateIdx} = 1;
  358. }
  359. ## no more "valid" template available
  360. elsif ($preselect == 2 and $maxScore <= 0) {
  361. print "No more template has a score > 0! Stop searching for templates!\n";
  362. $needMoreTemplates = 0;
  363. next;
  364. }
  365. ## fill preselection
  366. elsif ($preselect == 1) {
  367. if ($templatesChosen->size() >= $maxNumTemplates) {
  368. $needMoreTemplates = 0;
  369. print "maximum number of templates ($maxNumTemplates) reached. stop selecting templates\n";
  370. next;
  371. }
  372. $bestRanked = $templateList->get($bestTemplateIdx)->get_Hit();
  373. $bestRankedPtr = $templateList->get($bestTemplateIdx);
  374. $templateAlreadyAccepted{$bestTemplateIdx} = 1;
  375. }
  376. ## exceeding maximal number of templates
  377. if ($preselect == 2 && $templatesChosen->size() >= $maxNumTemplates) {
  378. $needMoreTemplates = 0;
  379. print "more templates with Score > 0 than needed!\n";
  380. }
  381. ## no more templates in template list
  382. if( scalar(keys(%templateAlreadyAccepted)) >= $templateList->size() ) {
  383. print "no more templates available ... stop with " . $templateList->size() . " templates.\n";
  384. $needMoreTemplates = 0;
  385. }
  386. }
  387. return $templatesChosen;
  388. }
  389. sub SingleTemplateSelection {
  390. my $templateList = shift;
  391. my $queryLength = $templateList->get_queryLength();
  392. my $templatesChosen = TemplateList->new();
  393. my $maxOverlap = 20;
  394. my $minNewCoverage = 40;
  395. my @coverage;
  396. for (my $i=0; $i<$queryLength; $i++) {
  397. $coverage[$i] = 0;
  398. }
  399. for (my $i=0; $i<$templateList->size(); $i++) {
  400. my $template = $templateList->get($i);
  401. my $qStart = $template->get_Qstart() - 1;
  402. my $qEnd = $template->get_Qend() - 1;
  403. my $unaligned = 0;
  404. my $aligned = 0;
  405. for (my $i=$qStart; $i<=$qEnd; $i++) {
  406. if ($coverage[$i] == 0) {
  407. $unaligned++;
  408. } else {
  409. $aligned++;
  410. }
  411. }
  412. if ($aligned < $maxOverlap && $unaligned > $minNewCoverage) {
  413. for (my $j=$qStart; $j<$qEnd; $j++) { $coverage[$j] = 1; }
  414. $templatesChosen->add_template($template);
  415. }
  416. }
  417. return $templatesChosen;
  418. }
  419. sub checkCoverage {
  420. my $start = shift;
  421. my $end = shift;
  422. my $coveragePtr = shift;
  423. my $overlap = 0;
  424. my $uncovered = 0;
  425. for (my $i=$start; $i<$end; $i++) {
  426. $overlap++ if ($coveragePtr->[$i] == 1);
  427. $uncovered++ if ($coveragePtr->[$i] == 0);
  428. }
  429. return ($uncovered, $overlap);
  430. }
  431. sub updateCoverage {
  432. my $start = shift;
  433. my $end = shift;
  434. my $coveragePtr = shift;
  435. my $uncovered = 0;
  436. my $overlap = 0;
  437. for (my $i=$start; $i<$end; $i++) {
  438. if ($coveragePtr->[$i] == 0) {
  439. $uncovered++;
  440. } elsif ($coveragePtr->[$i] == 1) {
  441. $overlap++;
  442. }
  443. $coveragePtr->[$i] = 1;
  444. }
  445. return ($uncovered, $overlap);
  446. }
  447. 1;