filterAlignments.pm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. #!/usr/bin/perl -w
  2. package filterAlignments;
  3. require Exporter;
  4. use strict;
  5. use config;
  6. use utilities;
  7. our @ISA = qw(Exporter);
  8. our @EXPORT = qw(Filtering FilteringRanking RemoveSSFromA3m);
  9. ## TODO: what about scopdir??, see below
  10. sub RemoveSSFromA3m {
  11. my $a3mFile = shift;
  12. open (A3M, "< $a3mFile") or die "Cant open $a3mFile: $!\n";
  13. my @a3m = <A3M>;
  14. close(A3M);
  15. ## assume sequence ID starts with a number
  16. ## remove all lines before ID
  17. while($a3m[0] !~ /^>\d/) {
  18. shift(@a3m);
  19. }
  20. open (A3M, "> $a3mFile") or die "Cant open $a3mFile: $!\n";
  21. print (A3M @a3m);
  22. close(A3M);
  23. }
  24. #######################################################
  25. ## input: inbase, i.e. basename of hhr, a3m and hhm
  26. ## outbase, i.e. where to write output for query
  27. ## (filtered query a3m, hhm); also base
  28. ## for other output files (filtered a3m
  29. ## templates)
  30. ## output: list of templates with parameters
  31. ## obtained by the following procedure
  32. ##
  33. ## description:
  34. ## first create a template info array
  35. ## take the first HITS templates and use them as
  36. ## "database" for the filter steps
  37. ## do filter-steps: for each filtering strength:
  38. ## create new filtered a3m files (templates and query)
  39. ## => create hhm files
  40. ## => do hhsearch => hhr file => CreateInfoArray
  41. ## append the templates from CreateInfoArray to
  42. ## list of templates, update best templates for
  43. ## each position in query
  44. ## do filtering until either last filter step is
  45. ## reached or filtering gives no more templates
  46. ##
  47. #######################################################
  48. sub Filtering {
  49. my $queryName = shift;
  50. my $outbase = shift; ## basename of outputs wrt query during filtering (e.g. filt.a3m), remove these outputs later
  51. my $templateList = shift;
  52. my $server = shift;
  53. my $config = shift;
  54. my $qscStep = shift || 0.1;
  55. my $outdir; ## directory where to write output during filtering, e.g. *.filt.a3m files
  56. ## setup
  57. my $verbose = 1;
  58. my $v = 1;
  59. if ($outbase =~ /^(.*?)\/([^\/]*)$/) {
  60. $outdir = $1;
  61. } else {
  62. $outdir = ".";
  63. }
  64. my $cpus = $config->get_cpus();
  65. my $options = "-mact " . $config->get_hhsearch_mact();
  66. my $qscmax = 0; #-qsc (max similarity)
  67. ## TODO: remove limitation, all preselected hits are filtered
  68. my $HITS = 50; #number of hits to be filtered
  69. my %bestscores; #hash with raw scores for each hit (in order to update SCOREs correctly)
  70. my $MAXOVLAP = 20; #maximum allowable overlap to treat same templatehits independently
  71. my $hhmake = $config->get_hhmake();
  72. my $hhfilter = $config->get_hhfilter();
  73. my $pdbdir = $config->get_pdbdir();
  74. ## fill bestscores hash
  75. for (my $i=0; $i<$templateList->size(); $i++){
  76. my $template = $templateList->get($i);
  77. #new template:
  78. if (! $bestscores{$template->get_Hit()}){
  79. push (@{$bestscores{$template->get_Hit()}}, [$template->get_Qstart(),
  80. $template->get_Qend(),
  81. $template->get_Score(),
  82. $template->get_SumProbL()]);
  83. }
  84. #template exists already: check if independent one
  85. else {
  86. if ($bestscores{$template->get_Hit()}[0][0] < $template->get_Qstart()) {
  87. #existing template "in front of" new one
  88. if (($bestscores{$template->get_Hit()}[0][1]-$template->get_Qstart())<=$MAXOVLAP) {
  89. #overlap <= 20 residues
  90. #push to key -> additional array
  91. push (@{$bestscores{$template->get_Hit()}}, [$template->get_Qstart(),
  92. $template->get_Qend(),
  93. $template->get_Score(),
  94. $template->get_SumProbL()]);
  95. }
  96. else{
  97. if ($bestscores{$template->get_Hit()}[0][2]< $template->get_Score()){
  98. #if existing score is smaller than the score of
  99. #another part of the query-template alignment: update it
  100. $bestscores{$template->get_Hit()}[0][2] = $template->get_Score();
  101. }
  102. }
  103. }
  104. else {
  105. #existing template "behind"
  106. if (($template->get_Qend() - $bestscores{$template->get_Hit()}[0][0]) <= $MAXOVLAP) {
  107. #overlap <= 20 residues
  108. #push to key -> additional array
  109. push (@{$bestscores{$template->get_Hit()}}, [$template->get_Qstart(),
  110. $template->get_Qend(),
  111. $template->get_Score(),
  112. $template->get_SumProbL()]);
  113. }
  114. else{
  115. if ($bestscores{$template->get_Hit()}[0][2]< $template->get_Score()){
  116. #if existing score is smaller than the score of another
  117. #part of the query-template alignment: update it
  118. $bestscores{$template->get_Hit()}[0][2] = $template->get_Score();
  119. }
  120. }
  121. }
  122. }
  123. }
  124. ####################
  125. #search for -qsc max
  126. ####################
  127. for (my $i=0; $i<$templateList->size(); $i++) {
  128. my $tt = $templateList->get($i);
  129. if ($tt->get_Sim() > $qscmax) { $qscmax = $tt->get_Sim(); }
  130. }
  131. my @hits = (); #array with hits of prob>80%
  132. my $db = ""; #contains hhms of templates (for filtering)
  133. my $a3msToFilter = ""; #contains a3ms of templates (for filtering)
  134. # my $pid = getppid();
  135. ## copy a3m files into temporary directory
  136. my $randID = &randBetween(1000, 9999);
  137. my $tmpDir = "/tmp/Filtering$queryName$randID";
  138. while (-e $tmpDir) {
  139. $randID = &randBetween(1000, 9999);
  140. $tmpDir .= "$randID";
  141. }
  142. &System("mkdir $tmpDir");
  143. for (my $i=0; $i<$templateList->size(); $i++) {
  144. my $tt = $templateList->get($i);
  145. #if probability of hit is greater than 80%
  146. if ($tt->get_Prob() >= 80){
  147. my $hhm = "$tmpDir/" . $tt->get_Hit() . ".filt.hhm";
  148. #check for double entries!
  149. if ($db !~ /$hhm/){
  150. $db .= "$hhm ";
  151. $a3msToFilter .= "$tmpDir/" . $tt->get_Hit() . ".filt.a3m ";
  152. &System("cp " . $config->get_pdbdir() . "/" . $tt->get_Hit() . ".a3m $tmpDir/" . $tt->get_Hit() . ".filt.a3m");
  153. push (@hits, $tt->get_Hit());
  154. }
  155. }
  156. }
  157. #############
  158. #filtersteps:
  159. #############
  160. &System("cp $outbase.a3m $tmpDir/$queryName.filt.a3m");
  161. my $hhrnr = -1;
  162. for (my $qsc=0; $qsc<$qscmax+0.1; $qsc+=$qscStep)
  163. {
  164. #if there are hits with more than 80% probability:
  165. if (@hits>0){
  166. $hhrnr++;
  167. #filter all hits with prob>= 80% from previous search with "-qsc $qsc"
  168. my $filteredA3mFiles = "";
  169. if ($config->get_parallelFiltering()) {
  170. my $multithread = $config->get_multithread();
  171. my $cpus = $config->get_cpus();
  172. my $cmd = "$multithread '$a3msToFilter' '$hhfilter -i \$file -id 100 -diff 0 -qsc $qsc -o \$file.out &> /dev/null' -cpu $cpus";
  173. &System("$cmd");
  174. sleep(1);
  175. $cmd = "$multithread '$a3msToFilter' '$hhmake -i \$file.out -id 100 -diff 0 -qsc $qsc -o $tmpDir/\$base.hhm &> /dev/null' -cpu $cpus";
  176. &System("$cmd");
  177. sleep(1);
  178. } else {
  179. foreach my $fileToFilter (split(/\s+/, $a3msToFilter)) {
  180. &System("$hhfilter -i $fileToFilter -id 100 -diff 0 -qsc $qsc -o $fileToFilter.out -v $v &> /dev/null");
  181. sleep 1 while (! -e "$fileToFilter.out");
  182. $filteredA3mFiles .= "$fileToFilter.out ";
  183. }
  184. foreach my $fileToMake (split(/\s+/, $filteredA3mFiles)) {
  185. my $base = $fileToMake;
  186. $base =~ s/\.a3m\.out$//;
  187. &System("$hhmake -i $fileToMake -diff 100 -o $base.hhm -v $v &> /dev/null");
  188. sleep 1 while (! -e "$base.hhm");
  189. }
  190. }
  191. #filter query:
  192. &System("$hhfilter -i $tmpDir/$queryName.filt.a3m -id 100 -diff 0 -qsc $qsc -o $tmpDir/$queryName.filt.a3m -v 1");
  193. sleep 1 while ( !(-e "$tmpDir/$queryName.filt.a3m") );
  194. &System("$hhmake -i $tmpDir/$queryName.filt.a3m -diff 100 -o $tmpDir/$queryName.filt.hhm -v 1");
  195. sleep 1 while ( !(-e "$tmpDir/$queryName.filt.hhm") );
  196. #hhsearch:
  197. &System($config->get_hhsearch() . " -cpu " . $config->get_cpus() . " -i $tmpDir/$queryName.filt.hhm -d \"$db\" -o $outbase.$hhrnr.hhr $options -Z $HITS -B $HITS -atab $outbase.$hhrnr.tab");
  198. sleep 1 while ( !(-e "$outbase.$hhrnr.hhr") );
  199. my $tListFiltered = TemplateList->new();
  200. $tListFiltered->hhr_to_TemplateList("$outbase.$hhrnr.hhr");
  201. ###########################################
  202. #update rawscores & SumProbs/Length & @hits
  203. ###########################################
  204. @hits = ();
  205. $db = "";
  206. $a3msToFilter = "";
  207. for (my $i=0; $i<$tListFiltered->size(); $i++) {
  208. my $filtTemplate = $tListFiltered->get($i);
  209. my $difference = 1000;
  210. my $chooseScore;
  211. my $templ = -1;
  212. my $choostempl;
  213. #to distinguish between "same" or "different" covered part of query
  214. foreach my $e (@{$bestscores{$filtTemplate->get_Hit()}}) {
  215. $templ++;
  216. my $startdifference = abs(@{$e}[0] - $filtTemplate->get_Qstart());
  217. #difference between start of observed and start of actual queryalignmentpositon
  218. #score belongs to template with smallest difference between startpositions
  219. if ($startdifference < $difference){
  220. $chooseScore = @{$e}[2];
  221. $choostempl = $templ;
  222. $difference = $startdifference;
  223. }
  224. }
  225. if ($chooseScore< $filtTemplate->get_Score()){
  226. #if existing score is smaller than the filtered score: update it
  227. $bestscores{$filtTemplate->get_Hit()}[$choostempl][2] = $filtTemplate->get_Score(); #update scores in %bestscores
  228. }
  229. if ($filtTemplate->get_Prob() >= 80) {
  230. my $hhm = "$tmpDir/" . $filtTemplate->get_Hit() . ".filt.hhm";
  231. #check for double entries!
  232. if ($db !~ /$hhm/) {
  233. $db .= "$hhm ";
  234. $a3msToFilter .= "$tmpDir/" . $filtTemplate->get_Hit() . ".filt.a3m ";
  235. push (@hits, $filtTemplate->get_Hit());
  236. }
  237. }
  238. }
  239. ## add filtered hits to final templates list
  240. for (my $j=0; $j<$tListFiltered->size(); $j++) {
  241. $templateList->add_template($tListFiltered->get($j));
  242. }
  243. }
  244. }
  245. #update rawscores in array with all possible templates
  246. for (my $i=0; $i<$templateList->size(); $i++) {
  247. my $template = $templateList->get($i);
  248. my $difference = 1000;
  249. my $templ = -1;
  250. my $choostempl;
  251. foreach my $e (@{$bestscores{$template->get_Hit()}}){
  252. $templ++;
  253. #difference between start of observed and start of actual queryalignmentpositon
  254. my $startdifference = abs(@{$e}[0] - $template->get_Qstart());
  255. if ($startdifference < $difference) {
  256. $choostempl = $templ;
  257. $difference = $startdifference;
  258. } #score belongs to template with smallest difference between startpositions
  259. }
  260. #update scores in @prediction; omg this is ugly
  261. $template->set_Score( $bestscores{$template->get_Hit()}[$choostempl][2] );
  262. }
  263. &System("rm -R $tmpDir");
  264. return $templateList;
  265. }
  266. 1;