TemplateList.pm 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. #!/user/bin/perl -w
  2. use strict;
  3. package TemplateList;
  4. use config;
  5. use utilities;
  6. use Template;
  7. my $config = HHpredConfig->instance();
  8. sub new {
  9. my ($caller, %arg) = @_;
  10. my $caller_is_obj = ref($caller);
  11. my $class = $caller_is_obj || $caller;
  12. no strict "refs";
  13. my $self = bless {}, $class;
  14. $self->{templates} = [];
  15. $self->{queryLength} = -1;
  16. $self->{query} = "";
  17. $self->{neff} = -1;
  18. if ($caller_is_obj) {
  19. my $size = $caller->size();
  20. for (my $i=0; $i<$size; $i++) {
  21. $self->{templates}->[$i] = $caller->{templates}->[$i];
  22. }
  23. $self->{queryLength} = $caller->{queryLength};
  24. $self->{query} = $caller->{caller};
  25. $self->{neff} = $caller->{neff};
  26. }
  27. return $self;
  28. }
  29. sub add_template {
  30. my ($self, $template) = @_;
  31. my $curSize = $self->size();
  32. $self->{templates}->[$curSize] = $template;
  33. }
  34. ## before adding template, check whether it is already in list
  35. sub check_and_add {
  36. my ($self, $template) = @_;
  37. for (my $i=0; $i<$self->size(); $i++) {
  38. if ($self->{templates}->[$i]->equals($template)) {
  39. return;
  40. }
  41. }
  42. $self->add_template($template);
  43. }
  44. sub clear {
  45. my $self = shift;
  46. %{$self} = ();
  47. $self->{templates} = [];
  48. $self->{query} = "";
  49. $self->{queryLength} = -1;
  50. $self->{neff} = -1;
  51. }
  52. ## delete hit with No "No"
  53. sub delete_No {
  54. my $self = shift;
  55. my $No = shift;
  56. ## get idx for hit with No "No"
  57. my $deleteIdx = -1;
  58. for (my $i=0; $i<$self->size(); $i++) {
  59. if ($self->{templates}->[$i]->get_No() == $No) {
  60. $deleteIdx = $i;
  61. last;
  62. }
  63. }
  64. print "deleting No=$No, idx=$deleteIdx\n";
  65. if ($deleteIdx != -1) {
  66. splice(@{$self->{templates}}, $deleteIdx, 1);
  67. }
  68. }
  69. sub size {
  70. my $self = shift;
  71. return scalar(@{$self->{templates}});
  72. }
  73. sub get {
  74. my ($self, $i) = @_;
  75. $self->{templates}->[$i];
  76. }
  77. sub get_last {
  78. my $self = shift;
  79. $self->{templates}->[$self->size()-1];
  80. }
  81. sub to_string {
  82. my $self = shift;
  83. my $spacer = shift;
  84. my $out = "";
  85. for (my $i=0; $i<$self->size(); $i++) {
  86. $out .= $self->{templates}->[$i]->to_string($spacer) . "\n";
  87. }
  88. return $out;
  89. }
  90. sub print {
  91. my $self = shift;
  92. my $out = $self->to_string();
  93. print $out;
  94. }
  95. sub to_TemplateList_helper {
  96. my $self = shift;
  97. my $hhrFile = shift;
  98. my @lines = @_;
  99. my $matchC;
  100. my $No;
  101. my $filtnr = "start"; ## filter step (start means no filtering)
  102. my $spaceLen = 12;
  103. if ($hhrFile =~ /\.(\d+)\.hhr/) {
  104. $filtnr = $1;
  105. }
  106. for (my $i=0; $i<@lines; $i++) {
  107. my $line = $lines[$i];
  108. if ($line =~ /^Match_columns\s*(\S+)/) {
  109. $matchC = $1;
  110. $self->_set_queryLength($matchC);
  111. }
  112. if ($line =~ /^Query\s+(\S+)/) {
  113. my $query = $1;
  114. $self->_set_query($query);
  115. }
  116. if ($line =~ /^Neff\s+(\S+)/) {
  117. my $neff = $1;
  118. $self->_set_neff($neff);
  119. }
  120. ## No Hit Prob E-val P-val Score SS Cols Query(start end) Template(start end) HMM
  121. elsif ($line=~/^\s*(\d+)\s+(\S+).+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\S+)\)$/) {
  122. my $No = $1;
  123. my $Hit = $2;
  124. my $Prob = $3;
  125. my $Eval = $4;
  126. my $Pval = $5;
  127. my $Score = $6;
  128. my $SS = $7;
  129. my $Cols = $8;
  130. my $Qstart = $9;
  131. my $Qend = $10;
  132. my $Tstart = $11;
  133. my $Tend = $12;
  134. my $HMM = $13;
  135. my $SSL = $SS/$matchC;
  136. $SSL = sprintf("%.4f", $SSL);
  137. my $template = Template->new(Filt => $filtnr,
  138. No => $No,
  139. Hit => $Hit,
  140. Prob => $Prob,
  141. Eval => $Eval,
  142. Pval => $Pval,
  143. Score => $Score,
  144. SS => $SS,
  145. Cols => $Cols,
  146. Qstart => $Qstart,
  147. Qend => $Qend,
  148. Tstart => $Tstart,
  149. Tend => $Tend,
  150. HMM => $HMM);
  151. $self->add_template($template);
  152. }
  153. elsif($line =~ /^No\s+(\d+)/) {
  154. $No = $1;
  155. $line = $lines[++$i];
  156. if ($line !~ /^>(\S+)\s/) {
  157. die("Error:: wrong format in \"$line\"\n");
  158. }
  159. my $hit = $1;
  160. $line = $lines[++$i];
  161. if ($line !~ /Similarity=(\S+)\s+Sum_probs=(\S+)\s*/) {
  162. die("Error: wrong format in \"$line\"\n");
  163. }
  164. my $Similarity = $1;
  165. my $SumProbL = $2/$matchC;
  166. $SumProbL = sprintf("%.4f" , $SumProbL);
  167. if ($line =~ /Identities=(\S+)%\s/) {
  168. $self->get($No-1)->set_Ident($1);
  169. }
  170. $self->get($No-1)->set_Sim($Similarity);
  171. $self->get($No-1)->set_SumProbL($SumProbL);
  172. }
  173. elsif ($line =~ /^T\s+ss_dssp(\s+)(\S+)/) {
  174. $spaceLen = length($1)-1;
  175. my $ss_dssp = $self->get($No-1)->get_ss_dssp();
  176. $self->get($No-1)->set_ss_dssp("$ss_dssp" . $2);
  177. }
  178. ## Confidence line may contain spaces => read number of spaces from ss_dssp line
  179. elsif ($line =~ /^Confidence\s{$spaceLen}(.*)\n/) {
  180. my $conf = $self->get($No-1)->get_conf();
  181. $self->get($No-1)->set_conf("$conf" . $1);
  182. }
  183. }
  184. }
  185. sub str_to_TemplateList {
  186. my $self = shift;
  187. my $str = shift;
  188. my @lines;
  189. @lines = split(/\n/, $str);
  190. $self->to_TemplateList_helper("dummy", @lines);
  191. }
  192. sub hhr_to_TemplateList {
  193. my ($self, $hhrFile) = @_;
  194. my @lines;
  195. open(HHR,"< $hhrFile") or die("Cant open $hhrFile: $!\n");
  196. @lines = <HHR>;
  197. close(HHR);
  198. $self->to_TemplateList_helper($hhrFile, @lines);
  199. }
  200. sub write_to_file {
  201. my ($self, $outfile) = @_;
  202. open (OH, "> $outfile") or die("Cant write to $outfile: $!\n");
  203. my $out = $self->to_string('===');
  204. print(OH $out);
  205. close(OH);
  206. }
  207. sub read_from_file {
  208. my ($self, $infile) = @_;
  209. my $append = 0;
  210. ## append template(s) to already existing ones
  211. $append = 1 if (scalar(@_) > 2 && $_[2] == 1);
  212. $self->clear() if (! $append);
  213. open(IH, "< $infile") or die("Cant open $infile: $!\n");
  214. while(<IH>) {
  215. chomp;
  216. if (/(\S+===)+/) {
  217. my @entry = split(/===/, $_);
  218. my $template = Template->new(Filt => $entry[0],
  219. No => $entry[1],
  220. Hit => $entry[2],
  221. Prob => $entry[3],
  222. Eval => $entry[4],
  223. Pval => $entry[5],
  224. Score => $entry[6],
  225. SS => $entry[7],
  226. Cols => $entry[8],
  227. Qstart => $entry[9],
  228. Qend => $entry[10],
  229. Tstart => $entry[11],
  230. Tend => $entry[12],
  231. HMM => $entry[13],
  232. Ident => $entry[14],
  233. Sim => $entry[15],
  234. SumProbL => $entry[16],
  235. predTM => $entry[17]);
  236. $self->add_template($template);
  237. }
  238. }
  239. close(IH);
  240. }
  241. sub set_queryLength {
  242. my ($self, $len) = @_;
  243. $self->{queryLength} = $len;
  244. }
  245. sub get_queryLength {
  246. my $self = shift;
  247. $self->{queryLength};
  248. }
  249. sub set_query {
  250. my ($self, $query) = @_;
  251. $self->{query} = $query;
  252. }
  253. sub get_query {
  254. my $self = shift;
  255. $self->{query};
  256. }
  257. sub get_neff {
  258. my $self = shift;
  259. $self->{neff};
  260. }
  261. sub set_neff {
  262. my ($self, $neff) = @_;
  263. $self->{neff} = $neff;
  264. }
  265. ## for backward compatibility ##
  266. sub _set_queryLength {
  267. my ($self, $len) = @_;
  268. $self->{queryLength} = $len;
  269. }
  270. sub _get_queryLength {
  271. my $self = shift;
  272. $self->{queryLength};
  273. }
  274. sub _set_query {
  275. my ($self, $query) = @_;
  276. $self->{query} = $query;
  277. }
  278. sub _get_query {
  279. my $self = shift;
  280. $self->{query};
  281. }
  282. sub _get_neff {
  283. my $self = shift;
  284. $self->{neff};
  285. }
  286. sub _set_neff {
  287. my ($self, $neff) = @_;
  288. $self->{neff} = $neff;
  289. }
  290. ######
  291. sub sort_by_sim {
  292. my $self = shift;
  293. @{$self->{templates}} = sort {$b->get_Sim() <=> $a->get_Sim()} @{$self->{templates}};
  294. }
  295. sub sort_by_prob {
  296. my $self = shift;
  297. @{$self->{templates}} = sort {$b->get_Prob() <=> $a->get_Prob()} @{$self->{templates}};
  298. }
  299. sub sort_by_sumProbL {
  300. my $self = shift;
  301. @{$self->{templates}} = sort {$b->get_SumProbL() <=> $a->get_SumProbL()} @{$self->{templates}};
  302. }
  303. sub sort_by_predTM {
  304. my $self = shift;
  305. @{$self->{templates}} = sort {$b->get_predTM() <=> $a->get_predTM()} @{$self->{templates}};
  306. }
  307. sub templateList_to_hhr {
  308. my $self = shift;
  309. my $outbase = shift;
  310. my $hhsearch = $config->get_hhsearch();
  311. my @hhrContent = ();
  312. open(HHR, "> $outbase.hhr") or die ("Error in templateList_to_hhr: Cant write $outbase.hhr: $!\n");
  313. for (my $i=0; $i<$self->size(); $i++) {
  314. my $template = $self->get($i);
  315. ## open apropriate hhr file (wrt filter step)
  316. my $infile = "$outbase." . $template->get_Filt() . ".hhr";
  317. open (IN, "< $infile") or die ("Error: cannot open $infile!\n");
  318. my $checkedHeader = 0;
  319. my $begin;
  320. my $e = 0;
  321. my $end;
  322. my $line;
  323. my $hitnr = $i+1;
  324. while ($line = <IN>) {
  325. ## copy first header lines:
  326. if (($checkedHeader==0) && ($i==0) && ($line !~ /^\s*\d+\s+\S+.+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$/)) {
  327. if ($line=~ /^Command/) {
  328. $line=~ s/(^Command\s*)(.*)$/$1$hhsearch artificial hhr file/;
  329. }
  330. ## replace P-value against TMscore
  331. if ($line=~ /\s+No\s+Hit\s+Prob\s+E-value\s+P-value\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s*/) {
  332. $line =~ s/(\s*No\s+Hit\s+Prob\s+E-value\s+)(P-value)(\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s+)/$1TMScore$3/;
  333. }
  334. print (HHR "$line");
  335. }
  336. else {
  337. $checkedHeader = 1;
  338. }
  339. ## get hit Info:
  340. my $No = $template->get_No();
  341. if ($line =~ /^\s*$No(\s+\S+.+\s+\S+\s+\S+)\s+\S+(\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$)/) {
  342. ## replace P-value by TMScore in hit info
  343. $line = sprintf("%3s$1 %1.4f$2\n", $hitnr, $template->get_predTM());
  344. print (HHR "$line");
  345. last;
  346. }
  347. }
  348. ## skip all lines up to alignment block
  349. ## Find beginning of alignment and replace hit index by new one
  350. while ($line = <IN>){
  351. my $No = $template->get_No();
  352. if ($line =~ /^No\s+$No/) {
  353. last;
  354. }
  355. }
  356. $line =~ s/^No\s+\d+/No $hitnr/;
  357. push(@hhrContent, $line);
  358. ## Push alignment block onto array
  359. while ($line = <IN>) {
  360. if(($line =~ /^No\s/)) {
  361. last;
  362. }
  363. if ($line =~ /Done!/) {}
  364. else {
  365. push(@hhrContent, $line);
  366. }
  367. }
  368. close (IN);
  369. ## create associated tab file
  370. &BuildSingleTabFile("$outbase." . $template->get_Filt() . ".tab", $template->get_No(), $outbase);
  371. }
  372. print(HHR "\n");
  373. print(HHR @hhrContent);
  374. print(HHR "Done!\n");
  375. close (HHR);
  376. }
  377. ## starting from current hhr file, extract some features and save them into resultfile
  378. ## this is needed for benchmark set compilation
  379. sub createBenchmarkInfoFile {
  380. my ($self, $resultFile, $pdbdir) = @_;
  381. my $TMalign = $config->get_TMalign();
  382. my $query = $self->_get_query();
  383. my $queryPDB = "$pdbdir/$query.pdb";
  384. my $res = "";
  385. $res .= "queryName"."\t"."TMID"."\t"."coverage"."\t"."queryLen"."\t"."templateName"."\t"."TMscore\n";
  386. ## extract information from max first 50 templates
  387. for (my $i=0; $i<50 && $i<$self->size(); $i++) {
  388. my $template = $self->get($i);
  389. my $TMscore = 0;
  390. my $TMid = 0;
  391. my $templatePDB = "$pdbdir/" . $template->get_Hit() . ".pdb";
  392. my $tmalignResult = `$TMalign $templatePDB $queryPDB`;
  393. if ($tmalignResult =~ /TM-score\s*=\s*(\S+),\s+ID\s*=\s*(\S+)/) {
  394. $TMscore = $1;
  395. $TMid= int(($2*100)+0.5);
  396. }
  397. my $queryLen = $self->_get_queryLength();
  398. my $coverage = int(($template->get_Cols()*100/$queryLen)+0.5);
  399. my $templateName = $template->get_Hit();
  400. $res .= "$query\t$TMid\t$coverage\t$queryLen\t$templateName\t$TMscore\n";
  401. }
  402. open(OH, "> $resultFile") or die "Cant write $resultFile: $!\n";
  403. print (OH $res);
  404. close(OH);
  405. }
  406. 1;