TemplateListStruct.pm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. #!/user/bin/perl -w
  2. use strict;
  3. package TemplateListStruct;
  4. use config;
  5. use utilities;
  6. use TemplateStruct;
  7. my $config = HHpredConfig->instance();
  8. sub new {
  9. my ($caller, $filename) = @_;
  10. my $caller_is_obj = ref($caller);
  11. my $class = $caller_is_obj || $caller;
  12. no strict "refs";
  13. my $self = bless {}, $class;
  14. $self->{templates} = [];
  15. $self->{queryLength} = -1;
  16. $self->{query} = "";
  17. $self->{neff} = -1;
  18. if ($caller_is_obj) {
  19. my $size = $caller->size();
  20. for (my $i=0; $i<$size; $i++) {
  21. $self->{templates}->[$i] = $caller->{templates}->[$i];
  22. }
  23. $self->{queryLength} = $caller->{queryLength};
  24. $self->{query} = $caller->{caller};
  25. $self->{neff} = $caller->{neff};
  26. }
  27. if (defined($filename)) {
  28. $self->hhr_to_TemplateList($filename);
  29. }
  30. return $self;
  31. }
  32. sub add_template {
  33. my ($self, $template) = @_;
  34. my $curSize = $self->size();
  35. $self->{templates}->[$curSize] = $template;
  36. }
  37. ## before adding template, check whether it is already in list
  38. sub check_and_add {
  39. my ($self, $template) = @_;
  40. for (my $i=0; $i<$self->size(); $i++) {
  41. if ($self->{templates}->[$i]->equals($template)) {
  42. return;
  43. }
  44. }
  45. $self->add_template($template);
  46. }
  47. sub clear {
  48. my $self = shift;
  49. %{$self} = ();
  50. $self->{templates} = [];
  51. $self->{query} = "";
  52. $self->{queryLength} = -1;
  53. $self->{neff} = -1;
  54. }
  55. ## delete hit with No "No"
  56. sub delete_No {
  57. my $self = shift;
  58. my $No = shift;
  59. ## get idx for hit with No "No"
  60. my $deleteIdx = -1;
  61. for (my $i=0; $i<$self->size(); $i++) {
  62. if ($self->{templates}->[$i]->get_No() == $No) {
  63. $deleteIdx = $i;
  64. last;
  65. }
  66. }
  67. print "deleting No=$No, idx=$deleteIdx\n";
  68. if ($deleteIdx != -1) {
  69. splice(@{$self->{templates}}, $deleteIdx, 1);
  70. }
  71. }
  72. sub size {
  73. my $self = shift;
  74. return scalar(@{$self->{templates}});
  75. }
  76. sub get {
  77. my ($self, $i) = @_;
  78. $self->{templates}->[$i];
  79. }
  80. sub get_last {
  81. my $self = shift;
  82. $self->{templates}->[$self->size()-1];
  83. }
  84. sub to_string {
  85. my $self = shift;
  86. my $spacer = shift;
  87. my $out = "";
  88. for (my $i=0; $i<$self->size(); $i++) {
  89. $out .= $self->{templates}->[$i]->to_string($spacer) . "\n";
  90. }
  91. return $out;
  92. }
  93. sub print {
  94. my $self = shift;
  95. my $out = $self->to_string();
  96. print $out;
  97. }
  98. sub to_TemplateList_helper {
  99. my $self = shift;
  100. my $hhrFile = shift;
  101. my @lines = @_;
  102. my $matchC;
  103. my $No;
  104. my $filtnr = "start"; ## filter step (start means no filtering)
  105. my $spaceLen = 12;
  106. if ($hhrFile =~ /\.(\d+)\.hhr/) {
  107. $filtnr = $1;
  108. }
  109. for (my $i=0; $i<@lines; $i++) {
  110. my $line = $lines[$i];
  111. if ($line =~ /^Match_columns\s*(\S+)/) {
  112. $matchC = $1;
  113. $self->_set_queryLength($matchC);
  114. }
  115. if ($line =~ /^Query\s+(\S+)/) {
  116. my $query = $1;
  117. $self->_set_query($query);
  118. }
  119. if ($line =~ /^Neff\s+(\S+)/) {
  120. my $neff = $1;
  121. $self->_set_neff($neff);
  122. }
  123. ## No Hit Prob E-val P-val Score SS Cols Query(start end) Template(start end) HMM
  124. elsif ($line=~/^\s*(\d+)\s+(\S+).+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\S+)\)$/) {
  125. my $No = $1;
  126. my $Hit = $2;
  127. my $Prob = $3;
  128. my $Eval = $4;
  129. my $Pval = $5;
  130. my $Score = $6;
  131. my $SS = $7;
  132. my $Cols = $8;
  133. my $Qstart = $9;
  134. my $Qend = $10;
  135. my $Tstart = $11;
  136. my $Tend = $12;
  137. my $HMM = $13;
  138. my $SSL = $SS/$matchC;
  139. $SSL = sprintf("%.4f", $SSL);
  140. my $template = TemplateStruct->new(Filt => $filtnr,
  141. No => $No,
  142. Hit => $Hit,
  143. Prob => $Prob,
  144. Eval => $Eval,
  145. Pval => $Pval,
  146. Score => $Score,
  147. SS => $SS,
  148. Cols => $Cols,
  149. Qstart => $Qstart,
  150. Qend => $Qend,
  151. Tstart => $Tstart,
  152. Tend => $Tend,
  153. HMM => $HMM);
  154. $self->add_template($template);
  155. }
  156. elsif($line =~ /^No\s+(\d+)/) {
  157. $No = $1;
  158. $line = $lines[++$i];
  159. if ($line !~ /^>(\S+)\s/) {
  160. die("Error:: wrong format in \"$line\"\n");
  161. }
  162. my $hit = $1;
  163. $line = $lines[++$i];
  164. if ($line =~ /COMPACTNESS=(\S+)/i) {
  165. $self->get($No-1)->set_Compactness($1);
  166. } else {
  167. #print "missing compactness in hhr-file!\n";
  168. }
  169. if ($line =~ /CSS=(\S+)/i) {
  170. $self->get($No-1)->set_Css($1);
  171. } else {
  172. #print "missing css in hhr-file!\n";
  173. }
  174. if ($line =~ /CONTACT=(\S+)/i) {
  175. $self->get($No-1)->set_Contact($1);
  176. } else {
  177. #print "missing contact in hhr-file!\n";
  178. }
  179. if ($line =~ /CONTACT_REALIGN=(\S+)/i) {
  180. $self->get($No-1)->set_ContactRealign($1);
  181. } else {
  182. #print "missing contact realign in hhr-file!\n";
  183. }
  184. if ($line !~ /Similarity=(\S+)\s+Sum_probs=(\S+)\s*/) {
  185. die("Error: wrong format in \"$line\"\n");
  186. }
  187. my $Similarity = $1;
  188. my $SumProbL = $2/$matchC;
  189. $SumProbL = sprintf("%.4f" , $SumProbL);
  190. if ($line =~ /Identities=(\S+)%\s/) {
  191. $self->get($No-1)->set_Ident($1);
  192. }
  193. $self->get($No-1)->set_Sim($Similarity);
  194. $self->get($No-1)->set_SumProbL($SumProbL);
  195. }
  196. elsif ($line =~ /^T\s+ss_dssp(\s+)(\S+)/) {
  197. $spaceLen = length($1)-1;
  198. my $ss_dssp = $self->get($No-1)->get_ss_dssp();
  199. $self->get($No-1)->set_ss_dssp("$ss_dssp" . $2);
  200. }
  201. ## Confidence line may contain spaces => read number of spaces from ss_dssp line
  202. elsif ($line =~ /^Confidence\s{$spaceLen}(.*)\n/) {
  203. my $conf = $self->get($No-1)->get_conf();
  204. $self->get($No-1)->set_conf("$conf" . $1);
  205. }
  206. }
  207. }
  208. sub str_to_TemplateList {
  209. my $self = shift;
  210. my $str = shift;
  211. my @lines;
  212. @lines = split(/\n/, $str);
  213. $self->to_TemplateList_helper("dummy", @lines);
  214. }
  215. sub hhr_to_TemplateList {
  216. my ($self, $hhrFile) = @_;
  217. my @lines;
  218. open(HHR,"< $hhrFile") or die("Cant open $hhrFile: $!\n");
  219. @lines = <HHR>;
  220. close(HHR);
  221. $self->to_TemplateList_helper($hhrFile, @lines);
  222. }
  223. sub write_to_file {
  224. my ($self, $outfile) = @_;
  225. open (OH, "> $outfile") or die("Cant write to $outfile: $!\n");
  226. my $out = $self->to_string('===');
  227. print(OH $out);
  228. close(OH);
  229. }
  230. sub read_from_file {
  231. my ($self, $infile) = @_;
  232. my $append = 0;
  233. ## append template(s) to already existing ones
  234. $append = 1 if (scalar(@_) > 2 && $_[2] == 1);
  235. $self->clear() if (! $append);
  236. open(IH, "< $infile") or die("Cant open $infile: $!\n");
  237. while(<IH>) {
  238. chomp;
  239. if (/(\S+===)+/) {
  240. my @entry = split(/===/, $_);
  241. my $template = TemplateStruct->new(Filt => $entry[0],
  242. No => $entry[1],
  243. Hit => $entry[2],
  244. Prob => $entry[3],
  245. Eval => $entry[4],
  246. Pval => $entry[5],
  247. Score => $entry[6],
  248. SS => $entry[7],
  249. Cols => $entry[8],
  250. Qstart => $entry[9],
  251. Qend => $entry[10],
  252. Tstart => $entry[11],
  253. Tend => $entry[12],
  254. HMM => $entry[13],
  255. Ident => $entry[14],
  256. Sim => $entry[15],
  257. SumProbL => $entry[16],
  258. predTM => $entry[17],
  259. Compactness => $entry[18],
  260. Css => $entry[19],
  261. Contact => $entry[20],
  262. ContactRealign => $entry[21]);
  263. $self->add_template($template);
  264. }
  265. }
  266. close(IH);
  267. }
  268. sub set_queryLength {
  269. my ($self, $len) = @_;
  270. $self->{queryLength} = $len;
  271. }
  272. sub get_queryLength {
  273. my $self = shift;
  274. $self->{queryLength};
  275. }
  276. sub set_query {
  277. my ($self, $query) = @_;
  278. $self->{query} = $query;
  279. }
  280. sub get_query {
  281. my $self = shift;
  282. $self->{query};
  283. }
  284. sub get_neff {
  285. my $self = shift;
  286. $self->{neff};
  287. }
  288. sub set_neff {
  289. my ($self, $neff) = @_;
  290. $self->{neff} = $neff;
  291. }
  292. ## for backward compatibility ##
  293. sub _set_queryLength {
  294. my ($self, $len) = @_;
  295. $self->{queryLength} = $len;
  296. }
  297. sub _get_queryLength {
  298. my $self = shift;
  299. $self->{queryLength};
  300. }
  301. sub _set_query {
  302. my ($self, $query) = @_;
  303. $self->{query} = $query;
  304. }
  305. sub _get_query {
  306. my $self = shift;
  307. $self->{query};
  308. }
  309. sub _get_neff {
  310. my $self = shift;
  311. $self->{neff};
  312. }
  313. sub _set_neff {
  314. my ($self, $neff) = @_;
  315. $self->{neff} = $neff;
  316. }
  317. ######
  318. sub sort_by_sim {
  319. my $self = shift;
  320. @{$self->{templates}} = sort {$b->get_Sim() <=> $a->get_Sim()} @{$self->{templates}};
  321. }
  322. sub sort_by_prob {
  323. my $self = shift;
  324. @{$self->{templates}} = sort {$b->get_Prob() <=> $a->get_Prob()} @{$self->{templates}};
  325. }
  326. sub sort_by_sumProbL {
  327. my $self = shift;
  328. @{$self->{templates}} = sort {$b->get_SumProbL() <=> $a->get_SumProbL()} @{$self->{templates}};
  329. }
  330. sub sort_by_predTM {
  331. my $self = shift;
  332. @{$self->{templates}} = sort {$b->get_predTM() <=> $a->get_predTM()} @{$self->{templates}};
  333. }
  334. sub templateList_to_hhr {
  335. my $self = shift;
  336. my $outbase = shift;
  337. my @hhrContent = ();
  338. my $hh = $config->get_hh();
  339. open(HHR, "> $outbase.hhr") or die ("Error in templateList_to_hhr: Cant write $outbase.hhr: $!\n");
  340. for (my $i=0; $i<$self->size(); $i++) {
  341. my $template = $self->get($i);
  342. ## open apropriate hhr file (wrt filter step)
  343. my $infile = "$outbase." . $template->get_Filt() . ".hhr";
  344. open (IN, "< $infile") or die ("Error: cannot open $infile!\n");
  345. my $checkedHeader = 0;
  346. my $begin;
  347. my $e = 0;
  348. my $end;
  349. my $line;
  350. my $hitnr = $i+1;
  351. while ($line = <IN>) {
  352. ## copy first header lines:
  353. if (($checkedHeader==0) && ($i==0) && ($line !~ /^\s*\d+\s+\S+.+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$/)) {
  354. if ($line=~ /^Command/) {
  355. $line=~ s/(^Command\s*)(.*)$/$1$hh\/hhsearch artificial hhr file/;
  356. }
  357. ## replace P-value against TMscore
  358. if ($line=~ /\s+No\s+Hit\s+Prob\s+E-value\s+P-value\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s*/) {
  359. $line =~ s/(\s*No\s+Hit\s+Prob\s+E-value\s+)(P-value)(\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s+)/$1TMScore$3/;
  360. }
  361. print (HHR "$line");
  362. }
  363. else {
  364. $checkedHeader = 1;
  365. }
  366. ## get hit Info:
  367. my $No = $template->get_No();
  368. if ($line =~ /^\s*$No(\s+\S+.+\s+\S+\s+\S+)\s+\S+(\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$)/) {
  369. ## replace P-value by TMScore in hit info
  370. $line = sprintf("%3s$1 %1.4f$2\n", $hitnr, $template->get_predTM());
  371. print (HHR "$line");
  372. last;
  373. }
  374. }
  375. ## skip all lines up to alignment block
  376. ## Find beginning of alignment and replace hit index by new one
  377. while ($line = <IN>){
  378. my $No = $template->get_No();
  379. if ($line =~ /^No\s+$No/) {
  380. last;
  381. }
  382. }
  383. $line =~ s/^No\s+\d+/No $hitnr/;
  384. push(@hhrContent, $line);
  385. ## Push alignment block onto array
  386. while ($line = <IN>) {
  387. if(($line =~ /^No\s/)) {
  388. last;
  389. }
  390. if ($line =~ /Done!/) {}
  391. else {
  392. push(@hhrContent, $line);
  393. }
  394. }
  395. close (IN);
  396. ## create associated tab file
  397. &BuildSingleTabFile("$outbase." . $template->get_Filt() . ".tab", $template->get_No(), $outbase);
  398. }
  399. print(HHR "\n");
  400. print(HHR @hhrContent);
  401. print(HHR "Done!\n");
  402. close (HHR);
  403. }
  404. ## starting from current hhr file, extract some features and save them into resultfile
  405. ## this is needed for benchmark set compilation
  406. sub createBenchmarkInfoFile {
  407. my ($self, $resultFile, $pdbdir) = @_;
  408. my $TMalign = $config->get_TMalign();
  409. my $query = $self->_get_query();
  410. my $queryPDB = "$pdbdir/$query.pdb";
  411. my $res = "";
  412. $res .= "queryName"."\t"."TMID"."\t"."coverage"."\t"."queryLen"."\t"."templateName"."\t"."TMscore\n";
  413. ## extract information from max first 50 templates
  414. for (my $i=0; $i<50 && $i<$self->size(); $i++) {
  415. my $template = $self->get($i);
  416. my $TMscore = 0;
  417. my $TMid = 0;
  418. my $templatePDB = "$pdbdir/" . $template->get_Hit() . ".pdb";
  419. my $tmalignResult = `$TMalign $templatePDB $queryPDB`;
  420. if ($tmalignResult =~ /TM-score\s*=\s*(\S+),\s+ID\s*=\s*(\S+)/) {
  421. $TMscore = $1;
  422. $TMid= int(($2*100)+0.5);
  423. }
  424. my $queryLen = $self->_get_queryLength();
  425. my $coverage = int(($template->get_Cols()*100/$queryLen)+0.5);
  426. my $templateName = $template->get_Hit();
  427. $res .= "$query\t$TMid\t$coverage\t$queryLen\t$templateName\t$TMscore\n";
  428. }
  429. open(OH, "> $resultFile") or die "Cant write $resultFile: $!\n";
  430. print (OH $res);
  431. close(OH);
  432. }
  433. 1;