123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544 |
#!/usr/bin/perl -w
- use strict;
- package TemplateListStruct;
- use config;
- use utilities;
- use TemplateStruct;
- my $config = HHpredConfig->instance();
## Constructor; works as a class method and as an instance method.
##
##   TemplateListStruct->new()          empty list
##   TemplateListStruct->new($hhrFile)  list filled by parsing $hhrFile
##   $list->new()                       copy of $list (same class as $list)
##   $list->new($hhrFile)               copy of $list, then the hits parsed
##                                      from $hhrFile are appended
##
## A copy gets its own template array, but the template objects themselves
## are shared with the original list.
## Returns the new object.
sub new {
    my ($caller, $filename) = @_;
    my $caller_is_obj = ref($caller);
    my $class = $caller_is_obj || $caller;

    my $self = bless {
        templates   => [],
        queryLength => -1,
        query       => "",
        neff        => -1,
    }, $class;

    if ($caller_is_obj) {
        ## copy the array content, not the reference, so that both lists
        ## can be modified independently afterwards
        @{ $self->{templates} } = @{ $caller->{templates} };
        $self->{queryLength} = $caller->{queryLength};
        ## the query name lives under the key "query"
        $self->{query} = $caller->{query};
        $self->{neff} = $caller->{neff};
    }

    if (defined($filename)) {
        $self->hhr_to_TemplateList($filename);
    }

    return $self;
}
## Append $template to the end of the list.
## Returns the template that was stored.
sub add_template {
    my $self = shift;
    my $template = shift;
    push @{ $self->{templates} }, $template;
    return $template;
}
## Add $template unless the list already holds a template for which
## equals($template) is true; in that case the list is left untouched.
sub check_and_add {
    my ($self, $template) = @_;
    foreach my $known (@{ $self->{templates} }) {
        return if $known->equals($template);
    }
    return $self->add_template($template);
}
-
## Reset the object to the state of a freshly constructed, empty list:
## every key is dropped and the four standard fields are re-created.
sub clear {
    my ($self) = @_;
    delete @{$self}{ keys %{$self} };
    @{$self}{qw(templates query queryLength)} = ([], "", -1);
    return $self->{neff} = -1;
}
## Remove the first template whose get_No() is numerically equal to $No.
## A diagnostic line with the hit number and the index that was found
## (-1 when there is no such hit) is printed in either case.
sub delete_No {
    my ($self, $No) = @_;

    ## locate the list index of the hit
    my $deleteIdx = -1;
    foreach my $idx (0 .. $self->size() - 1) {
        next unless $self->{templates}->[$idx]->get_No() == $No;
        $deleteIdx = $idx;
        last;
    }

    print "deleting No=$No, idx=$deleteIdx\n";
    splice(@{ $self->{templates} }, $deleteIdx, 1) if ($deleteIdx != -1);
}
## Number of templates currently stored in the list.
sub size {
    my ($self) = @_;
    my $templates = $self->{templates};
    return scalar @{$templates};
}
## Template stored at list index $i (0-based).
sub get {
    my $self = shift;
    my $i = shift;
    return $self->{templates}[$i];
}
## Template at the end of the list.
sub get_last {
    my ($self) = @_;
    my $last_index = $self->size() - 1;
    return $self->{templates}[$last_index];
}
## Concatenate to_string($spacer) of every template, one template per
## line (each followed by a newline). An empty list gives "".
sub to_string {
    my ($self, $spacer) = @_;
    return join("", map { $_->to_string($spacer) . "\n" } @{ $self->{templates} });
}
## Print the list as rendered by to_string() without a spacer argument.
sub print {
    my ($self) = @_;
    my $text = $self->to_string();
    return print($text);
}
## Parse hhr output that is already split into lines and append one
## TemplateStruct per hit to this list.
##
## Arguments:
##   $hhrFile - name the lines came from; only used to derive the filter
##              step (the digits in ".<digits>.hhr"), otherwise "start"
##   @lines   - hhr content, one line per element, with or without
##              trailing newline
##
## Recognised input:
##   * "Match_columns", "Query" and "Neff" header lines set the list fields
##   * every row of the hit summary table creates a template
##   * every alignment block ("No <n>", ">name", score line) sets Ident, Sim
##     and SumProbL (Sum_probs divided by Match_columns) plus the optional
##     COMPACTNESS/CSS/CONTACT/CONTACT_REALIGN values
##   * "T ss_dssp" and "Confidence" lines are appended to the template of
##     the alignment block they occur in; outside of a block they are ignored
##
## Dies when an alignment block is malformed, when it refers to a hit that
## is not known, or when Match_columns is missing/zero at the time Sum_probs
## has to be normalised.
sub to_TemplateList_helper {
    my $self = shift;
    my $hhrFile = shift;
    my @lines = @_;

    my $matchC;
    my $current;              ## template of the alignment block being read
    my %template_for;         ## hit number => template created by this call
    my $filtnr = "start";     ## filter step (start means no filtering)
    my $spaceLen = 12;

    if ($hhrFile =~ /\.(\d+)\.hhr/) {
        $filtnr = $1;
    }

    ## index loop on purpose: an alignment block consumes two extra lines
    for (my $i=0; $i<@lines; $i++) {
        my $line = $lines[$i];

        if ($line =~ /^Match_columns\s*(\S+)/) {
            $matchC = $1;
            $self->_set_queryLength($matchC);
        }
        if ($line =~ /^Query\s+(\S+)/) {
            my $query = $1;
            $self->_set_query($query);
        }
        if ($line =~ /^Neff\s+(\S+)/) {
            my $neff = $1;
            $self->_set_neff($neff);
        }
        ## No Hit Prob E-val P-val Score SS Cols Query(start end) Template(start end) HMM
        elsif ($line=~/^\s*(\d+)\s+(\S+).+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\S+)\)$/) {
            my $hitNo = $1;
            my $template = TemplateStruct->new(Filt   => $filtnr,
                                               No     => $hitNo,
                                               Hit    => $2,
                                               Prob   => $3,
                                               Eval   => $4,
                                               Pval   => $5,
                                               Score  => $6,
                                               SS     => $7,
                                               Cols   => $8,
                                               Qstart => $9,
                                               Qend   => $10,
                                               Tstart => $11,
                                               Tend   => $12,
                                               HMM    => $13);
            $self->add_template($template);
            ## remember the template by hit number so that the alignment
            ## block finds it even if the list was not empty before parsing
            $template_for{$hitNo} = $template;
        }
        elsif ($line =~ /^No\s+(\d+)/) {
            my $No = $1;

            ## second line of the block: ">name description"; the name may
            ## be the last thing on the line when newlines were stripped
            $line = $lines[++$i];
            $line = "" unless defined($line);
            if ($line !~ /^>\S+(?:\s|$)/) {
                die("Error:: wrong format in \"$line\"\n");
            }

            ## third line of the block: the per-hit scores
            $line = $lines[++$i];
            $line = "" unless defined($line);
            if ($line !~ /Similarity=(\S+)\s+Sum_probs=(\S+)\s*/) {
                die("Error: wrong format in \"$line\"\n");
            }
            my $Similarity = $1;
            my $sumProbs = $2;
            if (!$matchC) {
                die("Error: Match_columns missing or zero, cannot normalise Sum_probs in \"$line\"\n");
            }
            my $SumProbL = sprintf("%.4f", $sumProbs/$matchC);

            ## prefer the template created from the summary table of this
            ## input; fall back to the positional lookup otherwise
            $current = $template_for{$No};
            $current = $self->get($No-1) unless defined($current);
            if (!defined($current)) {
                die("Error: alignment block No $No has no matching template\n");
            }

            ## optional scores, silently skipped when absent
            $current->set_Compactness($1)    if ($line =~ /COMPACTNESS=(\S+)/i);
            $current->set_Css($1)            if ($line =~ /CSS=(\S+)/i);
            $current->set_Contact($1)        if ($line =~ /CONTACT=(\S+)/i);
            $current->set_ContactRealign($1) if ($line =~ /CONTACT_REALIGN=(\S+)/i);

            if ($line =~ /Identities=(\S+)%\s/) {
                $current->set_Ident($1);
            }
            $current->set_Sim($Similarity);
            $current->set_SumProbL($SumProbL);
        }
        elsif ($line =~ /^T\s+ss_dssp(\s+)(\S+)/) {
            $spaceLen = length($1)-1;
            my $segment = $2;
            if (defined($current)) {
                $current->set_ss_dssp($current->get_ss_dssp() . $segment);
            }
        }
        ## Confidence line may contain spaces => read number of spaces from ss_dssp line
        ## ("$" instead of "\n" so that lines without trailing newline match too)
        elsif ($line =~ /^Confidence\s{$spaceLen}(.*)$/) {
            my $segment = $1;
            if (defined($current)) {
                $current->set_conf($current->get_conf() . $segment);
            }
        }
    }
}
## Parse hhr content held in the string $str: the string is split at
## newlines and handed to to_TemplateList_helper under the file name
## "dummy".
sub str_to_TemplateList {
    my ($self, $str) = @_;
    return $self->to_TemplateList_helper("dummy", split(/\n/, $str));
}
## Read the hhr file $hhrFile and append its hits to this list via
## to_TemplateList_helper. Dies when the file cannot be opened.
sub hhr_to_TemplateList {
    my ($self, $hhrFile) = @_;

    ## three-argument open: the file name is taken literally and never
    ## interpreted as an open mode; the handle is private to this call
    open(my $hhr_fh, '<', $hhrFile) or die("Cant open $hhrFile: $!\n");
    my @lines = <$hhr_fh>;
    close($hhr_fh);

    return $self->to_TemplateList_helper($hhrFile, @lines);
}
## Write the list to $outfile, one template per line, fields separated
## by "===" (the format read_from_file understands).
## Dies when the file cannot be opened or the data cannot be written
## completely.
sub write_to_file {
    my ($self, $outfile) = @_;

    my $out = $self->to_string('===');

    ## three-argument open with a lexical handle: the name is taken
    ## literally and the handle cannot collide with other code
    open(my $out_fh, '>', $outfile) or die("Cant write to $outfile: $!\n");
    print {$out_fh} $out;
    ## buffered write errors (e.g. full disk) only show up on close
    close($out_fh) or die("Cant write to $outfile: $!\n");
}
## Read templates from $infile, a file with one template per line and
## the fields separated by "===" (as produced by write_to_file).
##
## Arguments:
##   $infile - file to read
##   third argument (optional) - when it is 1 the templates are appended
##             to the existing ones, otherwise the list is cleared first
##
## Lines that do not contain a "===" separated entry are skipped.
## Dies when the file cannot be opened.
sub read_from_file {
    my ($self, $infile) = @_;

    ## field names in the order they appear on a line
    my @fields = qw(Filt No Hit Prob Eval Pval Score SS Cols
                    Qstart Qend Tstart Tend HMM Ident Sim SumProbL
                    predTM Compactness Css Contact ContactRealign);

    ## append template(s) to already existing ones
    my $append = (scalar(@_) > 2 && defined($_[2]) && $_[2] == 1) ? 1 : 0;
    $self->clear() if (! $append);

    open(my $in_fh, '<', $infile) or die("Cant open $infile: $!\n");
    ## named line variable: the caller's $_ stays untouched
    while (my $entry_line = <$in_fh>) {
        chomp($entry_line);
        next unless ($entry_line =~ /(\S+===)+/);

        ## fields missing at the end of a line are passed as undef
        my @entry = split(/===/, $entry_line);
        my $template = TemplateStruct->new(map { $fields[$_] => $entry[$_] } 0 .. $#fields);
        $self->add_template($template);
    }
    close($in_fh);
}
## Store $len as the length of the query.
sub set_queryLength {
    my $self = shift;
    my $len = shift;
    return $self->{queryLength} = $len;
}
## Stored length of the query.
sub get_queryLength {
    my ($self) = @_;
    return $self->{queryLength};
}
## Store $query as the name of the query.
sub set_query {
    my $self = shift;
    my $query = shift;
    return $self->{query} = $query;
}
## Stored name of the query.
sub get_query {
    my ($self) = @_;
    return $self->{query};
}
## Stored Neff value.
sub get_neff {
    my ($self) = @_;
    return $self->{neff};
}
## Store $neff as the Neff value.
sub set_neff {
    my $self = shift;
    my $neff = shift;
    return $self->{neff} = $neff;
}
- ## for backward compatibility ##
## Old name of set_queryLength: store $len as the length of the query.
sub _set_queryLength {
    my $self = shift;
    my $len = shift;
    return $self->{queryLength} = $len;
}
## Old name of get_queryLength: stored length of the query.
sub _get_queryLength {
    my ($self) = @_;
    return $self->{queryLength};
}
## Old name of set_query: store $query as the name of the query.
sub _set_query {
    my $self = shift;
    my $query = shift;
    return $self->{query} = $query;
}
## Old name of get_query: stored name of the query.
sub _get_query {
    my ($self) = @_;
    return $self->{query};
}
## Old name of get_neff: stored Neff value.
sub _get_neff {
    my ($self) = @_;
    return $self->{neff};
}
## Old name of set_neff: store $neff as the Neff value.
sub _set_neff {
    my $self = shift;
    my $neff = shift;
    return $self->{neff} = $neff;
}
- ######
## Reorder the templates in place by get_Sim(), largest value first.
sub sort_by_sim {
    my ($self) = @_;
    my $templates = $self->{templates};
    @{$templates} = sort { $b->get_Sim() <=> $a->get_Sim() } @{$templates};
}
## Reorder the templates in place by get_Prob(), largest value first.
sub sort_by_prob {
    my ($self) = @_;
    my $templates = $self->{templates};
    @{$templates} = sort { $b->get_Prob() <=> $a->get_Prob() } @{$templates};
}
## Reorder the templates in place by get_SumProbL(), largest value first.
sub sort_by_sumProbL {
    my ($self) = @_;
    my $templates = $self->{templates};
    @{$templates} = sort { $b->get_SumProbL() <=> $a->get_SumProbL() } @{$templates};
}
## Reorder the templates in place by get_predTM(), largest value first.
sub sort_by_predTM {
    my ($self) = @_;
    my $templates = $self->{templates};
    @{$templates} = sort { $b->get_predTM() <=> $a->get_predTM() } @{$templates};
}
## Write an artificial hhr file "$outbase.hhr" that contains the templates
## of this list in list order.
##
## For every template the source file "$outbase.<Filt>.hhr" is read:
##   * for the first template the header lines (everything before the hit
##     table rows) are copied; the Command line is replaced and the
##     "P-value" column title becomes "TMScore"
##   * the template's row of the hit table is copied with the hit number
##     replaced by the position in this list and the P-value replaced by
##     the template's predTM
##   * the template's alignment block is collected (renumbered as well) and
##     written after all hit rows
##   * BuildSingleTabFile is called for the template's tab file
##
## Dies when the output file or one of the source files cannot be opened,
## or when the output cannot be written completely. A source file without
## an alignment block for the template only gives a warning.
sub templateList_to_hhr {
    my $self = shift;
    my $outbase = shift;
    my @hhrContent = ();
    my $hh = $config->get_hh();

    open(my $hhr_fh, '>', "$outbase.hhr") or die ("Error in templateList_to_hhr: Cant write $outbase.hhr: $!\n");

    for (my $i=0; $i<$self->size(); $i++) {
        my $template = $self->get($i);
        my $No = $template->get_No();
        my $hitnr = $i+1;

        ## open apropriate hhr file (wrt filter step)
        my $infile = "$outbase." . $template->get_Filt() . ".hhr";
        open(my $in_fh, '<', $infile) or die ("Error: cannot open $infile!\n");

        my $checkedHeader = 0;
        my $line;

        while ($line = <$in_fh>) {
            ## copy first header lines:
            if (($checkedHeader==0) && ($i==0) && ($line !~ /^\s*\d+\s+\S+.+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$/)) {
                if ($line=~ /^Command/) {
                    $line=~ s/(^Command\s*)(.*)$/$1$hh\/hhsearch artificial hhr file/;
                }
                ## replace P-value against TMscore
                if ($line=~ /\s+No\s+Hit\s+Prob\s+E-value\s+P-value\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s*/) {
                    $line =~ s/(\s*No\s+Hit\s+Prob\s+E-value\s+)(P-value)(\s+Score\s+SS\s+Cols\s+Query\s+HMM\s+Template\s+HMM\s+)/$1TMScore$3/;
                }
                print {$hhr_fh} $line;
            }
            else {
                $checkedHeader = 1;
            }

            ## get hit Info:
            if ($line =~ /^\s*\Q$No\E(\s+\S+.+\s+\S+\s+\S+)\s+\S+(\s+\S+\s+\S+\s+\S+\s+\d+-\d+\s+\S+\s*\(\S+\)$)/) {
                ## replace P-value by TMScore in hit info; the captured
                ## text is passed as data, never as part of the format
                $line = sprintf("%3s%s %1.4f%s\n", $hitnr, $1, $template->get_predTM(), $2);
                print {$hhr_fh} $line;
                last;
            }
        }

        ## skip all lines up to alignment block
        ## Find beginning of alignment and replace hit index by new one
        ## ((?!\d) keeps "No 1" from matching "No 10")
        while ($line = <$in_fh>) {
            if ($line =~ /^No\s+\Q$No\E(?!\d)/) {
                last;
            }
        }

        if (defined($line)) {
            $line =~ s/^No\s+\d+/No $hitnr/;
            push(@hhrContent, $line);

            ## Push alignment block onto array
            while ($line = <$in_fh>) {
                if ($line =~ /^No\s/) {
                    last;
                }
                if ($line !~ /Done!/) {
                    push(@hhrContent, $line);
                }
            }
        }
        else {
            warn("Warning: no alignment block for hit No $No in $infile\n");
        }
        close($in_fh);

        ## create associated tab file
        &BuildSingleTabFile("$outbase." . $template->get_Filt() . ".tab", $template->get_No(), $outbase);
    }

    print {$hhr_fh} "\n";
    print {$hhr_fh} @hhrContent;
    print {$hhr_fh} "Done!\n";
    ## buffered write errors only show up on close
    close($hhr_fh) or die ("Error in templateList_to_hhr: Cant write $outbase.hhr: $!\n");
}
## starting from current hhr file, extract some features and save them into resultfile
## this is needed for benchmark set compilation
##
## Arguments:
##   $resultFile - tab separated output file (header line plus one line
##                 per template)
##   $pdbdir     - directory holding "<query>.pdb" and "<hit>.pdb"
##
## For at most the first 50 templates the configured TMalign program is run
## on template and query structure; TM-score and sequence identity (in
## percent, rounded) are taken from its output and stay 0 when the output
## cannot be parsed. Coverage is Cols in percent of the query length,
## rounded, and 0 when the query length is not known.
## Dies when the result file cannot be written.
sub createBenchmarkInfoFile {
    my ($self, $resultFile, $pdbdir) = @_;
    my $TMalign = $config->get_TMalign();

    my $query = $self->_get_query();
    my $queryPDB = "$pdbdir/$query.pdb";
    my $queryLen = $self->_get_queryLength();

    my $res = "";
    $res .= "queryName"."\t"."TMID"."\t"."coverage"."\t"."queryLen"."\t"."templateName"."\t"."TMscore\n";

    ## extract information from max first 50 templates
    for (my $i=0; $i<50 && $i<$self->size(); $i++) {
        my $template = $self->get($i);
        my $templateName = $template->get_Hit();
        my $TMscore = 0;
        my $TMid = 0;
        my $templatePDB = "$pdbdir/$templateName.pdb";

        ## NOTE(review): the command line goes through the shell, so the
        ## paths must not contain shell metacharacters
        my $tmalignResult = `$TMalign $templatePDB $queryPDB`;
        ## the result is undef when the program could not be started
        if (defined($tmalignResult) && $tmalignResult =~ /TM-score\s*=\s*(\S+),\s+ID\s*=\s*(\S+)/) {
            $TMscore = $1;
            $TMid= int(($2*100)+0.5);
        }

        ## the query length is -1 while unknown; avoid a division by zero
        ## and meaningless negative percentages
        my $coverage = 0;
        if ($queryLen > 0) {
            $coverage = int(($template->get_Cols()*100/$queryLen)+0.5);
        }

        $res .= "$query\t$TMid\t$coverage\t$queryLen\t$templateName\t$TMscore\n";
    }

    open(my $out_fh, '>', $resultFile) or die "Cant write $resultFile: $!\n";
    print {$out_fh} $res;
    ## buffered write errors only show up on close
    close($out_fh) or die "Cant write $resultFile: $!\n";
}
- 1;
|