123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775 |
- package PirFile;
- use FastaFile;
- use utilities;
- use PdbFile;
- use strict;
- sub new {
- my ($caller, $filename) = @_;
- my $caller_is_obj = ref($caller);
- my $class = $caller_is_obj || $caller;
- no strict "refs";
- my $self = bless [], $class;
-
- if (defined($filename)) {
- $self->read_from_file("$filename");
- }
- return $self;
- }
- sub read_from_file {
- my ($self, $filename) = @_;
- my @lines;
- open(FH, "< $filename") or die ("Cant open $filename: $!\n");
- @lines = <FH>;
- close(FH);
- my $readSeq = 0;
- my $seq = "";
- my $id;
- my $idxEntry = 0;
- for (my $i=0; $i<@lines; $i++) {
- my $curLine = $lines[$i];
- if ($curLine =~ /^\s*>\S+;(\S+)/) {
- if ($readSeq == 1) {
- $self->[$idxEntry]->{seq} = $seq;
- $idxEntry++;
- }
- $id = $1;
- ## structure or sequence description line
- $i++;
- chomp($lines[$i]);
- $curLine = $lines[$i];
- $self->[$idxEntry] = {id=>$id, descr=>$curLine};
- $readSeq = 1;
- $seq = "";
- next;
- }
- if ($readSeq == 1) {
- chomp($curLine);
- $seq .= $curLine;
- }
- }
- $self->[$idxEntry]->{seq} = $seq;
- }
- sub size {
- my $self = shift;
- scalar(@{$self});
- }
- sub get_seq {
- my ($self, $idx) = @_;
- $self->[$idx]->{seq};
- }
- sub get_id {
- my ($self, $idx) = @_;
- $self->[$idx]->{id};
- }
- sub get_descr {
- my ($self, $idx) = @_;
- $self->[$idx]->{descr};
- }
- sub add_entry {
- my ($self, $id, $descr, $seq) = @_;
-
- my $idx = $self->size();
- $self->[$idx]->{id} = $id;
- $self->[$idx]->{descr} = $descr;
- $self->[$idx]->{seq} = $seq;
- }
- sub to_string {
- my $self = shift;
- my $res = "";
- for (my $i=0; $i<$self->size(); $i++) {
- ## skip gap lines (my be introduced in reduce_to_cores)
- next if ($self->[$i]->{seq} =~ /^-+\*/);
- # my $hit = &get_basename($self->[$i]->{id});
- # my $chain = &get_PDB_chain($hit);
- # my $len = &get_seq_len($self->[$i]->{seq});
- $res .= ">P1;" . $self->[$i]->{id} . "\n";
- $res .= $self->[$i]->{descr} . "\n" if ($self->[$i]->{descr} ne "");
- # $res .= "sequence:$hit: 1: : $len: : : : : \n" if ($self->[$i]->{descr} =~ /\S:\@-\@/ && $i==0);
- # $res .= "structureX:$hit: : $chain: : $chain: dummy : dummy : : \n" if ($self->[$i]->{descr} eq "" && $i>0);
- $res .= $self->[$i]->{seq} . "\n";
- }
- return $res;
- }
- sub print {
- my $self = shift;
- print $self->to_string();
- }
- sub write_to_file {
- my ($self, $outfile) = @_;
- open (OH, "> $outfile") or die "Cant write to $outfile: $!\n";
- print (OH $self->to_string());
- close(OH);
- }
- sub to_fasta {
- my ($self) = @_;
-
- my $fasta = FastaFile->new();
- for (my $i=0; $i<$self->size(); $i++) {
- my $seq = $self->[$i]->{seq};
- ## remove final '*'
- $seq =~ s/\*$//;
- $fasta->add_entry($self->[$i]->{id}, $self->[$i]->{descr}, $seq);
- }
-
- return $fasta;
- }
- ## transform current pir file into a new one (written to outdir/queryId.pir)
- ## in this new one, only alignment positions covered by the first template (the cores)
- ## are used in the remaining templates, i.e. residues uncovered by the first template
- ## but by the 2nd, 3rd,... will be deleted, i.e. replaced by gaps.
- ## In order to provide Modeller with apropriate pdb files for these "new" templates,
- ## new pdb-files are generated and written to outdir
- sub reduce_to_cores {
- my $self = shift;
- my $outdir = shift;
- my $pdbdir = shift;
- my $firstTemplEntry = shift; # which entry in pir file is first (NN-ranked) template
- # this is needed because cores are wrt first template
- my $reducedPirFile = $self;
- my @pattern;
- if (@{$self} < 2) {
- print "ERROR in reduce_to_cores: no template in alignment: $!\n";
- return;
- }
- ## get first template
- print "first entry id=" . $self->[$firstTemplEntry]->{seq} . "\n";
- my $firstTemplate = $self->[$firstTemplEntry]->{seq};
- $firstTemplate =~ s/\*$//;
- my @firstTemp = split(//, $firstTemplate);
- ## create pattern of first template, i.e. 1 if residue, 0 if gap
- for (my $i=0; $i<@firstTemp; $i++) {
- my $pat = 1;
- $pat = 0 if ($firstTemp[$i] eq "-");
- push(@pattern, $pat);
- }
- my %ids;
- ## all other templates are not allowed to cover new residues, i.e.
- ## have pat=1 where pattern==0 (but vice versa is allowed)
- for (my $i=1; $i<@{$self}; $i++) {
- my $tempSeq = $self->[$i]->{seq};
- my $idOrig = $self->[$i]->{id};
- my $idAct = $idOrig;
- if ($i==$firstTemplEntry) {
- ## handle case if firstTemplate is coming up several times, see below
- next;
- }
-
- ## a template can be used several times (ie. occur several times in pir file)
- ## thus the name of the new pdb file has to change by adding an index
- ## because otherwise the file will be overwritten for each same template
- if (exists($ids{$idOrig})) {
- my $occurrence = $ids{$idOrig};
- $idAct = $idOrig . "" . $occurrence;
- $reducedPirFile->[$i]->{id} = $idAct;
- if ($reducedPirFile->[$i]->{descr} =~ /(.+:)$idOrig(:.+)/) {
- $reducedPirFile->[$i]->{descr} = "$1$idAct$2";
- } else {
- print "No match for: " . $reducedPirFile->[$i]->{descr} . "\n";
- }
- $ids{$idOrig}++;
- } else {
- $ids{$idOrig} = 0;
- }
- $tempSeq =~ s/\*$//;
- my @template = split(//, $tempSeq);
- my $tempPattern = "";
- ## test each residue in current template
- for (my $j=0; $j<@template; $j++) {
- if ($template[$j] ne "-" && $pattern[$j]==0) {
- $template[$j] = "-";
- $tempPattern .= "0";
- } elsif ($template[$j] ne "-") {
- $tempPattern .= "1";
- }
- }
- $reducedPirFile->[$i]->{seq} = join("", @template) . "*";
- ## read in original pdb file
- my $templatePdb = PdbFile->new();
- my $pdbFileName = "$pdbdir/$idOrig.pdb";
- $templatePdb->read($pdbFileName);
- ## create new adapted pdb file
- my $newTempPdbFile = "$outdir/$idAct.pdb";
- $tempSeq =~ s/-//g;
- $templatePdb->rebuild_pdb_file($tempSeq, $tempPattern, $newTempPdbFile);
- }
- my $queryId = $reducedPirFile->[0]->{id};
- ## find gaps-only columns (may have been created during core generation)
- my @gapPos = &match_all_positions("-+", $reducedPirFile->[0]->{seq});
- ## check if gap is in first template => gap column, otherwise => insert
- my @gapColPos;
- for (my $i=0; $i<@gapPos; $i++) {
- my $start = $gapPos[$i]->[0];
- my $end = $gapPos[$i]->[1];
- my $rangeInTemp1 = substr($reducedPirFile->[$firstTemplEntry]->{seq}, $start, $end-$start);
- if ($rangeInTemp1 =~ /^-+$/) {
- push(@gapColPos, $gapPos[$i]);
- }
- }
- ## remove gap columns
- for (my $i=0; $i<$reducedPirFile->size(); $i++) {
- $reducedPirFile->[$i]->{seq} = &remove_ranges($reducedPirFile->[$i]->{seq}, @gapColPos);
- }
- $reducedPirFile->write_to_file("$outdir/$queryId.pir");
- }
- ## this is a slight modification of the upper reduce_to_cores function:
- ## the former one reduces all templates (but the first) to the cores defined by the first template
- ## this one reduces only the first template to cores by removing residues given in hash %remmoveResidues
- sub remove_blocks_from_first {
- my $self = shift;
- my $outdir = shift;
- my $pdbdir = shift;
- my $firstTemplEntry = shift; # which entry in pir file is first (NN-ranked) template
- # this is needed because cores are wrt first template
- my $removeResiduesPtr = shift;
- my %removeResidues = %{$removeResiduesPtr};
- my $reducedPirFile = $self;
- my @pattern;
- if (@{$self} < 2) {
- print "ERROR in reduce_to_cores: no template in alignment: $!\n";
- return;
- }
- ## get first template
- print "first entry id=" . $self->[$firstTemplEntry]->{seq} . "\n";
- my $firstTemplate = $self->[$firstTemplEntry]->{seq};
- $firstTemplate =~ s/\*$//;
- my @firstTemp = split(//, $firstTemplate);
- ## create pattern of first template, i.e. 1 if residue, 0 if gap
- my $residueNr = 1;
- my $pattern = ""; ## pattern for building new pdb file (1 if residue, 0 if gap)
- my $newFirstTemplSeq = ""; ## modified first template sequence in pir file (the additional gaps
- for (my $i=0; $i<@firstTemp; $i++) {
- if ($firstTemp[$i] eq "-" ) { $newFirstTemplSeq .= "-"; }
- else {
- if (exists($removeResidues{$residueNr})) {
- $pattern .= "0";
- $newFirstTemplSeq .= "-";
- }
- else {
- $pattern .= "1";
- $newFirstTemplSeq .= $firstTemp[$i];
- }
- $residueNr++;
- }
- }
- my $oldFirstSeq = $reducedPirFile->[$firstTemplEntry]->{seq};
- $reducedPirFile->[$firstTemplEntry]->{seq} = "$newFirstTemplSeq*";
- ## create new pdb file where residues are removed
- ## MODELLER needs this new pdb file
- ## read in original pdb file
- my $firstTemplName = $self->[$firstTemplEntry]->{id};
- my $templatePdb = PdbFile->new();
- my $pdbFileName = "$pdbdir/$firstTemplName.pdb";
- $templatePdb->read($pdbFileName);
-
- my $newTempPdbFile = "$outdir/$firstTemplName.pdb";
- my $tempSeq = $oldFirstSeq;
- $tempSeq =~ s/-//g;
- $tempSeq =~ s/\*$//;
- print "pattern=$pattern\n";
- print " $tempSeq\n";
- $templatePdb->rebuild_pdb_file($tempSeq, $pattern, $newTempPdbFile);
- ## copy remaining pdb files to outdir (needed for MODELLER)
- for (my $i=1; $i<$self->size(); $i++) {
- next if ($i==$firstTemplEntry);
- my $templName = $self->[$i]->{id};
- my $pdbFileName = "$pdbdir/$templName.pdb";
- system("cp $pdbFileName $outdir");
- }
- my $queryId = $reducedPirFile->[0]->{id};
- ## find gaps-only columns (may have been created during core generation)
- my @gapPos = &match_all_positions("-+", $reducedPirFile->[0]->{seq});
- ## check if gap is in all sequences
- my @gapColPos;
- for (my $i=0; $i<@gapPos; $i++) {
- my $start = $gapPos[$i]->[0];
- my $end = $gapPos[$i]->[1];
- my $success = 1;
- for (my $j=0; $j<$reducedPirFile->size(); $j++) {
- my $rangeInTemp1 = substr($reducedPirFile->[$j]->{seq}, $start, $end-$start);
- if ($rangeInTemp1 !~ /^-+$/) {
- $success = 0;
- last;
- }
- }
- if ($success) {
- push(@gapColPos, $gapPos[$i]);
- }
- }
- ## remove gap columns
- for (my $i=0; $i<$reducedPirFile->size(); $i++) {
- $reducedPirFile->[$i]->{seq} = &remove_ranges($reducedPirFile->[$i]->{seq}, @gapColPos);
- }
- $reducedPirFile->write_to_file("$outdir/$queryId.pir");
- ## now, all necessary pdb files and the final pir file should be placed in outdir so that MODELLER can be
- ## started
- }
- ## find displaced gaps in the query, i.e.
- ## find situations where in the query there are two adjacent residues
- ## aligned to two template positions j1 and j2 (j2 > j1+1) and template
- ## distance is > 7A
- sub num_of_displaced_gaps {
- my $self = shift;
- my $pdbdir = shift;
- my $v = 1;
- my @gapPos;
- my $numDisplacedGaps = 0;
- my $qseq = $self->[0]->{seq};
- ## find gaps in query
- @gapPos = &match_all_positions("-+", $qseq);
- if ($v >= 1) {
- print "gaps in query:\n";
- for (my $i=0; $i<@gapPos; $i++) {
- print "$gapPos[$i]->[0] - $gapPos[$i]->[1]\n";
- }
- }
- ## skip front gaps (should not happen)
- if (scalar(@gapPos) > 0 && $gapPos[0]->[0] == 0) {
- shift @gapPos;
- }
- ## check for each template whether j2 > j1+1 and whether distance is > 7A
- for (my $i=1; $i<$self->size(); $i++) {
- my $tseq = $self->[$i]->{seq};
- my $tpdb = PdbFile->new();
- my $tpdbFileName = "$pdbdir/" . $self->[$i]->{id} . ".pdb";
- if ($v >= 1) {
- print "checking template $tpdbFileName\n";
- }
- $tpdb->read($tpdbFileName);
-
- for (my $j=0; $j<@gapPos; $j++) {
- my $start = $gapPos[$j]->[0];
- my $end = $gapPos[$j]->[1];
- ## get subsequence of template where there is a gap in query
- my $excerpt = substr($tseq, $start, $end-$start);
- ## test whether it is a gap => continue
- next if ($excerpt =~ /^-+$/);
- next if (substr($tseq, $start-1, 1) eq "-" || substr($tseq, $end, 1) eq "-");
- ## calculate distance between aligned residues
- # get residues indices in pdb-file of residues right before and after the gap (in query)
- my $tseqWoGaps = $tseq;
- $tseqWoGaps =~ s/-//g;
- $tseqWoGaps =~ s/\*$//;
- my $startIdx = $tpdb->get_startIdx_of_seq($tseqWoGaps);
- print "startIdx=$startIdx\n";
- my $prefix = substr($tseq, 0, $start);
- if ($v >= 1) { print "prefix=$prefix\n"; }
- $prefix =~ s/-//g;
- my $offset = length($prefix);
- my $res1Idx = $startIdx + $offset;
- my $res2Idx = $res1Idx + $end - $start + 1;
- if ($v >= 1) {
- print "resid1=$res1Idx, resid2=$res2Idx\n";
- }
- my $dist = $tpdb->distance_between($res1Idx, $res2Idx);
- if ($v >= 1) { print "dist=$dist\n"; }
- if ($dist >= 7) {
- $numDisplacedGaps++;
- }
- }
- }
- return $numDisplacedGaps;
- }
- ## are characters at pos1 and pos2 aligned in seq1 and seq2, i.e.
- ## are they all residues or gaps
- sub is_aligned {
- my $self = shift;
- my $pos1 = shift;
- my $pos2 = shift;
- my $seq1 = shift;
- my $seq2 = shift;
- my $s1p1 = substr($self->[$seq1]->{seq}, $pos1, 1);
- my $s1p2 = substr($self->[$seq1]->{seq}, $pos2, 1);
- my $s2p1 = substr($self->[$seq2]->{seq}, $pos1, 1);
- my $s2p2 = substr($self->[$seq2]->{seq}, $pos2, 1);
- if ($s1p1 eq '-' || $s1p2 eq '-' || $s2p1 eq '-' || $s2p2 eq '-') {
- return 0;
- }
- return 1;
- }
- sub num_aligned_contacts {
- my $self = shift;
- my $pdbdir = shift;
- my $minSeqDist = shift || 10; # minimum number of "between residues" for a contact
- my $contactThreshold = shift || 8; # max distance between residues to be a contact
- my $aliLen = $self->get_ali_length();
- my $numTemplates = $self->size() -1;
- my $num_contacts = 0;
- my $v = 0;
-
- ## do contact check for pairs Q-T1, Q-T2, ..., T1-T2, T1-T3, ...
- for (my $i=1; $i<$self->size(); $i++) {
- for (my $j=1; $j<$self->size(); $j++) {
- ## read pdb-file
- my $pdbFile = "$pdbdir/" . $self->[$j]->{id} . ".pdb";
- if ($v >= 1) { print "i=$i, j=$j, pdb=" . $self->[$j]->{id} . ".pdb\n"; }
- my $pdb = PdbFile->new();
- $pdb->read($pdbFile);
- my $seqWoGaps = $self->[$j]->{seq};
- $seqWoGaps =~ s/\*$//;
- $seqWoGaps =~ s/-//g;
-
- ## residue-nr of first occurence of seqWoGaps in pdb-file
- my $startIdx = $pdb->get_startIdx_of_seq($seqWoGaps);
- if ($v >= 1) { print "startIdx=$startIdx\n"; }
- ## test residues in pairwise alignment, if they are aligned and have right distance (in sequence)
- ## => calculate distance and see if they are in contact
- for (my $k=0; $k<$aliLen-$minSeqDist; $k++) {
- for (my $l=$k+$minSeqDist+1; $l<$aliLen; $l++) {
- if ($v >= 2) { print "k=$k, l=$l, i-1=" . ($i-1) . ", j=$j\n"; }
- next if (not $self->is_aligned($k, $l, $i-1, $j));
- ## get prefixes of aligned residues
- my $prefix1 = substr($self->[$j]->{seq}, 0, $k+1);
- my $prefix2 = substr($self->[$j]->{seq}, 0, $l);
- ## test whether there are gaps in between, i.e. they are distant enough in sequence
- ## get substring between aligned residues => remove gaps => check length
- my $between = $prefix2;
- $between =~ s/.$//;
- $between =~ s/^$prefix1//;
- $between =~ s/-//g;
- if ($v>=2) { print "between=$between\n"; }
- next if (length($between) < $minSeqDist);
- ## calculate residue numbers in pdb-file => remove gaps from prefixes
- $prefix1 =~ s/-//g;
- $prefix2 =~ s/-//g;
- my $offset1 = length($prefix1);
- my $offset2 = length($prefix2);
- my $res1Idx = $startIdx + $offset1;
- my $res2Idx = $startIdx + $offset2;
- if ($v>=2) { print "res1Idx=$res1Idx, res2Idx=$res2Idx\n"; }
- my $dist = $pdb->distance_between($res1Idx, $res2Idx);
- if ($v>=2) { print "dist=$dist\n"; }
- if ($dist < $contactThreshold) {
- $num_contacts++;
- }
- }
- }
- }
- }
- my $numOfPairs = $numTemplates*($numTemplates-1)/2;
-
- return $num_contacts/$numOfPairs;
- }
- ## number of residues (in query + all templates)
- sub num_residues {
- my $self = shift;
- my $numRes = 0;
- for (my $i=0; $i<$self->size(); $i++) {
- my $seq = $self->[$i]->{seq};
- $seq =~ s/\*$//;
- $seq =~ s/-//g;
- $numRes += length($seq);
- }
- return $numRes;
- }
- sub num_residues_woQuery {
- my $self = shift;
- my $query = $self->[0]->{seq};
- $query =~ s/\*$//;
- $query =~ s/-//g;
- my $resInQuery = length($query);
-
- my $totNumRes = $self->num_residues();
- return $totNumRes - $resInQuery;
- }
- sub get_ali_length {
- my $self = shift;
- return length($self->[0]->{seq})-1;
- }
- sub mean_sd_template_len {
- my $self = shift;
-
- my @lengths;
- for (my $i=1; $i<$self->size(); $i++) {
- }
- }
- sub score_gaps {
- my $self = shift;
- my $gop = shift || 10; # gap open penalty
- my $gep = shift || 1; # gap extension penalty
- my $aliLen = $self->get_ali_length();
-
- my @gaps;
- for (my $i=0; $i<$self->size(); $i++) {
- my @tmp = &match_all_positions("-+", $self->[$i]->{seq});
- push(@gaps, \@tmp);
- }
- my $gapScore = 0;
- for (my $i=0; $i<@gaps; $i++) {
- my @gapsInSeq = @{$gaps[$i]};
- for (my $j=0; $j<@gapsInSeq; $j++) {
- my $start = $gapsInSeq[$j]->[0];
- my $end = $gapsInSeq[$j]->[1];
- next if ($start == 0 || $end == $aliLen); # skip terminal gaps
- $gapScore += $gop + ($end-$start)*$gep;
- }
- }
- return $gapScore;
- }
- sub mean_sd_gap_len {
- my $self = shift;
-
- my @gapLens;
- my $mean = 0;
- my $sd = 0;
- my $aliLen = $self->get_ali_length();
- for (my $i=0; $i<$self->size(); $i++) {
- my @gaps = &match_all_positions("-+", $self->[$i]->{seq});
-
- for (my $j=0; $j<@gaps; $j++) {
- my $start = $gaps[$j]->[0];
- my $end = $gaps[$j]->[1];
- next if ($start == 0 || $end == $aliLen);
- push(@gapLens, $end-$start);
- }
- }
- if (scalar(@gapLens) == 0) { return (0,0); }
- $mean = &mean(@gapLens);
- $sd = &sd(@gapLens);
- return ($mean, $sd);
- }
- ## calculate coverage of query by templates
- ## a residue is covered, if a template residue is aligned to it
- ## gaps in query are ignored (not counted)
- ## "coverage" is the coverage by all templates
- ## "coverageIncrease" is the increase in coverage by the last template in hhmakemodel list (-m option)
- sub coverage {
- my $self = shift;
- my $lastTemplIdx = shift;
- # if coverage of lastly added template is wanted:
- # hhmakemodel option -m 4 3 creates a pir file with templates in order 3 4 (ascending),
- # but last added template is 3 => lastTemplIdx = 1 (i.e. first template in pir file (0 is query))
- my $coverageByLastTemplate = 0;
- my @coverage;
- ## initialization
- for (my $i=0; $i<$self->get_ali_length(); $i++) {
- $coverage[$i] = 0;
- }
- my $query = $self->[0]->{seq};
- $query =~ s/\*$//;
- my @qRes = split(//, $query);
- ## for all but the very last template
- for (my $i=1; $i<$self->size(); $i++) {
- next if ($i == $lastTemplIdx);
- my $tseq = $self->[$i]->{seq};
- $tseq =~ s/\*$//;
- my @tRes = split(//, $tseq);
- for (my $j=0; $j<@tRes; $j++) {
- ## both query and template must not be a gap
- if ($tRes[$j] ne '-' && $qRes[$j] ne '-') {
- $coverage[$j] = 1;
- }
- }
- }
- my $qLen = $query;
- $qLen =~ s/-//g;
- $qLen = length($qLen);
- #print "qLen=$qLen\n";
-
- ## additional coverage of very last template
- my $newlyCovered = 0;
- my $lastTemplate = $self->[$lastTemplIdx]->{seq};
- $lastTemplate =~ s/\*$//;
- my @lastRes = split(//, $lastTemplate);
-
- for (my $i=0; $i<@lastRes; $i++) {
- if ($lastRes[$i] ne '-' && $coverage[$i] != 1 && $qRes[$i] ne '-') {
- $newlyCovered++;
- $coverage[$i] = 1;
- }
- }
- my $coverageIncrease = $newlyCovered/$qLen;
- my $coveredResidues = 0;
-
- ## total coverage
- for (my $i=0; $i<@coverage; $i++) {
- if ($qRes[$i] ne '-' && $coverage[$i] == 1) {
- $coveredResidues++;
- }
- }
- my $coverage = $coveredResidues/$qLen;
- return ($coverage, $coverageIncrease);
- }
- ## calculate number of newly covered residues for each hitNo
- sub allCoverages {
- my $self = shift;
- my @hitNos = @_; # hit numbers in the order given by template selection strategy
- # if coverage of lastly added template is wanted:
- # hhmakemodel option -m 4 3 creates a pir file with templates in order 3 4 (ascending),
- # but last added template is 3 => lastTemplIdx = 1 (i.e. first template in pir file (0 is query))
- my @coverage;
- my %newlyCovered;
- ## initialization
- for (my $i=0; $i<$self->get_ali_length(); $i++) {
- $coverage[$i] = 0;
- }
-
- my $query = $self->[0]->{seq};
- $query =~ s/\*$//;
- my @qRes = split(//, $query);
- ## for all templates
- for (my $i=0; $i<@hitNos; $i++) {
- my $hitNo = $hitNos[$i];
- ## find index in pir file
- my $pirLineIdx = &rankForHitNo($hitNo, @hitNos);
- print "pirLineIdx($hitNo)=$pirLineIdx\n";
- $newlyCovered{$hitNo} = 0;
- ## get sequence
- my $tseq = $self->[$pirLineIdx]->{seq};
- $tseq =~ s/\*$//;
- my @tRes = split(//, $tseq);
- ## check its coverage
- for (my $j=0; $j<@tRes; $j++) {
- ## both query and template must not be a gap
- if ($tRes[$j] ne '-' && $qRes[$j] ne '-') {
- if ($coverage[$j] != 1) {
- $newlyCovered{$hitNo}++;
- }
- $coverage[$j] = 1;
- }
- }
- }
-
- return %newlyCovered;
- }
- sub rankForHitNo {
- my $hitNo = shift;
- my @allHitNos = @_;
-
- my $rank = 1;
- for (my $i=0; $i<@allHitNos; $i++) {
- if ($hitNo > $allHitNos[$i]) {
- $rank++;
- }
- }
- return $rank;
- }
- 1;
|