PirFile.pm 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775
  1. package PirFile;
  2. use FastaFile;
  3. use utilities;
  4. use PdbFile;
  5. use strict;
  6. sub new {
  7. my ($caller, $filename) = @_;
  8. my $caller_is_obj = ref($caller);
  9. my $class = $caller_is_obj || $caller;
  10. no strict "refs";
  11. my $self = bless [], $class;
  12. if (defined($filename)) {
  13. $self->read_from_file("$filename");
  14. }
  15. return $self;
  16. }
  17. sub read_from_file {
  18. my ($self, $filename) = @_;
  19. my @lines;
  20. open(FH, "< $filename") or die ("Cant open $filename: $!\n");
  21. @lines = <FH>;
  22. close(FH);
  23. my $readSeq = 0;
  24. my $seq = "";
  25. my $id;
  26. my $idxEntry = 0;
  27. for (my $i=0; $i<@lines; $i++) {
  28. my $curLine = $lines[$i];
  29. if ($curLine =~ /^\s*>\S+;(\S+)/) {
  30. if ($readSeq == 1) {
  31. $self->[$idxEntry]->{seq} = $seq;
  32. $idxEntry++;
  33. }
  34. $id = $1;
  35. ## structure or sequence description line
  36. $i++;
  37. chomp($lines[$i]);
  38. $curLine = $lines[$i];
  39. $self->[$idxEntry] = {id=>$id, descr=>$curLine};
  40. $readSeq = 1;
  41. $seq = "";
  42. next;
  43. }
  44. if ($readSeq == 1) {
  45. chomp($curLine);
  46. $seq .= $curLine;
  47. }
  48. }
  49. $self->[$idxEntry]->{seq} = $seq;
  50. }
  51. sub size {
  52. my $self = shift;
  53. scalar(@{$self});
  54. }
  55. sub get_seq {
  56. my ($self, $idx) = @_;
  57. $self->[$idx]->{seq};
  58. }
  59. sub get_id {
  60. my ($self, $idx) = @_;
  61. $self->[$idx]->{id};
  62. }
  63. sub get_descr {
  64. my ($self, $idx) = @_;
  65. $self->[$idx]->{descr};
  66. }
  67. sub add_entry {
  68. my ($self, $id, $descr, $seq) = @_;
  69. my $idx = $self->size();
  70. $self->[$idx]->{id} = $id;
  71. $self->[$idx]->{descr} = $descr;
  72. $self->[$idx]->{seq} = $seq;
  73. }
  74. sub to_string {
  75. my $self = shift;
  76. my $res = "";
  77. for (my $i=0; $i<$self->size(); $i++) {
  78. ## skip gap lines (my be introduced in reduce_to_cores)
  79. next if ($self->[$i]->{seq} =~ /^-+\*/);
  80. # my $hit = &get_basename($self->[$i]->{id});
  81. # my $chain = &get_PDB_chain($hit);
  82. # my $len = &get_seq_len($self->[$i]->{seq});
  83. $res .= ">P1;" . $self->[$i]->{id} . "\n";
  84. $res .= $self->[$i]->{descr} . "\n" if ($self->[$i]->{descr} ne "");
  85. # $res .= "sequence:$hit: 1: : $len: : : : : \n" if ($self->[$i]->{descr} =~ /\S:\@-\@/ && $i==0);
  86. # $res .= "structureX:$hit: : $chain: : $chain: dummy : dummy : : \n" if ($self->[$i]->{descr} eq "" && $i>0);
  87. $res .= $self->[$i]->{seq} . "\n";
  88. }
  89. return $res;
  90. }
  91. sub print {
  92. my $self = shift;
  93. print $self->to_string();
  94. }
  95. sub write_to_file {
  96. my ($self, $outfile) = @_;
  97. open (OH, "> $outfile") or die "Cant write to $outfile: $!\n";
  98. print (OH $self->to_string());
  99. close(OH);
  100. }
  101. sub to_fasta {
  102. my ($self) = @_;
  103. my $fasta = FastaFile->new();
  104. for (my $i=0; $i<$self->size(); $i++) {
  105. my $seq = $self->[$i]->{seq};
  106. ## remove final '*'
  107. $seq =~ s/\*$//;
  108. $fasta->add_entry($self->[$i]->{id}, $self->[$i]->{descr}, $seq);
  109. }
  110. return $fasta;
  111. }
  112. ## transform current pir file into a new one (written to outdir/queryId.pir)
  113. ## in this new one, only alignment positions covered by the first template (the cores)
  114. ## are used in the remaining templates, i.e. residues uncovered by the first template
  115. ## but by the 2nd, 3rd,... will be deleted, i.e. replaced by gaps.
  116. ## In order to provide Modeller with apropriate pdb files for these "new" templates,
  117. ## new pdb-files are generated and written to outdir
  118. sub reduce_to_cores {
  119. my $self = shift;
  120. my $outdir = shift;
  121. my $pdbdir = shift;
  122. my $firstTemplEntry = shift; # which entry in pir file is first (NN-ranked) template
  123. # this is needed because cores are wrt first template
  124. my $reducedPirFile = $self;
  125. my @pattern;
  126. if (@{$self} < 2) {
  127. print "ERROR in reduce_to_cores: no template in alignment: $!\n";
  128. return;
  129. }
  130. ## get first template
  131. print "first entry id=" . $self->[$firstTemplEntry]->{seq} . "\n";
  132. my $firstTemplate = $self->[$firstTemplEntry]->{seq};
  133. $firstTemplate =~ s/\*$//;
  134. my @firstTemp = split(//, $firstTemplate);
  135. ## create pattern of first template, i.e. 1 if residue, 0 if gap
  136. for (my $i=0; $i<@firstTemp; $i++) {
  137. my $pat = 1;
  138. $pat = 0 if ($firstTemp[$i] eq "-");
  139. push(@pattern, $pat);
  140. }
  141. my %ids;
  142. ## all other templates are not allowed to cover new residues, i.e.
  143. ## have pat=1 where pattern==0 (but vice versa is allowed)
  144. for (my $i=1; $i<@{$self}; $i++) {
  145. my $tempSeq = $self->[$i]->{seq};
  146. my $idOrig = $self->[$i]->{id};
  147. my $idAct = $idOrig;
  148. if ($i==$firstTemplEntry) {
  149. ## handle case if firstTemplate is coming up several times, see below
  150. next;
  151. }
  152. ## a template can be used several times (ie. occur several times in pir file)
  153. ## thus the name of the new pdb file has to change by adding an index
  154. ## because otherwise the file will be overwritten for each same template
  155. if (exists($ids{$idOrig})) {
  156. my $occurrence = $ids{$idOrig};
  157. $idAct = $idOrig . "" . $occurrence;
  158. $reducedPirFile->[$i]->{id} = $idAct;
  159. if ($reducedPirFile->[$i]->{descr} =~ /(.+:)$idOrig(:.+)/) {
  160. $reducedPirFile->[$i]->{descr} = "$1$idAct$2";
  161. } else {
  162. print "No match for: " . $reducedPirFile->[$i]->{descr} . "\n";
  163. }
  164. $ids{$idOrig}++;
  165. } else {
  166. $ids{$idOrig} = 0;
  167. }
  168. $tempSeq =~ s/\*$//;
  169. my @template = split(//, $tempSeq);
  170. my $tempPattern = "";
  171. ## test each residue in current template
  172. for (my $j=0; $j<@template; $j++) {
  173. if ($template[$j] ne "-" && $pattern[$j]==0) {
  174. $template[$j] = "-";
  175. $tempPattern .= "0";
  176. } elsif ($template[$j] ne "-") {
  177. $tempPattern .= "1";
  178. }
  179. }
  180. $reducedPirFile->[$i]->{seq} = join("", @template) . "*";
  181. ## read in original pdb file
  182. my $templatePdb = PdbFile->new();
  183. my $pdbFileName = "$pdbdir/$idOrig.pdb";
  184. $templatePdb->read($pdbFileName);
  185. ## create new adapted pdb file
  186. my $newTempPdbFile = "$outdir/$idAct.pdb";
  187. $tempSeq =~ s/-//g;
  188. $templatePdb->rebuild_pdb_file($tempSeq, $tempPattern, $newTempPdbFile);
  189. }
  190. my $queryId = $reducedPirFile->[0]->{id};
  191. ## find gaps-only columns (may have been created during core generation)
  192. my @gapPos = &match_all_positions("-+", $reducedPirFile->[0]->{seq});
  193. ## check if gap is in first template => gap column, otherwise => insert
  194. my @gapColPos;
  195. for (my $i=0; $i<@gapPos; $i++) {
  196. my $start = $gapPos[$i]->[0];
  197. my $end = $gapPos[$i]->[1];
  198. my $rangeInTemp1 = substr($reducedPirFile->[$firstTemplEntry]->{seq}, $start, $end-$start);
  199. if ($rangeInTemp1 =~ /^-+$/) {
  200. push(@gapColPos, $gapPos[$i]);
  201. }
  202. }
  203. ## remove gap columns
  204. for (my $i=0; $i<$reducedPirFile->size(); $i++) {
  205. $reducedPirFile->[$i]->{seq} = &remove_ranges($reducedPirFile->[$i]->{seq}, @gapColPos);
  206. }
  207. $reducedPirFile->write_to_file("$outdir/$queryId.pir");
  208. }
  209. ## this is a slight modification of the upper reduce_to_cores function:
  210. ## the former one reduces all templates (but the first) to the cores defined by the first template
  211. ## this one reduces only the first template to cores by removing residues given in hash %remmoveResidues
  212. sub remove_blocks_from_first {
  213. my $self = shift;
  214. my $outdir = shift;
  215. my $pdbdir = shift;
  216. my $firstTemplEntry = shift; # which entry in pir file is first (NN-ranked) template
  217. # this is needed because cores are wrt first template
  218. my $removeResiduesPtr = shift;
  219. my %removeResidues = %{$removeResiduesPtr};
  220. my $reducedPirFile = $self;
  221. my @pattern;
  222. if (@{$self} < 2) {
  223. print "ERROR in reduce_to_cores: no template in alignment: $!\n";
  224. return;
  225. }
  226. ## get first template
  227. print "first entry id=" . $self->[$firstTemplEntry]->{seq} . "\n";
  228. my $firstTemplate = $self->[$firstTemplEntry]->{seq};
  229. $firstTemplate =~ s/\*$//;
  230. my @firstTemp = split(//, $firstTemplate);
  231. ## create pattern of first template, i.e. 1 if residue, 0 if gap
  232. my $residueNr = 1;
  233. my $pattern = ""; ## pattern for building new pdb file (1 if residue, 0 if gap)
  234. my $newFirstTemplSeq = ""; ## modified first template sequence in pir file (the additional gaps
  235. for (my $i=0; $i<@firstTemp; $i++) {
  236. if ($firstTemp[$i] eq "-" ) { $newFirstTemplSeq .= "-"; }
  237. else {
  238. if (exists($removeResidues{$residueNr})) {
  239. $pattern .= "0";
  240. $newFirstTemplSeq .= "-";
  241. }
  242. else {
  243. $pattern .= "1";
  244. $newFirstTemplSeq .= $firstTemp[$i];
  245. }
  246. $residueNr++;
  247. }
  248. }
  249. my $oldFirstSeq = $reducedPirFile->[$firstTemplEntry]->{seq};
  250. $reducedPirFile->[$firstTemplEntry]->{seq} = "$newFirstTemplSeq*";
  251. ## create new pdb file where residues are removed
  252. ## MODELLER needs this new pdb file
  253. ## read in original pdb file
  254. my $firstTemplName = $self->[$firstTemplEntry]->{id};
  255. my $templatePdb = PdbFile->new();
  256. my $pdbFileName = "$pdbdir/$firstTemplName.pdb";
  257. $templatePdb->read($pdbFileName);
  258. my $newTempPdbFile = "$outdir/$firstTemplName.pdb";
  259. my $tempSeq = $oldFirstSeq;
  260. $tempSeq =~ s/-//g;
  261. $tempSeq =~ s/\*$//;
  262. print "pattern=$pattern\n";
  263. print " $tempSeq\n";
  264. $templatePdb->rebuild_pdb_file($tempSeq, $pattern, $newTempPdbFile);
  265. ## copy remaining pdb files to outdir (needed for MODELLER)
  266. for (my $i=1; $i<$self->size(); $i++) {
  267. next if ($i==$firstTemplEntry);
  268. my $templName = $self->[$i]->{id};
  269. my $pdbFileName = "$pdbdir/$templName.pdb";
  270. system("cp $pdbFileName $outdir");
  271. }
  272. my $queryId = $reducedPirFile->[0]->{id};
  273. ## find gaps-only columns (may have been created during core generation)
  274. my @gapPos = &match_all_positions("-+", $reducedPirFile->[0]->{seq});
  275. ## check if gap is in all sequences
  276. my @gapColPos;
  277. for (my $i=0; $i<@gapPos; $i++) {
  278. my $start = $gapPos[$i]->[0];
  279. my $end = $gapPos[$i]->[1];
  280. my $success = 1;
  281. for (my $j=0; $j<$reducedPirFile->size(); $j++) {
  282. my $rangeInTemp1 = substr($reducedPirFile->[$j]->{seq}, $start, $end-$start);
  283. if ($rangeInTemp1 !~ /^-+$/) {
  284. $success = 0;
  285. last;
  286. }
  287. }
  288. if ($success) {
  289. push(@gapColPos, $gapPos[$i]);
  290. }
  291. }
  292. ## remove gap columns
  293. for (my $i=0; $i<$reducedPirFile->size(); $i++) {
  294. $reducedPirFile->[$i]->{seq} = &remove_ranges($reducedPirFile->[$i]->{seq}, @gapColPos);
  295. }
  296. $reducedPirFile->write_to_file("$outdir/$queryId.pir");
  297. ## now, all necessary pdb files and the final pir file should be placed in outdir so that MODELLER can be
  298. ## started
  299. }
  300. ## find displaced gaps in the query, i.e.
  301. ## find situations where in the query there are two adjacent residues
  302. ## aligned to two template positions j1 and j2 (j2 > j1+1) and template
  303. ## distance is > 7A
  304. sub num_of_displaced_gaps {
  305. my $self = shift;
  306. my $pdbdir = shift;
  307. my $v = 1;
  308. my @gapPos;
  309. my $numDisplacedGaps = 0;
  310. my $qseq = $self->[0]->{seq};
  311. ## find gaps in query
  312. @gapPos = &match_all_positions("-+", $qseq);
  313. if ($v >= 1) {
  314. print "gaps in query:\n";
  315. for (my $i=0; $i<@gapPos; $i++) {
  316. print "$gapPos[$i]->[0] - $gapPos[$i]->[1]\n";
  317. }
  318. }
  319. ## skip front gaps (should not happen)
  320. if (scalar(@gapPos) > 0 && $gapPos[0]->[0] == 0) {
  321. shift @gapPos;
  322. }
  323. ## check for each template whether j2 > j1+1 and whether distance is > 7A
  324. for (my $i=1; $i<$self->size(); $i++) {
  325. my $tseq = $self->[$i]->{seq};
  326. my $tpdb = PdbFile->new();
  327. my $tpdbFileName = "$pdbdir/" . $self->[$i]->{id} . ".pdb";
  328. if ($v >= 1) {
  329. print "checking template $tpdbFileName\n";
  330. }
  331. $tpdb->read($tpdbFileName);
  332. for (my $j=0; $j<@gapPos; $j++) {
  333. my $start = $gapPos[$j]->[0];
  334. my $end = $gapPos[$j]->[1];
  335. ## get subsequence of template where there is a gap in query
  336. my $excerpt = substr($tseq, $start, $end-$start);
  337. ## test whether it is a gap => continue
  338. next if ($excerpt =~ /^-+$/);
  339. next if (substr($tseq, $start-1, 1) eq "-" || substr($tseq, $end, 1) eq "-");
  340. ## calculate distance between aligned residues
  341. # get residues indices in pdb-file of residues right before and after the gap (in query)
  342. my $tseqWoGaps = $tseq;
  343. $tseqWoGaps =~ s/-//g;
  344. $tseqWoGaps =~ s/\*$//;
  345. my $startIdx = $tpdb->get_startIdx_of_seq($tseqWoGaps);
  346. print "startIdx=$startIdx\n";
  347. my $prefix = substr($tseq, 0, $start);
  348. if ($v >= 1) { print "prefix=$prefix\n"; }
  349. $prefix =~ s/-//g;
  350. my $offset = length($prefix);
  351. my $res1Idx = $startIdx + $offset;
  352. my $res2Idx = $res1Idx + $end - $start + 1;
  353. if ($v >= 1) {
  354. print "resid1=$res1Idx, resid2=$res2Idx\n";
  355. }
  356. my $dist = $tpdb->distance_between($res1Idx, $res2Idx);
  357. if ($v >= 1) { print "dist=$dist\n"; }
  358. if ($dist >= 7) {
  359. $numDisplacedGaps++;
  360. }
  361. }
  362. }
  363. return $numDisplacedGaps;
  364. }
  365. ## are characters at pos1 and pos2 aligned in seq1 and seq2, i.e.
  366. ## are they all residues or gaps
  367. sub is_aligned {
  368. my $self = shift;
  369. my $pos1 = shift;
  370. my $pos2 = shift;
  371. my $seq1 = shift;
  372. my $seq2 = shift;
  373. my $s1p1 = substr($self->[$seq1]->{seq}, $pos1, 1);
  374. my $s1p2 = substr($self->[$seq1]->{seq}, $pos2, 1);
  375. my $s2p1 = substr($self->[$seq2]->{seq}, $pos1, 1);
  376. my $s2p2 = substr($self->[$seq2]->{seq}, $pos2, 1);
  377. if ($s1p1 eq '-' || $s1p2 eq '-' || $s2p1 eq '-' || $s2p2 eq '-') {
  378. return 0;
  379. }
  380. return 1;
  381. }
  382. sub num_aligned_contacts {
  383. my $self = shift;
  384. my $pdbdir = shift;
  385. my $minSeqDist = shift || 10; # minimum number of "between residues" for a contact
  386. my $contactThreshold = shift || 8; # max distance between residues to be a contact
  387. my $aliLen = $self->get_ali_length();
  388. my $numTemplates = $self->size() -1;
  389. my $num_contacts = 0;
  390. my $v = 0;
  391. ## do contact check for pairs Q-T1, Q-T2, ..., T1-T2, T1-T3, ...
  392. for (my $i=1; $i<$self->size(); $i++) {
  393. for (my $j=1; $j<$self->size(); $j++) {
  394. ## read pdb-file
  395. my $pdbFile = "$pdbdir/" . $self->[$j]->{id} . ".pdb";
  396. if ($v >= 1) { print "i=$i, j=$j, pdb=" . $self->[$j]->{id} . ".pdb\n"; }
  397. my $pdb = PdbFile->new();
  398. $pdb->read($pdbFile);
  399. my $seqWoGaps = $self->[$j]->{seq};
  400. $seqWoGaps =~ s/\*$//;
  401. $seqWoGaps =~ s/-//g;
  402. ## residue-nr of first occurence of seqWoGaps in pdb-file
  403. my $startIdx = $pdb->get_startIdx_of_seq($seqWoGaps);
  404. if ($v >= 1) { print "startIdx=$startIdx\n"; }
  405. ## test residues in pairwise alignment, if they are aligned and have right distance (in sequence)
  406. ## => calculate distance and see if they are in contact
  407. for (my $k=0; $k<$aliLen-$minSeqDist; $k++) {
  408. for (my $l=$k+$minSeqDist+1; $l<$aliLen; $l++) {
  409. if ($v >= 2) { print "k=$k, l=$l, i-1=" . ($i-1) . ", j=$j\n"; }
  410. next if (not $self->is_aligned($k, $l, $i-1, $j));
  411. ## get prefixes of aligned residues
  412. my $prefix1 = substr($self->[$j]->{seq}, 0, $k+1);
  413. my $prefix2 = substr($self->[$j]->{seq}, 0, $l);
  414. ## test whether there are gaps in between, i.e. they are distant enough in sequence
  415. ## get substring between aligned residues => remove gaps => check length
  416. my $between = $prefix2;
  417. $between =~ s/.$//;
  418. $between =~ s/^$prefix1//;
  419. $between =~ s/-//g;
  420. if ($v>=2) { print "between=$between\n"; }
  421. next if (length($between) < $minSeqDist);
  422. ## calculate residue numbers in pdb-file => remove gaps from prefixes
  423. $prefix1 =~ s/-//g;
  424. $prefix2 =~ s/-//g;
  425. my $offset1 = length($prefix1);
  426. my $offset2 = length($prefix2);
  427. my $res1Idx = $startIdx + $offset1;
  428. my $res2Idx = $startIdx + $offset2;
  429. if ($v>=2) { print "res1Idx=$res1Idx, res2Idx=$res2Idx\n"; }
  430. my $dist = $pdb->distance_between($res1Idx, $res2Idx);
  431. if ($v>=2) { print "dist=$dist\n"; }
  432. if ($dist < $contactThreshold) {
  433. $num_contacts++;
  434. }
  435. }
  436. }
  437. }
  438. }
  439. my $numOfPairs = $numTemplates*($numTemplates-1)/2;
  440. return $num_contacts/$numOfPairs;
  441. }
  442. ## number of residues (in query + all templates)
  443. sub num_residues {
  444. my $self = shift;
  445. my $numRes = 0;
  446. for (my $i=0; $i<$self->size(); $i++) {
  447. my $seq = $self->[$i]->{seq};
  448. $seq =~ s/\*$//;
  449. $seq =~ s/-//g;
  450. $numRes += length($seq);
  451. }
  452. return $numRes;
  453. }
  454. sub num_residues_woQuery {
  455. my $self = shift;
  456. my $query = $self->[0]->{seq};
  457. $query =~ s/\*$//;
  458. $query =~ s/-//g;
  459. my $resInQuery = length($query);
  460. my $totNumRes = $self->num_residues();
  461. return $totNumRes - $resInQuery;
  462. }
  463. sub get_ali_length {
  464. my $self = shift;
  465. return length($self->[0]->{seq})-1;
  466. }
  467. sub mean_sd_template_len {
  468. my $self = shift;
  469. my @lengths;
  470. for (my $i=1; $i<$self->size(); $i++) {
  471. }
  472. }
  473. sub score_gaps {
  474. my $self = shift;
  475. my $gop = shift || 10; # gap open penalty
  476. my $gep = shift || 1; # gap extension penalty
  477. my $aliLen = $self->get_ali_length();
  478. my @gaps;
  479. for (my $i=0; $i<$self->size(); $i++) {
  480. my @tmp = &match_all_positions("-+", $self->[$i]->{seq});
  481. push(@gaps, \@tmp);
  482. }
  483. my $gapScore = 0;
  484. for (my $i=0; $i<@gaps; $i++) {
  485. my @gapsInSeq = @{$gaps[$i]};
  486. for (my $j=0; $j<@gapsInSeq; $j++) {
  487. my $start = $gapsInSeq[$j]->[0];
  488. my $end = $gapsInSeq[$j]->[1];
  489. next if ($start == 0 || $end == $aliLen); # skip terminal gaps
  490. $gapScore += $gop + ($end-$start)*$gep;
  491. }
  492. }
  493. return $gapScore;
  494. }
  495. sub mean_sd_gap_len {
  496. my $self = shift;
  497. my @gapLens;
  498. my $mean = 0;
  499. my $sd = 0;
  500. my $aliLen = $self->get_ali_length();
  501. for (my $i=0; $i<$self->size(); $i++) {
  502. my @gaps = &match_all_positions("-+", $self->[$i]->{seq});
  503. for (my $j=0; $j<@gaps; $j++) {
  504. my $start = $gaps[$j]->[0];
  505. my $end = $gaps[$j]->[1];
  506. next if ($start == 0 || $end == $aliLen);
  507. push(@gapLens, $end-$start);
  508. }
  509. }
  510. if (scalar(@gapLens) == 0) { return (0,0); }
  511. $mean = &mean(@gapLens);
  512. $sd = &sd(@gapLens);
  513. return ($mean, $sd);
  514. }
  515. ## calculate coverage of query by templates
  516. ## a residue is covered, if a template residue is aligned to it
  517. ## gaps in query are ignored (not counted)
  518. ## "coverage" is the coverage by all templates
  519. ## "coverageIncrease" is the increase in coverage by the last template in hhmakemodel list (-m option)
  520. sub coverage {
  521. my $self = shift;
  522. my $lastTemplIdx = shift;
  523. # if coverage of lastly added template is wanted:
  524. # hhmakemodel option -m 4 3 creates a pir file with templates in order 3 4 (ascending),
  525. # but last added template is 3 => lastTemplIdx = 1 (i.e. first template in pir file (0 is query))
  526. my $coverageByLastTemplate = 0;
  527. my @coverage;
  528. ## initialization
  529. for (my $i=0; $i<$self->get_ali_length(); $i++) {
  530. $coverage[$i] = 0;
  531. }
  532. my $query = $self->[0]->{seq};
  533. $query =~ s/\*$//;
  534. my @qRes = split(//, $query);
  535. ## for all but the very last template
  536. for (my $i=1; $i<$self->size(); $i++) {
  537. next if ($i == $lastTemplIdx);
  538. my $tseq = $self->[$i]->{seq};
  539. $tseq =~ s/\*$//;
  540. my @tRes = split(//, $tseq);
  541. for (my $j=0; $j<@tRes; $j++) {
  542. ## both query and template must not be a gap
  543. if ($tRes[$j] ne '-' && $qRes[$j] ne '-') {
  544. $coverage[$j] = 1;
  545. }
  546. }
  547. }
  548. my $qLen = $query;
  549. $qLen =~ s/-//g;
  550. $qLen = length($qLen);
  551. #print "qLen=$qLen\n";
  552. ## additional coverage of very last template
  553. my $newlyCovered = 0;
  554. my $lastTemplate = $self->[$lastTemplIdx]->{seq};
  555. $lastTemplate =~ s/\*$//;
  556. my @lastRes = split(//, $lastTemplate);
  557. for (my $i=0; $i<@lastRes; $i++) {
  558. if ($lastRes[$i] ne '-' && $coverage[$i] != 1 && $qRes[$i] ne '-') {
  559. $newlyCovered++;
  560. $coverage[$i] = 1;
  561. }
  562. }
  563. my $coverageIncrease = $newlyCovered/$qLen;
  564. my $coveredResidues = 0;
  565. ## total coverage
  566. for (my $i=0; $i<@coverage; $i++) {
  567. if ($qRes[$i] ne '-' && $coverage[$i] == 1) {
  568. $coveredResidues++;
  569. }
  570. }
  571. my $coverage = $coveredResidues/$qLen;
  572. return ($coverage, $coverageIncrease);
  573. }
  574. ## calculate number of newly covered residues for each hitNo
  575. sub allCoverages {
  576. my $self = shift;
  577. my @hitNos = @_; # hit numbers in the order given by template selection strategy
  578. # if coverage of lastly added template is wanted:
  579. # hhmakemodel option -m 4 3 creates a pir file with templates in order 3 4 (ascending),
  580. # but last added template is 3 => lastTemplIdx = 1 (i.e. first template in pir file (0 is query))
  581. my @coverage;
  582. my %newlyCovered;
  583. ## initialization
  584. for (my $i=0; $i<$self->get_ali_length(); $i++) {
  585. $coverage[$i] = 0;
  586. }
  587. my $query = $self->[0]->{seq};
  588. $query =~ s/\*$//;
  589. my @qRes = split(//, $query);
  590. ## for all templates
  591. for (my $i=0; $i<@hitNos; $i++) {
  592. my $hitNo = $hitNos[$i];
  593. ## find index in pir file
  594. my $pirLineIdx = &rankForHitNo($hitNo, @hitNos);
  595. print "pirLineIdx($hitNo)=$pirLineIdx\n";
  596. $newlyCovered{$hitNo} = 0;
  597. ## get sequence
  598. my $tseq = $self->[$pirLineIdx]->{seq};
  599. $tseq =~ s/\*$//;
  600. my @tRes = split(//, $tseq);
  601. ## check its coverage
  602. for (my $j=0; $j<@tRes; $j++) {
  603. ## both query and template must not be a gap
  604. if ($tRes[$j] ne '-' && $qRes[$j] ne '-') {
  605. if ($coverage[$j] != 1) {
  606. $newlyCovered{$hitNo}++;
  607. }
  608. $coverage[$j] = 1;
  609. }
  610. }
  611. }
  612. return %newlyCovered;
  613. }
  614. sub rankForHitNo {
  615. my $hitNo = shift;
  616. my @allHitNos = @_;
  617. my $rank = 1;
  618. for (my $i=0; $i<@allHitNos; $i++) {
  619. if ($hitNo > $allHitNos[$i]) {
  620. $rank++;
  621. }
  622. }
  623. return $rank;
  624. }
  625. 1;