checkTemplates.pm 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. ## read hhsearch results file (containing all filtered hits ranked by a predicted TM-Score)
  2. ## create pir alignment for MODELLER software
  3. ## if necessary create artificial pdb files (add 50 Angstrom)
  4. package checkTemplates;
  5. require Exporter;
  6. use strict;
  7. use config;
  8. use utilities;
  9. our @ISA = qw(Exporter);
  10. our @EXPORT = qw(CheckTemplates);
  11. ## THIS IS (not my own) UGLY CODE AND SHOULD BE REWRITTEN
  12. ## requires valid path to hhmakemodel
  13. ################################################################
  14. ## description: read selected hits in hhr-file
  15. ## check for overlap and repeat domains!
  16. ## input: hhrFile - the aritificial hhr file produced by
  17. ## CreateHHRFromPrediction
  18. ## a3mQuery: the a3m query alignment file-path
  19. ## pirfile: basename of pirfile; pir-file will be created now
  20. ## by calling hhmakemodel, see below
  21. ################################################################
  22. sub CheckTemplates {
  23. my $outbase = shift;
  24. my $workingDir = shift;
  25. my $config = shift;
  26. my $hhrFile = "$outbase.hhr";
  27. my $a3mQuery = "$outbase.a3m";
  28. my $pirFile = "$outbase.pir";
  29. my $pdbdir = $config->get_pdbdir();
  30. my $hhmakemodel = $config->get_hhmakemodel();
  31. my @hits_check;
  32. my $hit;
  33. my @hits;
  34. my $ID;
  35. my $qstart;
  36. my $qend;
  37. my $tstart;
  38. my $tend;
  39. my $verbose = 1;
  40. ## verbose mode for hh tools
  41. my $v = 1;
  42. open(HHR, "< $hhrFile") or die("Error checkTemplates: Cant open $hhrFile: $!\n");
  43. while (my $line = <HHR>) {
  44. if ($line =~ /^\s*(\d+)\s+(\S+).+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\S+)\)$/) {
  45. $hit = $1;
  46. push (@hits, $hit);
  47. $ID = $2;
  48. $qstart = $9;
  49. $qend = $10;
  50. $tstart = $11;
  51. $tend = $12;
  52. if ($ID =~ /^[defgh](\d[a-z0-9]{3})([a-z0-9_\.])[a-z0-9_]/) {
  53. $ID = $1;
  54. my $chain;
  55. if ($2 eq "_") {
  56. $chain = "[A ]";
  57. }
  58. else {
  59. $chain = uc($2);
  60. }
  61. $ID .= "_$chain";
  62. }
  63. push(@hits_check, [$hit, $ID, $qstart, $qend, $tstart, $tend]);
  64. }
  65. }
  66. close (HHR);
  67. if ($verbose >= 2) {
  68. foreach my $c (@hits_check){
  69. foreach my $d (@{$c}){
  70. print "$d\t";
  71. }
  72. print "\n";
  73. }
  74. print "\n";
  75. }
  76. #########################################
  77. ## check if same template is used several
  78. ## times and in overlapping parts (e.g repeatproteins!!!)
  79. ## -> because of problems with MODELLER restraints!
  80. #########################################
  81. my @aligned;
  82. my $donttake = "";
  83. my $added = "";
  84. my $doubles = 0;
  85. for (my $h=0; $h<@hits_check-1; $h++) {
  86. for (my $i=$h+1; $i<@hits_check; $i++) {
  87. if (($donttake !~ /,$hits_check[$h][0],/) && ($donttake !~ /,$hits_check[$i][0],/)) {
  88. my $queryov=0;
  89. my $templov=0;
  90. #check: same templates?
  91. if ($hits_check[$h][1] eq $hits_check[$i][1]){
  92. print "Hit: $hits_check[$h][0] & $hits_check[$i][0] same templates!\t";
  93. #check: overlap?
  94. #print "$hits_check[$h][2]&$hits_check[$i][2]\n$hits_check[$h][3]&$hits_check[$i][3]\n$hits_check[$h][4]&$hits_check[$i][4]\n$hits_check[$h][5]&$hits_check[$i][5]\n\n";
  95. #check: overlap in query!?!
  96. for (my $i=1; $i<=5000; $i++) {
  97. $aligned[$i]=0;
  98. }
  99. my $firstq=$hits_check[$h][2];
  100. my $lastq=$hits_check[$h][3];
  101. for (my $i=$firstq; $i<=$lastq; $i++) {
  102. $aligned[$i]=1
  103. }
  104. $firstq=$hits_check[$i][2];
  105. $lastq=$hits_check[$i][3];
  106. my $queryov=0;
  107. my $mlenq=0;
  108. for (my $i=$firstq; $i<=$lastq; $i++) {
  109. if ($aligned[$i]==1) {
  110. $queryov++;
  111. }
  112. else {
  113. $mlenq++;
  114. }
  115. }
  116. #check: overlap in template!?!
  117. for (my $i=1; $i<=5000; $i++) { $aligned[$i]=0; }
  118. my $firstt = $hits_check[$h][4];
  119. my $lastt = $hits_check[$h][5];
  120. for (my $i=$firstt; $i<=$lastt; $i++) { $aligned[$i]=1; }
  121. $firstt = $hits_check[$i][4];
  122. $lastt = $hits_check[$i][5];
  123. my $templov=0;
  124. my $mlent=0;
  125. for (my $i=$firstt; $i<=$lastt; $i++) {
  126. if ($aligned[$i]==1) {
  127. $templov++;
  128. }
  129. else {
  130. $mlent++;
  131. }
  132. }
  133. print "Overlap: in Query: $queryov AA in Template: $templov AA\t";
  134. if ($queryov<=20 && $templov>20){
  135. print "repeat-domain: $hits_check[$h][0] & $hits_check[$i][0]\t";
  136. if (($added eq "") || ($added !~ /$hits_check[$h][1]/)){
  137. $doubles++;
  138. print " -> create $workingDir/$hits_check[$h][1]_$h.pdb\n";
  139. system ("cp $pdbdir/$hits_check[$h][1].pdb $workingDir/$hits_check[$h][1]_$h.pdb\n");
  140. #add 50 Angstrom to x coordinate in pdb file:
  141. open (PDB, "< $workingDir/$hits_check[$h][1]_$h.pdb") or die ("Error: cannot open $workingDir/$hits_check[$h][1]_$h.pdb\n");
  142. my @lines = <PDB>;
  143. close(PDB);
  144. for (my $i=0; $i<@lines; $i++) {
  145. if ($lines[$i]=~ /^(.{30})(\s*)(-?\d+.\d+)(\s*-?\d+.\d+\s*-?\d+.\d+\s*.*)/){
  146. my $xcoord = $3+50;
  147. $lines[$i] = sprintf("$1%8.3f$4\n", $xcoord);
  148. }
  149. }
  150. open (PDBnew, "> $workingDir/$hits_check[$h][1]_$h.pdb") or die ("Error: cannot open $workingDir/$hits_check[$h][1]_$h.pdb\n");
  151. print(PDBnew @lines);
  152. close (PDBnew);
  153. $added .= " $hits_check[$h][1]_$h";
  154. }
  155. elsif($added !~ /$hits_check[$h][1]_$h/){
  156. $doubles++;
  157. print " -> create $workingDir/$hits_check[$h][1]_$h.pdb\n";
  158. my $copy;
  159. if ($added =~ /.*($hits_check[$h][1]_\d*).*?$/) {
  160. $copy="$workingDir/$1.pdb";
  161. }
  162. system ("cp $copy $workingDir/$hits_check[$h][1]_$h.pdb\n");
  163. #add 50 Angstrom to x coordinate in pdb file:
  164. open (PDB, "< $workingDir/$hits_check[$h][1]_$h.pdb") or die ("Error: cannot open $workingDir/$hits_check[$h][1]_$h.pdb\n");
  165. my @lines = <PDB>;
  166. close(PDB);
  167. for (my $i=0; $i<@lines; $i++) {
  168. if ($lines[$i]=~ /^(.{30})(\s*)(-?\d+.\d+)(\s*-?\d+.\d+\s*-?\d+.\d+\s*.*)/) {
  169. my $xcoord = $3+50;
  170. $lines[$i] = sprintf("$1%8.3f$4\n", $xcoord);
  171. }
  172. }
  173. open (PDBnew, "> $workingDir/$hits_check[$h][1]_$h.pdb") or die ("Error: cannot open $workingDir/$hits_check[$h][1]_$h.pdb\n");
  174. print(PDBnew @lines);
  175. close (PDBnew);
  176. $added .= " $hits_check[$h][1]_$h";
  177. }
  178. else{print "\n";}
  179. }
  180. elsif ($queryov>20){
  181. print "ignore HIT $hits_check[$i][0] for modeling\n";
  182. $donttake .= ",$hits_check[$i][0],";
  183. }
  184. else{print "\n"}
  185. }
  186. }
  187. }
  188. }
  189. my @hits_take;
  190. for (my $h=0; $h<@hits; $h++){
  191. if ($donttake !~ /,$hits[$h],/){
  192. push(@hits_take,$hits[$h]);
  193. }
  194. }
  195. &System("$hhmakemodel $hhrFile -m @hits_take -q $a3mQuery -v $v -d $pdbdir -pir $outbase.pir");
  196. if ($doubles > 0) {
  197. my @pdbs = <$workingDir/*.pdb>;
  198. my $checkedPDBs = "";
  199. my $duplis = 1;
  200. open (PIR, "< $outbase.pir") or die ("Error: cannot open $outbase.pir: $!\n");
  201. my @lines = <PIR>;
  202. close(PIR);
  203. for (my $i=0; $i<@lines; $i++) {
  204. if ($lines[$i] =~ /^structureX:/) {
  205. $lines[$i-1] =~ />P1;(\S+)/;
  206. my $pdbid = $1;
  207. for (my $j=0; $j<@pdbs; $j++) {
  208. if (($pdbs[$j] =~ /$workingDir\/$pdbid\_\d+\.pdb/) && ($checkedPDBs !~ /$pdbid/)) {
  209. $checkedPDBs .= "$pdbid ";
  210. last;
  211. }
  212. elsif (($pdbs[$j] =~ /$workingDir\/$pdbid\_\d+\.pdb/)) {
  213. print"\nReplace $pdbid by DUPLICATEPDB$duplis.$pdbid.pdb!\n";
  214. system "mv $pdbs[$j] $workingDir/DUPLICATEPDB$duplis.$pdbid.pdb";
  215. $lines[$i-1] =~ s/>P1;$pdbid/>P1;DUPLICATEPDB$duplis.$pdbid/;
  216. $lines[$i] =~ s/^structureX:$pdbid:(.*)/structureX:DUPLICATEPDB$duplis.$pdbid:$1/;
  217. $duplis++;
  218. @pdbs = grep !/$pdbs[$j]/, @pdbs;
  219. last;
  220. }
  221. }
  222. }
  223. }
  224. open (OUT, "> $outbase.pir") or die ("Error: cannot write to pir-file: $!\n");
  225. print(OUT @lines);
  226. close(OUT);
  227. }
  228. }