hh_reader.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #!/usr/bin/env python
  2. """
  3. Parser for hhr result files created with hhblits|hhsearch|hhalign -o <hhr_file>
  4. """
  5. import sys
  6. from collections import namedtuple
  7. __author__ = 'Markus Meier ([email protected])'
  8. __version__ = '1.0'
  9. __license__ = "GPL-3"
  10. hhr_alignment = namedtuple('hhr_alignment', ['query_id', 'query_length', 'query_neff',
  11. 'template_id', 'template_length', 'template_info',
  12. 'template_neff', 'query_ali', 'template_ali',
  13. 'start', 'end', 'probability', 'evalue', 'score',
  14. 'aligned_cols', 'identity', 'similarity', 'sum_probs'])
  15. class HHRFormatError(Exception):
  16. def __init__(self, value):
  17. self.value = "ERROR: "+value
  18. def __str__(self):
  19. return repr(self.value)
  20. def get_sequence_name(header):
  21. name = header.replace(">", "").split()[0]
  22. return name
  23. def parse_result(lines):
  24. results = []
  25. query_id = None
  26. query_length = None
  27. query_neff = None
  28. query_seq = []
  29. template_id = None
  30. template_length = None
  31. template_seq = []
  32. template_info = None
  33. query_start = None
  34. query_end = None
  35. template_start = None
  36. template_end = None
  37. probability = None
  38. evalue = None
  39. score = None
  40. identity = None
  41. similarity = None
  42. template_neff = None
  43. sum_probs = None
  44. aligned_cols = None
  45. skipped_ali_tags = ["ss_dssp", "ss_pred", "Consensus"]
  46. is_alignment_section = False
  47. for line in lines:
  48. if(line.startswith("Query")):
  49. query_id = line.split()[1]
  50. elif(line.startswith("Match_columns")):
  51. query_length = int(line.split()[1])
  52. elif(line.startswith("Neff")):
  53. query_neff = float(line.split()[1])
  54. elif(is_alignment_section and (line.startswith("No") or line.startswith("Done!"))):
  55. if query_start is not None:
  56. result = hhr_alignment(query_id, query_length, query_neff,
  57. template_id, template_length, template_info, template_neff,
  58. "".join(query_seq), "".join(template_seq), (query_start, template_start),
  59. (query_end, template_end), probability, evalue, score,
  60. aligned_cols, identity, similarity, sum_probs)
  61. results.append(result)
  62. template_id = None
  63. template_info = None
  64. query_seq = []
  65. template_seq = []
  66. query_start = None
  67. query_end = None
  68. template_start = None
  69. template_end = None
  70. elif(line.startswith("Probab")):
  71. tokens = line.split()
  72. probability = float(tokens[0].split("=")[1])
  73. evalue = float(tokens[1].split("=")[1])
  74. score = float(tokens[2].split("=")[1])
  75. aligned_cols = int(tokens[3].split("=")[1])
  76. identity = float(tokens[4].split("=")[1].replace("%", "")) / 100.0
  77. similarity = float(tokens[5].split("=")[1])
  78. sum_probs = float(tokens[6].split("=")[1])
  79. if(len(tokens) > 7):
  80. template_neff = float(tokens[7].split("=")[1])
  81. continue
  82. elif(line.startswith(">")):
  83. is_alignment_section = True
  84. template_id = line[1:].split()[0]
  85. template_info = line
  86. elif(line.startswith("Q")):
  87. tokens = line.split()
  88. if(tokens[1] in skipped_ali_tags):
  89. continue
  90. try:
  91. token_2 = tokens[2].replace("(", "").replace(")", "")
  92. token_2 = int(token_2)
  93. except:
  94. raise HHRFormatError(("Converting failure of start index ({}) "
  95. "of query alignment").format(tokens[2]))
  96. if query_start is None:
  97. query_start = token_2
  98. query_start = min(query_start, token_2)
  99. try:
  100. token_4 = tokens[4].replace("(", "").replace(")", "")
  101. token_4 = int(token_4)
  102. except:
  103. raise HHRFormatError(("Converting failure of end index ({}) "
  104. "of query alignment").format(tokens[4]))
  105. if query_end is None:
  106. query_end = token_4
  107. query_end = max(query_end, token_4)
  108. query_seq.append(tokens[3])
  109. elif(line.startswith("T")):
  110. tokens = line.split()
  111. if(tokens[1] in skipped_ali_tags):
  112. continue
  113. template_seq.append(tokens[3])
  114. try:
  115. token_2 = tokens[2].replace("(", "").replace(")", "")
  116. token_2 = int(token_2)
  117. except:
  118. raise HHRFormatError(("Converting failure of start index ({}) "
  119. "of template alignment").format(tokens[2]))
  120. if template_start is None:
  121. template_start = token_2
  122. template_start = min(template_start, token_2)
  123. try:
  124. token_4 = tokens[4].replace("(", "").replace(")", "")
  125. token_4 = int(token_4)
  126. except:
  127. raise HHRFormatError(("Converting failure of end index ({}) "
  128. "of template alignment").format(tokens[4]))
  129. if template_end is None:
  130. template_end = token_4
  131. template_end = max(template_end, token_4)
  132. try:
  133. token_5 = tokens[4].replace("(", "").replace(")", "")
  134. token_5 = int(token_5)
  135. except:
  136. raise HHRFormatError(("Converting failure of template length ({}) "
  137. "in template alignment").format(tokens[5]))
  138. template_length = token_5
  139. if(template_id is not None and query_start is not None):
  140. result = hhr_alignment(query_id, query_length, query_neff,
  141. template_id, template_length, template_info, template_neff,
  142. "".join(query_seq), "".join(template_seq), (query_start, template_start),
  143. (query_end, template_end), probability, evalue, score,
  144. aligned_cols, identity, similarity, sum_probs)
  145. results.append(result)
  146. return results
  147. def read_result(input_file):
  148. with open(input_file) as fh:
  149. lines = fh.readlines()
  150. return parse_result(lines)
  151. def main():
  152. counter = 0
  153. for result in read_result(sys.argv[1]):
  154. print("Alignment " + str(counter) + "\t evalue: " + str(result.evalue) +
  155. "\t probability: " + str(result.probability))
  156. print(result.query_id + "\t" + str(result.start[0]) + "\t" +
  157. result.query_ali + "\t" +
  158. str(result.end[0]))
  159. print(result.template_id + "\t" + str(result.start[1]) + "\t" +
  160. result.template_ali + "\t" +
  161. str(result.end[1]))
  162. counter += 1
  163. if __name__ == "__main__":
  164. main()