123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- #!/usr/bin/env python
- """
- Parser for hhr result files created with hhblits|hhsearch|hhalign -o <hhr_file>
- """
- import sys
- from collections import namedtuple
- __author__ = 'Markus Meier ([email protected])'
- __version__ = '1.0'
- __license__ = "GPL-3"
# Record for a single query/template alignment read from an hhr file.
# `start` and `end` are filled by parse_result with (query, template) pairs.
hhr_alignment = namedtuple(
    'hhr_alignment',
    [
        # query-level fields
        'query_id', 'query_length', 'query_neff',
        # template-level fields
        'template_id', 'template_length', 'template_info', 'template_neff',
        # aligned sequences and their index ranges
        'query_ali', 'template_ali', 'start', 'end',
        # per-hit statistics
        'probability', 'evalue', 'score',
        'aligned_cols', 'identity', 'similarity', 'sum_probs',
    ],
)
class HHRFormatError(Exception):
    """Error raised when hhr content cannot be interpreted.

    The message given to the constructor is kept on ``value`` with an
    ``"ERROR: "`` prefix.
    """

    def __init__(self, value):
        prefix = "ERROR: "
        self.value = prefix + value

    def __str__(self):
        # The repr of the stored message is returned, so the text appears
        # wrapped in quotes when the exception is printed.
        return "{!r}".format(self.value)
def get_sequence_name(header):
    """Return the first whitespace-delimited token of ``header``.

    Every ``>`` character is removed from the header before it is split,
    not only a leading one.
    """
    stripped = header.replace(">", "")
    fields = stripped.split()
    return fields[0]
def _parse_ali_index(tokens, position, message):
    """Return ``tokens[position]`` as an int, ignoring any parentheses.

    ``message`` is a format string with a single ``{}`` placeholder that
    receives the offending token. HHRFormatError is raised when the token
    is missing or is not an integer.
    """
    raw = tokens[position] if position < len(tokens) else "<missing>"
    try:
        return int(raw.replace("(", "").replace(")", ""))
    except ValueError:
        raise HHRFormatError(message.format(raw))


def parse_result(lines):
    """Parse the lines of an hhr result file into alignment records.

    "Query", "Match_columns" and "Neff" lines supply the query fields.
    A hit starts at a ">" line and is closed by the next line starting
    with "No" or "Done!", or by the end of the input.

    Args:
        lines: iterable of text lines of an hhr file.

    Returns:
        A list of hhr_alignment tuples in input order. ``start`` and
        ``end`` are ``(query, template)`` index pairs, ``identity`` is the
        percentage from the "Probab" line divided by 100, and
        ``template_neff`` is None when that line has no eighth field.

    Raises:
        HHRFormatError: if a start index, end index or template length on
            a "Q"/"T" alignment line is missing or not an integer.
    """
    # Annotation rows that share the Q/T prefix but carry no residues.
    skipped_ali_tags = ("ss_dssp", "ss_pred", "Consensus")

    results = []

    # Query-level values, taken from the file header.
    query_id = None
    query_length = None
    query_neff = None

    # State of the hit currently being collected.
    template_id = None
    template_info = None
    template_length = None
    template_neff = None
    query_seq = []
    template_seq = []
    query_start = None
    query_end = None
    template_start = None
    template_end = None

    # Statistics from the most recent "Probab" line.
    probability = None
    evalue = None
    score = None
    aligned_cols = None
    identity = None
    similarity = None
    sum_probs = None

    is_alignment_section = False

    def current_hit():
        # Reads the enclosing variables at call time, i.e. the state of the
        # hit that is about to be closed.
        return hhr_alignment(
            query_id, query_length, query_neff,
            template_id, template_length, template_info, template_neff,
            "".join(query_seq), "".join(template_seq),
            (query_start, template_start), (query_end, template_end),
            probability, evalue, score,
            aligned_cols, identity, similarity, sum_probs)

    for line in lines:
        if line.startswith("Query"):
            query_id = line.split()[1]
        elif line.startswith("Match_columns"):
            query_length = int(line.split()[1])
        elif line.startswith("Neff"):
            query_neff = float(line.split()[1])
        elif is_alignment_section and line.startswith(("No", "Done!")):
            # A new hit (or the end marker) closes the hit collected so far.
            if query_start is not None:
                results.append(current_hit())
            template_id = None
            template_info = None
            query_seq = []
            template_seq = []
            query_start = None
            query_end = None
            template_start = None
            template_end = None
        elif line.startswith("Probab"):
            fields = line.split()
            probability = float(fields[0].split("=")[1])
            evalue = float(fields[1].split("=")[1])
            score = float(fields[2].split("=")[1])
            aligned_cols = int(fields[3].split("=")[1])
            identity = float(fields[4].split("=")[1].replace("%", "")) / 100.0
            similarity = float(fields[5].split("=")[1])
            sum_probs = float(fields[6].split("=")[1])
            # Reset when the field is absent so the previous hit's value
            # is not carried over into this one.
            if len(fields) > 7:
                template_neff = float(fields[7].split("=")[1])
            else:
                template_neff = None
        elif line.startswith(">"):
            is_alignment_section = True
            template_id = line[1:].split()[0]
            template_info = line
        elif line.startswith("Q"):
            tokens = line.split()
            if tokens[1] in skipped_ali_tags:
                continue
            block_start = _parse_ali_index(
                tokens, 2,
                "Converting failure of start index ({}) of query alignment")
            block_end = _parse_ali_index(
                tokens, 4,
                "Converting failure of end index ({}) of query alignment")
            # An alignment spans several blocks; keep the overall extent.
            if query_start is None:
                query_start = block_start
            else:
                query_start = min(query_start, block_start)
            if query_end is None:
                query_end = block_end
            else:
                query_end = max(query_end, block_end)
            query_seq.append(tokens[3])
        elif line.startswith("T"):
            tokens = line.split()
            if tokens[1] in skipped_ali_tags:
                continue
            block_start = _parse_ali_index(
                tokens, 2,
                "Converting failure of start index ({}) of template alignment")
            block_end = _parse_ali_index(
                tokens, 4,
                "Converting failure of end index ({}) of template alignment")
            # The length is the sixth token; the fifth is the end index.
            template_length = _parse_ali_index(
                tokens, 5,
                "Converting failure of template length ({}) "
                "in template alignment")
            if template_start is None:
                template_start = block_start
            else:
                template_start = min(template_start, block_start)
            if template_end is None:
                template_end = block_end
            else:
                template_end = max(template_end, block_end)
            template_seq.append(tokens[3])

    # Input that ends without a closing "No"/"Done!" line still yields the
    # last collected hit.
    if template_id is not None and query_start is not None:
        results.append(current_hit())

    return results
def read_result(input_file):
    """Read the file at ``input_file`` and return parse_result of its lines."""
    with open(input_file) as handle:
        # The whole file is read before parsing starts, after which the
        # handle is closed.
        content = list(handle)
    return parse_result(content)
def main():
    """Print a summary of each alignment in the file named by sys.argv[1].

    For every alignment three lines are written: its running number with
    evalue and probability, then the query row and the template row
    (id, start index, aligned sequence, end index; tab-separated).
    """
    for index, hit in enumerate(read_result(sys.argv[1])):
        headline = "".join([
            "Alignment ", str(index),
            "\t evalue: ", str(hit.evalue),
            "\t probability: ", str(hit.probability),
        ])
        query_row = "\t".join(
            [hit.query_id, str(hit.start[0]), hit.query_ali, str(hit.end[0])])
        template_row = "\t".join(
            [hit.template_id, str(hit.start[1]), hit.template_ali,
             str(hit.end[1])])
        print(headline)
        print(query_row)
        print(template_row)


if __name__ == "__main__":
    main()
|