pdbfilter.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. #!/usr/bin/env python
  2. """
  3. Created in Jun 2016
  4. @author: Harald Voehringer
  5. """
  6. from itertools import groupby
  7. from collections import namedtuple, defaultdict
  8. import textwrap
  9. DEBUG = False
  10. FASTA = namedtuple('FASTA', ['header', 'seq'])
  11. PDBDATA = namedtuple('PDBDATA', ['entry', 'res', 'rfr', 'comp', 'met'])
  12. def as_pairs(fasta):
  13. """ Reads in fasta headers as defined """
  14. for header, group in groupby(fasta, lambda x: x[0] == '>'):
  15. if header:
  16. line = next(group)
  17. identifier = line.split(' ')[0].split('>')[1]
  18. header_line = str(line.strip())
  19. else:
  20. sequence = ''.join(line.strip() for line in group).upper()
  21. data = FASTA(header_line, sequence)
  22. yield identifier, data
  23. def read_fasta(fasta_file):
  24. """ Reads in fasta sequences with help of the as_pairs function."""
  25. fasta_dic = dict()
  26. duplicate = 0
  27. with open(fasta_file) as fasta:
  28. for title, sequence in as_pairs(fasta):
  29. if title not in fasta_dic:
  30. fasta_dic[title] = sequence
  31. else:
  32. duplicate +=1
  33. if duplicate != 0:
  34. print ('! Warning found duplicate {num} fasta.'.format(
  35. num = duplicate))
  36. return fasta_dic
  37. def read_fasta_annotations(fname):
  38. """ Reads the information about resolution, R-free and completness which
  39. be outputed by cif2fasta."""
  40. annotations = dict()
  41. with open(fname) as fh:
  42. for line in fh:
  43. if len(line) > 0 and line[0] == '#':
  44. continue
  45. identifier, res, r_free, comp, method = line.strip().split('\t')
  46. try:
  47. res = float(res)
  48. except ValueError:
  49. res = None
  50. try:
  51. r_free = float(r_free)
  52. except ValueError:
  53. r_free = None
  54. try:
  55. comp = float(comp)
  56. except ValueError:
  57. comp = None
  58. annotations[identifier] = PDBDATA(identifier, res, r_free, comp, method)
  59. return annotations
  60. def read_cluster(cluster_file):
  61. """ Reads in the clusters (this is the output of MMseqs). """
  62. cluster = defaultdict(set)
  63. with open(cluster_file) as fh:
  64. for line in fh:
  65. exemplar, node = line.split()
  66. if node in cluster[exemplar]:
  67. raise Exception('! Please check the clustering procedure: {n} was found twice in cluster {e}.'.format(
  68. n = node,
  69. e = exemplar))
  70. else:
  71. cluster[exemplar].add(node)
  72. return cluster
  73. def read_pdblist(in_file):
  74. pdb_list = set()
  75. with open(in_file) as fh:
  76. for line in fh:
  77. if line.startswith('#'):
  78. continue
  79. strip_line = line.strip()
  80. pdb_code = strip_line.split('_')[0]
  81. # this is a very very basic check
  82. if len(pdb_code) != 4:
  83. print ('! Warning: {line} seems to be an incorrect identifer. Skipping it.'.format(
  84. line = strip_line))
  85. continue
  86. pdb = strip_line.upper()
  87. pdb_list.add(pdb)
  88. return pdb_list
  89. def select_sequences(clusters, annotations):
  90. selected_sequences = set()
  91. for idx, cluster in enumerate(clusters):
  92. nodes = [ annotations[entry] for entry in clusters[cluster] ]
  93. if DEBUG:
  94. print ('Processing Cluster {c} ({i}): {m}'.format(
  95. c = cluster,
  96. i = idx,
  97. m = ', '.join([x.entry for x in nodes])))
  98. # select the best entries of the cluster
  99. best_res = float('inf')
  100. best_entry_res = None
  101. best_rfr = float('inf')
  102. best_entry_rfr = None
  103. best_comp = -float('inf')
  104. best_entry_comp = None
  105. # iterate through each entry in nodes while selecting the representative sequence
  106. for node in nodes:
  107. if (node.res is not None) and (node.res < best_res):
  108. best_res = node.res
  109. best_entry_res = node.entry
  110. if (node.rfr is not None) and (node.rfr < best_rfr):
  111. best_rfr = node.rfr
  112. best_entry_rfr = node.entry
  113. if (node.comp is not None) and (node.comp > best_comp):
  114. best_comp = node.comp
  115. best_entry_comp = node.entry
  116. if best_entry_res is not None:
  117. selected_sequences.add(best_entry_res)
  118. if DEBUG:
  119. print (' - Selected {n} (best resolution = {r}).'.format(
  120. n = best_entry_res,
  121. r = best_res))
  122. if best_entry_rfr is not None:
  123. selected_sequences.add(best_entry_rfr)
  124. if DEBUG:
  125. print (' - Selected {n} (best R-free = {r}).'.format(
  126. n = best_entry_rfr,
  127. r = best_rfr))
  128. if best_entry_comp is not None:
  129. selected_sequences.add(best_entry_comp)
  130. if DEBUG:
  131. print (' - Selected {n} (best completness = {r}).'.format(
  132. n = best_entry_comp,
  133. r = best_comp))
  134. if best_entry_res == None and best_entry_rfr == None and best_entry_comp == None:
  135. print ('! Warning: Did not find any representative entry for cluster {c}.'.format(
  136. c = cluster))
  137. return selected_sequences
  138. def write_sequences(out_file, fasta_db, selected_sequences):
  139. """ Writes selected sequences to a fasta file."""
  140. with open(out_file, 'w') as fh:
  141. for seq in selected_sequences:
  142. fasta_entry = '{h}\n{seq}\n'.format(
  143. h = fasta_db[seq].header,
  144. seq = '\n'.join(textwrap.wrap(fasta_db[seq].seq, 80)))
  145. fh.write(fasta_entry)
  146. def arg():
  147. import argparse
  148. description = """
  149. pdbfilter.py selects from sequence clusters (determined by MMSeqs) the sequences
  150. which have the best resolution, R-free factor and/or completness and writes them to a fasta file.
  151. """.replace('\n', '')
  152. epilog = '2016 Harald Voehringer'
  153. # Initiate a ArgumentParser Class
  154. parser = argparse.ArgumentParser(description = description, epilog = epilog)
  155. # Call add_options to the parser
  156. parser.add_argument('fasta', help = 'input fasta file (created by cif2fasta.py)', metavar = 'FILE')
  157. parser.add_argument('cluster', help = 'sequence clusters (MMseqs)', metavar = 'FILE')
  158. parser.add_argument('annotations', help = 'annotations file (created by cif2fasta using the -p flag, contains information about resolution, R-free and completness of sequences).', metavar = 'FILE')
  159. parser.add_argument('out_file', help = 'output fasta file', metavar = 'FILE')
  160. parser.add_argument('-i', '--include', help = 'include PDB chains', metavar = 'FILE')
  161. parser.add_argument('-r', '--remove', help = 'exclude PDB chains', metavar = 'FILE')
  162. parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose mode')
  163. return parser
  164. def main():
  165. import sys
  166. parser = arg()
  167. args = parser.parse_args(sys.argv[1:])
  168. global DEBUG
  169. if args.verbose:
  170. DEBUG = True
  171. fasta = read_fasta(args.fasta)
  172. clu70 = read_cluster(args.cluster)
  173. annot = read_fasta_annotations(args.annotations)
  174. print ("Found {i} clusters.".format(
  175. i = len(clu70.keys())))
  176. # choose representative sequences from clusters
  177. selection = select_sequences(clu70, annot)
  178. # make sure that pdbs specified in the argument are included
  179. if args.include:
  180. to_include = read_pdblist(args.include)
  181. for pdb in to_include:
  182. if pdb in fasta.keys():
  183. if pdb not in selection:
  184. if DEBUG:
  185. print ('Adding {p}.'.format(
  186. p = pdb))
  187. selection.add(pdb)
  188. else:
  189. print ('! Warning: {p} was not found in input fasta.'.format(
  190. p = pdb))
  191. # removes entries
  192. if args.remove:
  193. to_remove = read_pdblist(args.remove)
  194. for pdb in to_remove:
  195. if pdb in selection:
  196. if DEBUG:
  197. print ('Removing {p}.'.format(
  198. p = pdb))
  199. selection.remove(pdb)
  200. # write them to file
  201. write_sequences(args.out_file, fasta, selection)
  202. if __name__ == "__main__":
  203. main()