#!/usr/bin/env python
"""
hhsuitedb.py
Creates HH-suite database files from A3M and HHM files

Usage: python hhsuitedb.py -o <db_basename> [--ia3m <a3m_glob>] [--ihhm <hhm_glob>] [--ics219 <cs219_glob>] [more_options]

HHsuite version 3.0.0 (15-03-2015)

Reference:
Remmert M., Biegert A., Hauser A., and Soding J.
HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).

(C) Johannes Soeding, 2012

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

We are very grateful for bug reports! Please contact us at [email protected]
"""

from optparse import OptionParser
from glob import glob
from subprocess import check_call
import ffindex
import a3m
import tempfile
import os
import sys
import shutil
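
# External programs expected on $PATH at runtime: ffindex_build, ffindex_modify,
# ffindex_apply_mpi (plus mpirun), hhmake and cstranslate. cstranslate also
# needs the HHLIB environment variable to locate data/cs219.lib and
# data/context_data.lib (see calculate_cs219 below).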

def write_glob_to_file(glob_expr, filename):
    fh = open(filename, "w")
    for f in glob(glob_expr):
        fh.write(f + "\n")
    fh.close()


def write_set_to_file(set, filename):
    fh = open(filename, "w")
    for f in set:
        fh.write(f + "\n")
    fh.close()
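
# Thin wrappers around the ffindex command-line tools. Every database handled
# here is a pair of files: <name>.ffdata (the concatenated entries) and
# <name>.ffindex (the index into it).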

def remove_files_from_index(files_file, database_index_path):
    if os.path.exists(database_index_path) and os.path.getsize(database_index_path) > 0:
        check_call(" ".join(["ffindex_modify", "-us", "-f", files_file, database_index_path]), shell=True)


def build_ffindex_database(files_file, data_path, index_path):
    check_call(" ".join(["ffindex_build", "-s", data_path, index_path, "-f", files_file]), shell=True)


def optimize_database(database_data_path, database_index_path):
    if (os.path.exists(database_data_path) and os.path.exists(database_index_path)
            and os.path.getsize(database_data_path) > 0 and os.path.getsize(database_index_path) > 0):
        check_call(" ".join(["ffindex_build", "-as",
                             database_data_path + ".optimized", database_index_path + ".optimized",
                             "-d", database_data_path, "-i", database_index_path]), shell=True)
        shutil.move(database_data_path + ".optimized", database_data_path)
        shutil.move(database_index_path + ".optimized", database_index_path)
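
# Only alignments with more than 50 sequences get a precomputed HHM below;
# if none qualify, the first entry is used so the hhm database is never empty.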

def get_large_a3ms(a3m_base_path):
    entries = ffindex.read_index(a3m_base_path + ".ffindex")
    data = ffindex.read_data(a3m_base_path + ".ffdata")

    large_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
            if alignment.number_sequences > 50:
                large_alignments.add(entry.name)
        except:
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if len(entries) > 0 and len(large_alignments) == 0:
        large_alignments.add(entries[0].name)

    return large_alignments

def calculate_hhm(threads, a3m_base_path, hhm_base_path):
    tmp_dir = tempfile.mkdtemp()

    large_a3ms = get_large_a3ms(a3m_base_path)
    large_a3m_index = os.path.join(tmp_dir, "large.ffindex")
    a3m_index = read_ffindex(a3m_base_path + ".ffindex")
    write_subset_index(a3m_index, large_a3ms, large_a3m_index)

    check_call(" ".join(["mpirun", "-np", threads, "ffindex_apply_mpi",
                         a3m_base_path + ".ffdata", large_a3m_index,
                         "-d", hhm_base_path + ".ffdata", "-i", hhm_base_path + ".ffindex",
                         "--", "hhmake", "-v", str(0), "-i", "stdin", "-o", "stdout"]), shell=True)
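
# cs219 files hold the column-state sequences (219-letter alphabet) used by the
# hhblits prefilter; they are generated from the a3m's with cstranslate.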

def calculate_cs219(threads, a3m_db_base, cs_db_base):
    hhlib_environment = os.environ.get('HHLIB')
    if hhlib_environment:
        # TODO: check
        check_call(" ".join(["cstranslate",
                             "-A", os.path.join(hhlib_environment, "data/cs219.lib"),
                             "-D", os.path.join(hhlib_environment, "data/context_data.lib"),
                             "-x 0.3 -c 4 --ffindex",
                             "-i", a3m_db_base, "-o", cs_db_base,
                             "-I a3m -b"]),
                   env=dict(os.environ, OMP_NUM_THREADS=threads), shell=True)
    else:
        sys.stderr.write("ERROR: HHLIB environment variable not set! See manual!\n")
        sys.exit(1)

def merge_databases(source_data_path, source_index_path, dest_data_path, dest_index_path):
    check_call(["ffindex_build", "-as", "-d", source_data_path, "-i", source_index_path,
                dest_data_path, dest_index_path])


def sort_database(data_path, index_path):
    check_call(["ffindex_build", "-as", data_path, index_path])
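
# Each .ffindex line has three tab-separated columns: entry name, byte offset
# into the .ffdata file, and entry length. read_ffindex and write_subset_index
# below operate on that format.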

def read_ffindex(file):
    fh = open(file, "r")
    index = []
    for line in fh:
        index.append(line.rstrip().split())
    fh.close()

    return index


def write_subset_index(index, subset, output_file):
    fh = open(output_file, "w")
    for entry in index:
        if entry[0] in subset:
            fh.write("{name:.64}\t{offset}\t{length}\n".format(name=entry[0], offset=entry[1], length=entry[2]))
    fh.close()

def is_sorted(index):
    for i in range(len(index) - 1):
        if index[i][0] > index[i + 1][0]:
            return False

    return True


def get_duplicates(index):
    duplicates = set()
    entries = set()
    for entry in index:
        if entry[0] in entries:
            duplicates.add(entry[0])
        entries.add(entry[0])

    return duplicates


def get_missing(index, cmp_index):
    missing = set()

    index_set = set()
    for entry in index:
        index_set.add(entry[0])

    cmp_index_set = set()
    for entry in cmp_index:
        cmp_index_set.add(entry[0])

    for key in cmp_index_set:
        if key not in index_set:
            missing.add(key)

    return missing


def get_overhead(index, cmp_index):
    overhead = set()

    index_set = set()
    for entry in index:
        index_set.add(entry[0])

    cmp_index_set = set()
    for entry in cmp_index:
        cmp_index_set.add(entry[0])

    for key in index_set:
        if key not in cmp_index_set:
            overhead.add(key)

    return overhead
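
# The handle_* functions below check the hhm and cs219 databases against the
# a3m database and, in --force mode, repair what they can: duplicate entries
# are recomputed from their a3m's, unsorted indices are re-sorted, missing
# cs219 entries are calculated, and entries without a corresponding a3m are
# dropped.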

def handle_duplicates(suffix, calculate, threads, db_basename, force_mode):
    index = read_ffindex(db_basename + "_" + suffix + ".ffindex")
    duplicates = get_duplicates(index)

    if suffix == "a3m" and len(duplicates) > 0:
        sys.stderr.write("ERROR: " + db_basename + "_a3m.ffindex contains duplicates!\n")
        sys.stderr.write("ERROR: Your database is broken!\n")
        sys.exit(1)

    if len(duplicates) == 0:
        return

    for duplicate in duplicates:
        sys.stderr.write("WARNING: " + db_basename + "_" + suffix + ".ffindex contains duplicate " + duplicate + "!\n")

    if force_mode:
        tmp_dir = tempfile.mkdtemp()
        try:
            sys.stderr.write("WARNING: remove duplicates and recalculate them from their corresponding a3m's!\n")
            a3m_index = read_ffindex(db_basename + "_a3m.ffindex")

            duplicates_a3m_base = os.path.join(tmp_dir, "duplicates_a3m")
            duplicates_a3m_index_file = os.path.join(tmp_dir, "duplicates_a3m.ffindex")
            duplicates_a3m_data_file = os.path.join(tmp_dir, "duplicates_a3m.ffdata")
            write_subset_index(a3m_index, duplicates, duplicates_a3m_index_file)
            os.symlink(os.path.abspath(db_basename + "_a3m.ffdata"), duplicates_a3m_data_file)

            duplicates_new_base = os.path.join(tmp_dir, "duplicates_new")
            duplicates_new_index_file = os.path.join(tmp_dir, "duplicates_new.ffindex")
            duplicates_new_data_file = os.path.join(tmp_dir, "duplicates_new.ffdata")

            duplicates_index_file = os.path.join(tmp_dir, "duplicates.dat")
            write_set_to_file(duplicates, duplicates_index_file)

            # Drop the duplicated entries, recompute them from their a3m's and
            # merge the fresh entries back into the affected database.
            remove_files_from_index(duplicates_index_file, db_basename + "_" + suffix + ".ffindex")
            calculate(threads, duplicates_a3m_base, duplicates_new_base)
            sort_database(duplicates_new_data_file, duplicates_new_index_file)
            merge_databases(duplicates_new_data_file, duplicates_new_index_file,
                            db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
            sort_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
            optimize_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
        finally:
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")

def handle_unsorted(suffix, db_basename, force_mode):
    index = read_ffindex(db_basename + "_" + suffix + ".ffindex")
    if not is_sorted(index):
        sys.stderr.write("Index " + db_basename + "_" + suffix + ".ffindex is unsorted!\n")
        if force_mode:
            sys.stderr.write("Try to sort unsorted index " + db_basename + "_" + suffix + ".ffindex!\n")
            sort_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
        else:
            sys.stderr.write("You may try to use the option --force to fix the database!\n")

def handle_missing(suffix, calculate, db_basename, threads, force_mode):
    index = read_ffindex(db_basename + "_" + suffix + ".ffindex")
    a3m_index = read_ffindex(db_basename + "_a3m.ffindex")
    missing = get_missing(index, a3m_index)

    if len(missing) == 0:
        return

    for f in missing:
        sys.stderr.write("WARNING: Missing entry " + f + " in " + db_basename + "_" + suffix + ".ff{data,index}!\n")

    if force_mode:
        sys.stderr.write("WARNING: Try to calculate missing entries!\n")
        tmp_dir = tempfile.mkdtemp()
        try:
            missing_base = os.path.join(tmp_dir, "missing_" + suffix)
            missing_index_file = os.path.join(tmp_dir, "missing_" + suffix + ".ffindex")
            missing_data_file = os.path.join(tmp_dir, "missing_" + suffix + ".ffdata")

            missing_a3m_base = os.path.join(tmp_dir, "missing_a3m")
            missing_a3m_index_file = os.path.join(tmp_dir, "missing_a3m.ffindex")
            missing_a3m_data_file = os.path.join(tmp_dir, "missing_a3m.ffdata")

            write_subset_index(a3m_index, missing, missing_a3m_index_file)
            os.symlink(os.path.abspath(db_basename + "_a3m.ffdata"), missing_a3m_data_file)

            calculate(threads, missing_a3m_base, missing_base)

            merge_databases(missing_data_file, missing_index_file,
                            db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
            optimize_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
        finally:
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")

def handle_overhead(suffix, db_basename, force_mode):
    index = read_ffindex(db_basename + "_" + suffix + ".ffindex")
    a3m_index = read_ffindex(db_basename + "_a3m.ffindex")
    overhead = get_overhead(index, a3m_index)

    if len(overhead) == 0:
        return

    for f in overhead:
        sys.stderr.write("WARNING: Entry " + f + " from " + db_basename + "_" + suffix + ".ff{data,index} has no corresponding entry in the a3m database!\n")

    if force_mode:
        sys.stderr.write("WARNING: Try to fix overhead entries!\n")
        tmp_dir = tempfile.mkdtemp()
        try:
            # delete entries that have no corresponding a3m
            index_file = os.path.join(tmp_dir, "to_delete.dat")
            write_set_to_file(overhead, index_file)
            remove_files_from_index(index_file, db_basename + "_" + suffix + ".ffindex")
            optimize_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
        finally:
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")

def check_a3m_format(db_basename, force_mode):
    entries = ffindex.read_index(db_basename + "_a3m.ffindex")
    data = ffindex.read_data(db_basename + "_a3m.ffdata")

    corrupted_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
        except:
            corrupted_alignments.add(entry.name)
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if len(corrupted_alignments) == 0:
        return

    if force_mode:
        tmp_dir = tempfile.mkdtemp()
        try:
            sys.stderr.write("WARNING: remove corrupted a3m's!\n")
            corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
            write_set_to_file(corrupted_alignments, corrupted_index_file)

            for suffix in ["a3m", "cs219", "hhm"]:
                remove_files_from_index(corrupted_index_file, db_basename + "_" + suffix + ".ffindex")
                sort_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
                optimize_database(db_basename + "_" + suffix + ".ffdata", db_basename + "_" + suffix + ".ffindex")
        finally:
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")

def check_database(output_basename, threads, force_mode):
    if not os.path.exists(output_basename + "_a3m.ffindex") or not os.path.exists(output_basename + "_a3m.ffdata"):
        sys.stderr.write("Error: No a3m database found!\n")
        sys.exit(1)

    if not os.path.exists(output_basename + "_cs219.ffindex") or not os.path.exists(output_basename + "_cs219.ffdata"):
        open(output_basename + "_cs219.ffindex", 'a').close()
        open(output_basename + "_cs219.ffdata", 'a').close()

    if not os.path.exists(output_basename + "_hhm.ffindex") or not os.path.exists(output_basename + "_hhm.ffdata"):
        calculate_hhm(threads, output_basename + "_a3m", output_basename + "_hhm")

    check_a3m_format(output_basename, force_mode)
    # TODO check hhm's...
    # TODO check cs219...

    handle_unsorted("a3m", output_basename, force_mode)
    handle_unsorted("hhm", output_basename, force_mode)
    handle_unsorted("cs219", output_basename, force_mode)

    handle_duplicates("a3m", calculate_hhm, threads, output_basename, force_mode)
    handle_duplicates("hhm", calculate_hhm, threads, output_basename, force_mode)
    handle_duplicates("cs219", calculate_cs219, threads, output_basename, force_mode)

    handle_missing("cs219", calculate_cs219, output_basename, threads, force_mode)

    handle_overhead("hhm", output_basename, force_mode)
    handle_overhead("cs219", output_basename, force_mode)
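
# add_new_files imports freshly supplied files into the database: it builds a
# temporary ffindex database from the glob matches and merges it into the
# existing one. When new a3m's are added, the corresponding entries are also
# removed from the hhm and cs219 indices so they cannot go stale.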

def add_new_files(globular_expression, suffix, output_basename):
    files = set(glob(globular_expression))
    if len(files) == 0:
        return

    tmp_dir = tempfile.mkdtemp()
    try:
        files_index = os.path.join(tmp_dir, "files.dat")
        write_set_to_file(files, files_index)

        new_base = os.path.join(tmp_dir, "new")
        new_index_file = new_base + ".ffindex"
        new_data_file = new_base + ".ffdata"

        build_ffindex_database(files_index, new_data_file, new_index_file)

        output_index_file = output_basename + "_" + suffix + ".ffindex"
        output_data_file = output_basename + "_" + suffix + ".ffdata"

        remove_files_from_index(files_index, output_index_file)
        merge_databases(new_data_file, new_index_file, output_data_file, output_index_file)
        optimize_database(output_data_file, output_index_file)

        if suffix == "a3m":
            # New a3m's invalidate existing hhm and cs219 entries of the same name.
            for other_suffix in ["hhm", "cs219"]:
                output_index_file = output_basename + "_" + other_suffix + ".ffindex"
                output_data_file = output_basename + "_" + other_suffix + ".ffdata"
                if os.path.exists(output_index_file) and os.path.exists(output_data_file) and os.path.getsize(output_data_file) > 0:
                    remove_files_from_index(files_index, output_index_file)
                    optimize_database(output_data_file, output_index_file)
    finally:
        shutil.rmtree(tmp_dir)

def opt():
    parser = OptionParser()
    parser.add_option("--ia3m", dest="a3m_files_glob",
                      help="Glob for a3m files", metavar="<GLOB>")
    parser.add_option("--ics219", dest="cs_files_glob",
                      help="Glob for cs219 files", metavar="<GLOB>")
    parser.add_option("--ihhm", dest="hhm_files_glob",
                      help="Glob for hhm files", metavar="<GLOB>")
    parser.add_option("-o", metavar="FILE", dest="output_basename",
                      help="Output hhsuite database basename")
    parser.add_option("--cpu", metavar="INT", dest="nr_cores",
                      help="Number of threads to use")
    parser.add_option("--force",
                      action="store_true", dest="force_mode", default=False,
                      help="Try to fix problems with the database")
    return parser

def check_options(options, parser):
    if not options.nr_cores:
        sys.stderr.write("Please use --cpu!\n")
        parser.print_help()
        sys.exit(1)

    if not options.output_basename:
        sys.stderr.write("Please use -o!\n")
        parser.print_help()
        sys.exit(1)

def main():
    parser = opt()
    (options, args) = parser.parse_args()
    check_options(options, parser)

    # Important: add the a3m's first, so that out-of-date hhm's and cs219's
    # are removed from their indices before the database is checked.
    if options.a3m_files_glob:
        add_new_files(options.a3m_files_glob, "a3m", options.output_basename)

    if options.hhm_files_glob:
        add_new_files(options.hhm_files_glob, "hhm", options.output_basename)

    if options.cs_files_glob:
        add_new_files(options.cs_files_glob, "cs219", options.output_basename)

    check_database(options.output_basename, options.nr_cores, options.force_mode)


if __name__ == "__main__":
    main()