123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484 |
- #!/usr/bin/env python
- """
- hhbsuite.py
- Creates HH-suite database files from A3M and HHM files
- Usage: Usage: python hhsuite_db.py -o <db_name> [-ia3m <a3m_dir>] [-ihhm <hhm_dir>] [-ics <cs_dir>] [more_options]
- HHsuite version 3.0.0 (15-03-2015)
- Reference:
- Remmert M., Biegert A., Hauser A., and Soding J.
- HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
- Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
- (C) Johannes Soeding, 2012
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- We are very grateful for bug reports! Please contact us at [email protected]
- """
- from optparse import OptionParser
- from glob import glob
- from subprocess import check_call
- import ffindex
- import a3m
- import tempfile
- import os
- import sys
- import shutil
- def write_glob_to_file(glob_expr, filename):
- fh = open(filename, "w")
- for f in glob(glob_expr):
- fh.write(f+"\n")
- fh.close()
- def write_set_to_file(set, filename):
- fh = open(filename, "w")
- for f in set:
- fh.write(f+"\n")
- fh.close()
- def remove_files_from_index(files_file, database_index_path):
- if(os.path.exists(database_index_path) and os.path.getsize(database_index_path) > 0):
- check_call(" ".join(["ffindex_modify -us -f ", files_file, database_index_path]), shell=True)
- def build_ffindex_database(files_file, data_path, index_path):
- check_call(" ".join(["ffindex_build","-s", data_path, index_path, "-f", files_file]), shell=True)
- def optimize_database(database_data_path, database_index_path):
- if os.path.exists(database_data_path) and os.path.exists(database_index_path) and os.path.getsize(database_data_path) > 0 and os.path.getsize(database_index_path) > 0:
- check_call(" ".join(["ffindex_build", "-as", database_data_path+".optimized", database_index_path+".optimized", "-d", database_data_path, "-i", database_index_path]), shell=True)
- shutil.move(database_data_path+".optimized", database_data_path)
- shutil.move(database_index_path+".optimized", database_index_path)
- def get_large_a3ms(a3m_base_path):
- entries = ffindex.read_index(a3m_base_path+".ffindex")
- data = ffindex.read_data(a3m_base_path+".ffdata")
-
- large_alignments = set()
- for entry in entries:
- lines = ffindex.read_lines(entry, data)
- alignment = a3m.A3M_Container()
- try:
- alignment.read_a3m_from_lines(lines)
-
- if alignment.number_sequences > 50:
- large_alignments.add(entry.name)
- except:
- sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")
- if len(entries) > 0 and len(large_alignments) == 0:
- large_alignments.add(entries[0].name)
- return large_alignments
- def calculate_hhm(threads, a3m_base_path, hhm_base_path):
- tmp_dir = tempfile.mkdtemp()
-
- large_a3ms = get_large_a3ms(a3m_base_path)
- large_a3m_index = os.path.join(tmp_dir, "large.ffindex")
- a3m_index = read_ffindex(a3m_base_path+".ffindex")
- write_subset_index(a3m_index, large_a3ms, large_a3m_index)
-
- check_call(" ".join(["mpirun", "-np", threads, "ffindex_apply_mpi", a3m_base_path+".ffdata", large_a3m_index, "-d", hhm_base_path+".ffdata", "-i", hhm_base_path+".ffindex", "--", "hhmake", "-v", str(0), "-i", "stdin", "-o" ,"stdout"]), shell=True)
- def calculate_cs219(threads, a3m_db_base, cs_db_base):
- hhlib_environment = os.environ['HHLIB']
- if(hhlib_environment):
- #TODO: check
- check_call(" ".join(["cstranslate", "-A ", os.path.join(hhlib_environment, "data/cs219.lib"), "-D", os.path.join(hhlib_environment, "data/context_data.lib"), "-x 0.3 -c 4 --ffindex", "-i", a3m_db_base, "-o", cs_db_base, "-I a3m -b"]), env=dict(os.environ, OMP_NUM_THREADS=threads), shell=True)
- else:
- sys.err("ERROR: HHLIB environment variable not set! See manual!\n")
- exit(1)
- #TODO throw error
- def merge_databases(source_data_path, source_index_path, dest_data_path, dest_index_path):
- check_call(["ffindex_build", "-as", "-d", source_data_path, "-i", source_index_path, dest_data_path, dest_index_path])
- def sort_database(data_path, index_path):
- check_call(["ffindex_build", "-as", data_path, index_path])
- def read_ffindex(file):
- fh = open(file, "r")
-
- index = []
- for line in fh:
- index.append(line.rstrip().split())
-
- fh.close()
-
- return index
- def write_subset_index(index, subset, output_file):
- fh = open(output_file, "w")
- for entry in index:
- if entry[0] in subset:
- fh.write("{name:.64}\t{offset}\t{length}\n".format(name=entry[0], offset=entry[1], length=entry[2]))
- fh.close()
- def is_sorted(index):
- for i in range(len(index)-1):
- if(index[i][0] > index[i+1][0]):
- return False
- return True
- def get_duplicates(index):
- duplicates = set()
- entries = set()
-
- for entry in index:
- if(entry[0] in entries):
- duplicates.add(entry[0])
-
- entries.add(entry[0])
- return duplicates
- def get_missing(index, cmp_index):
- missing = set()
-
- index_set = set()
- for entry in index:
- index_set.add(entry[0])
-
- cmp_index_set = set()
- for entry in cmp_index:
- cmp_index_set.add(entry[0])
-
- for key in cmp_index_set:
- if key not in index_set:
- missing.add(key)
-
- return missing
- def get_overhead(index, cmp_index):
- overhead = set()
-
- index_set = set()
- for entry in index:
- index_set.add(entry[0])
-
- cmp_index_set = set()
- for entry in cmp_index:
- cmp_index_set.add(entry[0])
-
- for key in index_set:
- if key not in cmp_index_set:
- overhead.add(key)
-
- return overhead
- def handle_duplicates(suffix, calculate, threads, db_basename, force_mode):
- index = read_ffindex(db_basename+"_"+suffix+".ffindex")
- duplicates = get_duplicates(index)
-
- if(suffix == "a3m" and len(duplicates) > 0):
- sys.stderr.write("ERROR: "+db_basename+"_a3m.ffindex contains duplicates!\n")
- sys.stderr.write("ERROR: Your database is broken!\n")
- exit(1);
-
- if(len(duplicates) == 0):
- return
-
- for duplicate in duplicates:
- sys.stderr.write("WARNING: "+db_basename+"_"+suffix+".ffindex contains duplicate "+duplicate+"!\n")
-
- if force_mode:
- tmp_dir = tempfile.mkdtemp()
-
- try:
- sys.stderr.write("WARNING: remove duplicates and recalculate them from their corresponding a3m's!\n")
- a3m_index = read_ffindex(db_basename+"_a3m.ffindex")
-
- duplicates_a3m_base = os.path.join(tmp_dir, "duplicates_a3m")
- duplicates_a3m_index_file = os.path.join(tmp_dir, "duplicates_a3m.ffindex")
- duplicates_a3m_data_file = os.path.join(tmp_dir, "duplicates_a3m.ffdata")
- write_subset_index(a3m_index, duplicates, duplicates_a3m_index_file)
- os.symlink(db_basename+"_a3m.ffdata", duplicates_a3m_data_file)
-
- duplicates_new_base = os.path.join(tmp_dir, "duplicates_new")
- duplicates_new_index_file = os.path.join(tmp_dir, "duplicates_new.ffindex")
- duplicates_new_data_file = os.path.join(tmp_dir, "duplicates_new.ffdata")
-
- duplicates_index_file = os.path.join(tmp_dir, "duplicates.dat")
- write_set_to_file(duplicates, duplicates_index_file)
- remove_files_from_index(duplicates_index_file, db_basename+"_"+suffix+".ffindex")
-
- calculate(threads, duplicates_a3m_base, duplicates_new_base)
- sort_database(duplicates_new_data_file, duplicates_new_index_file)
-
- merge_databases(duplicates_new_data_file, duplicates_new_index_file,
- db_basename+"_cs219.ffdata", db_basename+"_cs219.ffindex")
- sort_database(db_basename+"_cs219.ffdata", db_basename+"_cs219.ffindex")
- optimize_database(db_basename+"_cs219.ffdata", db_basename+"_cs219.ffindex")
- finally:
- shutil.rmtree(tmp_dir)
- else:
- sys.stderr.write("You may try to use the option --force to fix the database!\n")
-
- def handle_unsorted(suffix, db_basename, force_mode):
- index = read_ffindex(db_basename+"_"+suffix+".ffindex")
- if(not is_sorted(index)):
- sys.stderr.write("Index "+db_basename+"_"+suffix+".ffindex is unsorted!\n")
- if force_mode:
- sys.stderr.write("Try to sort unsorted index "+db_basename+"_"+suffix+".ffindex!\n")
- sort_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- else:
- sys.stderr.write("You may try to use the option --force to fix the database!\n")
-
- def handle_missing(suffix, calculate, db_basename, threads, force_mode):
- index = read_ffindex(db_basename+"_"+suffix+".ffindex")
- a3m_index = read_ffindex(db_basename+"_a3m.ffindex")
-
- missing = get_missing(index, a3m_index)
-
- if(len(missing) == 0):
- return
-
- for f in missing:
- sys.stderr.write("WARNING: Missing entry "+f+" in "+db_basename+"_"+suffix+".ff{data,index}!\n")
-
- if force_mode:
- sys.stderr.write("WARNING: Try to calculate missing entries!\n")
- tmp_dir = tempfile.mkdtemp()
-
- try:
- missing_base = os.path.join(tmp_dir, "missing_"+suffix)
- missing_index_file = os.path.join(tmp_dir, "missing_"+suffix+".ffindex")
- missing_data_file = os.path.join(tmp_dir, "missing_"+suffix+".ffdata")
-
- missing_a3m_base = os.path.join(tmp_dir, "missing_a3m")
- missing_a3m_index_file = os.path.join(tmp_dir, "missing_a3m.ffindex")
- missing_a3m_data_file = os.path.join(tmp_dir, "missing_a3m.ffdata")
- write_subset_index(a3m_index, missing, missing_a3m_index_file)
- os.symlink(os.path.abspath(db_basename+"_a3m.ffdata"), missing_a3m_data_file)
-
- calculate(threads, missing_a3m_base, missing_base)
-
- merge_databases(missing_data_file, missing_index_file, db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- optimize_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- finally:
- shutil.rmtree(tmp_dir)
- else:
- sys.stderr.write("You may try to use the option --force to fix the database!\n")
- def handle_overhead(suffix, db_basename, force_mode):
- index = read_ffindex(db_basename+"_"+suffix+".ffindex")
- a3m_index = read_ffindex(db_basename+"_a3m.ffindex")
- overhead = get_overhead(index, a3m_index)
- #delete overhead cs219 files
- if(len(overhead) == 0):
- return
- for f in overhead:
- sys.stderr.write("WARNING: Entry "+f+" from "+db_basename+"_"+suffix+".ff{data,index} has no corresponding entry in the a3m database!\n")
- if force_mode:
- sys.stderr.write("WARNING: Try to fix overhead entries!\n")
- tmp_dir = tempfile.mkdtemp()
-
- try:
- index_file = os.path.join(tmp_dir, "to_delete.dat")
- write_set_to_file(overhead, index_file)
- remove_files_from_index(index_file, db_basename+"_"+suffix+".ffindex")
- optimize_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- finally:
- shutil.rmtree(tmp_dir)
- else:
- sys.stderr.write("You may try to use the option --force to fix the database!\n")
- def check_a3m_format(db_basename, force_mode):
- entries = ffindex.read_index(db_basename+"_a3m.ffindex")
- data = ffindex.read_data(db_basename+"_a3m.ffdata")
-
- corrupted_alignments = set()
- for entry in entries:
- lines = ffindex.read_lines(entry, data)
- alignment = a3m.A3M_Container()
- try:
- alignment.read_a3m_from_lines(lines)
- except:
- corrupted_alignments.add(entry.name)
- sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")
-
- if len(corrupted_alignments) == 0:
- return
-
- if force_mode:
- tmp_dir = tempfile.mkdtemp()
-
- try:
- sys.stderr.write("WARNING: remove corrupted a3m's!\n")
-
- corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
- write_set_to_file(corrupted_alignments, corrupted_index_file)
-
- for suffix in ["a3m", "cs219", "hhm"]:
- remove_files_from_index(corrupted_index_file, db_basename+"_"+suffix+".ffindex")
- sort_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- optimize_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
- finally:
- shutil.rmtree(tmp_dir)
- else:
- sys.stderr.write("You may try to use the option --force to fix the database!\n")
-
- def check_database(output_basename, threads, force_mode):
- if not os.path.exists(output_basename+"_a3m.ffindex") or not os.path.exists(output_basename+"_a3m.ffdata"):
- sys.stderr.write("Error: No a3m database found!")
- exit(1)
-
- if not os.path.exists(output_basename+"_cs219.ffindex") or os.path.exists(output_basename+"_cs219.ffdata"):
- open(output_basename+"_cs219.ffindex", 'a').close()
- open(output_basename+"_cs219.ffdata", 'a').close()
-
- if not os.path.exists(output_basename+"_hhm.ffindex") or os.path.exists(output_basename+"_hhm.ffdata"):
- calculate_hhm(threads, output_basename+"_a3m", output_basename+"_hhm")
- check_a3m_format(output_basename, force_mode)
- #TODO check hhm's...
- #TODO check cs219...
-
- handle_unsorted("a3m", output_basename, force_mode)
- handle_unsorted("hhm", output_basename, force_mode)
- handle_unsorted("cs219", output_basename, force_mode)
-
- handle_duplicates("a3m", calculate_hhm, threads, output_basename, force_mode)
- handle_duplicates("hhm", calculate_hhm, threads, output_basename, force_mode)
- handle_duplicates("cs219", calculate_cs219, threads, output_basename, force_mode)
- handle_missing("cs219", calculate_cs219, output_basename, threads, force_mode)
-
- handle_overhead("hhm", output_basename, force_mode)
- handle_overhead("cs219", output_basename, force_mode)
- def add_new_files(globular_expression, suffix, output_basename):
- tmp_dir = tempfile.mkdtemp()
- files = set(glob(globular_expression))
-
- if(len(files) == 0):
- return
- try:
- files_index = os.path.join(tmp_dir, "files.dat")
- write_set_to_file(files, files_index)
-
- new_base = os.path.join(tmp_dir, "new")
- new_index_file = new_base + ".ffindex"
- new_data_file = new_base + ".ffdata"
- build_ffindex_database(files_index, new_data_file, new_index_file)
-
- output_index_file = output_basename+"_"+suffix+".ffindex"
- output_data_file = output_basename+"_"+suffix+".ffdata"
- remove_files_from_index(files_index, output_index_file)
- merge_databases(new_data_file, new_index_file, output_data_file, output_index_file)
- optimize_database(output_data_file, output_index_file)
-
- if(suffix == "a3m"):
- for other_suffix in ["hhm", "cs219"]:
- output_index_file = output_basename+"_"+other_suffix+".ffindex"
- output_data_file = output_basename+"_"+other_suffix+".ffdata"
- if os.path.exists(output_index_file) and os.path.exists(output_data_file) and os.path.getsize(output_data_file) > 0:
- remove_files_from_index(files_index, output_index_file)
- optimize_database(output_data_file, output_index_file)
-
- finally:
- shutil.rmtree(tmp_dir)
- def opt():
- parser = OptionParser()
- parser.add_option("--ia3m", dest="a3m_files_glob",
- help="Glob for a3m files", metavar="<GLOB>")
- parser.add_option("--ics219", dest="cs_files_glob",
- help="Glob for cs219 files", metavar="<GLOB>")
- parser.add_option("--ihhm", dest="hhm_files_glob",
- help="Glob for hhm files", metavar="<GLOB>")
- parser.add_option("-o", metavar="FILE", dest="output_basename",
- help="Output hhsuite database basename")
- parser.add_option("--cpu", metavar="INT", dest="nr_cores",
- help="Number of threads that shall used")
- parser.add_option("--force",
- action="store_true", dest="force_mode", default=False,
- help="Try to fix problems with the database")
- return parser
-
- def check_options(options, parser):
- if not options.nr_cores:
- sys.stderr.write("Please use --cpu!\n")
- parser.print_help()
- sys.exit(1)
-
- if not options.output_basename:
- sys.stderr.write("Please use -o!\n")
- parser.print_help()
- sys.exit(1)
- def main():
- parser = opt()
- (options, args) = parser.parse_args()
- check_options(options, parser)
-
- #Important to do a3m's first... deleting out of date hhm's and cs219's
- if(options.a3m_files_glob):
- add_new_files(options.a3m_files_glob, "a3m", options.output_basename)
-
- if(options.hhm_files_glob):
- add_new_files(options.hhm_files_glob, "hhm", options.output_basename)
- if(options.cs_files_glob):
- add_new_files(options.cs_files_glob, "cs219", options.output_basename)
- check_database(options.output_basename, options.nr_cores, options.force_mode)
-
- if __name__ == "__main__":
- main()
-
|