createdb.sh 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  #!/bin/sh
  # createdb.sh - create an HH-suite database from MMseqs2 input.
  #
  # Usage: createdb.sh input output [threads] '[mpirun --mpi-params]'
  #   $1 input    MMseqs2 database passed to `mmseqs apply` (in the usage example
  #               below this is the result of `mmseqs createseqfiledb`)
  #   $2 output   prefix of every file written by this script
  #   $3 threads  optional; forwarded to mmseqs as `--threads <value>`
  #   $4 mpi      optional launcher command line that is placed in front of the
  #               mmseqs calls and selects cstranslate_mpi over cstranslate
  #
  # External tools called: mmseqs, famsa, hhconsensus, hhmake, cstranslate or
  # cstranslate_mpi, awk, sort, cut.
  # A temporary helper script "make_msa.sh" is written to the current directory.
  # Exits 1 with a usage message when fewer than two arguments are given.

  # errexit is enabled here instead of on the shebang line so that it is also
  # in effect when the script is started as `sh createdb.sh ...`.
  set -e

  # Validate the argument count before any positional parameter is consumed.
  if [ "$#" -lt 2 ]; then
    echo "createdb.sh input output [threads] '[mpirun --mpi-params]'"
    echo "Create HH-suite database from MMseqs2 input"
    echo "Example:"
    echo " conda create -n hhdb mmseqs2 famsa hhsuite"
    echo " conda activate hhdb"
    echo " mmseqs createdb input.fas input"
    echo " mmseqs cluster input clu tmp"
    echo " mmseqs createseqfiledb input clu seqfiledb"
    echo " createdb.sh seqfiledb hhdatabase"
    exit 1
  fi

  INPUT="$1"
  OUTPUT="$2"
  # File-name part of the output prefix. A symlink target is resolved relative
  # to the directory that holds the link, and every link below sits next to its
  # target, so the target must be given without the directory part of OUTPUT.
  OUT_BASE="${OUTPUT##*/}"

  THREADS=""
  if [ -n "$3" ]; then
    THREADS="--threads $3"
  fi

  MPI="$4"
  # Disabled check that would drop MPI when its launcher is not on PATH:
  #command -v "$(echo "$MPI" | cut -d " " -f1)" >/dev/null 2>&1
  #if [ "$?" -eq "1" ]; then
  # echo "MPI will not be used"
  # unset MPI
  #fi

  # Helper executed by `mmseqs apply` for each entry: align with famsa, then
  # pipe the alignment through hhconsensus. It is written to the current
  # directory and invoked as ./make_msa.sh.
  printf '%s\n' \
    '#!/bin/sh -e' \
    'famsa -t 1 STDIN STDOUT | hhconsensus -maxres 65535 -i stdin -o stdout -v 0 2> /dev/null' \
    > "make_msa.sh"
  chmod +x make_msa.sh
  # Remove the helper even when the apply step aborts the script under errexit.
  trap 'rm -f "make_msa.sh"' EXIT

  echo "calculate alignments"
  # MPI and THREADS are deliberately unquoted: each holds several words that
  # must become separate arguments (and must vanish entirely when empty).
  # shellcheck disable=SC2086
  $MPI mmseqs apply "${INPUT}" "${OUTPUT}_a3m" $THREADS -- "./make_msa.sh" 2> /dev/null
  rm -f "make_msa.sh"
  trap - EXIT

  echo "calculate hidden markov models of large MSAs"
  # Per entry, the awk program prints "1" once more than 51 lines starting with
  # ">" have been seen and prints nothing otherwise.
  # shellcheck disable=SC2086
  $MPI mmseqs apply "${OUTPUT}_a3m" "${OUTPUT}_hhm_sizes" $THREADS -- awk '/^>/{cnt++;} cnt>51 {print "1"; exit }'
  # Keep only the a3m index rows whose key has a third column > 1 in the
  # hhm_sizes index (presumably the stored entry length, so > 1 would mean the
  # awk above printed something - confirm against the MMseqs2 index format).
  awk 'FNR == NR { if ($3 > 1) f[$1] = 1; next } $1 in f { print }' \
    "${OUTPUT}_hhm_sizes.index" "${OUTPUT}_a3m.index" > "${OUTPUT}_a3m_large.index"
  # Expose the a3m data under the "_a3m_large" name so it pairs with the
  # filtered index written above.
  ln -fs "${OUT_BASE}_a3m" "${OUTPUT}_a3m_large"
  # shellcheck disable=SC2086
  $MPI mmseqs apply "${OUTPUT}_a3m_large" "${OUTPUT}_hhm" $THREADS -- hhmake -i stdin -o stdout -v 0
  # Drop the intermediates of this step. The hhm_sizes data and index are no
  # longer read below; the .dbtype and .ffindex names are removed only in case
  # they exist (rm -f ignores missing files).
  rm -f "${OUTPUT}_a3m_large" "${OUTPUT}_a3m_large.index" \
    "${OUTPUT}_hhm_sizes" "${OUTPUT}_hhm_sizes.index" \
    "${OUTPUT}_hhm_sizes.dbtype" "${OUTPUT}_hhm_sizes.ffindex"

  echo "calculate context states"
  # Make the a3m database reachable under .ffdata/.ffindex names as well.
  ln -fs "${OUT_BASE}_a3m" "${OUTPUT}_a3m.ffdata"
  ln -fs "${OUT_BASE}_a3m.index" "${OUTPUT}_a3m.ffindex"
  if [ -z "$MPI" ]; then
    cstranslate -x 0.3 -c 4 --ffindex -I a3m -i "${OUTPUT}_a3m" -o "${OUTPUT}_cs219" > /dev/null
  else
    # shellcheck disable=SC2086
    $MPI cstranslate_mpi -x 0.3 -c 4 -I a3m -i "${OUTPUT}_a3m" -o "${OUTPUT}_cs219" 2> /dev/null
  fi
  # Hard-link the cs219 .ffdata/.ffindex pair to the data/.index names that the
  # sort below reads.
  ln -f "${OUTPUT}_cs219.ffdata" "${OUTPUT}_cs219"
  ln -f "${OUTPUT}_cs219.ffindex" "${OUTPUT}_cs219.index"

  echo "reorder databases for faster access"
  # Keys of the cs219 index ordered numerically by its third column.
  sort -k 3 -n "${OUTPUT}_cs219.index" | cut -f1 > "${OUTPUT}_sorted.tsv"
  for type in a3m hhm; do
    # Rewrite the database in the order given by the sorted key list.
    mmseqs createsubdb "${OUTPUT}_sorted.tsv" "${OUTPUT}_${type}" "${OUTPUT}_${type}_opt" -v 0
    mv -f "${OUTPUT}_${type}_opt" "${OUTPUT}_${type}.ffdata"
    # The final index is sorted bytewise by key.
    # NOTE: for a3m, "${OUTPUT}_a3m.ffindex" is the symlink created above, so
    # this redirection writes through it into "${OUTPUT}_a3m.index".
    LC_ALL=C sort -k1,1 "${OUTPUT}_${type}_opt.index" > "${OUTPUT}_${type}.ffindex"
    rm -f "${OUTPUT}_${type}_opt.index"
  done
  rm -f "${OUTPUT}_sorted.tsv"