cif2fasta.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793
  1. #!/usr/bin/env python
  2. """
  3. Created on Mon Jun 15 21:49:32 2015
  4. @author: Harald Voehringer
  5. """
  6. from __future__ import print_function
  7. import sys, os, glob, gzip, textwrap, itertools
  8. from optparse import OptionParser
  9. from collections import defaultdict
  10. from os.path import splitext
  11. from pdbx.reader.PdbxReader import PdbxReader
  12. from multiprocessing import Pool
  13. DEBUG_MODE = False
  14. MIN_SEQ_LEN = None
  15. SCOP_LIBRARY = False
  16. THREE2ONE = {
  17. 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I', 'PRO': 'P',
  18. 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R',
  19. 'TRP': 'W', 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M', 'MSE': 'M',
  20. 'HYP': 'P', 'MLY': 'K', 'SEP': 'S', 'TPO': 'T', 'CSO': 'C', 'PTR': 'Y', 'KCX': 'K',
  21. 'CME': 'C', 'CSD': 'A', 'CAS': 'C', 'MLE': 'L', 'DAL': 'A', 'CGU': 'E', 'DLE': 'L',
  22. 'FME': 'M', 'DVA': 'V', 'OCS': 'C', 'DPR': 'P', 'MVA': 'V', 'TYS': 'Y', 'M3L': 'K',
  23. 'SMC': 'C', 'ALY': 'K', 'CSX': 'C', 'DCY': 'C', 'NLE': 'L', 'DGL': 'E', 'DSN': 'S',
  24. 'CSS': 'C', 'DLY': 'K', 'MLZ': 'K', 'DPN': 'F', 'DAR': 'R', 'PHI': 'F', 'IAS': 'D',
  25. 'DAS': 'D', 'HIC': 'H', 'MP8': 'P', 'DTH': 'T', 'DIL': 'I', 'MEN': 'N', 'DTY': 'Y',
  26. 'CXM': 'M', 'DGN': 'G', 'DTR': 'W', 'SAC': 'S', 'DSG': 'N', 'MME': 'M', 'MAA': 'A',
  27. 'YOF': 'Y', 'FP9': 'P', 'FVA': 'V', 'MLU': 'L', 'OMY': 'Y', 'FGA': 'E', 'MEA': 'F',
  28. 'CMH': 'C', 'DHI': 'H', 'SEC': 'C', 'OMZ': 'Y', 'SCY': 'C', 'MHO': 'M', 'MED': 'M',
  29. 'CAF': 'C', 'NIY': 'Y', 'OAS': 'S', 'SCH': 'C', 'MK8': 'L', 'SME': 'M', 'LYZ': 'K'
  30. }
  31. CANONICAL_RESIDUES = set(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
  32. 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])
  33. class CIF2FASTA(object):
  34. def __init__(self, cif_path):
  35. self.cif_path = cif_path
  36. self.block = self.open_cif()
  37. def open_cif(self):
  38. """ Assumes a mmCIF or gzipped mmCIF file and returns a data block used for subsequent procedures. """
  39. # The "usual" procedure to open a mmCIF with pdbX/mmCIF
  40. data = []
  41. try:
  42. with gzip.open(self.cif_path) as cif_fh:
  43. reader = PdbxReader(cif_fh)
  44. reader.read(data)
  45. except IOError:
  46. with open(self.cif_path) as cif_fh:
  47. reader = PdbxReader(cif_fh)
  48. reader.read(data)
  49. if len(data) == 0:
  50. return None
  51. else:
  52. return data[0]
  53. def is_valid(self):
  54. return self.block is not None
  55. def chain_to_seq(self):
  56. """Extracts the sequence of the cif from entity_poly.pdbx_seq_one_letter_code"""
  57. cif_chain_to_seq = dict()
  58. non_polypeptide_chains = list()
  59. try:
  60. entity_poly = self.block.getObj('entity_poly')
  61. except AttributeError:
  62. if DEBUG_MODE > 0:
  63. print ('! {pdb} Could not extract entity_poly table.'.format(
  64. pdb = self.pdb_entry()))
  65. return False
  66. try:
  67. total_rows = entity_poly.getRowCount()
  68. except AttributeError:
  69. print ('! {pdb} Could not extract rows from entity_poly.'.format(
  70. pdb = self.pdb_entry()))
  71. return False
  72. for row in range(0, total_rows):
  73. if entity_poly.getValue('type', row) == 'polypeptide(L)':
  74. seq = entity_poly.getValue('pdbx_seq_one_letter_code', row)
  75. parsed_seq = parse_seq(seq) # removes special amino acids and newlines
  76. try:
  77. chains = entity_poly.getValue('pdbx_strand_id', row)
  78. except ValueError:
  79. if total_rows == 1:
  80. print ('! {pdb} Only one polypeptide chain, but no chain identifiers, setting it to ".".'.format(
  81. pdb = self.pdb_entry()))
  82. cif_chain_to_seq['.'] = parsed_seq
  83. return cif_chain_to_seq
  84. print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (polypeptide).'.format(
  85. pdb = self.pdb_entry()))
  86. return False
  87. chain_list = chains.split(',')
  88. for chain in chain_list:
  89. cif_chain_to_seq[chain] = parsed_seq
  90. else:
  91. try:
  92. chains = entity_poly.getValue('pdbx_strand_id', row)
  93. except ValueError:
  94. print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (non-polypeptide).'.format(
  95. pdb = self.pdb_entry()))
  96. return False
  97. non_polypeptide_chains.append(chains)
  98. chains = list(cif_chain_to_seq.keys())
  99. # remove chains that contain only unknown residues
  100. for chain in chains:
  101. # this is a very odd way to check whether a string contains only a single char
  102. tmp_set = set(cif_chain_to_seq[chain])
  103. if len(tmp_set) == 1 and 'X' in tmp_set:
  104. print ('! Removing {pdb}_{chain} (contains only unknown residues).'.format(
  105. pdb = self.pdb_entry(),
  106. chain = chain))
  107. del cif_chain_to_seq[chain]
  108. continue
  109. if len(cif_chain_to_seq[chain]) < MIN_SEQ_LEN:
  110. print ('! Removing {pdb}_{chain} (sequence length < {min_len}).'.format(
  111. pdb = self.pdb_entry(),
  112. chain = chain,
  113. min_len = MIN_SEQ_LEN))
  114. del cif_chain_to_seq[chain]
  115. if len(cif_chain_to_seq) != 0:
  116. if DEBUG_MODE > 1:
  117. print ('- Extracted chains of {pdb} {chains}.'.format(
  118. pdb = self.pdb_entry(),
  119. chains = ' '.join( str(chain) + ' (' + str(len(cif_chain_to_seq[chain])) + ')' for chain in cif_chain_to_seq.keys())))
  120. if len(non_polypeptide_chains) != 0:
  121. print ('- Following chains were non polypeptide chains {chains} no polypeptide chains were found.'.format(
  122. chains = ', '.join(non_polypeptide_chains)))
  123. return cif_chain_to_seq
  124. else:
  125. if DEBUG_MODE > 0:
  126. print ('! {pdb} No polypeptide chains were found.'.format(
  127. pdb = self.pdb_entry()))
  128. return False
  129. def chain_ratios(self, chain_to_seq):
  130. """ Tries to extract Sequence from the atom section """
  131. # chain_to_seq = self.chain_to_seq()
  132. if chain_to_seq != False:
  133. chain_ratios = dict()
  134. # compute the lengths of sequences found in _entity_poly
  135. entity_length = { chain : float(len(seq)) for chain, seq in chain_to_seq.items() }
  136. entity_chains = entity_length.keys()
  137. # load the atomsite and set up dictionary to keep track of sequences
  138. atom_site = self.block.getObj('atom_site')
  139. atom_seq = defaultdict(str)
  140. current_residue = 0
  141. # Iterate through the atomsection of the cif file
  142. for atom_row in range(0, atom_site.getRowCount()):
  143. # NMR structures contain many confomers
  144. try:
  145. model_num = int(atom_site.getValue('pdbx_PDB_model_num', atom_row))
  146. except ValueError:
  147. model_num = 1
  148. if model_num > 1:
  149. continue
  150. atom_chain = atom_site.getValue('label_asym_id', atom_row)
  151. # get the alternative chain identifier too
  152. try:
  153. alt_chain = atom_site.getValue('auth_asym_id', atom_row)
  154. except ValueError:
  155. alt_chain = None
  156. # handle cases where there are no chains but only one structure
  157. if atom_chain == '.' and entity_chains[0] == '.':
  158. atom_chain = '.'
  159. # get the residue and the residue number
  160. try:
  161. res_num = int(atom_site.getValue("label_seq_id", atom_row))
  162. except ValueError:
  163. continue
  164. if res_num != current_residue:
  165. residue = atom_site.getValue('label_comp_id', atom_row)
  166. try:
  167. residue = THREE2ONE[residue]
  168. except KeyError:
  169. residue = 'X'
  170. # try to get the chain identifier from alt_chain first, if this does not work use label_asym_id
  171. if alt_chain is not None:
  172. atom_seq[alt_chain] += residue
  173. # sometimes we find the right chain identifier not in the alt_chain
  174. if not (atom_chain in atom_seq.keys()) and atom_chain is not None:
  175. atom_seq[atom_chain] += residue
  176. current_residue = res_num
  177. for chain in entity_length.keys():
  178. if chain in atom_seq.keys():
  179. chain_ratios[chain] = len(atom_seq[chain]) / entity_length[chain]
  180. else:
  181. chain_ratios[chain] = 0
  182. return chain_ratios
  183. else:
  184. return False
  185. def pdb_entry(self):
  186. """Extracts the PDB entry information of a cif file."""
  187. try:
  188. entry = self.block.getObj('entry')
  189. entry_id = entry.getValue('id')
  190. return entry_id.replace('\n', ' ')
  191. except AttributeError:
  192. if DEBUG_MODE > 0:
  193. print ('! {pdb} Could not extract id from entry.'.format(
  194. pdb = self.pdb_entry()))
  195. def protein_description(self):
  196. """Extracts the protein description annotated in struct.pdbx_descriptor of the cif file."""
  197. try:
  198. # Get struct table which contains the protein description
  199. struct = self.block.getObj('struct')
  200. # Get the pdbx description and make format it appropritaly
  201. protein_description = struct.getValue('pdbx_descriptor')
  202. protein_description = protein_description.replace('\n', ' ')
  203. protein_description = protein_description.replace(';', ' ') # to prevent parsing errors
  204. if len(protein_description.split(' ')) >= 5:
  205. protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header
  206. return protein_description.strip(',')
  207. except AttributeError:
  208. if DEBUG_MODE > 1:
  209. print ('! {pdb} Could not extract pdbx_descriptor from struct table.'.format(
  210. pdb = self.pdb_entry()))
  211. return False
  212. def compounds(self):
  213. """ Extracts all compounds annotated in the HETATM section of the atom
  214. struct table if the compound appears at least 10 times and is not water
  215. (HOH)."""
  216. atom_site = self.block.getObj('atom_site')
  217. compounds = {}
  218. for row in range(0, atom_site.getRowCount()):
  219. if atom_site.getValue('group_PDB', row) == 'HETATM':
  220. label_comp_id = atom_site.getValue('label_comp_id', row)
  221. if label_comp_id not in compounds.keys():
  222. compounds[label_comp_id] = 1
  223. else:
  224. compounds[label_comp_id] += 1
  225. filtered_compounds = set()
  226. for compound in compounds.keys():
  227. if compounds[compound] >= 10 and compound != 'HOH':
  228. filtered_compounds.add(compound)
  229. if len(filtered_compounds) == 0:
  230. return False
  231. else:
  232. return ', '.join(filtered_compounds).replace('\n', ' ')
  233. def resolution(self):
  234. """Extracts the resolution of the mmCIF."""
  235. try:
  236. refine = self.block.getObj('refine')
  237. resolution = refine.getValue('ls_d_res_high')
  238. try:
  239. resolution = float(resolution)
  240. except ValueError:
  241. return False
  242. return resolution
  243. except AttributeError:
  244. if DEBUG_MODE > 1:
  245. print ('! {pdb} Could not extract ls_d_res_high from refine table.'.format(
  246. pdb = self.pdb_entry()))
  247. try:
  248. reflns = self.block.getObj('reflns')
  249. # Extract the resolution of the crystal
  250. resolution = reflns.getValue('d_resolution_high')
  251. try:
  252. resolution = float(resolution)
  253. except ValueError:
  254. return False
  255. return resolution
  256. except AttributeError:
  257. if DEBUG_MODE > 1:
  258. print ('! {pdb} Could not extract d_resolution_high from reflns table.'.format(
  259. pdb = self.pdb_entry()))
  260. # This is true for some Electron Microscopy structures
  261. try:
  262. em_3d = self.block.getObj('em_3d_reconstruction')
  263. resolution = em_3d.getValue('resolution')
  264. try:
  265. resolution = float(resolution)
  266. except ValueError:
  267. return False
  268. return resolution
  269. except AttributeError:
  270. if DEBUG_MODE > 1:
  271. print ('! {pdb} Could not extract resolution from em_3d_reconstruction table.'.format(
  272. pdb = self.pdb_entry()))
  273. return False
  274. def experimental_method(self):
  275. """Extracts the experimental method of the mmCIF."""
  276. try:
  277. reflns = self.block.getObj('exptl')
  278. method = reflns.getValue('method')
  279. return method.replace('\n', ' ')
  280. except AttributeError:
  281. if DEBUG_MODE > 1:
  282. print ('! Could not extract text from exptl table.'.format(
  283. pdb = self.pdb_entry()))
  284. return False
  285. def keywords(self):
  286. """Extracts the keywords of the mmCIF."""
  287. try:
  288. reflns = self.block.getObj('struct_keywords')
  289. keywords = reflns.getValue('text')
  290. # perform some string modifications
  291. keywords = keywords.replace('\n', ' ')
  292. keywords = keywords.replace(';', ' ')
  293. if len(keywords.split(' ')) >= 5:
  294. keywords = ' '.join(keywords.split(' ')[0:5])
  295. return keywords.rstrip(',')
  296. except AttributeError:
  297. if DEBUG_MODE > 1:
  298. print ('! {pdb} Could not extract text from struct_keywords table.'.format(
  299. pdb = self.pdb_entry()))
  300. return False
  301. def organism(self):
  302. """Extracts the organism of the mmCIF."""
  303. try:
  304. entity_src_nat = self.block.getObj('entity_src_nat')
  305. organsim_scientific = entity_src_nat.getValue('pdbx_organism_scientific')
  306. return organsim_scientific.replace('\n', ' ')
  307. except AttributeError:
  308. if DEBUG_MODE > 1:
  309. print ("! {pdb} Could not extract from pdbx_organism_scientific from entity_src_gen table (".format(
  310. pdb = self.pdb_entry()))
  311. pass
  312. try:
  313. entity_src_gen = self.block.getObj("entity_src_gen")
  314. src_scientific = entity_src_gen.getValue("pdbx_gene_src_scientific_name")
  315. return src_scientific.replace("\n", " ")
  316. except AttributeError:
  317. if DEBUG_MODE > 1:
  318. print ('! {pdb} Could not extract from pdbx_gene_src_scientific_name from entity_src_gen table'.format(
  319. pdb = self.pdb_entry()))
  320. return False
  321. def r_free(self):
  322. try:
  323. refine = self.block.getObj('refine')
  324. r_free = refine.getValue('ls_R_factor_R_free')
  325. try:
  326. return float(r_free)
  327. except ValueError:
  328. return False
  329. except AttributeError:
  330. if DEBUG_MODE > 2:
  331. print ('! Could not extract R free factor ({pdb})'.format(
  332. pdb = self.pdb_entry()))
  333. except ValueError:
  334. if DEBUG_MODE > 2:
  335. print ('! R free factor is not annotated ({pdb})'.format(
  336. pdb = self.pdb_entry()))
  337. return False
  338. # Helper functions
  339. def parse_seq(orginal_seq):
  340. """Parses the cif fasta sequence and replaces non-canonical residues with their canonical counterparts."""
  341. seq = orginal_seq
  342. while seq.find('(') != -1:
  343. start_pos = seq.find('(')
  344. stop_pos = seq.find(')')
  345. residue = seq[start_pos + 1:stop_pos]
  346. try:
  347. canonical = THREE2ONE[residue]
  348. except KeyError:
  349. canonical = 'X'
  350. if DEBUG_MODE > 1:
  351. print ('! Replaced non canonical residue {nc} with {c}'.format(
  352. nc = residue,
  353. c = canonical))
  354. if start_pos == 0:
  355. seq = canonical + seq[stop_pos+1:]
  356. elif stop_pos == len(seq):
  357. seq = seq[0:start_pos] + canonical
  358. else:
  359. pre_seq = seq[0:start_pos]
  360. post_seq = seq[stop_pos+1:]
  361. seq = pre_seq + canonical + post_seq
  362. seq = seq.replace('\n', '')
  363. seq_array = []
  364. for c in seq:
  365. if c in CANONICAL_RESIDUES:
  366. seq_array.append(c)
  367. else:
  368. seq_array.append('X')
  369. seq = ''.join(seq_array)
  370. return seq
  371. def get_paths(in_folder, out_folder):
  372. """Searches a directory and all its subdirectories for files ending
  373. with a specific suffix."""
  374. paths = list()
  375. for root, dirs, files in os.walk(in_folder):
  376. for fname in files:
  377. if fname.endswith(".cif") or fname.endswith(".cif.gz"):
  378. in_path = os.path.join(root, fname)
  379. out_file = in_path.split('/')[-1].split('.')[0] + ".fasta"
  380. out_path = os.path.join(out_folder, out_file)
  381. #absolute_path = os.path.join(os.getcwd(), fpath)
  382. paths.append((in_path, out_path))
  383. return paths
  384. def construct_header(cif2fasta):
  385. """Constructs a fasta header."""
  386. protein_description = cif2fasta.protein_description()
  387. keywords = cif2fasta.keywords()
  388. compounds = cif2fasta.compounds()
  389. resolution = cif2fasta.resolution()
  390. method = cif2fasta.experimental_method()
  391. organism = cif2fasta.organism()
  392. r_free = cif2fasta.r_free()
  393. # construct the header with the data given
  394. header_information = ''
  395. if protein_description:
  396. protein_description = protein_description.replace(';', ' ') # to prevent parsing errors
  397. protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header
  398. header_information += 'DSC: ' + protein_description + '; '
  399. else:
  400. header_information += 'DSC: N/A; '
  401. # if keywords:
  402. # header_information += keywords + '; '
  403. if method:
  404. header_information += 'MET: ' + method + '; '
  405. else:
  406. header_information += 'MET: N/A; '
  407. if resolution:
  408. header_information += 'RES: ' + resolution + '; '
  409. else:
  410. header_information += 'RES: N/A; '
  411. if r_free:
  412. header_information += 'RFR: ' + r_free + '; '
  413. else:
  414. header_information += 'RFR: N/A; '
  415. if organism:
  416. header_information += 'ORG: ' + organism + '; '
  417. else:
  418. header_information += 'ORG: N/A; '
  419. if compounds:
  420. header_information += 'HET: ' + compounds + '; '
  421. else:
  422. header_information += 'HET: N/A; '
  423. if SCOP_LIBRARY:
  424. try:
  425. scop_idx = SCOP_LIBRARY[cif2fasta.pdb_entry()]
  426. if len(scop_idx) != 0:
  427. domains = ', '.join(scop_idx)
  428. else:
  429. domains = 'N/A'
  430. except KeyError:
  431. domains = 'N/A'
  432. header_information += 'SCOP: ' + domains + '; '
  433. return header_information.strip()
  434. def create_fasta_entry(cif2fasta):
  435. """Combines information given in pdb_entry and strand_seq and yields a clean
  436. fasta file.
  437. """
  438. pdb_entry = cif2fasta.pdb_entry()
  439. header = construct_header(cif2fasta)
  440. chain_to_seq = cif2fasta.chain_to_seq()
  441. chain_ratios = cif2fasta.chain_ratios(chain_to_seq)
  442. #import pdb; pdb.set_trace()
  443. fasta_entry = ""
  444. if chain_to_seq and pdb_entry:
  445. for chain in sorted(chain_to_seq.keys()):
  446. if len(chain_to_seq[chain]) != 0:
  447. fasta_entry += '>{pdb}_{chain} {header} CMP: {rat:.2f}\n{seq}\n'.format(
  448. pdb = pdb_entry,
  449. chain = chain,
  450. header = header,
  451. rat = chain_ratios[chain],
  452. seq = '\n'.join(textwrap.wrap(chain_to_seq[chain], 80)))
  453. # fasta_entry += '>' + pdb_entry + '_' + chain + ' ' + header + '\n' + '\n'.join(textwrap.wrap(chain_to_seq[chain], 80)) + '\n'
  454. return fasta_entry
  455. else:
  456. return None
  457. def create_fasta_entry2(cif2fasta):
  458. """" Creates a fasta entry."""
  459. # Get all the information we need
  460. pdb_entry = cif2fasta.pdb_entry()
  461. protein_description = cif2fasta.protein_description()
  462. keywords = cif2fasta.keywords()
  463. compounds = cif2fasta.compounds()
  464. resolution = cif2fasta.resolution()
  465. method = cif2fasta.experimental_method()
  466. organism = cif2fasta.organism()
  467. r_free = cif2fasta.r_free()
  468. # get chains and sequences
  469. chain_to_seq = cif2fasta.chain_to_seq()
  470. chain_ratios = cif2fasta.chain_ratios(chain_to_seq)
  471. is_NMR = False
  472. if 'NMR' in method:
  473. is_NMR = True
  474. fasta_entry = ''
  475. pdb_filter = '' # If needed create entries that are parseable by pdb filter
  476. if chain_to_seq and pdb_entry:
  477. for chain, seq in chain_to_seq.items():
  478. # check if we can find SCOP domains for the current chain
  479. domains = False
  480. if SCOP_LIBRARY:
  481. try:
  482. scop_idx = SCOP_LIBRARY[pdb_entry + '_' + chain]
  483. if len(scop_idx) != 0:
  484. domains = ', '.join(scop_idx)
  485. else:
  486. domains = False
  487. except KeyError:
  488. domains = False
  489. if is_NMR:
  490. fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format(
  491. p = pdb_entry,
  492. c = chain,
  493. d = ' ' + protein_description + ';' if protein_description else '',
  494. k = ' ' + keywords + ';' if keywords else '',
  495. h = ' HET: ' + compounds + ';' if compounds else '',
  496. r = ' NMR ',
  497. o = organism if organism else 'N/A',
  498. s = ' SCOP: ' + domains if domains else '',
  499. seq = '\n'.join(textwrap.wrap(seq, 80)))
  500. pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format(
  501. p = pdb_entry,
  502. c = chain,
  503. m = method,
  504. r = 'N/A',
  505. f = round(r_free, 3) if r_free else 'N/A',
  506. comp = round(chain_ratios[chain], 3))
  507. else:
  508. fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format(
  509. p = pdb_entry,
  510. c = chain,
  511. d = ' ' + protein_description + ';' if protein_description else '',
  512. k = ' ' + keywords + ';' if keywords else '',
  513. h = ' HET: ' + compounds + ';' if compounds else '',
  514. r = ' ' + str(resolution) + 'A ' if resolution else '',
  515. o = organism if organism else 'N/A',
  516. s = ' SCOP: ' + domains if domains else '',
  517. seq = '\n'.join(textwrap.wrap(seq, 80)))
  518. # <pdb_entry>_<chain>\t<resolution>\t<r_free>\t<completness>
  519. pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format(
  520. p = pdb_entry,
  521. c = chain,
  522. m = method,
  523. r = round(resolution, 3) if resolution else 'N/A',
  524. f = round(r_free, 3) if r_free else 'N/A',
  525. comp = round(chain_ratios[chain], 3))
  526. return (fasta_entry, pdb_filter)
  527. def parse_scop(scop_file):
  528. scop = defaultdict(set)
  529. with open(scop_file) as fh:
  530. for line in fh:
  531. if line.startswith('#'):
  532. continue
  533. scop_id, pdb_code, chain, scop_num = line.split('\t')[0:4]
  534. chain = chain.split(':')[0]
  535. entry = pdb_code.upper() + '_' + chain
  536. scop[entry].add(scop_num)
  537. return scop
  538. def wrapper_function(paths):
  539. in_file = paths[0]
  540. out_file = paths[1]
  541. cif2fasta = CIF2FASTA(in_file)
  542. if cif2fasta.is_valid():
  543. fasta_entry = create_fasta_entry2(cif2fasta)
  544. else:
  545. print("Warning: Could not read %".format(in_file), file=sys.stderr)
  546. fasta_entry = None
  547. return fasta_entry
  548. def write_to_file(line_list, fname, pdb_filter):
  549. """
  550. Input: A list containing all lines that have to be saved to the file (line_list).
  551. A filename (str) is the name of the file that is created.
  552. Output: A file containing the lines in line_list.
  553. """
  554. if pdb_filter:
  555. fasta_file = open(fname, 'w')
  556. pdb_filter = open(pdb_filter, 'w')
  557. pdb_filter.write('#pdb_chain\tresolution\tr_free\tcompleteness\tmethod\n')
  558. for line in line_list:
  559. if line is not None:
  560. fasta_file.write(line[0])
  561. pdb_filter.write(line[1])
  562. fasta_file.close()
  563. pdb_filter.close()
  564. else:
  565. fasta_file = open(fname, 'w')
  566. for line in line_list:
  567. if line is not None:
  568. fasta_file.write(line[0])
  569. def opt():
  570. # Initiate a OptionParser Class
  571. usage = "usage: cif2fasta.py -i cif_folder -o *.fasta -c num_cores -v"
  572. description = "cif2fasta.py takes a folder that contains cif files as input and outputs their sequences into fasta file."
  573. parser = OptionParser(usage = usage, description = description)
  574. # Call add_options to the parser
  575. parser.add_option("-i", help = "input cif folder.", dest = "input_files", metavar = "DIR")
  576. parser.add_option("-o", help = "output fasta file.", dest = "output_files", metavar = "FILE")
  577. parser.add_option("-p", help = "output PDB filter file (optional).", dest= "pdb_filter", default = False, metavar = "FILE")
  578. parser.add_option("-s", help = "SCOP annotation.", dest = "scop", default = False, metavar = "FILE")
  579. parser.add_option('-c', help = "number of cores (default = 1).", dest = "cores", type = int, default = 1, metavar = "INT")
  580. parser.add_option('-l', help = "Remove chains with a length < X (default = 30).", dest = "seq_len", type = int, default = 30, metavar = "INT")
  581. parser.add_option('-v', help = 'Verbose Mode (quiet = 0, full verbosity = 2).', dest = 'bool', default = 0, type = int, metavar = "INT")
  582. return parser
  583. def main():
  584. parser = opt()
  585. (options, argv) = parser.parse_args()
  586. global DEBUG_MODE
  587. if options.bool:
  588. DEBUG_MODE = options.bool
  589. global SCOP_LIBRARY
  590. if options.scop:
  591. SCOP_LIBRARY = parse_scop(options.scop)
  592. global MIN_SEQ_LEN
  593. MIN_SEQ_LEN = options.seq_len
  594. paths = get_paths(options.input_files, options.output_files)
  595. print ("Found: " + str(len(paths)) + " files.")
  596. if options.cores > 1:
  597. pool = Pool(options.cores)
  598. fastas = pool.map(wrapper_function, paths)
  599. else:
  600. fastas = map(wrapper_function, paths)
  601. write_to_file(fastas, options.output_files, options.pdb_filter)
  602. if __name__ == "__main__":
  603. main()