123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793 |
- #!/usr/bin/env python
- """
- Created on Mon Jun 15 21:49:32 2015
- @author: Harald Voehringer
- """
- from __future__ import print_function
- import sys, os, glob, gzip, textwrap, itertools
- from optparse import OptionParser
- from collections import defaultdict
- from os.path import splitext
- from pdbx.reader.PdbxReader import PdbxReader
- from multiprocessing import Pool
- DEBUG_MODE = False
- MIN_SEQ_LEN = None
- SCOP_LIBRARY = False
- THREE2ONE = {
- 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I', 'PRO': 'P',
- 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R',
- 'TRP': 'W', 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M', 'MSE': 'M',
- 'HYP': 'P', 'MLY': 'K', 'SEP': 'S', 'TPO': 'T', 'CSO': 'C', 'PTR': 'Y', 'KCX': 'K',
- 'CME': 'C', 'CSD': 'A', 'CAS': 'C', 'MLE': 'L', 'DAL': 'A', 'CGU': 'E', 'DLE': 'L',
- 'FME': 'M', 'DVA': 'V', 'OCS': 'C', 'DPR': 'P', 'MVA': 'V', 'TYS': 'Y', 'M3L': 'K',
- 'SMC': 'C', 'ALY': 'K', 'CSX': 'C', 'DCY': 'C', 'NLE': 'L', 'DGL': 'E', 'DSN': 'S',
- 'CSS': 'C', 'DLY': 'K', 'MLZ': 'K', 'DPN': 'F', 'DAR': 'R', 'PHI': 'F', 'IAS': 'D',
- 'DAS': 'D', 'HIC': 'H', 'MP8': 'P', 'DTH': 'T', 'DIL': 'I', 'MEN': 'N', 'DTY': 'Y',
- 'CXM': 'M', 'DGN': 'G', 'DTR': 'W', 'SAC': 'S', 'DSG': 'N', 'MME': 'M', 'MAA': 'A',
- 'YOF': 'Y', 'FP9': 'P', 'FVA': 'V', 'MLU': 'L', 'OMY': 'Y', 'FGA': 'E', 'MEA': 'F',
- 'CMH': 'C', 'DHI': 'H', 'SEC': 'C', 'OMZ': 'Y', 'SCY': 'C', 'MHO': 'M', 'MED': 'M',
- 'CAF': 'C', 'NIY': 'Y', 'OAS': 'S', 'SCH': 'C', 'MK8': 'L', 'SME': 'M', 'LYZ': 'K'
- }
- CANONICAL_RESIDUES = set(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
- 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])
- class CIF2FASTA(object):
-
- def __init__(self, cif_path):
- self.cif_path = cif_path
- self.block = self.open_cif()
-
- def open_cif(self):
- """ Assumes a mmCIF or gzipped mmCIF file and returns a data block used for subsequent procedures. """
- # The "usual" procedure to open a mmCIF with pdbX/mmCIF
- data = []
- try:
- with gzip.open(self.cif_path) as cif_fh:
- reader = PdbxReader(cif_fh)
- reader.read(data)
- except IOError:
- with open(self.cif_path) as cif_fh:
- reader = PdbxReader(cif_fh)
- reader.read(data)
- if len(data) == 0:
- return None
- else:
- return data[0]
- def is_valid(self):
- return self.block is not None
-
- def chain_to_seq(self):
- """Extracts the sequence of the cif from entity_poly.pdbx_seq_one_letter_code"""
-
- cif_chain_to_seq = dict()
- non_polypeptide_chains = list()
-
- try:
- entity_poly = self.block.getObj('entity_poly')
- except AttributeError:
- if DEBUG_MODE > 0:
- print ('! {pdb} Could not extract entity_poly table.'.format(
- pdb = self.pdb_entry()))
- return False
- try:
- total_rows = entity_poly.getRowCount()
- except AttributeError:
- print ('! {pdb} Could not extract rows from entity_poly.'.format(
- pdb = self.pdb_entry()))
- return False
- for row in range(0, total_rows):
- if entity_poly.getValue('type', row) == 'polypeptide(L)':
- seq = entity_poly.getValue('pdbx_seq_one_letter_code', row)
- parsed_seq = parse_seq(seq) # removes special amino acids and newlines
-
- try:
- chains = entity_poly.getValue('pdbx_strand_id', row)
- except ValueError:
-
- if total_rows == 1:
- print ('! {pdb} Only one polypeptide chain, but no chain identifiers, setting it to ".".'.format(
- pdb = self.pdb_entry()))
-
- cif_chain_to_seq['.'] = parsed_seq
-
- return cif_chain_to_seq
- print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (polypeptide).'.format(
- pdb = self.pdb_entry()))
- return False
- chain_list = chains.split(',')
-
- for chain in chain_list:
- cif_chain_to_seq[chain] = parsed_seq
-
- else:
-
- try:
- chains = entity_poly.getValue('pdbx_strand_id', row)
- except ValueError:
- print ('! {pdb} Could not extract pdbx_strand_id from entity_poly table (non-polypeptide).'.format(
- pdb = self.pdb_entry()))
-
- return False
- non_polypeptide_chains.append(chains)
- chains = list(cif_chain_to_seq.keys())
- # remove chains that contain only unknown residues
- for chain in chains:
- # this is a very odd way to check whether a string contains only a single char
- tmp_set = set(cif_chain_to_seq[chain])
- if len(tmp_set) == 1 and 'X' in tmp_set:
- print ('! Removing {pdb}_{chain} (contains only unknown residues).'.format(
- pdb = self.pdb_entry(),
- chain = chain))
-
- del cif_chain_to_seq[chain]
- continue
-
- if len(cif_chain_to_seq[chain]) < MIN_SEQ_LEN:
- print ('! Removing {pdb}_{chain} (sequence length < {min_len}).'.format(
- pdb = self.pdb_entry(),
- chain = chain,
- min_len = MIN_SEQ_LEN))
-
- del cif_chain_to_seq[chain]
- if len(cif_chain_to_seq) != 0:
-
- if DEBUG_MODE > 1:
- print ('- Extracted chains of {pdb} {chains}.'.format(
- pdb = self.pdb_entry(),
- chains = ' '.join( str(chain) + ' (' + str(len(cif_chain_to_seq[chain])) + ')' for chain in cif_chain_to_seq.keys())))
-
- if len(non_polypeptide_chains) != 0:
- print ('- Following chains were non polypeptide chains {chains} no polypeptide chains were found.'.format(
- chains = ', '.join(non_polypeptide_chains)))
-
- return cif_chain_to_seq
-
- else:
- if DEBUG_MODE > 0:
- print ('! {pdb} No polypeptide chains were found.'.format(
- pdb = self.pdb_entry()))
-
- return False
- def chain_ratios(self, chain_to_seq):
- """ Tries to extract Sequence from the atom section """
- # chain_to_seq = self.chain_to_seq()
- if chain_to_seq != False:
- chain_ratios = dict()
-
- # compute the lengths of sequences found in _entity_poly
- entity_length = { chain : float(len(seq)) for chain, seq in chain_to_seq.items() }
- entity_chains = entity_length.keys()
- # load the atomsite and set up dictionary to keep track of sequences
- atom_site = self.block.getObj('atom_site')
- atom_seq = defaultdict(str)
- current_residue = 0
- # Iterate through the atomsection of the cif file
- for atom_row in range(0, atom_site.getRowCount()):
- # NMR structures contain many confomers
- try:
- model_num = int(atom_site.getValue('pdbx_PDB_model_num', atom_row))
- except ValueError:
- model_num = 1
- if model_num > 1:
- continue
- atom_chain = atom_site.getValue('label_asym_id', atom_row)
- # get the alternative chain identifier too
- try:
- alt_chain = atom_site.getValue('auth_asym_id', atom_row)
- except ValueError:
- alt_chain = None
- # handle cases where there are no chains but only one structure
- if atom_chain == '.' and entity_chains[0] == '.':
- atom_chain = '.'
-
- # get the residue and the residue number
- try:
- res_num = int(atom_site.getValue("label_seq_id", atom_row))
- except ValueError:
- continue
- if res_num != current_residue:
- residue = atom_site.getValue('label_comp_id', atom_row)
-
- try:
- residue = THREE2ONE[residue]
- except KeyError:
- residue = 'X'
- # try to get the chain identifier from alt_chain first, if this does not work use label_asym_id
- if alt_chain is not None:
- atom_seq[alt_chain] += residue
- # sometimes we find the right chain identifier not in the alt_chain
- if not (atom_chain in atom_seq.keys()) and atom_chain is not None:
- atom_seq[atom_chain] += residue
-
- current_residue = res_num
- for chain in entity_length.keys():
- if chain in atom_seq.keys():
- chain_ratios[chain] = len(atom_seq[chain]) / entity_length[chain]
- else:
- chain_ratios[chain] = 0
- return chain_ratios
- else:
- return False
- def pdb_entry(self):
- """Extracts the PDB entry information of a cif file."""
- try:
- entry = self.block.getObj('entry')
- entry_id = entry.getValue('id')
- return entry_id.replace('\n', ' ')
-
- except AttributeError:
- if DEBUG_MODE > 0:
- print ('! {pdb} Could not extract id from entry.'.format(
- pdb = self.pdb_entry()))
- def protein_description(self):
- """Extracts the protein description annotated in struct.pdbx_descriptor of the cif file."""
- try:
- # Get struct table which contains the protein description
- struct = self.block.getObj('struct')
- # Get the pdbx description and make format it appropritaly
- protein_description = struct.getValue('pdbx_descriptor')
- protein_description = protein_description.replace('\n', ' ')
- protein_description = protein_description.replace(';', ' ') # to prevent parsing errors
-
- if len(protein_description.split(' ')) >= 5:
- protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header
-
- return protein_description.strip(',')
-
- except AttributeError:
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract pdbx_descriptor from struct table.'.format(
- pdb = self.pdb_entry()))
-
- return False
- def compounds(self):
- """ Extracts all compounds annotated in the HETATM section of the atom
- struct table if the compound appears at least 10 times and is not water
- (HOH)."""
-
- atom_site = self.block.getObj('atom_site')
- compounds = {}
-
- for row in range(0, atom_site.getRowCount()):
- if atom_site.getValue('group_PDB', row) == 'HETATM':
- label_comp_id = atom_site.getValue('label_comp_id', row)
-
- if label_comp_id not in compounds.keys():
- compounds[label_comp_id] = 1
- else:
- compounds[label_comp_id] += 1
-
- filtered_compounds = set()
- for compound in compounds.keys():
- if compounds[compound] >= 10 and compound != 'HOH':
- filtered_compounds.add(compound)
- if len(filtered_compounds) == 0:
- return False
- else:
- return ', '.join(filtered_compounds).replace('\n', ' ')
-
- def resolution(self):
- """Extracts the resolution of the mmCIF."""
- try:
- refine = self.block.getObj('refine')
- resolution = refine.getValue('ls_d_res_high')
-
- try:
- resolution = float(resolution)
- except ValueError:
- return False
-
- return resolution
-
- except AttributeError:
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract ls_d_res_high from refine table.'.format(
- pdb = self.pdb_entry()))
- try:
- reflns = self.block.getObj('reflns')
- # Extract the resolution of the crystal
- resolution = reflns.getValue('d_resolution_high')
-
- try:
- resolution = float(resolution)
- except ValueError:
- return False
-
- return resolution
-
- except AttributeError:
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract d_resolution_high from reflns table.'.format(
- pdb = self.pdb_entry()))
- # This is true for some Electron Microscopy structures
- try:
- em_3d = self.block.getObj('em_3d_reconstruction')
- resolution = em_3d.getValue('resolution')
-
- try:
- resolution = float(resolution)
- except ValueError:
- return False
-
- return resolution
- except AttributeError:
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract resolution from em_3d_reconstruction table.'.format(
- pdb = self.pdb_entry()))
- return False
- def experimental_method(self):
- """Extracts the experimental method of the mmCIF."""
- try:
- reflns = self.block.getObj('exptl')
- method = reflns.getValue('method')
- return method.replace('\n', ' ')
-
- except AttributeError:
-
- if DEBUG_MODE > 1:
- print ('! Could not extract text from exptl table.'.format(
- pdb = self.pdb_entry()))
-
- return False
- def keywords(self):
- """Extracts the keywords of the mmCIF."""
- try:
- reflns = self.block.getObj('struct_keywords')
- keywords = reflns.getValue('text')
- # perform some string modifications
- keywords = keywords.replace('\n', ' ')
- keywords = keywords.replace(';', ' ')
- if len(keywords.split(' ')) >= 5:
- keywords = ' '.join(keywords.split(' ')[0:5])
- return keywords.rstrip(',')
-
- except AttributeError:
-
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract text from struct_keywords table.'.format(
- pdb = self.pdb_entry()))
-
- return False
- def organism(self):
- """Extracts the organism of the mmCIF."""
- try:
- entity_src_nat = self.block.getObj('entity_src_nat')
- organsim_scientific = entity_src_nat.getValue('pdbx_organism_scientific')
- return organsim_scientific.replace('\n', ' ')
-
- except AttributeError:
-
- if DEBUG_MODE > 1:
- print ("! {pdb} Could not extract from pdbx_organism_scientific from entity_src_gen table (".format(
- pdb = self.pdb_entry()))
- pass
-
- try:
- entity_src_gen = self.block.getObj("entity_src_gen")
- src_scientific = entity_src_gen.getValue("pdbx_gene_src_scientific_name")
- return src_scientific.replace("\n", " ")
-
- except AttributeError:
-
- if DEBUG_MODE > 1:
- print ('! {pdb} Could not extract from pdbx_gene_src_scientific_name from entity_src_gen table'.format(
- pdb = self.pdb_entry()))
-
- return False
-
- def r_free(self):
- try:
- refine = self.block.getObj('refine')
- r_free = refine.getValue('ls_R_factor_R_free')
- try:
- return float(r_free)
- except ValueError:
- return False
- except AttributeError:
- if DEBUG_MODE > 2:
- print ('! Could not extract R free factor ({pdb})'.format(
- pdb = self.pdb_entry()))
- except ValueError:
- if DEBUG_MODE > 2:
- print ('! R free factor is not annotated ({pdb})'.format(
- pdb = self.pdb_entry()))
-
- return False
- # Helper functions
- def parse_seq(orginal_seq):
- """Parses the cif fasta sequence and replaces non-canonical residues with their canonical counterparts."""
- seq = orginal_seq
-
- while seq.find('(') != -1:
- start_pos = seq.find('(')
- stop_pos = seq.find(')')
- residue = seq[start_pos + 1:stop_pos]
- try:
- canonical = THREE2ONE[residue]
- except KeyError:
- canonical = 'X'
- if DEBUG_MODE > 1:
- print ('! Replaced non canonical residue {nc} with {c}'.format(
- nc = residue,
- c = canonical))
-
- if start_pos == 0:
- seq = canonical + seq[stop_pos+1:]
- elif stop_pos == len(seq):
- seq = seq[0:start_pos] + canonical
- else:
- pre_seq = seq[0:start_pos]
- post_seq = seq[stop_pos+1:]
- seq = pre_seq + canonical + post_seq
-
- seq = seq.replace('\n', '')
- seq_array = []
-
- for c in seq:
- if c in CANONICAL_RESIDUES:
- seq_array.append(c)
- else:
- seq_array.append('X')
- seq = ''.join(seq_array)
- return seq
-
-
- def get_paths(in_folder, out_folder):
- """Searches a directory and all its subdirectories for files ending
- with a specific suffix."""
- paths = list()
-
- for root, dirs, files in os.walk(in_folder):
- for fname in files:
- if fname.endswith(".cif") or fname.endswith(".cif.gz"):
- in_path = os.path.join(root, fname)
- out_file = in_path.split('/')[-1].split('.')[0] + ".fasta"
- out_path = os.path.join(out_folder, out_file)
- #absolute_path = os.path.join(os.getcwd(), fpath)
- paths.append((in_path, out_path))
-
- return paths
-
- def construct_header(cif2fasta):
- """Constructs a fasta header."""
-
- protein_description = cif2fasta.protein_description()
- keywords = cif2fasta.keywords()
- compounds = cif2fasta.compounds()
- resolution = cif2fasta.resolution()
- method = cif2fasta.experimental_method()
- organism = cif2fasta.organism()
- r_free = cif2fasta.r_free()
- # construct the header with the data given
- header_information = ''
- if protein_description:
- protein_description = protein_description.replace(';', ' ') # to prevent parsing errors
- protein_description = ' '.join(protein_description.split(' ')[0:5]) # maximum of 5 words in header
- header_information += 'DSC: ' + protein_description + '; '
- else:
- header_information += 'DSC: N/A; '
-
- # if keywords:
- # header_information += keywords + '; '
- if method:
- header_information += 'MET: ' + method + '; '
- else:
- header_information += 'MET: N/A; '
-
- if resolution:
- header_information += 'RES: ' + resolution + '; '
- else:
- header_information += 'RES: N/A; '
- if r_free:
- header_information += 'RFR: ' + r_free + '; '
- else:
- header_information += 'RFR: N/A; '
- if organism:
- header_information += 'ORG: ' + organism + '; '
- else:
- header_information += 'ORG: N/A; '
- if compounds:
- header_information += 'HET: ' + compounds + '; '
- else:
- header_information += 'HET: N/A; '
- if SCOP_LIBRARY:
-
- try:
- scop_idx = SCOP_LIBRARY[cif2fasta.pdb_entry()]
- if len(scop_idx) != 0:
- domains = ', '.join(scop_idx)
- else:
- domains = 'N/A'
- except KeyError:
- domains = 'N/A'
-
- header_information += 'SCOP: ' + domains + '; '
- return header_information.strip()
- def create_fasta_entry(cif2fasta):
- """Combines information given in pdb_entry and strand_seq and yields a clean
- fasta file.
- """
-
- pdb_entry = cif2fasta.pdb_entry()
- header = construct_header(cif2fasta)
- chain_to_seq = cif2fasta.chain_to_seq()
- chain_ratios = cif2fasta.chain_ratios(chain_to_seq)
- #import pdb; pdb.set_trace()
-
- fasta_entry = ""
-
- if chain_to_seq and pdb_entry:
- for chain in sorted(chain_to_seq.keys()):
- if len(chain_to_seq[chain]) != 0:
-
- fasta_entry += '>{pdb}_{chain} {header} CMP: {rat:.2f}\n{seq}\n'.format(
- pdb = pdb_entry,
- chain = chain,
- header = header,
- rat = chain_ratios[chain],
- seq = '\n'.join(textwrap.wrap(chain_to_seq[chain], 80)))
-
- # fasta_entry += '>' + pdb_entry + '_' + chain + ' ' + header + '\n' + '\n'.join(textwrap.wrap(chain_to_seq[chain], 80)) + '\n'
- return fasta_entry
- else:
- return None
- def create_fasta_entry2(cif2fasta):
- """" Creates a fasta entry."""
- # Get all the information we need
- pdb_entry = cif2fasta.pdb_entry()
- protein_description = cif2fasta.protein_description()
- keywords = cif2fasta.keywords()
- compounds = cif2fasta.compounds()
- resolution = cif2fasta.resolution()
- method = cif2fasta.experimental_method()
- organism = cif2fasta.organism()
- r_free = cif2fasta.r_free()
- # get chains and sequences
- chain_to_seq = cif2fasta.chain_to_seq()
- chain_ratios = cif2fasta.chain_ratios(chain_to_seq)
- is_NMR = False
- if 'NMR' in method:
- is_NMR = True
- fasta_entry = ''
- pdb_filter = '' # If needed create entries that are parseable by pdb filter
-
- if chain_to_seq and pdb_entry:
- for chain, seq in chain_to_seq.items():
- # check if we can find SCOP domains for the current chain
- domains = False
-
- if SCOP_LIBRARY:
- try:
- scop_idx = SCOP_LIBRARY[pdb_entry + '_' + chain]
- if len(scop_idx) != 0:
- domains = ', '.join(scop_idx)
- else:
- domains = False
- except KeyError:
- domains = False
- if is_NMR:
- fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format(
- p = pdb_entry,
- c = chain,
- d = ' ' + protein_description + ';' if protein_description else '',
- k = ' ' + keywords + ';' if keywords else '',
- h = ' HET: ' + compounds + ';' if compounds else '',
- r = ' NMR ',
- o = organism if organism else 'N/A',
- s = ' SCOP: ' + domains if domains else '',
- seq = '\n'.join(textwrap.wrap(seq, 80)))
- pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format(
- p = pdb_entry,
- c = chain,
- m = method,
- r = 'N/A',
- f = round(r_free, 3) if r_free else 'N/A',
- comp = round(chain_ratios[chain], 3))
- else:
- fasta_entry += '>{p}_{c}{d}{k}{h}{r}{{{o}}}{s}\n{seq}\n'.format(
- p = pdb_entry,
- c = chain,
- d = ' ' + protein_description + ';' if protein_description else '',
- k = ' ' + keywords + ';' if keywords else '',
- h = ' HET: ' + compounds + ';' if compounds else '',
- r = ' ' + str(resolution) + 'A ' if resolution else '',
- o = organism if organism else 'N/A',
- s = ' SCOP: ' + domains if domains else '',
- seq = '\n'.join(textwrap.wrap(seq, 80)))
- # <pdb_entry>_<chain>\t<resolution>\t<r_free>\t<completness>
- pdb_filter += '{p}_{c}\t{r}\t{f}\t{comp:.3f}\t{m}\n'.format(
- p = pdb_entry,
- c = chain,
- m = method,
- r = round(resolution, 3) if resolution else 'N/A',
- f = round(r_free, 3) if r_free else 'N/A',
- comp = round(chain_ratios[chain], 3))
- return (fasta_entry, pdb_filter)
- def parse_scop(scop_file):
- scop = defaultdict(set)
- with open(scop_file) as fh:
- for line in fh:
- if line.startswith('#'):
- continue
-
- scop_id, pdb_code, chain, scop_num = line.split('\t')[0:4]
- chain = chain.split(':')[0]
- entry = pdb_code.upper() + '_' + chain
-
- scop[entry].add(scop_num)
- return scop
- def wrapper_function(paths):
-
- in_file = paths[0]
- out_file = paths[1]
-
- cif2fasta = CIF2FASTA(in_file)
- if cif2fasta.is_valid():
- fasta_entry = create_fasta_entry2(cif2fasta)
- else:
- print("Warning: Could not read %".format(in_file), file=sys.stderr)
- fasta_entry = None
- return fasta_entry
- def write_to_file(line_list, fname, pdb_filter):
- """
- Input: A list containing all lines that have to be saved to the file (line_list).
- A filename (str) is the name of the file that is created.
- Output: A file containing the lines in line_list.
- """
- if pdb_filter:
- fasta_file = open(fname, 'w')
- pdb_filter = open(pdb_filter, 'w')
- pdb_filter.write('#pdb_chain\tresolution\tr_free\tcompleteness\tmethod\n')
- for line in line_list:
- if line is not None:
- fasta_file.write(line[0])
- pdb_filter.write(line[1])
- fasta_file.close()
- pdb_filter.close()
- else:
- fasta_file = open(fname, 'w')
- for line in line_list:
- if line is not None:
- fasta_file.write(line[0])
- def opt():
- # Initiate a OptionParser Class
- usage = "usage: cif2fasta.py -i cif_folder -o *.fasta -c num_cores -v"
- description = "cif2fasta.py takes a folder that contains cif files as input and outputs their sequences into fasta file."
- parser = OptionParser(usage = usage, description = description)
- # Call add_options to the parser
- parser.add_option("-i", help = "input cif folder.", dest = "input_files", metavar = "DIR")
- parser.add_option("-o", help = "output fasta file.", dest = "output_files", metavar = "FILE")
- parser.add_option("-p", help = "output PDB filter file (optional).", dest= "pdb_filter", default = False, metavar = "FILE")
- parser.add_option("-s", help = "SCOP annotation.", dest = "scop", default = False, metavar = "FILE")
- parser.add_option('-c', help = "number of cores (default = 1).", dest = "cores", type = int, default = 1, metavar = "INT")
- parser.add_option('-l', help = "Remove chains with a length < X (default = 30).", dest = "seq_len", type = int, default = 30, metavar = "INT")
- parser.add_option('-v', help = 'Verbose Mode (quiet = 0, full verbosity = 2).', dest = 'bool', default = 0, type = int, metavar = "INT")
- return parser
-
- def main():
- parser = opt()
- (options, argv) = parser.parse_args()
-
- global DEBUG_MODE
-
- if options.bool:
- DEBUG_MODE = options.bool
- global SCOP_LIBRARY
-
- if options.scop:
- SCOP_LIBRARY = parse_scop(options.scop)
- global MIN_SEQ_LEN
- MIN_SEQ_LEN = options.seq_len
- paths = get_paths(options.input_files, options.output_files)
- print ("Found: " + str(len(paths)) + " files.")
- if options.cores > 1:
- pool = Pool(options.cores)
- fastas = pool.map(wrapper_function, paths)
- else:
- fastas = map(wrapper_function, paths)
- write_to_file(fastas, options.output_files, options.pdb_filter)
- if __name__ == "__main__":
- main()
|