hhmakemodel.py 85 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401
  1. #!/usr/bin/env python
  2. from hh_reader import read_result
  3. from copy import deepcopy
  4. from pdbx.reader.PdbxReader import PdbxReader
  5. from pdbx.writer.PdbxWriter import PdbxWriter
  6. import re, os, sys, tempfile, glob
  7. from operator import itemgetter # hzhu
  8. from itertools import groupby # hzhu
  9. EMPTY = '*'
  10. GAP = '-'
  11. DEBUG_MODE = False
  12. class Gap:
  13. """ A gap is a continuous stretch of indels.
  14. It is defined by a opening position and a size/length
  15. """
  16. def __init__(self, open_pos, size):
  17. self.open_pos = open_pos # gap opening position
  18. self.size = size # num of indels in the gap
  19. def __repr__(self):
  20. return 'Gap opening pos = %d, size = %d' % (self.open_pos, self.size)
  21. class Grid:
  22. """
  23. Implementation of 2D grid of cells
  24. Includes boundary handling
  25. """
  26. def __init__(self, grid_height, grid_width):
  27. """
  28. Initializes grid to be empty, take height and width of grid as parameters
  29. Indexed by rows (left to right), then by columns (top to bottom)
  30. """
  31. self._grid_height = grid_height
  32. self._grid_width = grid_width
  33. self._cells = [ [ EMPTY for dummy_col in range(self._grid_width) ]
  34. for dummy_row in range(self._grid_height)]
  35. def __str__(self):
  36. """ Return multi-line string represenation for grid """
  37. ans = ''
  38. for row in range(self._grid_height):
  39. ans += ''.join(self._cells[row])
  40. ans += '\n'
  41. return ans
  42. def clear(self):
  43. """ Clears grid to be empty """
  44. self._cells = [[EMPTY for dummy_col in range(self._grid_width)]
  45. for dummy_row in range(self._grid_height)]
  46. def get_grid_height(self):
  47. """ Return the height of the grid """
  48. return self._grid_height
  49. def get_grid_width(self):
  50. """ Return the width of the grid """
  51. return self._grid_width
  52. def get_cell(self, row, col):
  53. return self._cells[row][col]
  54. def get_seq_start(self, row):
  55. """ Returns the start position of the sequence """
  56. index = 0
  57. for pos in self._cells[row]:
  58. if pos != EMPTY:
  59. return index
  60. index += 1
  61. return None
  62. def get_seq_end(self, row):
  63. """ Returns the end position of the sequence """
  64. index = 0
  65. for pos in reversed(self._cells[row]):
  66. if pos != EMPTY:
  67. return self.get_grid_width() - index
  68. index += 1
  69. return None
  70. def get_gaps(self, row):
  71. """ Return the position of gaps in a row """
  72. gaps = list()
  73. index = 0
  74. for pos in self._cells[row]:
  75. if pos == GAP:
  76. gaps.append(index)
  77. index += 1
  78. return gaps
  79. def get_gaps_ref_gapless(self, row):
  80. """ Return the pos of gaps in a row.
  81. The opening positions of the gaps are wrt. the gapless seq
  82. """
  83. # get all the indels
  84. indels = self.get_gaps(row)
  85. gaps = []
  86. # combine continuous indels into a gap
  87. for k,i in groupby( enumerate(indels), lambda x: x[0]-x[1] ):
  88. g = list(map(itemgetter(1), i))
  89. gaps.append( Gap(g[0], len(g)) )
  90. # offset the gap opening positions
  91. for i in range(1, len(gaps)):
  92. # offset by total gap number before
  93. gaps[i].open_pos -= sum([gaps[j].size for j in range(i)])
  94. return gaps # a list of Gap instances
  95. def get_seq_indeces(self, row):
  96. seq = list()
  97. for pos, res in enumerate(self._cells[row]):
  98. if res != EMPTY and res != GAP:
  99. seq.append(pos)
  100. return seq
  101. ## def get_gap_list(self): # hzhu commented this out. wrote a new version
  102. ## """ Returns a list of list of all gap positions in the sequence grid. """
  103. ## gap_pos = set()
  104. ## for row in range(self.get_grid_height()):
  105. ## for gap in self.get_gaps(row):
  106. ## gap_pos.add(gap)
  107. ## gap_pos = list(sorted(gap_pos))
  108. ## boundaries = [ (x + 1) for x, y in zip(gap_pos, gap_pos[1:]) if y - x != 1 ]
  109. ## gap_list = list()
  110. ## prev = 0
  111. ## for boundary in boundaries:
  112. ## sub_list = [ pos for pos in gap_pos[prev:] if pos < boundary ]
  113. ## gap_list.append(sub_list)
  114. ## prev += len(sub_list)
  115. ## gap_list.append([ x for x in gap_pos[prev:]])
  116. ## return gap_list
  117. def get_gap_list(self):
  118. """ Returns a list of Gap instances for all rows in the grid
  119. """
  120. gap_dict = dict() # each position should occur as gap at most once
  121. # keys are gap openning positions
  122. # values are Gap instances
  123. gap_list = []
  124. for row in range(self.get_grid_height()):
  125. gap_pos = []
  126. gaps = self.get_gaps_ref_gapless(row)
  127. for g in gaps:
  128. if g.open_pos in gap_dict: # if there is already gaps at this open pos
  129. if g.size > gap_dict[g.open_pos].size: # if new gap is bigger
  130. gap_dict[g.open_pos] = g # keep the larger gap as they overlap
  131. else:
  132. gap_dict[g.open_pos] = g
  133. gap_list = sorted(list(gap_dict.values()), key=lambda x: x.open_pos) # sort according to start position
  134. return gap_list # a list of Gap instances
  135. def set_gap(self, row, col):
  136. """ Set cell with index (row, col) to be a gap """
  137. self._cells[row][col] = GAP
  138. def set_empty(self, row, col):
  139. """ Set cell with index (row, col) to be a gap """
  140. self._cells[row][col] = EMPTY
  141. def set_cell(self, row, col, res):
  142. """ Set cell with index (row, col) to be full """
  143. self._cells[row][col] = res
  144. def is_empty(self, row, col):
  145. """ Checks whether cell with index (row, col) is empty """
  146. return self._cells[row][col] == EMPTY
  147. def is_gap(self, row, col):
  148. """ Checks whetehr cell with indxex (row, col) is a gap """
  149. return self._cells[row][col] == GAP
  150. def insert_gaps(self, cols):
  151. """ Inserts a gaps into a column of the template grid """
  152. for col in cols:
  153. for row in range(self._grid_height):
  154. if col >= self.get_seq_start(row) and col < self.get_seq_end(row):
  155. self._cells[row].insert(col, GAP)
  156. else:
  157. self._cells[row].insert(col, EMPTY)
  158. self._grid_width += 1
  159. def insert_gaps_row(self, cols, row):
  160. """ Intert gaps into cols only for certain row"""
  161. for col in cols:
  162. if col >= self.get_seq_start(row) and col < self.get_seq_end(row):
  163. self._cells[row].insert(col, GAP)
  164. else:
  165. self._cells[row].insert(col, EMPTY)
  166. # NOTE: grid_with should not be changed after every row is updated.
  167. #self._grid_width += 1
  168. def clean_trail_empty(self):
  169. """ Remove all trailing EMPTY and pad grid to same width"""
  170. # first find out the max length (exluding trailing EMPTY)
  171. max_width = 0
  172. for row in range(self._grid_height):
  173. for i in range(len(self._cells[row])-1, -1, -1):
  174. if self._cells[row][i] != EMPTY:
  175. break
  176. if i+1 > max_width:
  177. max_width = i+1
  178. # delete excessive EMPTY
  179. for row in range(self._grid_height):
  180. del self._cells[row][max_width:]
  181. # then pad all rows to the same length
  182. [self._cells[row].append( EMPTY * (max_width-len(self._cells[row])) ) \
  183. for row in range(self._grid_height) if len(self._cells[row]) < max_width]
  184. self._grid_width = max_width
  185. return
  186. def remove_gaps(self, keep_width=True): # hzhu add keep_width option
  187. """ Removes all gaps from the grid. """
  188. for row in range(self.get_grid_height()):
  189. not_gap = list()
  190. for col in range(self.get_grid_width()):
  191. if not self.is_gap(row, col):
  192. not_gap.append(col)
  193. self._cells[row] = [ self._cells[row][col] for col in not_gap ]
  194. if keep_width: # hzhu only pad to original width if desired
  195. for del_pos in range(self._grid_width - len(not_gap)):
  196. self._cells[row].append(EMPTY)
  197. if not keep_width: # hzhu if width is not kept, make sure width is consistent
  198. self.clean_trail_empty()
  199. return
  200. class QueryGrid(Grid):
  201. def __init__(self, grid_height, grid_width):
  202. Grid.__init__(self, grid_height, grid_width)
  203. def get_query_start(self, row):
  204. """ Returns the query start position """
  205. return self.get_seq_start(row) + 1
  206. def get_query_end(self, row):
  207. """ Returns the query end postion """
  208. return self.get_seq_end(row) - len(self.get_gaps(row))
  209. def get_col_residue(self, col):
  210. """ Tries to find a the query residue in a given column. Used by derive_global_seq() to
  211. identify the global query sequence """
  212. for row in range(self.get_grid_height()):
  213. if not self.is_empty(row, col):
  214. return self._cells[row][col]
  215. return GAP
  216. class TemplateGrid(Grid):
  217. def __init__(self, grid_height, grid_width):
  218. Grid.__init__(self, grid_height, grid_width)
  219. self._start = list()
  220. self._end = list()
  221. self._pdb_code = list()
  222. self._chain = list()
  223. self._organism = list()
  224. self._resolution = list()
  225. def display(self):
  226. """ Return multi-line string represenation for grid """
  227. ans = ''
  228. for row in range(self._grid_height):
  229. ans += '>P1;{p}\nstructure:{p}:{s}:{c}:{e}:{c}::{o}:{r}:\n{a}*\n'.format(
  230. p = self._pdb_code[row],
  231. s = add_white_space_end(self.get_template_start(row), 4),
  232. e = add_white_space_end(self.get_template_end(row), 4),
  233. c = self._chain[row],
  234. o = self._organism[row],
  235. r = self._resolution[row],
  236. a = ''.join(self._cells[row]).replace(EMPTY, GAP).replace('#', GAP))
  237. return ans
  238. def debug(self, row):
  239. """ Return multi-line string represenation for grid, for debugging purposes """
  240. ans = '{p}\nInternal: {s}, {e} Query: {qs}, {qe} Gaps ({g1}): {g2}\n{seq}\n'.format(
  241. p = self._pdb_code[row],
  242. s = self.get_seq_start(row),
  243. e = self.get_seq_end(row),
  244. qs = self.get_template_start(row),
  245. qe = self.get_template_end(row),
  246. g1 = len(self.get_gaps(row)),
  247. g2 = ', '.join([str(gap) for gap in self.get_gaps(row)]),
  248. seq = ''.join(self._cells[row]))
  249. return ans
  250. def set_metadata(self, row, start, end, pdb_code, chain, organism, resolution):
  251. """ Used by create_template_grid() to setup metadata of pir template """
  252. self._start.append(start)
  253. self._end.append(end)
  254. self._pdb_code.append(pdb_code)
  255. self._chain.append(chain)
  256. self._organism.append(organism)
  257. self._resolution.append(resolution)
  258. def set_map(self, row, start, end):
  259. self._start[row] = start
  260. self._end[row] = end
  261. def get_template_start(self, row):
  262. """ Returns the template start position """
  263. return self._start[row]
  264. def get_template_end(self, row):
  265. """ Return sthe template end position """
  266. return self._end[row]
  267. def del_row(self, row):
  268. """ Removes a complete template entry from the grid """
  269. del self._cells[row]
  270. del self._start[row]
  271. del self._end[row]
  272. del self._pdb_code[row]
  273. del self._chain[row]
  274. del self._organism[row]
  275. del self._resolution[row]
  276. self._grid_height -= 1
  277. # Helper functions
  278. def add_white_space_end(string, length):
  279. """ Adds whitespaces to a string until it has the wished length"""
  280. edited_string = str(string)
  281. if len(edited_string) >= length:
  282. return string
  283. else:
  284. while len(edited_string) != length:
  285. edited_string += ' '
  286. return edited_string
  287. def convert_aa_code(three_letter, convert):
  288. """
  289. Assumes a string that contains a three letter aminoacid code and
  290. returns the corresponding one letter code.
  291. """
  292. aa_code = {
  293. 'CYS': 'C',
  294. 'ASP': 'D',
  295. 'SER': 'S',
  296. 'GLN': 'Q',
  297. 'LYS': 'K',
  298. 'ILE': 'I',
  299. 'PRO': 'P',
  300. 'THR': 'T',
  301. 'PHE': 'F',
  302. 'ASN': 'N',
  303. 'GLY': 'G',
  304. 'HIS': 'H',
  305. 'LEU': 'L',
  306. 'ARG': 'R',
  307. 'TRP': 'W',
  308. 'ALA': 'A',
  309. 'VAL': 'V',
  310. 'GLU': 'E',
  311. 'TYR': 'Y',
  312. 'MET': 'M',
  313. }
  314. non_canonical = {
  315. 'MSE': 1,
  316. 'HYP': 2,
  317. 'MLY': 3,
  318. 'SEP': 4,
  319. 'TPO': 5,
  320. 'CSO': 6,
  321. 'PTR': 7,
  322. 'KCX': 8,
  323. 'CME': 9,
  324. 'CSD': 10,
  325. 'CAS': 11,
  326. 'MLE': 12,
  327. 'DAL': 13,
  328. 'CGU': 14,
  329. 'DLE': 15,
  330. 'FME': 16,
  331. 'DVA': 17,
  332. 'OCS': 18,
  333. 'DPR': 19,
  334. 'MVA': 20,
  335. 'TYS': 21,
  336. 'M3L': 22,
  337. 'SMC': 23,
  338. 'ALY': 24,
  339. 'CSX': 25,
  340. 'DCY': 26,
  341. 'NLE': 27,
  342. 'DGL': 28,
  343. 'DSN': 29,
  344. 'CSS': 30,
  345. 'DLY': 31,
  346. 'MLZ': 32,
  347. 'DPN': 33,
  348. 'DAR': 34,
  349. 'PHI': 35,
  350. 'IAS': 36,
  351. 'DAS': 37,
  352. 'HIC': 38,
  353. 'MP8': 39,
  354. 'DTH': 40,
  355. 'DIL': 41,
  356. 'MEN': 42,
  357. 'DTY': 43,
  358. 'CXM': 44,
  359. 'DGN': 45,
  360. 'DTR': 46,
  361. 'SAC': 47,
  362. 'DSG': 48,
  363. 'MME': 49,
  364. 'MAA': 50,
  365. 'YOF': 51,
  366. 'FP9': 52,
  367. 'FVA': 53,
  368. 'MLU': 54,
  369. 'OMY': 55,
  370. 'FGA': 56,
  371. 'MEA': 57,
  372. 'CMH': 58,
  373. 'DHI': 59,
  374. 'SEC': 60,
  375. 'OMZ': 61,
  376. 'SCY': 62,
  377. 'MHO': 63,
  378. 'MED': 64,
  379. 'CAF': 65,
  380. 'NIY': 66,
  381. 'OAS': 67,
  382. 'SCH': 68,
  383. 'MK8': 69,
  384. 'SME': 70,
  385. 'LYZ': 71
  386. }
  387. if three_letter in aa_code.keys():
  388. return aa_code[three_letter]
  389. elif convert and (three_letter in non_canonical.keys()):
  390. return non_canonical[three_letter]
  391. else:
  392. return '-'
  393. def get_query_name(hhr_file):
  394. with open(hhr_file) as fh:
  395. for line in fh:
  396. if line.startswith('Query'):
  397. # match the PDB Code
  398. m = re.search('(\d[A-Z0-9]{3})_(\S)', line)
  399. if m:
  400. pdb_code = m.group(1)
  401. chain = m.group(2)
  402. else:
  403. pdb_code = 'UKNP'
  404. chain = 'A'
  405. # raise ValueError('Input HHR-File Does not seem to be a PDB-Structure')
  406. break
  407. return pdb_code, chain
  408. def get_cif_files(folder):
  409. """ Gets all cif files located in folder. """
  410. return glob(os.path.join(folder, '*.cif'))
  411. def open_cif(cif_file):
  412. """ Assumes a mmCif file and returns a data block used for subsequent procedures """
  413. # The "usual" procedure to open a mmCIF with pdbX/mmCIF
  414. with open(cif_file) as cif_fh:
  415. data = []
  416. reader = PdbxReader(cif_fh)
  417. reader.read(data)
  418. block = data[0]
  419. return block
  420. def get_pdb_entry_id(block):
  421. """ Extracts the PDB entry information of a cif file and returns it as a string """
  422. entry = block.getObj('entry')
  423. entry_id = entry.getValue('id')
  424. return entry_id
  425. def template_id_to_pdb(template_id):
  426. """
  427. Extracts PDB ID and chain name from the provided template id
  428. """
  429. # match PDBID without chain (8fab, 1a01)
  430. m = re.match(r'/^(\d[A-Za-z0-9]{3})$', template_id)
  431. if m:
  432. return m.group(1).upper(), 'A'
  433. # PDB CODE with chain Identifier
  434. m = re.match(r'^(\d[A-Za-z0-9]{3})_(\S)$', template_id)
  435. if m:
  436. return m.group(1).upper(), m.group(2).upper()
  437. # Match DALI ID
  438. m = re.match(r'^(\d[A-Za-z0-9]{3})([A-Za-z0-9]?)_\d+$', template_id)
  439. if m:
  440. return m.group(1).upper(), m.group(2).upper()
  441. # No PDB code and chain identified
  442. return None, None
  443. def create_template_grid(hhr_data):
  444. """ Creates a template grid """
  445. total_seq = len(hhr_data)
  446. templ_max = max( [ hhr.start[0] + len(to_seq(hhr.template_ali)) for hhr in hhr_data ] ) - 1
  447. template_grid = TemplateGrid(total_seq, templ_max)
  448. for row, template in enumerate(hhr_data):
  449. seq_start = template.start[0] - 1
  450. templatealignment = to_seq(template.template_ali)
  451. seq_end = seq_start + len(templatealignment)
  452. # Load Meta Data
  453. start = template.start[1]
  454. end = template.end[1]
  455. # Get pdb_code and chain identifier of template
  456. pdb_code, chain = template_id_to_pdb(template.template_id)
  457. m = re.search("(\d+.\d+)A", template.template_info) # try to extract resolution of the structure
  458. if m:
  459. resolution = m.group(1)
  460. else:
  461. resolution = ""
  462. m = re.search("\{(.*)\}", template.template_info) # try to extract the organism
  463. if m:
  464. organism = m.group(1).replace(":", " ") # make sure that no colons are in the organism
  465. else:
  466. organism = ""
  467. template_grid.set_metadata(row, start, end, pdb_code, chain, organism, resolution)
  468. # Write sequence into the grid
  469. for pos, col in enumerate(range(seq_start, seq_end)):
  470. template_grid.set_cell(row, col, templatealignment[pos])
  471. return template_grid
  472. def to_seq(ali):
  473. if isinstance(ali, list):
  474. return ''.join(ali)
  475. else:
  476. return ali
  477. def create_query_grid(hhr_data):
  478. """ Creates a Query Grid """
  479. total_seq = len(hhr_data)
  480. query_max = max( [ hhr.start[0] + len(to_seq(hhr.query_ali)) for hhr in hhr_data ] ) - 1
  481. query_grid = QueryGrid(total_seq, query_max)
  482. for row, query in enumerate(hhr_data):
  483. queryalignment = to_seq(query.query_ali)
  484. query_start = query.start[0] - 1
  485. query_end = query_start + len(queryalignment)
  486. for pos, col in enumerate(range(query_start, query_end)):
  487. if queryalignment[pos] not in ['Z', 'U', 'O', 'J', 'X', 'B']: # CAUTION
  488. query_grid.set_cell(row, col, queryalignment[pos])
  489. return query_grid
  490. def create_gapless_grid(grid):
  491. """ Returns a gapless grid """
  492. gapless = deepcopy(grid)
  493. gapless.remove_gaps(keep_width=False) # hzhu: shrink grid
  494. return gapless
  495. def process_query_grid(query_grid, gapless_grid):
  496. """ Processes a query grid sucht that it contains all gaps
  497. """
  498. gaplist = query_grid.get_gap_list()
  499. off_set = 0
  500. for g in gaplist:
  501. gapless_grid.insert_gaps([ p + off_set for p in range(g.open_pos, g.open_pos+g.size) ])
  502. off_set += g.size
  503. return gapless_grid
  504. def derive_global_seq(processed_query_grid, query_name, query_chain):
  505. global_seq = list()
  506. for col in range(processed_query_grid.get_grid_width()):
  507. global_seq.append(processed_query_grid.get_col_residue(col))
  508. # this is the query entry
  509. header = '>P1;{q}\nsequence:{q}:1 :{c}:{l} :{c}::::\n'.format(
  510. q = query_name,
  511. l = len(global_seq),
  512. c = query_chain)
  513. return header + ''.join(global_seq) + '*'
  514. def process_template_grid(query_grid, template_grid):
  515. """ Insertes Gaps into the template grid
  516. Only add gaps from **other** query_grids into template grid (NOT gapless)
  517. """
  518. gaplist = query_grid.get_gap_list() # use this to keep the offset
  519. for row in range(template_grid.get_grid_height()):
  520. # do NOT consider gaps in current query row
  521. gaplist_row = query_grid.get_gaps_ref_gapless(row)
  522. gapdict_row = dict(zip([g.open_pos for g in gaplist_row],
  523. [g.size for g in gaplist_row]))
  524. off_set = 0
  525. for g in gaplist:
  526. # if there is a gap with same opening position in the current row,
  527. # only consider g if it is larger than the on in the current row
  528. if g.open_pos in gapdict_row:
  529. if g.size > gapdict_row[g.open_pos]:
  530. template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos,
  531. g.open_pos+g.size-gapdict_row[g.open_pos]) ], row)
  532. else:
  533. template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos, g.open_pos+g.size) ], row)
  534. off_set += g.size # even if the gaps are not inserted, the offset should be adjusted
  535. template_grid.clean_trail_empty() # clean the redundant trailing EMPTY char
  536. return template_grid
  537. def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
  538. """
  539. Compare the PIR Alignment with Atomsection of a mmCIF file. To make the ATOM-Section of
  540. a mmCIF file compatible with MODELLER, each residue has in the ATOM-Section has to match
  541. corresponding positions in the PIR-Alignment
  542. """
  543. # glob the mmCif files from given directory and map the PDB identifier to the path
  544. cif_files = glob.glob(os.path.join(folder, '*.cif'))
  545. cif_paths = { path.split('/')[-1].split('.')[0].upper() : path for path in cif_files }
  546. cif_edits = dict()
  547. # create the path where renumbered cifs are saved to
  548. if not os.path.exists(output_path):
  549. os.mkdir(output_path)
  550. # if the cif does not contain any residue of the por alignment we delete it
  551. del_row = list()
  552. for row in range(template_grid.get_grid_height()):
  553. # get the pdb code and strand id from the current template
  554. pdb_code = template_grid._pdb_code[row]
  555. chain = template_grid._chain[row] # hhr users pdb chain ID
  556. # load mmCif file accordingly
  557. if pdb_code in cif_edits.keys():
  558. block = cif_edits[pdb_code]
  559. else:
  560. try:
  561. block = open_cif(cif_paths[pdb_code])
  562. except KeyError:
  563. del_row.append(row)
  564. print ('! Did not find the mmCIF file for {pdb}. Removing it from the alignment.'.format(
  565. pdb = pdb_code))
  566. continue
  567. # Create a mapping of the atom site
  568. atom_site = block.getObj('atom_site')
  569. ########################################################################
  570. ## Get the mapping of the residues in the atom section ##
  571. ########################################################################
  572. cif_seq = dict()
  573. # For the case that we have to rename a chain
  574. cif_chains = set([])
  575. # Iterate through the atomsection of the cif file
  576. for atom_row in range(0, atom_site.getRowCount()):
  577. try:
  578. if atom_site.getValue('label_comp_id', atom_row) == 'HOH':
  579. continue
  580. cif_chain = atom_site.getValue('label_asym_id', atom_row)
  581. pdb_chain = atom_site.getValue('auth_asym_id', atom_row) # use PDB chain ID
  582. except IndexError:
  583. pass
  584. cif_chains.add(cif_chain)
  585. # We do not care about the residues apart from the chain
  586. #if cif_chain != chain: # hzhu
  587. if pdb_chain != chain: # hhr uses PDB chain, not the cif chain! hzhu
  588. continue
  589. # and update the chain id from pdb_chain to cif_chain
  590. if atom_site.getValue('group_PDB', atom_row).startswith('ATOM'): # hzhu in case HETATM ruins ch id
  591. template_grid._chain[row] = cif_chain
  592. # get the residue and the residue number
  593. try:
  594. res_num = int(atom_site.getValue("label_seq_id", atom_row))
  595. except ValueError:
  596. continue
  597. residue = atom_site.getValue('label_comp_id', atom_row)
  598. residue = convert_aa_code(residue, convert)
  599. if res_num not in cif_seq.keys():
  600. cif_seq[res_num] = residue
  601. elif res_num in cif_seq.keys() and cif_seq[res_num] == residue:
  602. continue
  603. elif res_num in cif_seq.keys() and cif_seq[res_num] != residue:
  604. cif_seq[res_num] = '-'
  605. if DEBUG_MODE:
  606. print ('! {p} {c}: mmCIF contains a residue position that is assigned {cr} to two residues. Removing it.'.format(
  607. p = pdb_code,
  608. c = chain,
  609. cr = res_num))
  610. ########################################################################
  611. ## Rename chain if necessary ##
  612. ########################################################################
  613. chain_idx = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  614. if len(template_grid._chain[row]) != 1:
  615. i = 0
  616. new_chain = 0
  617. while i < len(chain_idx):
  618. if chain_idx[i] in cif_chains:
  619. if DEBUG_MODE:
  620. print ('! {p} {c}: Chain identifier {i} is already taken.'.format(
  621. p = pdb_code,
  622. c = chain,
  623. i = chain_idx[i]))
  624. i += 1
  625. else:
  626. new_chain = chain_idx[i]
  627. break
  628. if new_chain == 0:
  629. if DEBUG_MODE:
  630. print ('! {p} {c}: Could not use {p}. The chain identifier {c} is not compatible with MODELLER (2 letters) and could not be renanmed.'.format(
  631. p = pdb_code,
  632. c = chain))
  633. del_row.append(row)
  634. continue
  635. if new_chain != 0:
  636. print ('Selected new chain name {c}'.format(c = new_chain))
  637. #TODO
  638. ########################################################################
  639. ## Compare cif positions with the atom positions ##
  640. ########################################################################
  641. del_pos = list()
  642. mod_pos = dict()
  643. mapping = dict()
  644. for pos_cif, pos_tem in zip(range(template_grid.get_template_start(row),
  645. template_grid.get_template_end(row) + 1), template_grid.get_seq_indeces(row)):
  646. res_tem = template_grid.get_cell(row, pos_tem)
  647. try:
  648. res_cif = cif_seq[pos_cif]
  649. except KeyError:
  650. res_cif = -1
  651. match = True if res_tem == res_cif else False
  652. if not match:
  653. if res_cif == 1 and res_tem == 'M':
  654. mod_pos[pos_cif] = 1
  655. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  656. elif res_cif == 2 and res_tem == 'P':
  657. mod_pos[pos_cif] = 2
  658. mapping[(pos_tem, res_tem)] = (pos_cif, 'P')
  659. elif res_cif == 3 and res_tem == 'K':
  660. mod_pos[pos_cif] = 3
  661. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  662. elif res_cif == 4 and res_tem == 'S':
  663. mod_pos[pos_cif] = 4
  664. mapping[(pos_tem, res_tem)] = (pos_cif, 'S')
  665. elif res_cif == 5 and res_tem == 'T':
  666. mod_pos[pos_cif] = 5
  667. mapping[(pos_tem, res_tem)] = (pos_cif, 'T')
  668. elif res_cif == 6 and res_tem == 'C':
  669. mod_pos[pos_cif] = 6
  670. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  671. elif res_cif == 7 and res_tem == 'Y':
  672. mod_pos[pos_cif] = 7
  673. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  674. elif res_cif == 8 and res_tem == 'K':
  675. mod_pos[pos_cif] = 8
  676. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  677. elif res_cif == 9 and res_tem == 'C':
  678. mod_pos[pos_cif] = 9
  679. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  680. elif res_cif == 10 and res_tem == 'A':
  681. mod_pos[pos_cif] = 10
  682. mapping[(pos_tem, res_tem)] = (pos_cif, 'A')
  683. elif res_cif == 11 and res_tem == 'C':
  684. mod_pos[pos_cif] = 11
  685. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  686. elif res_cif == 12 and res_tem == 'L':
  687. mod_pos[pos_cif] = 12
  688. mapping[(pos_tem, res_tem)] = (pos_cif, 'L')
  689. elif res_cif == 13 and res_tem == 'A':
  690. mod_pos[pos_cif] = 13
  691. mapping[(pos_tem, res_tem)] = (pos_cif, 'A')
  692. elif res_cif == 14 and res_tem == 'E':
  693. mod_pos[pos_cif] = 14
  694. mapping[(pos_tem, res_tem)] = (pos_cif, 'E')
  695. elif res_cif == 15 and res_tem == 'L':
  696. mod_pos[pos_cif] = 15
  697. mapping[(pos_tem, res_tem)] = (pos_cif, 'L')
  698. elif res_cif == 16 and res_tem == 'M':
  699. mod_pos[pos_cif] = 16
  700. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  701. elif res_cif == 17 and res_tem == 'V':
  702. mod_pos[pos_cif] = 17
  703. mapping[(pos_tem, res_tem)] = (pos_cif, 'V')
  704. elif res_cif == 18 and res_tem == 'C':
  705. mod_pos[pos_cif] = 18
  706. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  707. elif res_cif == 19 and res_tem == 'P':
  708. mod_pos[pos_cif] = 19
  709. mapping[(pos_tem, res_tem)] = (pos_cif, 'P')
  710. elif res_cif == 20 and res_tem == 'V':
  711. mod_pos[pos_cif] = 20
  712. mapping[(pos_tem, res_tem)] = (pos_cif, 'V')
  713. elif res_cif == 21 and res_tem == 'Y':
  714. mod_pos[pos_cif] = 21
  715. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  716. elif res_cif == 22 and res_tem == 'K':
  717. mod_pos[pos_cif] = 22
  718. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  719. elif res_cif == 23 and res_tem == 'C':
  720. mod_pos[pos_cif] = 23
  721. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  722. elif res_cif == 24 and res_tem == 'K':
  723. mod_pos[pos_cif] = 24
  724. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  725. elif res_cif == 25 and res_tem == 'C':
  726. mod_pos[pos_cif] = 25
  727. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  728. elif res_cif == 26 and res_tem == 'C':
  729. mod_pos[pos_cif] = 26
  730. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  731. elif res_cif == 27 and res_tem == 'L':
  732. mod_pos[pos_cif] = 27
  733. mapping[(pos_tem, res_tem)] = (pos_cif, 'L')
  734. elif res_cif == 28 and res_tem == 'E':
  735. mod_pos[pos_cif] = 28
  736. mapping[(pos_tem, res_tem)] = (pos_cif, 'E')
  737. elif res_cif == 29 and res_tem == 'S':
  738. mod_pos[pos_cif] = 29
  739. mapping[(pos_tem, res_tem)] = (pos_cif, 'S')
  740. elif res_cif == 30 and res_tem == 'C':
  741. mod_pos[pos_cif] = 30
  742. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  743. elif res_cif == 31 and res_tem == 'K':
  744. mod_pos[pos_cif] = 31
  745. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  746. elif res_cif == 32 and res_tem == 'K':
  747. mod_pos[pos_cif] = 32
  748. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  749. elif res_cif == 33 and res_tem == 'F':
  750. mod_pos[pos_cif] = 33
  751. mapping[(pos_tem, res_tem)] = (pos_cif, 'F')
  752. elif res_cif == 34 and res_tem == 'R':
  753. mod_pos[pos_cif] = 34
  754. mapping[(pos_tem, res_tem)] = (pos_cif, 'R')
  755. elif res_cif == 35 and res_tem == 'F':
  756. mod_pos[pos_cif] = 35
  757. mapping[(pos_tem, res_tem)] = (pos_cif, 'F')
  758. elif res_cif == 36 and res_tem == 'D':
  759. mod_pos[pos_cif] = 36
  760. mapping[(pos_tem, res_tem)] = (pos_cif, 'D')
  761. elif res_cif == 37 and res_tem == 'D':
  762. mod_pos[pos_cif] = 37
  763. mapping[(pos_tem, res_tem)] = (pos_cif, 'D')
  764. elif res_cif == 38 and res_tem == 'H':
  765. mod_pos[pos_cif] = 38
  766. mapping[(pos_tem, res_tem)] = (pos_cif, 'H')
  767. elif res_cif == 39 and res_tem == 'P':
  768. mod_pos[pos_cif] = 39
  769. mapping[(pos_tem, res_tem)] = (pos_cif, 'P')
  770. elif res_cif == 40 and res_tem == 'T':
  771. mod_pos[pos_cif] = 40
  772. mapping[(pos_tem, res_tem)] = (pos_cif, 'T')
  773. elif res_cif == 41 and res_tem == 'I':
  774. mod_pos[pos_cif] = 41
  775. mapping[(pos_tem, res_tem)] = (pos_cif, 'I')
  776. elif res_cif == 42 and res_tem == 'N':
  777. mod_pos[pos_cif] = 42
  778. mapping[(pos_tem, res_tem)] = (pos_cif, 'N')
  779. elif res_cif == 43 and res_tem == 'Y':
  780. mod_pos[pos_cif] = 43
  781. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  782. elif res_cif == 44 and res_tem == 'M':
  783. mod_pos[pos_cif] = 44
  784. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  785. elif res_cif == 45 and res_tem == 'G':
  786. mod_pos[pos_cif] = 45
  787. mapping[(pos_tem, res_tem)] = (pos_cif, 'G')
  788. elif res_cif == 46 and res_tem == 'W':
  789. mod_pos[pos_cif] = 46
  790. mapping[(pos_tem, res_tem)] = (pos_cif, 'W')
  791. elif res_cif == 47 and res_tem == 'S':
  792. mod_pos[pos_cif] = 47
  793. mapping[(pos_tem, res_tem)] = (pos_cif, 'S')
  794. elif res_cif == 48 and res_tem == 'N':
  795. mod_pos[pos_cif] = 48
  796. mapping[(pos_tem, res_tem)] = (pos_cif, 'N')
  797. elif res_cif == 49 and res_tem == 'M':
  798. mod_pos[pos_cif] = 49
  799. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  800. elif res_cif == 50 and res_tem == 'A':
  801. mod_pos[pos_cif] = 50
  802. mapping[(pos_tem, res_tem)] = (pos_cif, 'A')
  803. elif res_cif == 51 and res_tem == 'Y':
  804. mod_pos[pos_cif] = 51
  805. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  806. elif res_cif == 52 and res_tem == 'P':
  807. mod_pos[pos_cif] = 52
  808. mapping[(pos_tem, res_tem)] = (pos_cif, 'P')
  809. elif res_cif == 53 and res_tem == 'V':
  810. mod_pos[pos_cif] = 53
  811. mapping[(pos_tem, res_tem)] = (pos_cif, 'V')
  812. elif res_cif == 54 and res_tem == 'L':
  813. mod_pos[pos_cif] = 54
  814. mapping[(pos_tem, res_tem)] = (pos_cif, 'L')
  815. elif res_cif == 55 and res_tem == 'Y':
  816. mod_pos[pos_cif] = 55
  817. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  818. elif res_cif == 56 and res_tem == 'E':
  819. mod_pos[pos_cif] = 56
  820. mapping[(pos_tem, res_tem)] = (pos_cif, 'E')
  821. elif res_cif == 57 and res_tem == 'F':
  822. mod_pos[pos_cif] = 57
  823. mapping[(pos_tem, res_tem)] = (pos_cif, 'F')
  824. elif res_cif == 58 and res_tem == 'C':
  825. mod_pos[pos_cif] = 58
  826. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  827. elif res_cif == 59 and res_tem == 'H':
  828. mod_pos[pos_cif] = 59
  829. mapping[(pos_tem, res_tem)] = (pos_cif, 'H')
  830. elif res_cif == 60 and res_tem == 'C':
  831. mod_pos[pos_cif] = 60
  832. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  833. elif res_cif == 61 and res_tem == 'Y':
  834. mod_pos[pos_cif] = 61
  835. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  836. elif res_cif == 62 and res_tem == 'C':
  837. mod_pos[pos_cif] = 62
  838. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  839. elif res_cif == 63 and res_tem == 'M':
  840. mod_pos[pos_cif] = 63
  841. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  842. elif res_cif == 64 and res_tem == 'M':
  843. mod_pos[pos_cif] = 64
  844. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  845. elif res_cif == 65 and res_tem == 'C':
  846. mod_pos[pos_cif] = 65
  847. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  848. elif res_cif == 66 and res_tem == 'Y':
  849. mod_pos[pos_cif] = 66
  850. mapping[(pos_tem, res_tem)] = (pos_cif, 'Y')
  851. elif res_cif == 67 and res_tem == 'S':
  852. mod_pos[pos_cif] = 67
  853. mapping[(pos_tem, res_tem)] = (pos_cif, 'S')
  854. elif res_cif == 68 and res_tem == 'C':
  855. mod_pos[pos_cif] = 68
  856. mapping[(pos_tem, res_tem)] = (pos_cif, 'C')
  857. elif res_cif == 69 and res_tem == 'L':
  858. mod_pos[pos_cif] = 69
  859. mapping[(pos_tem, res_tem)] = (pos_cif, 'L')
  860. elif res_cif == 70 and res_tem == 'M':
  861. mod_pos[pos_cif] = 70
  862. mapping[(pos_tem, res_tem)] = (pos_cif, 'M')
  863. elif res_cif == 71 and res_tem == 'K':
  864. mod_pos[pos_cif] = 71
  865. mapping[(pos_tem, res_tem)] = (pos_cif, 'K')
  866. else:
  867. # insert a gap
  868. template_grid.set_empty(row, pos_tem)
  869. mapping[(pos_tem, res_tem)] = (pos_cif, res_cif)
  870. if DEBUG_MODE:
  871. print ('! {p} {c}: template pos {pt} ({rt}) does not match cif pos {pc} ({rc}). Replacing with gap.'.format(
  872. p = pdb_code,
  873. c = chain,
  874. pt = pos_tem,
  875. rt = res_tem,
  876. pc = pos_cif,
  877. rc = res_cif if res_cif != -1 else 'DNE'))
  878. if res_cif != -1:
  879. del_pos.append(pos_cif)
  880. else:
  881. mapping[(pos_tem, res_tem)] = (pos_cif, res_cif)
  882. # adjust template start and end positions
  883. correct_mapping = { key:value for key, value in mapping.items() if key[1] == value[1] }
  884. try:
  885. tstart = correct_mapping[sorted(correct_mapping.keys())[0]][0]
  886. tend = correct_mapping[sorted(correct_mapping.keys())[-1]][0]
  887. template_grid.set_map(row, tstart, tend)
  888. except IndexError:
  889. # This exception handles cases in which all residues were deleted
  890. if DEBUG_MODE:
  891. print ('! {p} {c}: Removing {p} from alignment. No residues matched the alignment sequence.'.format(
  892. p = pdb_code,
  893. c = chain))
  894. del_row.append(row)
  895. continue
  896. ########################################################################
  897. ## Delete rows from the PIR Alignment if the residue ratio is to low ##
  898. ########################################################################
  899. if threshold > 0:
  900. gaps = 0
  901. res = 0
  902. for col in range(template_grid.get_grid_width()):
  903. if template_grid.is_empty(row, col):
  904. template_grid.set_gap(row, col)
  905. if template_grid.is_gap(row, col):
  906. gaps += 1
  907. else:
  908. res += 1
  909. ratio = res/float(gaps + res)
  910. if ratio > threshold:
  911. print ('! Template {p} successfully passed residue ratio ({r:.2f} / {t}).'.format(
  912. p = pdb_code,
  913. r = ratio,
  914. t = threshold ))
  915. else:
  916. print ('! Template {p} did not passed residue ratio ({r:.2f} / {t}). Removing it from pir Alignment.'.format(
  917. p = pdb_code,
  918. r = ratio,
  919. t = threshold ))
  920. if row not in del_row:
  921. del_row.append(row)
  922. continue
  923. ########################################################################
  924. ## Edit cif files ##
  925. ########################################################################
  926. rem_row = list() # verbosity: saves information about removed residues
  927. mod_row = list() # verbosity: saves information about modified residues
  928. cha_row = list() # verbosity: saves any other changes
  929. for atom_row in reversed(range(0, atom_site.getRowCount())):
  930. try:
  931. cif_chain = atom_site.getValue('label_asym_id', atom_row)
  932. except IndexError:
  933. pass
  934. # We do not care about the residues apart from the chain
  935. if cif_chain != chain:
  936. continue
  937. # get the residue number
  938. try:
  939. res_num = int(atom_site.getValue("label_seq_id", atom_row))
  940. except ValueError:
  941. continue
  942. # pdb_PDB_model_num has to be set to 1
  943. try:
  944. model_num = int(atom_site.getValue('pdbx_PDB_model_num', atom_row))
  945. except IndexError:
  946. model_num = 1 # if we cannot extract, assume that it is alright
  947. try:
  948. ins_code = atom_site.getValue('pdbx_PDB_ins_code', atom_row)
  949. except IndexError:
  950. ins_code = '?' # assume it has no insertion code
  951. group_PDB = atom_site.getValue('group_PDB', atom_row)
  952. residue = atom_site.getValue('label_comp_id', atom_row)
  953. residue = convert_aa_code(residue, convert)
  954. # MODELLER accepts only structures if pdbx_PDB_model_num is set to 1
  955. if model_num != 1:
  956. if (res_num, residue, 'model_num') not in cha_row:
  957. cha_row.append((res_num, residue, 'model_num'))
  958. atom_site.setValue(1, "pdbx_PDB_model_num", atom_row)
  959. if ins_code != '?':
  960. if (res_num, residue, 'ins_code') not in cha_row:
  961. cha_row.append((res_num, residue, 'ins_code'))
  962. atom_site.setValue('?', "pdbx_PDB_ins_code", atom_row)
  963. if group_PDB != 'ATOM':
  964. if (res_num, residue, 'group_PDB') not in cha_row:
  965. cha_row.append((res_num, residue, 'group_PDB'))
  966. atom_site.setValue('ATOM', 'group_PDB', atom_row)
  967. ########################################################################
  968. ## Delete residues ##
  969. ########################################################################
  970. if res_num in del_pos:
  971. if (res_num, residue) not in rem_row:
  972. rem_row.append((res_num, residue))
  973. atom_site.removeRow(atom_row)
  974. ########################################################################
  975. ## Modify residues ##
  976. ########################################################################
  977. if res_num in mod_pos.keys():
  978. # Get the data
  979. type_symbol = atom_site.getValue('type_symbol', atom_row)
  980. label_atom_id = atom_site.getValue('label_atom_id', atom_row)
  981. auth_atom_id = atom_site.getValue('auth_atom_id', atom_row)
  982. if mod_pos[res_num] == 1: # try to convert MSE to M
  983. atom_site.setValue('MET', 'label_comp_id', atom_row)
  984. try:
  985. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  986. except IndexError:
  987. pass
  988. if type_symbol == 'SE':
  989. atom_site.setValue('S', 'type_symbol', atom_row)
  990. if label_atom_id == 'SE':
  991. atom_site.setValue('S', 'label_atom_id', atom_row)
  992. if auth_atom_id == 'SE':
  993. atom_site.setValue('S', 'auth_atom_id', atom_row)
  994. if (res_num, residue, 'MSE -> MET') not in mod_row:
  995. mod_row.append((res_num, residue, 'MSE -> MET'))
  996. elif mod_pos[res_num] == 2: # try to convert HYP to PRO
  997. # apparently it is enough to rename the label_comp_id to PRO to get
  998. # MODELLER working with Hydroxyprolines (HYP)
  999. atom_site.setValue('PRO', 'label_comp_id', atom_row)
  1000. try:
  1001. atom_site.setValue('PRO', 'auth_comp_id', atom_row)
  1002. except IndexError:
  1003. pass
  1004. if (res_num, residue, 'HYP -> PRO') not in mod_row:
  1005. mod_row.append((res_num, residue, 'HYP -> PRO'))
  1006. elif mod_pos[res_num] == 3: # try to convert MLY to LYS
  1007. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1008. try:
  1009. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1010. except IndexError:
  1011. pass
  1012. if (res_num, residue, 'MLY -> LYS') not in mod_row:
  1013. mod_row.append((res_num, residue, 'MLY -> LYS'))
  1014. elif mod_pos[res_num] == 4: # converts Phosphoserine to Serine
  1015. atom_site.setValue('SER', 'label_comp_id', atom_row)
  1016. try:
  1017. atom_site.setValue('SER', 'auth_comp_id', atom_row)
  1018. except IndexError:
  1019. pass
  1020. if (res_num, residue, 'SEP -> SER') not in mod_row:
  1021. mod_row.append((res_num, residue, 'SEP -> SER'))
  1022. elif mod_pos[res_num] == 5: # converts Phosphothreonine to Threonine
  1023. atom_site.setValue('THR', 'label_comp_id', atom_row)
  1024. try:
  1025. atom_site.setValue('THR', 'auth_comp_id', atom_row)
  1026. except IndexError:
  1027. pass
  1028. if (res_num, residue, 'TPO -> THR') not in mod_row:
  1029. mod_row.append((res_num, residue, 'TPO -> THR'))
  1030. elif mod_pos[res_num] == 6: # converts S-HYDROXYCYSTEINE to Cysteine
  1031. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1032. try:
  1033. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1034. except IndexError:
  1035. pass
  1036. if (res_num, residue, 'CSO -> CYS') not in mod_row:
  1037. mod_row.append((res_num, residue, 'CSO -> CYS'))
  1038. elif mod_pos[res_num] == 7: # converts O-PHOSPHOTYROSINE to Tyrosine
  1039. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1040. try:
  1041. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1042. except IndexError:
  1043. pass
  1044. if (res_num, residue, 'PTR -> TYR') not in mod_row:
  1045. mod_row.append((res_num, residue, 'PTR -> TYR'))
  1046. elif mod_pos[res_num] == 8: # converts LYSINE NZ-CARBOXYLIC ACID to Lysine
  1047. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1048. try:
  1049. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1050. except IndexError:
  1051. pass
  1052. if (res_num, residue, 'KCX -> LYS') not in mod_row:
  1053. mod_row.append((res_num, residue, 'KCX -> LYS'))
  1054. elif mod_pos[res_num] == 9: # converts S,S-(2-HYDROXYETHYL)THIOCYSTEINE to Cysteine
  1055. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1056. try:
  1057. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1058. except IndexError:
  1059. pass
  1060. if (res_num, residue, 'CME -> CYS') not in mod_row:
  1061. mod_row.append((res_num, residue, 'CME -> CYS'))
  1062. elif mod_pos[res_num] == 10: # converts 3-SULFINOALANINE to Alanine
  1063. atom_site.setValue('ALA', 'label_comp_id', atom_row)
  1064. try:
  1065. atom_site.setValue('ALA', 'auth_comp_id', atom_row)
  1066. except IndexError:
  1067. pass
  1068. if (res_num, residue, 'CSD -> ALA') not in mod_row:
  1069. mod_row.append((res_num, residue, 'CSD -> ALA'))
  1070. elif mod_pos[res_num] == 11: # converts S-(DIMETHYLARSENIC)CYSTEINE to Cysteine
  1071. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1072. try:
  1073. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1074. except IndexError:
  1075. pass
  1076. if (res_num, residue, 'CAS -> CYS') not in mod_row:
  1077. mod_row.append((res_num, residue, 'CAS -> CYS'))
  1078. elif mod_pos[res_num] == 12: # converts N-METHYLLEUCINE (MLE) to Leucine
  1079. atom_site.setValue('LEU', 'label_comp_id', atom_row)
  1080. try:
  1081. atom_site.setValue('LEU', 'auth_comp_id', atom_row)
  1082. except IndexError:
  1083. pass
  1084. if (res_num, residue, 'MLE -> LEU') not in mod_row:
  1085. mod_row.append((res_num, residue, 'MLE -> LEU'))
  1086. elif mod_pos[res_num] == 13: # converts D-ALANINE (DAL) to ALA
  1087. atom_site.setValue('ALA', 'label_comp_id', atom_row)
  1088. try:
  1089. atom_site.setValue('ALA', 'auth_comp_id', atom_row)
  1090. except IndexError:
  1091. pass
  1092. if (res_num, residue, 'DAL -> ALA') not in mod_row:
  1093. mod_row.append((res_num, residue, 'DAL -> ALA'))
  1094. elif mod_pos[res_num] == 14: # converts GAMMA-CARBOXY-GLUTAMIC ACID (CGU) to GLU
  1095. atom_site.setValue('GLU', 'label_comp_id', atom_row)
  1096. try:
  1097. atom_site.setValue('GLU', 'auth_comp_id', atom_row)
  1098. except IndexError:
  1099. pass
  1100. if (res_num, residue, 'CGU -> GLU') not in mod_row:
  1101. mod_row.append((res_num, residue, 'CGU -> GLU'))
  1102. elif mod_pos[res_num] == 15: # converts D-LEUCINE (DLE) to LEU
  1103. atom_site.setValue('LEU', 'label_comp_id', atom_row)
  1104. try:
  1105. atom_site.setValue('LEU', 'auth_comp_id', atom_row)
  1106. except IndexError:
  1107. pass
  1108. if (res_num, residue, 'DLE -> LEU') not in mod_row:
  1109. mod_row.append((res_num, residue, 'DLE -> LEU'))
  1110. elif mod_pos[res_num] == 16: # converts N-FORMYLMETHIONINE (FME) to MET
  1111. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1112. try:
  1113. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1114. except IndexError:
  1115. pass
  1116. if (res_num, residue, 'FME -> MET') not in mod_row:
  1117. mod_row.append((res_num, residue, 'FME -> MET'))
  1118. elif mod_pos[res_num] == 17: # converts D-VAL (DVA) to VAL
  1119. atom_site.setValue('VAL', 'label_comp_id', atom_row)
  1120. try:
  1121. atom_site.setValue('VAL', 'auth_comp_id', atom_row)
  1122. except IndexError:
  1123. pass
  1124. if (res_num, residue, 'DVA -> VAL') not in mod_row:
  1125. mod_row.append((res_num, residue, 'DVA -> VAL'))
  1126. elif mod_pos[res_num] == 18: # converts CYSTEINESULFONIC ACID (OCS) to CYS
  1127. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1128. try:
  1129. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1130. except IndexError:
  1131. pass
  1132. if (res_num, residue, 'OCS -> CYS') not in mod_row:
  1133. mod_row.append((res_num, residue, 'OCS -> CYS'))
  1134. elif mod_pos[res_num] == 19: # converts D-PROLINE (DPR) to PRO
  1135. atom_site.setValue('PRO', 'label_comp_id', atom_row)
  1136. try:
  1137. atom_site.setValue('PRO', 'auth_comp_id', atom_row)
  1138. except IndexError:
  1139. pass
  1140. if (res_num, residue, 'DPR -> PRO') not in mod_row:
  1141. mod_row.append((res_num, residue, 'DPR -> PRO'))
  1142. elif mod_pos[res_num] == 20: # converts N-METHYLVALINE (MVA) to VAL
  1143. atom_site.setValue('VAL', 'label_comp_id', atom_row)
  1144. try:
  1145. atom_site.setValue('VAL', 'auth_comp_id', atom_row)
  1146. except IndexError:
  1147. pass
  1148. if (res_num, residue, 'MVA -> VAL') not in mod_row:
  1149. mod_row.append((res_num, residue, 'MVA -> VAL'))
  1150. elif mod_pos[res_num] == 21: # converts O-SULFO-L-TYROSINE (TYS) to VAL
  1151. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1152. try:
  1153. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1154. except IndexError:
  1155. pass
  1156. if (res_num, residue, 'TYS -> TYR') not in mod_row:
  1157. mod_row.append((res_num, residue, 'TYS -> TYR'))
  1158. elif mod_pos[res_num] == 22: # converts N-TRIMETHYLLYSINE (M3L) to LYS
  1159. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1160. try:
  1161. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1162. except IndexError:
  1163. pass
  1164. if (res_num, residue, 'M3L -> LYS') not in mod_row:
  1165. mod_row.append((res_num, residue, 'M3L -> LYS'))
  1166. elif mod_pos[res_num] == 23: # converts S-METHYLCYSTEINE (SMC) to CYS
  1167. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1168. try:
  1169. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1170. except IndexError:
  1171. pass
  1172. if (res_num, residue, 'SMC -> CYS') not in mod_row:
  1173. mod_row.append((res_num, residue, 'SMC -> CYS'))
  1174. elif mod_pos[res_num] == 24: # converts N(6)-ACETYLLYSINE (ALY) to LYS
  1175. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1176. try:
  1177. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1178. except IndexError:
  1179. pass
  1180. if (res_num, residue, 'ALY -> LYS') not in mod_row:
  1181. mod_row.append((res_num, residue, 'ALY -> LYS'))
  1182. elif mod_pos[res_num] == 25: # converts S-OXY CYSTEINE (CSX) to CYS
  1183. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1184. try:
  1185. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1186. except IndexError:
  1187. pass
  1188. if (res_num, residue, 'CSX -> CYS') not in mod_row:
  1189. mod_row.append((res_num, residue, 'CSX -> CYS'))
  1190. elif mod_pos[res_num] == 26: # converts D-CYSTEINE (DCY) to CYS
  1191. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1192. try:
  1193. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1194. except IndexError:
  1195. pass
  1196. if (res_num, residue, 'DCY -> CYS') not in mod_row:
  1197. mod_row.append((res_num, residue, 'DCY -> CYS'))
  1198. elif mod_pos[res_num] == 27: # converts NORLEUCINE (NLE) to LEU
  1199. atom_site.setValue('LEU', 'label_comp_id', atom_row)
  1200. try:
  1201. atom_site.setValue('LEU', 'auth_comp_id', atom_row)
  1202. except IndexError:
  1203. pass
  1204. if (res_num, residue, 'NLE -> LEU') not in mod_row:
  1205. mod_row.append((res_num, residue, 'NLE -> LEU'))
  1206. elif mod_pos[res_num] == 28: # converts D-GLUTAMIC ACID (DGL) to GLU
  1207. atom_site.setValue('GLU', 'label_comp_id', atom_row)
  1208. try:
  1209. atom_site.setValue('GLU', 'auth_comp_id', atom_row)
  1210. except IndexError:
  1211. pass
  1212. if (res_num, residue, 'DGL -> GLU') not in mod_row:
  1213. mod_row.append((res_num, residue, 'DGL -> GLU'))
  1214. elif mod_pos[res_num] == 29: # converts D-SERINE (DSN) to SER
  1215. atom_site.setValue('SER', 'label_comp_id', atom_row)
  1216. try:
  1217. atom_site.setValue('SER', 'auth_comp_id', atom_row)
  1218. except IndexError:
  1219. pass
  1220. if (res_num, residue, 'DSN -> SER') not in mod_row:
  1221. mod_row.append((res_num, residue, 'DSN -> SER'))
  1222. elif mod_pos[res_num] == 30: # converts S-MERCAPTOCYSTEINE (CSS) to CYS
  1223. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1224. try:
  1225. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1226. except IndexError:
  1227. pass
  1228. if (res_num, residue, 'CSS -> CYS') not in mod_row:
  1229. mod_row.append((res_num, residue, 'CSS -> CYS'))
  1230. elif mod_pos[res_num] == 31: # converts D-LYSINE (DLY) to LYS
  1231. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1232. try:
  1233. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1234. except IndexError:
  1235. pass
  1236. if (res_num, residue, 'DLY -> LYS') not in mod_row:
  1237. mod_row.append((res_num, residue, 'DLY -> LYS'))
  1238. elif mod_pos[res_num] == 32: # converts N-METHYL-LYSINE (MLZ) to LYS
  1239. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1240. try:
  1241. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1242. except IndexError:
  1243. pass
  1244. if (res_num, residue, 'MLZ -> LYS') not in mod_row:
  1245. mod_row.append((res_num, residue, 'MLZ -> LYS'))
  1246. elif mod_pos[res_num] == 33: # converts D-PHENYLALANINE (DPN) to PHE
  1247. atom_site.setValue('PHE', 'label_comp_id', atom_row)
  1248. try:
  1249. atom_site.setValue('PHE', 'auth_comp_id', atom_row)
  1250. except IndexError:
  1251. pass
  1252. if (res_num, residue, 'DPN -> PHE') not in mod_row:
  1253. mod_row.append((res_num, residue, 'DPN -> PHE'))
  1254. elif mod_pos[res_num] == 34: # converts D-ARGININE (DAR) to ARG
  1255. atom_site.setValue('ARG', 'label_comp_id', atom_row)
  1256. try:
  1257. atom_site.setValue('ARG', 'auth_comp_id', atom_row)
  1258. except IndexError:
  1259. pass
  1260. if (res_num, residue, 'DAR -> ARG') not in mod_row:
  1261. mod_row.append((res_num, residue, 'DAR -> ARG'))
  1262. elif mod_pos[res_num] == 35: # converts IODO-PHENYLALANINE (PHI) to PHE
  1263. atom_site.setValue('PHE', 'label_comp_id', atom_row)
  1264. try:
  1265. atom_site.setValue('PHE', 'auth_comp_id', atom_row)
  1266. except IndexError:
  1267. pass
  1268. if (res_num, residue, 'PHI -> PHE') not in mod_row:
  1269. mod_row.append((res_num, residue, 'PHI -> PHE'))
  1270. elif mod_pos[res_num] == 36: # converts BETA-L-ASPARTIC ACID (IAS) to ASP
  1271. atom_site.setValue('ASP', 'label_comp_id', atom_row)
  1272. try:
  1273. atom_site.setValue('ASP', 'auth_comp_id', atom_row)
  1274. except IndexError:
  1275. pass
  1276. if (res_num, residue, 'IAS -> ASP') not in mod_row:
  1277. mod_row.append((res_num, residue, 'IAS -> ASP'))
  1278. elif mod_pos[res_num] == 37: # converts D-ASPARTIC ACID (DAS) to ASP
  1279. atom_site.setValue('ASP', 'label_comp_id', atom_row)
  1280. try:
  1281. atom_site.setValue('ASP', 'auth_comp_id', atom_row)
  1282. except IndexError:
  1283. pass
  1284. if (res_num, residue, 'DAS -> ASP') not in mod_row:
  1285. mod_row.append((res_num, residue, 'DAS -> ASP'))
  1286. elif mod_pos[res_num] == 38: # converts 4-METHYL-HISTIDINE (HIC) to HIS
  1287. atom_site.setValue('HIS', 'label_comp_id', atom_row)
  1288. try:
  1289. atom_site.setValue('HIS', 'auth_comp_id', atom_row)
  1290. except IndexError:
  1291. pass
  1292. if (res_num, residue, 'HIC -> HIS') not in mod_row:
  1293. mod_row.append((res_num, residue, 'HIC -> HIS'))
  1294. elif mod_pos[res_num] == 39: # converts (4R)-4-methyl-L-proline (MP8) to PRO
  1295. atom_site.setValue('PRO', 'label_comp_id', atom_row)
  1296. try:
  1297. atom_site.setValue('PRO', 'auth_comp_id', atom_row)
  1298. except IndexError:
  1299. pass
  1300. if (res_num, residue, 'MP8 -> PRO') not in mod_row:
  1301. mod_row.append((res_num, residue, 'MP8 -> PRO'))
  1302. elif mod_pos[res_num] == 40: # converts D-THREONINE (DTH) to THR
  1303. atom_site.setValue('THR', 'label_comp_id', atom_row)
  1304. try:
  1305. atom_site.setValue('THR', 'auth_comp_id', atom_row)
  1306. except IndexError:
  1307. pass
  1308. if (res_num, residue, 'DTH -> THR') not in mod_row:
  1309. mod_row.append((res_num, residue, 'DTH -> THR'))
  1310. elif mod_pos[res_num] == 41: # converts D-ISOLEUCINE (DIL) to ILE
  1311. atom_site.setValue('ILE', 'label_comp_id', atom_row)
  1312. try:
  1313. atom_site.setValue('ILE', 'auth_comp_id', atom_row)
  1314. except IndexError:
  1315. pass
  1316. if (res_num, residue, 'DIL -> ILE') not in mod_row:
  1317. mod_row.append((res_num, residue, 'DIL -> ILE'))
  1318. elif mod_pos[res_num] == 42: # converts N-METHYL ASPARAGINE (MEN) to ASN
  1319. atom_site.setValue('ASN', 'label_comp_id', atom_row)
  1320. try:
  1321. atom_site.setValue('ASN', 'auth_comp_id', atom_row)
  1322. except IndexError:
  1323. pass
  1324. if (res_num, residue, 'MEN -> ASN') not in mod_row:
  1325. mod_row.append((res_num, residue, 'MEN -> ASN'))
  1326. elif mod_pos[res_num] == 43: # converts D-TYROSINE (DTY) to TYR
  1327. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1328. try:
  1329. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1330. except IndexError:
  1331. pass
  1332. if (res_num, residue, 'DTY -> TYR') not in mod_row:
  1333. mod_row.append((res_num, residue, 'DTY -> TYR'))
  1334. elif mod_pos[res_num] == 44: # converts N-CARBOXYMETHIONINE (CXM) to MET
  1335. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1336. try:
  1337. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1338. except IndexError:
  1339. pass
  1340. if (res_num, residue, 'CXM -> MET') not in mod_row:
  1341. mod_row.append((res_num, residue, 'CXM -> MET'))
  1342. elif mod_pos[res_num] == 45: # converts D-GLUTAMINE (DGN) to MET
  1343. atom_site.setValue('GLN', 'label_comp_id', atom_row)
  1344. try:
  1345. atom_site.setValue('GLN', 'auth_comp_id', atom_row)
  1346. except IndexError:
  1347. pass
  1348. if (res_num, residue, 'DGN -> GLN') not in mod_row:
  1349. mod_row.append((res_num, residue, 'DGN -> GLN'))
  1350. elif mod_pos[res_num] == 46: # converts D-TRYPTOPHAN (DTR) to TRP
  1351. atom_site.setValue('TRP', 'label_comp_id', atom_row)
  1352. try:
  1353. atom_site.setValue('TRP', 'auth_comp_id', atom_row)
  1354. except IndexError:
  1355. pass
  1356. if (res_num, residue, 'DTR -> TRP') not in mod_row:
  1357. mod_row.append((res_num, residue, 'DTR -> TRP'))
  1358. elif mod_pos[res_num] == 47: # converts N-ACETYL-SERINE (SAC) to SER
  1359. atom_site.setValue('SER', 'label_comp_id', atom_row)
  1360. try:
  1361. atom_site.setValue('SER', 'auth_comp_id', atom_row)
  1362. except IndexError:
  1363. pass
  1364. if (res_num, residue, 'SAC -> SER') not in mod_row:
  1365. mod_row.append((res_num, residue, 'SAC -> SER'))
  1366. elif mod_pos[res_num] == 48: # converts D-ASPARAGINE (DSG) to ASN
  1367. atom_site.setValue('ASN', 'label_comp_id', atom_row)
  1368. try:
  1369. atom_site.setValue('ASN', 'auth_comp_id', atom_row)
  1370. except IndexError:
  1371. pass
  1372. if (res_num, residue, 'DSG -> ASN') not in mod_row:
  1373. mod_row.append((res_num, residue, 'DSG -> ASN'))
  1374. elif mod_pos[res_num] == 49: # converts N-METHYL METHIONINE (MME) to MET
  1375. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1376. try:
  1377. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1378. except IndexError:
  1379. pass
  1380. if (res_num, residue, 'MME -> MET') not in mod_row:
  1381. mod_row.append((res_num, residue, 'MME -> MET'))
  1382. elif mod_pos[res_num] == 50: # converts N-methyl-L-alanine (MAA) to ALA
  1383. atom_site.setValue('ALA', 'label_comp_id', atom_row)
  1384. try:
  1385. atom_site.setValue('ALA', 'auth_comp_id', atom_row)
  1386. except IndexError:
  1387. pass
  1388. if (res_num, residue, 'MAA -> ALA') not in mod_row:
  1389. mod_row.append((res_num, residue, 'MAA -> ALA'))
  1390. elif mod_pos[res_num] == 51: # converts 3-FLUOROTYROSINE (YOF) to TYR
  1391. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1392. try:
  1393. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1394. except IndexError:
  1395. pass
  1396. if (res_num, residue, 'YOF -> TYR') not in mod_row:
  1397. mod_row.append((res_num, residue, 'YOF -> TYR'))
  1398. elif mod_pos[res_num] == 52: # converts (4R)-4-fluoro-L-proline (FP9) to PRO
  1399. atom_site.setValue('PRO', 'label_comp_id', atom_row)
  1400. try:
  1401. atom_site.setValue('PRO', 'auth_comp_id', atom_row)
  1402. except IndexError:
  1403. pass
  1404. if (res_num, residue, 'FP9 -> PRO') not in mod_row:
  1405. mod_row.append((res_num, residue, 'FP9 -> PRO'))
  1406. elif mod_pos[res_num] == 53: # converts N-formyl-L-valine (FVA) to VAL
  1407. atom_site.setValue('VAL', 'label_comp_id', atom_row)
  1408. try:
  1409. atom_site.setValue('VAL', 'auth_comp_id', atom_row)
  1410. except IndexError:
  1411. pass
  1412. if (res_num, residue, 'FVA -> VAL') not in mod_row:
  1413. mod_row.append((res_num, residue, 'FVA -> VAL'))
  1414. elif mod_pos[res_num] == 54: # converts N-methyl-D-leucine (MLU) to LEU
  1415. atom_site.setValue('LEU', 'label_comp_id', atom_row)
  1416. try:
  1417. atom_site.setValue('LEU', 'auth_comp_id', atom_row)
  1418. except IndexError:
  1419. pass
  1420. if (res_num, residue, 'MLU -> LEU') not in mod_row:
  1421. mod_row.append((res_num, residue, 'MLU -> LEU'))
  1422. elif mod_pos[res_num] == 55: # converts (betaR)-3-chloro-beta-hydroxy-L-tyrosine (OMY) to TYR
  1423. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1424. try:
  1425. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1426. except IndexError:
  1427. pass
  1428. if (res_num, residue, 'OMY -> TYR') not in mod_row:
  1429. mod_row.append((res_num, residue, 'OMY -> TYR'))
  1430. elif mod_pos[res_num] == 56: # converts GAMMA-D-GLUTAMIC ACID (FGA) to GLU
  1431. atom_site.setValue('GLU', 'label_comp_id', atom_row)
  1432. try:
  1433. atom_site.setValue('GLU', 'auth_comp_id', atom_row)
  1434. except IndexError:
  1435. pass
  1436. if (res_num, residue, 'FGA -> GLU') not in mod_row:
  1437. mod_row.append((res_num, residue, 'FGA -> GLU'))
  1438. elif mod_pos[res_num] == 57: # converts N-METHYLPHENYLALANINE (MEA) to PHE
  1439. atom_site.setValue('PHE', 'label_comp_id', atom_row)
  1440. try:
  1441. atom_site.setValue('PHE', 'auth_comp_id', atom_row)
  1442. except IndexError:
  1443. pass
  1444. if (res_num, residue, 'MEA -> PHE') not in mod_row:
  1445. mod_row.append((res_num, residue, 'MEA -> PHE'))
  1446. elif mod_pos[res_num] == 58: # converts S-(METHYLMERCURY)-L-CYSTEINE (CMH) to CYS
  1447. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1448. try:
  1449. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1450. except IndexError:
  1451. pass
  1452. if (res_num, residue, 'CMH -> CYS') not in mod_row:
  1453. mod_row.append((res_num, residue, 'CMH -> CYS'))
  1454. elif mod_pos[res_num] == 59: # converts D-HISTIDINE (DHI) to HIS
  1455. atom_site.setValue('HIS', 'label_comp_id', atom_row)
  1456. try:
  1457. atom_site.setValue('HIS', 'auth_comp_id', atom_row)
  1458. except IndexError:
  1459. pass
  1460. if (res_num, residue, 'DHI -> HIS') not in mod_row:
  1461. mod_row.append((res_num, residue, 'DHI -> HIS'))
  1462. elif mod_pos[res_num] == 60: # converts SELENOCYSTEINE (SEC) to CYS
  1463. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1464. try:
  1465. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1466. except IndexError:
  1467. pass
  1468. if (res_num, residue, 'SEC -> CYS') not in mod_row:
  1469. mod_row.append((res_num, residue, 'SEC -> CYS'))
  1470. elif mod_pos[res_num] == 61: # converts (betaR)-3-CHLORO-BETA-HYDROXY-D-TYROSINE (OMZ) to TYR
  1471. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1472. try:
  1473. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1474. except IndexError:
  1475. pass
  1476. if (res_num, residue, 'OMZ -> TYR') not in mod_row:
  1477. mod_row.append((res_num, residue, 'OMZ -> TYR'))
  1478. elif mod_pos[res_num] == 62: # converts S-ACETYL-CYSTEINE (SCY) to CYS
  1479. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1480. try:
  1481. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1482. except IndexError:
  1483. pass
  1484. if (res_num, residue, 'SCY -> CYS') not in mod_row:
  1485. mod_row.append((res_num, residue, 'SCY -> CYS'))
  1486. elif mod_pos[res_num] == 63: # converts S-OXYMETHIONINE (MHO) to MET
  1487. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1488. try:
  1489. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1490. except IndexError:
  1491. pass
  1492. if (res_num, residue, 'MHO -> MET') not in mod_row:
  1493. mod_row.append((res_num, residue, 'MHO -> MET'))
  1494. elif mod_pos[res_num] == 64: # converts D-METHIONINE (MED) to MET
  1495. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1496. try:
  1497. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1498. except IndexError:
  1499. pass
  1500. if (res_num, residue, 'MED -> MET') not in mod_row:
  1501. mod_row.append((res_num, residue, 'MED -> MET'))
  1502. elif mod_pos[res_num] == 65: # converts S-DIMETHYLARSINOYL-CYSTEINE (CAF) to CYS
  1503. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1504. try:
  1505. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1506. except IndexError:
  1507. pass
  1508. if (res_num, residue, 'CAF -> CYS') not in mod_row:
  1509. mod_row.append((res_num, residue, 'CAF -> CYS'))
  1510. elif mod_pos[res_num] == 66: # converts META-NITRO-TYROSINE (NIY) to TYR
  1511. atom_site.setValue('TYR', 'label_comp_id', atom_row)
  1512. try:
  1513. atom_site.setValue('TYR', 'auth_comp_id', atom_row)
  1514. except IndexError:
  1515. pass
  1516. if (res_num, residue, 'NIY -> TYR') not in mod_row:
  1517. mod_row.append((res_num, residue, 'NIY -> TYR'))
  1518. elif mod_pos[res_num] == 67: # converts O-ACETYLSERINE (OAS) to SER
  1519. atom_site.setValue('SER', 'label_comp_id', atom_row)
  1520. try:
  1521. atom_site.setValue('SER', 'auth_comp_id', atom_row)
  1522. except IndexError:
  1523. pass
  1524. if (res_num, residue, 'OAS -> SER') not in mod_row:
  1525. mod_row.append((res_num, residue, 'OAS -> SER'))
  1526. elif mod_pos[res_num] == 68: # converts S-METHYL-THIO-CYSTEINE (SCH) to CYS
  1527. atom_site.setValue('CYS', 'label_comp_id', atom_row)
  1528. try:
  1529. atom_site.setValue('CYS', 'auth_comp_id', atom_row)
  1530. except IndexError:
  1531. pass
  1532. if (res_num, residue, 'SCH -> CYS') not in mod_row:
  1533. mod_row.append((res_num, residue, 'SCH -> CYS'))
  1534. elif mod_pos[res_num] == 69: # converts 2-methyl-L-norleucine (MK8) to LEU
  1535. atom_site.setValue('LEU', 'label_comp_id', atom_row)
  1536. try:
  1537. atom_site.setValue('LEU', 'auth_comp_id', atom_row)
  1538. except IndexError:
  1539. pass
  1540. if (res_num, residue, 'MK8 -> LEU') not in mod_row:
  1541. mod_row.append((res_num, residue, 'MK8 -> LEU'))
  1542. elif mod_pos[res_num] == 70: # converts METHIONINE SULFOXIDE (SME) to MET
  1543. atom_site.setValue('MET', 'label_comp_id', atom_row)
  1544. try:
  1545. atom_site.setValue('MET', 'auth_comp_id', atom_row)
  1546. except IndexError:
  1547. pass
  1548. if (res_num, residue, 'SME -> MET') not in mod_row:
  1549. mod_row.append((res_num, residue, 'SME -> MET'))
  1550. elif mod_pos[res_num] == 71: # converts 5-HYDROXYLYSINE (LYZ) to LYS
  1551. atom_site.setValue('LYS', 'label_comp_id', atom_row)
  1552. try:
  1553. atom_site.setValue('LYS', 'auth_comp_id', atom_row)
  1554. except IndexError:
  1555. pass
  1556. if (res_num, residue, 'LYZ -> LYS') not in mod_row:
  1557. mod_row.append((res_num, residue, 'LYZ -> LYS'))
  1558. ########################################################################
  1559. ## Notify user about modification made to cif data ##
  1560. ########################################################################
  1561. if DEBUG_MODE:
  1562. mod_model_num = len([ msg for msg in cha_row if msg[2] == 'model_num' ])
  1563. mod_ins_code = len([ msg for msg in cha_row if msg[2] == 'ins_code' ])
  1564. mod_group_PDB = len([ msg for msg in cha_row if msg[2] == 'group_PDB' ])
  1565. if mod_model_num != 0:
  1566. print ('! {p} {c}: modified atom_site.pdbx_PDB_model_num for {cr} residues to 1.'.format(
  1567. p = pdb_code,
  1568. c = chain,
  1569. cr = mod_model_num))
  1570. if mod_ins_code != 0:
  1571. print ('! {p} {c}: modified atom_site.pdbx_PDB_ins_code for {cr} residues to "?".'.format(
  1572. p = pdb_code,
  1573. c = chain,
  1574. cr = mod_ins_code))
  1575. if mod_group_PDB != 0:
  1576. print ('! {p} {c}: modified atom_site.group_PDB for {cr} residues to "ATOM".'.format(
  1577. p = pdb_code,
  1578. c = chain,
  1579. cr = mod_group_PDB))
  1580. for residue in reversed(mod_row):
  1581. print ('! {p} {c}: modified cif pos {cr} ({nr}).'.format(
  1582. p = pdb_code,
  1583. c = chain,
  1584. cr = residue[0],
  1585. ca = residue[1],
  1586. nr = residue[2]))
  1587. for residue in reversed(rem_row):
  1588. print ('! {p} {c}: removed cif pos {cr} ({ca})'.format(
  1589. p = pdb_code,
  1590. c = chain,
  1591. cr = residue[0],
  1592. ca = residue[1]))
  1593. cif_edits[pdb_code] = block
  1594. # write modified pir to disk
  1595. for pdb_code in cif_edits:
  1596. out = open(os.path.join(output_path, pdb_code + '.cif'), 'w')
  1597. writer = PdbxWriter(out)
  1598. writer.writeContainer(cif_edits[pdb_code])
  1599. # Delete missing entries from the last template sequence to the first
  1600. for row in reversed(del_row):
  1601. template_grid.del_row(row)
  1602. return template_grid
  1603. def remove_self_alignment(template_grid, query_name):
  1604. """ Removes a self alignment from the final pir alignment to prevent clashes with MODELLER """
  1605. to_delete = list()
  1606. for row in range(template_grid.get_grid_height()):
  1607. if template_grid._pdb_code[row] == query_name:
  1608. to_delete.append(row)
  1609. for row in reversed(to_delete):
  1610. template_grid.del_row(row)
  1611. return True
  1612. def write_to_file(line_list, fname):
  1613. """ Writes the final pir file """
  1614. with open(fname, 'w+') as fout:
  1615. for line in line_list:
  1616. fout.write(line + "\n")
  1617. def arg():
  1618. import argparse
  1619. description = """Creates a MODELLER alignment (*.pir) from a HHSearch results file (*.hhr)."""
  1620. epilog= '2016 Harald Voehringer.'
  1621. # Initiate a ArgumentParser Class
  1622. parser = argparse.ArgumentParser(description = description, epilog = epilog)
  1623. # Call add_options to the parser
  1624. parser.add_argument('input', help = 'results file from HHsearch with hit list and alignment', metavar = 'FILE')
  1625. parser.add_argument('cifs', help = 'path to the folder containing cif files', metavar = 'DIR')
  1626. parser.add_argument('pir', help = 'output file (PIR-formatted multiple alignment)', metavar = 'FILE')
  1627. parser.add_argument('output', help = 'path to the folder where modified cif files should be written to', metavar = 'DIR')
  1628. parser.add_argument('-v', '--verbose', action = 'store_true', help = 'verbose mode')
  1629. parser.add_argument('-m', nargs = '+', help = 'pick hits with specified indices (e.g. -m 2 5)', metavar = 'INT')
  1630. parser.add_argument('-e', type = float, help = 'maximum E-Value threshold (e.g. -e 0.001)', metavar = 'FLOAT')
  1631. parser.add_argument('-r', type = float, help = 'residue ratio (filter alignments that have contribute at least residues according to the specified ratio).',
  1632. default = 0, metavar = 'FLOAT')
  1633. parser.add_argument('-c', help = 'convert non-canonical residues (default = True)', action = 'store_true', default = True)
  1634. return parser
  1635. def main():
  1636. import sys
  1637. parser = arg()
  1638. args = parser.parse_args(sys.argv[1:])
  1639. global DEBUG_MODE
  1640. if args.verbose:
  1641. DEBUG_MODE = True
  1642. query_name, query_chain = get_query_name(args.input)
  1643. data = read_result(args.input)
  1644. selected_templates = list()
  1645. if args.m and not args.e:
  1646. selection = map(lambda x: int(x), args.m)
  1647. print ('Selected templates {st}.'.format(st = ', '.join(args.m)))
  1648. for i in selection:
  1649. tmp_info = str(data[i - 1].template_info.split('>')[1])
  1650. print ('{i}: {t}'.format(
  1651. i = i,
  1652. t = tmp_info[0:80]))
  1653. selected_templates.append(data[i - 1])
  1654. elif args.e and not args.m:
  1655. print ('Selected templates satisfying E-val <= {e}'.format(e = args.e))
  1656. e_values = { float(j.evalue):i for i, j in enumerate(data) }
  1657. selection = sorted([ val for key, val in e_values.items() if key <= args.e ])
  1658. for i in selection:
  1659. tmp_info = str(data[i - 1].template_info.split('>')[1])
  1660. print ('{i}: {t}'.format(
  1661. i = i + 1,
  1662. t = tmp_info[0:80]))
  1663. selected_templates.append(data[i - 1])
  1664. elif args.m and args.e:
  1665. print ('! Please do not use option -m and -e at the same time ! Exiting.')
  1666. sys.exit()
  1667. else:
  1668. selected_templates = data
  1669. print ('Creating pir file using all templates ({n})'.format(
  1670. n = len(selected_templates)))
  1671. query_grid = create_query_grid(selected_templates) # load query grid
  1672. print ('query_grid')
  1673. print(query_grid)
  1674. gapless_query_grid = create_gapless_grid(query_grid) # remove gaps
  1675. print ('gapless_query_grid')
  1676. print(gapless_query_grid)
  1677. processed_query_grid = process_query_grid(query_grid, gapless_query_grid) # insert gaps
  1678. ##processed_query_grid = process_query_grid(query_grid, query_grid) # insert gaps
  1679. print ('processed_query_grid')
  1680. print (processed_query_grid)
  1681. glob_seq = derive_global_seq(processed_query_grid, query_name, query_chain) # derive query sequence
  1682. template_grid = create_template_grid(selected_templates) # create template grid
  1683. print ('template_grid')
  1684. print (template_grid)
  1685. processed_template_grid = process_template_grid(query_grid, template_grid) # insert gaps to template sequnces
  1686. print ('processed_query_grid')
  1687. print (processed_query_grid)
  1688. print ('hzhu processed_template_grid')
  1689. print (processed_template_grid)
  1690. final_grid = compare_with_cifs(processed_template_grid, args.cifs, args.output, args.c, args.r) # compare with atom section of cifs
  1691. remove_self_alignment(final_grid, query_name) # remove self alignment if any
  1692. write_to_file([glob_seq, final_grid.display()], args.pir)
  1693. if __name__ == "__main__":
  1694. main()