Source code for pyrosetta_help.ligands.hunter

from collections import Counter
from io import StringIO
from typing import (Tuple, Dict, List)

import pandas as pd
import requests
from Bio import SearchIO
from Bio.Blast.NCBIWWW import qblast


[docs]class LigandHunter: """ Given a sequence find homologues (Blast) and find if they have ligands (PDBe API). The definition of what ligand is a cofactor comes from PDBe and does not count ions or triphospho-nucleotides as cofactors, but does count NADH adducts. * ``.data`` is a list of dictionaries. * ``.to_dataframe()`` converts into a pandas dataframe for further analysis. * ``.candidate_ligands`` list of ligand residue 3-letter codes """ _cofactor_reference = {} @property def cofactor_reference(self) -> Dict[str, List[dict]]: if not self._cofactor_reference: #cached self._cofactor_reference = requests.get('https://www.ebi.ac.uk/pdbe/api/pdb/compound/cofactors').json() return self._cofactor_reference missing_codes = ['ATP', 'GTP', 'CA', 'MG', 'W'] @property def cofactor_codes(self): _grouped_cofactor_codes = {name: value[0]['cofactors'] for name, value in self.cofactor_reference.items()} return [code for codes in _grouped_cofactor_codes.values() for code in codes] + self.missing_codes
[docs] def __init__(self, sequence: str): """ :param sequence: is assumed clean protein sequence """ self.sequence = sequence self._blast_result = self._retrieve_homologues() #:StringIO self.data = self._parse_blast_result() self._retrieve_ligands() # inplace. self._ligand_data = None
[docs] def to_dataframe(self): """ Converts ``.data`` to a pandas dataframe """ return pd.DataFrame(self.data).transpose()
@property def candidate_ligands(self) -> List[str]: return list(set([code for datum in self.data.values() for code in datum['ligand_codes']])) @property # cached the old way def ligand_data(self): """ Data for the ligands. Note that LigandNicker has a get smiles method, that works like this, but is unrelated. """ if self._ligand_data is None: self._ligand_data = requests.post('https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/', data=','.join(self.candidate_ligands)).json() return self._ligand_data
[docs] def get_most_common_ligands(self) -> List[Tuple[str, int]]: # Uses collections.Counter not typing.Counter: c = Counter([code for datum in self.data.values() for code in datum['ligand_codes']]) # noqa return c.most_common()
[docs] def get_pdb_entry_by_ligand(self, ligand_code: str) -> dict: """ get pdb **entry** by ligand. Returns the first, which should be lowest e-value """ for datum in self.data.values(): if ligand_code in datum['ligand_codes']: return datum else: raise ValueError(f'{ligand_code} not found in any of the {len(self.data)} hits')
# ------------ initialisation methods ------------------------ def _retrieve_homologues(self) -> StringIO: return qblast('blastp', 'pdb', self.sequence) def _parse_blast_result(self) -> Dict[str, dict]: results = {} self._blast_result.seek(0) for query in SearchIO.parse(self._blast_result, 'blast-xml'): for hit in query: datum = dict(accession=hit.accession, description=hit.description, evalue=hit.hsps[0].evalue) datum['pdb_code'], datum['chain'] = hit.accession.split('_') results[datum['pdb_code'].upper()] = datum return results def _retrieve_ligands(self) -> None: query = ','.join(self.data.keys()) reply = requests.post(url='https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/', data=query) for code, datum in reply.json().items(): entry = self.data[code.upper()] entry['ligands'] = datum entry['ligand_codes'] = [inner['chem_comp_id'] for inner in datum] entry['cofactor_codes'] = [code for code in entry['ligand_codes'] if code in self.cofactor_codes] entry['has_cofactor'] = bool(entry['cofactor_codes'])
# TODO find the part of the code that is somehow truncated between commits!!