from collections import Counter
from io import StringIO
from typing import (Tuple, Dict, List)
import pandas as pd
import requests
from Bio import SearchIO
from Bio.Blast.NCBIWWW import qblast
[docs]class LigandHunter:
"""
Given a sequence find homologues (Blast) and find if they have ligands (PDBe API).
The definition of what ligand is a cofactor comes from PDBe and does not count ions or
triphospho-nucleotides as cofactors, but does count NADH adducts.
* ``.data`` is a list of dictionaries.
* ``.to_dataframe()`` converts into a pandas dataframe for further analysis.
* ``.candidate_ligands`` list of ligand residue 3-letter codes
"""
_cofactor_reference = {}
@property
def cofactor_reference(self) -> Dict[str, List[dict]]:
if not self._cofactor_reference: #cached
self._cofactor_reference = requests.get('https://www.ebi.ac.uk/pdbe/api/pdb/compound/cofactors').json()
return self._cofactor_reference
missing_codes = ['ATP', 'GTP', 'CA', 'MG', 'W']
@property
def cofactor_codes(self):
_grouped_cofactor_codes = {name: value[0]['cofactors'] for name, value in self.cofactor_reference.items()}
return [code for codes in _grouped_cofactor_codes.values() for code in codes] + self.missing_codes
[docs] def __init__(self, sequence: str):
"""
:param sequence: is assumed clean protein sequence
"""
self.sequence = sequence
self._blast_result = self._retrieve_homologues() #:StringIO
self.data = self._parse_blast_result()
self._retrieve_ligands() # inplace.
self._ligand_data = None
[docs] def to_dataframe(self):
"""
Converts ``.data`` to a pandas dataframe
"""
return pd.DataFrame(self.data).transpose()
@property
def candidate_ligands(self) -> List[str]:
return list(set([code for datum in self.data.values() for code in datum['ligand_codes']]))
@property # cached the old way
def ligand_data(self):
"""
Data for the ligands.
Note that LigandNicker has a get smiles method, that works like this, but is unrelated.
"""
if self._ligand_data is None:
self._ligand_data = requests.post('https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/',
data=','.join(self.candidate_ligands)).json()
return self._ligand_data
[docs] def get_most_common_ligands(self) -> List[Tuple[str, int]]:
# Uses collections.Counter not typing.Counter:
c = Counter([code for datum in self.data.values() for code in datum['ligand_codes']]) # noqa
return c.most_common()
[docs] def get_pdb_entry_by_ligand(self, ligand_code: str) -> dict:
"""
get pdb **entry** by ligand.
Returns the first, which should be lowest e-value
"""
for datum in self.data.values():
if ligand_code in datum['ligand_codes']:
return datum
else:
raise ValueError(f'{ligand_code} not found in any of the {len(self.data)} hits')
# ------------ initialisation methods ------------------------
def _retrieve_homologues(self) -> StringIO:
return qblast('blastp', 'pdb', self.sequence)
def _parse_blast_result(self) -> Dict[str, dict]:
results = {}
self._blast_result.seek(0)
for query in SearchIO.parse(self._blast_result, 'blast-xml'):
for hit in query:
datum = dict(accession=hit.accession,
description=hit.description,
evalue=hit.hsps[0].evalue)
datum['pdb_code'], datum['chain'] = hit.accession.split('_')
results[datum['pdb_code'].upper()] = datum
return results
def _retrieve_ligands(self) -> None:
query = ','.join(self.data.keys())
reply = requests.post(url='https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/',
data=query)
for code, datum in reply.json().items():
entry = self.data[code.upper()]
entry['ligands'] = datum
entry['ligand_codes'] = [inner['chem_comp_id'] for inner in datum]
entry['cofactor_codes'] = [code for code in entry['ligand_codes'] if code in self.cofactor_codes]
entry['has_cofactor'] = bool(entry['cofactor_codes'])
# TODO find the part of the code that is somehow truncated between commits!!