Source code for pyrosetta_help.common_ops.utils

__all__ = ['pose_from_file',
           'pose2pandas',
           'make_blank_pose',
           'add_bfactor_from_score',
           'get_last_res_in_chain',
           'clarify_selector',
           'count_ligands',
           'correct_numbering',
           'get_pdbstr',
           'pose_range',
           'fix_offset',
           'assign_chain_letter',
           'what_is_chain']

from collections import Counter
from typing import (Optional, Tuple, Union, Iterable, List)
from Bio.Align import PairwiseAligner, PairwiseAlignments, Alignment
from IPython.display import display, HTML
import string
import numpy as np
import pandas as pd
import pyrosetta
import functools
from ..threading import rangify


[docs]def pose_from_file(pdb_filename: str,
                   params_filenames: Optional[Union[pyrosetta.rosetta.utility.vector1_string, List[str]]] = None) \
        -> pyrosetta.Pose:
    """
    Return a pose like pose_from_file but with params.

    :param pdb_filename:
    :param params_filenames:
    :return:
    """
    pose = make_blank_pose(params_filenames)
    pyrosetta.rosetta.core.import_pose.pose_from_file(pose, pdb_filename)
    return pose


vector1_string = pyrosetta.rosetta.utility.vector1_string


[docs]def make_blank_pose(params_filenames: Optional[Union[vector1_string, List[str]]] = None) \
        -> pyrosetta.Pose:
    """
    Returns an empty pose, but with params from files.

    :param params_filenames:
    :return:
    """
    pose = pyrosetta.Pose()
    if params_filenames and isinstance(params_filenames, vector1_string):
        pyrosetta.generate_nonstandard_residue_set(pose, params_filenames)
    if params_filenames and isinstance(params_filenames, list):
        params_filenames2 = vector1_string()
        params_filenames2.extend([f for f in params_filenames if f])
        pyrosetta.generate_nonstandard_residue_set(pose, params_filenames2)
    elif params_filenames:
        raise TypeError(f'Unexpected type {type(params_filenames)}')
    else:
        pass
    return pose


[docs]def pose2pandas(pose: pyrosetta.Pose, scorefxn: pyrosetta.ScoreFunction) -> pd.DataFrame:
    """
    Return a pandas dataframe from the scores of the pose

    :param pose:
    :return:
    """
    # monkey patching to make old pyrosetta code work
    if not hasattr(np, 'alltrue'):
        np.alltrue = np.all
    pose.energies().clear_energies()
    scorefxn.weights() # neccessary?
    emopts = pyrosetta.rosetta.core.scoring.methods.EnergyMethodOptions(scorefxn.energy_method_options())
    emopts.hbond_options().decompose_bb_hb_into_pair_energies(True)
    scorefxn.set_energy_method_options(emopts)
    scorefxn(pose)
    scores = pd.DataFrame(pose.energies().residue_total_energies_array())
    pi = pose.pdb_info()
    scores['residue'] = scores.index.to_series() \
        .apply(lambda r: pose.residue(r + 1) \
               .name1() + pi.pose2pdb(r + 1).strip().replace(' ', ':')
               )
    return scores


[docs]def add_bfactor_from_score(pose: pyrosetta.Pose):
    """
    Adds the bfactors from total_score.
    Snippet for testing in Jupyter

    >>> import nglview as nv
    >>> view = nv.show_rosetta(pose)
    >>> # view = nv.show_file('test.cif')
    >>> view.clear_representations()
    >>> view.add_tube(radiusType="bfactor", color="bfactor", radiusScale=0.10, colorScale="RdYlBu")
    >>> view

    ``replace_res_remap_bfactors`` may have been a cleaner strategy. This was quicker to write.

    If this fails, it may be because the pose was not scored first.
    """
    if pose.pdb_info().obsolete():
        raise ValueError('Pose pdb_info is flagged as obsolete (change `pose.pdb_info().obsolete(False)`)')
    # scores
    energies = pose.energies()

    def get_res_score(res):
        total_score = pyrosetta.rosetta.core.scoring.ScoreType.total_score
        # if pose.residue(res).is_polymer()
        try:
            return energies.residue_total_energies(res)[total_score]
        except:
            return float('nan')

    # the array goes from zero (nan) to n_residues
    total_scores = np.array([float('nan')] + [get_res_score(res) for res in range(1, pose.total_residue() + 1)])
    mask = np.isnan(total_scores)
    total_scores -= np.nanmin(total_scores)
    total_scores *= 100 / np.nanmax(total_scores)
    total_scores = np.nan_to_num(total_scores, nan=100)
    total_scores[mask] = 0.
    # add to pose
    pdb_info = pose.pdb_info()
    for res in range(pose.total_residue()):
        for i in range(pose.residue(res + 1).natoms()):
            pdb_info.bfactor(res + 1, i + 1, total_scores[res + 1])


[docs]def get_last_res_in_chain(pose, chain='A') -> int:
    """
    Returns last residue index in a chain. THere is probably a mover that does this.

    :param pose:
    :param chain: letter or number
    :return:
    """
    cv = pyrosetta.rosetta.core.select.residue_selector.ChainSelector(chain).apply(pose)
    rv = pyrosetta.rosetta.core.select.residue_selector.ResidueVector(cv)
    return max(rv)


[docs]def clarify_selector(selector: pyrosetta.rosetta.core.select.residue_selector.ResidueSelector,
                     pose: pyrosetta.Pose) -> List['str']:
    """
    Given a selector and pose return a list of residues in NGL selection format
    Example, [CMP]787:H

    :param selector:
    :param pose:
    :return: list of residues in NGL selection format
    """
    pose2pdb = pose.pdb_info().pose2pdb
    vector = selector.apply(pose)
    rv = pyrosetta.rosetta.core.select.residue_selector.ResidueVector(vector)
    return [f'[{pose.residue(r).name3()}]{pose2pdb(r).strip().replace(" ", ":")}' for r in rv]

[docs]def count_ligands(pose: pyrosetta.Pose) -> List[Tuple[str, int]]:
    LIGAND = pyrosetta.rosetta.core.chemical.ResidueProperty.LIGAND
    pr_rs = pyrosetta.rosetta.core.select.residue_selector
    lig_sele = pr_rs.ResiduePropertySelector(LIGAND)
    count = Counter([pose.residue(i).name3() for i in pr_rs.ResidueVector(lig_sele.apply(pose))])
    return count.most_common()

[docs]def correct_numbering(pose, chain_references:Optional[Union[None, str]]=None, allow_insertions:bool=False):
    """
    A fresh PDBInfo has the PDB residue number the same as the pose one
    as opposed to restarting per chain this fixes it.

    Additionaly, if the list ``chain_references`` is provided, any element as None is ignored,
    but any element with a sequence (str) where the first char is pos 1, will be used
    for numbering by calling ``fix_offset``.

    :param pose:
    :return:
    """
    pdb_info = pose.pdb_info()
    if pdb_info is None:
        pdb_mover = pyrosetta.rosetta.protocols.simple_moves.AddPDBInfoMover()
        pdb_mover.apply(pose)
        pdb_info = pose.pdb_info()
    current_chain = 'A'
    r = 1
    for i in range(1, pose.total_residue() +1):
        if pdb_info.chain(i) != current_chain:
            current_chain = pdb_info.chain(i)
            r = 1
            pdb_info.number(i, 1)
        else:
            pdb_info.number(i, r)
            r += 1
    if not chain_references:
        return
    for i, ref_sequence in enumerate(chain_references):
        chain_i = i + 1
        if ref_sequence is None:
            continue
        assert isinstance(ref_sequence, str)
        fix_offset(pose, chain_i, ref_sequence, allow_insertions)


[docs]def fix_offset(pose, chain_i: int, ref_sequence: str, allow_insertions:bool=False) -> HTML:
    """
    Fix the pose numbering on ``chain_i`` based on ``ref_sequence``
    No gaps accepted in ref_sequence if ``allow_insertions`` is False, so pad with Xs appropriately if only termini.
    If ``allow_insertions`` is True, insertion codes are used.
    """
    pdb_info = pose.pdb_info()
    assert not pdb_info.obsolete(), "pdb info is marked as obsolete"
    # ## Align
    aligner = PairwiseAligner()
    # Set the scoring
    aligner.match_score = 1
    aligner.mismatch_score = -2
    aligner.open_gap_score = -1
    aligner.extend_gap_score = -0.5
    # Perform the alignment
    alignments: PairwiseAlignments = aligner.align(ref_sequence, pose.chain_sequence(chain_i),)
    # Assuming you want the best alignment
    best_alignment: Alignment = alignments[0]
    # this is relative to alignment and not unaligned seqA, so may have gaps if insertions are allowed:
    # Creating the residue map
    resi_map: List[int] = [i + 1 for i, l in enumerate(best_alignment.aligned[1]) if best_alignment.target[l] != '-']
    # Check if the ref sequence aligns without gaps
    if '-' not in best_alignment.target:
        pass
    elif not allow_insertions:
            raise ValueError('There is an insertion in the pose, which is not allowed by attribute `allow_insertions` == False')
    else:
        gaps = [i + 1 for i, l in enumerate(best_alignment.target) if l == '-']
        for start, end in rangify(gaps):
            resi_map = resi_map[:start] + [(start - 1, letter) for letter in string.ascii_uppercase[:end + 1 - start]] + resi_map[start:]

    pdb_info = pose.pdb_info()
    chain_resi_offset = -1
    for resi in pose_range(pose):
        residue = pose.residue(resi)
        if residue.chain() != chain_i:
            continue
        if chain_resi_offset == -1:
            chain_resi_offset = resi - 1
        neoresi = resi_map[resi - chain_resi_offset - 1]
        if isinstance(neoresi, int):
            pdb_info.number(resi, neoresi)
        else: # isinstance(neoresi, tuple)
            neoresi, icode = neoresi
            pdb_info.number(resi, neoresi)
            pdb_info.icode(resi, icode)
    # ## Output
    a, b = best_alignment.target, best_alignment.query
    gap_line = ''.join('|' if a[i] == b[i] else ' ' for i in range(len(a)))
    formatted_gap = ''.join(['.' if c == '|' else '*' for c in gap_line])

    # Create the formatted alignment string
    # formatted = pairwise2.format_alignment(*alignment)
    formatted = f"{a}\n{formatted_gap}\n{b}\nScore={best_alignment.score}"
    a, gap, b, score = formatted.strip().split('\n')
    gap = ''.join(['.' if c == '|' else '*' for c in gap])
    return HTML(f'<div style="font-family:monospace; display: inline-block; white-space: nowrap;">' +
                f'{a}<br/>{gap.replace(" ", "*")}<br/>{b}<br/>{score}</div>')

[docs]def assign_chain_letter(pose, chain_i: int, new_letter: str):
    """
    Assign a new letter to chain number ``chain_i``
    """
    pdb_info = pose.pdb_info()
    assert not pdb_info.obsolete(), "pdb info is marked as obsolete"
    for resi in ph.pose_range(pose):
        residue = pose.residue(resi)
        if residue.chain() != chain_i:
            continue
        pdb_info.chain(resi, new_letter)

[docs]def get_pdbstr(pose):
    buffer = pyrosetta.rosetta.std.stringbuf()
    pose.dump_pdb(pyrosetta.rosetta.std.ostream(buffer))
    return buffer.str()


[docs]def pose_range(pose: pyrosetta.Pose, protein_only=True) -> Iterable:
    """
    range but for the pose...
    For now naive of chains, non-amino acid residues etc.
    """
    all_iter = range(1, pose.total_residue()+1)
    if protein_only:
        return filter(lambda i: pose.residue(i).is_protein(), all_iter)
    return all_iter


[docs]def what_is_chain(pose: pyrosetta.Pose, chain: Union[str, int]):
    """
    Given a pose and a chain integer or character,
    return the character or integer of that chain

    :param pose:
    :param chain: str --> pdb_info based. int --> pyrosetta.Pose based
    :return:
    """
    pdb_info = pose.pdb_info()
    if isinstance(chain, int):
        for resi in ph.pose_range(pose):
            residue = pose.residue(resi)
            if residue.chain() != chain:
                continue
            return pdb_info.chain(resi)
    elif isinstance(chain, str):
        for resi in ph.pose_range(pose):
            if pdb_info.chain(resi) != chain:
                continue
            residue = pose.residue(resi)
            return residue.chain()
    else:
        raise TypeError(f'Chain can be int or str not {type(chain)}')
    raise ValueError(f'No chain {chain}')