Source code for isopeptor.isopeptide

#! /usr/env/python
# -*- coding: utf-8 -*-

import re
from typing import List
import warnings
import numpy as np
import os
from pathlib import Path
from isopeptor.jess_wrapper import run_jess
from isopeptor.asa import get_structure_asa
from isopeptor.bond import BondElement 
from isopeptor.constants import MAX_ASA
from isopeptor.constants import BOND_TYPE
from isopeptor.logistic_regression import predict
from isopeptor.structure import get_structure
from isopeptor.bond_length import get_bond_stats
from isopeptor.dihedrals import get_dihedral_angles_stats

[docs] class Isopeptide: """ Handles isopeptide bond prediction running pyjess via the jess_wrapper module and solvent accessible area via the asa module. Stores isopeptide bond predictions as a list of BondElement. Prediction is run for all structures from struct_dir. Structures in .pdb format are directly analysed. If structures are in .cif format, they are first converted into .pdb. If multiple matches with an isopeptide bond signature are detected, only the one with the lowest RMSD is retained. Attributes: struct_dir (str): where pdb/cif files are located distance (float): that specifies permissivity of jess search jess_output (None | str): that stores jess raw output for debug purposes isopeptide_bonds (list): stores isopeptide bonds as BondElement elements fixed_r_asa (float | None): which ranges between 0 and 1 that fixes r_asa allowing to skip its calculation Example: >>> # Use a fixed solvent accessible area for a quick prediction: >>> from isopeptor.isopeptide import Isopeptide >>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1) >>> i.predict() >>> i.isopeptide_bonds[0] BondElement(struct_file=tests/data/test_structures/8beg.pdb, protein_name=8beg, rmsd=0.0, template=8beg_A_590_636_729, chain=A, r1_bond=590, r_cat=636, r2_bond=729, r1_bond_name=LYS, r_cat_name=ASP, r2_bond_name=ASN, bond_type=CnaA-like, r_asa=0.1, probability=0.991) >>> # Calculate solvent accessible area for a more accurate (and slow) prediction: >>> i = Isopeptide("tests/data/test_structures", distance=1.5) >>> i.predict() """ def __init__(self, struct_dir: str, distance: float = 3, fixed_r_asa: float | None = None): """ Raises ValueError if fixed_r_asa not between 0 and 1 """ self.struct_dir: str = struct_dir self.distance: float = distance self.jess_hits: list | None = None self.isopeptide_bonds: List[BondElement] = [] self.fixed_r_asa: float | None = fixed_r_asa if self.fixed_r_asa != None: if self.fixed_r_asa < 0 or self.fixed_r_asa > 1: raise ValueError(f"fixed_r_asa is not in 0-1 range. Found: {self.fixed_r_asa}") pdb_files = [str(p) for p in Path(self.struct_dir).glob("*.pdb")] cif_files = [str(p) for p in Path(self.struct_dir).glob("*.cif")] self.structure_files = pdb_files + cif_files self.geometry_calculated = False
[docs] def predict(self): """ Predict isopeptide bonds by 1. running jess template-based search, 2. calculating asa, 3. predicting isopeptide bond probability with logistic regression. """ self.jess_hits = run_jess(self.structure_files, self.distance) if len(self.jess_hits) == 0: warnings.warn("No isopeptide bond predictions detected. Try increasing the distance parameter.", UserWarning) return self._load_hits() self._reduce_redundant() if self.fixed_r_asa != None: for bond in self.isopeptide_bonds: bond.r_asa = self.fixed_r_asa else: self._calc_rasa() # Make prediction with linear regression self._infer() # Sort based on probability self.isopeptide_bonds.sort(key=lambda x: (x.probability, x.protein_name), reverse=True) # Infer type of bond self._infer_type()
[docs] def print_tabular(self): """ Print isopeptide bonds in a tabular format Example: >>> from isopeptor.isopeptide import Isopeptide >>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1) >>> i.predict() >>> i.print_tabular() protein_name probability chain r1_bond r_cat r2_bond r1_bond_name r_cat_name r2_bond_name bond_type rmsd r_asa template 8beg 0.991 A 590 636 729 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_590_636_729 8beg 0.991 A 756 806 894 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_756_806_894 8beg 0.991 A 922 973 1049 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_922_973_1049 8beg 0.991 A 1076 1123 1211 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_1076_1123_1211 5dz9 0.991 A 556 606 703 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 5dz9 0.991 A 730 776 861 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 4z1p 0.991 A 3 53 150 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 4z1p 0.991 A 177 223 308 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 7woi 0.909 B 57 158 195 LYS GLU ASN CnaB-like 0.314 0.1 5j4m_A_47_139_172 1amx 0.882 A 176 209 293 LYS ASP ASN CnaA-like 0.353 0.1 2f68_X_176_209_293 7woi 0.875 A 203 246 318 LYS ASP ASN CnaA-like 0.363 0.1 4hss_A_187_224_299 7woi 0.838 B 355 435 466 LYS GLU ASN CnaB-like 0.403 0.1 8f70_A_299_386_437 6to1_af 0.607 A 13 334 420 LYS ASP ASN CnaA-like 0.565 0.1 5mkc_A_191_600_695 """ if not self.geometry_calculated: headers = [ "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond", "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type", "rmsd", "r_asa", "template" ] # Print the header row using formatted widths column_widths = [] for header in headers: # Find the maximum length among the bond attributes for the current header max_bond_length = 0 for bond in self.isopeptide_bonds: bond_value_length = len(str(getattr(bond, header))) if bond_value_length > max_bond_length: max_bond_length = bond_value_length # Compare it with the length of the header and append the maximum to column_widths column_widths.append(max(len(header), max_bond_length)) formatted_header = "\t".join(f"{header:<{column_widths[i]}}" for i, header in enumerate(headers)) print(formatted_header) if len(self.isopeptide_bonds) > 0: column_widths = [max(len(header), max(len(str(getattr(bond, header))) for bond in self.isopeptide_bonds)) for header in headers] for bond in self.isopeptide_bonds: row = [ bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond, bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type, bond.rmsd, bond.r_asa, bond.template ] formatted_row = "\t".join(f"{str(item):<{column_widths[i]}}" for i, item in enumerate(row)) print(formatted_row) else: headers = [ "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond", "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type", "rmsd", "r_asa", "template", "bond_length", "bond_length_zscore", "bond_length_allowed", "pseudo_phi", "pseudo_psi", "pseudo_omega", "phi_psi_likelihood", "phi_psi_allowed", "omega_psi_likelihood", "omega_psi_allowed", "omega_phi_likelihood", "omega_phi_allowed" ] # Print the header row using formatted widths column_widths = [] for header in headers: # Find the maximum length among the bond attributes for the current header max_bond_length = 0 for bond in self.isopeptide_bonds: bond_value_length = len(str(getattr(bond, header))) if bond_value_length > max_bond_length: max_bond_length = bond_value_length # Compare it with the length of the header and append the maximum to column_widths column_widths.append(max(len(header), max_bond_length)) formatted_header = "\t".join(f"{header:<{column_widths[i]}}" for i, header in enumerate(headers)) print(formatted_header) if len(self.isopeptide_bonds) > 0: # Print each data row using the same formatting for bond in self.isopeptide_bonds: row = [ bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond, bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type, bond.rmsd, bond.r_asa, bond.template, bond.bond_length, bond.bond_length_zscore, bond.bond_length_allowed, bond.pseudo_phi, bond.pseudo_psi, bond.pseudo_omega, bond.phi_psi_likelihood, bond.phi_psi_allowed, bond.omega_psi_likelihood, bond.omega_psi_allowed, bond.omega_phi_likelihood, bond.omega_phi_allowed ] formatted_row = "\t".join(f"{str(item):<{column_widths[i]}}" for i, item in enumerate(row)) print(formatted_row)
[docs] def save_csv(self, output_table: str = "results.csv") -> None: """ Save isopeptide bond results in .csv format. Args: output_table (str): path to output table Example: >>> from isopeptor.isopeptide import Isopeptide >>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1) >>> i.predict() >>> i.save_csv() >>> # Or with geometry evaluation >>> i.get_geometry() >>> i.save_csv() """ with open(output_table, "wt") as fh: if not self.geometry_calculated: headers = [ "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond", "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type", "rmsd", "r_asa", "template" ] fh.write(",".join(headers)+"\n") if len(self.isopeptide_bonds) > 0: for bond in self.isopeptide_bonds: row = [ bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond, bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type, bond.rmsd, bond.r_asa, bond.template ] row = [str(i) for i in row] formatted_row = ",".join(row) fh.write(formatted_row+"\n") else: headers = [ "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond", "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type", "rmsd", "r_asa", "template", "bond_length", "bond_length_zscore", "bond_length_allowed", "pseudo_phi", "pseudo_psi", "pseudo_omega", "phi_psi_likelihood", "phi_psi_allowed", "omega_psi_likelihood", "omega_psi_allowed", "omega_phi_likelihood", "omega_phi_allowed" ] if len(self.isopeptide_bonds) > 0: formatted_header = ",".join(headers) fh.write(formatted_header+"\n") for bond in self.isopeptide_bonds: row = [ bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond, bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type, bond.rmsd, bond.r_asa, bond.template, bond.bond_length, bond.bond_length_zscore, bond.bond_length_allowed, bond.pseudo_phi, bond.pseudo_psi, bond.pseudo_omega, bond.phi_psi_likelihood, bond.phi_psi_allowed, bond.omega_psi_likelihood, bond.omega_psi_allowed, bond.omega_phi_likelihood, bond.omega_phi_allowed ] row = [str(i) for i in row] formatted_row = ",".join(row) fh.write(formatted_row+"\n")
[docs] def get_geometry(self): """ Get geometry measures (bond length and dihedral angles and map to existing distribution of measures from the database of PDB derived structures). Example: >>> from isopeptor.isopeptide import Isopeptide >>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1) >>> i.predict() >>> i.get_geometry() >>> i.print_tabular() protein_name probability chain r1_bond r_cat r2_bond r1_bond_name r_cat_name r2_bond_name bond_type rmsd r_asa template bond_length bond_length_zscore bond_length_allowed pseudo_phi pseudo_psi pseudo_omega phi_psi_likelihood phi_psi_allowed omega_psi_likelihood omega_psi_allowed omega_phi_likelihood omega_phi_allowed 8beg 0.991 A 590 636 729 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_590_636_729 1.33 0.024 True 92.305 -123.286 -0.032 -8.706 True -7.801 True -8.797 True 8beg 0.991 A 756 806 894 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_756_806_894 1.328 0.0 True -114.423 -130.136 44.142 -10.503 False -10.472 True -10.503 False 8beg 0.991 A 922 973 1049 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_922_973_1049 1.333 0.061 True 70.944 -131.258 17.005 -9.737 True -8.997 True -9.882 True 8beg 0.991 A 1076 1123 1211 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_1076_1123_1211 1.341 0.159 True 102.317 -124.836 2.258 -8.286 True -7.882 True -8.233 True 5dz9 0.991 A 556 606 703 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 1.291 -0.451 True 109.407 -112.845 -8.356 -7.72 True -7.722 True -7.884 True 5dz9 0.991 A 730 776 861 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 1.332 0.049 True 97.334 -122.501 6.356 -8.391 True -7.892 True -8.633 True 4z1p 0.991 A 3 53 150 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 1.291 -0.451 True 109.407 -112.845 -8.356 -7.72 True -7.722 True -7.884 True 4z1p 0.991 A 177 223 308 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 1.332 0.049 True 97.334 -122.501 6.356 -8.391 True -7.892 True -8.633 True 7woi 0.909 B 57 158 195 LYS GLU ASN CnaB-like 0.314 0.1 5j4m_A_47_139_172 1.312 -0.195 True -148.85 71.47 179.899 -9.376 True -9.126 True -9.675 True 1amx 0.882 A 176 209 293 LYS ASP ASN CnaA-like 0.353 0.1 2f68_X_176_209_293 3.345 24.598 False 96.887 -147.455 66.383 -10.918 False -14.477 False -25.835 False 7woi 0.875 A 203 246 318 LYS ASP ASN CnaA-like 0.363 0.1 4hss_A_187_224_299 1.336 0.098 True 136.992 -158.356 4.495 -14.872 False -12.529 False -9.337 True 7woi 0.838 B 355 435 466 LYS GLU ASN CnaB-like 0.403 0.1 8f70_A_299_386_437 1.36 0.39 True 71.106 149.083 171.017 -11.508 False -9.75 True -11.537 False 6to1_af 0.607 A 13 334 420 LYS ASP ASN CnaA-like 0.565 0.1 5mkc_A_191_600_695 2.928 19.512 False 83.132 -155.408 131.186 -13.496 False -51.581 False -15.775 False """ self.geometry_calculated = True bonds = self.isopeptide_bonds for struct_file in set([b.struct_file for b in bonds]): structure = get_structure(struct_file) for bond in [b for b in bonds if b.struct_file == struct_file]: # Get bond lenght and stats bond_length, bond_length_zscore, bond_length_allowed = get_bond_stats(structure, bond.chain, bond.r1_bond, bond.r2_bond, bond.r2_bond_name) # Get dihedrals and stats angle_stats = get_dihedral_angles_stats(structure, bond.chain, bond.r1_bond, bond.r2_bond, bond.r2_bond_name, bond.bond_type) pseudo_omega, pseudo_phi, pseudo_psi, phi_psi_likelihood, omega_psi_likelihood, omega_phi_likelihood, phi_psi_allowed, omega_psi_allowed, omega_phi_allowed = angle_stats # Load values into the bond instance bond.bond_length = bond_length bond.bond_length_zscore = bond_length_zscore bond.bond_length_allowed = bond_length_allowed bond.pseudo_omega = pseudo_omega bond.pseudo_phi = pseudo_phi bond.pseudo_psi = pseudo_psi bond.phi_psi_likelihood = phi_psi_likelihood bond.omega_psi_likelihood = omega_psi_likelihood bond.omega_phi_likelihood = omega_phi_likelihood bond.phi_psi_allowed = phi_psi_allowed bond.omega_psi_allowed = omega_psi_allowed bond.omega_phi_allowed = omega_phi_allowed
def _load_hits(self): """ Load pyjess._jess.Hit as list of bondElement in self.isopeptide_bonds. Raises: ValueError if number of residues found is not expected. """ for hit in self.jess_hits: template, rmsd, struct_file, atoms = hit.template.id, hit.rmsd, hit.molecule.id, hit.atoms() protein_name = os.path.basename(struct_file).replace(".pdb", "").replace(".cif", "") residues, residue_names = [], [] for atom in atoms: if atom.residue_number not in residues: residues.append(atom.residue_number) residue_names.append(atom.residue_name) chain = atom.chain_id if len(residues) == 3: self.isopeptide_bonds.append( BondElement( struct_file, protein_name, round(rmsd, 3), template, chain, residues[0], residues[1], residues[2], residue_names[0], residue_names[1], residue_names[2] ) ) else: raise ValueError(f"Found {len(residues)} residues.") def _reduce_redundant(self): """ Reduces redundant isopeptide bond predictions by keeping prediction with lower RMSD. """ bonds = self.isopeptide_bonds grouped_bonds = {} for bond in bonds: key = (bond.protein_name, bond.r1_bond, bond.r_cat, bond.r2_bond) if key not in grouped_bonds or bond.rmsd < grouped_bonds[key].rmsd: grouped_bonds[key] = bond # Keep only the best bond for each group self.isopeptide_bonds = list(grouped_bonds.values()) def _calc_rasa(self): """ Calculate r_asa for every isopeptide bond in every structure. """ bonds = self.isopeptide_bonds for struct_file in set([b.struct_file for b in bonds]): # Get asa and structure atom array pdb file structure_sasa, structure = get_structure_asa(struct_file) # Get asa of isopeptide residues for bond in [b for b in bonds if b.struct_file == struct_file]: isopep_residues = [bond.r1_bond, bond.r_cat, bond.r2_bond] isopep_residue_names = [bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name] tmp_r_asa = 0 for res_id, res_name in zip(isopep_residues, isopep_residue_names): # Handle the case of two res bb if res_name == None: continue res_indeces = [i for i, atom in enumerate(structure) if atom.res_id == res_id and atom.chain_id == bond.chain] #res_name = structure[res_indeces[0]].res_name if res_name not in MAX_ASA["rost_sander"].keys(): raise ValueError(f"{res_name} not in {MAX_ASA['rost_sander'].keys()}") # Normalise ASA by residue surface area r_asa = sum([structure_sasa[i] for i in res_indeces]) / MAX_ASA["rost_sander"][res_name] tmp_r_asa += r_asa bond.r_asa = round(tmp_r_asa / 3, 3) def _infer(self): """ Infer presence of isoeptide bond using logistic regression model. """ for bond in self.isopeptide_bonds: bond.probability = predict(bond.rmsd, bond.r_asa) def _infer_type(self): """ Infer isopeptide bond type (CnaA/B-like). """ for bond in self.isopeptide_bonds: bond.bond_type = BOND_TYPE.get(bond.template, None)
if __name__ == "__main__": import doctest doctest.testmod(report=True, optionflags=doctest.NORMALIZE_WHITESPACE)