#! /usr/env/python
# -*- coding: utf-8 -*-
import re
from typing import List
import warnings
import numpy as np
import os
from pathlib import Path
from isopeptor.jess_wrapper import run_jess
from isopeptor.asa import get_structure_asa
from isopeptor.bond import BondElement
from isopeptor.constants import MAX_ASA
from isopeptor.constants import BOND_TYPE
from isopeptor.logistic_regression import predict
from isopeptor.structure import get_structure
from isopeptor.bond_length import get_bond_stats
from isopeptor.dihedrals import get_dihedral_angles_stats
[docs]
class Isopeptide:
"""
Handles isopeptide bond prediction running pyjess via the jess_wrapper module
and solvent accessible area via the asa module. Stores isopeptide bond predictions
as a list of BondElement. Prediction is run for all structures from struct_dir.
Structures in .pdb format are directly analysed. If structures are in .cif format, they are first converted into .pdb.
If multiple matches with an isopeptide bond signature are detected, only the one
with the lowest RMSD is retained.
Attributes:
struct_dir (str): where pdb/cif files are located
distance (float): that specifies permissivity of jess search
jess_output (None | str): that stores jess raw output for debug purposes
isopeptide_bonds (list): stores isopeptide bonds as BondElement elements
fixed_r_asa (float | None): which ranges between 0 and 1 that fixes r_asa allowing to skip its calculation
Example:
>>> # Use a fixed solvent accessible area for a quick prediction:
>>> from isopeptor.isopeptide import Isopeptide
>>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1)
>>> i.predict()
>>> i.isopeptide_bonds[0]
BondElement(struct_file=tests/data/test_structures/8beg.pdb, protein_name=8beg, rmsd=0.0, template=8beg_A_590_636_729, chain=A, r1_bond=590, r_cat=636, r2_bond=729, r1_bond_name=LYS, r_cat_name=ASP, r2_bond_name=ASN, bond_type=CnaA-like, r_asa=0.1, probability=0.991)
>>> # Calculate solvent accessible area for a more accurate (and slow) prediction:
>>> i = Isopeptide("tests/data/test_structures", distance=1.5)
>>> i.predict()
"""
def __init__(self, struct_dir: str, distance: float = 3, fixed_r_asa: float | None = None):
"""
Raises
ValueError if fixed_r_asa not between 0 and 1
"""
self.struct_dir: str = struct_dir
self.distance: float = distance
self.jess_hits: list | None = None
self.isopeptide_bonds: List[BondElement] = []
self.fixed_r_asa: float | None = fixed_r_asa
if self.fixed_r_asa != None:
if self.fixed_r_asa < 0 or self.fixed_r_asa > 1:
raise ValueError(f"fixed_r_asa is not in 0-1 range. Found: {self.fixed_r_asa}")
pdb_files = [str(p) for p in Path(self.struct_dir).glob("*.pdb")]
cif_files = [str(p) for p in Path(self.struct_dir).glob("*.cif")]
self.structure_files = pdb_files + cif_files
self.geometry_calculated = False
[docs]
def predict(self):
"""
Predict isopeptide bonds by 1. running jess template-based search,
2. calculating asa, 3. predicting isopeptide bond probability with
logistic regression.
"""
self.jess_hits = run_jess(self.structure_files, self.distance)
if len(self.jess_hits) == 0:
warnings.warn("No isopeptide bond predictions detected. Try increasing the distance parameter.", UserWarning)
return
self._load_hits()
self._reduce_redundant()
if self.fixed_r_asa != None:
for bond in self.isopeptide_bonds:
bond.r_asa = self.fixed_r_asa
else:
self._calc_rasa()
# Make prediction with linear regression
self._infer()
# Sort based on probability
self.isopeptide_bonds.sort(key=lambda x: (x.probability, x.protein_name), reverse=True)
# Infer type of bond
self._infer_type()
[docs]
def print_tabular(self):
"""
Print isopeptide bonds in a tabular format
Example:
>>> from isopeptor.isopeptide import Isopeptide
>>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1)
>>> i.predict()
>>> i.print_tabular()
protein_name probability chain r1_bond r_cat r2_bond r1_bond_name r_cat_name r2_bond_name bond_type rmsd r_asa template
8beg 0.991 A 590 636 729 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_590_636_729
8beg 0.991 A 756 806 894 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_756_806_894
8beg 0.991 A 922 973 1049 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_922_973_1049
8beg 0.991 A 1076 1123 1211 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_1076_1123_1211
5dz9 0.991 A 556 606 703 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150
5dz9 0.991 A 730 776 861 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308
4z1p 0.991 A 3 53 150 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150
4z1p 0.991 A 177 223 308 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308
7woi 0.909 B 57 158 195 LYS GLU ASN CnaB-like 0.314 0.1 5j4m_A_47_139_172
1amx 0.882 A 176 209 293 LYS ASP ASN CnaA-like 0.353 0.1 2f68_X_176_209_293
7woi 0.875 A 203 246 318 LYS ASP ASN CnaA-like 0.363 0.1 4hss_A_187_224_299
7woi 0.838 B 355 435 466 LYS GLU ASN CnaB-like 0.403 0.1 8f70_A_299_386_437
6to1_af 0.607 A 13 334 420 LYS ASP ASN CnaA-like 0.565 0.1 5mkc_A_191_600_695
"""
if not self.geometry_calculated:
headers = [
"protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
"r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
"rmsd", "r_asa", "template"
]
# Print the header row using formatted widths
column_widths = []
for header in headers:
# Find the maximum length among the bond attributes for the current header
max_bond_length = 0
for bond in self.isopeptide_bonds:
bond_value_length = len(str(getattr(bond, header)))
if bond_value_length > max_bond_length:
max_bond_length = bond_value_length
# Compare it with the length of the header and append the maximum to column_widths
column_widths.append(max(len(header), max_bond_length))
formatted_header = "\t".join(f"{header:<{column_widths[i]}}" for i, header in enumerate(headers))
print(formatted_header)
if len(self.isopeptide_bonds) > 0:
column_widths = [max(len(header), max(len(str(getattr(bond, header))) for bond in self.isopeptide_bonds)) for header in headers]
for bond in self.isopeptide_bonds:
row = [
bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond,
bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
bond.rmsd, bond.r_asa, bond.template
]
formatted_row = "\t".join(f"{str(item):<{column_widths[i]}}" for i, item in enumerate(row))
print(formatted_row)
else:
headers = [
"protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
"r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
"rmsd", "r_asa", "template", "bond_length", "bond_length_zscore", "bond_length_allowed",
"pseudo_phi", "pseudo_psi", "pseudo_omega", "phi_psi_likelihood", "phi_psi_allowed", "omega_psi_likelihood",
"omega_psi_allowed", "omega_phi_likelihood", "omega_phi_allowed"
]
# Print the header row using formatted widths
column_widths = []
for header in headers:
# Find the maximum length among the bond attributes for the current header
max_bond_length = 0
for bond in self.isopeptide_bonds:
bond_value_length = len(str(getattr(bond, header)))
if bond_value_length > max_bond_length:
max_bond_length = bond_value_length
# Compare it with the length of the header and append the maximum to column_widths
column_widths.append(max(len(header), max_bond_length))
formatted_header = "\t".join(f"{header:<{column_widths[i]}}" for i, header in enumerate(headers))
print(formatted_header)
if len(self.isopeptide_bonds) > 0:
# Print each data row using the same formatting
for bond in self.isopeptide_bonds:
row = [
bond.protein_name, bond.probability, bond.chain, bond.r1_bond, bond.r_cat, bond.r2_bond,
bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
bond.rmsd, bond.r_asa, bond.template, bond.bond_length, bond.bond_length_zscore, bond.bond_length_allowed,
bond.pseudo_phi, bond.pseudo_psi, bond.pseudo_omega, bond.phi_psi_likelihood, bond.phi_psi_allowed, bond.omega_psi_likelihood,
bond.omega_psi_allowed, bond.omega_phi_likelihood, bond.omega_phi_allowed
]
formatted_row = "\t".join(f"{str(item):<{column_widths[i]}}" for i, item in enumerate(row))
print(formatted_row)
[docs]
def save_csv(self, output_table: str = "results.csv") -> None:
"""
Save isopeptide bond results in .csv format.
Args:
output_table (str): path to output table
Example:
>>> from isopeptor.isopeptide import Isopeptide
>>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1)
>>> i.predict()
>>> i.save_csv()
>>> # Or with geometry evaluation
>>> i.get_geometry()
>>> i.save_csv()
"""
with open(output_table, "wt") as fh:
if not self.geometry_calculated:
headers = [
"protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
"r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
"rmsd", "r_asa", "template"
]
fh.write(",".join(headers)+"\n")
if len(self.isopeptide_bonds) > 0:
for bond in self.isopeptide_bonds:
row = [
bond.protein_name, bond.probability, bond.chain, bond.r1_bond,
bond.r_cat, bond.r2_bond,
bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
bond.rmsd, bond.r_asa, bond.template
]
row = [str(i) for i in row]
formatted_row = ",".join(row)
fh.write(formatted_row+"\n")
else:
headers = [
"protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
"r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
"rmsd", "r_asa", "template", "bond_length", "bond_length_zscore", "bond_length_allowed",
"pseudo_phi", "pseudo_psi", "pseudo_omega", "phi_psi_likelihood", "phi_psi_allowed", "omega_psi_likelihood",
"omega_psi_allowed", "omega_phi_likelihood", "omega_phi_allowed"
]
if len(self.isopeptide_bonds) > 0:
formatted_header = ",".join(headers)
fh.write(formatted_header+"\n")
for bond in self.isopeptide_bonds:
row = [
bond.protein_name, bond.probability, bond.chain,
bond.r1_bond, bond.r_cat, bond.r2_bond,
bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
bond.rmsd, bond.r_asa, bond.template, bond.bond_length,
bond.bond_length_zscore, bond.bond_length_allowed,
bond.pseudo_phi, bond.pseudo_psi, bond.pseudo_omega,
bond.phi_psi_likelihood, bond.phi_psi_allowed, bond.omega_psi_likelihood,
bond.omega_psi_allowed, bond.omega_phi_likelihood, bond.omega_phi_allowed
]
row = [str(i) for i in row]
formatted_row = ",".join(row)
fh.write(formatted_row+"\n")
[docs]
def get_geometry(self):
"""
Get geometry measures (bond length and dihedral angles and map to existing distribution of measures from
the database of PDB derived structures).
Example:
>>> from isopeptor.isopeptide import Isopeptide
>>> i = Isopeptide("tests/data/test_structures", distance=1.5, fixed_r_asa=0.1)
>>> i.predict()
>>> i.get_geometry()
>>> i.print_tabular()
protein_name probability chain r1_bond r_cat r2_bond r1_bond_name r_cat_name r2_bond_name bond_type rmsd r_asa template bond_length bond_length_zscore bond_length_allowed pseudo_phi pseudo_psi pseudo_omega phi_psi_likelihood phi_psi_allowed omega_psi_likelihood omega_psi_allowed omega_phi_likelihood omega_phi_allowed
8beg 0.991 A 590 636 729 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_590_636_729 1.33 0.024 True 92.305 -123.286 -0.032 -8.706 True -7.801 True -8.797 True
8beg 0.991 A 756 806 894 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_756_806_894 1.328 0.0 True -114.423 -130.136 44.142 -10.503 False -10.472 True -10.503 False
8beg 0.991 A 922 973 1049 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_922_973_1049 1.333 0.061 True 70.944 -131.258 17.005 -9.737 True -8.997 True -9.882 True
8beg 0.991 A 1076 1123 1211 LYS ASP ASN CnaA-like 0.0 0.1 8beg_A_1076_1123_1211 1.341 0.159 True 102.317 -124.836 2.258 -8.286 True -7.882 True -8.233 True
5dz9 0.991 A 556 606 703 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 1.291 -0.451 True 109.407 -112.845 -8.356 -7.72 True -7.722 True -7.884 True
5dz9 0.991 A 730 776 861 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 1.332 0.049 True 97.334 -122.501 6.356 -8.391 True -7.892 True -8.633 True
4z1p 0.991 A 3 53 150 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_3_53_150 1.291 -0.451 True 109.407 -112.845 -8.356 -7.72 True -7.722 True -7.884 True
4z1p 0.991 A 177 223 308 LYS ASP ASN CnaA-like 0.0 0.1 4z1p_A_177_223_308 1.332 0.049 True 97.334 -122.501 6.356 -8.391 True -7.892 True -8.633 True
7woi 0.909 B 57 158 195 LYS GLU ASN CnaB-like 0.314 0.1 5j4m_A_47_139_172 1.312 -0.195 True -148.85 71.47 179.899 -9.376 True -9.126 True -9.675 True
1amx 0.882 A 176 209 293 LYS ASP ASN CnaA-like 0.353 0.1 2f68_X_176_209_293 3.345 24.598 False 96.887 -147.455 66.383 -10.918 False -14.477 False -25.835 False
7woi 0.875 A 203 246 318 LYS ASP ASN CnaA-like 0.363 0.1 4hss_A_187_224_299 1.336 0.098 True 136.992 -158.356 4.495 -14.872 False -12.529 False -9.337 True
7woi 0.838 B 355 435 466 LYS GLU ASN CnaB-like 0.403 0.1 8f70_A_299_386_437 1.36 0.39 True 71.106 149.083 171.017 -11.508 False -9.75 True -11.537 False
6to1_af 0.607 A 13 334 420 LYS ASP ASN CnaA-like 0.565 0.1 5mkc_A_191_600_695 2.928 19.512 False 83.132 -155.408 131.186 -13.496 False -51.581 False -15.775 False
"""
self.geometry_calculated = True
bonds = self.isopeptide_bonds
for struct_file in set([b.struct_file for b in bonds]):
structure = get_structure(struct_file)
for bond in [b for b in bonds if b.struct_file == struct_file]:
# Get bond lenght and stats
bond_length, bond_length_zscore, bond_length_allowed = get_bond_stats(structure, bond.chain, bond.r1_bond,
bond.r2_bond, bond.r2_bond_name)
# Get dihedrals and stats
angle_stats = get_dihedral_angles_stats(structure, bond.chain, bond.r1_bond,
bond.r2_bond, bond.r2_bond_name, bond.bond_type)
pseudo_omega, pseudo_phi, pseudo_psi, phi_psi_likelihood, omega_psi_likelihood, omega_phi_likelihood, phi_psi_allowed, omega_psi_allowed, omega_phi_allowed = angle_stats
# Load values into the bond instance
bond.bond_length = bond_length
bond.bond_length_zscore = bond_length_zscore
bond.bond_length_allowed = bond_length_allowed
bond.pseudo_omega = pseudo_omega
bond.pseudo_phi = pseudo_phi
bond.pseudo_psi = pseudo_psi
bond.phi_psi_likelihood = phi_psi_likelihood
bond.omega_psi_likelihood = omega_psi_likelihood
bond.omega_phi_likelihood = omega_phi_likelihood
bond.phi_psi_allowed = phi_psi_allowed
bond.omega_psi_allowed = omega_psi_allowed
bond.omega_phi_allowed = omega_phi_allowed
def _load_hits(self):
"""
Load pyjess._jess.Hit as list of bondElement in self.isopeptide_bonds.
Raises:
ValueError if number of residues found is not expected.
"""
for hit in self.jess_hits:
template, rmsd, struct_file, atoms = hit.template.id, hit.rmsd, hit.molecule.id, hit.atoms()
protein_name = os.path.basename(struct_file).replace(".pdb", "").replace(".cif", "")
residues, residue_names = [], []
for atom in atoms:
if atom.residue_number not in residues:
residues.append(atom.residue_number)
residue_names.append(atom.residue_name)
chain = atom.chain_id
if len(residues) == 3:
self.isopeptide_bonds.append(
BondElement(
struct_file, protein_name, round(rmsd, 3), template, chain,
residues[0], residues[1], residues[2],
residue_names[0], residue_names[1], residue_names[2]
)
)
else:
raise ValueError(f"Found {len(residues)} residues.")
def _reduce_redundant(self):
"""
Reduces redundant isopeptide bond predictions by keeping prediction with lower RMSD.
"""
bonds = self.isopeptide_bonds
grouped_bonds = {}
for bond in bonds:
key = (bond.protein_name, bond.r1_bond, bond.r_cat, bond.r2_bond)
if key not in grouped_bonds or bond.rmsd < grouped_bonds[key].rmsd:
grouped_bonds[key] = bond
# Keep only the best bond for each group
self.isopeptide_bonds = list(grouped_bonds.values())
def _calc_rasa(self):
"""
Calculate r_asa for every isopeptide bond in every structure.
"""
bonds = self.isopeptide_bonds
for struct_file in set([b.struct_file for b in bonds]):
# Get asa and structure atom array pdb file
structure_sasa, structure = get_structure_asa(struct_file)
# Get asa of isopeptide residues
for bond in [b for b in bonds if b.struct_file == struct_file]:
isopep_residues = [bond.r1_bond, bond.r_cat, bond.r2_bond]
isopep_residue_names = [bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name]
tmp_r_asa = 0
for res_id, res_name in zip(isopep_residues, isopep_residue_names):
# Handle the case of two res bb
if res_name == None:
continue
res_indeces = [i for i, atom in enumerate(structure) if atom.res_id == res_id and atom.chain_id == bond.chain]
#res_name = structure[res_indeces[0]].res_name
if res_name not in MAX_ASA["rost_sander"].keys():
raise ValueError(f"{res_name} not in {MAX_ASA['rost_sander'].keys()}")
# Normalise ASA by residue surface area
r_asa = sum([structure_sasa[i] for i in res_indeces]) / MAX_ASA["rost_sander"][res_name]
tmp_r_asa += r_asa
bond.r_asa = round(tmp_r_asa / 3, 3)
def _infer(self):
"""
Infer presence of isoeptide bond using logistic regression model.
"""
for bond in self.isopeptide_bonds:
bond.probability = predict(bond.rmsd, bond.r_asa)
def _infer_type(self):
"""
Infer isopeptide bond type (CnaA/B-like).
"""
for bond in self.isopeptide_bonds:
bond.bond_type = BOND_TYPE.get(bond.template, None)
if __name__ == "__main__":
import doctest
doctest.testmod(report=True, optionflags=doctest.NORMALIZE_WHITESPACE)