from moleculekit.molecule import Molecule
from moleculekit.vmdviewer import viewer
import numpy as np
from scipy.spatial.distance import cdist
from collections import Counter
import sys, getopt
from graph_tool.all import *
import math
from typing import Tuple, Dict
import logging
import moleculekit
logging.getLogger(moleculekit.molecule.__name__).setLevel(logging.WARNING)
logger = logging.getLogger('hydrophobic_clusters')
from typing import NamedTuple
[docs]class Cluster(NamedTuple):
area: np.array
residues: list
contacts: int
ratio_contacts_residue: float
ratio_area_residue: float
# Paths and variables
sel = "protein and chain A and not backbone and noh and resname ILE VAL LEU"
# mol_path='/agh/projects/PotF/data/potf/ligand/structures/potfwt-spd/3pk1/phenix/Refine_49/wt_potf_spd_refine_TLS_NCSfull_hydrogens_b20_049.pdb'
_ATOMIC_RADII = {'C': 1.88}
water_radius = 1.4
sphere_radius_carbon = _ATOMIC_RADII['C'] + water_radius
sphere_points = 610
sphere_area_const = 4.0 * math.pi * (sphere_radius_carbon ** 2) / sphere_points
[docs]class Atom:
"""
Class to handle ILV atoms
index: the vmd atom index
mol: an htmd.htmd.molecule object
"""
radius = _ATOMIC_RADII['C'] # All distances in Angstrom
def __init__(self, index: int, mol: Molecule) -> None:
self.index = index
self.coords = mol.coords[index][:, 0]
self.resid = mol.resid[index]
self.point_coords = generate_sphere_points(self.coords, sphere_points, self.radius)
self.neighbor_indices = self.get_neighbors(mol)
[docs] def get_neighbors(self, mol: Molecule) -> np.array:
"""
Provides all indices of atoms within 6.56 A of this atom.
6.56 is the upper bound of a possible neighbor 1.88 (C) + 1.4 + 1.4 + 1.88 (C).
"""
neighbor_indices = mol.get("index", sel=f"protein and chain A\
and noh and not resid '{self.resid}' and within 6.56 of index '{self.index}'")
return neighbor_indices
[docs]def generate_sphere_points(coords: np.array, n: int = 610, radius: float = 1.88) -> np.array:
"""
:param coords: The coordinates of the center of the atom
:param n: number of points to sample
:param radius: the radius of the atom
:return: a nx3 vector with the point coordinates
"""
total_radius = radius + water_radius
points = []
inc = math.pi * (3 - math.sqrt(5))
offset = 2 / float(n)
for k in range(int(n)):
y = k * offset - 1 + (offset / 2)
r = math.sqrt(1 - y * y)
phi = k * inc
points.append([math.cos(phi) * r, y, math.sin(phi) * r])
vec = np.asarray(points)
vec *= total_radius
vec += coords
return vec
[docs]def retrieve_neighbor_positions(atom: Atom, mol: Molecule) -> Tuple[np.array, Dict[int, int]]:
"""
:param atom: an Atom object
:param mol: a htmd.htmd.molecule object
:return:
Positions of the neighboring atoms
A dictionary indexing column positions to resid positions
"""
positions = mol.coords[atom.neighbor_indices][:, :, 0]
position_index_to_resid = {index: mol.resid[neighbor_indice] for index, neighbor_indice in
enumerate(atom.neighbor_indices)}
return positions, position_index_to_resid
[docs]def retrieve_indices(matrix: np.array, coords: np.array, neighborpositions: np.array, radius: float = 1.88) -> np.array:
"""
Computes if each of the n sphere points are penetrating neighboring spheres
:param matrix: n x m Distance matrix where n is the number of sphere points and m the number of neighbors
:param coords: the coordinates of the atom
:param neighborpositions: Coordinates of the neighbors
:param radius: radius of the atom
:return: The atoms that are in closest with each n sphere points.
"""
## When a row contains several atoms in contact,
# select the one with the closest center to the center of atom A
sphere_radius = water_radius + radius
#When a point is within the sphere of other atoms, the atom that is closest is chosen.
# We need to compute the distances between all atom-neighbor pairs.
dist_center_atoms = cdist(np.reshape(coords, (1, 3)), neighborpositions)
ranking = np.argsort(dist_center_atoms)
valid = matrix <= sphere_radius
idx2 = []
for row in valid:
if row.any():
# 1. ordering the row with Falses and Trues according to the ranking order
# 2. Getting the first element that is true, which will be the first in the
# raking list, thus the index of the closest atom.
idx2.append(ranking[0][np.where(np.isin(ranking, np.where(row)))[1][0]])
return idx2
[docs]def fill_matrices(atom: Atom, mol: Molecule, atom_matrix: np.array,
resid_matrix: np.array, indices: np.array, resids: np.array) -> Tuple[np.array, np.array]:
"""
:param atom: An Atom class
:param mol: an htmd.htmd.molecule object
:param atom_matrix: the index x index area matrix
:param resid_matrix: the ILVresid x ILVresid area matrix
:param indices: the indices that belong to ILV sidechain heavy atoms
:param resids: the resids that belong to ILV sidechain heavy atoms
:return: Updated atom_matrix and resid_matrix
"""
neighbor_positions, position_index_to_resid = retrieve_neighbor_positions(atom, mol)
# Compute distances between all sphere points and the neighbors.
# THe shape will be 610 (rows) x nr. of neighbors (columns).
distances = cdist(atom.point_coords, neighbor_positions)
column_indices = retrieve_indices(distances, atom.coords, neighbor_positions)
colpos_occurrences = Counter(column_indices)
for colpos, occurrences in colpos_occurrences.items():
if atom.neighbor_indices[colpos] in indices:
area = sphere_area_const * occurrences
atom_matrix[atom.index, atom.neighbor_indices[colpos]] = area
index_i = np.where(resids == atom.resid)[0][0]
index_j = np.where(resids == position_index_to_resid[colpos])[0][0]
resid_matrix[index_i, index_j] += area
return atom_matrix, resid_matrix
[docs]def create_graph(resid_matrix: np.array, resid_list: np.array, cutoff_area: float = 10.0) -> Graph:
"""
:param resid_matrix: the ILVresid x ILVresid area matrix
:param resid_list: the index x index area matrix
:return: A Graph object where each component is a ILV cluster
"""
g = Graph()
g.vp.resid = g.new_vertex_property("int")
g.ep.area = g.new_edge_property("float")
# 1. Create all vertices
for v in range(len(resid_matrix)):
v1 = g.add_vertex()
g.vp.resid[v1] = resid_list[v]
# 2. Create edges and fill its values with areas
for row_index, row in enumerate(resid_matrix):
v1 = g.vertex(row_index)
for column_index, area in enumerate(row):
v2 = g.vertex(column_index)
if area > cutoff_area:
ae = g.add_edge(v1, v2)
g.ep.area[ae] = area
return g
[docs]def filter_mol(inputmolfile: str) -> Molecule:
"""
Loads, filters the object to chain A and writes filtered pdb out.
:param inputmolfile: path to the pdb file
:return: the Molecule object
"""
mol = Molecule(inputmolfile)
mol.filter("protein and chain A")
mol.write(f"{inputmolfile[:-4]}-chainA.pdb")
return mol
[docs]def write_clusters(g: Graph, components: PropertyArray, inputmolfile: str, outputname: str) -> None:
"""
This function prints the clusters to the terminal and outputs them
into a VMD session
:param g: A Graph object
:param outputname: The pdb filename.
:param outputname: The file name to output the VMD session to.
:return:
"""
f = open(outputname[:-4] + ".txt", "w")
mol = Molecule(f"{inputmolfile[:-4]}-chainA.pdb")
mol.reps.add(sel='protein', style='NewCartoon', color=8)
for cluster_index in range(max(components) + 1):
cluster = [i for i, x in enumerate(components) if x == cluster_index]
if len(cluster) < 2: continue
vfilt = g.new_vertex_property('bool')
for i in cluster: vfilt[i] = True
sub = GraphView(g, vfilt)
area = np.sum([g.ep.area[edge] for edge in sub.edges()])
f.write(f"Cluster index {cluster_index}:\t"
f"Residues {len(cluster)}. Total area {area} A^2.\t "
f"Number of contacts: {sub.num_edges()}.\t "
f"Contacts / Residue: {sub.num_edges() / len(cluster) }.\t "
f"Area / Residue {area / sub.num_edges()}\n") # sum reverse and inverse
resid_cluster = [g.vp.resid[i] for i in cluster]
mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
style="VDW", color=cluster_index)
vmdobject = viewer(dispdev='text')
vmdobject.loadMol(mol, name=inputmolfile)
vmdobject.send("save_state " + outputname)
vmdobject.close()
f.close()
[docs]def add_clusters(mol, g: Graph, components: PropertyArray):
"""
:param mol:
:param g:
:param components:
:return: Molecule representations
A list with Cluster objects
"""
clusters = []
mol.reps.add(sel='protein', style='NewCartoon', color=8)
for cluster_index in range(max(components) + 1):
cluster = [i for i, x in enumerate(components) if x == cluster_index]
if len(cluster) < 2: continue
vfilt = g.new_vertex_property('bool')
for i in cluster: vfilt[i] = True
sub = GraphView(g, vfilt)
area = np.sum([g.ep.area[edge] for edge in sub.edges()])
logger.info(f"Cluster index {cluster_index}:"
f"Residues {len(cluster)}. Total area {area} A^2. "
f"Number of contacts: {sub.num_edges()}. "
f"Contacts / Residue: {sub.num_edges() / len(cluster) }. "
f"Area / Residue {area / sub.num_edges()}") # sum reverse and inverse
resid_cluster = [g.vp.resid[i] for i in cluster]
clusters.append(Cluster(
area=area,
residues=resid_cluster,
contacts=sub.num_edges(),
ratio_contacts_residue=sub.num_edges() / len(cluster),
ratio_area_residue=area / sub.num_edges()
))
mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
style="VDW", color=cluster_index)
return clusters
[docs]def write_largest_cluster(g: Graph, inputmolfile: str, outputname: str) -> None:
"""
This function prints the clusters to the terminal and outputs them
into a VMD session
:param g: A Graph object
:param outputname: The pdb filename.
:param outputname: The file name to output the VMD session to.
:return:
"""
mol = Molecule(f"{inputmolfile[:-4]}-chainA.pdb")
mol.reps.add(sel='protein', style='NewCartoon', color=8)
l = label_largest_component(g)
sub = GraphView(g, vfilt=l)
resid_cluster = [g.vp.resid[i] for i in sub.vertices()]
mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
style="VDW", color=1) # red
vmdobject = viewer(dispdev='text')
vmdobject.loadMol(mol, name=inputmolfile)
vmdobject.send("save_state " + outputname)
vmdobject.close()
[docs]def postprocess_session(inputmolfile: str, outputname: str) -> None:
"""
Modifies the VMD session to not include tmp files
:param outputname: The vmd session (output file)
:param inputmolfile: Path to the pdb file already processed (filtered and or protonated)
:return:
"""
f = open(outputname, "r")
lines = f.readlines()
f.close()
f = open(f"{outputname}", "w")
for line in lines:
if "mol new /tmp/" in line:
line = f"mol new {inputmolfile}" \
f" type pdb first 0 last -1 step 1 filebonds 1 autobonds 1 waitfor all\n"
f.write(line)
elif "mol addfile /tmp/" in line:
continue
else:
f.write(line)
f.close()
def main(argv):
inputfile = ''
outputfile = ''
try:
opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
except getopt.GetoptError("usage:"):
print('hydroClusters.py -i <inputfile> -o <outputfile>')
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('test.py -i <inputfile> -o <outputfile>')
sys.exit()
elif opt in ("-i", "--ifile"):
inputfile = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
# Initialize molecules
logger.info("Filtering and writing PDB")
mol = filter_mol(inputfile)
resids = np.unique(mol.get("resid", sel=sel))
dims = len((resids))
indices = mol.get("index", sel=sel)
dims_indices = len(mol.get("index", sel="protein and chain A"))
# Initializing Final output
logger.info("Initializing final output")
contacts = np.zeros((dims, dims))
atoms_to_atoms = np.zeros((dims_indices, dims_indices))
# Filling matrices
logger.info("Computing clusters")
for index in indices:
a = Atom(index, mol)
if not a.neighbor_indices.any(): continue
_, contacts = fill_matrices(a, mol, atoms_to_atoms, contacts, indices, resids)
# Rendering clusters
graph = create_graph(contacts, resids)
comp, _ = label_components(graph)
if comp.a.any():
write_clusters(graph, comp.a, inputfile, outputfile)
inputfile_processed = f"{inputfile[:-4]}-chainA.pdb"
postprocess_session(inputfile_processed, outputfile)
summary_output = outputfile[:-4] + "-major.vmd"
write_largest_cluster(graph, inputfile, summary_output)
postprocess_session(inputfile, summary_output)
logger.info("Saving VMD sessions")
else:
logging.warning("The ILV residues are not in contact")
if __name__ == "__main__":
main(sys.argv[1:])