Source code for structural.clusters

from moleculekit.molecule import Molecule
from moleculekit.vmdviewer import viewer
import numpy as np
from scipy.spatial.distance import cdist
from collections import Counter
import sys, getopt
from graph_tool.all import *
import math
from typing import Tuple, Dict

import logging
import moleculekit

logging.getLogger(moleculekit.molecule.__name__).setLevel(logging.WARNING)
logger = logging.getLogger('hydrophobic_clusters')
from typing import NamedTuple


[docs]class Cluster(NamedTuple):
    area: np.array
    residues: list
    contacts: int
    ratio_contacts_residue: float
    ratio_area_residue: float


# Paths and variables
sel = "protein and chain A and not backbone and noh and resname ILE VAL LEU"
# mol_path='/agh/projects/PotF/data/potf/ligand/structures/potfwt-spd/3pk1/phenix/Refine_49/wt_potf_spd_refine_TLS_NCSfull_hydrogens_b20_049.pdb'
_ATOMIC_RADII = {'C': 1.88}
water_radius = 1.4
sphere_radius_carbon = _ATOMIC_RADII['C'] + water_radius
sphere_points = 610
sphere_area_const = 4.0 * math.pi * (sphere_radius_carbon ** 2) / sphere_points


[docs]class Atom:
    """
    Class to handle ILV atoms
    index: the vmd atom index
    mol: an htmd.htmd.molecule object
    """
    radius = _ATOMIC_RADII['C']  # All distances in Angstrom

    def __init__(self, index: int, mol: Molecule) -> None:
        self.index = index
        self.coords = mol.coords[index][:, 0]
        self.resid = mol.resid[index]
        self.point_coords = generate_sphere_points(self.coords, sphere_points, self.radius)
        self.neighbor_indices = self.get_neighbors(mol)

[docs]    def get_neighbors(self, mol: Molecule) -> np.array:
        """
        Provides all indices of atoms within 6.56 A of this atom.
        6.56 is the upper bound of a possible neighbor 1.88 (C) + 1.4 + 1.4 + 1.88 (C).
        """
        neighbor_indices = mol.get("index", sel=f"protein and chain A\
        and noh and not resid '{self.resid}' and within 6.56 of index '{self.index}'")
        return neighbor_indices


[docs]def generate_sphere_points(coords: np.array, n: int = 610, radius: float = 1.88) -> np.array:
    """

    :param coords: The coordinates of the center of the atom
    :param n: number of points to sample
    :param radius: the radius of the atom
    :return: a nx3 vector with the point coordinates
    """
    total_radius = radius + water_radius
    points = []
    inc = math.pi * (3 - math.sqrt(5))
    offset = 2 / float(n)
    for k in range(int(n)):
        y = k * offset - 1 + (offset / 2)
        r = math.sqrt(1 - y * y)
        phi = k * inc
        points.append([math.cos(phi) * r, y, math.sin(phi) * r])
    vec = np.asarray(points)
    vec *= total_radius
    vec += coords
    return vec


[docs]def retrieve_neighbor_positions(atom: Atom, mol: Molecule) -> Tuple[np.array, Dict[int, int]]:
    """

    :param atom: an Atom object
    :param mol: a htmd.htmd.molecule object
    :return:
    Positions of the neighboring atoms
    A dictionary indexing column positions to resid positions
    """
    positions = mol.coords[atom.neighbor_indices][:, :, 0]
    position_index_to_resid = {index: mol.resid[neighbor_indice] for index, neighbor_indice in
                               enumerate(atom.neighbor_indices)}
    return positions, position_index_to_resid


[docs]def retrieve_indices(matrix: np.array, coords: np.array, neighborpositions: np.array, radius: float = 1.88) -> np.array:
    """
    Computes if each of the n sphere points are penetrating neighboring spheres
    :param matrix: n x m Distance matrix where n is the number of sphere points and m the number of neighbors
    :param coords: the coordinates of the atom
    :param neighborpositions: Coordinates of the neighbors
    :param radius: radius of the atom
    :return: The atoms that are in closest with each n sphere points.
    """

    ## When a row contains several atoms in contact,
    #  select the one with the closest center to the center of atom A
    sphere_radius = water_radius + radius
    #When a point is within the sphere of other atoms, the atom that is closest is chosen.
    # We need to compute the distances between all atom-neighbor pairs.
    dist_center_atoms = cdist(np.reshape(coords, (1, 3)), neighborpositions)
    ranking = np.argsort(dist_center_atoms)
    valid = matrix <= sphere_radius
    idx2 = []
    for row in valid:
        if row.any():
            # 1. ordering the row with Falses and Trues according to the ranking order
            # 2. Getting the first element that is true, which will be the first in the
            # raking list, thus the index of the closest atom.
            idx2.append(ranking[0][np.where(np.isin(ranking, np.where(row)))[1][0]])
    return idx2


[docs]def fill_matrices(atom: Atom, mol: Molecule, atom_matrix: np.array,
                  resid_matrix: np.array, indices: np.array, resids: np.array) -> Tuple[np.array, np.array]:
    """
    :param atom: An Atom class
    :param mol: an htmd.htmd.molecule object
    :param atom_matrix: the index x index area matrix
    :param resid_matrix: the ILVresid x ILVresid area matrix
    :param indices: the indices that belong to ILV sidechain heavy atoms
    :param resids: the resids that belong to ILV sidechain heavy atoms
    :return: Updated atom_matrix and resid_matrix
    """
    neighbor_positions, position_index_to_resid = retrieve_neighbor_positions(atom, mol)
    # Compute distances between all sphere points and the neighbors.
    # THe shape will be 610 (rows) x nr. of neighbors (columns).
    distances = cdist(atom.point_coords, neighbor_positions)
    column_indices = retrieve_indices(distances, atom.coords, neighbor_positions)
    colpos_occurrences = Counter(column_indices)
    for colpos, occurrences in colpos_occurrences.items():
        if atom.neighbor_indices[colpos] in indices:
            area = sphere_area_const * occurrences
            atom_matrix[atom.index, atom.neighbor_indices[colpos]] = area
            index_i = np.where(resids == atom.resid)[0][0]
            index_j = np.where(resids == position_index_to_resid[colpos])[0][0]
            resid_matrix[index_i, index_j] += area
    return atom_matrix, resid_matrix


[docs]def create_graph(resid_matrix: np.array, resid_list: np.array, cutoff_area: float = 10.0) -> Graph:
    """

    :param resid_matrix: the ILVresid x ILVresid area matrix
    :param resid_list: the index x index area matrix
    :return: A Graph object where each component is a ILV cluster
    """
    g = Graph()
    g.vp.resid = g.new_vertex_property("int")
    g.ep.area = g.new_edge_property("float")
    # 1. Create all vertices
    for v in range(len(resid_matrix)):
        v1 = g.add_vertex()
        g.vp.resid[v1] = resid_list[v]

    # 2. Create edges and fill its values with areas
    for row_index, row in enumerate(resid_matrix):
        v1 = g.vertex(row_index)
        for column_index, area in enumerate(row):
            v2 = g.vertex(column_index)
            if area > cutoff_area:
                ae = g.add_edge(v1, v2)
                g.ep.area[ae] = area
    return g


[docs]def filter_mol(inputmolfile: str) -> Molecule:
    """
    Loads, filters the object to chain A and writes filtered pdb out.
    :param inputmolfile: path to the pdb file
    :return: the Molecule object
    """
    mol = Molecule(inputmolfile)
    mol.filter("protein and chain A")
    mol.write(f"{inputmolfile[:-4]}-chainA.pdb")
    return mol


[docs]def write_clusters(g: Graph, components: PropertyArray, inputmolfile: str, outputname: str) -> None:
    """
    This function prints the clusters to the terminal and outputs them
    into a VMD session
    :param g: A Graph object
    :param outputname: The pdb filename.
    :param outputname: The file name to output the VMD session to.
    :return:
    """
    f = open(outputname[:-4] + ".txt", "w")
    mol = Molecule(f"{inputmolfile[:-4]}-chainA.pdb")
    mol.reps.add(sel='protein', style='NewCartoon', color=8)
    for cluster_index in range(max(components) + 1):
        cluster = [i for i, x in enumerate(components) if x == cluster_index]
        if len(cluster) < 2: continue
        vfilt = g.new_vertex_property('bool')
        for i in cluster: vfilt[i] = True
        sub = GraphView(g, vfilt)
        area = np.sum([g.ep.area[edge] for edge in sub.edges()])
        f.write(f"Cluster index {cluster_index}:\t"
                f"Residues {len(cluster)}. Total area {area} A^2.\t "
                f"Number of contacts: {sub.num_edges()}.\t "
                f"Contacts / Residue: {sub.num_edges() / len(cluster) }.\t "
                f"Area / Residue {area / sub.num_edges()}\n")  # sum reverse and inverse
        resid_cluster = [g.vp.resid[i] for i in cluster]
        mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
                     style="VDW", color=cluster_index)

    vmdobject = viewer(dispdev='text')
    vmdobject.loadMol(mol, name=inputmolfile)
    vmdobject.send("save_state " + outputname)
    vmdobject.close()
    f.close()


[docs]def add_clusters(mol, g: Graph, components: PropertyArray):
    """

    :param mol:
    :param g:
    :param components:
    :return: Molecule representations
            A list with Cluster objects
    """
    clusters = []
    mol.reps.add(sel='protein', style='NewCartoon', color=8)
    for cluster_index in range(max(components) + 1):
        cluster = [i for i, x in enumerate(components) if x == cluster_index]
        if len(cluster) < 2: continue
        vfilt = g.new_vertex_property('bool')
        for i in cluster: vfilt[i] = True
        sub = GraphView(g, vfilt)
        area = np.sum([g.ep.area[edge] for edge in sub.edges()])
        logger.info(f"Cluster index {cluster_index}:"
                    f"Residues {len(cluster)}. Total area {area} A^2. "
                    f"Number of contacts: {sub.num_edges()}. "
                    f"Contacts / Residue: {sub.num_edges() / len(cluster) }. "
                    f"Area / Residue {area / sub.num_edges()}")  # sum reverse and inverse
        resid_cluster = [g.vp.resid[i] for i in cluster]
        clusters.append(Cluster(
            area=area,
            residues=resid_cluster,
            contacts=sub.num_edges(),
            ratio_contacts_residue=sub.num_edges() / len(cluster),
            ratio_area_residue=area / sub.num_edges()
        ))
        mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
                     style="VDW", color=cluster_index)
    return clusters


[docs]def write_largest_cluster(g: Graph, inputmolfile: str, outputname: str) -> None:
    """
    This function prints the clusters to the terminal and outputs them
    into a VMD session
    :param g: A Graph object
    :param outputname: The pdb filename.
    :param outputname: The file name to output the VMD session to.
    :return:
    """
    mol = Molecule(f"{inputmolfile[:-4]}-chainA.pdb")
    mol.reps.add(sel='protein', style='NewCartoon', color=8)
    l = label_largest_component(g)
    sub = GraphView(g, vfilt=l)
    resid_cluster = [g.vp.resid[i] for i in sub.vertices()]
    mol.reps.add('chain A and noh and not backbone and resid %s' % ' '.join(map(str, resid_cluster)),
                 style="VDW", color=1)  # red
    vmdobject = viewer(dispdev='text')
    vmdobject.loadMol(mol, name=inputmolfile)
    vmdobject.send("save_state " + outputname)
    vmdobject.close()


[docs]def postprocess_session(inputmolfile: str, outputname: str) -> None:
    """
    Modifies the VMD session to not include tmp files
    :param outputname: The vmd session (output file)
    :param inputmolfile: Path to the pdb file already processed (filtered and or protonated)
    :return:
    """
    f = open(outputname, "r")
    lines = f.readlines()
    f.close()
    f = open(f"{outputname}", "w")
    for line in lines:
        if "mol new /tmp/" in line:
            line = f"mol new {inputmolfile}" \
                   f" type pdb first 0 last -1 step 1 filebonds 1 autobonds 1 waitfor all\n"
            f.write(line)
        elif "mol addfile /tmp/" in line:
            continue
        else:
            f.write(line)
    f.close()


def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError("usage:"):
        print('hydroClusters.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Initialize molecules
    logger.info("Filtering and writing PDB")
    mol = filter_mol(inputfile)
    resids = np.unique(mol.get("resid", sel=sel))
    dims = len((resids))
    indices = mol.get("index", sel=sel)
    dims_indices = len(mol.get("index", sel="protein and chain A"))

    # Initializing Final output
    logger.info("Initializing final output")
    contacts = np.zeros((dims, dims))
    atoms_to_atoms = np.zeros((dims_indices, dims_indices))

    # Filling matrices
    logger.info("Computing clusters")
    for index in indices:
        a = Atom(index, mol)
        if not a.neighbor_indices.any(): continue
        _, contacts = fill_matrices(a, mol, atoms_to_atoms, contacts, indices, resids)

    # Rendering clusters
    graph = create_graph(contacts, resids)
    comp, _ = label_components(graph)
    if comp.a.any():
        write_clusters(graph, comp.a, inputfile, outputfile)
        inputfile_processed = f"{inputfile[:-4]}-chainA.pdb"
        postprocess_session(inputfile_processed, outputfile)
        summary_output = outputfile[:-4] + "-major.vmd"
        write_largest_cluster(graph, inputfile, summary_output)
        postprocess_session(inputfile, summary_output)
        logger.info("Saving VMD sessions")
    else:
        logging.warning("The ILV residues are not in contact")


if __name__ == "__main__":
    main(sys.argv[1:])