Source code for psico.aggrescanning

"""
Implementation of AGGRESCAN according to Conchillo-Solé et al. 2007 and the
corresponding web tool at http://bioinf.uab.es/aap/

Implementation details are tweaked to match the web tool results and may
devitate from the paper.

(c) 2022 Thomas Holder
(c) 2022 Leukocare AG

License: BSD-2-Clause
"""

from pymol import cmd, CmdException

# Amino-acid aggregation-propensity value (a3v)
# N. Sánchez de Groot et al.
# Prediction of "hot spots" of aggregation in disease-linked polypeptides.
# BMC Structural Biology 2005, 5:18
AGGRESCAN_A3V = {
    "I": 1.822,
    "F": 1.754,
    "V": 1.594,
    "L": 1.380,
    "Y": 1.159,
    "W": 1.037,
    "M": 0.910,
    "C": 0.604,
    "A": -0.036,
    "T": -0.159,
    "S": -0.294,
    "P": -0.334,
    "G": -0.535,
    "K": -0.931,
    "H": -1.033,
    "Q": -1.231,
    "R": -1.240,
    "N": -1.302,
    "E": -1.412,
    "D": -1.836,
}

# termini get average scores of charged residues, according to publication
AGGRESCAN_A3V["^"] = (AGGRESCAN_A3V["K"] + AGGRESCAN_A3V["R"]) / 2
AGGRESCAN_A3V["$"] = (AGGRESCAN_A3V["D"] + AGGRESCAN_A3V["E"]) / 2

# FIXME values in web tool appear to be swapped!
AGGRESCAN_A3V["^"], AGGRESCAN_A3V["$"] = AGGRESCAN_A3V["$"], AGGRESCAN_A3V["^"]

# Hot Spot Threshold (HST)
AGGRESCAN_HST = -0.02

# Minimum number of residues per hotspot
HS_MIN_RES_COUNT = 5


[docs] def mean(values: list) -> float: """Compute the arithmetic mean""" return sum(values) / len(values)
[docs] def trapezoidal_integration(values: list) -> float: """Trapezoidal integration (midpoint rule)""" return sum(values[1:-1]) + (values[0] + values[-1]) / 2
[docs] def aggrescan1d_scores(seq: str, winsize: int = 0) -> dict: """Aggrescan reimplementation. Equivalent to http://bioinf.uab.es/aggrescan/ Conchillo-Solé et al. AGGRESCAN: a server for the prediction and evaluation of "hot spots" of aggregation in polypeptides. BMC Bioinformatics 8, 65 (2007). https://doi.org/10.1186/1471-2105-8-65 Args: seq: Amino acid sequence in single letter code. winsize: Window size, must be odd and larger than 2. A value of 0 will choose a window size based on the sequence length. Returns: Dictionary with aggregation and hotspot scores. Dictionary keys: - a3vSA: a3v Sequence Average - nHS: Number of Hot Spots - NnHS: Normalized number of Hot Spots for 100 residues - AAT: Area of the Aggregation Profile above the hot-spot Threshold - THSA: Total Hot-Spot Area - TA: Total Area of the aggregation profile - AATr: AAT per residue - THSAr: THSA per residue - a4vSS: a4v Sequence Sum - Na4vSS: Normalized a4vSS for 100 residues - a3v: amino-acid aggregation-propensity value - a4v: a3v window average - HSA: Hot-Spot Area - NHSA: Normalized Hot-Spot Area per residue - a4vAHS: a4v average in the Hot Spot """ assert isinstance(seq, str) non_aa = set(seq).difference("ACDEFGHIKLMNPQRSTVWY") if non_aa: print(f"Warning: unexpected codes in sequence: {non_aa}") N = len(seq) if not winsize: if N <= 75: winsize = 5 elif N <= 175: winsize = 7 elif N <= 300: winsize = 9 else: winsize = 11 assert winsize > 2 and winsize % 2 == 1 # amino-acid aggregation-propensity value (a3v) a3v = [AGGRESCAN_A3V.get(aa, AGGRESCAN_A3V["A"]) for aa in seq] a3v_with_termini = [AGGRESCAN_A3V["^"]] + a3v + [AGGRESCAN_A3V["$"]] a4v = [ mean(a3v_with_termini[i:i + winsize]) for i in range(N - winsize + 3) ] # "The remaining off-centre N- and C-terminal residues are assigned the a4v # calculated for the first and last window centres, respectively." offcenter = winsize // 2 - 1 a4v = [a4v[0]] * offcenter + a4v + [a4v[-1]] * offcenter assert len(a3v) == N assert len(a4v) == N a4v_minus_hst = [(v - AGGRESCAN_HST) for v in a4v] nHS = 0 # Number of Hot Spots (nHS) HSA = [0.0] * N # Hot-Spot Area (HSA) NHSA = [0.0] * N # Normalized Hot-Spot Area per residue a4vAHS = [0.0] * N # a4v average in the Hot Spot run_hs = 0 for (j, aa) in enumerate(seq + "P"): if aa != "P" and a4v_minus_hst[j] > 0: run_hs += 1 continue if run_hs > 0: i = j - run_hs hsa = sum(a4v_minus_hst[i:j]) # trapezoidal_integration? # FIXME Needed to match values in web tool if i == 0: hsa += a4v_minus_hst[0] / 2 HSA[i:j] = [hsa] * run_hs a4vAHS[i:j] = [mean(a4v[i:j])] * run_hs if run_hs >= HS_MIN_RES_COUNT: NHSA[i:j] = [hsa / run_hs] * run_hs nHS += 1 run_hs = 0 ta = trapezoidal_integration(a4v_minus_hst) aat = sum(max(0, v) for v in a4v_minus_hst) # trapezoidal_integration? thsa = sum(NHSA) a4vSS = sum(a4v) return { "a3vSA": mean(a3v), "nHS": nHS, "NnHS": nHS / N * 100, "AAT": aat, "THSA": thsa, "TA": ta, "AATr": aat / N, "THSAr": thsa / N, "a4vSS": a4vSS, "Na4vSS": a4vSS / N * 100, "table": { "a3v": a3v, "a4v": a4v, "HSA": HSA, "NHSA": NHSA, "a4vAHS": a4vAHS, }, }
[docs] @cmd.extend def aggrescan1d(selection="polymer", key="a4v", palette="white forest", minimum=0, maximum=None, *, var='b', quiet=1, _self=cmd): """ DESCRIPTION Aggrescan (sequence based). ARGUMENTS key = a3v | a4v | HSA | NHSA | a4vAHS: Score used for coloring. For the raw aggregation profile, use a4v. For hotspots, use NHSA. {default: a4v} var = b | q | ...: PyMOL property to assign the score to {default: b} """ from psico.querying import iterate_to_list if maximum is None: maximum = { "a3v": 1.8, "a4v": 0.7, "HSA": 7, "NHSA": 0.5, "a4vAHS": 0.5, }.get(key, 5) ids_key = "(model, segi, chain), resi, oneletter" ids_all = iterate_to_list(f"({selection}) & guide", ids_key, _self=_self) data = {} def gen_chain_ranges(): i, prev = -1, () for j, idtuple in enumerate(ids_all + [(None, )]): current = idtuple[0] if prev != current: if i != -1: yield ids_all[i:j] i, prev = j, current for ids in gen_chain_ranges(): seq = "".join(idtuple[-1] for idtuple in ids) agg = aggrescan1d_scores(seq) table = agg.pop("table") assert len(ids) == len(table[key]) data.update(zip(ids, table[key])) if not int(quiet): print('/{0}/{1}/{2}'.format(*ids[0][0])) for akey, values in table.items(): print(f'Max {akey:7s}: {max(values):7.3f}') for akey, value in agg.items(): if isinstance(value, float): print(f"{akey:7s}: {value:7.3f}") else: print(f"{akey:7s}: {value:7}") _self.alter(selection, f"{var} = data.get(({ids_key}), 0)", space={"data": data}) _self.spectrum(var, palette, selection, minimum=minimum, maximum=maximum, quiet=quiet)
[docs] @cmd.extend def aggrescan3d(selection: str = "polymer", palette: str = "white forest", minimum: float = 0, maximum: float = 2, *, ph: float = 7.0, missing: float = 0.0, color_missing: str = "yellow", state: int = -1, var: str = 'b', quiet: int = 1, exe: str = "aggrescan", _self=cmd): """ DESCRIPTION Aggrescan3d (structure based). """ quiet = int(quiet) import os, tempfile, shutil, subprocess, csv from psico.querying import iterate_to_sele ph_arg = ["--ph", str(ph)] # sanity check for chain in _self.get_chains(selection): if len(chain) > 1: raise CmdException(f"Chain ID longer than 1: {chain!r}") # temporary directory tempdir = tempfile.mkdtemp() if not quiet: print(' Tempdir:', tempdir) # filenames pdbfile = os.path.join(tempdir, 'my-input.pdb') try: _self.save(pdbfile, selection, state=state) subprocess.check_call( [exe, "--protein", pdbfile, "--work_dir", tempdir] + ph_arg) with open(f"{tempdir}/A3D.csv") as handle: scores = {(row['chain'], str(row['residue'])): float(row["score"]) for row in csv.DictReader(handle)} finally: shutil.rmtree(tempdir) if not quiet: sum_scores = sum(scores.values()) sum_scores_pos = sum(s for s in scores.values() if s > 0) print(" Aggrescan3D" f" sum(all): {sum_scores:.2f}" f" sum(positive): {sum_scores_pos:.2f}") _self.alter(selection, f"{var} = scores.get((chain, resi), {missing!r})", space={"scores": scores}) _self.spectrum(var, palette, selection, minimum=minimum, maximum=maximum) sele_missing = iterate_to_sele(selection, "(chain, resi) not in scores", space={"scores": scores}) if sele_missing: _self.color(color_missing, sele_missing)
# tab-completion of arguments cmd.auto_arg[0].update([ ('aggrescan1d', cmd.auto_arg[1]['select']), ('aggrescan3d', cmd.auto_arg[1]['select']), ]) # vi:expandtab:smarttab