DTI 1c: Processed Drug System Tables

Enriching intermediate drug system tables with Polars.

Published

May 17, 2024

This release of the drug-target interaction series will focus on building the processed drug system tables – Ligand_Molecule, Ligand_Atom, and Ligand_Bond – from the interim variants and the Ligand_Ring table. These tables were generated in a prior step by looping through the molecule supply SDF file.

Target schemas

Molecule

Atom

Bond

Sourcing

import polars as pl

interim_molecule = pl.read_parquet(interim_drug_system_root_path / "mol/*")
ring = pl.read_parquet(interim_drug_system_root_path / "ring/*")
interim_atom = pl.read_parquet(interim_drug_system_root_path / "atom/*")
interim_bond = pl.read_parquet(interim_drug_system_root_path / "bond/*")

Molecule

The interim Molecule table lacks the mean_atomic_weight and bonds_per_atom features.

`mean_atomic_weight`

# Note: `weight` in the _Atom_ table is the average atomic weight of the given element.
# It _doesn't_ take isotopes into account.
# `weight` in the _Molecule_ table _does_ take isotopes into account.
mean_atom_weight_lookup = (
    interim_atom.group_by("molecule_id")
    .agg(pl.mean("weight").alias("mean_atomic_weight"))
    .sort("molecule_id")
)

`bonds_per_atom`

bonds_per_atom_lookup = (
    interim_bond.melt(
        id_vars=["molecule_id"], value_vars=["atom1_index", "atom2_index"], value_name="atom_index"
    )
    .group_by(["molecule_id", "atom_index"])
    .len()
    .group_by("molecule_id")
    .agg(pl.mean("len").alias("bonds_per_atom"))
    .sort("molecule_id")
)

Join

molecule = interim_molecule.join(
    mean_atom_weight_lookup.rename({"molecule_id": "id"}), on="id", how="left"
).join(bonds_per_atom_lookup.rename({"molecule_id": "id"}), on="id", how="left")

molecule

shape: (1_277_006, 9)

id	supplier_index	name	smiles	n_atoms	n_bonds	weight	mean_atomic_weight	bonds_per_atom
i64	i64	str	str	i64	i64	f64	f64	f64
608734	0	"6-[(4R,5S,6S,7R)-4,7-dibenzyl-…	"O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)…	40	42	554.299202	12.8087	2.1
22	1	"(4R,5S,6S,7R)-4,7-dibenzyl-5,6…	"O=C1N(C/C=C/c2cn[nH]c2)[C@H](C…	40	44	538.269239	12.6095	2.2
23	2	"(4R,5S,6S,7R)-4,7-dibenzyl-1-(…	"O=C1N(C/C=C/c2cn[nH]c2)[C@H](C…	36	40	486.263091	12.565111	2.222222
24	3	"(4R,5S,6S,7R)-4,7-dibenzyl-1-(…	"O=C1N(CCCCCCO)[C@H](Cc2ccccc2)…	35	38	480.298808	12.580829	2.171429
25	4	"(4R,5S,6S,7R)-4,7-dibenzyl-1-(…	"O=C1N(CCCCCO)[C@H](Cc2ccccc2)[…	34	37	466.283158	12.597588	2.176471
…	…	…	…	…	…	…	…	…
492310	999964	"US10975068, Example 451"	"O=C(N[C@H]1CCC[C@@H]1O)c1nc(C(…	41	46	599.198952	13.887463	2.243902
492311	999965	"US10975068, Comparator Example…	"C[C@H]1CCCN1C(=O)c1nc(C(=O)NCC…	36	39	510.24131	13.233111	2.166667
492313	999968	"US10975068, Comparator Example…	"CC(C)S(=O)(=O)Nc1cc(C(F)(F)F)c…	39	41	595.154624	14.523308	2.102564
492332	999988	"7-Methoxy-3-methyl-1-(3- methy…	"COc1cc2ncc3c(c2cc1-c1c[nH]nc1C…	30	34	400.164774	12.676067	4.533333
492336	999992	"7-Methoxy-3-methyl-1,8-bis- (1…	"COc1cc2ncc3c(c2cc1-c1cn(C)nc1C…	30	34	403.175673	12.7426	4.533333

Write

molecule.write_parquet(data_path / "processed/molecule.parquet")

Atom

The Atom table lacks the ring_size_[]_count fields.

`ring_size_[]_count`

ring_size_range = (3, 8)
ring_size_count_columns = [
    f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)
]

ring_size_count_lookup = (
    ring.explode("atom_indices")
    .rename({"atom_indices": "atom_index"})
    .group_by(["molecule_id", "atom_index", "size"])
    .len()
    .filter((pl.col("size") >= ring_size_range[0]) & (pl.col("size") <= ring_size_range[1]))
    .pivot(index=["molecule_id", "atom_index"], columns="size", values="len")
    # insert ring sizes in specified range _if_ not already present
    .pipe(
        lambda df: df.select(
            pl.all(),
            *[
                pl.lit(None).alias(str(i))
                for i in range(ring_size_range[0], ring_size_range[1] + 1)
                if str(i) not in df
            ],
        )
    )
    .rename(
        {str(i): f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)}
    )
    .fill_null(0)
    .select("molecule_id", pl.col("atom_index").alias("index"), *ring_size_count_columns)
    .sort("molecule_id", "index")
)

Join

atom = interim_atom.join(ring_size_count_lookup, on=["molecule_id", "index"], how="left").select(
    *[pl.col(col) for col in interim_atom.columns[:4]],
    *ring_size_count_columns,
    *[pl.col(col) for col in interim_atom.columns[4:]]
)

atom

shape: (41_723_806, 18)

molecule_id	index	symbol	weight	ring_size_3_count	ring_size_4_count	ring_size_5_count	ring_size_6_count	ring_size_7_count	ring_size_8_count	chirality	hybridization	acceptor	donor	aromatic	x	y	z
i64	i64	str	f64	i64	i64	i64	i64	i64	i64	str	str	bool	bool	bool	f64	f64	f64
608734	0	"O"	15.999	null	null	null	null	null	null	null	"SP2"	true	false	false	-1.011	3.174	-6.577
608734	1	"C"	12.011	null	null	null	null	null	null	null	"SP2"	false	false	false	-2.049	3.469	-6.009
608734	2	"O"	15.999	null	null	null	null	null	null	null	"SP2"	false	true	false	-2.863	4.337	-6.577
608734	3	"C"	12.011	null	null	null	null	null	null	null	"SP3"	false	false	false	-2.393	2.867	-4.752
608734	4	"C"	12.011	null	null	null	null	null	null	null	"SP3"	false	false	false	-2.502	3.929	-3.645
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
492336	25	"N"	14.007	0	0	2	0	0	0	null	"SP2"	true	false	true	1.47	-5.32	0.007
492336	26	"N"	14.007	0	0	2	0	0	0	null	"SP2"	true	false	true	1.579	-4.954	-1.234
492336	27	"C"	12.011	0	0	2	0	0	0	null	"SP2"	false	false	true	0.386	-4.576	-1.722
492336	28	"C"	12.011	0	0	2	0	0	0	null	"SP2"	false	false	true	-0.54	-4.712	-0.701
492336	29	"C"	12.011	null	null	null	null	null	null	null	"SP3"	false	false	false	2.805	-4.955	-1.945

Write

atom.write_parquet(data_path / "processed/atom.parquet")

Bond

The Bond table lacks the same_ring boolean feature.

`same_ring`

In PySpark, this problem could be solved with a conditional join:

same_ring_bonds = (
    interim_bond
    .select("molecule_id", "index", "atom1_index", "atom2_index")
    .join(
        ring.WithColumnsRenamed({"atom_indices": "ring_atom_indices"}).select("ring_atom_indices"),
        on=[
            psf.col("molecule_id") == psf.col("molecule_id"), 
            psf.col("atom1_index").isin(psf.col("ring_atom_indices"), 
            psf.col("atom2_index").isin(psf.col("ring_atom_indices")),
        ],
        how="inner"
    )
    .select("molecule_id", "index")
)

Polars doesn’t allow complex join logic like this so the operation must be split up into multiple parts.

bond_atom1_rings = interim_bond.select("molecule_id", "index", "atom1_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    .explode("atom_indices")
    .rename({"atom_indices": "atom1_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom1_index"),
    on=["molecule_id", "atom1_index"],
    how="inner",
).unique()

bond_atom2_rings = interim_bond.select("molecule_id", "index", "atom2_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    .explode("atom_indices")
    .rename({"atom_indices": "atom2_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom2_index"),
    on=["molecule_id", "atom2_index"],
    how="inner",
).unique()

same_ring_lookup = bond_atom1_rings.join(
    bond_atom2_rings, on=["molecule_id", "index", "ring_index"], how="inner"
).select("molecule_id", "index", pl.lit(True).alias("same_ring")).unique()

bond = (
    interim_bond
    .join(
        same_ring_lookup,
        on=["molecule_id", "index"],
        how="left"
    )
    .with_columns(
        pl.col("same_ring").fill_null(False)
    )
)

bond

shape: (45_597_555, 7)

molecule_id	index	atom1_index	atom2_index	type	stereochemistry	same_ring
i64	i64	i64	i64	str	str	bool
608734	0	0	1	"double"	null	false
608734	1	1	2	"single"	null	false
608734	2	1	3	"single"	null	false
608734	3	3	4	"single"	null	false
608734	4	4	5	"single"	null	false
…	…	…	…	…	…	…
492336	29	24	28	"aromatic"	null	true
492336	30	25	26	"aromatic"	null	true
492336	31	26	27	"aromatic"	null	true
492336	32	26	29	"single"	null	false
492336	33	27	28	"aromatic"	null	true

Write

bond.write_parquet(data_path / "processed/bond.parquet")