DTI 1c: Processed Drug System Tables
Enriching intermediate drug system tables with Polars.
This release of the drug-target interaction series will focus on building the processed drug system tables – Ligand_Molecule, Ligand_Atom, and Ligand_Bond – from the interim variants and the Ligand_Ring table. These tables were generated in a prior step by looping through the molecule supply SDF file.
Target schemas
Molecule
Atom
Bond
Sourcing
import polars as pl
# Load the four interim tables in one pass. Each pattern ends in `*`, so every
# file found under that table's directory is read.
interim_molecule, ring, interim_atom, interim_bond = (
    pl.read_parquet(interim_drug_system_root_path / pattern)
    for pattern in ("mol/*", "ring/*", "atom/*", "bond/*")
)
Molecule
The interim Molecule table lacks the mean_atomic_weight and bonds_per_atom features.
mean_atomic_weight
# NOTE: the per-atom `weight` in the _Atom_ table is the average atomic weight
# of the element and ignores isotopes, whereas `weight` in the _Molecule_
# table is isotope-aware.
mean_atomic_weight_expr = pl.col("weight").mean().alias("mean_atomic_weight")

# One row per molecule: the mean of its atoms' weights.
mean_atom_weight_lookup = (
    interim_atom.group_by("molecule_id").agg(mean_atomic_weight_expr).sort("molecule_id")
)
bonds_per_atom
# One row per bond endpoint: each bond contributes its first and its second atom.
bond_endpoints = interim_bond.melt(
    id_vars=["molecule_id"],
    value_vars=["atom1_index", "atom2_index"],
    value_name="atom_index",
)

# Number of bond rows that touch each atom of each molecule (column `len`).
bonds_per_atom_index = bond_endpoints.group_by(["molecule_id", "atom_index"]).len()

# Average those per-atom counts within each molecule.
# NOTE(review): atoms that appear in no bond row never enter this mean, and
# repeated bond rows for a molecule_id are counted repeatedly. The sample
# output lists 4.533333 for a 30-atom / 34-bond molecule, i.e. twice
# 2 * 34 / 30 -- confirm whether the interim bond table holds duplicates.
bonds_per_atom_lookup = (
    bonds_per_atom_index.group_by("molecule_id")
    .agg(pl.col("len").mean().alias("bonds_per_atom"))
    .sort("molecule_id")
)
Join
# The lookups are keyed by `molecule_id`; the molecule table names that key `id`.
mean_atom_weight_by_id = mean_atom_weight_lookup.rename({"molecule_id": "id"})
bonds_per_atom_by_id = bonds_per_atom_lookup.rename({"molecule_id": "id"})

# Left joins keep every interim molecule, even one without a lookup row.
molecule = (
    interim_molecule
    .join(mean_atom_weight_by_id, on="id", how="left")
    .join(bonds_per_atom_by_id, on="id", how="left")
)
molecule
shape: (1_277_006, 9)
| id | supplier_index | name | smiles | n_atoms | n_bonds | weight | mean_atomic_weight | bonds_per_atom |
|---|---|---|---|---|---|---|---|---|
| i64 | i64 | str | str | i64 | i64 | f64 | f64 | f64 |
| 608734 | 0 | "6-[(4R,5S,6S,7R)-4,7-dibenzyl-… | "O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)… | 40 | 42 | 554.299202 | 12.8087 | 2.1 |
| 22 | 1 | "(4R,5S,6S,7R)-4,7-dibenzyl-5,6… | "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… | 40 | 44 | 538.269239 | 12.6095 | 2.2 |
| 23 | 2 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… | 36 | 40 | 486.263091 | 12.565111 | 2.222222 |
| 24 | 3 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(CCCCCCO)[C@H](Cc2ccccc2)… | 35 | 38 | 480.298808 | 12.580829 | 2.171429 |
| 25 | 4 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(CCCCCO)[C@H](Cc2ccccc2)[… | 34 | 37 | 466.283158 | 12.597588 | 2.176471 |
| … | … | … | … | … | … | … | … | … |
| 492310 | 999964 | "US10975068, Example 451" | "O=C(N[C@H]1CCC[C@@H]1O)c1nc(C(… | 41 | 46 | 599.198952 | 13.887463 | 2.243902 |
| 492311 | 999965 | "US10975068, Comparator Example… | "C[C@H]1CCCN1C(=O)c1nc(C(=O)NCC… | 36 | 39 | 510.24131 | 13.233111 | 2.166667 |
| 492313 | 999968 | "US10975068, Comparator Example… | "CC(C)S(=O)(=O)Nc1cc(C(F)(F)F)c… | 39 | 41 | 595.154624 | 14.523308 | 2.102564 |
| 492332 | 999988 | "7-Methoxy-3-methyl-1-(3- methy… | "COc1cc2ncc3c(c2cc1-c1c[nH]nc1C… | 30 | 34 | 400.164774 | 12.676067 | 4.533333 |
| 492336 | 999992 | "7-Methoxy-3-methyl-1,8-bis- (1… | "COc1cc2ncc3c(c2cc1-c1cn(C)nc1C… | 30 | 34 | 403.175673 | 12.7426 | 4.533333 |
Write
# Persist the enriched molecule table.
processed_molecule_path = data_path / "processed/molecule.parquet"
molecule.write_parquet(processed_molecule_path)
Atom
The Atom table lacks the ring_size_[]_count fields: one column per ring size from 3 to 8, each counting the rings of that size that contain the atom.
ring_size_[]_count
# Inclusive range of ring sizes that get their own count column.
ring_size_range = (3, 8)
min_ring_size, max_ring_size = ring_size_range
ring_sizes = range(min_ring_size, max_ring_size + 1)
ring_size_count_columns = [f"ring_size_{size}_count" for size in ring_sizes]


def add_missing_ring_size_columns(counts):
    """Append an all-null column for each ring size the pivot did not produce.

    The pivot only creates a column for sizes that occur in the data, so sizes
    in the configured range that never occur are added here.
    """
    absent = [str(size) for size in ring_sizes if str(size) not in counts]
    return counts.select(pl.all(), *[pl.lit(None).alias(name) for name in absent])


ring_size_count_lookup = (
    # one row per (ring, member atom)
    ring.explode("atom_indices")
    .rename({"atom_indices": "atom_index"})
    # number of rings of each size that contain the atom
    .group_by(["molecule_id", "atom_index", "size"])
    .len()
    .filter(pl.col("size").is_between(min_ring_size, max_ring_size))
    # spread the sizes into columns named "3", "4", ...
    .pivot(index=["molecule_id", "atom_index"], columns="size", values="len")
    .pipe(add_missing_ring_size_columns)
    .rename(dict(zip(map(str, ring_sizes), ring_size_count_columns)))
    # an atom with no ring of a given size gets a count of 0
    .fill_null(0)
    .select("molecule_id", pl.col("atom_index").alias("index"), *ring_size_count_columns)
    .sort("molecule_id", "index")
)
Join
# Attach the ring-size counts to every atom and place them right after the
# first four interim columns.
atom = (
    interim_atom.join(ring_size_count_lookup, on=["molecule_id", "index"], how="left")
    # Atoms that belong to no ring have no row in the lookup, so the left join
    # leaves their counts null; a missing ring membership is a count of 0.
    .with_columns(pl.col(ring_size_count_columns).fill_null(0))
    .select(
        *interim_atom.columns[:4],
        *ring_size_count_columns,
        *interim_atom.columns[4:],
    )
)
atom
shape: (41_723_806, 18)
| molecule_id | index | symbol | weight | ring_size_3_count | ring_size_4_count | ring_size_5_count | ring_size_6_count | ring_size_7_count | ring_size_8_count | chirality | hybridization | acceptor | donor | aromatic | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | str | f64 | i64 | i64 | i64 | i64 | i64 | i64 | str | str | bool | bool | bool | f64 | f64 | f64 |
| 608734 | 0 | "O" | 15.999 | null | null | null | null | null | null | null | "SP2" | true | false | false | -1.011 | 3.174 | -6.577 |
| 608734 | 1 | "C" | 12.011 | null | null | null | null | null | null | null | "SP2" | false | false | false | -2.049 | 3.469 | -6.009 |
| 608734 | 2 | "O" | 15.999 | null | null | null | null | null | null | null | "SP2" | false | true | false | -2.863 | 4.337 | -6.577 |
| 608734 | 3 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | -2.393 | 2.867 | -4.752 |
| 608734 | 4 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | -2.502 | 3.929 | -3.645 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 492336 | 25 | "N" | 14.007 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | true | false | true | 1.47 | -5.32 | 0.007 |
| 492336 | 26 | "N" | 14.007 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | true | false | true | 1.579 | -4.954 | -1.234 |
| 492336 | 27 | "C" | 12.011 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | false | false | true | 0.386 | -4.576 | -1.722 |
| 492336 | 28 | "C" | 12.011 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | false | false | true | -0.54 | -4.712 | -0.701 |
| 492336 | 29 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | 2.805 | -4.955 | -1.945 |
Write
# Persist the enriched atom table.
processed_atom_path = data_path / "processed/atom.parquet"
atom.write_parquet(processed_atom_path)
Bond
The Bond table lacks the same_ring boolean feature, which is true when both atoms of a bond belong to the same ring.
same_ring
In PySpark, this problem could be solved with a conditional join:
same_ring_bonds = (
interim_bond
.select("molecule_id", "index", "atom1_index", "atom2_index")
.join(
ring.WithColumnsRenamed({"atom_indices": "ring_atom_indices"}).select("ring_atom_indices"),
on=[
psf.col("molecule_id") == psf.col("molecule_id"),
psf.col("atom1_index").isin(psf.col("ring_atom_indices"),
psf.col("atom2_index").isin(psf.col("ring_atom_indices")),
],
how="inner"
)
.select("molecule_id", "index")
)Polars doesn’t allow complex join logic like this so the operation must be split up into multiple parts.
def bond_endpoint_rings(endpoint_column):
    """Pair every bond with each ring that contains the given bond endpoint.

    `endpoint_column` is the bond column holding the atom index to match.
    The result has one distinct row per
    (molecule_id, bond index, endpoint atom, ring_index).
    """
    # one row per (ring, member atom), with the atom named like the endpoint
    ring_membership = (
        ring.select("molecule_id", "index", "atom_indices")
        .explode("atom_indices")
        .rename({"atom_indices": endpoint_column, "index": "ring_index"})
        .select("molecule_id", "ring_index", endpoint_column)
    )
    return (
        interim_bond.select("molecule_id", "index", endpoint_column)
        .join(ring_membership, on=["molecule_id", endpoint_column], how="inner")
        .unique()
    )


bond_atom1_rings = bond_endpoint_rings("atom1_index")
bond_atom2_rings = bond_endpoint_rings("atom2_index")
# A bond is "same ring" when some ring contains both of its atoms, i.e. both
# endpoint tables list the bond under the same ring_index.
same_ring_lookup = (
    bond_atom1_rings.join(
        bond_atom2_rings, on=["molecule_id", "index", "ring_index"], how="inner"
    )
    .select("molecule_id", "index")
    .unique()
    .with_columns(pl.lit(True).alias("same_ring"))
)

# Bonds without a lookup row come out of the left join as null: mark them False.
bond = interim_bond.join(
    same_ring_lookup, on=["molecule_id", "index"], how="left"
).with_columns(pl.col("same_ring").fill_null(False))
bond
shape: (45_597_555, 7)
| molecule_id | index | atom1_index | atom2_index | type | stereochemistry | same_ring |
|---|---|---|---|---|---|---|
| i64 | i64 | i64 | i64 | str | str | bool |
| 608734 | 0 | 0 | 1 | "double" | null | false |
| 608734 | 1 | 1 | 2 | "single" | null | false |
| 608734 | 2 | 1 | 3 | "single" | null | false |
| 608734 | 3 | 3 | 4 | "single" | null | false |
| 608734 | 4 | 4 | 5 | "single" | null | false |
| … | … | … | … | … | … | … |
| 492336 | 29 | 24 | 28 | "aromatic" | null | true |
| 492336 | 30 | 25 | 26 | "aromatic" | null | true |
| 492336 | 31 | 26 | 27 | "aromatic" | null | true |
| 492336 | 32 | 26 | 29 | "single" | null | false |
| 492336 | 33 | 27 | 28 | "aromatic" | null | true |
Write
# Persist the enriched bond table.
processed_bond_path = data_path / "processed/bond.parquet"
bond.write_parquet(processed_bond_path)