In the previous article, we explored applications of reinforcement learning to robot control. This article takes a close look at graph generative models and their use in molecular design, an interdisciplinary field that combines deep learning with chemistry. We will use PyTorch Geometric to implement a molecular generative model based on graph neural networks and run molecule-generation experiments on the ZINC250k dataset.
I. Graph Generative Model Basics
1. Molecular Representations
| Representation | Pros | Cons |
| --- | --- | --- |
| SMILES string | Compact and easy to store | Strict syntax constraints |
| Molecular graph | Preserves structural information | Requires specialized processing |
| 3D conformation | Captures spatial information | High computational cost |
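To make the graph view concrete, here is a minimal sketch (using RDKit, which we also rely on later) that parses a SMILES string and walks the atoms and bonds of the resulting molecular graph:

```python
from rdkit import Chem

# Parse ethanol ("CCO") and enumerate its graph structure
mol = Chem.MolFromSmiles("CCO")
for atom in mol.GetAtoms():
    print(atom.GetIdx(), atom.GetSymbol(), atom.GetDegree())
for bond in mol.GetBonds():
    print(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType())
```

For ethanol this prints two carbons and one oxygen, connected by two single bonds, which is exactly the node/edge structure a graph model consumes.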
2. Categories of Graph Generative Models
```python
class GraphGenModels:
    """Representative models grouped by generation strategy."""
    def __init__(self):
        self.autoregressive = ["GraphRNN", "MolGPT"]  # build the graph step by step
        self.one_shot = ["VAE", "GAN"]                # emit the whole graph at once
        self.flow_based = ["GraphNVP", "MoFlow"]      # invertible transformations
```
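The practical difference between these families is most visible in the shape of their sampling loops. The sketch below is purely illustrative, with hypothetical `next_action` and `decode` interfaces rather than the APIs of the models listed above:

```python
import torch

def autoregressive_generate(model, max_steps):
    # Build the graph one action (a node or an edge) at a time
    graph = []
    for _ in range(max_steps):
        action = model.next_action(graph)  # hypothetical interface
        if action is None:                 # stop token
            break
        graph.append(action)
    return graph

def one_shot_generate(model, latent_dim):
    # Sample a latent vector and decode the entire graph in a single pass
    z = torch.randn(1, latent_dim)
    return model.decode(z)                 # hypothetical interface
```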
3. Evaluation Metrics for Molecule Generation
```python
from rdkit import Chem

def calculate_metrics(generated_mols, train_smiles):
    """Validity, uniqueness, and novelty of a batch of generated molecules.

    `generated_mols` is a list of RDKit Mol objects (None for invalid ones);
    `train_smiles` is the set of canonical SMILES seen during training.
    """
    n = len(generated_mols)
    validity = len([m for m in generated_mols if m is not None]) / n
    uniqueness = len({Chem.MolToSmiles(m) for m in generated_mols if m}) / n
    novelty = len([m for m in generated_mols
                   if m and Chem.MolToSmiles(m) not in train_smiles]) / n
    return {"validity": validity, "uniqueness": uniqueness, "novelty": novelty}
```
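A quick illustration of how the metric function behaves, assuming the signature above (we pass the training SMILES explicitly rather than reading a global variable):

```python
from rdkit import Chem

# Two copies of a valid molecule plus one invalid string
generated = [Chem.MolFromSmiles(s) for s in ["CCO", "CCO", "not_a_smiles"]]
train_smiles = {"CCO"}  # pretend the training set contains ethanol

print(calculate_metrics(generated, train_smiles))
# -> {'validity': 0.667, 'uniqueness': 0.333, 'novelty': 0.0} (approximately)
```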
II. Molecular Design with a Graph Variational Autoencoder
1. Environment Setup
```bash
pip install torch torch-geometric rdkit-pypi networkx matplotlib
```
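Note that PyTorch Geometric wheels must match your installed PyTorch and CUDA versions, and on recent setups the RDKit package may be published as `rdkit` rather than `rdkit-pypi`. A quick sanity check after installation:

```python
import torch
import torch_geometric
from rdkit import Chem

print(torch.__version__, torch_geometric.__version__)
print(Chem.MolToSmiles(Chem.MolFromSmiles("CCO")))  # should print "CCO"
```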
2. Molecular Graph Data Processing
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_add_pool
from torch_geometric.loader import DataLoader
from rdkit import Chem

# 1. Dataset definition
# A plain torch Dataset is sufficient here: PyG's DataLoader knows how to
# collate the Data objects it returns into batched graphs.
class MoleculeDataset(Dataset):
    def __init__(self, smiles_list):
        super().__init__()
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES at index {idx}: {smiles}")

        # Atom features: atomic number, degree, formal charge, aromaticity
        atom_features = []
        for atom in mol.GetAtoms():
            feature = [
                float(atom.GetAtomicNum()),
                float(atom.GetDegree()),
                float(atom.GetFormalCharge()),
                float(atom.GetIsAromatic()),
            ]
            atom_features.append(feature)

        # Edge index: store each bond in both directions (undirected graph)
        edge_index = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_index.append([i, j])
            edge_index.append([j, i])

        # Convert to the PyG data format
        x = torch.tensor(atom_features, dtype=torch.float)
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

        return Data(x=x, edge_index=edge_index, smiles=smiles)
```
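As a quick check, converting a single molecule yields the expected tensor shapes (acetic acid has 4 heavy atoms and 3 bonds):

```python
dataset = MoleculeDataset(["CC(=O)O"])   # acetic acid
data = dataset[0]
print(data.x.shape)           # torch.Size([4, 4]): 4 atoms x 4 features
print(data.edge_index.shape)  # torch.Size([2, 6]): 3 bonds x 2 directions
```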
```python
# 2. Normalized dataset
class NormalizedMoleculeDataset(MoleculeDataset):
    def __init__(self, smiles_list):
        super().__init__(smiles_list)
        self.feat_means = None
        self.feat_stds = None

    def _init_stats(self):
        # Lazily estimate per-feature statistics from up to 100 molecules
        if self.feat_means is None:
            temp_data = [MoleculeDataset.__getitem__(self, i)
                         for i in range(min(100, len(self)))]
            all_features = torch.cat([d.x for d in temp_data], dim=0)
            self.feat_means = torch.mean(all_features, dim=0, keepdim=True)
            self.feat_stds = torch.std(all_features, dim=0, keepdim=True) + 1e-6

    def __getitem__(self, idx):
        self._init_stats()
        data = super().__getitem__(idx)
        data.x = (data.x - self.feat_means) / self.feat_stds
        return data
```
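One way to verify the normalization (illustrative; note that features that are constant across the sampled molecules, such as formal charge here, simply collapse to zero rather than unit variance):

```python
ds = NormalizedMoleculeDataset(["CCO", "CCN", "CC(=O)O"])
xs = torch.cat([ds[i].x for i in range(len(ds))], dim=0)
print(xs.mean(dim=0))  # close to 0 for every feature
print(xs.std(dim=0))   # close to 1 for non-constant features
```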
3. Implementing the Graph Variational Autoencoder
```python
# 3. GVAE model
class StableGVAE(nn.Module):
    def __init__(self, node_dim, latent_dim):
        super().__init__()
        self.node_dim = node_dim
        self.latent_dim = latent_dim
        self.training_step = 0  # incremented by the training loop for KL annealing

        # Encoder: three GCN layers with LayerNorm
        self.conv1 = GCNConv(node_dim, 128)
        self.norm1 = nn.LayerNorm(128)
        self.conv2 = GCNConv(128, 256)
        self.norm2 = nn.LayerNorm(256)
        self.conv3 = GCNConv(256, 512)
        self.norm3 = nn.LayerNorm(512)

        # Latent space heads
        self.mean = nn.Linear(512, latent_dim)
        self.logstd = nn.Linear(512, latent_dim)
        self.logstd_max = 2  # clamp log-std to keep the variance bounded

        # Decoder: maps a graph-level latent back to per-node features
        self.node_decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Linear(128, node_dim),
        )

        # Weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def encode(self, x, edge_index, batch):
        x = F.relu(self.norm1(self.conv1(x, edge_index)))
        x = F.relu(self.norm2(self.conv2(x, edge_index)))
        x = F.relu(self.norm3(self.conv3(x, edge_index)))
        x = global_add_pool(x, batch)  # graph-level readout
        mean = self.mean(x)
        logstd = torch.clamp(self.logstd(x), max=self.logstd_max)
        return mean, logstd

    def reparameterize(self, mean, logstd):
        std = torch.exp(logstd)
        eps = torch.randn_like(std)
        return mean + eps * std

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Encode
        mean, logstd = self.encode(x, edge_index, batch)
        z = self.reparameterize(mean, logstd)

        # Decode: broadcast each graph's latent vector to all of its nodes
        z_nodes = z[batch]
        recon = self.node_decoder(z_nodes)

        # Reconstruction loss plus KL divergence to N(0, I).
        # Since logstd is log(sigma), the variance is exp(2 * logstd).
        recon_loss = F.mse_loss(recon, x)
        kl_loss = -0.5 * torch.mean(
            1 + 2 * logstd - mean.pow(2) - (2 * logstd).exp())

        # KL annealing: the weight grows linearly with training steps,
        # reaching its cap of 0.1 after 1,000 steps
        kl_weight = min(0.1, 0.01 * (self.training_step / 100))
        total_loss = recon_loss + kl_weight * kl_loss

        return total_loss, mean, logstd
```
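Before full training, a forward-pass smoke test on a single molecule helps confirm the tensor plumbing; this assumes the dataset and model classes above are defined in the same session:

```python
ds = MoleculeDataset(["CCO"])
batch = next(iter(DataLoader(ds, batch_size=1)))

model = StableGVAE(node_dim=4, latent_dim=32)
loss, mean, logstd = model(batch)
print(loss.item())   # a finite scalar loss
print(mean.shape)    # torch.Size([1, 32]): one latent vector per graph
```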
4. Training and Molecule Generation
```python
# 4. Training loop
def train(model, loader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            batch = batch.to(device)  # uses the global `device` set below
            optimizer.zero_grad()

            loss, mean, logstd = model(batch)

            loss.backward()
            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            model.training_step += 1  # drives the KL annealing schedule

        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(loader):.4f}')
        print(f'  Mean |z|: {torch.mean(torch.abs(mean)).item():.4f}')
        print(f'  Std exp(logstd): {torch.mean(torch.exp(logstd)).item():.4f}')

# 5. Main program
if __name__ == "__main__":
    # Example data (a handful of small molecules, including caffeine)
    smiles_list = ["CCO", "CCN", "CC(=O)O", "c1ccccc1", "C1CCCCC1",
                   "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"]

    # Build the dataset and loader
    dataset = NormalizedMoleculeDataset(smiles_list)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Device setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize the model and optimizer
    model = StableGVAE(node_dim=4, latent_dim=32).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Start training
    train(model, loader, optimizer, epochs=20)
```
The output is:
```
Using device: cuda
Epoch 1, Loss: 1.8490
  Mean |z|: 4.5453
  Std exp(logstd): 3.1631
Epoch 2, Loss: 1.4654
  Mean |z|: 4.6283
  Std exp(logstd): 3.1690
Epoch 3, Loss: 1.3454
  Mean |z|: 4.7163
  Std exp(logstd): 3.1893
Epoch 4, Loss: 1.1472
  Mean |z|: 4.8204
  Std exp(logstd): 3.1949
Epoch 5, Loss: 1.2276
  Mean |z|: 4.9333
  Std exp(logstd): 3.1850
Epoch 6, Loss: 1.2820
  Mean |z|: 5.0495
  Std exp(logstd): 3.1673
Epoch 7, Loss: 1.0549
  Mean |z|: 5.1596
  Std exp(logstd): 3.1532
Epoch 8, Loss: 1.1106
  Mean |z|: 5.2716
  Std exp(logstd): 3.1439
Epoch 9, Loss: 0.9983
  Mean |z|: 5.3749
  Std exp(logstd): 3.1421
Epoch 10, Loss: 0.8377
  Mean |z|: 5.4713
  Std exp(logstd): 3.1421
Epoch 11, Loss: 0.9254
  Mean |z|: 5.5570
  Std exp(logstd): 3.1369
Epoch 12, Loss: 0.8516
  Mean |z|: 5.6426
  Std exp(logstd): 3.1326
Epoch 13, Loss: 0.8242
  Mean |z|: 5.7219
  Std exp(logstd): 3.1306
Epoch 14, Loss: 0.8309
  Mean |z|: 5.7898
  Std exp(logstd): 3.1227
Epoch 15, Loss: 0.8721
  Mean |z|: 5.8507
  Std exp(logstd): 3.1251
Epoch 16, Loss: 0.7623
  Mean |z|: 5.9046
  Std exp(logstd): 3.1181
Epoch 17, Loss: 0.9805
  Mean |z|: 5.9619
  Std exp(logstd): 3.1088
Epoch 18, Loss: 0.6974
  Mean |z|: 6.0277
  Std exp(logstd): 3.0938
Epoch 19, Loss: 0.7069
  Mean |z|: 6.0858
  Std exp(logstd): 3.0828
Epoch 20, Loss: 0.7618
  Mean |z|: 6.1408
  Std exp(logstd): 3.0777
```
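The decoder above only reconstructs node features, so it cannot yet emit a complete molecule: that would additionally require an edge/bond decoder plus valency checks. Still, the sampling half of generation can be sketched. The function below is a hypothetical illustration that draws latent vectors from the prior and decodes node features for an assumed fixed atom count:

```python
@torch.no_grad()
def sample_node_features(model, num_samples=5, num_atoms=6):
    # Hypothetical sketch: sample z ~ N(0, I) and decode per-atom features.
    # A real generator would also decode bonds and enforce chemical validity.
    model.eval()
    device = next(model.parameters()).device
    z = torch.randn(num_samples, model.latent_dim, device=device)
    z_nodes = z.repeat_interleave(num_atoms, dim=0)  # one copy of z per atom
    node_feats = model.node_decoder(z_nodes)
    return node_feats.view(num_samples, num_atoms, model.node_dim)

feats = sample_node_features(model)
print(feats.shape)  # torch.Size([5, 6, 4])
```

Turning such samples into valid molecules is exactly where the autoregressive and flow-based models from Part I earn their extra complexity.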
III. Summary and Outlook
This article implemented a molecule generation system based on a graph variational autoencoder. The main technical highlights include:

- A complete molecular graph processing pipeline, from SMILES strings to the PyG data format
- A graph generative architecture that combines the strengths of GNNs and VAEs
In the next article, we will turn to mixed-precision training and gradient scaling, and show how to use FP16 to accelerate training while keeping the model numerically stable.