7. Building and Training Models
7.1. Building Models
There are three ways to build a model:
- subclass the nn.Module base class and build a custom model (the most common)
- use nn.Sequential to stack layers in order (the simplest)
- subclass the nn.Module base class and encapsulate parts of the model with model containers (nn.Sequential, nn.ModuleList, nn.ModuleDict) (the most flexible, but also the most involved)
Once a model is defined, PyTorch assigns initial values to its parameters (w, b).
The PyTorch Hub module can also be used to load network architectures published by others, as sketched below.
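A hedged sketch of torch.hub (the repo tag and entrypoint name below are illustrative, not part of this tutorial's own code):
import torch

# list the entrypoints a hub repo exposes (any repo with a hubconf.py works)
print(torch.hub.list('pytorch/vision:v0.10.0'))

# download (and cache) the architecture together with pretrained weights
resnet18 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
resnet18.eval()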
7.1.1. Subclassing nn.Module to Build a Custom Model
Within the model:
- the layers that will be used are defined in the __init__ method
- the forward-pass logic of the model is defined in the forward method
from torch import nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3)
self.pool1 = nn.MaxPool2d(kernel_size = 2,stride = 2)
self.conv2 = nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5)
self.pool2 = nn.MaxPool2d(kernel_size = 2,stride = 2)
self.dropout = nn.Dropout2d(p = 0.1)
self.adaptive_pool = nn.AdaptiveMaxPool2d((1,1))
self.flatten = nn.Flatten()
self.linear1 = nn.Linear(64,32)
self.relu = nn.ReLU()
self.linear2 = nn.Linear(32,1)
def forward(self,x):
x = self.conv1(x)
x = self.pool1(x)
x = self.conv2(x)
x = self.pool2(x)
x = self.dropout(x)
x = self.adaptive_pool(x)
x = self.flatten(x)
x = self.linear1(x)
x = self.relu(x)
y = self.linear2(x)
return y
net = Net()
print(net)
'''
Net(
(conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
(pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
(pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(dropout): Dropout2d(p=0.1, inplace=False)
(adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear1): Linear(in_features=64, out_features=32, bias=True)
(relu): ReLU()
(linear2): Linear(in_features=32, out_features=1, bias=True)
)
'''
from torchkeras import summary
summary(net,input_shape= (3,32,32));
'''
--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
==========================================================================
Conv2d-1                            [-1, 32, 30, 30]                  896
MaxPool2d-2                         [-1, 32, 15, 15]                    0
Conv2d-3                            [-1, 64, 11, 11]               51,264
MaxPool2d-4                           [-1, 64, 5, 5]                    0
Dropout2d-5                           [-1, 64, 5, 5]                    0
AdaptiveMaxPool2d-6                   [-1, 64, 1, 1]                    0
Flatten-7                                   [-1, 64]                    0
Linear-8                                    [-1, 32]                2,080
ReLU-9                                      [-1, 32]                    0
Linear-10                                    [-1, 1]                   33
==========================================================================
Total params: 54,273
Trainable params: 54,273
Non-trainable params: 0
--------------------------------------------------------------------------
Input size (MB): 0.011719
Forward/backward pass size (MB): 0.359627
Params size (MB): 0.207035
Estimated Total Size (MB): 0.578381
--------------------------------------------------------------------------
'''
7.1.2. Using nn.Sequential to Build a Model Layer by Layer
Building a model with nn.Sequential requires no forward method, but it only suits simple models.
Below are several equivalent ways to build the same model with nn.Sequential.
Using the add_module method
net = nn.Sequential()
net.add_module("conv1",nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3))
net.add_module("pool1",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("conv2",nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5))
net.add_module("pool2",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("dropout",nn.Dropout2d(p = 0.1))
net.add_module("adaptive_pool",nn.AdaptiveMaxPool2d((1,1)))
net.add_module("flatten",nn.Flatten())
net.add_module("linear1",nn.Linear(64,32))
net.add_module("relu",nn.ReLU())
net.add_module("linear2",nn.Linear(32,1))
print(net)
'''
Sequential(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout2d(p=0.1, inplace=False)
  (adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=64, out_features=32, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=32, out_features=1, bias=True)
)
'''
Using variadic arguments
Built this way, the layers cannot be given individual names.
net = nn.Sequential(
    nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3),
    nn.MaxPool2d(kernel_size = 2,stride = 2),
    nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
    nn.MaxPool2d(kernel_size = 2,stride = 2),
    nn.Dropout2d(p = 0.1),
    nn.AdaptiveMaxPool2d((1,1)),
    nn.Flatten(),
    nn.Linear(64,32),
    nn.ReLU(),
    nn.Linear(32,1)
)
print(net)
'''
Sequential(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Dropout2d(p=0.1, inplace=False)
  (5): AdaptiveMaxPool2d(output_size=(1, 1))
  (6): Flatten(start_dim=1, end_dim=-1)
  (7): Linear(in_features=64, out_features=32, bias=True)
  (8): ReLU()
  (9): Linear(in_features=32, out_features=1, bias=True)
)
'''
Using an OrderedDict
from collections import OrderedDict

net = nn.Sequential(OrderedDict(
    [("conv1",nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3)),
     ("pool1",nn.MaxPool2d(kernel_size = 2,stride = 2)),
     ("conv2",nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5)),
     ("pool2",nn.MaxPool2d(kernel_size = 2,stride = 2)),
     ("dropout",nn.Dropout2d(p = 0.1)),
     ("adaptive_pool",nn.AdaptiveMaxPool2d((1,1))),
     ("flatten",nn.Flatten()),
     ("linear1",nn.Linear(64,32)),
     ("relu",nn.ReLU()),
     ("linear2",nn.Linear(32,1))
    ])
)
print(net)
'''
Sequential(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout2d(p=0.1, inplace=False)
  (adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=64, out_features=32, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=32, out_features=1, bias=True)
)
'''
from torchkeras import summary

summary(net,input_shape= (3,32,32));
'''
--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
==========================================================================
Conv2d-1                            [-1, 32, 30, 30]                  896
MaxPool2d-2                         [-1, 32, 15, 15]                    0
Conv2d-3                            [-1, 64, 11, 11]               51,264
MaxPool2d-4                           [-1, 64, 5, 5]                    0
Dropout2d-5                           [-1, 64, 5, 5]                    0
AdaptiveMaxPool2d-6                   [-1, 64, 1, 1]                    0
Flatten-7                                   [-1, 64]                    0
Linear-8                                    [-1, 32]                2,080
ReLU-9                                      [-1, 32]                    0
Linear-10                                    [-1, 1]                   33
==========================================================================
Total params: 54,273
Trainable params: 54,273
Non-trainable params: 0
--------------------------------------------------------------------------
Input size (MB): 0.011719
Forward/backward pass size (MB): 0.359627
Params size (MB): 0.207035
Estimated Total Size (MB): 0.578381
--------------------------------------------------------------------------
'''
7.1.3. Subclassing nn.Module and Encapsulating with Model Containers
When a model's structure is fairly complex, we can use model containers (nn.Sequential, nn.ModuleList, nn.ModuleDict) to encapsulate parts of it.
This gives the model a clearer hierarchy and can sometimes reduce the amount of code.
Note that each example below uses only one kind of container, but in practice the containers are very flexible and can be combined and nested arbitrarily within one model, as the sketch below illustrates.
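A hedged illustration of this flexibility (the class and attribute names here are made up for the example): nn.Sequential blocks nested inside an nn.ModuleList, iterated over in forward:
class TinyNet(nn.Module):
    """Illustrative only: nn.Sequential blocks nested inside an nn.ModuleList."""
    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList([
            nn.Sequential(nn.Conv2d(3,16,kernel_size = 3), nn.ReLU()),
            nn.Sequential(nn.Conv2d(16,32,kernel_size = 3), nn.ReLU())
        ])
        self.head = nn.Sequential(nn.AdaptiveMaxPool2d((1,1)), nn.Flatten(), nn.Linear(32,1))
    def forward(self,x):
        for block in self.blocks:
            x = block(x)
        return self.head(x)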
nn.Sequential as a model container
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            nn.Dropout2d(p = 0.1),
            nn.AdaptiveMaxPool2d((1,1))
        )
        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,1)
        )
    def forward(self,x):
        x = self.conv(x)
        y = self.dense(x)
        return y

net = Net()
print(net)
'''
Net(
  (conv): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Dropout2d(p=0.1, inplace=False)
    (5): AdaptiveMaxPool2d(output_size=(1, 1))
  )
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=64, out_features=32, bias=True)
    (2): ReLU()
    (3): Linear(in_features=32, out_features=1, bias=True)
  )
)
'''
nn.ModuleList as a model container
Note that the ModuleList below cannot be replaced with a plain Python list; a sketch after this note demonstrates why.
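A brief sketch of the reason (BadNet/GoodNet are illustrative names): layers held in a plain Python list are not registered as submodules, so their parameters are invisible to parameters() and hence to the optimizer:
class BadNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(2,2), nn.Linear(2,1)] # plain list: NOT registered

class GoodNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(2,2), nn.Linear(2,1)]) # registered

print(len(list(BadNet().parameters())))  # 0  -- an optimizer would see no parameters
print(len(list(GoodNet().parameters()))) # 4  -- 2 weights + 2 biases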
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.ModuleList([
            nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            nn.Dropout2d(p = 0.1),
            nn.AdaptiveMaxPool2d((1,1)),
            nn.Flatten(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,1)]
        )
    def forward(self,x):
        for layer in self.layers:
            x = layer(x)
        return x

net = Net()
print(net)
'''
Net(
  (layers): ModuleList(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Dropout2d(p=0.1, inplace=False)
    (5): AdaptiveMaxPool2d(output_size=(1, 1))
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=64, out_features=32, bias=True)
    (8): ReLU()
    (9): Linear(in_features=32, out_features=1, bias=True)
  )
)
'''
from torchkeras import summary

summary(net,input_shape= (3,32,32));
'''
--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
==========================================================================
Conv2d-1                            [-1, 32, 30, 30]                  896
MaxPool2d-2                         [-1, 32, 15, 15]                    0
Conv2d-3                            [-1, 64, 11, 11]               51,264
MaxPool2d-4                           [-1, 64, 5, 5]                    0
Dropout2d-5                           [-1, 64, 5, 5]                    0
AdaptiveMaxPool2d-6                   [-1, 64, 1, 1]                    0
Flatten-7                                   [-1, 64]                    0
Linear-8                                    [-1, 32]                2,080
ReLU-9                                      [-1, 32]                    0
Linear-10                                    [-1, 1]                   33
==========================================================================
Total params: 54,273
Trainable params: 54,273
Non-trainable params: 0
--------------------------------------------------------------------------
Input size (MB): 0.011719
Forward/backward pass size (MB): 0.359627
Params size (MB): 0.207035
Estimated Total Size (MB): 0.578381
--------------------------------------------------------------------------
'''
nn.ModuleDict as a model container
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers_dict = nn.ModuleDict({
            "conv1": nn.Conv2d(in_channels=3,out_channels=32,kernel_size = 3),
            "pool": nn.MaxPool2d(kernel_size = 2,stride = 2),
            "conv2": nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
            "dropout": nn.Dropout2d(p = 0.1),
            "adaptive": nn.AdaptiveMaxPool2d((1,1)),
            "flatten": nn.Flatten(),
            "linear1": nn.Linear(64,32),
            "relu": nn.ReLU(),
            "linear2": nn.Linear(32,1),
            "sigmoid": nn.Sigmoid() # required: forward looks this key up; it was missing from the dict
        })
    def forward(self,x):
        layers = ["conv1","pool","conv2","pool","dropout","adaptive",
                  "flatten","linear1","relu","linear2","sigmoid"]
        for layer in layers:
            x = self.layers_dict[layer](x)
        return x

net = Net()
print(net)
'''
Net(
  (layers_dict): ModuleDict(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
    (dropout): Dropout2d(p=0.1, inplace=False)
    (adaptive): AdaptiveMaxPool2d(output_size=(1, 1))
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear1): Linear(in_features=64, out_features=32, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=32, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)
'''
7.2. Training Models
PyTorch usually requires the user to write a custom training loop.
There are three typical styles of training-loop code:
- script-style training loop
- function-style training loop
- class-style training loop
Switching between training mode and inference mode:
- call model.train() when training, so that Batch Normalization and Dropout behave as they should during training
- call model.eval() when testing, so that Dropout is disabled and Batch Normalization uses its running statistics (see the sketch after this list)
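A minimal sketch of the toggle's effect, using a standalone Dropout layer:
import torch
from torch import nn

drop = nn.Dropout(p = 0.5)
x = torch.ones(8)
drop.train() # training mode: roughly half the entries are zeroed, the rest rescaled by 1/(1-p)
print(drop(x))
drop.eval()  # inference mode: Dropout is the identity
print(drop(x)) # tensor([1., 1., 1., 1., 1., 1., 1., 1.])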
Below we train a multiclass classifier on the MNIST dataset to demonstrate these three training styles.
For the class-style loop we demonstrate torchkeras.KerasModel (torchkeras.LightModel can be used the same way).
7.2.0. Preparing the Data
import torch
from torch import nn
import torchvision
from torchvision import transforms
# convert the image data to tensors
transform = transforms.Compose([transforms.ToTensor()])
ds_train = torchvision.datasets.MNIST(root="./data/mnist/",train=True,download=True,transform=transform)
ds_val = torchvision.datasets.MNIST(root="./data/mnist/",train=False,download=True,transform=transform)
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True, num_workers=4)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=128, shuffle=False, num_workers=4)
print(len(ds_train)) # 60000
print(len(ds_val)) # 10000
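As a quick sanity check (the shapes follow from MNIST images and batch_size=128), we can pull one batch from the dataloader:
features, labels = next(iter(dl_train))
print(features.shape) # torch.Size([128, 1, 28, 28])
print(labels.shape)   # torch.Size([128])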
7.2.1. Script Style
The script-style training loop is very common.
Build the model:
# build the model
net = nn.Sequential()
net.add_module("conv1",nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3))
net.add_module("pool1",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("conv2",nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5))
net.add_module("pool2",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("dropout",nn.Dropout2d(p = 0.1))
net.add_module("adaptive_pool",nn.AdaptiveMaxPool2d((1,1)))
net.add_module("flatten",nn.Flatten())
net.add_module("linear1",nn.Linear(64,32))
net.add_module("relu",nn.ReLU())
net.add_module("linear2",nn.Linear(32,10))
print(net)
'''
Sequential(
(conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
(pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
(pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(dropout): Dropout2d(p=0.1, inplace=False)
(adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear1): Linear(in_features=64, out_features=32, bias=True)
(relu): ReLU()
(linear2): Linear(in_features=32, out_features=10, bias=True)
)
'''
Train the model:
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm # progress-bar library
import torch
from torch import nn
from copy import deepcopy
from torchmetrics import Accuracy
# prints a log message with a timestamp
def printlog(info):
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("\n"+"=========="*8 + "%s"%nowtime)
print(str(info)+"\n")
# cross-entropy loss, Adam optimizer, accuracy metric
loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}
# number of epochs, checkpoint path, early-stopping monitor metric, patience, mode, and a dict for the history
epochs = 20
ckpt_path='checkpoint.pt'
# early-stopping settings
monitor="val_acc"
patience=5
mode="max"
history = {}
# In each epoch we run a training pass (forward + backward) followed by a
# validation pass (forward only); the losses and metrics of every epoch are
# accumulated into the history dict.
for epoch in range(1, epochs+1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1,train -------------------------------------------------
    # training mode
    net.train()
    # running loss and step counter for this epoch
    total_loss,step = 0,0
    # wrap the dataloader dl_train with tqdm to display a progress bar;
    # enumerate yields the batch index together with the batch itself
    loop = tqdm(enumerate(dl_train), total =len(dl_train))
    # deep-copy metrics_dict to accumulate this epoch's training metrics
    train_metrics_dict = deepcopy(metrics_dict)
    # iterate over the training set dl_train; each batch is one mini-batch
    for i, batch in loop:
        # unpack the batch: features are the inputs, labels the targets
        features,labels = batch
        # forward pass: compute the predictions
        preds = net(features)
        # compute the loss (the objective)
        loss = loss_fn(preds,labels)
        # backward pass: compute the gradients
        loss.backward()
        # update the model parameters with the optimizer
        optimizer.step()
        # zero the optimizer's gradients for the next step
        optimizer.zero_grad()
        # metrics: evaluate this batch; train_metrics_dict holds e.g. accuracy
        step_metrics = {"train_"+name:metric_fn(preds, labels).item()
                        for name,metric_fn in train_metrics_dict.items()}
        # store this batch's loss and metrics in the step_log dict
        step_log = dict({"train_loss":loss.item()},**step_metrics)
        # accumulate the batch loss into the running total
        total_loss += loss.item()
        # increment the step counter
        step+=1
        # on every batch but the last, show step_log on the progress bar;
        # on the last batch, compute the epoch's average loss and metrics,
        # show epoch_log, and reset the metric objects
        if i!=len(dl_train)-1:
            loop.set_postfix(**step_log)
        else:
            epoch_loss = total_loss/step
            epoch_metrics = {"train_"+name:metric_fn.compute().item()
                             for name,metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
            loop.set_postfix(**epoch_log)
            for name,metric_fn in train_metrics_dict.items():
                metric_fn.reset()
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 2,validate -------------------------------------------------
    # switch to evaluation mode, disabling layers such as dropout that only
    # act during training
    net.eval()
    # running loss and step counter for this validation pass
    total_loss,step = 0,0
    # wrap the validation dataloader dl_val with tqdm to display a progress bar
    loop = tqdm(enumerate(dl_val), total =len(dl_val))
    # deep-copy metrics_dict to accumulate this epoch's validation metrics
    val_metrics_dict = deepcopy(metrics_dict)
    # disable gradient tracking to save memory and speed up evaluation
    with torch.no_grad():
        for i, batch in loop:
            # unpack the batch: features are the inputs, labels the targets
            features,labels = batch
            # forward pass
            preds = net(features)
            # compute this batch's loss with the predefined loss function loss_fn
            loss = loss_fn(preds,labels)
            # metrics: evaluate this batch; val_metrics_dict holds e.g. accuracy
            step_metrics = {"val_"+name:metric_fn(preds, labels).item()
                            for name,metric_fn in val_metrics_dict.items()}
            # store this batch's loss and metrics in the step_log dict
            step_log = dict({"val_loss":loss.item()},**step_metrics)
            # accumulate the batch loss into the running total
            total_loss += loss.item()
            # increment the step counter
            step+=1
            # on every batch but the last, show step_log on the progress bar;
            # on the last batch, compute the epoch's average loss and metrics,
            # show epoch_log, and reset the metric objects
            if i!=len(dl_val)-1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss/step)
                epoch_metrics = {"val_"+name:metric_fn.compute().item()
                                 for name,metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
                loop.set_postfix(**epoch_log)
                for name,metric_fn in val_metrics_dict.items():
                    metric_fn.reset()
    epoch_log["epoch"] = epoch
    # store this epoch's validation loss and metrics in the history dict
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 3,early-stopping -------------------------------------------------
    # save the model whenever the monitored validation metric improves, and
    # stop training once it has not improved for `patience` epochs
    # fetch the history of the monitored metric
    arr_scores = history[monitor]
    # find the index of the best score according to the monitoring mode (max or min)
    best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
    # if the current epoch achieved the best score so far, save the model
    # parameters and print a message
    if best_score_idx==len(arr_scores)-1:
        torch.save(net.state_dict(),ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
            arr_scores[best_score_idx]),file=sys.stderr)
    # if there has been no improvement within `patience` epochs, print a
    # message and stop training
    if len(arr_scores)-best_score_idx>patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor,patience),file=sys.stderr)
        break

# load the best saved model parameters
net.load_state_dict(torch.load(ckpt_path))
# convert the history to a DataFrame
dfhistory = pd.DataFrame(history)
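With the history in a DataFrame, the learning curves can be plotted; a minimal sketch, assuming the column names accumulated above ("epoch", "train_acc", "val_acc"):
import matplotlib.pyplot as plt

plt.plot(dfhistory["epoch"], dfhistory["train_acc"], label="train_acc")
plt.plot(dfhistory["epoch"], dfhistory["val_acc"], label="val_acc")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()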
7.2.2. Function Style
This style wraps the script form in a further layer of functions.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layers = nn.ModuleList([
nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3),
nn.MaxPool2d(kernel_size = 2,stride = 2),
nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
nn.MaxPool2d(kernel_size = 2,stride = 2),
nn.Dropout2d(p = 0.1),
nn.AdaptiveMaxPool2d((1,1)),
nn.Flatten(),
nn.Linear(64,32),
nn.ReLU(),
nn.Linear(32,10)]
)
def forward(self,x):
for layer in self.layers:
x = layer(x)
return x
net = Net()
print(net)
'''
Net(
(layers): ModuleList(
(0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
(1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Dropout2d(p=0.1, inplace=False)
(5): AdaptiveMaxPool2d(output_size=(1, 1))
(6): Flatten(start_dim=1, end_dim=-1)
(7): Linear(in_features=64, out_features=32, bias=True)
(8): ReLU()
(9): Linear(in_features=32, out_features=10, bias=True)
)
)
'''
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
import torch
from torch import nn
from copy import deepcopy
def printlog(info):
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("\n"+"=========="*8 + "%s"%nowtime)
print(str(info)+"\n")
class StepRunner:
def __init__(self, net, loss_fn,
stage = "train", metrics_dict = None,
optimizer = None
):
self.net,self.loss_fn,self.metrics_dict,self.stage = net,loss_fn,metrics_dict,stage
self.optimizer = optimizer
def step(self, features, labels):
#loss
preds = self.net(features)
loss = self.loss_fn(preds,labels)
#backward()
if self.optimizer is not None and self.stage=="train":
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
#metrics
step_metrics = {self.stage+"_"+name:metric_fn(preds, labels).item()
for name,metric_fn in self.metrics_dict.items()}
return loss.item(),step_metrics
def train_step(self,features,labels):
        self.net.train() # train mode: dropout layers are active
return self.step(features,labels)
@torch.no_grad()
def eval_step(self,features,labels):
        self.net.eval() # eval mode: dropout layers are inactive
return self.step(features,labels)
def __call__(self,features,labels):
if self.stage=="train":
return self.train_step(features,labels)
else:
return self.eval_step(features,labels)
class EpochRunner:
def __init__(self,steprunner):
self.steprunner = steprunner
self.stage = steprunner.stage
def __call__(self,dataloader):
total_loss,step = 0,0
loop = tqdm(enumerate(dataloader), total =len(dataloader))
for i, batch in loop:
loss, step_metrics = self.steprunner(*batch)
step_log = dict({self.stage+"_loss":loss},**step_metrics)
total_loss += loss
step+=1
if i!=len(dataloader)-1:
loop.set_postfix(**step_log)
else:
epoch_loss = total_loss/step
epoch_metrics = {self.stage+"_"+name:metric_fn.compute().item()
for name,metric_fn in self.steprunner.metrics_dict.items()}
epoch_log = dict({self.stage+"_loss":epoch_loss},**epoch_metrics)
loop.set_postfix(**epoch_log)
for name,metric_fn in self.steprunner.metrics_dict.items():
metric_fn.reset()
return epoch_log
def train_model(net, optimizer, loss_fn, metrics_dict,
train_data, val_data=None,
epochs=10, ckpt_path='checkpoint.pt',
patience=5, monitor="val_loss", mode="min"):
history = {}
for epoch in range(1, epochs+1):
printlog("Epoch {0} / {1}".format(epoch, epochs))
# 1,train -------------------------------------------------
train_step_runner = StepRunner(net = net,stage="train",
loss_fn = loss_fn,metrics_dict=deepcopy(metrics_dict),
optimizer = optimizer)
train_epoch_runner = EpochRunner(train_step_runner)
train_metrics = train_epoch_runner(train_data)
for name, metric in train_metrics.items():
history[name] = history.get(name, []) + [metric]
# 2,validate -------------------------------------------------
if val_data:
val_step_runner = StepRunner(net = net,stage="val",
loss_fn = loss_fn,metrics_dict=deepcopy(metrics_dict))
val_epoch_runner = EpochRunner(val_step_runner)
with torch.no_grad():
val_metrics = val_epoch_runner(val_data)
val_metrics["epoch"] = epoch
for name, metric in val_metrics.items():
history[name] = history.get(name, []) + [metric]
# 3,early-stopping -------------------------------------------------
arr_scores = history[monitor]
best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
if best_score_idx==len(arr_scores)-1:
torch.save(net.state_dict(),ckpt_path)
print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
arr_scores[best_score_idx]),file=sys.stderr)
if len(arr_scores)-best_score_idx>patience:
print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
monitor,patience),file=sys.stderr)
break
net.load_state_dict(torch.load(ckpt_path))
return pd.DataFrame(history)
from torchmetrics import Accuracy
loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}
dfhistory = train_model(net,
optimizer,
loss_fn,
metrics_dict,
train_data = dl_train,
val_data= dl_val,
epochs=10,
patience=3,
monitor="val_acc",
mode="max")
7.2.3. Class Style
Here we use the fit method of the high-level torchkeras.KerasModel API to train the model.
Training a model this way is very concise.
from torchkeras import KerasModel
class Net(nn.Module):
def __init__(self):
super().__init__()
self.layers = nn.ModuleList([
nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3),
nn.MaxPool2d(kernel_size = 2,stride = 2),
nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
nn.MaxPool2d(kernel_size = 2,stride = 2),
nn.Dropout2d(p = 0.1),
nn.AdaptiveMaxPool2d((1,1)),
nn.Flatten(),
nn.Linear(64,32),
nn.ReLU(),
nn.Linear(32,10)]
)
def forward(self,x):
for layer in self.layers:
x = layer(x)
return x
net = Net()
print(net)
'''
Net(
(layers): ModuleList(
(0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
(1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Dropout2d(p=0.1, inplace=False)
(5): AdaptiveMaxPool2d(output_size=(1, 1))
(6): Flatten(start_dim=1, end_dim=-1)
(7): Linear(in_features=64, out_features=32, bias=True)
(8): ReLU()
(9): Linear(in_features=32, out_features=10, bias=True)
)
)
'''
from torchmetrics import Accuracy
model = KerasModel(net,
loss_fn=nn.CrossEntropyLoss(),
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)},
optimizer = torch.optim.Adam(net.parameters(),lr = 0.01) )
model.fit(
train_data = dl_train,
val_data= dl_val,
epochs=10,
patience=3,
monitor="val_acc",
mode="max",
plot=True,
cpu=True
)
7.3. Training Models on a GPU
Training time is consumed by two main phases: data preparation and parameter iteration.
- When data preparation is the main bottleneck, use more worker processes to prepare the data (see the DataLoader sketch after this list).
- When parameter iteration is the main bottleneck, the usual remedy is to accelerate with a GPU.
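A hedged sketch of the first point (the numbers are illustrative): DataLoader prepares batches in background worker processes, and pin_memory speeds up host-to-GPU copies:
from torch.utils.data import DataLoader

# num_workers > 0 prepares batches in background processes;
# pin_memory=True allocates page-locked host memory so that
# .to(device, non_blocking=True) copies can overlap with compute
dl_train = DataLoader(ds_train, batch_size=128, shuffle=True,
                      num_workers=4, pin_memory=True)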
Using a GPU to accelerate a model in PyTorch is very simple: just move the model and the data onto the GPU. The core code is only a few lines:
# define the model
...
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device) # move the model to cuda

# train the model
...
features = features.to(device) # move the data to cuda
labels = labels.to(device) # or: labels = labels.cuda() if torch.cuda.is_available() else labels
...
Training with multiple GPUs is just as simple: wrap the model as a data-parallel model. Once the model is moved to the GPUs, a replica is kept on each GPU and every batch of data is split evenly across them. The core code is as follows:
# define the model
...
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model) # wrap as a data-parallel model

# train the model
...
features = features.to(device) # move the data to cuda
labels = labels.to(device) # or: labels = labels.cuda() if torch.cuda.is_available() else labels
...
7.3.0. Summary of GPU-Related Operations
Query GPU information
import torch
from torch import nn

# 1. query gpu information
if_cuda = torch.cuda.is_available()
print("if_cuda=",if_cuda) # True

gpu_count = torch.cuda.device_count()
print("gpu_count=",gpu_count) # 1
Move tensors between GPU and CPU
# 2. move tensors between gpu and cpu
tensor = torch.rand((100,100))
tensor_gpu = tensor.to("cuda:0") # or: tensor_gpu = tensor.cuda()
print(tensor_gpu.device) # cuda:0
print(tensor_gpu.is_cuda) # True

tensor_cpu = tensor_gpu.to("cpu") # or: tensor_cpu = tensor_gpu.cpu()
print(tensor_cpu.device) # cpu
Move all tensors of a model to the GPU
# 3. move all tensors of a model to the gpu
net = nn.Linear(2,1)
print(next(net.parameters()).is_cuda) # False

# moves every parameter tensor of the model to the GPU in place;
# note that no reassignment net = net.to("cuda:0") is needed
net.to("cuda:0")
print(next(net.parameters()).is_cuda) # True
print(next(net.parameters()).device) # cuda:0
Create a data-parallel model supporting multiple GPUs
# 4. create a data-parallel model supporting multiple gpus
linear = nn.Linear(2,1)
print(next(linear.parameters()).device) # cpu

model = nn.DataParallel(linear)
print(model.device_ids) # [0]
print(next(model.module.parameters()).device) # cuda:0

# note: when saving parameters, save those of model.module
torch.save(model.module.state_dict(), "model_parameter.pt")

linear = nn.Linear(2,1)
linear.load_state_dict(torch.load("model_parameter.pt"))
7.3.1. Matrix Multiplication Example
Below we run the same matrix multiplication on the CPU and on the GPU and compare the compute time (see the timing caveat after the code).
import time
import torch
from torch import nn
# on the CPU
a = torch.rand((10000,200))
b = torch.rand((200,10000))
tic = time.time()
c = torch.matmul(a,b)
toc = time.time()
print(toc-tic)
print(a.device)
print(b.device)
# on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
a = torch.rand((10000,200),device = device) # a tensor can be created directly on the GPU
b = torch.rand((200,10000)) # or created on the CPU and then moved to the GPU
b = b.to(device) # or: b = b.cuda() if torch.cuda.is_available() else b
tic = time.time()
c = torch.matmul(a,b)
toc = time.time()
print(toc-tic)
print(a.device)
print(b.device)
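One caveat: CUDA kernels launch asynchronously, so time.time() can return before the multiplication has actually finished, which makes the GPU look faster than it is. For an honest measurement, synchronize before reading the clock; a minimal sketch:
tic = time.time()
c = torch.matmul(a,b)
torch.cuda.synchronize() # wait for all queued CUDA kernels to finish before stopping the clock
toc = time.time()
print(toc-tic)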
7.3.2. Linear Regression Example
Below we compare the efficiency of training a linear regression model on the CPU and on the GPU.
On the CPU:
# prepare the data
n = 1000000 # number of samples
X = 10*torch.rand([n,2])-5.0 # torch.rand draws from a uniform distribution
w0 = torch.tensor([[2.0,-3.0]])
b0 = torch.tensor([[10.0]])
Y = X@w0.t() + b0 + torch.normal( 0.0,2.0,size = [n,1]) # @ is matrix multiplication; add Gaussian noise
# define the model
class LinearRegression(nn.Module):
def __init__(self):
super().__init__()
self.w = nn.Parameter(torch.randn_like(w0))
self.b = nn.Parameter(torch.zeros_like(b0))
    # forward pass
def forward(self,x):
return x@self.w.t() + self.b
linear = LinearRegression()
# train the model
optimizer = torch.optim.Adam(linear.parameters(),lr = 0.1)
loss_fn = nn.MSELoss()
def train(epochs):
    tic = time.time()
    for epoch in range(epochs):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred,Y)
        loss.backward()
        optimizer.step()
        if epoch%50==0:
            print({"epoch":epoch,"loss":loss.item()})
    toc = time.time()
    print("time used:",toc-tic)

train(500)
On the GPU:
# prepare the data
n = 1000000 # number of samples
X = 10*torch.rand([n,2])-5.0 # torch.rand draws from a uniform distribution
w0 = torch.tensor([[2.0,-3.0]])
b0 = torch.tensor([[10.0]])
Y = X@w0.t() + b0 + torch.normal( 0.0,2.0,size = [n,1]) # @ is matrix multiplication; add Gaussian noise

# move the data to the GPU
print("torch.cuda.is_available() = ",torch.cuda.is_available())
X = X.cuda()
Y = Y.cuda()
print("X.device:",X.device)
print("Y.device:",Y.device)
# define the model
class LinearRegression(nn.Module):
def __init__(self):
super().__init__()
self.w = nn.Parameter(torch.randn_like(w0))
self.b = nn.Parameter(torch.zeros_like(b0))
    # forward pass
def forward(self,x):
return x@self.w.t() + self.b
linear = LinearRegression()
# move the model to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
linear.to(device)
# check whether the model has been moved to the GPU
print("if on cuda:",next(linear.parameters()).is_cuda)
# train the model
optimizer = torch.optim.Adam(linear.parameters(),lr = 0.1)
loss_fn = nn.MSELoss()
def train(epochs):
    tic = time.time()
    for epoch in range(epochs):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred,Y)
        loss.backward()
        optimizer.step()
        if epoch%50==0:
            print({"epoch":epoch,"loss":loss.item()})
    toc = time.time()
    print("time used:",toc-tic)

train(500)
7.3.3. Image Classification Example
import torch
from torch import nn
import torchvision
from torchvision import transforms
# prepare the data
transform = transforms.Compose([transforms.ToTensor()])
ds_train = torchvision.datasets.MNIST(root="mnist/",train=True,download=True,transform=transform)
ds_val = torchvision.datasets.MNIST(root="mnist/",train=False,download=True,transform=transform)
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True, num_workers=2)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=128, shuffle=False, num_workers=2)
print(len(ds_train))
print(len(ds_val))
# define the model
def create_net():
net = nn.Sequential()
net.add_module("conv1",nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3))
net.add_module("pool1",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("conv2",nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5))
net.add_module("pool2",nn.MaxPool2d(kernel_size = 2,stride = 2))
net.add_module("dropout",nn.Dropout2d(p = 0.1))
net.add_module("adaptive_pool",nn.AdaptiveMaxPool2d((1,1)))
net.add_module("flatten",nn.Flatten())
net.add_module("linear1",nn.Linear(64,32))
net.add_module("relu",nn.ReLU())
net.add_module("linear2",nn.Linear(32,10))
return net
net = create_net()
print(net)
Training on the CPU:
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
import torch
from torch import nn
from copy import deepcopy
from torchmetrics import Accuracy
# note: for multiclass tasks use the metrics from torchmetrics; for binary tasks use those in torchkeras.metrics
def printlog(info):
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("\n"+"=========="*8 + "%s"%nowtime)
print(str(info)+"\n")
net = create_net()
loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}
epochs = 3
ckpt_path='checkpoint.pt'
# early-stopping settings
monitor="val_acc"
patience=1
mode="max"
history = {}
for epoch in range(1, epochs+1):
printlog("Epoch {0} / {1}".format(epoch, epochs))
# 1,train -------------------------------------------------
net.train()
total_loss,step = 0,0
loop = tqdm(enumerate(dl_train), total =len(dl_train),file=sys.stdout)
train_metrics_dict = deepcopy(metrics_dict)
for i, batch in loop:
features,labels = batch
#forward
preds = net(features)
loss = loss_fn(preds,labels)
#backward
loss.backward()
optimizer.step()
optimizer.zero_grad()
#metrics
step_metrics = {"train_"+name:metric_fn(preds, labels).item()
for name,metric_fn in train_metrics_dict.items()}
step_log = dict({"train_loss":loss.item()},**step_metrics)
total_loss += loss.item()
step+=1
if i!=len(dl_train)-1:
loop.set_postfix(**step_log)
else:
epoch_loss = total_loss/step
epoch_metrics = {"train_"+name:metric_fn.compute().item()
for name,metric_fn in train_metrics_dict.items()}
epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
loop.set_postfix(**epoch_log)
for name,metric_fn in train_metrics_dict.items():
metric_fn.reset()
for name, metric in epoch_log.items():
history[name] = history.get(name, []) + [metric]
# 2,validate -------------------------------------------------
net.eval()
total_loss,step = 0,0
loop = tqdm(enumerate(dl_val), total =len(dl_val),file=sys.stdout)
val_metrics_dict = deepcopy(metrics_dict)
with torch.no_grad():
for i, batch in loop:
features,labels = batch
#forward
preds = net(features)
loss = loss_fn(preds,labels)
#metrics
step_metrics = {"val_"+name:metric_fn(preds, labels).item()
for name,metric_fn in val_metrics_dict.items()}
step_log = dict({"val_loss":loss.item()},**step_metrics)
total_loss += loss.item()
step+=1
if i!=len(dl_val)-1:
loop.set_postfix(**step_log)
else:
epoch_loss = (total_loss/step)
epoch_metrics = {"val_"+name:metric_fn.compute().item()
for name,metric_fn in val_metrics_dict.items()}
epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
loop.set_postfix(**epoch_log)
for name,metric_fn in val_metrics_dict.items():
metric_fn.reset()
epoch_log["epoch"] = epoch
for name, metric in epoch_log.items():
history[name] = history.get(name, []) + [metric]
# 3,early-stopping -------------------------------------------------
arr_scores = history[monitor]
best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
if best_score_idx==len(arr_scores)-1:
torch.save(net.state_dict(),ckpt_path)
print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
arr_scores[best_score_idx]))
if len(arr_scores)-best_score_idx>patience:
print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
monitor,patience))
break
net.load_state_dict(torch.load(ckpt_path))
dfhistory = pd.DataFrame(history)
Training on the GPU:
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
import torch
from torch import nn
from copy import deepcopy
from torchmetrics import Accuracy
# note: for multiclass tasks use the metrics from torchmetrics; for binary tasks use those in torchkeras.metrics
def printlog(info):
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("\n"+"=========="*8 + "%s"%nowtime)
print(str(info)+"\n")
net = create_net()
loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}
# ========================= move the model to the GPU ==============================
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
loss_fn.to(device)
for name,fn in metrics_dict.items():
fn.to(device)
# ====================================================================
epochs = 5
ckpt_path='checkpoint.pt'
# early-stopping settings
monitor="val_acc"
patience=1
mode="max"
history = {}
for epoch in range(1, epochs+1):
printlog("Epoch {0} / {1}".format(epoch, epochs))
# 1,train -------------------------------------------------
net.train()
total_loss,step = 0,0
loop = tqdm(enumerate(dl_train), total =len(dl_train),file=sys.stdout)
train_metrics_dict = deepcopy(metrics_dict)
for i, batch in loop:
features,labels = batch
        # ========================= move the data to the GPU ==============================
features = features.to(device)
labels = labels.to(device)
# ====================================================================
#forward
preds = net(features)
loss = loss_fn(preds,labels)
#backward
loss.backward()
optimizer.step()
optimizer.zero_grad()
#metrics
step_metrics = {"train_"+name:metric_fn(preds, labels).item()
for name,metric_fn in train_metrics_dict.items()}
step_log = dict({"train_loss":loss.item()},**step_metrics)
total_loss += loss.item()
step+=1
if i!=len(dl_train)-1:
loop.set_postfix(**step_log)
else:
epoch_loss = total_loss/step
epoch_metrics = {"train_"+name:metric_fn.compute().item()
for name,metric_fn in train_metrics_dict.items()}
epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
loop.set_postfix(**epoch_log)
for name,metric_fn in train_metrics_dict.items():
metric_fn.reset()
for name, metric in epoch_log.items():
history[name] = history.get(name, []) + [metric]
# 2,validate -------------------------------------------------
net.eval()
total_loss,step = 0,0
loop = tqdm(enumerate(dl_val), total =len(dl_val),file=sys.stdout)
val_metrics_dict = deepcopy(metrics_dict)
with torch.no_grad():
for i, batch in loop:
features,labels = batch
            # ========================= move the data to the GPU ==============================
features = features.to(device)
labels = labels.to(device)
# ====================================================================
#forward
preds = net(features)
loss = loss_fn(preds,labels)
#metrics
step_metrics = {"val_"+name:metric_fn(preds, labels).item()
for name,metric_fn in val_metrics_dict.items()}
step_log = dict({"val_loss":loss.item()},**step_metrics)
total_loss += loss.item()
step+=1
if i!=len(dl_val)-1:
loop.set_postfix(**step_log)
else:
epoch_loss = (total_loss/step)
epoch_metrics = {"val_"+name:metric_fn.compute().item()
for name,metric_fn in val_metrics_dict.items()}
epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
loop.set_postfix(**epoch_log)
for name,metric_fn in val_metrics_dict.items():
metric_fn.reset()
epoch_log["epoch"] = epoch
for name, metric in epoch_log.items():
history[name] = history.get(name, []) + [metric]
# 3,early-stopping -------------------------------------------------
arr_scores = history[monitor]
best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
if best_score_idx==len(arr_scores)-1:
torch.save(net.state_dict(),ckpt_path)
print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
arr_scores[best_score_idx]))
if len(arr_scores)-best_score_idx>patience:
print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
monitor,patience))
break
net.load_state_dict(torch.load(ckpt_path))
dfhistory = pd.DataFrame(history)
7.3.4. Using the GPU with torchkeras.KerasModel
As the examples above show, using a GPU in PyTorch is not complicated, but for anyone who trains models often, shuttling the model and the data back and forth by hand is tedious.
It is easy to forget to move some tensor or some module, which results in runtime errors.
torchkeras.KerasModel was designed with this in mind: if a usable GPU exists in the environment it is used automatically, otherwise the CPU is used.
By drawing on some basic functionality from accelerate, torchkeras.KerasModel switches between GPU and CPU very elegantly.
See the source code of torchkeras.KerasModel for the implementation details.
import accelerate
accelerator = accelerate.Accelerator()
print(accelerator.device) # cuda
from torchkeras import KerasModel
from torchmetrics import Accuracy
net = create_net()
model = KerasModel(net,
loss_fn=nn.CrossEntropyLoss(),
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)},
optimizer = torch.optim.Adam(net.parameters(),lr = 0.01) )
model.fit(
train_data = dl_train,
val_data= dl_val,
epochs=10,
patience=3,
monitor="val_acc",
mode="max")