ShopTRAINING/server/trainers/tcn_trainer.py
2025-07-02 11:05:23 +08:00

506 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
药店销售预测系统 - TCN模型训练函数
"""
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from tqdm import tqdm
from models.tcn_model import TCNForecaster
from utils.data_utils import create_dataset, PharmacyDataset
from utils.visualization import plot_loss_curve
from analysis.metrics import evaluate_model
from core.config import DEVICE, DEFAULT_MODEL_DIR, LOOK_BACK, FORECAST_HORIZON
from utils.training_progress import progress_manager
def save_checkpoint(checkpoint_data: dict, epoch_or_label, product_id: str,
                    model_type: str, model_dir: str, store_id=None,
                    training_mode: str = 'product', aggregation_method=None):
    """
    Write a training checkpoint into ``<model_dir>/checkpoints``.

    The file name encodes the training scope:

    * store mode with a store id:
      ``{model_type}_store_{store_id}_{product_id}_epoch_{label}.pth``
    * global mode with an aggregation method:
      ``{model_type}_global_{product_id}_{aggregation_method}_epoch_{label}.pth``
    * anything else:
      ``{model_type}_product_{product_id}_epoch_{label}.pth``

    Args:
        checkpoint_data: Object handed to ``torch.save``.
        epoch_or_label: Epoch number or a label such as ``'best'``.
        product_id: Product identifier.
        model_type: Model type tag used as the file name prefix.
        model_dir: Base directory for saved models.
        store_id: Store identifier (only used in store mode).
        training_mode: Training mode (``'store'``, ``'global'`` or other).
        aggregation_method: Aggregation method (only used in global mode).

    Returns:
        The path of the checkpoint file that was written.
    """
    # Make sure the checkpoint folder exists before writing into it.
    ckpt_root = os.path.join(model_dir, 'checkpoints')
    os.makedirs(ckpt_root, exist_ok=True)

    # Pick the scope-specific stem; the epoch suffix is shared by all modes.
    store_scoped = training_mode == 'store' and store_id
    global_scoped = training_mode == 'global' and aggregation_method
    if store_scoped:
        stem = f"{model_type}_store_{store_id}_{product_id}"
    elif global_scoped:
        stem = f"{model_type}_global_{product_id}_{aggregation_method}"
    else:
        stem = f"{model_type}_product_{product_id}"
    checkpoint_path = os.path.join(ckpt_root, f"{stem}_epoch_{epoch_or_label}.pth")

    # Serialize and report where the checkpoint went.
    torch.save(checkpoint_data, checkpoint_path)
    print(f"[TCN] 检查点已保存: {checkpoint_path}", flush=True)
    return checkpoint_path
def train_product_model_with_tcn(
    product_id,
    store_id=None,
    training_mode='product',
    aggregation_method='sum',
    epochs=50,
    model_dir=DEFAULT_MODEL_DIR,
    version=None,
    socketio=None,
    task_id=None,
    continue_training=False
):
    """
    Train a TCN sales-forecasting model for one product.

    The function loads the product's sales rows (per store, globally
    aggregated, or across all stores depending on ``training_mode``), scales
    them with ``MinMaxScaler``, splits them 80/20 into train/test, builds
    sliding windows with ``create_dataset``, trains a ``TCNForecaster`` with
    Adam + MSE loss, periodically writes checkpoints through
    ``save_checkpoint``, evaluates on the test windows and saves a final
    checkpoint.

    Args:
        product_id: Product identifier.
        store_id: Store identifier; only used when ``training_mode`` is
            ``'store'``.
        training_mode: ``'store'`` (single store), ``'global'`` (aggregate
            all stores) or anything else (all stores' rows for the product).
        aggregation_method: Aggregation method used in ``'global'`` mode.
        epochs: Number of training epochs; must be at least 1.
        model_dir: Directory in which checkpoints and the loss curve are
            stored. Defaults to ``DEFAULT_MODEL_DIR`` from the configuration.
        version: Version label. When ``None`` it is resolved from
            ``core.config`` (latest version when continuing, otherwise next).
        socketio: Optional Socket.IO-like object used to push progress events.
        task_id: Training task id attached to progress events.
        continue_training: Whether to try to warm-start from an existing
            model file of the resolved version.

    Returns:
        A tuple ``(model, metrics, epochs, final_model_path)``:
        the trained model, the metrics dict returned by ``evaluate_model``
        extended with ``'training_time'``, the number of epochs that were
        run, and the path of the final checkpoint file. Note that the third
        element is the epoch count, not the version label.

    Raises:
        ValueError: If ``epochs`` is smaller than 1, if no sales data is
            available for the product, if there are fewer rows than
            ``LOOK_BACK + FORECAST_HORIZON``, or if the train or test split
            yields no sliding windows.
    """
    def emit_progress(message, progress=None, metrics=None):
        """Push a progress event to the front end when a socket and task id are set."""
        if socketio and task_id:
            data = {
                'task_id': task_id,
                'message': message,
                'timestamp': time.time()
            }
            if progress is not None:
                data['progress'] = progress
            if metrics is not None:
                data['metrics'] = metrics
            socketio.emit('training_progress', data, namespace='/training')

    # With zero epochs the loss histories stay empty and the final save would
    # fail on train_losses[-1]; reject the request before doing any work.
    if epochs < 1:
        raise ValueError(f"epochs 必须为正整数，当前值: {epochs}")

    # Resolve the version label.
    if version is None:
        from core.config import get_latest_model_version, get_next_model_version
        if continue_training:
            version = get_latest_model_version(product_id, 'tcn')
            if version is None:
                version = get_next_model_version(product_id, 'tcn')
        else:
            version = get_next_model_version(product_id, 'tcn')

    emit_progress(f"开始训练 TCN 模型版本 {version}")

    # Load the data that matches the requested training mode.
    from utils.multi_store_data_utils import load_multi_store_data, get_store_product_sales_data, aggregate_multi_store_data

    try:
        if training_mode == 'store' and store_id:
            # Rows of one specific store.
            product_df = get_store_product_sales_data(
                store_id,
                product_id,
                'pharmacy_sales_multi_store.csv'
            )
            training_scope = f"店铺 {store_id}"
        elif training_mode == 'global':
            # Rows aggregated over all stores.
            product_df = aggregate_multi_store_data(
                product_id,
                aggregation_method=aggregation_method,
                file_path='pharmacy_sales_multi_store.csv'
            )
            training_scope = f"全局聚合({aggregation_method})"
        else:
            # Default: the product's rows from every store.
            product_df = load_multi_store_data('pharmacy_sales_multi_store.csv', product_id=product_id)
            training_scope = "所有店铺"
    except Exception as e:
        print(f"多店铺数据加载失败: {e}")
        # Deliberate best-effort fallback: try the original single-file data.
        df = pd.read_excel('pharmacy_sales.xlsx')
        product_df = df[df['product_id'] == product_id].sort_values('date')
        training_scope = "原始数据"

    if product_df.empty:
        raise ValueError(f"产品 {product_id} 没有可用的销售数据")

    # Overall data-volume check.
    min_required_samples = LOOK_BACK + FORECAST_HORIZON
    if len(product_df) < min_required_samples:
        error_msg = (
            f"❌ 训练数据不足错误\n"
            f"当前配置需要: {min_required_samples} 天数据 (LOOK_BACK={LOOK_BACK} + FORECAST_HORIZON={FORECAST_HORIZON})\n"
            f"实际数据量: {len(product_df)} 天\n"
            f"产品ID: {product_id}, 训练模式: {training_mode}\n"
            f"建议解决方案:\n"
            f"1. 生成更多数据: uv run generate_multi_store_data.py\n"
            f"2. 调整配置参数: 减小 LOOK_BACK 或 FORECAST_HORIZON\n"
            f"3. 使用全局训练模式聚合更多数据"
        )
        print(error_msg)
        emit_progress(f"训练失败：数据不足 ({len(product_df)}/{min_required_samples} 天)")
        raise ValueError(error_msg)

    product_df = product_df.sort_values('date')
    product_name = product_df['product_name'].iloc[0]

    print(f"使用TCN模型训练产品 '{product_name}' (ID: {product_id}) 的销售预测模型")
    print(f"训练范围: {training_scope}")
    print(f"版本: {version}")
    print(f"使用设备: {DEVICE}")
    print(f"模型将保存到目录: {model_dir}")

    emit_progress(f"训练产品: {product_name} (ID: {product_id})")

    # Feature columns; 'sales' is both an input feature and the target.
    features = ['sales', 'price', 'weekday', 'month', 'is_holiday', 'is_weekend', 'is_promotion', 'temperature']

    X = product_df[features].values
    y = product_df[['sales']].values  # keep the target two-dimensional

    # Data preprocessing stage.
    progress_manager.set_stage("data_preprocessing", 0)
    emit_progress("数据预处理中...")

    # Scale features and target to [0, 1].
    # NOTE(review): both scalers are fitted on the full series before the
    # train/test split, so test-period min/max values influence the scaling
    # of the training data. Kept as-is to preserve existing behaviour.
    scaler_X = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Chronological split: first 80% for training, last 20% for testing.
    train_size = int(len(X_scaled) * 0.8)
    X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
    y_train, y_test = y_scaled[:train_size], y_scaled[train_size:]

    progress_manager.set_stage("data_preprocessing", 50)

    # Build the sliding-window samples.
    trainX, trainY = create_dataset(X_train, y_train, LOOK_BACK, FORECAST_HORIZON)
    testX, testY = create_dataset(X_test, y_test, LOOK_BACK, FORECAST_HORIZON)

    # The volume check above looks at the whole series only; either split can
    # still be too short to yield a single window. Without this guard an empty
    # test split ends in a ZeroDivisionError when averaging the test loss.
    if len(trainX) == 0 or len(testX) == 0:
        error_msg = (
            "❌ 训练数据不足错误\n"
            "按 80/20 划分后无法生成时间序列样本 "
            f"(训练样本数: {len(trainX)}, 测试样本数: {len(testX)})\n"
            f"训练集: {len(X_train)} 天, 测试集: {len(X_test)} 天 "
            f"(LOOK_BACK={LOOK_BACK}, FORECAST_HORIZON={FORECAST_HORIZON})\n"
            f"产品ID: {product_id}, 训练模式: {training_mode}"
        )
        print(error_msg)
        emit_progress("训练失败：划分后的训练集或测试集不足以生成时间序列样本")
        raise ValueError(error_msg)

    # Convert to PyTorch tensors.
    trainX_tensor = torch.Tensor(trainX)
    trainY_tensor = torch.Tensor(trainY)
    testX_tensor = torch.Tensor(testX)
    testY_tensor = torch.Tensor(testY)

    # Data loaders.
    train_dataset = PharmacyDataset(trainX_tensor, trainY_tensor)
    test_dataset = PharmacyDataset(testX_tensor, testY_tensor)

    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Tell the progress manager how much work one epoch is.
    total_batches = len(train_loader)
    total_samples = len(train_dataset)
    progress_manager.total_batches_per_epoch = total_batches
    progress_manager.batch_size = batch_size
    progress_manager.total_samples = total_samples

    progress_manager.set_stage("data_preprocessing", 100)

    # TCN hyperparameters and model construction.
    input_dim = X_train.shape[1]
    output_dim = FORECAST_HORIZON
    hidden_size = 64
    num_layers = 3
    kernel_size = 3
    dropout_rate = 0.2

    model = TCNForecaster(
        num_features=input_dim,
        output_sequence_length=output_dim,
        num_channels=[hidden_size] * num_layers,
        kernel_size=kernel_size,
        dropout=dropout_rate
    )

    # Model configuration stored in every checkpoint and in the final save.
    model_config = {
        'input_dim': input_dim,
        'output_dim': output_dim,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'dropout': dropout_rate,
        'kernel_size': kernel_size,
        'sequence_length': LOOK_BACK,
        'forecast_horizon': FORECAST_HORIZON,
        'model_type': 'tcn'
    }

    def build_training_info(**extra):
        """Return the training metadata dict, stamped with the current time."""
        info = {
            'product_id': product_id,
            'product_name': product_name,
            'training_mode': training_mode,
            'store_id': store_id,
            'aggregation_method': aggregation_method,
            'timestamp': time.time()
        }
        info.update(extra)
        return info

    # When continuing, try to warm-start from the existing model file.
    if continue_training and version != 'v1':
        try:
            from core.config import get_model_file_path
            existing_model_path = get_model_file_path(product_id, 'tcn', version)
            if os.path.exists(existing_model_path):
                # NOTE(review): if this file has the same layout as the
                # checkpoints saved below, it contains pickled MinMaxScaler
                # objects. PyTorch releases whose torch.load defaults to
                # weights_only=True would reject it, and the except branch
                # would then silently restart training from scratch.
                # Confirm against the pinned torch version.
                checkpoint = torch.load(existing_model_path, map_location=DEVICE)
                model.load_state_dict(checkpoint['model_state_dict'])
                print(f"加载现有模型: {existing_model_path}")
                emit_progress(f"加载现有模型版本 {version} 进行继续训练")
        except Exception as e:
            # Best effort: a failed warm start falls back to fresh training.
            print(f"无法加载现有模型，将重新开始训练: {e}")
            emit_progress("无法加载现有模型，重新开始训练")

    # Move the model to the configured device.
    model = model.to(DEVICE)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    emit_progress("开始模型训练...")

    # Training state.
    train_losses = []
    test_losses = []
    start_time = time.time()

    # Save a checkpoint roughly every 10% of the run, at least every epoch.
    checkpoint_interval = max(1, epochs // 10)
    best_loss = float('inf')

    progress_manager.set_stage("model_training", 0)
    emit_progress(f"开始训练 - 总epoch: {epochs}, 检查点间隔: {checkpoint_interval}")

    for epoch in range(epochs):
        # Start a new epoch.
        progress_manager.start_epoch(epoch)

        model.train()
        epoch_loss = 0

        for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

            # Give 2-D targets a trailing axis: (batch, horizon) -> (batch, horizon, 1).
            if y_batch.dim() == 2:
                y_batch = y_batch.unsqueeze(-1)

            # Forward pass.
            outputs = model(X_batch)

            # Outputs and targets must have matching shapes here.
            loss = criterion(outputs, y_batch)

            # Backward pass and optimizer step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            # Report batch progress every 10 batches and on the last batch.
            if batch_idx % 10 == 0 or batch_idx == len(train_loader) - 1:
                current_lr = optimizer.param_groups[0]['lr']
                progress_manager.update_batch(batch_idx, loss.item(), current_lr)

        # Mean training loss over the epoch's batches.
        train_loss = epoch_loss / len(train_loader)
        train_losses.append(train_loss)

        # Validation stage.
        progress_manager.set_stage("validation", 0)

        # Evaluate on the test windows.
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for batch_idx, (X_batch, y_batch) in enumerate(test_loader):
                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

                # Same target reshaping as in training.
                if y_batch.dim() == 2:
                    y_batch = y_batch.unsqueeze(-1)

                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                test_loss += loss.item()

                # Report validation progress every 5 batches and on the last batch.
                if batch_idx % 5 == 0 or batch_idx == len(test_loader) - 1:
                    val_progress = (batch_idx / len(test_loader)) * 100
                    progress_manager.set_stage("validation", val_progress)

        test_loss = test_loss / len(test_loader)
        test_losses.append(test_loss)

        # Close the epoch in the progress manager.
        progress_manager.finish_epoch(train_loss, test_loss)

        # Socket progress event (kept for compatibility with the old system).
        if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
            progress = ((epoch + 1) / epochs) * 100
            current_metrics = {
                'train_loss': train_loss,
                'test_loss': test_loss,
                'epoch': epoch + 1,
                'total_epochs': epochs
            }
            emit_progress(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}",
                          progress=progress, metrics=current_metrics)

        # Periodic checkpoint (and always on the last epoch).
        if (epoch + 1) % checkpoint_interval == 0 or epoch == epochs - 1:
            checkpoint_data = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'test_loss': test_loss,
                'train_losses': train_losses,
                'test_losses': test_losses,
                'scaler_X': scaler_X,
                'scaler_y': scaler_y,
                'config': dict(model_config),
                'training_info': build_training_info()
            }

            save_checkpoint(checkpoint_data, epoch + 1, product_id, 'tcn',
                            model_dir, store_id, training_mode, aggregation_method)

            # The "best" checkpoint is only considered on checkpoint epochs,
            # so it tracks the best test loss among those epochs.
            if test_loss < best_loss:
                best_loss = test_loss
                save_checkpoint(checkpoint_data, 'best', product_id, 'tcn',
                                model_dir, store_id, training_mode, aggregation_method)
                emit_progress(f"💾 保存最佳模型检查点 (epoch {epoch+1}, test_loss: {test_loss:.4f})")

            emit_progress(f"💾 保存训练检查点 epoch_{epoch+1}")

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

    # Wall-clock training time.
    training_time = time.time() - start_time

    # Model saving stage.
    progress_manager.set_stage("model_saving", 0)
    emit_progress("训练完成，正在保存模型...")

    # Plot the loss curves and store the figure in the model directory.
    loss_curve_path = plot_loss_curve(
        train_losses,
        test_losses,
        product_name,
        'TCN',
        model_dir=model_dir
    )
    print(f"损失曲线已保存到: {loss_curve_path}")

    # Final evaluation on all test windows at once.
    model.eval()
    with torch.no_grad():
        test_pred = model(testX_tensor.to(DEVICE))
        # Drop the trailing axis; presumably yields [samples, forecast_horizon].
        test_pred = test_pred.squeeze(-1).cpu().numpy()

    # Undo the target scaling for predictions and ground truth.
    test_pred_inv = scaler_y.inverse_transform(test_pred.reshape(-1, 1)).flatten()
    test_true_inv = scaler_y.inverse_transform(testY.reshape(-1, 1)).flatten()

    # Compute the evaluation metrics.
    metrics = evaluate_model(test_true_inv, test_pred_inv)
    metrics['training_time'] = training_time

    # Print the evaluation metrics.
    print("\n模型评估指标:")
    print(f"MSE: {metrics['mse']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"MAE: {metrics['mae']:.4f}")
    print(f"R²: {metrics['r2']:.4f}")
    print(f"MAPE: {metrics['mape']:.2f}%")
    print(f"训练时间: {training_time:.2f}秒")

    # Payload of the final model, taken from the state after the last epoch.
    final_model_data = {
        'epoch': epochs,  # final epoch
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_losses[-1],
        'test_loss': test_losses[-1],
        'train_losses': train_losses,
        'test_losses': test_losses,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'config': dict(model_config),
        'metrics': metrics,
        'loss_curve_path': loss_curve_path,
        'training_info': build_training_info(training_completed=True)
    }

    progress_manager.set_stage("model_saving", 50)

    # Save the final model under an epoch-based label.
    final_model_path = save_checkpoint(
        final_model_data, f"final_epoch_{epochs}", product_id, 'tcn',
        model_dir, store_id, training_mode, aggregation_method
    )

    progress_manager.set_stage("model_saving", 100)

    final_metrics = {
        'mse': metrics['mse'],
        'rmse': metrics['rmse'],
        'mae': metrics['mae'],
        'r2': metrics['r2'],
        'mape': metrics['mape'],
        'training_time': training_time,
        'final_epoch': epochs
    }

    emit_progress(f"模型训练完成！最终epoch: {epochs}", progress=100, metrics=final_metrics)

    return model, metrics, epochs, final_model_path