ShopTRAINING/server/trainers/transformer_trainer.py


"""
Pharmacy Sales Forecasting System - Transformer model training function
"""
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from models.transformer_model import TimeSeriesTransformer
from utils.data_utils import prepare_data, PharmacyDataset, prepare_sequences
from utils.multi_store_data_utils import get_store_product_sales_data, aggregate_multi_store_data
from utils.visualization import plot_loss_curve
from analysis.metrics import evaluate_model
from core.config import (
DEVICE, DEFAULT_MODEL_DIR, LOOK_BACK, FORECAST_HORIZON
)
from utils.training_progress import progress_manager
from utils.model_manager import model_manager
def train_product_model_with_transformer(
model_identifier: str,
training_df: pd.DataFrame,
feature_list: list,
training_mode: str,
epochs: int = 50,
sequence_length: int = LOOK_BACK,
forecast_horizon: int = FORECAST_HORIZON,
model_dir: str = DEFAULT_MODEL_DIR,
product_id: str = None,
store_id: str = None,
aggregation_method: str = None,
version: str = None,
socketio=None,
task_id: str = None,
progress_callback=None,
patience: int = 10,
learning_rate: float = 0.001,
clip_norm: float = 1.0,
**kwargs
):
"""
    Train a product sales forecasting model with a Transformer (new data pipeline version).
"""
def emit_progress(message, progress=None, metrics=None):
"""发送训练进度到前端"""
if socketio and task_id:
data = {
'task_id': task_id,
'message': message,
'timestamp': time.time()
}
if progress is not None:
data['progress'] = progress
if metrics is not None:
data['metrics'] = metrics
socketio.emit('training_progress', data, namespace='/training')
print(f"[{time.strftime('%H:%M:%S')}] {message}", flush=True)
import sys
sys.stdout.flush()
sys.stderr.flush()
emit_progress("开始Transformer模型训练...")
try:
from utils.training_progress import progress_manager
except ImportError:
class DummyProgressManager:
def set_stage(self, *args, **kwargs): pass
def start_training(self, *args, **kwargs): pass
def start_epoch(self, *args, **kwargs): pass
def update_batch(self, *args, **kwargs): pass
def finish_epoch(self, *args, **kwargs): pass
def finish_training(self, *args, **kwargs): pass
progress_manager = DummyProgressManager()
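    # Sanity check: one training sample needs sequence_length history rows plus forecast_horizon target rows.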
min_required_samples = sequence_length + forecast_horizon
if len(training_df) < min_required_samples:
error_msg = f"训练数据不足: 需要 {min_required_samples} 条记录, 但只有 {len(training_df)} 条。"
emit_progress(error_msg)
raise ValueError(error_msg)
product_name = training_df['product_name'].iloc[0] if 'product_name' in training_df.columns else model_identifier
emit_progress(f"开始为 '{product_name}' (标识: {model_identifier}) 训练Transformer模型")
    # --- Core of the new data pipeline ---
    emit_progress("Preprocessing data...")
    # 1. Process the data with the standardized prepare_data function
_, _, trainX, testX, trainY, testY, scaler_X, scaler_y = prepare_data(
training_df=training_df,
feature_list=feature_list,
target_column='net_sales_quantity',
sequence_length=sequence_length,
forecast_horizon=forecast_horizon
)
    # 2. Build the DataLoaders with the standardized prepare_sequences function
batch_size = 32
train_loader = prepare_sequences(trainX, trainY, batch_size)
test_loader = prepare_sequences(testX, testY, batch_size)
total_batches = len(train_loader)
total_samples = len(trainX)
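    # Share batch statistics with the progress manager (if it exposes these attributes)
    # so per-batch progress can be reported.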
if hasattr(progress_manager, 'total_batches_per_epoch'):
progress_manager.total_batches_per_epoch = total_batches
progress_manager.batch_size = batch_size
progress_manager.total_samples = total_samples
emit_progress("数据预处理完成,开始模型训练...")
input_dim = trainX.shape[2]
output_dim = forecast_horizon
hidden_size = 64
num_heads = 4
dropout_rate = 0.1
num_layers = 3
model = TimeSeriesTransformer(
num_features=input_dim,
d_model=hidden_size,
nhead=num_heads,
num_encoder_layers=num_layers,
dim_feedforward=hidden_size * 2,
dropout=dropout_rate,
output_sequence_length=output_dim,
seq_length=sequence_length,
batch_size=batch_size
)
model = model.to(DEVICE)
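    # MSE loss, Adam optimizer, and a scheduler that halves the learning rate when the
    # test loss plateaus (scheduler patience is half the early-stopping patience).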
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.5)
train_losses = []
test_losses = []
start_time = time.time()
    # Lock the version for this run so checkpoints and the final model share the same version number
current_version = model_manager.peek_next_version(
model_type='transformer',
product_id=product_id,
store_id=store_id,
training_mode=training_mode,
aggregation_method=aggregation_method
)
print(f"🔒 本次训练版本锁定为: {current_version}")
checkpoint_interval = max(1, epochs // 10)
best_loss = float('inf')
epochs_no_improve = 0
best_model_path = None
progress_manager.set_stage("model_training", 0)
emit_progress(f"开始训练 - 总epoch: {epochs}, 检查点间隔: {checkpoint_interval}, 耐心值: {patience}")
for epoch in range(epochs):
progress_manager.start_epoch(epoch)
model.train()
epoch_loss = 0
for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
optimizer.zero_grad()
loss.backward()
if clip_norm:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
optimizer.step()
epoch_loss += loss.item()
if batch_idx % 5 == 0 or batch_idx == len(train_loader) - 1:
current_lr = optimizer.param_groups[0]['lr']
progress_manager.update_batch(batch_idx, loss.item(), current_lr)
train_loss = epoch_loss / len(train_loader)
train_losses.append(train_loss)
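        # Validation pass: evaluate on the held-out loader without gradient tracking.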
progress_manager.set_stage("validation", 0)
model.eval()
test_loss = 0
with torch.no_grad():
for batch_idx, (X_batch, y_batch) in enumerate(test_loader):
X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
test_loss += loss.item()
if batch_idx % 3 == 0 or batch_idx == len(test_loader) - 1:
val_progress = (batch_idx / len(test_loader)) * 100
progress_manager.set_stage("validation", val_progress)
test_loss = test_loss / len(test_loader)
test_losses.append(test_loss)
scheduler.step(test_loss)
progress_manager.finish_epoch(train_loss, test_loss)
if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
progress = ((epoch + 1) / epochs) * 100
current_metrics = {
'train_loss': train_loss,
'test_loss': test_loss,
'epoch': epoch + 1,
'total_epochs': epochs
}
emit_progress(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}",
progress=progress, metrics=current_metrics)
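        # At each checkpoint interval (and on the last epoch), snapshot the full training
        # state; keep the best-performing snapshot separately and count non-improving
        # checkpoints for early stopping.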
if (epoch + 1) % checkpoint_interval == 0 or epoch == epochs - 1:
checkpoint_data = {
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'train_loss': train_loss,
'test_loss': test_loss,
'train_losses': train_losses,
'test_losses': test_losses,
'scaler_X': scaler_X,
'scaler_y': scaler_y,
'config': {
'input_dim': input_dim,
'output_dim': output_dim,
'hidden_size': hidden_size,
'num_heads': num_heads,
'dropout': dropout_rate,
'num_layers': num_layers,
'sequence_length': sequence_length,
'forecast_horizon': forecast_horizon,
'model_type': 'transformer'
},
'training_info': {
'product_id': product_id,
'product_name': product_name,
'training_mode': training_mode,
'store_id': store_id,
'aggregation_method': aggregation_method,
'timestamp': time.time()
}
}
if test_loss < best_loss:
best_loss = test_loss
                # Fix: record the path of the best model so it can be returned in the artifacts
best_model_path, _ = model_manager.save_model(
model_data=checkpoint_data,
product_id=product_id,
model_type='transformer',
store_id=store_id,
training_mode=training_mode,
aggregation_method=aggregation_method,
product_name=product_name,
version=f"{current_version}_best"
)
emit_progress(f"💾 保存最佳模型检查点 (epoch {epoch+1}, test_loss: {test_loss:.4f})")
epochs_no_improve = 0
else:
epochs_no_improve += 1
if (epoch + 1) % 10 == 0:
print(f"📊 Epoch {epoch+1}/{epochs}, 训练损失: {train_loss:.4f}, 测试损失: {test_loss:.4f}", flush=True)
if epochs_no_improve >= patience:
emit_progress(f"连续 {patience} 个epoch测试损失未改善提前停止训练。")
break
training_time = time.time() - start_time
progress_manager.set_stage("model_saving", 0)
emit_progress("训练完成,正在保存模型...")
model.eval()
with torch.no_grad():
all_test_X = []
all_test_Y = []
for X_batch, y_batch in test_loader:
all_test_X.append(X_batch)
all_test_Y.append(y_batch)
testX_tensor = torch.cat(all_test_X, dim=0)
testY_tensor = torch.cat(all_test_Y, dim=0)
test_pred = model(testX_tensor.to(DEVICE)).cpu().numpy()
test_true = testY_tensor.cpu().numpy()
test_pred_inv = scaler_y.inverse_transform(test_pred)
test_true_inv = scaler_y.inverse_transform(test_true)
metrics = evaluate_model(test_true_inv, test_pred_inv)
metrics['training_time'] = training_time
print(f"\n📊 模型评估指标:", flush=True)
print(f" MSE: {metrics['mse']:.4f}", flush=True)
print(f" RMSE: {metrics['rmse']:.4f}", flush=True)
print(f" MAE: {metrics['mae']:.4f}", flush=True)
print(f" R²: {metrics['r2']:.4f}", flush=True)
print(f" MAPE: {metrics['mape']:.2f}%", flush=True)
print(f" ⏱️ 训练时间: {training_time:.2f}", flush=True)
final_model_data = {
'epoch': epochs,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'train_loss': train_losses[-1],
'test_loss': test_losses[-1],
'train_losses': train_losses,
'test_losses': test_losses,
'scaler_X': scaler_X,
'scaler_y': scaler_y,
'config': {
'input_dim': input_dim,
'output_dim': output_dim,
'hidden_size': hidden_size,
'num_heads': num_heads,
'dropout': dropout_rate,
'num_layers': num_layers,
'sequence_length': sequence_length,
'forecast_horizon': forecast_horizon,
'model_type': 'transformer'
},
        'metrics': metrics,
'training_info': {
'product_id': product_id,
'product_name': product_name,
'training_mode': training_mode,
'store_id': store_id,
'aggregation_method': aggregation_method,
'timestamp': time.time(),
'training_completed': True
}
}
progress_manager.set_stage("model_saving", 50)
final_model_path, final_version = model_manager.save_model(
model_data=final_model_data,
product_id=product_id,
model_type='transformer',
store_id=store_id,
training_mode=training_mode,
aggregation_method=aggregation_method,
product_name=product_name,
version=current_version
)
progress_manager.set_stage("model_saving", 100)
emit_progress(f"模型已保存到 {final_model_path}")
print(f"💾 模型已保存到 {final_model_path}", flush=True)
final_metrics = {
'mse': metrics['mse'],
'rmse': metrics['rmse'],
'mae': metrics['mae'],
'r2': metrics['r2'],
'mape': metrics['mape'],
'training_time': training_time,
'final_epoch': epochs,
'version': final_version
}
    # Prepare scope and identifier to build a standardized file name
scope = training_mode
if scope == 'product':
identifier = model_identifier
elif scope == 'store':
identifier = store_id
elif scope == 'global':
identifier = aggregation_method
else:
        identifier = product_name  # fallback
    # Plot the loss curve with the version number in the file name
loss_curve_path = plot_loss_curve(
train_losses=train_losses,
val_losses=test_losses,
model_type='transformer',
scope=scope,
identifier=identifier,
        version=current_version,  # use the locked version
model_dir=model_dir
)
print(f"📈 带版本号的损失曲线已保存: {loss_curve_path}")
    # Record the loss-curve path in the model data
final_model_data['loss_curve_path'] = loss_curve_path
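    # Collect the paths and version of everything produced by this run for the caller.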
artifacts = {
"versioned_model": final_model_path,
"loss_curve_plot": loss_curve_path,
"best_model": best_model_path,
"version": final_version
}
return final_metrics, artifacts
# --- Register this trainer with the system ---
from models.model_registry import register_trainer
register_trainer('transformer', train_product_model_with_transformer)
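# Once registered under the 'transformer' key, the rest of the system can look this trainer
# up by model type through models.model_registry (the exact lookup helper lives in that module).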