The data file storage scheme has been changed as follows:

### 1.2. File Storage Locations

- **Final artifacts**: all final models, metadata files, loss plots, etc. live directly under the `saved_models/` root directory.
- **Intermediate files**: all checkpoint files produced during training live under `saved_models/checkpoints/`.

### 1.3. Filename Generation Rules

1. **Build the logical path**: determined by the training parameters (mode, scope, type, version).
   - *Example*: `product/P001_all/mlstm/v2`
2. **Generate the filename prefix**: replace every `/` in the logical path with `_`.
   - *Example*: `product_P001_all_mlstm_v2`
3. **Append the file-type suffix** to the prefix:
   - `_model.pth`
   - `_loss_curve.png`
   - `_checkpoint_best.pth`
   - `_checkpoint_epoch_{N}.pth`

#### Complete examples:

- **Final model**: `saved_models/product_P001_all_mlstm_v2_model.pth`
- **Best checkpoint**: `saved_models/checkpoints/product_P001_all_mlstm_v2_checkpoint_best.pth`
- **Epoch 50 checkpoint**: `saved_models/checkpoints/product_P001_all_mlstm_v2_checkpoint_epoch_50.pth`
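These rules are mechanical enough to capture in a few lines. A minimal sketch (the helper name `build_model_filename` is illustrative, not part of the project's API):

```python
import os

def build_model_filename(logical_path: str, suffix: str, root: str = "saved_models") -> str:
    """Map a logical path like 'product/P001_all/mlstm/v2' to a concrete file path."""
    prefix = logical_path.replace("/", "_")  # e.g. product_P001_all_mlstm_v2
    # Per section 1.2: checkpoints go under saved_models/checkpoints/, final
    # artifacts directly under the root
    subdir = "checkpoints" if "checkpoint" in suffix else ""
    return os.path.join(root, subdir, f"{prefix}{suffix}")

# build_model_filename("product/P001_all/mlstm/v2", "_model.pth")
#   -> saved_models/product_P001_all_mlstm_v2_model.pth
# build_model_filename("product/P001_all/mlstm/v2", "_checkpoint_best.pth")
#   -> saved_models/checkpoints/product_P001_all_mlstm_v2_checkpoint_best.pth
```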
The refactored training script (297 lines, 11 KiB, Python):

```python
import os
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost.callback import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Import the project's utility functions and configuration
from utils.multi_store_data_utils import load_multi_store_data
from core.config import DEFAULT_DATA_PATH
from utils.file_save import ModelPathManager
from analysis.metrics import evaluate_model

# Refactored callback compatible with the native XGBoost API
class EpochCheckpointCallback(xgb.callback.TrainingCallback):
    def __init__(self, save_period, payload, base_path):
        super().__init__()
        self.save_period = save_period
        self.payload = payload
        self.base_path = base_path
        self.best_score = float('inf')

    def _save_checkpoint(self, model, path_suffix):
        """Helper that saves a model/metadata checkpoint pair."""
        metadata_path = self.base_path.replace('_model.pth', f'_{path_suffix}.pth')
        model_file_path = metadata_path.replace('.pth', '.xgb')

        # Make sure the target directory exists
        os.makedirs(os.path.dirname(metadata_path), exist_ok=True)

        # Save the native Booster model
        model.save_model(model_file_path)

        # Update the model-file reference in the payload
        self.payload['model_file'] = model_file_path
        joblib.dump(self.payload, metadata_path)

        print(f"[Checkpoint] Saved checkpoint to: {metadata_path}")

    def after_iteration(self, model, epoch, evals_log):
        # Read the current validation score ('test' is the validation set here)
        current_score = evals_log['test']['rmse'][-1]

        # Save the best model so far
        if current_score < self.best_score:
            self.best_score = current_score
            self._save_checkpoint(model, 'checkpoint_best')

        # Save a periodic checkpoint
        if (epoch + 1) % self.save_period == 0:
            self._save_checkpoint(model, f'checkpoint_epoch_{epoch + 1}')

        return False  # returning False tells XGBoost to continue training
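
# Example of the path rewriting above, assuming
#   base_path = 'saved_models/product_P001_all_xgboost_v2_model.pth':
# _save_checkpoint(model, 'checkpoint_best') then writes
#   'saved_models/product_P001_all_xgboost_v2_checkpoint_best.pth' (joblib metadata)
#   'saved_models/product_P001_all_xgboost_v2_checkpoint_best.xgb' (native Booster)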

def create_dataset(data, look_back=7):
    """
    Convert time-series data into a supervised-learning format.

    :param data: input DataFrame containing the features and target.
    :param look_back: size of the time window used for prediction.
    :return: X (features), y (targets)
    """
    X, y = [], []
    feature_columns = [col for col in data.columns if col != 'date']

    for i in range(len(data) - look_back):
        # Flatten all features inside the look_back window
        features = data[feature_columns].iloc[i:(i + look_back)].values.flatten()
        X.append(features)
        # The target is the first sales value after the window
        y.append(data['sales'].iloc[i + look_back])

    return np.array(X), np.array(y)
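
# Worked example: with look_back=7 and a DataFrame of 100 rows carrying the 7
# feature columns (plus 'date'), create_dataset returns X with shape (93, 49)
# and y with shape (93,).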

def train_product_model_with_xgboost(
        product_id,
        store_id=None,
        epochs=100,  # in XGBoost terms, this is the number of boosting rounds (n_estimators)
        look_back=7,
        socketio=None,
        task_id=None,
        version='v1',
        path_info=None,
        **kwargs):
    """
    Train a product sales forecasting model with XGBoost.
    """

    def emit_progress(message, progress=None):
        if socketio and task_id:
            payload = {'task_id': task_id, 'message': message}
            if progress is not None:
                payload['progress'] = progress
            socketio.emit('training_update', payload, namespace='/api/training', room=task_id)
        print(f"[{task_id}] {message}")

    try:
        model_path = None
        emit_progress("Starting XGBoost model training...", 0)

        # 1. Load the data
        # Use the correct loader and the data path from config
        full_df = load_multi_store_data(DEFAULT_DATA_PATH)

        # Filter by store_id and product_id
        if store_id:
            df = full_df[(full_df['product_id'] == product_id) & (full_df['store_id'] == store_id)].copy()
        else:
            # Without a store_id, aggregate the product's data across all stores
            df = full_df[full_df['product_id'] == product_id].groupby('date').agg({
                'sales': 'sum',
                'weekday': 'first',
                'month': 'first',
                'is_holiday': 'max',
                'is_weekend': 'max',
                'is_promotion': 'max',
                'temperature': 'mean'
            }).reset_index()

        if df.empty:
            raise ValueError(f"Loaded data is empty (product: {product_id}, store: {store_id}); cannot train.")

        # Make sure the data is sorted by date
        df = df.sort_values('date').reset_index(drop=True)

        emit_progress("Data loading complete.", 10)

        # 2. Build the dataset
        features_to_use = ['sales', 'weekday', 'month', 'is_holiday', 'is_weekend', 'is_promotion', 'temperature']
        # Make sure every required feature exists
        for col in features_to_use:
            if col not in df.columns:
                # Fill missing features with 0
                df[col] = 0

        df_features = df[['date'] + features_to_use]

        X, y = create_dataset(df_features, look_back)
        if X.shape[0] == 0:
            raise ValueError("The dataset has 0 samples; check the data volume and the look_back parameter.")

        emit_progress(f"Dataset created, sample count: {X.shape[0]}", 20)

        # 3. Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale the data
        scaler_X = MinMaxScaler()
        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)

        scaler_y = MinMaxScaler()
        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
        # y_test is left unscaled; metrics are computed against inverse-transformed predictions

        emit_progress("Data split and scaling complete.", 30)

        # 4. Switch to the native XGBoost API
        params = {
            'learning_rate': kwargs.get('learning_rate', 0.1),
            'max_depth': kwargs.get('max_depth', 5),
            'subsample': kwargs.get('subsample', 0.8),
            'colsample_bytree': kwargs.get('colsample_bytree', 0.8),
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'random_state': 42
        }

        dtrain = xgb.DMatrix(X_train_scaled, label=y_train_scaled.ravel())
        dtest = xgb.DMatrix(X_test_scaled, label=scaler_y.transform(y_test.reshape(-1, 1)).ravel())

        emit_progress("Starting model training...", 40)

        # Define the evaluation sets
        evals = [(dtrain, 'train'), (dtest, 'test')]

        # Prepare the callbacks
        callbacks = []
        checkpoint_interval = kwargs.get('checkpoint_interval', 10)  # save every 10 rounds by default
        if path_info and path_info.get('model_path') and checkpoint_interval > 0:
            # Payload used for saving; the model file reference is injected
            # dynamically inside the callback
            checkpoint_payload = {
                'metrics': {},  # checkpoints do not store final metrics
                'scaler_X': scaler_X,
                'scaler_y': scaler_y,
                'config': {
                    'look_back': look_back,
                    'features': features_to_use,
                    'product_id': product_id,
                    'store_id': store_id,
                    'version': version
                }
            }
            checkpoint_callback = EpochCheckpointCallback(
                save_period=checkpoint_interval,
                payload=checkpoint_payload,
                base_path=path_info['model_path']
            )
            callbacks.append(checkpoint_callback)

        # Add the early-stopping callback (save_best removed)
        callbacks.append(EarlyStopping(rounds=10))

        # Container for the evaluation results
        evals_result = {}

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=epochs,
            evals=evals,
            callbacks=callbacks,
            evals_result=evals_result,
            verbose_eval=False
        )
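        # Note: if EarlyStopping fires, recent XGBoost versions expose the best
        # round via model.best_iteration, and evals_result only covers the
        # boosting rounds that actually ran.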
        emit_progress("Model training complete.", 80)

        # Plot and save the loss curve
        if path_info and path_info.get('model_path'):
            try:
                loss_curve_path = path_info['model_path'].replace('_model.pth', '_loss_curve.png')
                results = evals_result
                train_rmse = results['train']['rmse']
                test_rmse = results['test']['rmse']
                num_epochs = len(train_rmse)
                x_axis = range(num_epochs)

                fig, ax = plt.subplots(figsize=(10, 6))
                ax.plot(x_axis, train_rmse, label='Train')
                ax.plot(x_axis, test_rmse, label='Test')
                ax.legend()
                plt.ylabel('RMSE')
                plt.xlabel('Epoch')
                plt.title('XGBoost RMSE Loss Curve')
                plt.savefig(loss_curve_path)
                plt.close(fig)
                emit_progress(f"Loss curve saved to: {loss_curve_path}")
            except Exception as e:
                emit_progress(f"Warning: failed to plot the loss curve: {str(e)}")

        # 5. Evaluate the model
        dtest_pred = xgb.DMatrix(X_test_scaled)
        y_pred_scaled = model.predict(dtest_pred)
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

        metrics = {
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'MAE': mean_absolute_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred)
        }
        emit_progress(f"Model evaluation complete: {metrics}", 90)

        # 6. Save the model (native API)
        if path_info and path_info.get('model_path'):
            metadata_path = path_info['model_path']
            # Save the native Booster model with the .xgb extension
            model_file_path = metadata_path.replace('.pth', '.xgb')

            # Make sure the target directory exists
            os.makedirs(os.path.dirname(metadata_path), exist_ok=True)

            # Save the Booster with the native method
            model.save_model(model_file_path)
            emit_progress(f"Native XGBoost model saved to: {model_file_path}")

            # Save the metadata (including the model file path)
            metadata_payload = {
                'model_file': model_file_path,  # reference to the model file
                'metrics': metrics,
                'scaler_X': scaler_X,
                'scaler_y': scaler_y,
                'config': {
                    'look_back': look_back,
                    'features': features_to_use,
                    'product_id': product_id,
                    'store_id': store_id,
                    'version': version
                }
            }
            joblib.dump(metadata_payload, metadata_path)
            model_path = metadata_path  # make sure model_path is assigned
            emit_progress(f"Model metadata saved to: {metadata_path}", 100)
        else:
            emit_progress("Warning: no path_info provided; the model was not saved.", 100)

        return metrics, model_path

    except Exception as e:
        emit_progress(f"XGBoost training failed: {str(e)}", 100)
        import traceback
        traceback.print_exc()
        return {'error': str(e)}, None
```
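
Restoring a model saved in this two-file format is the mirror image of step 6: load the metadata with `joblib`, then load the referenced `.xgb` file into a fresh Booster. A minimal sketch (the path is illustrative):

```python
import joblib
import xgboost as xgb

# The .pth file holds the metadata payload: metrics, scalers, config,
# and the 'model_file' reference to the native .xgb model
payload = joblib.load("saved_models/product_P001_all_xgboost_v1_model.pth")

booster = xgb.Booster()
booster.load_model(payload['model_file'])  # the .xgb file saved next to the metadata

# Inference must replay the training-time scaling: scale the flattened
# look_back window with scaler_X, predict, then invert the target scaling
scaler_X, scaler_y = payload['scaler_X'], payload['scaler_y']
# X_window: shape (1, look_back * len(features)), built the same way as create_dataset()
# y_pred = scaler_y.inverse_transform(
#     booster.predict(xgb.DMatrix(scaler_X.transform(X_window))).reshape(-1, 1))
```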