56 lines
2.3 KiB
Python
56 lines
2.3 KiB
Python
import pandas as pd
|
|
import os
|
|
|
|
def analyze_parquet_files(
    data_path: str = 'data',
    current_file_name: str = 'timeseries_training_data_sample_10s50p.parquet',
    new_file_name: str = 'old_5shops_50skus.parquet',
) -> None:
    """Print a structural comparison report for two Parquet data files.

    For each file the report prints the column/dtype summary, the first
    rows, the total row count and the number of unique stores and products.
    The two files are analysed independently: a problem with one of them is
    printed and does not prevent the other from being reported.

    Args:
        data_path: Directory that contains both Parquet files.
        current_file_name: File name of the "current" dataset, which is
            summarised through its ``store_id`` / ``product_id`` columns.
        new_file_name: File name of the "new" dataset, which is summarised
            through its ``subbh`` / ``hh`` columns.

    Returns:
        None. All results are written to standard output.
    """
    current_data_file = os.path.join(data_path, current_file_name)
    new_data_file = os.path.join(data_path, new_file_name)

    print("=" * 50)
    print("数据文件差异分析报告")
    print("=" * 50)

    # --- Section 1: the current data file ---
    print(f"\n--- 1. 分析当前数据: {current_data_file} ---\n")
    _run_report_section(_report_current_data, current_data_file)

    print("\n" + "-" * 40 + "\n")

    # --- Section 2: the new data file ---
    print(f"\n--- 2. 分析新数据: {new_data_file} ---\n")
    _run_report_section(_report_new_data, new_data_file)


def _run_report_section(report, file_path):
    """Load *file_path* and pass the DataFrame to *report*, printing any failure."""
    if not os.path.exists(file_path):
        print(f"错误: 文件不存在 {file_path}")
        return
    try:
        report(pd.read_parquet(file_path))
    except Exception as e:
        # Deliberate best-effort boundary: this is a diagnostic report, so a
        # failure is shown to the user instead of aborting the whole run.
        print(f"\n分析过程中出现错误: {e}")


def _print_unique_count(df, label, column):
    """Print the number of distinct values in *column*, or note that it is absent."""
    if column in df.columns:
        print(f"{label}: {df[column].nunique()}")
    else:
        # A missing column is itself a schema difference worth reporting.
        print(f"{label}: 列 '{column}' 不存在")


def _report_current_data(df):
    """Print the summary of the current dataset."""
    print("【列名和数据类型】:")
    df.info(verbose=False)
    print("\n【前5行样本数据】:")
    print(df.head())
    print(f"\n【总行数】: {len(df)}")
    _print_unique_count(df, "【唯一店铺数】", 'store_id')
    _print_unique_count(df, "【唯一商品数】", 'product_id')


def _report_new_data(df):
    """Print the summary of the new dataset, showing only a few key columns."""
    print("【列名和数据类型 (仅显示部分)】:")
    # Request the more detailed summary, including non-null counts.
    # NOTE(review): verbose=True appears to force the full per-column listing
    # regardless of max_cols -- confirm against the pandas documentation.
    df.info(verbose=True, max_cols=10, show_counts=True)
    print("\n【所有列名列表】:")
    print(df.columns.tolist())
    print("\n【前5行样本数据 (部分列)】:")
    # Key columns chosen for the sample preview.
    display_cols = [
        'subbh',
        'hh',
        'kdrq',
        'net_sales_quantity',
        'is_weekend',
        'sales_quantity_rolling_mean_7d',
        'province',
        'temperature_2m_mean',
        'brand_encoded',
    ]
    # Only select columns that exist; selecting an absent one raises KeyError.
    available = [col for col in display_cols if col in df.columns]
    missing = [col for col in display_cols if col not in df.columns]
    if missing:
        print(f"注意: 以下列不存在, 已跳过: {missing}")
    # Fall back to the full frame when none of the key columns are present.
    print(df[available].head() if available else df.head())
    print(f"\n【总行数】: {len(df)}")
    _print_unique_count(df, "【唯一店铺数 (subbh)】", 'subbh')
    _print_unique_count(df, "【唯一商品数 (hh)】", 'hh')
|
|
|
|
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    analyze_parquet_files()
|