数据清洗注意事项
1. 保留原始数据副本,避免不可逆操作
2. 记录清洗步骤和决策依据
3. 对异常值进行分析而非简单删除
4. 注意数据清洗可能引入偏差
5. 定期审查清洗规则的有效性
Python数据处理基础
数据清洗是量化交易数据处理中最关键的步骤之一。高质量的数据是策略有效性的基础。 本节将系统介绍数据清洗的各种技术和预处理方法。
import pandas as pd
import numpy as np
# 加载数据
df = pd.read_csv('stock_data.csv')
# 基本检查
print(df.info()) # 数据类型和缺失值
print(df.describe()) # 统计描述
print(df.isnull().sum()) # 缺失值统计
# 重复值检查
print(df.duplicated().sum())
# 异常值检查(3σ原则)
mean = df['price'].mean()
std = df['price'].std()
outliers = df[(df['price'] < mean - 3*std) | (df['price'] > mean + 3*std)]
# 缺失值统计
missing_count = df.isnull().sum()
missing_percent = df.isnull().sum() / len(df) * 100
missing_table = pd.DataFrame({
'Count': missing_count,
'Percent': missing_percent
})
# 可视化缺失值
import missingno as msno
msno.matrix(df)
msno.heatmap(df)
# 删除缺失值
df_dropped = df.dropna() # 删除含缺失的行
df_dropped = df.dropna(subset=['price']) # 删除特定列缺失的行
df_dropped = df.dropna(thresh=3) # 至少3个非缺失值
# 填充缺失值
df_filled = df.fillna(0) # 填充0
df_filled = df.fillna(method='ffill') # 前向填充
df_filled = df.fillna(method='bfill') # 后向填充
df_filled = df.fillna(df.mean()) # 均值填充
df_filled = df.fillna(df.median()) # 中位数填充
# 分组填充
df['price'] = df.groupby('symbol')['price'].transform(
lambda x: x.fillna(x.mean())
)
# 插值填充
df['price'] = df['price'].interpolate() # 线性插值
df['price'] = df['price'].interpolate(method='time') # 时间插值
# 1. 统计方法(Z-score)
from scipy import stats
z_scores = np.abs(stats.zscore(df['price']))
outliers_z = df[z_scores > 3]
# 2. IQR方法
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
# 3. 修改后的Z-score(针对偏态分布)
median = df['price'].median()
mad = np.median(np.abs(df['price'] - median))
modified_z_scores = 0.6745 * (df['price'] - median) / mad
outliers_mz = df[np.abs(modified_z_scores) > 3.5]
# 删除异常值
df_clean = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
# 限制异常值(Winsorization)
from scipy.stats import mstats
df['price_capped'] = mstats.winsorize(df['price'], limits=[0.05, 0.05])
# 用百分位值替换
df['price_clipped'] = df['price'].clip(lower=Q1, upper=Q3)
# 标记异常值
df['is_outlier'] = np.where(
(df['price'] < lower_bound) | (df['price'] > upper_bound),
1, 0
)
# 类型转换
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['volume'] = df['volume'].astype(int)
df['symbol'] = df['symbol'].astype('category')
# 数值型数据处理
df['price'] = pd.to_numeric(df['price'], errors='coerce')
# 字符串清洗
df['symbol'] = df['symbol'].str.upper().str.strip()
df['name'] = df['name'].str.replace(r'\s+', ' ', regex=True)
# Z-score标准化
df['price_zscore'] = (df['price'] - df['price'].mean()) / df['price'].std()
# Min-Max标准化
df['price_minmax'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
# 对数变换(针对右偏分布)
df['price_log'] = np.log(df['price'])
# Box-Cox变换
from scipy.stats import boxcox
df['price_boxcox'], _ = boxcox(df['price'][df['price'] > 0])
# 检查价格异常(如负价格、零价格)
df = df[(df['price'] > 0) & (df['price'] < df['price'].quantile(0.995))]
# 检查价格跳跃(单日变化过大)
df['price_change'] = df['price'].pct_change()
df = df[np.abs(df['price_change']) < 0.5] # 过滤单日涨跌幅超过50%的数据
# 复权处理
df['adj_close'] = df['close'] * df['adj_factor']
# 检查成交量异常
df = df[df['volume'] >= 0]
df = df[df['volume'] < df['volume'].quantile(0.99)]
# 成交量异常比例
df['volatility'] = df['high'] - df['low']
df['volume_price_ratio'] = df['volume'] / df['volatility']
# 时区处理
df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
df['datetime'] = df['datetime'].dt.tz_convert('Asia/Shanghai')
# 交易日历
from pandas.tseries.offsets import BDay
df['is_trading_day'] = df['date'].dt.weekday < 5
1. 保留原始数据副本,避免不可逆操作
2. 记录清洗步骤和决策依据
3. 对异常值进行分析而非简单删除
4. 注意数据清洗可能引入偏差
5. 定期审查清洗规则的有效性