因子过拟合风险
因子构造和筛选过程中,容易出现过拟合问题。建议:1)使用样本外验证;2)控制因子复杂度;3)避免过度数据挖掘;4)进行稳健性检验。
项目一:多因子选股策略
因子是股票收益的驱动因素。选择有效的因子是构建多因子模型的核心。本节将介绍因子分类、构造方法和有效性检验。
import pandas as pd
import numpy as np
# 价值因子构造示例
def calculate_value_factor(df):
    """
    Compute a composite value factor.

    The factor is the equal-weighted average of the z-scored inverses of
    three valuation ratios: 1/PE, 1/PB and 1/PS.

    Args:
        df: DataFrame with ``pe_ratio``, ``pb_ratio`` and ``ps_ratio``
            columns.

    Returns:
        Series aligned with ``df.index``. A row is NaN when any of its
        three ratios is missing or zero.
    """
    normalized = []
    for column in ('pe_ratio', 'pb_ratio', 'ps_ratio'):
        ratio = df[column]
        # A zero ratio would invert to inf, which turns the column mean/std
        # into inf/NaN and wipes out the z-score of every row. Treat zero as
        # missing so that only the offending row is affected.
        inverse = 1 / ratio.where(ratio != 0)
        # Cross-sectional z-score of the inverted ratio
        normalized.append((inverse - inverse.mean()) / inverse.std())
    # Equal-weighted composite of the three standardized components
    return sum(normalized) / len(normalized)
# 动量因子构造示例
def calculate_momentum_factor(df, periods=(20, 60, 120)):
    """
    Compute a multi-horizon momentum factor.

    For each look-back period the percentage change of ``close`` over that
    many rows is computed. The results are combined with weights
    proportional to ``1 / period`` (normalized to sum to 1), so shorter
    horizons weigh more.

    Args:
        df: DataFrame with a ``close`` column.
        periods: Look-back lengths in rows; any iterable of positive
            integers. Defaults to the immutable tuple ``(20, 60, 120)``.

    Returns:
        Series of weighted momentum values aligned with ``df.index``.
        Leading rows that lack enough history for the longest period are
        NaN.
    """
    close = df['close']
    # Materialize once: the periods are iterated twice below, which would
    # exhaust a generator or other one-shot iterable.
    periods = list(periods)
    # Inverse-length weights, normalized so that they sum to 1
    inverse_lengths = [1 / period for period in periods]
    weights = np.array(inverse_lengths) / sum(inverse_lengths)
    momentum_signals = [close.pct_change(period) for period in periods]
    # Weighted average of the per-horizon momentum series
    return sum(m * w for m, w in zip(momentum_signals, weights))
# 质量因子构造示例
def calculate_quality_factor(df):
    """
    Compute a composite quality factor.

    The factor is the equal-weighted average of three z-scored components:
    ROE, ROA and net profit margin (``net_profit / revenue``).

    Args:
        df: DataFrame with ``roe``, ``roa``, ``net_profit`` and ``revenue``
            columns.

    Returns:
        Series aligned with ``df.index``. A row is NaN when any of its
        components is missing or its revenue is zero.
    """
    revenue = df['revenue']
    # Net profit margin = net profit / revenue. Zero revenue is treated as
    # missing: dividing by it yields inf, which would turn the margin's
    # mean/std into inf/NaN and wipe out the z-score of every row.
    profit_margin = df['net_profit'] / revenue.where(revenue != 0)
    components = (df['roe'], df['roa'], profit_margin)
    # Cross-sectional z-score of each component
    normalized = [(c - c.mean()) / c.std() for c in components]
    # Equal-weighted composite of the three standardized components
    return sum(normalized) / len(normalized)
IC(Information Coefficient,信息系数)衡量当期因子值与下期收益率之间的相关系数;下面的实现按日期逐期计算 Spearman 秩相关系数(即 Rank IC)。
def calculate_ic(factor, returns):
    """
    Compute the per-date rank IC of a factor.

    Args:
        factor: Series whose MultiIndex has a level named ``date``.
        returns: Series indexed the same way as ``factor``. Presumably the
            next-period return matching each factor value; confirm the
            alignment at the call site.

    Returns:
        Series of Spearman correlations between the factor and return
        cross-sections, one per unique date, indexed by date in order of
        first appearance.
    """
    dates = factor.index.get_level_values('date').unique()
    # xs(level='date') picks the cross-section regardless of where the date
    # level sits in the MultiIndex, and drops that level from the result.
    ic_values = [
        factor.xs(date, level='date').corr(
            returns.xs(date, level='date'), method='spearman'
        )
        for date in dates
    ]
    # Keep the date labels so the IC series can be aligned, plotted or
    # resampled over time.
    return pd.Series(ic_values, index=dates, dtype=float)
# Mean IC and information ratio (IR = mean IC / standard deviation of IC).
# NOTE(review): `ic` is not assigned anywhere in this snippet; it is
# presumably the Series returned by calculate_ic - confirm before running.
ic_mean, ic_std = ic.mean(), ic.std()
ir = ic_mean / ic_std
将股票按因子值分成若干组,比较各组收益率差异
def group_backtest(factor, returns, n_groups=5):
    """
    Run a quantile (layered) backtest of a factor.

    Args:
        factor: Series of factor values.
        returns: Returns indexed like ``factor``.
        n_groups: Number of quantile buckets to form.

    Returns:
        Series of length ``n_groups`` holding the mean return of each
        bucket, ordered from the lowest-factor bucket (position 0) to the
        highest.
    """
    # Integer bucket label (0 .. n_groups - 1) for every observation
    bucket = pd.qcut(factor, n_groups, labels=False)
    # Mean return of the observations that fall into each bucket
    return pd.Series([returns[bucket == label].mean() for label in range(n_groups)])
检验因子值与收益是否呈单调关系
from scipy.stats import spearmanr
def monotonicity_test(group_returns):
    """
    Test whether group returns move monotonically with group order.

    Args:
        group_returns: Sequence of per-group returns, ordered by group.

    Returns:
        Tuple ``(correlation, p_value)`` from the Spearman rank correlation
        between each group's position and its return.
    """
    # Positions 0 .. n-1 stand in for the factor-sorted group order
    positions = list(range(len(group_returns)))
    outcome = spearmanr(positions, group_returns)
    return outcome[0], outcome[1]
| 指标 | 标准 | 说明 |
|---|---|---|
| IC均值 | > 3% | 因子预测能力 |
| IR(信息比率) | > 0.5 | 因子稳定性 |
| 多空收益 | > 5% | 分层回测表现 |
| 单调性 | > 0.3 | 因子排序一致性(分组序号与各组收益的 Spearman 相关系数) |
| 换手率 | < 80% | 交易成本考量 |
去除因子之间的相关性,使因子相互独立
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def orthogonalize_factors(factors_df):
    """
    Decorrelate a set of factors with PCA.

    The columns are standardized with ``StandardScaler`` and then projected
    onto their principal components with ``PCA``.

    Args:
        factors_df: DataFrame with one column per factor.

    Returns:
        DataFrame indexed like ``factors_df``. Its columns are the principal
        components, labelled with default integer positions rather than the
        original factor names.
    """
    # Standardize first, then rotate onto the principal axes
    standardized = StandardScaler().fit_transform(factors_df)
    components = PCA().fit_transform(standardized)
    return pd.DataFrame(components, index=factors_df.index)
# Correlation matrix of the raw factors, before orthogonalization.
# NOTE(review): `factors_df` is not assigned anywhere in this snippet; it
# must be supplied by the surrounding code.
corr_before = factors_df.corr()
# Correlation matrix of the orthogonalized factors, for comparison with
# corr_before.
factors_orth = orthogonalize_factors(factors_df)
corr_after = factors_orth.corr()
因子构造和筛选过程中,容易出现过拟合问题。建议:1)使用样本外验证;2)控制因子复杂度;3)避免过度数据挖掘;4)进行稳健性检验。