I really wish factor momentum would work - help me show that it actually does

I tried, but...

Trending versus Mean Reversion in Factor Returns: A Technical Analysis

I analyzed the weekly returns of the top bucket for 31 different stock market factors to test whether past factor returns predict future factor returns. The key finding: the predictive relationships are weak (only three factors reach statistical significance), and mean reversion dominates. NOTE: with 31 factors and multiple window sizes tested per factor, you would expect a few significant results by chance alone.
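
As a rough sanity check on that note, here is a minimal back-of-the-envelope sketch. It assumes independent tests; the 8 overlapping SMA windows are in fact highly correlated, so the true expected count of chance "hits" lies somewhere between the two figures below.

n_factors, n_windows, alpha = 31, 8, 0.05
expected_one_test = n_factors * alpha            # ~1.6 false positives if each factor were tested once
p_any_window = 1 - (1 - alpha) ** n_windows      # ~0.34: chance at least one of the 8 windows looks significant
expected_any_window = n_factors * p_any_window   # ~10.4 false positives in the fully independent case
print(expected_one_test, p_any_window, expected_any_window)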

Methodology:

  • Tested whether the simple moving average (SMA) of each factor's weekly returns predicts the average return over the next 4 weeks (regression specification after this list)
  • Grid search over SMA window sizes: 4, 8, 12, 16, 20, 24, 28, and 32 weeks
  • Linear regression of the 4-week forward average return on the trailing SMA
  • Recorded correlations, slopes, p-values, R-squared, and statistical significance
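
Concretely, for each factor with weekly return r_t and each window w, the regression is

$$
\frac{1}{4}\sum_{i=1}^{4} r_{t+i} \;=\; \alpha + \beta\,\mathrm{SMA}_t(w) + \varepsilon_t,
\qquad \mathrm{SMA}_t(w) = \frac{1}{w}\sum_{j=0}^{w-1} r_{t-j}
$$

with trending corresponding to β > 0 and mean reversion to β < 0.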

Key Findings:

  1. Evidence of weak predictive relationships:

    • Correlations range from -0.0880 to +0.0339
    • Three factors show statistical significance (p < 0.05)
    • 21 of 31 factors show negative correlations (mean-reverting tendency)
    • All best-performing windows were 4 weeks

  2. Window Size Analysis:

    • The 4-week window emerged as optimal for every factor
    • All R-squared values are below 1%
    • Predominantly negative correlations, suggesting mean reversion

Results table (factor names removed):

Most significant regression results:

Code included for transparency and replication.

import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_predictive_power(data_series, window):
    """
    Analyze predictive power of SMA for average of next 4 weeks' returns
    """
    # Calculate the trailing SMA of weekly returns
    sma = data_series.rolling(window=window).mean()

    # Average return over the next 4 weeks
    forward_returns = pd.concat([data_series.shift(-i) for i in range(1, 5)], axis=1).mean(axis=1)

    # Drop the rolling warm-up period, the last 4 partially formed forward windows,
    # and any remaining NAs so both series stay aligned
    paired = pd.DataFrame({'sma': sma, 'fwd': forward_returns}).iloc[window:-4].dropna()

    # Regress forward returns on the SMA
    slope, intercept, r_value, p_value, std_err = stats.linregress(paired['sma'], paired['fwd'])

    # Correlation between the SMA and forward returns
    correlation = paired['sma'].corr(paired['fwd'])
    
    return {
        'window': window,
        'slope': slope,
        'p_value': p_value,
        'r_squared': r_value**2,
        'correlation': correlation,
        'std_err': std_err,
        'direction': 'positive' if slope > 0 else 'negative',
        'is_significant': p_value < 0.05
    }
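
# Optional variant, not part of the original analysis: because the 4-week forward
# windows overlap from one week to the next, the regression residuals are serially
# correlated and the plain OLS p-values above are optimistic. One standard remedy is
# Newey-West (HAC) standard errors, available through the statsmodels import at the top.
def analyze_predictive_power_hac(data_series, window, horizon=4):
    """
    Same regression as analyze_predictive_power, but with HAC (Newey-West)
    standard errors to account for the overlapping forward-return windows.
    """
    sma = data_series.rolling(window=window).mean()
    forward_returns = pd.concat([data_series.shift(-i) for i in range(1, horizon + 1)], axis=1).mean(axis=1)
    paired = pd.DataFrame({'sma': sma, 'fwd': forward_returns}).iloc[window:-horizon].dropna()
    model = sm.OLS(paired['fwd'], sm.add_constant(paired['sma'])).fit(
        cov_type='HAC', cov_kwds={'maxlags': horizon - 1})
    return {
        'window': window,
        'slope': model.params['sma'],
        'p_value': model.pvalues['sma'],
        'r_squared': model.rsquared,
    }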

def analyze_all_metrics(data, windows=[4, 8, 12, 16, 20, 24, 28, 32]):
    """
    Analyze predictive power for all metrics
    """
    all_results = {}
    significant_predictions = []
    
    # Analyze each column
    for column in data.columns:
        if column != 'date':
            print(f"\nAnalyzing {column}...")
            
            # Test each window size
            window_results = []
            for window in windows:
                result = analyze_predictive_power(data[column], window)
                window_results.append(result)
            
            # Convert results to DataFrame
            results_df = pd.DataFrame(window_results)
            all_results[column] = results_df
            
            # Track significant predictions
            best_window = results_df.loc[results_df['r_squared'].idxmax()]
            if best_window['is_significant']:
                significant_predictions.append({
                    'metric': column,
                    'window': int(best_window['window']),
                    'r_squared': best_window['r_squared'],
                    'direction': best_window['direction'],
                    'p_value': best_window['p_value'],
                    'correlation': best_window['correlation']
                })
    
    return all_results, significant_predictions

# Read and prepare data
file_path = "insert_your_path"  # Replace with your file path
data = pd.read_csv(file_path)
data['date'] = pd.to_datetime(data['date'], format='%m/%d/%y')
data = data.set_index('date')

# Perform analysis
all_results, significant_predictions = analyze_all_metrics(data)

# Create summary of best windows for each metric
summary_data = []
for metric, results in all_results.items():
    best_result = results.loc[results['r_squared'].idxmax()]  # best window for this metric, by R-squared
    summary_data.append({
        'metric': metric,
        'best_window': int(best_result['window']),
        'r_squared': best_result['r_squared'],
        'p_value': best_result['p_value'],
        'correlation': best_result['correlation'],
        'direction': best_result['direction'],
        'is_significant': best_result['is_significant']
    })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('r_squared', ascending=False)

# Print summary results
print("\n=== Summary of Predictive Power Analysis ===")
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
print(summary_df)

# Create visualization of top metrics
plt.figure(figsize=(20, 15))

# Select top 6 metrics by R-squared for visualization
top_metrics = list(summary_df.head(6)['metric'])

for i, metric in enumerate(top_metrics, 1):
    plt.subplot(3, 2, i)
    
    # Get best window for this metric
    best_window = int(summary_df[summary_df['metric'] == metric]['best_window'].values[0])
    
    # Calculate SMA and forward returns for this metric
    sma = data[metric].rolling(window=best_window).mean()
    forward_returns = pd.concat([data[metric].shift(-k) for k in range(1, 5)], axis=1).mean(axis=1)

    # Drop the warm-up period, partial forward windows, and any remaining NAs
    paired = pd.DataFrame({'sma': sma, 'fwd': forward_returns}).iloc[best_window:-4].dropna()

    # Create scatter plot
    plt.scatter(paired['sma'], paired['fwd'], alpha=0.5)

    # Add regression line
    z = np.polyfit(paired['sma'], paired['fwd'], 1)
    p = np.poly1d(z)
    plt.plot(paired['sma'], p(paired['sma']), "r--", alpha=0.8)
    
    plt.title(f'{metric}\nWindow={best_window}, R²={summary_df[summary_df["metric"]==metric]["r_squared"].values[0]:.4f}')
    plt.xlabel(f'{best_window}-Week SMA')
    plt.ylabel('Average of Next 4 Weeks Returns')
    plt.grid(True)

plt.tight_layout()
plt.show()

# Print key insights
print("\n=== Key Insights ===")
print(f"1. Total metrics analyzed: {len(summary_df)}")
print(f"2. Metrics with significant predictive power: {len(significant_predictions)}")
print("\n3. Top 5 most predictive metrics:")
print(summary_df.head().to_string())

# Additional analysis of significant predictors
if significant_predictions:
    print("\n4. Metrics with significant predictive power:")
    sig_df = pd.DataFrame(significant_predictions).sort_values('r_squared', ascending=False)
    print(sig_df.to_string())
else:
    print("\n4. No metrics showed significant predictive power")

# Create correlation heatmap for top metrics
top_10_metrics = summary_df.head(10)['metric'].tolist()
corr_matrix = data[top_10_metrics].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Top 10 Predictive Metrics')
plt.tight_layout()
plt.show()

# Save results
summary_df.to_csv('predictive_power_analysis.csv')
if significant_predictions:
    pd.DataFrame(significant_predictions).to_csv('significant_predictors.csv')
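
As a follow-up to the multiple-testing note at the top (not part of the original run), the best-window p-values collected in summary_df can be adjusted for false discoveries with statsmodels' multipletests. A minimal sketch, keeping in mind that these p-values already embed a best-of-8-windows selection:

from statsmodels.stats.multitest import multipletests

# Benjamini-Hochberg FDR adjustment of the best-window p-values
reject, p_adj, _, _ = multipletests(summary_df['p_value'], alpha=0.05, method='fdr_bh')
summary_df['p_value_fdr'] = p_adj
summary_df['significant_after_fdr'] = reject
print(summary_df[['metric', 'best_window', 'p_value', 'p_value_fdr', 'significant_after_fdr']])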