XGBoost for Trading
Gradient boosting library excelling at tabular data and feature-based trading strategies.
Advanced
Machine Learning
Installation
$ pip install xgboost scikit-learn yfinance matplotlib
Key Features
Gradient Boosting
Ensemble of decision trees for powerful predictions.
High Performance
Optimized C++ backend with GPU acceleration (see the sketch below).
Feature Importance
Built-in feature importance analysis.
Handles Tabular Data
Excellent for structured financial features.
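As a quick illustration of the GPU support mentioned above, a minimal sketch, assuming XGBoost 2.x (where device='cuda' replaces the older tree_method='gpu_hist') and a CUDA-capable GPU:
Python
import xgboost as xgb
# Hypothetical GPU-enabled classifier; requires a CUDA-capable GPU and XGBoost 2.x
gpu_model = xgb.XGBClassifier(tree_method='hist', device='cuda')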
Code Examples
Feature Engineering for Trading
Create features from price data
Python
import xgboost as xgb
import pandas as pd
import numpy as np
import yfinance as yf
# Download data
df = yf.download('EURUSD=X', start='2020-01-01', end='2024-01-01')
# Newer yfinance versions may return MultiIndex columns; flatten to plain names
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
# Create features
df['Returns'] = df['Close'].pct_change()
df['SMA_10'] = df['Close'].rolling(10).mean()
df['SMA_50'] = df['Close'].rolling(50).mean()
# RSI (14-period, simple-moving-average variant)
gain = df['Close'].diff().clip(lower=0).rolling(14).mean()
loss = (-df['Close'].diff().clip(upper=0)).rolling(14).mean()
df['RSI'] = 100 - (100 / (1 + gain / loss))
df['Volatility'] = df['Returns'].rolling(20).std()
df['Momentum'] = df['Close'].pct_change(10)
# Target: 1 if price goes up next day, 0 otherwise
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
# Drop NaN
df = df.dropna()
print(f"Features: {df.shape[1] - 1}, Samples: {len(df)}")
Train XGBoost Classifier
Train model to predict direction
Python
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Features and target
features = ['Returns', 'SMA_10', 'SMA_50', 'RSI', 'Volatility', 'Momentum']
X = df[features]
y = df['Target']
# Time-series split
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# Train XGBoost
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='binary:logistic',
    random_state=42
)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(classification_report(y_test, y_pred))
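Directional accuracy near 50% is common on daily FX data, so compare against a naive baseline before reading much into the score; a minimal sketch:
Python
# Baseline: always predict the training set's majority class
majority_class = y_train.mode()[0]
baseline_acc = (y_test == majority_class).mean()
print(f"Majority-class baseline: {baseline_acc:.2%}")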
Analyze Feature Importance
Understand which features drive predictions
Python
import matplotlib.pyplot as plt
# Get feature importance
importance = model.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': importance
}).sort_values('Importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Plot
xgb.plot_importance(model, importance_type='gain')
plt.title('XGBoost Feature Importance')
plt.tight_layout()
plt.show()
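XGBoost reports several importance types ('weight', 'gain', 'cover'), and rankings can differ between them; a minimal sketch comparing them via the underlying booster:
Python
# Compare rankings across importance types
booster = model.get_booster()
for imp_type in ['weight', 'gain', 'cover']:
    print(imp_type, booster.get_score(importance_type=imp_type))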
Hyperparameter Optimization
Find optimal parameters with GridSearchCV
Python
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
# Parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}
# Grid search
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2%}")
# Use best model
best_model = grid_search.best_estimator_
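The grid above trains 81 parameter combinations across 5 folds; if that is too slow, scikit-learn's RandomizedSearchCV samples a fixed number of combinations instead. A minimal sketch reusing the same grid:
Python
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_distributions=param_grid,
    n_iter=20,            # sample 20 of the 81 combinations
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
print(f"Best params: {random_search.best_params_}")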
Price Return Prediction
Predict continuous returns instead of direction
Python
# Regression target: next day's return
df['Target_Return'] = df['Close'].pct_change().shift(-1)
df = df.dropna()
X = df[features]
y = df['Target_Return']
# Recompute the split point: dropna() shortened the frame
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# XGBoost Regressor
reg_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
    random_state=42
)
reg_model.fit(X_train, y_train)
# Evaluate
from sklearn.metrics import mean_squared_error, r2_score
y_pred = reg_model.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.6f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
Generate Trading Signals
Use predictions for trading decisions
Python
def generate_trading_signals(model, data, features, threshold=0.6):
    """Generate BUY/SELL/HOLD signals from class probabilities."""
    X = data[features]
    # predict_proba columns follow class order: [P(down), P(up)]
    proba = model.predict_proba(X)
    signals = []
    for p_down, p_up in proba:
        if p_up > threshold:
            signals.append('BUY')
        elif p_down > threshold:
            signals.append('SELL')
        else:
            signals.append('HOLD')
    return signals
# Generate signals
signals = generate_trading_signals(best_model, df.iloc[split_idx:], features, threshold=0.55)
# Backtest: the signal on day t is applied to day t+1's return
df_test = df.iloc[split_idx:].copy()
df_test['Signal'] = signals
df_test['Strategy_Return'] = df_test['Returns'].shift(-1) * df_test['Signal'].map({'BUY': 1, 'SELL': -1, 'HOLD': 0})
df_test = df_test.dropna(subset=['Strategy_Return'])  # last row has no next-day return
cumulative = (1 + df_test['Strategy_Return']).cumprod()
print(f"Strategy Return: {(cumulative.iloc[-1] - 1):.2%}")
Save and Load Model
Persist model for production use
Python
import joblib
# Save model
model.save_model('xgb_trading_model.json')
# Or with joblib (includes sklearn wrapper)
joblib.dump(best_model, 'xgb_model.pkl')
# Load model
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgb_trading_model.json')
# Or with joblib
loaded_model = joblib.load('xgb_model.pkl')
# Verify
pred = loaded_model.predict(X_test.iloc[:5])
print(f"Predictions: {pred}")
Best Practices
Time-Series Split
Never use a random train/test split on market data; always split in time order so the model never trains on the future.
Feature Scaling Not Required
Tree-based models don't need feature normalization or scaling.
Avoid Overfitting
Use early stopping and time-series cross-validation (see the sketch after this list).
Feature Engineering Is Key
Model quality depends heavily on the features you engineer; invest time there first.
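A minimal early-stopping sketch, assuming XGBoost 2.x where early_stopping_rounds is a constructor argument (older versions accept it in fit()); in practice the eval set should be a validation slice rather than the final test set:
Python
es_model = xgb.XGBClassifier(
    n_estimators=1000,           # upper bound; early stopping picks the best round
    learning_rate=0.05,
    max_depth=5,
    early_stopping_rounds=20,    # stop after 20 rounds without eval improvement
    eval_metric='logloss',
    random_state=42
)
es_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(f"Best iteration: {es_model.best_iteration}")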