
XGBoost for Trading

A gradient-boosting library that excels on tabular data, making it a strong fit for feature-based trading strategies.

Advanced
Machine Learning

Installation

$ pip install xgboost scikit-learn yfinance pandas matplotlib

Key Features

Gradient Boosting

Ensemble of decision trees for powerful predictions.

High Performance

Optimized C++ backend with optional GPU acceleration (see the sketch at the end of this section).

Feature Importance

Built-in feature importance analysis.

Handles Tabular Data

Excellent for structured financial features.
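
A minimal sketch of the constructor parameters behind the performance claims above, assuming XGBoost 2.x (1.x releases use tree_method='gpu_hist' instead of the device parameter):

Python
import xgboost as xgb
# Histogram-based tree construction; runs on the CPU by default
cpu_model = xgb.XGBClassifier(tree_method='hist')
# On a CUDA-capable machine, the same estimator can train on the GPU
gpu_model = xgb.XGBClassifier(tree_method='hist', device='cuda')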

Code Examples

Feature Engineering for Trading

Create features from price data

Python
import xgboost as xgb
import pandas as pd
import numpy as np
import yfinance as yf
# Download daily EUR/USD data
df = yf.download('EURUSD=X', start='2020-01-01', end='2024-01-01')
# Newer yfinance releases return MultiIndex columns even for a single ticker
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
# Create features
df['Returns'] = df['Close'].pct_change()
df['SMA_10'] = df['Close'].rolling(10).mean()
df['SMA_50'] = df['Close'].rolling(50).mean()
# Simple-moving-average RSI (Wilder's original uses smoothed averages)
gain = df['Close'].diff().clip(lower=0).rolling(14).mean()
loss = (-df['Close'].diff().clip(upper=0)).rolling(14).mean()
df['RSI'] = 100 - (100 / (1 + gain / loss))
df['Volatility'] = df['Returns'].rolling(20).std()
df['Momentum'] = df['Close'].pct_change(10)
# Target: 1 if price goes up next day, 0 otherwise
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
# Drop rows with NaN from the rolling windows and the shifted target
df = df.dropna()
print(f"Engineered features: 6, Samples: {len(df)}")

Train XGBoost Classifier

Train model to predict direction

Python
from sklearn.metrics import accuracy_score, classification_report
# Features and target
features = ['Returns', 'SMA_10', 'SMA_50', 'RSI', 'Volatility', 'Momentum']
X = df[features]
y = df['Target']
# Time-series split: train on the past, test on the future
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# Train XGBoost
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='binary:logistic',
    random_state=42
)
# eval_set here is for monitoring only; no early stopping is configured
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(classification_report(y_test, y_pred))
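
Accuracy alone can mislead when the classes are imbalanced. A quick sanity check, using the y_test split from above, compares against always predicting the majority class:

Python
# Accuracy of always predicting the test set's more common class
baseline = max(y_test.mean(), 1 - y_test.mean())
print(f"Majority-class baseline: {baseline:.2%}")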

Analyze Feature Importance

Understand which features drive predictions

Python
import matplotlib.pyplot as plt
# Feature importance scores from the trained classifier
importance = model.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': importance
}).sort_values('Importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Plot; importance_type can be 'weight', 'gain', or 'cover'
xgb.plot_importance(model, importance_type='gain')
plt.title('XGBoost Feature Importance')
plt.tight_layout()
plt.show()

Hyperparameter Optimization

Find optimal parameters with GridSearchCV

Python
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# Time-series cross-validation: each fold trains on the past, validates on the future
tscv = TimeSeriesSplit(n_splits=5)
# Parameter grid (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}
# Grid search
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2%}")
# Use best model
best_model = grid_search.best_estimator_

Price Return Prediction

Predict continuous next-day returns instead of direction (daily returns are noisy, so expect a low R²)

Python
# Regression target: next day return
df['Target_Return'] = df['Close'].pct_change().shift(-1)
df = df.dropna()
X = df[features]
y = df['Target_Return']
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# XGBoost Regressor
reg_model = xgb.XGBRegressor(
n_estimators=100,
max_depth=5,
learning_rate=0.1,
objective='reg:squarederror',
random_state=42
)
reg_model.fit(X_train, y_train)
# Evaluate
from sklearn.metrics import mean_squared_error, r2_score
y_pred = reg_model.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.6f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")

Generate Trading Signals

Use predictions for trading decisions

Python
def generate_trading_signals(model, data, features, threshold=0.6):
    """Generate trading signals from class-probability predictions"""
    X = data[features]
    # predict_proba returns columns [P(down), P(up)]
    proba = model.predict_proba(X)
    signals = []
    for p_down, p_up in proba:
        if p_up > threshold:
            signals.append('BUY')
        elif p_down > threshold:
            signals.append('SELL')
        else:
            signals.append('HOLD')
    return signals
# Generate signals on the test period
signals = generate_trading_signals(best_model, df.iloc[split_idx:], features, threshold=0.55)
# Backtest
df_test = df.iloc[split_idx:].copy()
df_test['Signal'] = signals
# Apply today's signal to tomorrow's return; the last row has no next-day return
df_test['Strategy_Return'] = df_test['Returns'].shift(-1) * df_test['Signal'].map({'BUY': 1, 'SELL': -1, 'HOLD': 0})
cumulative = (1 + df_test['Strategy_Return'].fillna(0)).cumprod()
print(f"Strategy Return: {(cumulative.iloc[-1] - 1):.2%}")

Save and Load Model

Persist model for production use

Python
import joblib
# Save model
model.save_model('xgb_trading_model.json')
# Or with joblib (includes sklearn wrapper)
joblib.dump(best_model, 'xgb_model.pkl')
# Load model
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgb_trading_model.json')
# Or with joblib
loaded_model = joblib.load('xgb_model.pkl')
# Verify
pred = loaded_model.predict(X_test.iloc[:5])
print(f"Predictions: {pred}")

Best Practices

Time-Series Split

Never evaluate on a random split; financial data is ordered in time, so always train on the past and test on the future (see the sketch below).
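
A minimal walk-forward sketch, assuming the df, features, and Target columns built in the examples above: each fold trains on an expanding window of the past and is scored on the block that follows it.

Python
from sklearn.model_selection import TimeSeriesSplit
X_cls, y_cls = df[features], df['Target']
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_cls)):
    # Retrain from scratch on each expanding window
    m = xgb.XGBClassifier(n_estimators=100, max_depth=5, random_state=42)
    m.fit(X_cls.iloc[train_idx], y_cls.iloc[train_idx])
    score = m.score(X_cls.iloc[test_idx], y_cls.iloc[test_idx])
    print(f"Fold {fold}: train={len(train_idx)}, accuracy={score:.2%}")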

Feature Scaling Not Required

Tree-based models don't need feature normalization; splits depend only on the ordering of values, not their scale.

Avoid Overfitting

Use early stopping and time-aware cross-validation to keep the trees from memorizing noise (see the sketch below).
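
A minimal early-stopping sketch, assuming XGBoost ≥ 1.6, which accepts early_stopping_rounds in the constructor (older releases pass it to fit() instead). The validation slice is carved from the end of the training window so the test set stays untouched:

Python
X_cls, y_cls = df[features], df['Target']
split = int(len(X_cls) * 0.8)
val_start = int(split * 0.8)
# Train on the earliest 64%, validate on the next 16%, leave the rest for testing
X_tr, y_tr = X_cls.iloc[:val_start], y_cls.iloc[:val_start]
X_val, y_val = X_cls.iloc[val_start:split], y_cls.iloc[val_start:split]
es_model = xgb.XGBClassifier(
    n_estimators=1000,  # generous ceiling; early stopping picks the actual count
    learning_rate=0.05,
    early_stopping_rounds=20,
    eval_metric='logloss',
    random_state=42
)
es_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print(f"Best iteration: {es_model.best_iteration}")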

Feature Engineering Is Key

Model quality depends far more on informative features than on hyperparameter tuning (see the sketch below for one easy extension).
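
One cheap extension is adding lagged copies of existing columns so the trees can see recent history; a minimal sketch, assuming the df from the feature-engineering example (the lag choices are illustrative):

Python
# Lagged returns give the model a short window of past behaviour
for lag in (1, 2, 3, 5):
    df[f'Return_lag{lag}'] = df['Returns'].shift(lag)
df = df.dropna()
print(df.filter(like='Return_lag').tail())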