XGBoost for Trading
Gradient boosting library excelling at tabular data and feature-based trading strategies.
Advanced
Machine Learning
Installation
$ pip install xgboost scikit-learn yfinance matplotlib
Key Features
Gradient Boosting
Ensemble of decision trees for powerful predictions.
High Performance
Optimized C++ backend with GPU acceleration (see the sketch below).
Feature Importance
Built-in feature importance analysis.
Handles Tabular Data
Excellent for structured financial features.
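As a quick illustration of the GPU support mentioned above, a minimal sketch, assuming XGBoost 2.x (where device='cuda' replaces the older tree_method='gpu_hist') and a CUDA-capable GPU:
Python
import xgboost as xgb
# Hypothetical GPU-enabled classifier; requires a CUDA-capable GPU and XGBoost 2.x
gpu_model = xgb.XGBClassifier(tree_method='hist', device='cuda')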
Code Examples
Feature Engineering for Trading
Create features from price data
Python
import xgboost as xgb
import pandas as pd
import numpy as np
import yfinance as yf
# Download data
df = yf.download('EURUSD=X', start='2020-01-01', end='2024-01-01')
# Newer yfinance versions may return MultiIndex columns; flatten to plain names
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
# Create features
df['Returns'] = df['Close'].pct_change()
df['SMA_10'] = df['Close'].rolling(10).mean()
df['SMA_50'] = df['Close'].rolling(50).mean()
# RSI (14-period, simple-moving-average variant)
gain = df['Close'].diff().clip(lower=0).rolling(14).mean()
loss = (-df['Close'].diff().clip(upper=0)).rolling(14).mean()
df['RSI'] = 100 - (100 / (1 + gain / loss))
df['Volatility'] = df['Returns'].rolling(20).std()
df['Momentum'] = df['Close'].pct_change(10)
# Target: 1 if price goes up next day, 0 otherwise
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
# Drop NaN
df = df.dropna()
print(f"Features: {df.shape[1] - 1}, Samples: {len(df)}")
Train XGBoost Classifier
Train model to predict direction
Python
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Features and target
features = ['Returns', 'SMA_10', 'SMA_50', 'RSI', 'Volatility', 'Momentum']
X = df[features]
y = df['Target']
# Time-series split
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# Train XGBoost
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='binary:logistic',
    random_state=42
)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(classification_report(y_test, y_pred))
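Directional accuracy near 50% is common on daily FX data, so compare against a naive baseline before reading much into the score; a minimal sketch:
Python
# Baseline: always predict the training set's majority class
majority_class = y_train.mode()[0]
baseline_acc = (y_test == majority_class).mean()
print(f"Majority-class baseline: {baseline_acc:.2%}")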
Analyze Feature Importance
Understand which features drive predictions
Python
import matplotlib.pyplot as plt
# Get feature importance
importance = model.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': importance
}).sort_values('Importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Plot
xgb.plot_importance(model, importance_type='gain')
plt.title('XGBoost Feature Importance')
plt.tight_layout()
plt.show()
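XGBoost reports several importance types ('weight', 'gain', 'cover'), and rankings can differ between them; a minimal sketch comparing them via the underlying booster:
Python
# Compare rankings across importance types
booster = model.get_booster()
for imp_type in ['weight', 'gain', 'cover']:
    print(imp_type, booster.get_score(importance_type=imp_type))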
Hyperparameter Optimization
Find optimal parameters with GridSearchCV
Python
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
# Parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}
# Grid search
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2%}")
# Use best model
best_model = grid_search.best_estimator_
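The grid above trains 81 parameter combinations across 5 folds; if that is too slow, scikit-learn's RandomizedSearchCV samples a fixed number of combinations instead. A minimal sketch reusing the same grid:
Python
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_distributions=param_grid,
    n_iter=20,            # sample 20 of the 81 combinations
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
print(f"Best params: {random_search.best_params_}")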
Price Return Prediction
Predict continuous returns instead of direction
Python
# Regression target: next day's return
df['Target_Return'] = df['Close'].pct_change().shift(-1)
df = df.dropna()
X = df[features]
y = df['Target_Return']
# Recompute the split point: dropna() shortened the frame
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
# XGBoost Regressor
reg_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
    random_state=42
)
reg_model.fit(X_train, y_train)
# Evaluate
from sklearn.metrics import mean_squared_error, r2_score
y_pred = reg_model.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.6f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
Generate Trading Signals
Use predictions for trading decisions
Python
def generate_trading_signals(model, data, features, threshold=0.6):
    """Generate BUY/SELL/HOLD signals from class probabilities."""
    X = data[features]
    # predict_proba columns follow class order: [P(down), P(up)]
    proba = model.predict_proba(X)
    signals = []
    for p_down, p_up in proba:
        if p_up > threshold:
            signals.append('BUY')
        elif p_down > threshold:
            signals.append('SELL')
        else:
            signals.append('HOLD')
    return signals
# Generate signals
signals = generate_trading_signals(best_model, df.iloc[split_idx:], features, threshold=0.55)
# Backtest: the signal on day t is applied to day t+1's return
df_test = df.iloc[split_idx:].copy()
df_test['Signal'] = signals
df_test['Strategy_Return'] = df_test['Returns'].shift(-1) * df_test['Signal'].map({'BUY': 1, 'SELL': -1, 'HOLD': 0})
df_test = df_test.dropna(subset=['Strategy_Return'])  # last row has no next-day return
cumulative = (1 + df_test['Strategy_Return']).cumprod()
print(f"Strategy Return: {(cumulative.iloc[-1] - 1):.2%}")
Save and Load Model
Persist model for production use
Python
import joblib
# Save model
model.save_model('xgb_trading_model.json')
# Or with joblib (includes sklearn wrapper)
joblib.dump(best_model, 'xgb_model.pkl')
# Load model
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgb_trading_model.json')
# Or with joblib
loaded_model = joblib.load('xgb_model.pkl')
# Verify
pred = loaded_model.predict(X_test.iloc[:5])
print(f"Predictions: {pred}")
Best Practices
Time-Series Split
Never use a random train/test split on market data; always split in time order so the model never trains on the future.
Feature Scaling Not Required
Tree-based models don't need feature normalization or scaling.
Avoid Overfitting
Use early stopping and time-series cross-validation (see the sketch after this list).
Feature Engineering Is Key
Model quality depends heavily on the features you engineer; invest time there first.
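A minimal early-stopping sketch, assuming XGBoost 2.x where early_stopping_rounds is a constructor argument (older versions accept it in fit()); in practice the eval set should be a validation slice rather than the final test set:
Python
es_model = xgb.XGBClassifier(
    n_estimators=1000,           # upper bound; early stopping picks the best round
    learning_rate=0.05,
    max_depth=5,
    early_stopping_rounds=20,    # stop after 20 rounds without eval improvement
    eval_metric='logloss',
    random_state=42
)
es_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(f"Best iteration: {es_model.best_iteration}")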