PythonでLightGBM機械学習を使った翌日の株価方向予測

「明日の株価は上がるか下がるか？」を機械学習で予測することは可能でしょうか。完璧な予測は不可能ですが、LightGBMを使って統計的に有意な予測モデルを作る方法を解説します。

LightGBMとは？
特徴量エンジニアリング
目的変数（翌日の騰落）の作成とデータ分割
LightGBMモデルの学習と評価
特徴量重要度の確認
予測に基づくバックテスト
注意点
まとめ
- 📚 関連記事

LightGBMとは？

LightGBM（Light Gradient Boosting Machine）はMicrosoftが開発した勾配ブースティング系の機械学習ライブラリです。高速・高精度・省メモリが特徴で、株価予測タスクに広く使われています。

pip install lightgbm yfinance scikit-learn pandas numpy

特徴量エンジニアリング

import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
from datetime import datetime

def create_features(df):
    """Return a copy of an OHLCV frame with technical-indicator columns appended.

    The frame must provide ``Open``, ``High``, ``Low``, ``Close`` and
    ``Volume`` columns, and an index exposing ``dayofweek`` / ``month``
    (i.e. a datetime-like index). The caller's frame is left untouched.
    Rolling columns are NaN until their window has filled.
    """
    out = df.copy()
    close = out["Close"]
    volume = out["Volume"]

    # Trailing percentage returns over several look-back horizons.
    for horizon in (1, 2, 3, 5, 10, 20, 60):
        out[f"return_{horizon}d"] = close.pct_change(horizon)

    # Simple moving averages, plus the price relative to each average.
    for window in (5, 10, 20, 50, 200):
        sma = close.rolling(window).mean()
        out[f"ma_{window}"] = sma
        out[f"ma_ratio_{window}"] = close / sma

    # Bollinger bands (20-bar, +/- 2 standard deviations) and the position of
    # the close inside the band (0 = lower band, 1 = upper band).
    band_mid = close.rolling(20).mean()
    band_std = close.rolling(20).std()
    band_upper = band_mid + 2 * band_std
    band_lower = band_mid - 2 * band_std
    out["bb_mid"] = band_mid
    out["bb_std"] = band_std
    out["bb_upper"] = band_upper
    out["bb_lower"] = band_lower
    out["bb_pos"] = (close - band_lower) / (band_upper - band_lower)

    # RSI from 14-bar simple averages of the up-moves and down-moves.
    change = close.diff()
    avg_gain = change.clip(lower=0).rolling(14).mean()
    avg_loss = (-change.clip(upper=0)).rolling(14).mean()
    out["rsi"] = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD: 12/26-span EMAs, their difference, a 9-span signal line, and the
    # histogram (MACD minus signal).
    ema_fast = close.ewm(span=12).mean()
    ema_slow = close.ewm(span=26).mean()
    macd_line = ema_fast - ema_slow
    signal_line = macd_line.ewm(span=9).mean()
    out["ema12"] = ema_fast
    out["ema26"] = ema_slow
    out["macd"] = macd_line
    out["macd_signal"] = signal_line
    out["macd_hist"] = macd_line - signal_line

    # Volume relative to its 20-bar mean, and its bar-over-bar change.
    out["volume_ratio"] = volume / volume.rolling(20).mean()
    out["volume_return"] = volume.pct_change()

    # Volatility: 20-bar std of 1-bar returns, plus intraday range ratios.
    out["volatility_20"] = close.pct_change().rolling(20).std()
    out["high_low_ratio"] = out["High"] / out["Low"]
    out["close_open_ratio"] = close / out["Open"]

    # Calendar features taken from the index.
    out["dayofweek"] = out.index.dayofweek
    out["month"] = out.index.month

    return out

# Download 10 years of daily prices for Toyota (7203.T) and build the features.
df = yf.download("7203.T", period="10y", progress=False)
# Recent yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker. Flatten them to plain field names so that df["Close"] is a
# Series and the raw-price exclusion list in prepare_dataset() still matches.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
df = create_features(df)
print(f"特徴量数: {len(df.columns)}")
print(df.tail())

目的変数（翌日の騰落）の作成とデータ分割

def prepare_dataset(df, forecast_horizon=1):
    """Build the feature matrix and binary target, then split chronologically.

    Args:
        df: Frame with a ``Close`` column, feature columns, and a datetime
            index (the index is printed with ``.date()``).
        forecast_horizon: Number of rows ahead whose close is compared with
            the current close to form the label.

    Returns:
        ``(X_train, X_test, y_train, y_test, feature_cols)`` where the first
        80% of the usable rows are the training set and the remaining 20% the
        test set. ``feature_cols`` lists every column except the raw OHLCV /
        corporate-action columns and the target.
    """
    df = df.copy()

    # Target: 1 if the close `forecast_horizon` rows ahead is higher, else 0.
    future_close = df["Close"].shift(-forecast_horizon)
    df["target"] = (future_close > df["Close"]).astype(int)

    # The last `forecast_horizon` rows have no future close yet. `NaN > x` is
    # False, so without this filter they would be silently labelled "down" and
    # survive dropna() because the int target itself is never NaN.
    df = df.loc[future_close.notna().to_numpy()]

    # Every column that is not a raw price/volume field or the label is a feature.
    excluded = ("Open", "High", "Low", "Close", "Volume", "target",
                "Dividends", "Stock Splits")
    feature_cols = [c for c in df.columns if c not in excluded]

    # Drop warm-up rows where rolling indicators are still NaN.
    df = df.dropna()
    X = df[feature_cols]
    y = df["target"]

    # Chronological split (no shuffling): the last 20% is the test set.
    split = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    print(f"学習データ: {len(X_train)}件 ({X_train.index[0].date()} - {X_train.index[-1].date()})")
    print(f"テストデータ: {len(X_test)}件 ({X_test.index[0].date()} - {X_test.index[-1].date()})")
    print(f"目的変数の分布 - 上昇: {y_train.mean():.1%}")

    return X_train, X_test, y_train, y_test, feature_cols

# Build the chronological train/test split from the engineered feature frame.
X_train, X_test, y_train, y_test, feature_cols = prepare_dataset(df)

LightGBMモデルの学習と評価

def train_lgbm(X_train, y_train, X_test, y_test, val_fraction=0.2):
    """Fit a LightGBM binary classifier and report its test-set performance.

    Early stopping is driven by a validation slice taken from the *end* of the
    training data, not by the test set. Monitoring the test set would tune the
    number of boosting rounds on the very data used for the reported accuracy
    and make that accuracy optimistic.

    Args:
        X_train, y_train: Training features / labels in chronological order.
        X_test, y_test: Held-out features / labels, used only for evaluation.
        val_fraction: Share of the training rows (taken from the tail) used as
            the early-stopping validation set. If it yields no validation rows
            or would leave no fitting rows, the model is fitted on all training
            rows without early stopping.

    Returns:
        ``(model, y_pred, y_prob)``: the fitted ``LGBMClassifier``, hard class
        predictions for ``X_test``, and the predicted probability of class 1.
    """
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "n_estimators": 1000,
        "learning_rate": 0.01,
        "num_leaves": 31,
        "max_depth": -1,
        "min_child_samples": 20,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1,
    }

    def _rows(data, rows):
        # Positional row slice that works for pandas objects and plain arrays.
        return data.iloc[rows] if hasattr(data, "iloc") else data[rows]

    model = lgb.LGBMClassifier(**params)

    n_val = int(len(X_train) * val_fraction)
    if 0 < n_val < len(X_train):
        # Keep time order: fit on the earlier rows, validate on the latest ones.
        fit_rows, val_rows = slice(None, -n_val), slice(-n_val, None)
        model.fit(
            _rows(X_train, fit_rows), _rows(y_train, fit_rows),
            eval_set=[(_rows(X_train, val_rows), _rows(y_train, val_rows))],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )
    else:
        # No usable validation slice: train all rounds without early stopping.
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n=== LightGBMモデル評価 ===")
    print(f"精度: {accuracy_score(y_test, y_pred):.3f}")
    print(f"\n分類レポート:")
    print(classification_report(y_test, y_pred, target_names=["下落", "上昇"]))

    return model, y_pred, y_prob

# Train the classifier and keep its test-set labels (y_pred) and class-1 probabilities (y_prob).
model, y_pred, y_prob = train_lgbm(X_train, y_train, X_test, y_test)

特徴量重要度の確認

import matplotlib.pyplot as plt

def plot_feature_importance(model, feature_cols, top_n=20):
    """Plot the model's largest feature importances and print the top ten.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_cols: Feature names, in the same order as the importances.
        top_n: Number of bars shown in the chart.
    """
    ranked = pd.Series(model.feature_importances_, index=feature_cols)
    ranked = ranked.sort_values(ascending=False)
    shown = ranked.head(top_n)

    plt.figure(figsize=(10, 8))
    shown.plot.barh()
    plt.xlabel("重要度")
    plt.title(f"LightGBM 特徴量重要度 Top{top_n}")
    # barh draws the first row at the bottom; flip so the largest is on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print("Top10 重要な特徴量:")
    print(ranked.head(10))

# Show which features the trained model relies on most.
plot_feature_importance(model, feature_cols)

予測に基づくバックテスト

📘 外部参考：Backtesting.py（公式ドキュメント）／ Backtrader 公式

def backtest_with_prediction(df, y_pred_series, threshold=0.6):
    """Backtest a long-only strategy driven by next-day "up" predictions.

    A prediction made at the close of day t refers to the move from day t to
    day t+1, so the position it opens earns day t+1's return. The signal is
    therefore lagged by one bar before being multiplied with the daily return;
    applying it to the same day's return would use information that was not
    available when that return was earned (look-ahead bias).

    Args:
        df: Price frame with a ``Close`` column, indexed by date.
        y_pred_series: Predictions indexed by the dates to backtest. May hold
            hard 0/1 labels or class-1 probabilities.
        threshold: A long position is taken when the prediction is
            ``>= threshold``. For hard 0/1 labels and a threshold in (0, 1]
            this is the same as "predicted up".

    Returns:
        Copy of the selected rows of ``df`` with ``pred``, ``daily_return``
        and ``strategy_return`` columns added.
    """
    backtest_df = df.loc[y_pred_series.index].copy()
    backtest_df["pred"] = y_pred_series.values
    backtest_df["daily_return"] = backtest_df["Close"].pct_change()

    # Trade only when the prediction clears the confidence threshold.
    signal = backtest_df["pred"] >= threshold
    # Yesterday's signal decides today's position; the first day has no prior
    # signal and stays flat.
    position = signal.shift(1, fill_value=False).to_numpy(dtype=bool)
    backtest_df["strategy_return"] = np.where(
        position,
        backtest_df["daily_return"],
        0.0,
    )

    # The first daily return is NaN (no previous close); treat it as flat so
    # the cumulative products are well defined from the first row.
    cumulative = (1 + backtest_df["strategy_return"].fillna(0.0)).cumprod()
    market = (1 + backtest_df["daily_return"].fillna(0.0)).cumprod()

    strategy_total = cumulative.iloc[-1] - 1
    market_total = market.iloc[-1] - 1

    print(f"戦略リターン: {strategy_total:.1%}")
    print(f"市場リターン: {market_total:.1%}")
    print(f"超過リターン: {strategy_total - market_total:.1%}")

    return backtest_df

# Wrap the test-set predictions in a Series indexed by the test dates,
# then run the backtest over that period.
pred_series = pd.Series(y_pred, index=X_test.index)
result = backtest_with_prediction(df, pred_series)