Reinforcement learning (RL) lets an agent learn to maximize profit by repeatedly choosing among buy, sell, and hold actions. In this article, I walk through implementing a DQN (Deep Q-Network) trading agent in PyTorch.
📘 External reference: OpenAI Spinning Up (introductory RL, official)
📘 External reference: Playing Atari with Deep RL (original DQN paper, arXiv) / PyTorch DQN tutorial (official)
📘 External reference: PyTorch official site / PyTorch tutorials (official)
Basic Concepts of Reinforcement Learning
When applying reinforcement learning to stock trading, the state is the most recent N days of price and indicator data, and the action is one of three choices: buy, sell, or hold. The reward is defined from the profit or loss of trading, and the policy is represented by a neural network.
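Concretely, the environment implemented below uses the log return of total portfolio value as the per-step reward:

reward_t = log(V_{t+1} / V_t), where V_t = capital + shares × price_t

Because log returns are additive, the cumulative reward of an episode telescopes to log(V_final / V_initial), so maximizing cumulative reward is the same as maximizing final portfolio value.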
Implementing the Trading Environment
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import yfinance as yf
from collections import deque
import random

class TradingEnvironment:
    """Stock trading simulation environment."""
    def __init__(self, df, window=20, initial_capital=1_000_000, commission=0.001):
        self.df = df
        self.window = window
        self.initial_capital = initial_capital
        self.commission = commission
        self.feature_cols = ['Return', 'MA5_ratio', 'MA20_ratio', 'RSI', 'Volume_ratio']
        self.reset()

    def reset(self):
        self.current_step = self.window
        self.capital = self.initial_capital
        self.shares = 0
        self.total_trades = 0
        return self._get_state()

    def _get_state(self):
        """Return the current state vector (features for the last `window` days)."""
        state_data = self.df[self.feature_cols].iloc[self.current_step - self.window:self.current_step].values
        # Append the portfolio state (fraction of wealth held in stock)
        current_price = self.df['Close'].iloc[self.current_step]
        portfolio_value = self.capital + self.shares * current_price
        position_ratio = self.shares * current_price / portfolio_value if portfolio_value > 0 else 0
        state = np.concatenate([state_data.flatten(), [position_ratio]])
        return state.astype(np.float32)

    def step(self, action):
        """
        action: 0 = hold, 1 = buy (all-in), 2 = sell (entire position)
        returns: next_state, reward, done, portfolio_value
        """
        current_price = self.df['Close'].iloc[self.current_step]
        prev_value = self.capital + self.shares * current_price
        if action == 1 and self.shares == 0:  # buy
            shares_to_buy = int(self.capital / (current_price * (1 + self.commission)))
            if shares_to_buy > 0:
                self.shares = shares_to_buy
                self.capital -= shares_to_buy * current_price * (1 + self.commission)
                self.total_trades += 1
        elif action == 2 and self.shares > 0:  # sell
            self.capital += self.shares * current_price * (1 - self.commission)
            self.shares = 0
            self.total_trades += 1
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1
        next_price = self.df['Close'].iloc[self.current_step]
        next_value = self.capital + self.shares * next_price
        # Reward: log return of total portfolio value
        reward = np.log(next_value / prev_value) if prev_value > 0 else 0
        next_state = self._get_state() if not done else None
        return next_state, reward, done, next_value
def prepare_data(ticker, period='3y'):
    df = yf.download(ticker, period=period, progress=False)
    # Newer yfinance versions may return MultiIndex columns even for a
    # single ticker; flatten them so that df['Close'] is a Series.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    df['Return'] = df['Close'].pct_change()
    df['MA5_ratio'] = df['Close'].rolling(5).mean() / df['Close'] - 1
    df['MA20_ratio'] = df['Close'].rolling(20).mean() / df['Close'] - 1
    # 14-day RSI, scaled to [0, 1] to match the other features
    delta = df['Close'].diff()
    gain = delta.clip(lower=0).rolling(14).mean()
    loss = (-delta.clip(upper=0)).rolling(14).mean()
    df['RSI'] = (100 - 100 / (1 + gain / loss)) / 100
    df['Volume_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()
    return df.dropna()

df = prepare_data('7203.T')
print(f"Number of rows: {len(df)}")
Implementing the DQN Agent
class QNetwork(nn.Module):
    """Neural network that outputs a Q-value for each action."""
    def __init__(self, state_size, action_size=3, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_size, hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_size)
        )

    def forward(self, x):
        return self.net(x)

class DQNAgent:
    def __init__(self, state_size, action_size=3, lr=1e-4, gamma=0.99, epsilon=1.0):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q_net = QNetwork(state_size, action_size).to(self.device)
        self.target_net = QNetwork(state_size, action_size).to(self.device)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=lr)
        self.memory = deque(maxlen=10000)

    def act(self, state):
        # Epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.q_net(state_t)
        return q_values.argmax().item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size=64):
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        # Batch via np.array first (much faster than building tensors
        # from a Python list of arrays)
        states = torch.from_numpy(np.array([e[0] for e in batch], dtype=np.float32)).to(self.device)
        actions = torch.LongTensor([e[1] for e in batch]).to(self.device)
        rewards = torch.FloatTensor([e[2] for e in batch]).to(self.device)
        next_states = torch.from_numpy(np.array(
            [e[3] if e[3] is not None else np.zeros(self.state_size, dtype=np.float32) for e in batch],
            dtype=np.float32)).to(self.device)
        dones = torch.FloatTensor([e[4] for e in batch]).to(self.device)
        current_q = self.q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Target values must not propagate gradients into the target network
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0]
        target_q = rewards + self.gamma * next_q * (1 - dones)
        loss = nn.MSELoss()(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target(self):
        self.target_net.load_state_dict(self.q_net.state_dict())
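One common stabilization that the code above omits is gradient clipping between backward() and step(); the official PyTorch DQN tutorial also clips gradients. A minimal, assumed addition to replay() (max_norm=10.0 is an arbitrary choice):

# Assumed addition to replay(): clip the gradient norm so that occasional
# large TD errors cannot destabilize training.
loss.backward()
torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), max_norm=10.0)
self.optimizer.step()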
# Training setup: chronological 80/20 train/test split
split = int(len(df) * 0.8)
train_df = df.iloc[:split]
test_df = df.iloc[split:]

env = TradingEnvironment(train_df)
state_size = env.window * len(env.feature_cols) + 1
agent = DQNAgent(state_size)
print(f"State size: {state_size}")
print("DQN agent initialized")
Training Loop
num_episodes = 200
target_update_freq = 10

for episode in range(1, num_episodes + 1):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, portfolio_value = env.step(action)
        agent.remember(state, action, reward,
                       next_state if not done else np.zeros(state_size, dtype=np.float32), done)
        agent.replay()
        if next_state is not None:
            state = next_state
        total_reward += reward
    if episode % target_update_freq == 0:
        agent.update_target()
    if episode % 50 == 0:
        final_return = (portfolio_value - env.initial_capital) / env.initial_capital
        print(f"Episode {episode}: Return={final_return:.2%}, Epsilon={agent.epsilon:.3f}, Trades={env.total_trades}")
# Evaluation on the test data (greedy policy: epsilon = 0)
agent.epsilon = 0
test_env = TradingEnvironment(test_df)
state = test_env.reset()
done = False
while not done:
    action = agent.act(state)
    next_state, reward, done, portfolio_value = test_env.step(action)
    if next_state is not None:
        state = next_state

# Compare against a buy-and-hold baseline over the same period
buy_hold = (test_df['Close'].iloc[-1] - test_df['Close'].iloc[0]) / test_df['Close'].iloc[0]
dqn_return = (portfolio_value - test_env.initial_capital) / test_env.initial_capital
print(f"DQN agent test return: {dqn_return:.2%}")
print(f"Buy & hold test return: {buy_hold:.2%}")
Summary
Key takeaways for a DQN-based stock trading agent:
- In RL, the design of the trading environment (how state and reward are defined) largely determines performance.
- Experience replay (the memory buffer) and periodic target-network updates are essential for stable DQN training.
- Stock data is non-stationary, so combining periodic retraining with online learning is effective.
- A plain DQN overfits easily; variants such as Double DQN and Dueling DQN can improve accuracy (a Double DQN sketch follows below).
- Before live deployment, always run walk-forward tests and risk analysis to confirm both profitability and stability.
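To make the Double DQN variant mentioned above concrete, here is a minimal sketch of the change. Only the target computation inside replay() differs: the online network selects the next action and the target network evaluates it, which reduces the overestimation bias of vanilla DQN.

# Double DQN target (sketch): replaces the next_q / target_q lines in replay().
with torch.no_grad():
    best_actions = self.q_net(next_states).argmax(1, keepdim=True)
    next_q = self.target_net(next_states).gather(1, best_actions).squeeze(1)
target_q = rewards + self.gamma * next_q * (1 - dones)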

