# main_ac_batch.py

import numpy as np
from utils.data_loader import load_data, preprocess_data
from utils.env import TradingEnvironment
from a2c_batch_agent import A2CBatchAgent
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm
import psutil
import gc
from utils.config import *
import matplotlib.dates as mdates

# --- Logging -----------------------------------------------------------------
# Every run writes its TensorBoard summaries to its own timestamped directory.
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'logs/A2C_{current_time}'
summary_writer = tf.summary.create_file_writer(log_dir)

# --- Data --------------------------------------------------------------------
train_data = load_data(TRAIN_DATA_PATH)
train_data = preprocess_data(train_data)
test_data = load_data(TEST_DATA_PATH)
test_data = preprocess_data(test_data)
# Keep only the part of the test set from TEST_DATA_START onwards (the exact
# slice semantics depend on the type of TEST_DATA_START in utils.config).
test_data = test_data[TEST_DATA_START:]

# --- Environment and agent ---------------------------------------------------
env = TradingEnvironment(train_data)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = A2CBatchAgent(n_actions=action_size, critic_fc1=8, actor_fc1=8, critic_alpha=0.001, actor_alpha=0.001, gamma=0.95, entropy_coeff=2)

# for loading previously trained agent
# Both networks are called once on a dummy input before load_models();
# presumably this builds their variables so the saved weights can be restored
# (TODO confirm against A2CBatchAgent). The dummy input takes its width from
# the environment's observation space instead of a hard-coded 9, so it always
# matches the states the agent will actually receive.
dummy_state = tf.random.normal([1, state_size])
agent.actor(dummy_state)
agent.critic(dummy_state)
agent.load_models()

# --- Run bookkeeping ---------------------------------------------------------
episodes = 51
best_score = env.reward_range[0]
score_history = []
training_rewards = []
portfolio_values = []
best_reward = float('-inf')

def train_agent(agent, train_data, episodes, batch_size=32, max_steps=20):
    """Train ``agent`` on batches of rollouts from a ``TradingEnvironment``.

    Each of the ``episodes`` iterations collects ``batch_size`` trajectories
    (every one starting from ``env.reset()``), stacks them into NumPy arrays
    and passes them to ``agent.learn`` in a single call. Model checkpoints
    are written every 10th iteration and scalar summaries are logged to the
    module-level ``summary_writer`` once per iteration.

    Args:
        agent: Object providing ``choose_action(state)``,
            ``learn(states, next_states, actions, rewards, dones)`` and
            ``save_models(tag)``.
        train_data: Data handed unchanged to ``TradingEnvironment``.
        episodes: Number of collect-and-learn iterations.
        batch_size: Number of trajectories collected per iteration.
        max_steps: Upper bound on the number of steps in one trajectory.

    Raises:
        ValueError: If ``batch_size`` or ``max_steps`` is smaller than 1.
    """
    if batch_size < 1 or max_steps < 1:
        raise ValueError("batch_size and max_steps must both be at least 1")

    env = TradingEnvironment(train_data)

    for episode in tqdm(range(episodes)):
        print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")

        # generate a batch of trajectories
        states_batch = [[] for _ in range(batch_size)]
        next_states_batch = [[] for _ in range(batch_size)]
        rewards_batch = [[] for _ in range(batch_size)]
        actions_batch = [[] for _ in range(batch_size)]
        dones_batch = [[] for _ in range(batch_size)]

        for traj in range(batch_size):
            state = env.reset()
            done = False
            step = 0

            # Stop as soon as the episode ends OR the step cap is reached.
            # (With `or`, the cap acted as a minimum and a finished
            # environment kept being stepped.)
            while not done and step < max_steps:
                action = agent.choose_action(state)
                next_state, reward, done, info = env.step(action)

                states_batch[traj].append(state)
                next_states_batch[traj].append(next_state)
                actions_batch[traj].append(action)
                rewards_batch[traj].append(reward)
                dones_batch[traj].append(done)

                state = next_state
                step += 1

        # NOTE(review): the conversions below need every trajectory to have
        # the same length; if an episode can terminate before `max_steps`
        # the batch becomes ragged and np.array will fail - confirm against
        # TradingEnvironment.
        states_np = np.array(states_batch, dtype=np.float32)
        next_states_np = np.array(next_states_batch, dtype=np.float32)
        actions_np = np.array(actions_batch, dtype=np.int32)
        rewards_np = np.array(rewards_batch, dtype=np.float32)
        dones_np = np.array(dones_batch, dtype=np.float32)

        # train the agent on the batch using numpy arrays
        metrics = agent.learn(states_np, next_states_np, actions_np, rewards_np, dones_np)

        # Drop the rollout buffers before the next iteration allocates new ones.
        del states_batch, next_states_batch, actions_batch, rewards_batch, dones_batch
        del states_np, next_states_np, actions_np, rewards_np, dones_np
        gc.collect()

        # `info` and `reward` are those of the last step of the last trajectory.
        print('Portfolio value', info['portfolio_value'])

        if episode % 10 == 0:
            agent.save_models(str(episode))
            tf.keras.backend.clear_session()

        print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")

        # The episode index is used as the summary step so that every
        # training iteration gets its own point on the x-axis;
        # env.current_step is not tied to the training iteration.
        with summary_writer.as_default():
            tf.summary.scalar('Metrics/Reward', reward, step=episode)
            tf.summary.scalar('Metrics/Portfolio_Value', info['portfolio_value'], step=episode)

            if metrics:
                tf.summary.scalar('Loss/Actor', metrics.get('actor_loss', 0), step=episode)
                tf.summary.scalar('Loss/Critic', metrics.get('critic_loss', 0), step=episode)
                tf.summary.scalar('Gradients/Actor_Norm', metrics.get('actor_grad_norm', 0), step=episode)
                tf.summary.scalar('Gradients/Critic_Norm', metrics.get('critic_grad_norm', 0), step=episode)


# Module-level bookkeeping, (re)initialised here: two independent empty lists
# and a best-reward tracker starting at negative infinity.
training_rewards, portfolio_values = [], []
best_reward = float('-inf')


def plot_decisions(dates, prices, buy_points, sell_points, revenue, save=False, filename='decision_plot_a2c_online'):
    """Plot the price series with buy/sell markers and the revenue curve.

    Price and trade markers use the left y-axis; revenue is drawn on a twin
    right y-axis. The figure is neither shown nor closed here.

    Args:
        dates: Sequence of dates, one per entry of ``prices`` and ``revenue``.
        prices: Sequence of prices.
        buy_points: List of ``(date, price)`` tuples where buys occurred;
            skipped when empty.
        sell_points: List of ``(date, price)`` tuples where sells occurred;
            skipped when empty.
        revenue: Sequence of revenue values aligned with ``dates``.
        save: When true, write the figure to ``filename``.
        filename: Target path passed to ``savefig``.
    """
    fig, ax1 = plt.subplots(figsize=(FIGURE_WIDTH, FIGURE_HEIGHT))

    # Plot price on the left y-axis
    ax1.plot(dates, prices, label='Price', color='blue', alpha=ALPHA)
    ax1.set_xlabel("Date")
    ax1.set_ylabel("Price", color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Add buy and sell markers
    if buy_points:
        buy_dates, buy_prices = zip(*buy_points)
        ax1.scatter(buy_dates, buy_prices, color='green', label='Buy', marker='*', s=MARKER_SIZE, alpha=MARKER_ALPHA)

    if sell_points:
        sell_dates, sell_prices = zip(*sell_points)
        ax1.scatter(sell_dates, sell_prices, color='red', label='Sell', marker='*', s=MARKER_SIZE, alpha=MARKER_ALPHA)

    # Set x-axis to show dates nicely
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax1.xaxis.set_major_locator(mdates.AutoDateLocator())
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

    ax1.grid(alpha=0.3)

    # Plot revenue on the right y-axis
    ax2 = ax1.twinx()
    ax2.plot(dates, revenue, label='Revenue', color='orange', alpha=0.8, linestyle='--')
    ax2.set_ylabel("Revenue", color='orange')
    ax2.tick_params(axis='y', labelcolor='orange')

    # One legend for both axes. A legend built from ax1 alone omits the
    # labelled revenue line; attaching it to ax2 keeps it drawn above the
    # lines of both axes.
    left_handles, left_labels = ax1.get_legend_handles_labels()
    right_handles, right_labels = ax2.get_legend_handles_labels()
    ax2.legend(left_handles + right_handles, left_labels + right_labels, loc='upper left')

    # Title
    plt.title("Trading Decisions (Buy/Sell) and Revenue Over Time")

    # Save plot if required (through the figure handle, not the implicit
    # "current figure", so the right figure is always written).
    if save:
        fig.savefig(filename, bbox_inches='tight')
        print(f"Graph saved at: {filename}")


def test_agent(agent, test_data):
    """Run ``agent`` through one episode on ``test_data`` and report results.

    Steps a fresh ``TradingEnvironment`` until it reports ``done``, logging
    per-step reward and portfolio value to the module-level
    ``summary_writer``. Afterwards it prints a trading log, the total reward
    and the final portfolio value, saves the decision plot via
    ``plot_decisions`` and logs the total reward and per-action frequencies.

    Args:
        agent: Object providing ``choose_action(state)``.
        test_data: Data passed to ``TradingEnvironment``; it is also indexed
            here with ``.iloc[...]["Close"]`` and ``.index[...]``.
    """
    env = TradingEnvironment(test_data)
    state = env.reset()
    total_reward = 0
    actions_taken = []
    done = False

    buy_points = []
    sell_points = []
    prices = []
    dates = []
    revenue = []
    decisions_log = []

    with summary_writer.as_default():
        while not done:
            action = agent.choose_action(state)
            actions_taken.append(action)
            next_state, reward, done, info = env.step(action)

            # Log test metrics
            tf.summary.scalar('Test/Step_Reward', reward, step=env.current_step)
            tf.summary.scalar('Test/Portfolio_Value', info['portfolio_value'], step=env.current_step)

            state = next_state
            total_reward += reward

            # NOTE(review): price and date are read at env.current_step
            # *after* env.step(); confirm this is the row the action was
            # actually executed on.
            current_price = test_data.iloc[env.current_step]["Close"]
            current_date = test_data.index[env.current_step]
            revenue.append(env.balance + (env.shares_held * current_price))
            dates.append(current_date)

            # Action encoding used here: 0 -> Hold, 1 -> Buy, anything else -> Sell.
            decision = "Hold" if action == 0 else "Buy" if action == 1 else "Sell"
            decisions_log.append((env.current_step, decision, current_price, reward))

            if action == 1:
                buy_points.append((current_date, current_price))
            elif action == 2:
                sell_points.append((current_date, current_price))

            prices.append(current_price)

        print("Trading Log:")
        print("Step | Decision | Price | Reward")
        for step, decision, price, step_reward in decisions_log:
            print(f"{step:4} | {decision:<8} | {price:.2f} | {step_reward:.2f}")

        print(f"\nTest Reward: {total_reward}")
        print(f"Final Portfolio Value: {env.balance + (env.shares_held * prices[-1])}")

        plot_decisions(dates, prices, buy_points, sell_points, revenue, save=True)

        tf.summary.scalar('Test/Final_Reward', total_reward, step=0)

        # Share of steps on which each action was chosen. The number of
        # actions comes from this function's own environment rather than
        # from a module-level global.
        actions_array = np.array(actions_taken)
        for action_idx in range(env.action_space.n):
            action_freq = np.mean(actions_array == action_idx)
            tf.summary.scalar(f'Test/Action_{action_idx}_Frequency', action_freq, step=0)


# Script entry: train the agent, then run the same agent on test_data.
train_agent(agent, train_data, episodes=episodes, batch_size=8)
test_agent(agent=agent, test_data=test_data)