# import needed libraries
import gymnasium as gym
from matplotlib import pyplot as plt
# from IPython.display import clear_output
import numpy as np
import random
import NeuralNetwork

def discountedRewards(rewards, gamma=1):
    """Return the discounted reward-to-go for every step of an episode.

    The rewards are visited from the last step to the first while a running
    return ``G = G * gamma + rewards[t]`` is maintained, so entry ``t`` of the
    result equals ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: indexable sequence of per-step rewards.
        gamma: discount factor applied to the return of the following step.

    Returns:
        A list with one discounted return per step, in the same order as
        ``rewards``.
    """
    running = 0
    backwards = []
    for t in range(len(rewards) - 1, -1, -1):
        running = running * gamma + rewards[t]
        backwards.append(running)
    # The returns were collected last-step-first; restore chronological order.
    backwards.reverse()
    return backwards

def discountedRewards2(rewards, gamma=1):
    """Array-returning variant of ``discountedRewards``: discounted reward-to-go.

    Entry ``t`` of the result is
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    A forward ``cumsum`` of ``gamma**t * rewards[t]`` (the previous body) gives
    the discounted reward collected *so far*, which is not the return that
    follows each step, so the accumulation is done backwards instead.

    Args:
        rewards: sequence (list or array) of per-step rewards.
        gamma: discount factor.

    Returns:
        ``numpy.ndarray`` of floats with the same shape as ``rewards``.
    """
    rewards = np.asarray(rewards, dtype=float)
    returns = np.zeros_like(rewards)
    running = 0.0
    # Backward recursion G_t = r_t + gamma * G_{t+1}; numerically safe for any gamma
    # (no division by gamma**t, so gamma == 0 and long episodes are fine).
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Epsilon-greedy exploration
def sampleSoftmax(softmaxOutput, epsilon):
    """Pick an action from a softmax policy output with epsilon-random exploration.

    With probability ``epsilon`` a uniformly random action index is chosen;
    otherwise the action is drawn from ``softmaxOutput`` by inverse-CDF sampling.

    Args:
        softmaxOutput: action probabilities, indexed as ``softmaxOutput[action][0]``
            (i.e. one row per action).
        epsilon: probability of ignoring the policy and acting at random.

    Returns:
        ``(action, logProb)`` where ``logProb`` is the log of the policy
        probability of the chosen action (offset by 1e-8 to avoid ``log(0)``).
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Inverse sampling: first index whose cumulative probability exceeds the draw.
        threshold = np.random.random()
        cumulative = np.cumsum(softmaxOutput)
        sampledAction = np.argmax(cumulative > threshold)
    else:
        sampledAction = random.randint(0, len(softmaxOutput) - 1)
    logProb = np.log(softmaxOutput[sampledAction][0] + 1e-8)
    return sampledAction, logProb

def reward_function(state, alpha=1):
    """Shaped reward based on the pole angle stored at ``state[2]``.

    Args:
        state: observation whose element at index 2 is the angle between the
            vertical and the pole.
        alpha: scale factor applied to the angle threshold (smaller values make
            the rewarded angle range narrower).

    Returns:
        ``-1.0`` when ``abs(angle)`` exceeds ``0.418 * alpha``, ``1.0`` otherwise
        (an angle exactly on the threshold is still rewarded).
    """
    # Angle threshold
    angle_threshold = 0.418

    # Only the angle is needed; the angular velocity read by the earlier version
    # was never used.
    return -1.0 if abs(state[2]) > angle_threshold * alpha else 1.0

def moving_average_baseline(rewards, alpha=0.9):
    """Return the running exponential average of ``rewards``, starting from 0.

    Each step updates ``b = alpha * b + (1 - alpha) * r``; the value after every
    update is recorded.

    Args:
        rewards: iterable of rewards.
        alpha: weight kept from the previous baseline value.

    Returns:
        ``numpy.ndarray`` holding the baseline after each reward.
    """
    running = 0
    history = []
    for reward in rewards:
        running = alpha * running + (1 - alpha) * reward
        history += [running]
    return np.array(history)

def exponential_moving_average2(values, alpha):
    """Return the exponential moving average of a sequence as a list.

    The first output equals ``values[0]``; every later output is
    ``alpha * value + (1 - alpha) * previous_output``.

    Args:
        values: sequence (supports ``len`` and slicing) of numbers.
        alpha: weight given to the newest value.

    Returns:
        List with one smoothed value per input; ``[]`` for empty input.
    """
    # Empty input used to fail on values[0]; there is simply nothing to smooth.
    if len(values) == 0:
        return []
    ema = [values[0]]
    for value in values[1:]:
        ema.append(alpha * value + (1 - alpha) * ema[-1])
    return ema

def exponential_moving_average(prev, actual, alpha=0.8):
    """Blend a new value into a running average.

    Args:
        prev: previous average.
        actual: newest observation.
        alpha: weight of the newest observation; ``1 - alpha`` goes to ``prev``.

    Returns:
        ``alpha * actual + (1 - alpha) * prev``.
    """
    fresh_part = alpha * actual
    carried_part = (1 - alpha) * prev
    return fresh_part + carried_part

import NeuralNetwork
# Module-level side effect: a saved network is read from disk as soon as this file runs.
# NOTE(review): trainAgent only consults a critic through its `critic` parameter, and its
# imitation branch is switched off, so this global looks unused in the visible code — confirm.
critic = NeuralNetwork.load('../DeepLearningProject/Networks/800') # The net that is used to dictate the best actions

def trainAgent(model,env, ValueNet, QNet, trainNets=False, randomExploration = 0, angleTreshold = 1, learninRate =  0.01, lambda_=0, iterationAmplification = 1, critic=None, print_=True):
    """Train a softmax policy network on ``env`` with batched policy-gradient updates.

    Episodes are played one after another. After each episode the shaped
    rewards are turned into normalised advantages; every 32 episodes the
    collected steps are handed to ``model.PolicyGradientDescent``. Every
    ``interval`` games the mean true episode reward is recorded and the
    hyperparameters are annealed according to it.

    Args:
        model: policy network; must provide ``feedForward`` and
            ``PolicyGradientDescent``.
        env: Gymnasium environment. It is closed before returning.
        ValueNet: state-value network, used only when ``trainNets`` is true.
        QNet: action-value network, used only when ``trainNets`` is true.
        trainNets: when true, fit ``ValueNet``/``QNet`` on every episode and use
            their difference as the advantage instead of the discounted rewards.
        randomExploration: initial probability of a uniformly random action
            (overwritten by the annealing schedule).
        angleTreshold: initial scale passed to ``reward_function``
            (overwritten by the annealing schedule).
        learninRate: initial learning rate (overwritten by the annealing schedule).
        lambda_: forwarded unchanged to ``model.PolicyGradientDescent``.
        iterationAmplification: game counter value at which training stops; it
            also caps the reporting interval (``min(100, iterationAmplification)``).
        critic: network consulted by the imitation branch, which is currently
            switched off via the local ``imitate`` flag.
        print_: print a progress line at every reporting interval.

    Returns:
        List of ``(game, meanReward)`` tuples, one per reporting interval.
    """
    envStates = env.observation_space.shape[0]
    env.action_space.seed()

    observation, info = env.reset()

    learningScore = []
    eta = learninRate
    interval = min(100, iterationAmplification)
    meanReward = 0
    game = 1
    batch = 0
    imitate = False  # manual switch for the imitation-learning branch below

    # Per-episode buffers (cleared whenever an episode ends).
    rewards = []
    trueRewards = []
    episodeStates = []
    episodeActions = []

    # Per-batch buffers (cleared after every policy update).
    states = []
    actions = []
    probs = []
    advantageList = np.array([])

    # Main loop
    while True:
        observation = observation.reshape(envStates,1)
        states.append(observation)
        episodeStates.append(observation)
        # Policy probabilities
        prob = model.feedForward(np.array(observation))
        # Policy action selection
        action, _ = sampleSoftmax(prob, randomExploration)

        # Imitation learning: half of the time replay the critic's choice instead
        if imitate and critic is not None and np.random.random()<0.5:
            action, _ = sampleSoftmax(critic.feedForward(np.array(observation)), 0)

        # Agent makes the action
        observation, trueReward, terminated, truncated, info = env.step(action)
        # Reward shaping
        reward = reward_function(observation, angleTreshold)
        # One-hot encoded action vector
        y = tuple(1 if i==action else 0 for i in range(len(prob)))

        probs.append(prob)
        actions.append(y)
        rewards.append(reward)
        trueRewards.append(trueReward)
        episodeActions.append(action)

        if not (terminated or truncated):
            continue

        # ---- Episode terminated ----
        meanReward += sum(trueRewards)
        # Clear per episode so each game contributes only its own return to the mean.
        trueRewards = []
        game += 1
        # The terminal state is kept for the critic targets only; adding it to the
        # policy batch would leave `states` one entry longer per episode than
        # `actions`/`probs`/advantages.
        episodeStates.append(observation.reshape(envStates,1))

        # Hyperparameter annealing: slowly modify hyperparameters in order to help the learning
        if not game%interval:
            meanReward /= interval
            if print_: print(f' game: {game}, mean reward: {meanReward}, at eta: {eta}, exp: {randomExploration}, angle: {angleTreshold}, lambda: {lambda_}')
            learningScore.append((game, meanReward))
            # `>=` so that a limit that is not hit exactly (e.g. the default of 1)
            # still stops the loop.
            if meanReward >= 500 or game >= iterationAmplification:
                break
            if meanReward>=400:
                eta = 0.0001; randomExploration = 0.01; angleTreshold=.5
            elif meanReward>=100:
                eta = 0.005; randomExploration=0.05; angleTreshold=.8
            elif meanReward<20:
                randomExploration=0.8; eta=0.01
            else:
                eta = 0.001; randomExploration = 0.05; angleTreshold=1
            meanReward = 0

        # Discounted shaped rewards are the default advantage
        advantage = discountedRewards(rewards, .9)

        # Training of the State-Value net and of the Action-Value net for a better baseline and advantage
        if trainNets:
            gamma=0.99
            valueStates=[]
            actionvalueStates=[]
            # NOTE(review): both targets bootstrap from the estimate at the *current*
            # state although the names say "next"; left as written — confirm intent.
            for i in range(len(rewards)):
                takenAction = episodeActions[i]
                # ValueNet estimates
                V_next_estimate = ValueNet.feedForward(episodeStates[i])
                valueStates.append(rewards[i] + gamma * V_next_estimate)
                # QNet estimates
                actionValueVector=[QNet.feedForward(np.append(episodeStates[i], [a]))[a] for a in range(env.action_space.n)]
                actionValueVector = [el.tolist() for el in actionValueVector]
                Q_next_state = QNet.feedForward(np.append(episodeStates[i], takenAction))[takenAction]
                actionValueVector[takenAction] = [rewards[i] + gamma * np.max(Q_next_state)]
                actionvalueStates.append(np.array(actionValueVector).reshape(-1))

            # The terminal state (last entry of episodeStates) gets a value target of 0.
            V_train = [(state,target) for state, target in zip(episodeStates, valueStates+[0])]
            Q_train = [(np.append(state,[a]).reshape(-1,1), target.reshape(-1)) for state, a, target in zip(episodeStates[:-1], episodeActions, actionvalueStates)]

            ValueNet.SGD(V_train, 20, 64, 0.1)
            QNet.SGD(Q_train, 20, 64, 0.1)

            # NOTE(review): `el[0]` keeps the first QNet output row regardless of the
            # action actually taken — confirm this is the intended advantage.
            advantage = [QNet.feedForward(np.append(episodeStates[i], episodeActions[i])) - ValueNet.feedForward(episodeStates[i]) for i in range(len(rewards))]
            advantage = [el[0] for el in advantage]

        # Scaling and normalization of the returns in order to reduce their variance and improve learning
        returns = np.array(advantage)
        eps = np.finfo(np.float32).eps.item()
        returns = (returns - returns.mean()) / (returns.std() + eps)
        advantageList = np.append(advantageList, returns)

        rewards = []
        episodeStates = []
        episodeActions = []

        batch += 1
        if not batch%32:
            batch = 0
            # Apply policy gradient descent on the whole batch of episodes
            model.PolicyGradientDescent(states, actions, advantageList, probs, eta=eta, lambda_=lambda_)
            advantageList = np.array([])
            probs = []
            actions = []
            states = []

        # A finished episode must be followed by reset() before stepping again;
        # this used to happen only once per batch.
        observation, info = env.reset()

    env.close()
    return learningScore

def showAgent(model,env, games=10, flag=False):
    """Run the policy ``model`` in ``env`` for ``games`` episodes without exploration.

    Actions are sampled from the policy output through ``sampleSoftmax`` with
    epsilon 0. The environment is closed when all games have been played.

    Args:
        model: network exposing ``feedForward``.
        env: Gymnasium environment (create it with a render mode to watch it).
        games: number of episodes to play.
        flag: when true, print the total reward of every finished episode.
    """
    observation, info = env.reset()
    envStates = env.observation_space.shape[0]
    observation = observation.reshape(envStates,1)
    episodeScore = 0

    while games:
        column = observation.reshape(envStates,1)
        prob = model.feedForward(np.array(column))
        action, _ = sampleSoftmax(prob, 0)
        observation, reward, terminated, truncated, info = env.step(action)
        episodeScore = episodeScore + reward

        if not (terminated or truncated):
            continue

        # Episode finished: report, then start the next one.
        if flag:
            print(episodeScore)
        episodeScore = 0
        observation, info = env.reset()
        games -= 1

    env.close()

def train_V_Q_Nets(model, env, ValueNet, QNet, games=10, epochs=100, gamma=0.95, flag=False):
    """Fit ``ValueNet`` and ``QNet`` on episodes played by ``model`` (no exploration).

    One episode is played at a time; when it ends, bootstrapped targets are
    built from the environment rewards and both networks are trained on that
    episode with ``SGD``.

    Args:
        model: policy network exposing ``feedForward``.
        env: Gymnasium environment (not closed by this function).
        ValueNet: state-value network exposing ``feedForward`` and ``SGD``.
        QNet: action-value network exposing ``feedForward`` and ``SGD``.
        games: number of episodes to play.
        epochs: epochs passed to each ``SGD`` call.
        gamma: discount factor used in the targets.
        flag: when true, print the total reward of every finished episode.
    """
    observation, info = env.reset()
    envStates = env.observation_space.shape[0]
    rewards = []
    states = []
    actionSingleIndex = []

    while games:
        observation = observation.reshape(envStates,1)
        states.append(observation)
        prob = model.feedForward(np.array(observation))
        action, _ = sampleSoftmax(prob, 0)
        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        actionSingleIndex.append(action)

        if not (terminated or truncated):
            continue

        if flag: print(sum(rewards))

        # Keep the terminal state: `valueStates+[0]` pairs it with a zero target
        # and `states[:-1]` below strips it again for the Q samples. Without it
        # the last transition of every episode was silently dropped.
        states.append(observation.reshape(envStates,1))

        valueStates=[]
        actionvalueStates=[]
        # NOTE(review): both targets bootstrap from the estimate at the *current*
        # state although the names say "next"; left as written — confirm intent.
        for i in range(len(rewards)):
            takenAction = actionSingleIndex[i]
            # ValueNet estimates
            V_next_estimate = ValueNet.feedForward(states[i])
            valueStates.append(rewards[i] + gamma * V_next_estimate)
            # QNet estimates
            actionValueVector=[QNet.feedForward(np.append(states[i], [a]))[a] for a in range(env.action_space.n)]
            actionValueVector = [el.tolist() for el in actionValueVector]
            Q_next_state = QNet.feedForward(np.append(states[i], takenAction))[takenAction]
            actionValueVector[takenAction] = [rewards[i] + gamma * np.max(Q_next_state)]
            actionvalueStates.append(np.array(actionValueVector).reshape(-1))

        V_train = [(state,target) for state, target in zip(states, valueStates+[0])]
        Q_train = [(np.append(state,[a]).reshape(-1,1), target.reshape(-1)) for state, a, target in zip(states[:-1], actionSingleIndex, actionvalueStates)]

        ValueNet.SGD(V_train, epochs, 64, 1)
        QNet.SGD(Q_train, epochs, 64, 1)

        # Start the next game with empty buffers; otherwise index i of the new
        # episode's rewards would be matched with states/actions of the first game.
        rewards = []
        states = []
        actionSingleIndex = []
        observation, info = env.reset()
        games -= 1

# Replay a saved policy for one rendered CartPole episode and print its total reward.
import NeuralNetwork, gymnasium as gym
agent = NeuralNetwork.load('../DeepLearningProject/Networks/800')
showAgent(agent, gym.make("CartPole-v1", render_mode="human"), games=1, flag=True)

# Bare literal with no effect — presumably the printed score captured from the run above.
500.0

# Same replay for a second saved network.
import NeuralNetwork, gymnasium as gym
agent = NeuralNetwork.load('../DeepLearningProject/Networks/4000_16_6')
showAgent(agent, gym.make("CartPole-v1", render_mode="human"), games=1, flag=True)

# Bare literal with no effect — presumably the printed score captured from the run above.
500.0

# Newly constructed state-value and action-value networks for CartPole.
# QNet takes the state plus one extra input (the action index) and has one output per action.
import NewNeuralNetwork
env = gym.make("CartPole-v1")
states = env.observation_space.shape[0]
actions = env.action_space.n
ValueNet = NewNeuralNetwork.Network([states, 64, 1], output=NewNeuralNetwork.Linear, cost=NewNeuralNetwork.QuadraticCost)
QNet = NewNeuralNetwork.Network([states+1, 64, actions], output=NewNeuralNetwork.Linear, cost=NewNeuralNetwork.QuadraticCost)
#ValueNet = NeuralNetwork.load('../DeepLearningProject/ValueNet')
#QNet = NeuralNetwork.load('../DeepLearningProject/QNet')

# train on cart-pole
import NeuralNetwork

# `states` / `actions` are module-level globals that computeTrajectories also reads.
env = gym.make("CartPole-v1")
states = env.observation_space.shape[0]
actions = env.action_space.n
model = NeuralNetwork.Network([states,128,64,actions])#, activation=NeuralNetwork.Network.ReLU)

# Runs at import time; trainNets=False, so ValueNet and QNet are passed but not trained here.
trainAgent(model, env, ValueNet, QNet, trainNets=False, angleTreshold=1, randomExploration=1, iterationAmplification=100000, learninRate=0.1, lambda_=0, critic=None)

#trainLander(model, env, None, ValueNet, None, trainNets=True, randomExploration=0.3, iterationAmplification=1000000, learninRate=1, lambda_=0.00001)

# Watch the freshly trained model (default of 10 games) in a rendered environment.
showAgent(model, gym.make("CartPole-v1", render_mode="human"), flag=True)

def computeTrajectories(n=10, hidden=None, print_=False):
    """Train ``n`` fresh policy networks with ``trainAgent`` and collect their learning curves.

    Relies on the module-level globals ``states``, ``actions`` and ``env``.

    Args:
        n: number of independent training runs.
        hidden: sizes of the hidden layers (any iterable of ints); ``None`` or an
            empty sequence means no hidden layer.
        print_: print an iteration header and forward the flag to ``trainAgent``.

    Returns:
        List with the ``trainAgent`` result (list of ``(game, meanReward)``) of each run.
    """
    # None sentinel instead of a shared mutable default list.
    hidden = [] if hidden is None else list(hidden)
    size = [states] + hidden + [actions]
    trajs = []
    for i in range(n):
        if print_: print(f'\nIteration {i+1}')
        model = NeuralNetwork.Network(size)
        # NOTE(review): trainAgent closes `env` before returning and the same global
        # env is reused by the next run — confirm the environment tolerates that.
        s = trainAgent(model, env, None, None, trainNets=False, angleTreshold=0.5, randomExploration=1, iterationAmplification=100000, learninRate=0.1, lambda_=0, print_=print_)
        trajs.append(s)
    return trajs

def plotTrajectories(trajs):
    """Plot every training trajectory as one randomly coloured line and show the figure.

    Args:
        trajs: iterable of trajectories, each a sequence of ``(game, score)`` pairs.
            Empty trajectories are skipped.
    """
    plotted = False
    for i, traj in enumerate(trajs):
        # zip(*traj) of an empty trajectory cannot be unpacked into x and y.
        if len(traj) == 0:
            continue

        # extract x and y values from each set
        x, y = zip(*traj)

        Rcolor = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        color = '#{:02x}{:02x}{:02x}'.format(*Rcolor)

        plt.plot(x, y, color=color, label=f'{i}')
        plotted = True

    # The per-line labels are only visible once a legend is drawn.
    if plotted:
        plt.legend()

    # add labels and title to the plot
    plt.xlabel('games')
    plt.ylabel('score')
    plt.title('Models training performance over 100\'000 trajectories ')

    # display the plot
    plt.show()

# Each call below trains 5 fresh CartPole policies with the given hidden layers at import
# time and plots their learning curves.
plotTrajectories(computeTrajectories(n=5, hidden=[4,4]))

plotTrajectories(computeTrajectories(n=5, hidden=[16,16]))

plotTrajectories(computeTrajectories(n=5, hidden=[4,4]))

# 64 x 64 x 64
import NeuralNetwork, gymnasium as gym
agent = NeuralNetwork.load('../DeepLearningProject/Networks/64x64x64_700')
showAgent(agent, gym.make("CartPole-v1", render_mode="human"), games=1, flag=True)

# Bare literal with no effect — presumably the printed score captured from the run above.
500.0

# 32 x 32
import NeuralNetwork, gymnasium as gym
agent = NeuralNetwork.load('../DeepLearningProject/Networks/2000_32_32')
showAgent(agent, gym.make("CartPole-v1", render_mode="human"), games=1, flag=True)

# Bare literal with no effect — presumably the printed score captured from the run above.
500.0

plotTrajectories(computeTrajectories(n=5, hidden=[16,16])) # actually 100'000 trajs.

plotTrajectories(computeTrajectories(n=5, hidden=[64,64])) #4 trajs batch, 10'000 trajs on y-axis

def sampleLander(softmaxOutput, epsilon, observation=None):
    """Sample a lander action; random exploration is disabled once both legs touch.

    Args:
        softmaxOutput: action probabilities, indexed as ``softmaxOutput[action][0]``.
        epsilon: probability of a uniformly random action while not landed.
        observation: current observation; entries 6 and 7 are read as the two
            leg-contact flags. ``None`` (the default) is treated as "not landed".

    Returns:
        ``(action, logProb)`` where ``logProb`` is the log of the policy
        probability of the chosen action (offset by 1e-16 to avoid ``log(0)``).
    """
    # The default observation=None used to fail on unpacking; without an
    # observation there is no contact information, so exploration stays enabled.
    landed = False
    if observation is not None:
        leg1_contact, leg2_contact = observation[6], observation[7]
        landed = bool(leg1_contact and leg2_contact)

    if not landed and random.uniform(0, 1) < epsilon:
        # Explore: select a random action
        sampledAction = random.randint(0, len(softmaxOutput) - 1)
    else:
        # Exploit: inverse-CDF sampling from the policy distribution
        rand = np.random.random()
        cdf = np.cumsum(softmaxOutput)
        sampledAction = np.argmax(cdf > rand)
    return sampledAction, np.log(softmaxOutput[sampledAction][0]+1e-16)

def reward_function(state, action, next_state):
    """Hand-crafted lander reward computed from the pre-step ``state`` and the ``action``.

    The reward is the sum of:
      * a landing bonus: 100 when both leg flags equal 1, speed is below 0.05 and
        ``abs(pos_x - 0.5) < 0.5``; 10 for the same touchdown farther away; else 0;
      * a control penalty: -0.1 for action 2, -0.01 for actions 1 and 3, else 0;
      * a velocity penalty: minus the squared speed;
      * an attitude penalty: ``-abs(angular_vel) - abs(angle)``.

    Args:
        state: 8-component observation before the step.
        action: discrete action index.
        next_state: 8-component observation after the step (only its length is
            checked by unpacking; its values are not used).

    Returns:
        The combined reward.
    """
    pos_x, _pos_y, vel_x, vel_y, angle, angular_vel, left_leg, right_leg = state
    # Unpacked only so that a wrongly sized next_state is rejected.
    _nx, _ny, _nvx, _nvy, _nangle, _nav, _nleft, _nright = next_state

    speed = np.sqrt(vel_x**2 + vel_y**2)
    distance_to_pad = abs(pos_x - 0.5)

    # landing reward
    touched_down = left_leg == 1 and right_leg == 1 and speed<0.05
    if not touched_down:
        landing_reward = 0      # still in the air (or moving too fast)
    elif distance_to_pad < 0.5:
        landing_reward = 100    # landed on the pad
    else:
        landing_reward = 10     # landed but not on the pad

    # crash penalty (currently disabled)
    crash_penalty = 0

    # control penalty
    if action==2:
        control_penalty = -0.1
    elif action == 1 or action ==3:
        control_penalty = -0.01
    else:
        control_penalty = 0

    velocity_penalty = -speed**2
    angular_velocity_penalty = -1 * abs(angular_vel) - abs(angle)

    # combine rewards and penalties
    return landing_reward + crash_penalty + control_penalty + velocity_penalty + angular_velocity_penalty

def gae_advantages(advantages, gamma=0.99, gaeLambda=0.1):  # gamma: importance of future rewards; lambda: trade-off bias/variance
    """Exponentially accumulate per-step advantages backwards in time and normalise them.

    Entry ``t`` of the accumulated series is
    ``advantages[t] + gamma * gaeLambda * accumulated[t+1]``. The series is then
    shifted to zero mean and scaled by its standard deviation (plus 1e-8).

    Args:
        advantages: sequence of per-step advantage terms.
        gamma: discount factor.
        gaeLambda: decay applied together with ``gamma``.

    Returns:
        ``numpy.ndarray`` of normalised values in the same (chronological) order
        as ``advantages``; an empty array for empty input.
    """
    if len(advantages) == 0:
        return np.array([])

    gae = 0
    accumulated = []
    for t in reversed(range(len(advantages))):
        # The decay multiplies the carried term (gamma * lambda); the earlier
        # `gamma + gaeLambda * gae` added gamma as a constant instead.
        gae = advantages[t] + gamma * gaeLambda * gae
        accumulated.append(gae)
    # Values were produced last-step-first; restore chronological order so that
    # entry t lines up with step t.
    accumulated.reverse()

    values = np.array(accumulated)

    # Normalize GAE advantages
    mean_advantage = np.mean(values)
    std_advantage = np.std(values) + 1e-8
    return (values - mean_advantage) / std_advantage

def compute_gae_advantages(advantages, gamma=0.99, gae_lambda=0.5): # gamma: importance of future rewards; lambda: trade-off bias/variance
    """Exponentially accumulate per-step advantage terms backwards in time.

    Entry ``t`` of the result is
    ``advantages[t] + gamma * gae_lambda * result[t+1]``. No normalisation is
    applied.

    Args:
        advantages: array-like of per-step terms (used as the per-step delta).
        gamma: discount factor.
        gae_lambda: decay applied together with ``gamma``.

    Returns:
        Floating-point ``numpy.ndarray`` with the same shape as ``advantages``.
    """
    deltas = np.asarray(advantages)
    # An integer input would otherwise give an integer output buffer and the
    # fractional part of every accumulated value would be truncated on assignment.
    if not np.issubdtype(deltas.dtype, np.floating):
        deltas = deltas.astype(float)

    gae_advantages = np.zeros_like(deltas)

    gae = 0
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + gamma * gae_lambda * gae
        gae_advantages[t] = gae

    return gae_advantages

def slow_reward(state, action, next_state):
    """Extra shaping reward that favours slow vertical motion and a gentle touchdown.

    * +150 when both leg-contact flags are set and the speed is below 0.05;
    * minus ``abs(vel_y)``;
    * +1 when ``abs(vel_y) < 0.1`` and ``y < 0.2``.

    Args:
        state: 8-component observation before the step.
        action: unused.
        next_state: 8-component observation after the step (only its length is
            checked by unpacking; its values are not used).

    Returns:
        The shaping reward.
    """
    _x, y, vel_x, vel_y, _angle, _spin, leg1_contact, leg2_contact = state
    # Unpacked only so that a wrongly sized next_state is rejected.
    _nx, _ny, _nvx, _nvy, _nangle, _nspin, _nleg1, _nleg2 = next_state

    speed = np.sqrt(vel_x ** 2 + vel_y ** 2)

    soft_touchdown = leg1_contact and leg2_contact and speed<0.05
    reward = 150 if soft_touchdown else 0

    reward -= 1 * abs(vel_y)
    slow_and_low = abs(vel_y)<0.1 and y<0.2
    if slow_and_low:
        reward += 1

    return reward

def landedLander(obs):
    """Tell whether the lander has touched down gently.

    Args:
        obs: 8-component observation; components 2 and 3 are the velocities and
            the last two are the leg-contact flags.

    Returns:
        ``(landed, rew)``: ``(True, 1000)`` when both legs are in contact and the
        speed is below 0.05, otherwise ``(False, 0)``.
    """
    _x, _y, vel_x, vel_y, _angle, _, leg1_contact, leg2_contact = obs
    speed = np.sqrt(vel_x ** 2 + vel_y ** 2)

    if not (leg1_contact and leg2_contact and speed<0.05):
        return False, 0
    return True, 1000

def trainLander(model,env, env2, ValueNet, QNet, trainNets=False, randomExploration = 0, learninRate =  0.01, lambda_=0, iterationAmplification = 1, print_=True):
    """Train a softmax policy network on a lander environment with batched policy gradients.

    Episodes are played one after another; the environment reward is augmented
    with ``slow_reward``. After each episode the discounted rewards (or, with
    ``trainNets``, value-baselined advantages) are normalised and queued; every
    8 episodes the queued steps are handed to ``model.PolicyGradientDescent``.
    Every ``interval`` games the mean true episode reward is recorded and the
    learning rate / exploration rate are annealed according to it.

    Args:
        model: policy network; must provide ``feedForward`` and
            ``PolicyGradientDescent``.
        env: main Gymnasium environment. It is closed before returning.
        env2: optional second environment; after each policy update there is a
            5% chance that the next batch is played on it instead of ``env``.
            It is not closed here.
        ValueNet: state-value network, used only when ``trainNets`` is true.
        QNet: unused.
        trainNets: when true, fit ``ValueNet`` on each episode's discounted
            rewards and use ``compute_gae_advantages`` of the residuals.
        randomExploration: initial probability of a random action (annealed).
        learninRate: initial learning rate (annealed).
        lambda_: forwarded unchanged to ``model.PolicyGradientDescent``.
        iterationAmplification: game counter value at which training stops; it
            also caps the reporting interval (``min(100, iterationAmplification)``).
        print_: print a progress line at every reporting interval.

    Returns:
        List of ``(game, meanReward)`` tuples, one per reporting interval.
    """
    envStates = env.observation_space.shape[0]
    env.action_space.seed(0)
    seed = None

    mainEnv = env
    observation, info = mainEnv.reset(seed=seed)

    learningScore = []
    eta = learninRate
    interval = min(100, iterationAmplification)
    meanReward = 0
    game = 1
    batch = 0

    # Per-episode buffers (cleared whenever an episode ends).
    rewards = []
    trueRewards = []
    episodeStates = []

    # Per-batch buffers (cleared after every policy update).
    states = []
    actions = []
    probs = []
    advantageList = np.array([])

    while True:
        observation = observation.reshape(envStates,1)
        states.append(observation)
        episodeStates.append(observation)
        prob = model.feedForward(np.array(observation))
        action, _ = sampleLander(prob, randomExploration, observation)

        old_obs = observation
        observation, trueReward, terminated, truncated, info = mainEnv.step(action)
        # Environment reward plus the hand-crafted shaping term
        reward = trueReward + slow_reward(state=old_obs, action=action, next_state=observation)
        # One-hot encoded action vector
        y = tuple(1 if i==action else 0 for i in range(len(prob)))

        probs.append(prob)
        actions.append(y)
        rewards.append(reward)
        trueRewards.append(trueReward)

        if not (terminated or truncated):
            continue

        # ---- Episode terminated ----
        meanReward += sum(trueRewards)
        # Clear per episode so each game contributes only its own return to the mean.
        trueRewards = []
        game += 1

        if not game%interval:
            meanReward /= interval
            if print_: print(f' game: {game}, mean reward: {meanReward}, at eta: {eta}, exp: {randomExploration}, lambda: {lambda_}')
            learningScore.append((game, meanReward))
            # `>=` so that a limit that is not hit exactly (e.g. the default of 1)
            # still stops the loop.
            if game >= iterationAmplification:
                break
            if meanReward>=400:
                eta = 0.001; randomExploration = 0
            elif meanReward>=100:
                eta = 0.01; randomExploration *= 0.9
            elif meanReward>=-100:
                eta = 1; randomExploration *= 0.9
            else:
                eta = 1; randomExploration *= 0.99
            meanReward = 0

        dR = discountedRewards(rewards, 0.8)
        returns = np.array(dR)

        if trainNets:
            # Fit the baseline on this episode only: `dR` is per-episode, so it must
            # be paired with this episode's states, not the whole batch.
            V_train = [(state,target.reshape(-1,1)) for state, target in zip(episodeStates, returns)]
            ValueNet.SGD(V_train, 10, 100, 0.01)

            advantage = [dR[i] - ValueNet.feedForward(episodeStates[i]) for i in range(len(episodeStates))]
            returns = np.array(compute_gae_advantages(advantage))

        # Normalise to reduce variance
        eps = np.finfo(np.float32).eps.item()
        returns = (returns - returns.mean()) / (returns.std() + eps)
        advantageList = np.append(advantageList, returns)

        rewards = []
        episodeStates = []

        batch += 1
        if not batch%8:
            batch = 0
            # Use the advantages of the whole batch: `states`, `actions` and `probs`
            # cover all 8 episodes, whereas `returns` only holds the last one.
            model.PolicyGradientDescent(states, actions, advantageList, probs, eta=eta, lambda_=lambda_)

            advantageList = np.array([])
            probs = []
            actions = []
            states = []
            # Occasionally play the next batch on the secondary environment
            if random.random()<0.05 and env2: mainEnv=env2
            else: mainEnv=env

        # A finished episode must be followed by reset() before stepping again;
        # this used to happen only once per batch.
        observation, info = mainEnv.reset(seed=seed)

    env.close()
    return learningScore

import NeuralNetwork
import NewNeuralNetwork
# Two LunarLander environments: `env` without rendering, `env2` with on-screen rendering
# (trainLander occasionally plays a batch on env2).
# These assignments rebind the module-level `env`, `states`, `actions` and `ValueNet`
# that the CartPole code above also uses.
env = gym.make("LunarLander-v2", enable_wind=False, gravity=-7.0, continuous=False,wind_power = 0, turbulence_power = 0)
env2 = gym.make("LunarLander-v2", enable_wind = False, render_mode="human", gravity=-7.0)
states = env.observation_space.shape[0]
actions = env.action_space.n
ValueNet = NewNeuralNetwork.Network([states,16,16,16,1], output=NewNeuralNetwork.Linear, cost=NewNeuralNetwork.QuadraticCost)

landerModel = NeuralNetwork.Network([states,512,256,128,64,16,actions])

# Runs at import time; trainNets=False, so ValueNet is passed but not trained here.
trainLander(landerModel, env, env2, ValueNet, None, trainNets=False, randomExploration=.8, iterationAmplification=100000, learninRate=1, lambda_=0)

def computeLanderTrajectories(n=10, hidden=None, print_=False):
    """Train ``n`` fresh lander policies with ``trainLander`` and collect their learning curves.

    Relies on the module-level globals ``states``, ``actions``, ``env`` and ``env2``.

    Args:
        n: number of independent training runs.
        hidden: sizes of the hidden layers (any iterable of ints); ``None`` or an
            empty sequence means no hidden layer.
        print_: print an iteration header and forward the flag to ``trainLander``.

    Returns:
        List with the ``trainLander`` result (list of ``(game, meanReward)``) of each run.
    """
    # None sentinel instead of a shared mutable default list.
    hidden = [] if hidden is None else list(hidden)
    size = [states] + hidden + [actions]
    trajs = []
    for i in range(n):
        if print_: print(f'\nIteration {i+1}')
        landerModel = NeuralNetwork.Network(size)
        # NOTE(review): trainLander closes `env` before returning and the same global
        # env is reused by the next run — confirm the environment tolerates that.
        s = trainLander(landerModel, env, env2, None, None, trainNets=False, randomExploration=0.2, iterationAmplification=2000, learninRate=1, lambda_=0, print_=print_)
        trajs.append(s)
    return trajs

# Each call trains 5 fresh lander policies (hidden layers 96-64-64) at import time and
# plots their learning curves; the second call repeats the same experiment.
plotTrajectories(computeLanderTrajectories(n=5, hidden=[96,64,64]))

plotTrajectories(computeLanderTrajectories(n=5, hidden=[96,64,64]))

def trainMountainCar(model,env, env2, ValueNet, QNet=None, trainNets=False, randomExploration = 0, learninRate =  0.01, lambda_=0, iterationAmplification = 1000, print_=True):
    """Train `model` on a MountainCar environment with batched policy-gradient updates.

    Episodes are played one after another; every `episodesPerUpdate` finished
    episodes the collected transitions are passed to
    `model.PolicyGradientDescent` in a single call.

    Parameters
    ----------
    model : policy network exposing `feedForward` and `PolicyGradientDescent`.
    env : environment used for training; it is closed before returning.
    env2 : optional rendering environment. It is only used with probability
        `renderProbability`, which is currently 0, so it is effectively unused.
    ValueNet : value network, only used when `trainNets` is True.
    QNet : unused, kept for signature compatibility.
    trainNets : when True, fit `ValueNet` on the episode returns and replace the
        returns with the output of `compute_gae_advantages`.
    randomExploration : epsilon handed to `sampleSoftmax`; it is re-drawn
        uniformly in [0, 1) after every reporting interval.
    learninRate : step size forwarded as `eta`.
    lambda_ : forwarded unchanged to `PolicyGradientDescent`.
    iterationAmplification : training stops once the game counter (which starts
        at 1) reaches this value.
    print_ : print one progress line per reporting interval.

    Returns
    -------
    list of (game, meanReward) tuples, one per reporting interval, including
    the final one.
    """
    envStates = env.observation_space.shape[0]
    env.action_space.seed(0)
    seed = None
    eta = learninRate
    # Reporting interval in episodes; clamped to 1 so the modulo below is always defined.
    interval = max(1, min(100, iterationAmplification))
    episodesPerUpdate = 4   # finished episodes collected before each policy update
    renderProbability = 0   # chance of playing an episode on env2 instead of env

    learningScore = []

    # Batch buffers: parallel lists covering every step since the last policy update.
    states, actions, probs, batchReturns = [], [], [], []
    # Episode buffers: cleared at the end of every episode.
    rewards, trueRewards = [], []
    episodeStart = 0        # index in `states` where the running episode begins

    game = 1
    batch = 0
    windowReward = 0        # sum of unshaped episode scores in the current reporting window
    windowEpisodes = 0      # number of episodes in the current reporting window

    mainEnv = env
    observation, info = mainEnv.reset(seed=seed)

    while True:
        observation = observation.reshape(envStates, 1)
        states.append(observation)
        prob = model.feedForward(np.array(observation))
        action, _ = sampleSoftmax(prob, randomExploration)

        observation, trueReward, terminated, truncated, info = mainEnv.step(action)

        # Reward shaping: add the magnitude of observation[1] and a bonus of 100
        # once observation[0] reaches 0.5.
        reward = trueReward
        reward += abs(observation[1])
        if observation[0] >= 0.5: reward += 100

        probs.append(prob)
        actions.append(tuple(1 if i == action else 0 for i in range(len(prob))))  # one-hot action
        rewards.append(reward)
        trueRewards.append(trueReward)

        if not (terminated or truncated):
            continue

        # ---- end of episode -------------------------------------------------
        windowReward += sum(trueRewards)
        windowEpisodes += 1
        game += 1

        # The stop condition is checked after every episode, so a budget that is
        # not a multiple of `interval` still terminates.
        finished = game >= iterationAmplification
        if finished or not game % interval:
            # Average over the episodes actually played in this window.
            meanReward = windowReward / windowEpisodes
            if print_: print(f' game: {game}, mean reward: {meanReward}, at eta: {eta}, exp: {randomExploration}, lambda: {lambda_}')
            # Record before leaving so the last interval is part of the result.
            learningScore.append((game, meanReward))
            if finished: break
            randomExploration = np.random.random()
            windowReward = 0
            windowEpisodes = 0

        # Undiscounted returns (gamma = 1) of the episode that just ended.
        dR = discountedRewards(rewards, 1)
        returns = np.array(dR)

        if trainNets:
            # Only this episode's states line up with `dR`.
            episodeStates = states[episodeStart:]
            V_train = [(state, target.reshape(-1,1)) for state, target in zip(episodeStates, returns)]
            ValueNet.SGD(V_train, 10, 100, 0.01)
            advantage = [dR[i] - ValueNet.feedForward(state) for i, state in enumerate(episodeStates)]
            gae = compute_gae_advantages(advantage)
            returns = np.array(gae)

        # Keep the returns aligned with states / actions / probs across the batch.
        batchReturns.extend(returns)

        rewards = []
        trueRewards = []
        episodeStart = len(states)

        batch += 1
        if batch >= episodesPerUpdate:
            batch = 0
            # NOTE(review): PolicyGradientDescent is assumed to take parallel per-step
            # sequences - confirm against NeuralNetwork.Network.
            model.PolicyGradientDescent(states, actions, np.array(batchReturns), probs, eta=eta, lambda_=lambda_)
            states, actions, probs, batchReturns = [], [], [], []
            episodeStart = 0

        # A finished episode must be followed by reset() before stepping again.
        if env2 is not None and random.random() < renderProbability: mainEnv = env2
        else: mainEnv = env
        observation, info = mainEnv.reset(seed=seed)

    env.close()
    return learningScore

import NewNeuralNetwork
# Training environment, plus a rendering copy passed to the trainer as env2.
env3 = gym.make('MountainCar-v0')
env4 = gym.make('MountainCar-v0', render_mode="human")

# Network input / output sizes are read from the environment spaces.
states = env3.observation_space.shape[0]
actions = env3.action_space.n

# Value network: three hidden layers of 16 units, linear output, quadratic cost.
VNet = NewNeuralNetwork.Network(
    [states, 16, 16, 16, 1],
    cost=NewNeuralNetwork.QuadraticCost,
    output=NewNeuralNetwork.Linear,
)

# Policy network without hidden layers.
mountainModel = NeuralNetwork.Network([states, actions])

trainMountainCar(
    mountainModel, env3, env4,
    ValueNet=VNet,
    trainNets=False,
    randomExploration=1,
    learninRate=100,
    lambda_=0,
    iterationAmplification=2000,
)

 game: 100, mean reward: -93.06, at eta: 100, exp: 1, lambda: 0
 game: 200, mean reward: -95.12, at eta: 100, exp: 0.8427099447597383, lambda: 0
 game: 300, mean reward: -103.09, at eta: 100, exp: 0.5823987575828864, lambda: 0
 game: 400, mean reward: -99.77, at eta: 100, exp: 0.5947802796644819, lambda: 0
 game: 500, mean reward: -102.08, at eta: 100, exp: 0.4796995292898151, lambda: 0
 game: 600, mean reward: -95.73, at eta: 100, exp: 0.9325838281622248, lambda: 0
 game: 700, mean reward: -94.3, at eta: 100, exp: 0.9021528691359678, lambda: 0
 game: 800, mean reward: -102.89, at eta: 100, exp: 0.471950111762633, lambda: 0
 game: 900, mean reward: -95.31, at eta: 100, exp: 0.6047001552517405, lambda: 0
 game: 1000, mean reward: -103.44, at eta: 100, exp: 0.13990346753078653, lambda: 0
 game: 1100, mean reward: -97.5, at eta: 100, exp: 0.6894720487673404, lambda: 0
 game: 1200, mean reward: -97.02, at eta: 100, exp: 0.7506466582443434, lambda: 0
 game: 1300, mean reward: -99.6, at eta: 100, exp: 0.18914011306510192, lambda: 0
 game: 1400, mean reward: -95.98, at eta: 100, exp: 0.08138415071598315, lambda: 0
 game: 1500, mean reward: -104.19, at eta: 100, exp: 0.44290048550530237, lambda: 0
 game: 1600, mean reward: -97.12, at eta: 100, exp: 0.865375662491242, lambda: 0
 game: 1700, mean reward: -104.85, at eta: 100, exp: 0.1929234337707728, lambda: 0
 game: 1800, mean reward: -106.66, at eta: 100, exp: 0.3463384683289228, lambda: 0
 game: 1900, mean reward: -98.67, at eta: 100, exp: 0.23242966672668341, lambda: 0
 game: 2000, mean reward: -100.81, at eta: 100, exp: 0.7249166952132159, lambda: 0

[(100, -93.06),
 (200, -95.12),
 (300, -103.09),
 (400, -99.77),
 (500, -102.08),
 (600, -95.73),
 (700, -94.3),
 (800, -102.89),
 (900, -95.31),
 (1000, -103.44),
 (1100, -97.5),
 (1200, -97.02),
 (1300, -99.6),
 (1400, -95.98),
 (1500, -104.19),
 (1600, -97.12),
 (1700, -104.85),
 (1800, -106.66),
 (1900, -98.67)]

# Play one rendered MountainCar episode with the policy trained above.
showAgent(env=gym.make('MountainCar-v0', render_mode="human"), model=mountainModel, games=1, flag=True)

-128.0

# Store the current policy under the MC_01 name.
mountainModel.save("../DeepLearningProject/Networks/MC_01")

# Rebind mountainModel to the network stored as MC_00 (a different file from the one just saved).
mountainModel = NeuralNetwork.load("../DeepLearningProject/Networks/MC_00")

def computeMCTrajectories(n=10, hidden=None, print_=False):
    """Train `n` freshly initialised MountainCar policies and collect their learning curves.

    Parameters
    ----------
    n : int
        Number of independent training runs.
    hidden : sequence of int, optional
        Sizes of the hidden layers; None (the default) or an empty sequence
        builds a network with no hidden layer.
    print_ : bool
        When True, print an iteration header here and forward the flag to trainMountainCar.

    Returns
    -------
    list
        One learning-score list per run, as returned by trainMountainCar.
    """
    # The default is None rather than a shared mutable list.
    hiddenLayers = [] if hidden is None else list(hidden)
    # Layer sizes are read from the environment that is actually trained on instead of
    # the module-level `states` / `actions`, which other cells rebind.
    size = [env3.observation_space.shape[0]] + hiddenLayers + [env3.action_space.n]

    trajs = []
    for i in range(n):
        if print_: print(f'\nIteration {i+1}')
        mountainModel = NeuralNetwork.Network(size)
        s = trainMountainCar(mountainModel, env3, env4, ValueNet=VNet, trainNets=False, randomExploration = 1, iterationAmplification=2000, lambda_=0, learninRate=100, print_=print_)
        trajs.append(s)
    return trajs

# Default arguments: ten runs of a policy with no hidden layer.
plotTrajectories(computeMCTrajectories())

# Five runs with a single hidden layer of 8 units.
plotTrajectories(computeMCTrajectories(n=5, hidden=[8]))

# Five runs with two hidden layers of 16 units each.
plotTrajectories(computeMCTrajectories(n=5, hidden=[16,16]))

# Five runs with no hidden layer.
plotTrajectories(computeMCTrajectories(n=5))

import NeuralNetwork, gymnasium as gym
# Load the saved MC_00 policy and play one rendered episode.
# Arguments are passed by keyword (the same names the earlier showAgent call in this
# file uses), so the call does not depend on showAgent's positional parameter order.
agent = NeuralNetwork.load('../DeepLearningProject/Networks/MC_00')
showAgent(model=agent, env=gym.make("MountainCar-v0", render_mode="human"), games=1, flag=True)

-122.0

import NeuralNetwork, gymnasium as gym
# Load the saved MC_01 policy and play one rendered episode.
# Arguments are passed by keyword (the same names the earlier showAgent call in this
# file uses), so the call does not depend on showAgent's positional parameter order.
agent = NeuralNetwork.load('../DeepLearningProject/Networks/MC_01')
showAgent(model=agent, env=gym.make("MountainCar-v0", render_mode="human"), games=1, flag=True)

-122.0

Policy Gradient Descent from scratch:¶

Implementing REINFORCE Algorithm with no use of Deep Learning Libraries¶

What is Reinforcement Learning?¶

POLICY GRADIENT¶

OVERTURE¶

Markov Decision Processes¶

Policy¶

Policy Gradient Descent¶

REINFORCE ALGORITHM¶

Our Implementation¶

Reinforce Gradient¶

Where does this loss function come from?¶

In the REINFORCE algorithm, we estimate this gradient using Monte Carlo sampling, and we approximate the expectation by averaging over multiple sampled trajectories.¶

First Term:¶

DERIVATION:¶

Second Term:¶

DERIVATION:¶

The Code¶

Thoughts and impressions¶

Cart Pole!¶

Details:¶

Action Space¶

Observation Space¶

Rewards¶

Episode End¶

We now define some of the functions we need for our agent, we will go through each of them as we define them¶

Discounted Rewards¶

Epsilon-greedy exploration & Epsilon decay¶

Reward shaping¶

Baseline¶

Imitation Learning¶

Learning Process!¶

Run the cells below to see the agents playing the cartpole!¶

These are some examples of agents that successfully learned to play the cartpole, we will dive into the details soon!¶

Code used for the learning of the agents:¶

Code used to compute and plot the trajectories of an agent¶

We struggled a lot to find a working set of hyperparameters and had to experiment quite a bit with all the different techniques.¶

4x4 and 16x16¶

Weight Decay:¶

Nevertheless, our implementation was able to train very large networks, even with 64x64x64 hidden layers.¶

Lunar Lander¶

Details:¶

Action Space¶

Observation Space¶

Rewards¶

Episode End¶

Train lander using Proximal Policy Optimization¶

Demonstration code, used for training:¶

Proximal Policy Optimization¶

Extra:¶

Mountain Car¶

Demonstration code, used for testing the training¶

Imitation Learning:¶

No hidden layers:¶

Hidden layers:¶

PPO:¶

Our Agents:¶

Final Considerations¶

Thank you for the attention and support, it has been a pleasure working on this project, and we sincerely appreciated this opportunity to learn and grow.¶

Matteo and Riccardo De Sanctis.¶