
Reinforcement Learning in Practice (3): Monte Carlo Basic (MC Basic \ MC Exploring Starts \ MC Epsilon-Greedy)

2024/10/6 Source: https://blog.csdn.net/qq_42828479/article/details/141829836


  • Pseudocode
    • MC Basic
    • MC Exploring Starts
    • MC Epsilon-Greedy
  • Code
  • Project Repository

Pseudocode

For a detailed understanding, see the theory post in this series, the comments in the code below, and Prof. Zhao's original book.

MC Basic

(Pseudocode screenshot from the original post; image not reproduced.)
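Since the screenshot is unavailable, here is a minimal sketch of the MC Basic structure as a rough stand-in, not the exact pseudocode from the book. It assumes a hypothetical helper sample_returns(policy, s, a) that returns a list of discounted returns for episodes started from (s, a) and then following policy; the concrete, environment-specific version is in the Code section below.

# Minimal sketch of MC Basic (sample_returns is a hypothetical helper).
def mc_basic_sketch(policy, states, actions, sample_returns, iterations=100):
    for _ in range(iterations):                      # k-th round of policy iteration
        for s in states:
            q = {}
            for a in actions:
                returns = sample_returns(policy, s, a)   # discounted returns of episodes from (s, a)
                q[a] = sum(returns) / len(returns)       # policy evaluation: q(s, a) ~ mean return
            policy[s] = max(q, key=q.get)                # policy improvement: greedy action
    return policy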

MC Exploring Starts

(Pseudocode screenshot from the original post; image not reproduced.)
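Again the screenshot is omitted. The sketch below (every-visit variant, with a hypothetical sample_episode helper that returns a list of (state, action, reward) triples) highlights the difference from MC Basic: a single episode is walked backwards so that every state-action pair visited along the way gets its estimate and its policy updated, which is where the better data utilization mentioned in the code comments comes from.

from collections import defaultdict

# Minimal sketch of MC Exploring Starts (every-visit variant; sample_episode is a hypothetical helper).
def mc_exploring_starts_sketch(policy, start_pairs, sample_episode, gamma=0.9, epochs=100):
    returns_sum = defaultdict(float)   # accumulated returns per (state, action)
    counts = defaultdict(int)          # visit counts per (state, action)
    q = defaultdict(dict)              # q[state][action]
    for _ in range(epochs):
        for s0, a0 in start_pairs:                        # exploring starts: every (s, a) starts an episode
            episode = sample_episode(policy, s0, a0)      # list of (state, action, reward) triples
            g = 0.0
            for s, a, r in reversed(episode):             # backward pass: g is the return from (s, a) onward
                g = r + gamma * g
                returns_sum[(s, a)] += g
                counts[(s, a)] += 1
                q[s][a] = returns_sum[(s, a)] / counts[(s, a)]
                policy[s] = max(q[s], key=q[s].get)       # greedy improvement for every visited state
    return policy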

MC Epsilon-Greedy

(Pseudocode screenshot from the original post; image not reproduced.)
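The screenshot is omitted here as well. Structurally the algorithm matches MC Exploring Starts, except that the improved policy is epsilon-greedy instead of fully greedy, so every action keeps probability epsilon/|A| and exploring starts are no longer required. A minimal sketch of that improvement step (function and parameter names are illustrative):

import numpy as np

def epsilon_greedy_improvement(qsa: np.ndarray, epsilon: float = 0.1) -> np.ndarray:
    """Return an epsilon-greedy action distribution for one state, given its q(s, .) estimates."""
    n_actions = qsa.shape[0]
    probs = np.full(n_actions, epsilon / n_actions)        # every action keeps epsilon / |A|
    probs[np.argmax(qsa)] = 1 - epsilon * (n_actions - 1) / n_actions   # remainder goes to the greedy action
    return probs

For example, with 5 actions and epsilon = 0.1 the greedy action gets probability 0.92 and each of the other four gets 0.02, which is what the other_probability computation in the code below produces.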

Code

import numpy as np
from tqdm import tqdm

from environment.env import Env
from environment.vis import Vis
from environment.enums import TrajItems


class MonteCarlo:
    """
    Monte Carlo is a way of approximating q(s, a): the true q(s, a) is replaced by the average
    discounted return over several actually sampled trajectories.
    Originally, policy evaluation (PE) computes state values and policy improvement (PI) then uses
    those values to update the policy. Why does MC approximate q(s, a) instead of the state value?
    Because expanding the PI step gives sigma(pi(a | s) * sigma(p(r | s, a) * r)), i.e. computing
    q(s, a) needs the state values together with the environment model p(r | s, a).
    Approximating q(s, a) directly is therefore what makes PI possible.
    """

    def __init__(self, gamma: float = 0.9, env: Env = None, vis: Vis = None, render: bool = False):
        self.gamma = gamma
        self.env = env
        self.vis = vis
        self.render = render
        self.policy = np.zeros(shape=(self.env.state_space_size, self.env.action_space_size), dtype=float)
        self.qtable = np.zeros(shape=self.env.state_space_size, dtype=float)

    def mc_basic(self, steps: int = 30, epochs: int = 100, trajectory_numbers: int = 1) -> None:
        """
        Basic MC: iterate over every state-action pair, sample several trajectories from each pair,
        and use the average discounted return of those trajectories as q(s, a).
        Note that the sampled trajectories are deterministic and identical here, because under the
        current policy exactly one action has probability 1 in each state and all others have probability 0.
        :param steps: length of each trajectory
        :param epochs: number of iterations
        :param trajectory_numbers: number of trajectories sampled per state-action pair, set to 1 here
        :return: None
        """
        self.init_policy()
        for _ in tqdm(range(epochs)):
            for state in self.env.state_space:
                qsa = np.zeros(shape=self.env.action_space_size, dtype=float)
                for action in self.env.action_space:
                    gs = np.zeros(shape=trajectory_numbers, dtype=float)
                    for traj_index in range(trajectory_numbers):
                        traj = self.env.episode(self.policy, state, action, steps)[::-1, :]
                        for step in range(steps):
                            gs[traj_index] = traj[step, TrajItems.REWARD.value] + self.gamma * gs[traj_index]
                    qsa[action] = gs.mean()
                self.policy[state] = np.zeros(shape=self.env.action_space_size)
                self.policy[state, np.argmax(qsa)] = 1
                self.qtable[state] = np.max(qsa)
        if self.render:
            self.vis.show_policy(self.policy)
            self.vis.show_value(self.qtable)
            self.vis.show()

    def mc_exploring_starts(self, steps: int = 30, epochs: int = 100) -> None:
        """
        To make sure every state-action pair is visited, the pairs are swept exhaustively.
        Overall this algorithm only improves data utilization; its actual performance is poor.
        :param steps: length of each trajectory
        :param epochs: number of iterations
        :return: None
        """
        self.init_policy()
        returns = np.zeros(shape=(self.env.state_space_size, self.env.action_space_size), dtype=float)
        nums = np.zeros(shape=(self.env.state_space_size, self.env.action_space_size), dtype=int)
        for _ in tqdm(range(epochs)):
            for state in self.env.state_space:
                qsa = np.zeros(shape=self.env.action_space_size, dtype=float)
                for action in self.env.action_space:
                    traj = self.env.episode(self.policy, state, action, steps)[::-1, :]
                    g = 0
                    for step in range(steps):
                        g = traj[step, TrajItems.REWARD.value] + self.gamma * g
                        traj_state = int(traj[step, TrajItems.STATE.value])
                        traj_action = int(traj[step, TrajItems.ACTION.value])
                        returns[traj_state, traj_action] += g
                        nums[traj_state, traj_action] += 1
                        qsa[traj_action] = returns[traj_state, traj_action] / nums[traj_state, traj_action]
                        self.policy[traj_state] = np.zeros(shape=self.env.action_space_size)
                        self.policy[traj_state, np.argmax(qsa)] = 1
                        self.qtable[traj_state] = np.max(qsa)
        if self.render:
            self.vis.show_policy(self.policy)
            self.vis.show_value(self.qtable)
            self.vis.show()

    def mc_epsilon_greedy(self, steps: int = 200, epochs: int = 2000, epsilon: float = 0.1) -> None:
        """
        Epsilon-greedy MC. In practice the result is as poor as the previous method, but the idea is important.
        :param steps: length of each trajectory
        :param epochs: number of iterations
        :param epsilon: exploration rate
        :return: None
        """
        self.init_policy()
        returns = np.zeros(shape=(self.env.state_space_size, self.env.action_space_size), dtype=float)
        nums = np.zeros(shape=(self.env.state_space_size, self.env.action_space_size), dtype=int)
        # for each episode, do
        for _ in tqdm(range(epochs)):
            state = np.random.choice(self.env.state_space)
            action = np.random.choice(self.env.action_space)
            qsa = np.zeros(shape=self.env.action_space_size, dtype=float)
            traj = self.env.episode(self.policy, state, action, steps)[::-1, :]
            g = 0
            for step in range(steps):
                g = traj[step, TrajItems.REWARD.value] + self.gamma * g
                traj_state = int(traj[step, TrajItems.STATE.value])
                traj_action = int(traj[step, TrajItems.ACTION.value])
                returns[traj_state, traj_action] += g
                nums[traj_state, traj_action] += 1
                qsa[traj_action] = returns[traj_state, traj_action] / nums[traj_state, traj_action]
                other_probability = epsilon * (1 / self.env.action_space_size)
                self.policy[traj_state] = np.ones(shape=self.env.action_space_size) * other_probability
                self.policy[traj_state, np.argmax(qsa)] = 1 - other_probability * (self.env.action_space_size - 1)
                self.qtable[traj_state] = np.max(qsa)
        if self.render:
            self.vis.show_policy(self.policy)
            self.vis.show_value(self.qtable)
            self.vis.show()

    def init_policy(self) -> None:
        """
        Randomly initialize a deterministic policy.
        :return: None
        """
        random_action = np.random.randint(self.env.action_space_size, size=self.env.state_space_size)
        for state, action in enumerate(random_action):
            self.policy[state, action] = 1


if __name__ == "__main__":
    start_state = [0, 0]
    target_state = [2, 3]
    forbid = [[2, 2], [2, 1], [1, 1], [3, 3], [1, 3], [1, 4]]
    model = MonteCarlo(vis=Vis(target_state=target_state, forbid=forbid),
                       env=Env(target_state=target_state, forbid=forbid),
                       render=True)
    model.mc_basic()
    # model.mc_exploring_starts()
    # model.mc_epsilon_greedy()
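One detail worth spelling out: all three methods reverse the trajectory ([::-1, :]) and then apply g = r + gamma * g at each step, which yields the discounted return of every visited step in a single backward pass. A tiny standalone check with made-up rewards:

gamma = 0.9
rewards = [0, 0, 1]           # hypothetical rewards of one trajectory, earliest step first
g = 0.0
returns_per_step = []
for r in reversed(rewards):   # walk the trajectory backwards
    g = r + gamma * g
    returns_per_step.append(g)
print(returns_per_step)       # ~[1.0, 0.9, 0.81]: returns of the last, middle, and first step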

Project Repository

RL_Algorithms (multi-agent algorithms are being added step by step; a STAR would be appreciated ^ - ^)
