[WIP] ReinforcementLearning.jl integration #9

Draft
wants to merge 9 commits into base: main
1 change: 1 addition & 0 deletions Project.toml
@@ -20,6 +20,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Polyhedra = "67491407-f73d-577b-9b50-8179a7c68029"
Quaternions = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
51 changes: 25 additions & 26 deletions environments/environment.jl
@@ -31,7 +31,7 @@ mutable struct Environment{X,T,M,A,O,I}
dynamics_jacobian_state::Matrix{T}
dynamics_jacobian_input::Matrix{T}
input_previous::Vector{T}
- control_map::Matrix{T}
+ control_map::Matrix{T}
num_states::Int
num_inputs::Int
num_observations::Int
@@ -66,33 +66,33 @@ end
attitude_decompress: flag for pre- and post-concatenating Jacobians with attitude Jacobians
"""
function Base.step(env::Environment, x, u;
- gradients=false,
- attitude_decompress=false)
+ gradients = false,
+ attitude_decompress = false)

mechanism = env.mechanism
- timestep= mechanism.timestep
+ timestep = mechanism.timestep

x0 = x
# u = clip(env.input_space, u) # control limits
env.input_previous .= u # for rendering in Gym
- u_scaled = env.control_map * u
+ u_scaled = env.control_map * u

z0 = env.representation == :minimal ? minimal_to_maximal(mechanism, x0) : x0
- z1 = step!(mechanism, z0, u_scaled; opts=env.opts_step)
+ z1 = step!(mechanism, z0, u_scaled; opts = env.opts_step)
env.state .= env.representation == :minimal ? maximal_to_minimal(mechanism, z1) : z1

# Compute cost
costs = cost(env, x, u)

- # Check termination
- done = is_done(env, x)
+ # Check termination
+ done = is_done(env, x)

# Gradients
if gradients
if env.representation == :minimal
- fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+ fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
elseif env.representation == :maximal
- fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+ fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
if attitude_decompress
A0 = attitude_jacobian(z0, length(env.mechanism.bodies))
A1 = attitude_jacobian(z1, length(env.mechanism.bodies))
@@ -109,11 +109,11 @@ function Base.step(env::Environment, u;
end

function Base.step(env::Environment, u;
- gradients=false,
- attitude_decompress=false)
- step(env, env.state, u;
- gradients=gradients,
- attitude_decompress=attitude_decompress)
+ gradients = false,
+ attitude_decompress = false)
+ step(env, env.state, u;
+ gradients = gradients,
+ attitude_decompress = attitude_decompress)
end

"""
@@ -156,7 +156,7 @@ is_done(env::Environment, x) = false
x: state
"""
function Base.reset(env::Environment{X};
- x=nothing) where X
+ x = nothing) where {X}

initialize!(env.mechanism, type2symbol(X))
if x != nothing
@@ -172,15 +172,15 @@ function Base.reset(env::Environment{X};
return get_observation(env)
end

- function MeshCat.render(env::Environment,
- mode="human")
+ function MeshCat.render(env::Environment,
+ mode = "human")
z = env.representation == :minimal ? minimal_to_maximal(env.mechanism, env.state) : env.state
- set_robot(env.vis, env.mechanism, z, name=:robot)
+ set_robot(env.vis, env.mechanism, z, name = :robot)
return nothing
end

- function seed(env::Environment; s=0)
- env.rng[1] = MersenneTwister(seed)
+ function seed(env::Environment, s = 0)
+ env.rng[1] = MersenneTwister(s)
return nothing
end

@@ -196,7 +196,7 @@ end
abstract type Space{T,N} end

"""
- BoxSpace{T,N} <: Environment{T,N}
+ BoxSpace{T,N} <: Space{T,N}

domain with lower and upper limits

@@ -214,12 +214,12 @@ mutable struct BoxSpace{T,N} <: Space{T,N}
dtype::DataType # this is always T, it's needed to interface with Stable-Baselines
end

- function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where T
+ function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where {T}
return BoxSpace{T,n}(n, low, high, (n,), T)
end

function sample(s::BoxSpace{T,N}) where {T,N}
- return rand(T,N) .* (s.high .- s.low) .+ s.low
+ return rand(T, N) .* (s.high .- s.low) .+ s.low
end

function contains(s::BoxSpace{T,N}, v::AbstractVector{T}) where {T,N}
@@ -230,5 +230,4 @@ function clip(s::BoxSpace, u)
clamp.(u, s.low, s.high)
end

-
-
+ Random.rand(rng::Random.AbstractRNG, s::BoxSpace{T,N}) where {T,N} = return rand(rng, T, N) .* (s.high .- s.low) .+ s.low
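
For context on the API these edits touch, a minimal usage sketch (illustrative only, not part of the diff; the "ant" environment name, the sample/reset helpers, and the 4-tuple return of step are assumptions drawn from this PR):

using Dojo

env = Dojo.get_environment("ant")      # build a Dojo Environment
x = reset(env)                         # initialize; returns the observation
u = Dojo.sample(env.input_space)       # random control drawn from the BoxSpace
x, r, done, info = step(env, u)        # advance one step, as the RL wrapper below assumes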
46 changes: 46 additions & 0 deletions environments/rlenv.jl
@@ -0,0 +1,46 @@
using ReinforcementLearningBase: RLBase

mutable struct DojoRLEnv{T} <: RLBase.AbstractEnv
dojoenv::Environment
state::Vector{T}
reward::T
done::Bool
info::Dict
end

function DojoRLEnv(dojoenv::Environment{X,T}) where {X,T}
state = reset(dojoenv)
return DojoRLEnv{T}(dojoenv, state, convert(T, 0.0), false, Dict())
end

function DojoRLEnv(name::String; kwargs...)
DojoRLEnv(Dojo.get_environment(name; kwargs...))
end

function Base.convert(::Type{RLBase.Space}, s::BoxSpace)
RLBase.Space([BoxSpace(1; low = s.low[i:i], high = s.high[i:i]) for i in 1:s.n])
end

RLBase.action_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.input_space)
RLBase.state_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.observation_space)
RLBase.is_terminated(env::DojoRLEnv) = env.done

RLBase.reset!(env::DojoRLEnv) = reset(env.dojoenv)

RLBase.reward(env::DojoRLEnv) = env.reward
RLBase.state(env::DojoRLEnv) = env.state

Random.seed!(env::DojoRLEnv, seed) = Dojo.seed(env.dojoenv, seed)

# TODO:
# RLBase.ChanceStyle(env::DojoRLEnv) = RLBase.DETERMINISTIC

function (env::DojoRLEnv)(a)
s, r, d, i = step(env.dojoenv, a)
env.state .= s
env.reward = r
env.done = d
env.info = i
return nothing
end
(env::DojoRLEnv)(a::Number) = env([a])
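
A minimal rollout sketch against this wrapper (illustrative, not part of the diff; it assumes the "ant" environment and the BoxSpace rand method added in environments/environment.jl above):

using Dojo, Random
using ReinforcementLearningBase: RLBase

rng = MersenneTwister(0)
env = Dojo.DojoRLEnv("ant")                  # wraps Dojo.get_environment("ant")
RLBase.reset!(env)
for t in 1:100
    a = rand(rng, env.dojoenv.input_space)   # random action from the underlying BoxSpace
    env(a)                                   # advance one step; updates state, reward, done
    RLBase.is_terminated(env) && break
end
RLBase.reward(env)                           # reward of the most recent step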
6 changes: 6 additions & 0 deletions examples/deeprl/Project.toml
@@ -0,0 +1,6 @@
[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Dojo = "ac60b53e-8d92-4c83-b960-e78698fa1916"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
80 changes: 80 additions & 0 deletions examples/deeprl/ant_ddpg.jl
@@ -0,0 +1,80 @@
using ReinforcementLearning
using Flux
using Flux.Losses

using Random
using Dojo

function RL.Experiment(
::Val{:JuliaRL},
::Val{:DDPG},
::Val{:DojoAnt},
::Nothing,
save_dir = nothing,
seed = 42
)

rng = MersenneTwister(seed)
env = Dojo.DojoRLEnv("ant")
Random.seed!(env, seed)
A = action_space(env)
ns, na = length(state(env)), length(action_space(env))
@show na

init = glorot_uniform(rng)

create_actor() = Chain(
Dense(ns, 30, relu; init = init),
Dense(30, 30, relu; init = init),
Dense(30, na, tanh; init = init),
)
create_critic() = Chain(
Dense(ns + na, 30, relu; init = init),
Dense(30, 30, relu; init = init),
Dense(30, 1; init = init),
)

agent = Agent(
policy = DDPGPolicy(
behavior_actor = NeuralNetworkApproximator(
model = create_actor(),
optimizer = ADAM(),
),
behavior_critic = NeuralNetworkApproximator(
model = create_critic(),
optimizer = ADAM(),
),
target_actor = NeuralNetworkApproximator(
model = create_actor(),
optimizer = ADAM(),
),
target_critic = NeuralNetworkApproximator(
model = create_critic(),
optimizer = ADAM(),
),
γ = 0.99f0,
ρ = 0.995f0,
na = na,
batch_size = 64,
start_steps = 1000,
start_policy = RandomPolicy(A; rng = rng),
update_after = 1000,
update_freq = 1,
act_limit = 1.0,
act_noise = 0.1,
rng = rng,
),
trajectory = CircularArraySARTTrajectory(
capacity = 10000,
state = Vector{Float32} => (ns,),
action = Float32 => (na, ),
),
)

stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalRewardPerEpisode()
Experiment(agent, env, stop_condition, hook, "# Dojo Ant with DDPG")
end

ex = E`JuliaRL_DDPG_DojoAnt`
run(ex)
73 changes: 73 additions & 0 deletions examples/deeprl/ant_ppo.jl
@@ -0,0 +1,73 @@
using ReinforcementLearning
using Flux
using Flux.Losses

using Random
using Distributions
using Dojo

function RL.Experiment(
::Val{:JuliaRL},
::Val{:PPO},
::Val{:DojoAnt},
::Nothing,
save_dir = nothing,
seed = 42
)
rng = MersenneTwister(seed)
N_ENV = 6
UPDATE_FREQ = 32
env_vec = [Dojo.DojoRLEnv("ant") for i in 1:N_ENV]
for i in 1:N_ENV
Random.seed!(env_vec[i], hash(seed+i))
end
env = MultiThreadEnv(env_vec)

ns, na = length(state(env[1])), length(action_space(env[1]))
RLBase.reset!(env; is_force=true)

agent = Agent(
policy = PPOPolicy(
approximator = ActorCritic(
actor = GaussianNetwork(
pre = Chain(
Dense(ns, 64, relu; init = glorot_uniform(rng)),
Dense(64, 64, relu; init = glorot_uniform(rng)),
),
μ = Chain(Dense(64, na, tanh; init = glorot_uniform(rng)), vec),
logσ = Chain(Dense(64, na; init = glorot_uniform(rng)), vec),
),
critic = Chain(
Dense(ns, 256, relu; init = glorot_uniform(rng)),
Dense(256, na; init = glorot_uniform(rng)),
),
optimizer = ADAM(1e-3),
),
γ = 0.99f0,
λ = 0.95f0,
clip_range = 0.1f0,
max_grad_norm = 0.5f0,
n_epochs = 4,
n_microbatches = 4,
actor_loss_weight = 1.0f0,
critic_loss_weight = 0.5f0,
entropy_loss_weight = 0.001f0,
dist = Normal,
update_freq = UPDATE_FREQ,
),
trajectory = PPOTrajectory(;
capacity = UPDATE_FREQ,
state = Matrix{Float32} => (ns, N_ENV),
action = Matrix{Float32} => (na, N_ENV),
action_log_prob = Vector{Float32} => (N_ENV,),
reward = Vector{Float32} => (N_ENV,),
terminal = Vector{Bool} => (N_ENV,),
),
)
stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalBatchRewardPerEpisode(N_ENV)
Experiment(agent, env, stop_condition, hook, "# PPO with Dojo Ant")
end

ex = E`JuliaRL_PPO_DojoAnt`
run(ex)