[WIP] ReinforcementLearning.jl integration #9

Draft
wants to merge 9 commits into base: main
1 change: 1 addition & 0 deletions Project.toml
@@ -20,6 +20,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Polyhedra = "67491407-f73d-577b-9b50-8179a7c68029"
Quaternions = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
51 changes: 25 additions & 26 deletions environments/environment.jl
@@ -31,7 +31,7 @@ mutable struct Environment{X,T,M,A,O,I}
dynamics_jacobian_state::Matrix{T}
dynamics_jacobian_input::Matrix{T}
input_previous::Vector{T}
- control_map::Matrix{T}
+ control_map::Matrix{T}
num_states::Int
num_inputs::Int
num_observations::Int
@@ -66,33 +66,33 @@ end
attitude_decompress: flag for pre- and post-concatenating Jacobians with attitude Jacobians
"""
function Base.step(env::Environment, x, u;
- gradients=false,
- attitude_decompress=false)
+ gradients = false,
+ attitude_decompress = false)

mechanism = env.mechanism
- timestep= mechanism.timestep
+ timestep = mechanism.timestep

x0 = x
# u = clip(env.input_space, u) # control limits
env.input_previous .= u # for rendering in Gym
- u_scaled = env.control_map * u
+ u_scaled = env.control_map * u

z0 = env.representation == :minimal ? minimal_to_maximal(mechanism, x0) : x0
- z1 = step!(mechanism, z0, u_scaled; opts=env.opts_step)
+ z1 = step!(mechanism, z0, u_scaled; opts = env.opts_step)
env.state .= env.representation == :minimal ? maximal_to_minimal(mechanism, z1) : z1

# Compute cost
costs = cost(env, x, u)

- # Check termination
- done = is_done(env, x)
+ # Check termination
+ done = is_done(env, x)

# Gradients
if gradients
if env.representation == :minimal
- fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+ fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
elseif env.representation == :maximal
- fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+ fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
if attitude_decompress
A0 = attitude_jacobian(z0, length(env.mechanism.bodies))
A1 = attitude_jacobian(z1, length(env.mechanism.bodies))
@@ -109,11 +109,11 @@ function Base.step(env::Environment, u;
end

function Base.step(env::Environment, u;
- gradients=false,
- attitude_decompress=false)
- step(env, env.state, u;
- gradients=gradients,
- attitude_decompress=attitude_decompress)
+ gradients = false,
+ attitude_decompress = false)
+ step(env, env.state, u;
+ gradients = gradients,
+ attitude_decompress = attitude_decompress)
end

"""
@@ -156,7 +156,7 @@ is_done(env::Environment, x) = false
x: state
"""
function Base.reset(env::Environment{X};
- x=nothing) where X
+ x = nothing) where {X}

initialize!(env.mechanism, type2symbol(X))
if x != nothing
@@ -172,15 +172,15 @@ function Base.reset(env::Environment{X};
return get_observation(env)
end

- function MeshCat.render(env::Environment,
- mode="human")
+ function MeshCat.render(env::Environment,
+ mode = "human")
z = env.representation == :minimal ? minimal_to_maximal(env.mechanism, env.state) : env.state
- set_robot(env.vis, env.mechanism, z, name=:robot)
+ set_robot(env.vis, env.mechanism, z, name = :robot)
return nothing
end

- function seed(env::Environment; s=0)
- env.rng[1] = MersenneTwister(seed)
+ function seed(env::Environment, s = 0)
+ env.rng[1] = MersenneTwister(s)
return nothing
end

@@ -196,7 +196,7 @@ end
abstract type Space{T,N} end

"""
- BoxSpace{T,N} <: Environment{T,N}
+ BoxSpace{T,N} <: Space{T,N}

domain with lower and upper limits

@@ -214,12 +214,12 @@ mutable struct BoxSpace{T,N} <: Space{T,N}
dtype::DataType # this is always T, it's needed to interface with Stable-Baselines
end

- function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where T
+ function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where {T}
return BoxSpace{T,n}(n, low, high, (n,), T)
end

function sample(s::BoxSpace{T,N}) where {T,N}
- return rand(T,N) .* (s.high .- s.low) .+ s.low
+ return rand(T, N) .* (s.high .- s.low) .+ s.low
end

function contains(s::BoxSpace{T,N}, v::AbstractVector{T}) where {T,N}
@@ -230,5 +230,4 @@ function clip(s::BoxSpace, u)
clamp.(u, s.low, s.high)
end

-
-
+ Random.rand(rng::Random.AbstractRNG, s::BoxSpace{T,N}) where {T,N} = return rand(rng, T, N) .* (s.high .- s.low) .+ s.low
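
For context on the API these edits touch, a minimal usage sketch (illustrative only, not part of the diff; the "ant" environment name, the sample/reset helpers, and the 4-tuple return of step are assumptions drawn from this PR):

using Dojo

env = Dojo.get_environment("ant")      # build a Dojo Environment
x = reset(env)                         # initialize; returns the observation
u = Dojo.sample(env.input_space)       # random control drawn from the BoxSpace
x, r, done, info = step(env, u)        # advance one step, as the RL wrapper below assumes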
46 changes: 46 additions & 0 deletions environments/rlenv.jl
@@ -0,0 +1,46 @@
using ReinforcementLearningBase: RLBase

mutable struct DojoRLEnv{T} <: RLBase.AbstractEnv
dojoenv::Environment
state::Vector{T}
reward::T
done::Bool
info::Dict
end

function DojoRLEnv(dojoenv::Environment{X,T}) where {X,T}
state = reset(dojoenv)
return DojoRLEnv{T}(dojoenv, state, convert(T, 0.0), false, Dict())
end

function DojoRLEnv(name::String; kwargs...)
DojoRLEnv(Dojo.get_environment(name; kwargs...))
end

function Base.convert(::Type{RLBase.Space}, s::BoxSpace)
RLBase.Space([BoxSpace(1; low = s.low[i:i], high = s.high[i:i]) for i in 1:s.n])
end

RLBase.action_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.input_space)
RLBase.state_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.observation_space)
RLBase.is_terminated(env::DojoRLEnv) = env.done

RLBase.reset!(env::DojoRLEnv) = reset(env.dojoenv)

RLBase.reward(env::DojoRLEnv) = env.reward
RLBase.state(env::DojoRLEnv) = env.state

Random.seed!(env::DojoRLEnv, seed) = Dojo.seed(env.dojoenv, seed)

# TODO:
# RLBase.ChanceStyle(env::DojoRLEnv) = RLBase.DETERMINISTIC

function (env::DojoRLEnv)(a)
s, r, d, i = step(env.dojoenv, a)
env.state .= s
env.reward = r
env.done = d
env.info = i
return nothing
end
(env::DojoRLEnv)(a::Number) = env([a])
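
A minimal rollout sketch against this wrapper (illustrative, not part of the diff; it assumes the "ant" environment and the BoxSpace rand method added in environments/environment.jl above):

using Dojo, Random
using ReinforcementLearningBase: RLBase

rng = MersenneTwister(0)
env = Dojo.DojoRLEnv("ant")                  # wraps Dojo.get_environment("ant")
RLBase.reset!(env)
for t in 1:100
    a = rand(rng, env.dojoenv.input_space)   # random action from the underlying BoxSpace
    env(a)                                   # advance one step; updates state, reward, done
    RLBase.is_terminated(env) && break
end
RLBase.reward(env)                           # reward of the most recent step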
6 changes: 6 additions & 0 deletions examples/deeprl/Project.toml
@@ -0,0 +1,6 @@
[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Dojo = "ac60b53e-8d92-4c83-b960-e78698fa1916"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
80 changes: 80 additions & 0 deletions examples/deeprl/ant_ddpg.jl
@@ -0,0 +1,80 @@
using ReinforcementLearning
using Flux
using Flux.Losses

using Random
using Dojo

function RL.Experiment(
::Val{:JuliaRL},
::Val{:DDPG},
::Val{:DojoAnt},
::Nothing,
save_dir = nothing,
seed = 42
)

rng = MersenneTwister(seed)
env = Dojo.DojoRLEnv("ant")
Random.seed!(env, seed)
A = action_space(env)
ns, na = length(state(env)), length(action_space(env))
@show na

init = glorot_uniform(rng)

create_actor() = Chain(
Dense(ns, 30, relu; init = init),
Dense(30, 30, relu; init = init),
Dense(30, na, tanh; init = init),
)
create_critic() = Chain(
Dense(ns + na, 30, relu; init = init),
Dense(30, 30, relu; init = init),
Dense(30, 1; init = init),
)

agent = Agent(
policy = DDPGPolicy(
behavior_actor = NeuralNetworkApproximator(
model = create_actor(),
optimizer = ADAM(),
),
behavior_critic = NeuralNetworkApproximator(
model = create_critic(),
optimizer = ADAM(),
),
target_actor = NeuralNetworkApproximator(
model = create_actor(),
optimizer = ADAM(),
),
target_critic = NeuralNetworkApproximator(
model = create_critic(),
optimizer = ADAM(),
),
γ = 0.99f0,
ρ = 0.995f0,
na = na,
batch_size = 64,
start_steps = 1000,
start_policy = RandomPolicy(A; rng = rng),
update_after = 1000,
update_freq = 1,
act_limit = 1.0,
act_noise = 0.1,
rng = rng,
),
trajectory = CircularArraySARTTrajectory(
capacity = 10000,
state = Vector{Float32} => (ns,),
action = Float32 => (na, ),
),
)

stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalRewardPerEpisode()
Experiment(agent, env, stop_condition, hook, "# Dojo Ant with DDPG")
end

ex = E`JuliaRL_DDPG_DojoAnt`
run(ex)
73 changes: 73 additions & 0 deletions examples/deeprl/ant_ppo.jl
@@ -0,0 +1,73 @@
using ReinforcementLearning
using Flux
using Flux.Losses

using Random
using Distributions
using Dojo

function RL.Experiment(
::Val{:JuliaRL},
::Val{:PPO},
::Val{:DojoAnt},
::Nothing,
save_dir = nothing,
seed = 42
)
rng = MersenneTwister(seed)
N_ENV = 6
UPDATE_FREQ = 32
env_vec = [Dojo.DojoRLEnv("ant") for i in 1:N_ENV]
for i in 1:N_ENV
Random.seed!(env_vec[i], hash(seed+i))
end
env = MultiThreadEnv(env_vec)

ns, na = length(state(env[1])), length(action_space(env[1]))
RLBase.reset!(env; is_force=true)

agent = Agent(
policy = PPOPolicy(
approximator = ActorCritic(
actor = GaussianNetwork(
pre = Chain(
Dense(ns, 64, relu; init = glorot_uniform(rng)),
Dense(64, 64, relu; init = glorot_uniform(rng)),
),
μ = Chain(Dense(64, na, tanh; init = glorot_uniform(rng)), vec),
logσ = Chain(Dense(64, na; init = glorot_uniform(rng)), vec),
),
critic = Chain(
Dense(ns, 256, relu; init = glorot_uniform(rng)),
Dense(256, na; init = glorot_uniform(rng)),
),
optimizer = ADAM(1e-3),
),
γ = 0.99f0,
λ = 0.95f0,
clip_range = 0.1f0,
max_grad_norm = 0.5f0,
n_epochs = 4,
n_microbatches = 4,
actor_loss_weight = 1.0f0,
critic_loss_weight = 0.5f0,
entropy_loss_weight = 0.001f0,
dist = Normal,
update_freq = UPDATE_FREQ,
),
trajectory = PPOTrajectory(;
capacity = UPDATE_FREQ,
state = Matrix{Float32} => (ns, N_ENV),
action = Matrix{Float32} => (na, N_ENV),
action_log_prob = Vector{Float32} => (N_ENV,),
reward = Vector{Float32} => (N_ENV,),
terminal = Vector{Bool} => (N_ENV,),
),
)
stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
hook = TotalBatchRewardPerEpisode(N_ENV)
Experiment(agent, env, stop_condition, hook, "# PPO with Dojo Ant")
end

ex = E`JuliaRL_PPO_DojoAnt`
run(ex)