Failing to make the arm grasp the ball (PPO, SB3)

I have been learning reinforcement learning through projects and am at a point where I am stuck: the arm reaches the ball but struggles to grasp it. Any advice on how to improve it would be helpful! My environment code is below, and after it I have put two untested sketches of changes I am considering.

import pybullet as p
import pybullet_data
import numpy as np
import gymnasium as gym
import os
import time
from gymnasium import spaces
from armKinematics import ArmKinematics
# Training-side imports (PPO, vec envs); the training code is not shown in this post.
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize, DummyVecEnv

JOINT_LIMIT = 2.35619  # ~3*pi/4 rad (135 deg) per-joint position limit
MIN_DIST = 0.25        # ball spawns between MIN_DIST and MAX_DIST from the base (m)
MAX_DIST = 0.45


class ReachEnv(gym.Env):
    # Joint / link indices in the arm URDF.
    robot_cfg = {
        "arm_joints": [1, 2, 3],
        "claw_joints": [6, 7],
        "ee_link_idx": 8,
    }
    # Reward term weights.
    reward_cfg = {
        "pos_error_weight": -0.2,
        "pos_fine_weight": 0.2,
        "pos_fine_std": 0.1,
        "pos_fine_shift": 2.0,
        "action_penalty": -1e-4,
        "vel_penalty": -1e-4,
        "floor_penalty": -0.5,
        "min_height": 0.04,
        "step_penalty": -0.1,
        "success_bonus": 200.0,
    }
    # The action-rate and joint-velocity penalties ramp linearly from the
    # initial values above to these final values over ramp_steps env steps.
    curriculum_cfg = {
        "action_final_weight": -5e-3,
        "joint_final_weight": -1e-3,
        "ramp_steps": 36_000,
    }
    # Ball spawn region.
    command_cfg = {
        "x_range": (0.15, 0.45),
        "y_range": (-0.30, 0.30),
        "spawn_z": 0.02,
    }
    obs_noise = 0.01

    def __init__(self, render_mode=None):
        super().__init__()
        self.physicsClient = p.connect(p.GUI if render_mode == "human" else p.DIRECT)
        p.setAdditionalSearchPath(pybullet_data.getDataPath())
        self.base_path = os.path.dirname(os.path.abspath(__file__))
        self.kinematics = ArmKinematics()
        self._total_steps = 0
        # 3 joint position deltas + 1 gripper command.
        self.action_space = spaces.Box(-1, 1, shape=(4,), dtype=np.float32)
        # 3 joint angles + 3 joint velocities + 3 ee-to-ball vector + 4 previous action.
        self.observation_space = spaces.Box(-10, 10, shape=(13,), dtype=np.float32)

    def _sample_target(self):
        # Rejection-sample a spawn point within reach of the arm.
        while True:
            sx = np.random.uniform(*self.command_cfg["x_range"])
            sy = np.random.uniform(*self.command_cfg["y_range"])
            if MIN_DIST < np.linalg.norm([sx, sy]) < MAX_DIST:
                return np.array([sx, sy, self.command_cfg["spawn_z"]], dtype=np.float32)

    def _get_obs(self):
        js = p.getJointStates(self.robot_id, self.robot_cfg["arm_joints"])
        self.joints = np.array([s[0] for s in js], dtype=np.float32)
        self.joint_vels = np.array([s[1] for s in js], dtype=np.float32)
        ee_pos = np.array(
            p.getLinkState(self.robot_id, self.robot_cfg["ee_link_idx"])[0],
            dtype=np.float32,
        )
        ball_pos = p.getBasePositionAndOrientation(self.ball_id)[0]
        noisy_joints = self.joints + np.random.uniform(-self.obs_noise, self.obs_noise, 3).astype(np.float32)
        noisy_vels = self.joint_vels + np.random.uniform(-self.obs_noise, self.obs_noise, 3).astype(np.float32)
        # Cast to float32 here so the observation matches the float32 Box dtype.
        pose_cmd = np.array(ball_pos, dtype=np.float32) - ee_pos
        return np.concatenate([noisy_joints, noisy_vels, pose_cmd, self.prev_action])

    def _curriculum_weight(self, term: str):
        # Linearly interpolate the penalty weight over the first ramp_steps steps.
        t = min(self._total_steps / self.curriculum_cfg["ramp_steps"], 1.0)
        if term == "action":
            return self.reward_cfg["action_penalty"] + t * (
                self.curriculum_cfg["action_final_weight"] - self.reward_cfg["action_penalty"]
            )
        return self.reward_cfg["vel_penalty"] + t * (
            self.curriculum_cfg["joint_final_weight"] - self.reward_cfg["vel_penalty"]
        )

    def _rew_pos_error(self, dist):
        return dist * self.reward_cfg["pos_error_weight"]

    def _rew_pos_fine(self, dist):
        return (
            np.tanh(self.reward_cfg["pos_fine_shift"] - dist / self.reward_cfg["pos_fine_std"])
            * self.reward_cfg["pos_fine_weight"]
        )

    def _rew_action_rate(self, action):
        return np.sum(np.square(action - self.prev_action)) * self._curriculum_weight("action")

    def _rew_joint_vel(self):
        return np.sum(np.square(self.joint_vels)) * self._curriculum_weight("joint")

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        p.resetSimulation()
        p.setGravity(0, 0, -9.81)
        self.plane_id = p.loadURDF("plane.urdf")
        self.robot_id = p.loadURDF(
            os.path.join(self.base_path, "models/3dof_arm/arm.urdf"),
            [0, 0, 0],
            useFixedBase=1,
        )
        # Randomize the initial arm pose slightly (0.05 to 0.15 rad per joint).
        scale = np.random.uniform(0.5, 1.5, size=3).astype(np.float32)
        for idx, j_idx in enumerate(self.robot_cfg["arm_joints"]):
            p.resetJointState(self.robot_id, j_idx, scale[idx] * 0.1)
        for j_idx in self.robot_cfg["claw_joints"]:
            p.resetJointState(self.robot_id, j_idx, 0.0)
        self.target_pos = self._sample_target()
        self.ball_id = p.loadURDF(
            os.path.join(self.base_path, "models/objects/ball.urdf"),
            self.target_pos.tolist(),
        )
        p.changeDynamics(self.ball_id, -1, lateralFriction=2.0, mass=0.05)
        p.changeDynamics(self.robot_id, self.robot_cfg["claw_joints"][0], lateralFriction=2.0)
        p.changeDynamics(self.robot_id, self.robot_cfg["claw_joints"][1], lateralFriction=2.0)
        self.current_joints = np.zeros(3, dtype=np.float32)
        self.prev_action = np.zeros(4, dtype=np.float32)
        self.step_count = 0
        return self._get_obs(), {}

    def step(self, action):
        ee_state = p.getLinkState(self.robot_id, self.robot_cfg["ee_link_idx"])
        ee_pos = np.array(ee_state[0])
        ball_pos = p.getBasePositionAndOrientation(self.ball_id)[0]
        ball_z = ball_pos[2]
        dist_to_ball = float(np.linalg.norm(ee_pos - np.array(ball_pos)))

        # Integrate the action as a joint-position delta.
        self.current_joints = np.clip(
            self.current_joints + action[:3] * 0.05, -JOINT_LIMIT, JOINT_LIMIT
        )
        p.setJointMotorControlArray(
            self.robot_id,
            self.robot_cfg["arm_joints"],
            p.POSITION_CONTROL,
            targetPositions=self.current_joints.tolist(),
            targetVelocities=[0.0, 0.0, 0.0],
            positionGains=[0.5, 0.5, 0.5],
            velocityGains=[1.0, 1.0, 1.0],
            forces=[30.0, 30.0, 30.0],
        )

        # The claw may only close while the ee origin is within 1.5 cm of the ball centre.
        if dist_to_ball <= 0.015:
            grip_cmd = ((action[3] + 1.0) / 2.0) * 0.030
        else:
            grip_cmd = 0.0
        p.setJointMotorControlArray(
            self.robot_id,
            self.robot_cfg["claw_joints"],
            p.POSITION_CONTROL,
            targetPositions=[grip_cmd, grip_cmd],
            forces=[100.0, 100.0],
        )

        # 25 physics substeps per control step.
        for _ in range(25):
            p.stepSimulation()

        obs = self._get_obs()
        dist = float(np.linalg.norm(obs[6:9]))  # ee-to-ball distance after stepping
        ee_state = p.getLinkState(self.robot_id, self.robot_cfg["ee_link_idx"])
        ee_pos = np.array(ee_state[0])
        ball_pos = p.getBasePositionAndOrientation(self.ball_id)[0]
        ball_z = ball_pos[2]
        dist_xy = np.linalg.norm(ee_pos[:2] - np.array(ball_pos[:2]))

        # Penalize scraping the floor away from the ball.
        floor_rew = 0.0
        if ee_pos[2] < self.reward_cfg["min_height"] and dist_xy > 0.06:
            floor_rew = self.reward_cfg["floor_penalty"]

        # Encourage hovering ~4 cm above the ball once roughly overhead.
        approach_rew = 0.0
        if dist_xy < 0.1:
            z_target = ball_z + 0.04
            z_error = abs(ee_pos[2] - z_target)
            approach_rew = -z_error * 0.1

        # Lift shaping, success condition, and terminal bonus.
        lift_height = ball_z - self.command_cfg["spawn_z"]
        lift_rew = 0.0
        if lift_height > 0.01:
            lift_rew = lift_height * 200.0
        terminated = lift_height > 0.05
        success_payout = 0.0
        if terminated:
            success_payout = self.reward_cfg["success_bonus"] * 2.0

        reward = (
            self._rew_pos_error(dist)
            + self._rew_pos_fine(dist)
            + self._rew_action_rate(action)
            + self._rew_joint_vel()
            + floor_rew
            + approach_rew
            + lift_rew
            + success_payout
            + self.reward_cfg["step_penalty"]
        )

        self.step_count += 1
        self._total_steps += 1
        self.prev_action = np.copy(action)
        truncated = self.step_count >= 720
        return obs, float(reward), terminated, truncated, {"dist": dist, "lift": lift_height}

    def close(self):
        p.disconnect()
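
Here is the first change I am considering but have not tested. The claw only ever receives a close command inside the 1.5 cm gate in step(), and the reward jumps straight from "be near the ball" to "lift the ball" with nothing in between rewarding actual contact. My idea is a shaping term built on p.getContactPoints that pays out per claw link touching the ball. This is a rough sketch of a method I would add to ReachEnv, and the per-finger weight of 1.0 is a guess:

    def _rew_grasp_contact(self):
        # In PyBullet, link indices equal joint indices, so the claw_joints
        # entries double as the claw link indices here.
        touching = 0
        for link in self.robot_cfg["claw_joints"]:
            # Any contact points between this claw link and the ball this step?
            if p.getContactPoints(bodyA=self.robot_id, bodyB=self.ball_id,
                                  linkIndexA=link):
                touching += 1
        return 1.0 * touching  # 0.0 / 1.0 / 2.0 per step while touching

I would then add self._rew_grasp_contact() to the reward sum in step(), and probably drop the distance gate entirely so the policy learns for itself when to close the claw:

    grip_cmd = ((action[3] + 1.0) / 2.0) * 0.030  # always under policy control

Does the gating look like part of the problem, or is it harmless?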

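Second, I noticed while writing this up that the observation carries no gripper information at all (no claw joint angles, no contact flag), so the policy cannot sense whether it is actually holding the ball. A possible extension of the return in _get_obs, again just an untested sketch, where observation_space would have to grow from shape (13,) to (16,):

    # Append claw joint angles and a touch flag to the observation.
    claw_js = p.getJointStates(self.robot_id, self.robot_cfg["claw_joints"])
    claw_pos = np.array([s[0] for s in claw_js], dtype=np.float32)
    touching = np.float32(
        len(p.getContactPoints(bodyA=self.robot_id, bodyB=self.ball_id)) > 0
    )
    return np.concatenate(
        [noisy_joints, noisy_vels, pose_cmd, self.prev_action, claw_pos, [touching]]
    )

Is feeding a contact flag into the observation reasonable, or does it count as cheating compared to learning purely from positions?
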
submitted by /u/Not_Neon_Op