diff --git a/README.md b/README.md
index 9eae9c4..da2ac71 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,41 @@
 # gym-soccer
 
 The [Soccer environment](https://github.com/LARG/HFO) is a multiagent
-domain featuring continuous state and action spaces. Currently,
-several tasks are supported:
+domain featuring continuous state and action spaces.
 
-## Soccer
+## Changes
+
+Several changes have been made to more closely reflect the setup used by [[Hausknecht & Stone 2016]](https://arxiv.org/abs/1511.04143):
+
+- The number of steps the agent may go without touching the ball before the episode ends has been reduced to 100.
+- The reward function has been updated to reflect the one used in their code (https://github.com/mhauskn/dqn-hfo). Specifically, the negative reward for the distance between the ball and the goal is only applied once the agent is in possession of the ball. A separate environment, `SoccerScoreGoal-v0`, has been created with this change; it is the same as `SoccerEmptyGoal-v0` except for the reward function.
+- The status of the environment (e.g. `GOAL`, `OUT_OF_TIME`) is returned in the `info` dictionary after each step (useful for counting the number of goals).
+
+## Tasks
+
+Several tasks are supported at the moment:
+
+### Soccer
 
 The soccer task initializes a single offensive agent on the field and
 rewards +1 for scoring a goal and 0 otherwise. In order to score a
 goal, the agent will need to know how to approach the ball and kick
 towards the goal. The sparse nature of the goal reward makes this task
 very difficult to accomplish.
 
-## SoccerEmptyGoal
+### SoccerEmptyGoal
 
 The SoccerEmptyGoal task features a more informative reward signal
 than the Soccer task. As before, the objective is to score a goal.
 However, SoccerEmtpyGoal rewards the agent for approaching the ball
 and moving the ball towards the goal. These frequent rewards make the
 task much more accessible.
 
-## SoccerAgainstKeeper
+### SoccerAgainstKeeper
 
 The objective of the SoccerAgainstKeeper task is to score against a
 goal keeper. The agent is rewarded for moving the ball towards the
 goal and for scoring a goal. The goal keeper uses a hand-coded policy
 developed by the Helios RoboCup team. The difficulty in this task is
 learning how to shoot around the goal keeper.
 
+
 # Installation
 
 ```bash
 cd gym-soccer
 pip install -e .
 ```
+
+or
+
+```bash
+pip install -e git+https://github.com/cycraig/gym-soccer#egg=gym_soccer
+```
\ No newline at end of file
diff --git a/gym_soccer/__init__.py b/gym_soccer/__init__.py
index a682811..5f2ccbf 100644
--- a/gym_soccer/__init__.py
+++ b/gym_soccer/__init__.py
@@ -19,9 +19,17 @@
     nondeterministic = True,
 )
 
+register(
+    id='SoccerScoreGoal-v0',
+    entry_point='gym_soccer.envs:SoccerScoreGoalEnv',
+    timestep_limit=1000,
+    reward_threshold=10.0,
+    nondeterministic = True,
+)
+
 register(
     id='SoccerAgainstKeeper-v0',
-    entry_point='gym.envs:SoccerAgainstKeeperEnv',
+    entry_point='gym_soccer.envs:SoccerAgainstKeeperEnv',
     timestep_limit=1000,
     reward_threshold=8.0,
     nondeterministic = True,
diff --git a/gym_soccer/envs/__init__.py b/gym_soccer/envs/__init__.py
index 9bd1a76..61907a4 100644
--- a/gym_soccer/envs/__init__.py
+++ b/gym_soccer/envs/__init__.py
@@ -1,3 +1,4 @@
 from gym_soccer.envs.soccer_env import SoccerEnv
 from gym_soccer.envs.soccer_empty_goal import SoccerEmptyGoalEnv
 from gym_soccer.envs.soccer_against_keeper import SoccerAgainstKeeperEnv
+from gym_soccer.envs.soccer_score_goal import SoccerScoreGoalEnv
\ No newline at end of file
diff --git a/gym_soccer/envs/soccer_against_keeper.py b/gym_soccer/envs/soccer_against_keeper.py
index 3912b01..6d80cf1 100644
--- a/gym_soccer/envs/soccer_against_keeper.py
+++ b/gym_soccer/envs/soccer_against_keeper.py
@@ -1,9 +1,10 @@
 import logging
 from gym_soccer.envs.soccer_empty_goal import SoccerEmptyGoalEnv
+from gym_soccer.envs.soccer_score_goal import SoccerScoreGoalEnv
 
 logger = logging.getLogger(__name__)
 
-class SoccerAgainstKeeperEnv(SoccerEmptyGoalEnv):
+class SoccerAgainstKeeperEnv(SoccerScoreGoalEnv):
     """ SoccerAgainstKeeper initializes the agent most of the way down the
     field with the ball and tasks it with scoring on a keeper.
 
diff --git a/gym_soccer/envs/soccer_env.py b/gym_soccer/envs/soccer_env.py
index 9c58c24..dc84a47 100644
--- a/gym_soccer/envs/soccer_env.py
+++ b/gym_soccer/envs/soccer_env.py
@@ -1,17 +1,35 @@
 import os, subprocess, time, signal
+import numpy as np
 import gym
 from gym import error, spaces
 from gym import utils
 from gym.utils import seeding
+import socket
+from contextlib import closing
+
 try:
     import hfo_py
 except ImportError as e:
-    raise error.DependencyNotInstalled("{}. (HINT: you can install HFO dependencies with 'pip install gym[soccer].)'".format(e))
+    raise error.DependencyNotInstalled("{}. (HINT: you can install HFO dependencies with 'pip install gym[soccer].')".format(e))
 
 import logging
 logger = logging.getLogger(__name__)
 
+def find_free_port():
+    """Find a random free port. Does not guarantee that the port will still be free after return.
+    Note: HFO takes three consecutive port numbers, this only checks one.
+
+    Source: https://github.com/crowdAI/marLo/blob/master/marlo/utils.py
+
+    :rtype: `int`
+    """
+
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(('', 0))
+        return s.getsockname()[1]
+
+
 class SoccerEnv(gym.Env, utils.EzPickle):
     metadata = {'render.modes': ['human']}
 
@@ -20,19 +38,29 @@ def __init__(self):
         self.server_process = None
         self.server_port = None
         self.hfo_path = hfo_py.get_hfo_path()
+        print(self.hfo_path)
         self._configure_environment()
         self.env = hfo_py.HFOEnvironment()
-        self.env.connectToServer(config_dir=hfo_py.get_config_path())
+        self.env.connectToServer(config_dir=hfo_py.get_config_path(), server_port=self.server_port)
+        print("Shape =", self.env.getStateSize())
         self.observation_space = spaces.Box(low=-1, high=1,
-                                            shape=(self.env.getStateSize()))
+                                            shape=(self.env.getStateSize(),), dtype=np.float32)
         # Action space omits the Tackle/Catch actions, which are useful on defense
+        low0 = np.array([0, -180], dtype=np.float32)
+        high0 = np.array([100, 180], dtype=np.float32)
+        low1 = np.array([-180], dtype=np.float32)
+        high1 = np.array([180], dtype=np.float32)
+        low2 = np.array([0, -180], dtype=np.float32)
+        high2 = np.array([100, 180], dtype=np.float32)
+        low3 = np.array([-180], dtype=np.float32)
+        high3 = np.array([180], dtype=np.float32)
         self.action_space = spaces.Tuple((spaces.Discrete(3),
-                                          spaces.Box(low=0, high=100, shape=1),
-                                          spaces.Box(low=-180, high=180, shape=1),
-                                          spaces.Box(low=-180, high=180, shape=1),
-                                          spaces.Box(low=0, high=100, shape=1),
-                                          spaces.Box(low=-180, high=180, shape=1)))
+                                          spaces.Box(low=low0, high=high0, dtype=np.float32),
+                                          spaces.Box(low=low1, high=high1, dtype=np.float32),
+                                          spaces.Box(low=low2, high=high2, dtype=np.float32)))
+        self.status = hfo_py.IN_GAME
+        self._seed = -1
 
     def __del__(self):
         self.env.act(hfo_py.QUIT)
@@ -50,9 +78,11 @@ def _configure_environment(self):
         self._start_hfo_server()
 
     def _start_hfo_server(self, frames_per_trial=500,
-                          untouched_time=100, offense_agents=1,
+                          #untouched_time=1000,
+                          untouched_time=100,
+                          offense_agents=1,
                           defense_agents=0, offense_npcs=0,
-                          defense_npcs=0, sync_mode=True, port=6000,
+                          defense_npcs=0, sync_mode=True, port=None,
                           offense_on_ball=0, fullstate=True, seed=-1,
                           ball_x_min=0.0, ball_x_max=0.2,
                           verbose=False, log_game=False,
@@ -75,13 +105,26 @@ def _start_hfo_server(self, frames_per_trial=500,
            log_game: Enable game logging. Logs can be used for replay + visualization.
            log_dir: Directory to place game logs (*.rcg).
""" + if port is None: + port = find_free_port() self.server_port = port - cmd = self.hfo_path + \ + '''cmd = self.hfo_path + \ " --headless --frames-per-trial %i --untouched-time %i --offense-agents %i"\ + " --defense-agents %i --offense-npcs %i --defense-npcs %i"\ + " --port %i --offense-on-ball %i --seed %i --ball-x-min %f"\ + " --ball-x-max %f --log-dir %s"\ + % (frames_per_trial, untouched_time, + offense_agents, + defense_agents, offense_npcs, defense_npcs, port, + offense_on_ball, seed, ball_x_min, ball_x_max, + log_dir)''' + cmd = self.hfo_path + \ + " --headless --frames-per-trial %i --offense-agents %i"\ " --defense-agents %i --offense-npcs %i --defense-npcs %i"\ " --port %i --offense-on-ball %i --seed %i --ball-x-min %f"\ " --ball-x-max %f --log-dir %s"\ - % (frames_per_trial, untouched_time, offense_agents, + % (frames_per_trial, + offense_agents, defense_agents, offense_npcs, defense_npcs, port, offense_on_ball, seed, ball_x_min, ball_x_max, log_dir) @@ -109,7 +152,7 @@ def _step(self, action): reward = self._get_reward() ob = self.env.getState() episode_over = self.status != hfo_py.IN_GAME - return ob, reward, episode_over, {} + return ob, reward, episode_over, {'status': STATUS_LOOKUP[self.status]} def _take_action(self, action): """ Converts the action space into an HFO action. """ @@ -139,6 +182,9 @@ def _reset(self): while self.status != hfo_py.IN_GAME: self.env.act(hfo_py.NOOP) self.status = self.env.step() + # prevent infinite output when server dies + if self.status == hfo_py.SERVER_DOWN: + raise ServerDownException("HFO server down!") return self.env.getState() def _render(self, mode='human', close=False): @@ -149,6 +195,21 @@ def _render(self, mode='human', close=False): else: if self.viewer is None: self._start_viewer() + + def close(self): + if self.server_process is not None: + try: + os.kill(self.server_process.pid, signal.SIGKILL) + except Exception: + pass + + +class ServerDownException(Exception): + """ + Custom error so agents can catch it and exit cleanly if the server dies unexpectedly. + """ + pass + ACTION_LOOKUP = { 0 : hfo_py.DASH, @@ -157,3 +218,12 @@ def _render(self, mode='human', close=False): 3 : hfo_py.TACKLE, # Used on defense to slide tackle the ball 4 : hfo_py.CATCH, # Used only by goalie to catch the ball } + +STATUS_LOOKUP = { + hfo_py.IN_GAME: 'IN_GAME', + hfo_py.SERVER_DOWN: 'SERVER_DOWN', + hfo_py.GOAL: 'GOAL', + hfo_py.OUT_OF_BOUNDS: 'OUT_OF_BOUNDS', + hfo_py.OUT_OF_TIME: 'OUT_OF_TIME', + hfo_py.CAPTURED_BY_DEFENSE: 'CAPTURED_BY_DEFENSE', +} diff --git a/gym_soccer/envs/soccer_score_goal.py b/gym_soccer/envs/soccer_score_goal.py new file mode 100644 index 0000000..0d3c841 --- /dev/null +++ b/gym_soccer/envs/soccer_score_goal.py @@ -0,0 +1,147 @@ +import logging +import math +import numpy as np +from gym import spaces +from gym_soccer.envs.soccer_env import SoccerEnv, ACTION_LOOKUP +from gym_soccer.envs.soccer_empty_goal import SoccerEmptyGoalEnv + +try: + import hfo_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you can install HFO dependencies with 'pip install gym[soccer].)'".format(e)) + +logger = logging.getLogger(__name__) + +class SoccerScoreGoalEnv(SoccerEmptyGoalEnv): + """ + SoccerScoreGoal is the same task as SoccerEmptyGoal, which tasks the + agent with approaching the ball, dribbling, and scoring a goal. Rewards + are given as the agent nears the ball, kicks the ball towards the goal, + and scores a goal. 
+
+    The difference is that the reward structure is altered to be consistent
+    with the Hausknecht & Stone paper: "Deep Reinforcement Learning in
+    Parameterized Action Space".
+
+    """
+    def __init__(self):
+        super(SoccerScoreGoalEnv, self).__init__()
+        # dash, turn, kick, tackle
+        low0 = np.array([0, -180], dtype=np.float32)  # meant to be 0, not -100! (according to original soccer env and dqn-hfo inverting gradients)
+        high0 = np.array([100, 180], dtype=np.float32)
+        low1 = np.array([-180], dtype=np.float32)
+        high1 = np.array([180], dtype=np.float32)
+        low2 = np.array([0, -180], dtype=np.float32)
+        high2 = np.array([100, 180], dtype=np.float32)
+        low3 = np.array([-180], dtype=np.float32)
+        high3 = np.array([180], dtype=np.float32)
+        self.action_space = spaces.Tuple((spaces.Discrete(3),
+                                          spaces.Box(low=low0, high=high0, dtype=np.float32),
+                                          spaces.Box(low=low1, high=high1, dtype=np.float32),
+                                          spaces.Box(low=low2, high=high2, dtype=np.float32)))#,
+                                          #spaces.Box(low=low3, high=high3)))
+
+        self.unum = self.env.getUnum()  # uniform number (identifier) of our lone agent
+        print("UNUM =", self.unum)
+
+    '''def _take_action(self, action):
+        """ Converts the action space into an HFO action. """
+        action_type = ACTION_LOOKUP[action[0]]
+        if action_type == hfo_py.DASH:
+            self.env.act(action_type, action[1], action[2])
+        elif action_type == hfo_py.TURN:
+            self.env.act(action_type, action[3])
+        elif action_type == hfo_py.KICK:
+            self.env.act(action_type, action[4], action[5])
+        elif action_type == hfo_py.TACKLE:
+            self.env.act(action_type, action[6])
+        else:
+            print('Unrecognized action %d' % action_type)
+            self.env.act(hfo_py.NOOP)'''
+
+    def _get_reward(self):
+        """
+        Agent is rewarded for minimizing the distance between itself and
+        the ball, minimizing the distance between the ball and the goal,
+        and scoring a goal.
+        """
+        current_state = self.env.getState()
+        #print("State =", current_state)
+        #print("len State =", len(current_state))
+        ball_proximity = current_state[53]
+        goal_proximity = current_state[15]
+        ball_dist = 1.0 - ball_proximity
+        goal_dist = 1.0 - goal_proximity
+        kickable = current_state[12]
+        ball_ang_sin_rad = current_state[51]
+        ball_ang_cos_rad = current_state[52]
+        ball_ang_rad = math.acos(ball_ang_cos_rad)
+        if ball_ang_sin_rad < 0:
+            ball_ang_rad *= -1.
+        goal_ang_sin_rad = current_state[13]
+        goal_ang_cos_rad = current_state[14]
+        goal_ang_rad = math.acos(goal_ang_cos_rad)
+        if goal_ang_sin_rad < 0:
+            goal_ang_rad *= -1.
+        alpha = max(ball_ang_rad, goal_ang_rad) - min(ball_ang_rad, goal_ang_rad)
+        ball_dist_goal = math.sqrt(ball_dist*ball_dist + goal_dist*goal_dist -
+                                   2.*ball_dist*goal_dist*math.cos(alpha))
+        # Compute the difference in ball proximity from the last step
+        if not self.first_step:
+            ball_prox_delta = ball_proximity - self.old_ball_prox
+            kickable_delta = kickable - self.old_kickable
+            ball_dist_goal_delta = ball_dist_goal - self.old_ball_dist_goal
+        self.old_ball_prox = ball_proximity
+        self.old_kickable = kickable
+        self.old_ball_dist_goal = ball_dist_goal
+        #print(self.env.playerOnBall())
+        #print(self.env.playerOnBall().unum)
+        #print(self.env.getUnum())
+        reward = 0
+        if not self.first_step:
+            '''# Reward the agent for moving towards the ball
+            reward += ball_prox_delta
+            if kickable_delta > 0 and not self.got_kickable_reward:
+                reward += 1.
+                self.got_kickable_reward = True
+            # Reward the agent for kicking towards the goal
+            reward += 0.6 * -ball_dist_goal_delta
+            # Reward the agent for scoring
+            if self.status == hfo_py.GOAL:
+                reward += 5.0'''
+            '''reward = self.__move_to_ball_reward(kickable_delta, ball_prox_delta) + \
+                3. * self.__kick_to_goal_reward(ball_dist_goal_delta) + \
+                self.__EOT_reward()'''
+            mtb = self.__move_to_ball_reward(kickable_delta, ball_prox_delta)
+            ktg = 3. * self.__kick_to_goal_reward(ball_dist_goal_delta)
+            eot = self.__EOT_reward()
+            reward = mtb + ktg + eot
+            #print("mtb: %.06f ktg: %.06f eot: %.06f" % (mtb, ktg, eot))
+
+        self.first_step = False
+        #print("r =", reward)
+        return reward
+
+    def __move_to_ball_reward(self, kickable_delta, ball_prox_delta):
+        reward = 0.
+        if self.env.playerOnBall().unum < 0 or self.env.playerOnBall().unum == self.unum:
+            reward += ball_prox_delta
+        if kickable_delta >= 1 and not self.got_kickable_reward:
+            reward += 1.
+            self.got_kickable_reward = True
+        return reward
+
+    def __kick_to_goal_reward(self, ball_dist_goal_delta):
+        if self.env.playerOnBall().unum == self.unum:
+            return -ball_dist_goal_delta
+        elif self.got_kickable_reward:
+            return 0.2 * -ball_dist_goal_delta
+        return 0.
+
+    def __EOT_reward(self):
+        if self.status == hfo_py.GOAL:
+            return 5.
+        #elif self.status == hfo_py.CAPTURED_BY_DEFENSE:
+        #    return -1.
+        return 0.
+
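For reference, the `ball_dist_goal` quantity computed in `_get_reward` above is a law-of-cosines estimate of the distance between the ball and the goal, built from the agent-centric proximity and angle features of the HFO state vector (with `ball_dist = 1 - ball_proximity` and `goal_dist = 1 - goal_proximity`):

```latex
d_{\text{ball,goal}} = \sqrt{d_{\text{ball}}^{2} + d_{\text{goal}}^{2}
                             - 2\, d_{\text{ball}}\, d_{\text{goal}} \cos\alpha},
\qquad
\alpha = \lvert \theta_{\text{ball}} - \theta_{\text{goal}} \rvert
```

The per-step reward is then `mtb + 3 * ktg + eot`: the move-to-ball term, three times the decrease in this ball-to-goal distance (scaled down to a factor of 0.2 once the agent has earned the kickable bonus but is no longer on the ball), and a +5 bonus when the episode ends in a goal.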
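Below is a minimal usage sketch of the new environment and the `info['status']` field added to `_step`. It assumes `hfo_py` and the HFO server are installed and the pre-0.26 Gym API that this code targets; the flat action layout follows the unchanged `_take_action` in `soccer_env.py` (action type, then dash power, dash degrees, turn degrees, kick power, kick degrees), and the constant dash action is purely illustrative.

```python
import gym
import gym_soccer  # noqa: F401 -- importing registers SoccerScoreGoal-v0 and the other tasks

env = gym.make('SoccerScoreGoal-v0')   # launches an HFO server on a free port

goals = 0
obs = env.reset()
for _ in range(500):
    # Flat action: [action_type, dash_power, dash_degrees, turn_degrees, kick_power, kick_degrees]
    action = [0, 20.0, 0.0, 0.0, 0.0, 0.0]   # DASH forward at low power
    obs, reward, done, info = env.step(action)
    if info['status'] == 'GOAL':             # status string exposed by this patch
        goals += 1
    if done:
        obs = env.reset()
print("Goals scored:", goals)
env.close()                                  # kills the background HFO server process
```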