from __future__ import generator_stop

import numpy as np
import tensorflow as tf

from exp_replay import ExperienceReplay
from processimage import processimage


class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=3,
                 gamma=0.95,
                 frame_skip=3,
                 train_freq=3,
                 initial_epsilon=1,
                 min_epsilon=0.05,
                 render=False,
                 epsilon_decay_steps=int(100000),
                 min_experience_size=int(1000),
                 experience_capacity=int(100000),
                 target_network_update_freq=1000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None
                 ):
        self.exp_history = ExperienceReplay(
            num_frame_stack,
            capacity=experience_capacity,
            pic_size=pic_size
        )
        # In playing mode we don't store the experience in the agent history,
        # but this cache is still needed to assemble the current frame stack.
        self.playing_cache = ExperienceReplay(
            num_frame_stack,
            capacity=num_frame_stack * 5 + 10,
            pic_size=pic_size
        )

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.target_network_update_freq = target_network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization

        # Default Adam settings: an exponentially decayed learning rate and epsilon=1e-7.
        self.global_step = tf.Variable(0, trainable=False)
        self.increment_global_step_op = tf.compat.v1.assign(self.global_step, self.global_step + 1)
        self.decayed_lr = tf.compat.v1.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
        lr = self.decayed_lr
        # lr = 0.001  # fixed learning rate alternative
        self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    def build_graph(self):
        # Network inputs are stacked frames in NHWC layout:
        # (batch, height, width, num_frame_stack), e.g. (None, 96, 96, 3) with the defaults.
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)

        self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.compat.v1.placeholder(tf.float32, (self.batchsize,), "reward")
        self.input_actions = tf.compat.v1.placeholder(tf.int32, (self.batchsize,), "actions")
        self.input_done_mask = tf.compat.v1.placeholder(tf.int32, (self.batchsize,), "done_mask")

        # The target Q-values come from the fixed network, which is only
        # refreshed every target_network_update_freq steps.
        with tf.compat.v1.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        # The estimate network is the one being trained (every train_freq frames).
        with tf.compat.v1.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)
        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")

        # Select the Q-value of the chosen action from each row;
        # in numpy this is qsa_estimates[range(batchsize), input_actions].
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        # From the DQN paper: loss = (r + gamma * max_a Q_target(s', a) - Q_estimate(s, a))^2,
        # with the bootstrap term masked out on terminal transitions.
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        # Regularization loss is currently disabled; to enable it, use
        # tf.add_n(tf.compat.v1.losses.get_regularization_losses()) instead.
        reg_loss = tf.constant(0.0)

        # Adam optimizer (RMSProp is a drop-in alternative).
        optimizer = tf.compat.v1.train.AdamOptimizer(**self.optimizer_params)
        # optimizer = tf.compat.v1.train.RMSPropOptimizer(**self.optimizer_params)
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")
        assert len(train_params) == len(fixed_params)
        self.copy_network_ops = [tf.compat.v1.assign(fixed_v, train_v)
                                 for train_v, fixed_v in zip(train_params, fixed_params)]
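        # Shape sketch for the loss above (a restatement for clarity, with
        # batchsize B and dim_actions A as configured in __init__):
        #   qsa_targets : (B, A)  --reduce_max(axis=-1)-->  (B,)
        #   q_target    = r + gamma * max_a Q_fixed(s', a) * (1 - done)        -> (B,)
        #   action_slice: (B, 2) rows of (batch_index, action) --gather_nd-->  (B,)
        #   training_loss = sum((q_target - q_estimate)^2) / (2 * B)   (tf.nn.l2_loss halves)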

    def get_variables(self, scope):
        # Collect the graph variables under `scope`, skipping the Adam slot variables.
        vars = [t for t in tf.compat.v1.global_variables()
                if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        # Only the trainable ("train") network gets weight regularization.
        if trainable:
            wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
        else:
            wr = None

        net = tf.compat.v1.layers.conv2d(inputs=input, filters=8, kernel_size=(7, 7), strides=4,
                                         name='conv1', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.compat.v1.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1,
                                         name='conv2', kernel_regularizer=wr)
        net = tf.nn.relu(net)
        net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
        net = tf.compat.v1.layers.flatten(net)
        net = tf.compat.v1.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
        # net = tf.compat.v1.layers.dropout(net, 0.5)
        q_state_action_values = tf.compat.v1.layers.dense(net, self.dim_actions, activation=None,
                                                          kernel_regularizer=wr)
        return q_state_action_values
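
    # Layer-by-layer shape trace for the network above (illustrative only,
    # assuming the default 96x96 input with 3 stacked frames and the 'valid'
    # padding that tf.compat.v1.layers.conv2d uses by default):
    #   input       : (B, 96, 96, 3)
    #   conv1 7x7/4 : (B, 23, 23, 8)   maxpool 2/2 SAME : (B, 12, 12, 8)
    #   conv2 3x3/1 : (B, 10, 10, 16)  maxpool 2/2 SAME : (B, 5, 5, 16)
    #   flatten     : (B, 400) -> dense 400 -> dense dim_actions (one Q-value per action)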

    def check_early_stop(self, reward, total_reward, frames_in_episode):
        # Stub hook for early episode termination; play_episode() calls it with
        # three arguments, so it accepts them all and never stops early by default.
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # Linear decay from initial_epsilon down to min_epsilon.
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r
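
    # Worked example of the schedule above with the constructor defaults
    # (initial_epsilon=1, min_epsilon=0.05, epsilon_decay_steps=100000):
    #   step      0          -> 1.0
    #   step  50000          -> 0.05 + 0.95 * 0.5 = 0.525
    #   step 100000 and more -> 0.05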

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        # Map each placeholder to the key of the corresponding array in the
        # sampled batch dict, then build the actual feed dict from that mapping.
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self, render, load_checkpoint):
        eh = (
            self.exp_history if self.do_training
            else self.playing_cache
        )
        total_reward = 0
        total_score = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = processimage.process_image(first_frame)
        eh.start_new_episode(first_frame_pp)
        epsilon = self.get_epsilon()

        while True:
            # When playing from a checkpoint we always act greedily;
            # otherwise we follow an epsilon-greedy policy.
            if load_checkpoint or np.random.rand() > epsilon:
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            score = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if render:
                    self.env.render()
                score += r
                # Scale up positive rewards later in the episode; around frame 230
                # this adds roughly +50 reward on top of the tile rewards.
                if r > 0:
                    r = r + frames_in_episode * 0.2
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
            if early_done:
                reward += punishment
            done = done or early_done

            total_reward += reward
            total_score += score
            frames_in_episode += 1

            observation = processimage.process_image(observation)
            eh.add_experience(observation, action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                self.session.run(self.increment_global_step_op)
                # Refresh the fixed target network every target_network_update_freq steps.
                if self.global_counter % self.target_network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1
                return total_score, total_reward, frames_in_episode, epsilon

    def update_target_network(self):
        # Copy the trained ("train") weights into the fixed target network.
        self.session.run(self.copy_network_ops)
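

# --- Minimal usage sketch (illustrative, not part of the original class) ---
# Assumptions: a Gym environment with 96x96 RGB frames such as CarRacing-v0, a
# hypothetical discrete action_map of [steer, gas, brake] triples, and TF 1.x
# graph mode (on TF 2.x, eager execution must be disabled before building the graph).
if __name__ == "__main__":
    import gym

    tf.compat.v1.disable_eager_execution()

    env = gym.make("CarRacing-v0")
    action_map = [
        [-1.0, 0.0, 0.0],  # steer left
        [1.0, 0.0, 0.0],   # steer right
        [0.0, 1.0, 0.0],   # accelerate
        [0.0, 0.0, 0.8],   # brake
    ]

    agent = DQN(env, action_map=action_map)
    agent.build_graph()

    # The class expects the caller to provide an initialized session.
    agent.session = tf.compat.v1.Session()
    agent.session.run(tf.compat.v1.global_variables_initializer())

    for episode in range(10):
        score, reward, frames, epsilon = agent.play_episode(render=False, load_checkpoint=False)
        print(episode, score, reward, frames, epsilon)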