Python code error in IDLE?
Hello. While studying coding,
I used the code from http://solarisailab.com/archives/486.
The code is:
"""
TensorFlow translation of the torch example found here (written by SeanNaren).
https://github.com/SeanNaren/TorchQLearningExample
Original keras example found here (written by Eder Santana).
https://gist.github.com/EderSantana/c7222daa328f0e885093#file-qlearn-py-L164
The agent plays a game of catch. Fruits drop from the sky and the agent can choose the actions
left/stay/right to catch the fruit before it reaches the ground.
"""
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import random
import math
import os
# Parameters
epsilon = 1 # The probability of choosing a random action (in training). This decays as iterations increase. (0 to 1)
epsilonMinimumValue = 0.001 # The minimum value we want epsilon to reach in training. (0 to 1)
nbActions = 3 # The number of actions. Since we only have left/stay/right that means 3 actions.
epoch = 1001 # The number of games we want the system to run for.
hiddenSize = 100 # Number of neurons in the hidden layers.
maxMemory = 500 # How large should the memory be (where it stores its past experiences).
batchSize = 50 # The mini-batch size for training. Samples are randomly taken from memory till mini-batch size.
gridSize = 10 # The size of the grid that the agent is going to play the game on.
nbStates = gridSize * gridSize # We eventually flatten to a 1d tensor to feed the network.
discount = 0.9 # The discount is used to force the network to choose states that lead to the reward quicker (0 to 1)
learningRate = 0.2 # Learning Rate for Stochastic Gradient Descent (our optimizer).
# Create the base model.
X = tf.placeholder(tf.float32, [None, nbStates])
W1 = tf.Variable(tf.truncated_normal([nbStates, hiddenSize], stddev=1.0 / math.sqrt(float(nbStates))))
b1 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)
W2 = tf.Variable(tf.truncated_normal([hiddenSize, hiddenSize],stddev=1.0 / math.sqrt(float(hiddenSize))))
b2 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)
W3 = tf.Variable(tf.truncated_normal([hiddenSize, nbActions],stddev=1.0 / math.sqrt(float(hiddenSize))))
b3 = tf.Variable(tf.truncated_normal([nbActions], stddev=0.01))
output_layer = tf.matmul(hidden_layer, W3) + b3
# True labels
Y = tf.placeholder(tf.float32, [None, nbActions])
# Mean squared error cost function
cost = tf.reduce_sum(tf.square(Y - output_layer)) / (2 * batchSize)
# Stochastic Gradient Descent Optimizer
optimizer = tf.train.GradientDescentOptimizer(learningRate).minimize(cost)
# Helper function: Chooses a random value between the two boundaries.
def randf(s, e):
    return (float(random.randrange(0, (e - s) * 9999)) / 10000) + s
# The environment: Handles interactions and contains the state of the environment
class CatchEnvironment():
    def __init__(self, gridSize):
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        self.state = np.empty(3, dtype = np.uint8)

    # Returns the state of the environment.
    def observe(self):
        canvas = self.drawState()
        canvas = np.reshape(canvas, (-1, self.nbStates))
        return canvas

    def drawState(self):
        canvas = np.zeros((self.gridSize, self.gridSize))
        canvas[self.state[0]-1, self.state[1]-1] = 1  # Draw the fruit.
        # Draw the basket. The basket takes the adjacent two places to the position of basket.
        canvas[self.gridSize-1, self.state[2] - 1 - 1] = 1
        canvas[self.gridSize-1, self.state[2] - 1] = 1
        canvas[self.gridSize-1, self.state[2] - 1 + 1] = 1
        return canvas

    # Resets the environment. Randomly initialise the fruit position (always at the top to begin with) and bucket.
    def reset(self):
        initialFruitColumn = random.randrange(1, self.gridSize + 1)
        initialBucketPosition = random.randrange(2, self.gridSize + 1 - 1)
        self.state = np.array([1, initialFruitColumn, initialBucketPosition])
        return self.getState()

    def getState(self):
        stateInfo = self.state
        fruit_row = stateInfo[0]
        fruit_col = stateInfo[1]
        basket = stateInfo[2]
        return fruit_row, fruit_col, basket

    # Returns the reward that the agent has gained for being in the current environment state.
    def getReward(self):
        fruitRow, fruitColumn, basket = self.getState()
        if (fruitRow == self.gridSize - 1):  # If the fruit has reached the bottom.
            if (abs(fruitColumn - basket) <= 1):  # Check if the basket caught the fruit.
                return 1
            else:
                return -1
        else:
            return 0

    def isGameOver(self):
        if (self.state[0] == self.gridSize - 1):
            return True
        else:
            return False

    def updateState(self, action):
        if (action == 1):
            action = -1
        elif (action == 2):
            action = 0
        else:
            action = 1
        fruitRow, fruitColumn, basket = self.getState()
        newBasket = min(max(2, basket + action), self.gridSize - 1)  # The min/max prevents the basket from moving out of the grid.
        fruitRow = fruitRow + 1  # The fruit is falling by 1 every action.
        self.state = np.array([fruitRow, fruitColumn, newBasket])

    # Action can be 1 (move left), 2 (stay) or 3 (move right).
    def act(self, action):
        self.updateState(action)
        reward = self.getReward()
        gameOver = self.isGameOver()
        return self.observe(), reward, gameOver, self.getState()  # For purpose of the visual, I also return the state.
# The memory: Handles the internal memory that we add experiences that occur based on agent's actions,
# and creates batches of experiences based on the mini-batch size for training.
class ReplayMemory:
    def __init__(self, gridSize, maxMemory, discount):
        self.maxMemory = maxMemory
        self.gridSize = gridSize
        self.nbStates = self.gridSize * self.gridSize
        self.discount = discount
        canvas = np.zeros((self.gridSize, self.gridSize))
        canvas = np.reshape(canvas, (-1, self.nbStates))
        self.inputState = np.empty((self.maxMemory, 100), dtype = np.float32)
        self.actions = np.zeros(self.maxMemory, dtype = np.uint8)
        self.nextState = np.empty((self.maxMemory, 100), dtype = np.float32)
        self.gameOver = np.empty(self.maxMemory, dtype = np.bool)
        self.rewards = np.empty(self.maxMemory, dtype = np.int8)
        self.count = 0
        self.current = 0

    # Appends the experience to the memory.
    def remember(self, currentState, action, reward, nextState, gameOver):
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.inputState[self.current, ...] = currentState
        self.nextState[self.current, ...] = nextState
        self.gameOver[self.current] = gameOver
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.maxMemory

    def getBatch(self, model, batchSize, nbActions, nbStates, sess, X):
        # We check to see if we have enough memory inputs to make an entire batch, if not we create the biggest
        # batch we can (at the beginning of training we will not have enough experience to fill a batch).
        memoryLength = self.count
        chosenBatchSize = min(batchSize, memoryLength)
        inputs = np.zeros((chosenBatchSize, nbStates))
        targets = np.zeros((chosenBatchSize, nbActions))
        # Fill the inputs and targets up.
        for i in xrange(chosenBatchSize):
            if memoryLength == 1:
                memoryLength = 2
            # Choose a random memory experience to add to the batch.
            randomIndex = random.randrange(1, memoryLength)
            current_inputState = np.reshape(self.inputState[randomIndex], (1, 100))
            target = sess.run(model, feed_dict={X: current_inputState})
            current_nextState = np.reshape(self.nextState[randomIndex], (1, 100))
            current_outputs = sess.run(model, feed_dict={X: current_nextState})
            # Gives us Q_sa, the max q for the next state.
            nextStateMaxQ = np.amax(current_outputs)
            if (self.gameOver[randomIndex] == True):
                target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex]
            else:
                # reward + discount(gamma) * max_a' Q(s',a')
                # We are setting the Q-value for the action to r + gamma*max a' Q(s', a'). The rest stay the same
                # to give an error of 0 for those outputs.
                target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex] + self.discount * nextStateMaxQ
            # Update the inputs and targets.
            inputs[i] = current_inputState
            targets[i] = target
        return inputs, targets
def main(_):
    print("Training new model")
    # Define Environment
    env = CatchEnvironment(gridSize)
    # Define Replay Memory
    memory = ReplayMemory(gridSize, maxMemory, discount)
    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()
    winCount = 0
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        for i in xrange(epoch):
            # Initialize the environment.
            err = 0
            env.reset()
            isGameOver = False
            # The initial state of the environment.
            currentState = env.observe()
            while (isGameOver != True):
                action = -9999  # action initialization
                # Decides if we should choose a random action, or an action from the policy network.
                global epsilon
                if (randf(0, 1) <= epsilon):
                    action = random.randrange(1, nbActions+1)
                else:
                    # Forward the current state through the network.
                    q = sess.run(output_layer, feed_dict={X: currentState})
                    # Find the max index (the chosen action).
                    index = q.argmax()
                    action = index + 1
                # Decay the epsilon by multiplying by 0.999, not allowing it to go below a certain threshold.
                if (epsilon > epsilonMinimumValue):
                    epsilon = epsilon * 0.999
                nextState, reward, gameOver, stateInfo = env.act(action)
                if (reward == 1):
                    winCount = winCount + 1
                memory.remember(currentState, action, reward, nextState, gameOver)
                # Update the current state and if the game is over.
                currentState = nextState
                isGameOver = gameOver
                # We get a batch of training data to train the model.
                inputs, targets = memory.getBatch(output_layer, batchSize, nbActions, nbStates, sess, X)
                # Train the network, which returns the error.
                _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: targets})
                err = err + loss
            print("Epoch " + str(i) + ": err = " + str(err) + ": Win count = " + str(winCount) + " Win ratio = " + str(float(winCount)/float(i+1)*100))
        # Save the variables to disk.
        save_path = saver.save(sess, os.getcwd()+"/model.ckpt")
        print("Model saved in file: %s" % save_path)

if __name__ == '__main__':
    tf.app.run()
"""
TensorFlow translation of the torch example found here (written by SeanNaren).
https://github.com/SeanNaren/TorchQLearningExample
Original keras example found here (written by Eder Santana).
https://gist.github.com/EderSantana/c7222daa328f0e885093#file-qlearn-py-L164
The agent plays a game of catch. Fruits drop from the sky and the agent can choose the actions
left/stay/right to catch the fruit before it reaches the ground.
"""
import tensorflow.compat.v1 as tf
tf.disablev2behavior()
import numpy as np
import random
import math
import os
# Parameters
epsilon = 1 # The probability of choosing a random action (in training). This decays as iterations increase. (0 to 1)
epsilonMinimumValue = 0.001 # The minimum value we want epsilon to reach in training. (0 to 1)
nbActions = 3 # The number of actions. Since we only have left/stay/right that means 3 actions.
epoch = 1001 # The number of games we want the system to run for.
hiddenSize = 100 # Number of neurons in the hidden layers.
maxMemory = 500 # How large should the memory be (where it stores its past experiences).
batchSize = 50 # The mini-batch size for training. Samples are randomly taken from memory till mini-batch size.
gridSize = 10 # The size of the grid that the agent is going to play the game on.
nbStates = gridSize * gridSize # We eventually flatten to a 1d tensor to feed the network.
discount = 0.9 # The discount is used to force the network to choose states that lead to the reward quicker (0 to 1)
learningRate = 0.2 # Learning Rate for Stochastic Gradient Descent (our optimizer).
# Create the base model.
X = tf.placeholder(tf.float32, [None, nbStates])
W1 = tf.Variable(tf.truncated_normal([nbStates, hiddenSize], stddev=1.0 / math.sqrt(float(nbStates))))
b1 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)
W2 = tf.Variable(tf.truncated_normal([hiddenSize, hiddenSize],stddev=1.0 / math.sqrt(float(hiddenSize))))
b2 = tf.Variable(tf.truncated_normal([hiddenSize], stddev=0.01))
hiddenlayer = tf.nn.relu(tf.matmul(inputlayer, W2) + b2)
W3 = tf.Variable(tf.truncated_normal([hiddenSize, nbActions],stddev=1.0 / math.sqrt(float(hiddenSize))))
b3 = tf.Variable(tf.truncated_normal([nbActions], stddev=0.01))
outputlayer = tf.matmul(hiddenlayer, W3) + b3
# True labels
Y = tf.placeholder(tf.float32, [None, nbActions])
# Mean squared error cost function
cost = tf.reducesum(tf.square(Y-outputlayer)) / (2*batchSize)
# Stochastic Gradient Decent Optimizer
optimizer = tf.train.GradientDescentOptimizer(learningRate).minimize(cost)
# Helper function: Chooses a random value between the two boundaries.
def randf(s, e):
return (float(random.randrange(0, (e - s) * 9999)) / 10000) + s;
# The environment: Handles interactions and contains the state of the environment
class CatchEnvironment():
def init(self, gridSize):
self.gridSize = gridSize
self.nbStates = self.gridSize * self.gridSize
self.state = np.empty(3, dtype = np.uint8)
# Returns the state of the environment.
def observe(self):
canvas = self.drawState()
canvas = np.reshape(canvas, (-1,self.nbStates))
return canvas
def drawState(self):
canvas = np.zeros((self.gridSize, self.gridSize))
canvas[self.state[0]-1, self.state[1]-1] = 1 # Draw the fruit.
# Draw the basket. The basket takes the adjacent two places to the position of basket.
canvas[self.gridSize-1, self.state[2] -1 - 1] = 1
canvas[self.gridSize-1, self.state[2] -1] = 1
canvas[self.gridSize-1, self.state[2] -1 + 1] = 1
return canvas
# Resets the environment. Randomly initialise the fruit position (always at the top to begin with) and bucket.
def reset(self):
initialFruitColumn = random.randrange(1, self.gridSize + 1)
initialBucketPosition = random.randrange(2, self.gridSize + 1 - 1)
self.state = np.array([1, initialFruitColumn, initialBucketPosition])
return self.getState()
def getState(self):
stateInfo = self.state
fruit_row = stateInfo[0]
fruit_col = stateInfo[1]
basket = stateInfo[2]
return fruitrow, fruitcol, basket
# Returns the award that the agent has gained for being in the current environment state.
def getReward(self):
fruitRow, fruitColumn, basket = self.getState()
if (fruitRow == self.gridSize - 1): # If the fruit has reached the bottom.
if (abs(fruitColumn - basket) <= 1): # Check if the basket caught the fruit.
return 1
else:
return -1
else:
return 0
def isGameOver(self):
if (self.state[0] == self.gridSize - 1):
return True
else:
return False
def updateState(self, action):
if (action == 1):
action = -1
elif (action == 2):
action = 0
else:
action = 1
fruitRow, fruitColumn, basket = self.getState()
newBasket = min(max(2, basket + action), self.gridSize - 1) # The min/max prevents the basket from moving out of the grid.
fruitRow = fruitRow + 1 # The fruit is falling by 1 every action.
self.state = np.array([fruitRow, fruitColumn, newBasket])
#Action can be 1 (move left) or 2 (move right)
def act(self, action):
self.updateState(action)
reward = self.getReward()
gameOver = self.isGameOver()
return self.observe(), reward, gameOver, self.getState() # For purpose of the visual, I also return the state.
# The memory: Handles the internal memory that we add experiences that occur based on agent's actions,
# and creates batches of experiences based on the mini-batch size for training.
class ReplayMemory:
def init(self, gridSize, maxMemory, discount):
self.maxMemory = maxMemory
self.gridSize = gridSize
self.nbStates = self.gridSize * self.gridSize
self.discount = discount
canvas = np.zeros((self.gridSize, self.gridSize))
canvas = np.reshape(canvas, (-1,self.nbStates))
self.inputState = np.empty((self.maxMemory, 100), dtype = np.float32)
self.actions = np.zeros(self.maxMemory, dtype = np.uint8)
self.nextState = np.empty((self.maxMemory, 100), dtype = np.float32)
self.gameOver = np.empty(self.maxMemory, dtype = np.bool)
self.rewards = np.empty(self.maxMemory, dtype = np.int8)
self.count = 0
self.current = 0
# Appends the experience to the memory.
def remember(self, currentState, action, reward, nextState, gameOver):
self.actions[self.current] = action
self.rewards[self.current] = reward
self.inputState[self.current, ...] = currentState
self.nextState[self.current, ...] = nextState
self.gameOver[self.current] = gameOver
self.count = max(self.count, self.current + 1)
self.current = (self.current + 1) % self.maxMemory
def getBatch(self, model, batchSize, nbActions, nbStates, sess, X):
# We check to see if we have enough memory inputs to make an entire batch, if not we create the biggest
# batch we can (at the beginning of training we will not have enough experience to fill a batch).
memoryLength = self.count
chosenBatchSize = min(batchSize, memoryLength)
inputs = np.zeros((chosenBatchSize, nbStates))
targets = np.zeros((chosenBatchSize, nbActions))
# Fill the inputs and targets up.
for i in xrange(chosenBatchSize):
if memoryLength == 1:
memoryLength = 2
# Choose a random memory experience to add to the batch.
randomIndex = random.randrange(1, memoryLength)
current_inputState = np.reshape(self.inputState[randomIndex], (1, 100))
target = sess.run(model, feeddict={X: currentinputState})
current_nextState = np.reshape(self.nextState[randomIndex], (1, 100))
currentoutputs = sess.run(model, feeddict={X: current_nextState})
# Gives us Q_sa, the max q for the next state.
nextStateMaxQ = np.amax(current_outputs)
if (self.gameOver[randomIndex] == True):
target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex]
else:
# reward + discount(gamma) * max_a' Q(s',a')
# We are setting the Q-value for the action to r + gamma*max a' Q(s', a'). The rest stay the same
# to give an error of 0 for those outputs.
target[0, [self.actions[randomIndex]-1]] = self.rewards[randomIndex] + self.discount * nextStateMaxQ
# Update the inputs and targets.
inputs[i] = current_inputState
targets[i] = target
return inputs, targets
def main(_):
print("Training new model")
# Define Environment
env = CatchEnvironment(gridSize)
# Define Replay Memory
memory = ReplayMemory(gridSize, maxMemory, discount)
# Add ops to save and restore all the variables.
saver = tf.train.Saver()
winCount = 0
with tf.Session() as sess:
tf.initializeallvariables().run()
for i in xrange(epoch):
# Initialize the environment.
err = 0
env.reset()
isGameOver = False
# The initial state of the environment.
currentState = env.observe()
while (isGameOver != True):
action = -9999 # action initilization
# Decides if we should choose a random action, or an action from the policy network.
global epsilon
if (randf(0, 1) <= epsilon):
action = random.randrange(1, nbActions+1)
else:
# Forward the current state through the network.
q = sess.run(outputlayer, feeddict={X: currentState})
# Find the max index (the chosen action).
index = q.argmax()
action = index + 1
# Decay the epsilon by multiplying by 0.999, not allowing it to go below a certain threshold.
if (epsilon > epsilonMinimumValue):
epsilon = epsilon * 0.999
nextState, reward, gameOver, stateInfo = env.act(action)
if (reward == 1):
winCount = winCount + 1
memory.remember(currentState, action, reward, nextState, gameOver)
# Update the current state and if the game is over.
currentState = nextState
isGameOver = gameOver
# We get a batch of training data to train the model.
inputs, targets = memory.getBatch(output_layer, batchSize, nbActions, nbStates, sess, X)
# Train the network which returns the error이.
, loss = sess.run([optimizer, cost], feeddict={X: inputs, Y: targets})
err = err + loss
print("Epoch " + str(i) + ": err = " + str(err) + ": Win count = " + str(winCount) + " Win ratio = " + str(float(winCount)/float(i+1)*100))
# Save the variables to disk.
save_path = saver.save(sess, os.getcwd()+"/model.ckpt")
print("Model saved in file: %s" % save_path)
if name == 'main':
tf.app.run() 입니다
However, I got the following error:
WARNING:tensorflow:From C:\ProgramData\Anaconda3\envs\tens_2\lib\site-packages\tensorflow_core\python\compat\v2_compat.py:65: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
Training new model
WARNING:tensorflow:From C:\ProgramData\Anaconda3\envs\tens_2\lib\site-packages\tensorflow_core\python\util\tf_should_use.py:198: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use tf.global_variables_initializer instead.
W0820 22:17:13.656675 9068 deprecation.py:323] From C:\ProgramData\Anaconda3\envs\tens_2\lib\site-packages\tensorflow_core\python\util\tf_should_use.py:198: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use tf.global_variables_initializer instead.
Traceback (most recent call last):
File "C:\Windows\system32\python", line 267, in <module>
tf.app.run()
File "C:\ProgramData\Anaconda3\envs\tens2\lib\site-packages\tensorflowcore\python\platform\app.py", line 40, in run
run(main=main, argv=argv, flagsparser=parseflagstolerateundef)
File "C:\ProgramData\Anaconda3\envs\tens_2\lib\site-packages\absl\app.py", line 299, in run
runmain(main, args)
File "C:\ProgramData\Anaconda3\envs\tens2\lib\site-packages\absl\app.py", line 250, in run_main
sys.exit(main(argv))
File "C:\Windows\system32\python", line 216, in main
for i in xrange(epoch):
NameError: name 'xrange' is not defined
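My guess (I have not verified this) is that the NameError happens because xrange only exists in Python 2, while my Anaconda environment runs Python 3. Below is just a sketch of what I think the fix might look like: either renaming every xrange(...) call to range(...), or adding a small compatibility shim right after the imports.

# Just a sketch of my guess, not verified: Python 3 has no xrange,
# so either change xrange(...) to range(...) everywhere, or define this once after the imports.
try:
    xrange              # defined in Python 2
except NameError:       # raised in Python 3
    xrange = range      # Python 3's range is lazy, like Python 2's xrange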
How should I fix this?
I know the post is very long, but I would really appreciate any help. ㅠㅠ
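As a side note, the deprecation warning above tells me to use tf.global_variables_initializer instead of initialize_all_variables. I assume (again, not verified) that this just means changing the session setup inside main() along these lines:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# My reading of the warning, not verified: use the suggested initializer
# instead of the deprecated tf.initialize_all_variables().run() call.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())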