import gym
# Create the environment registered under the id 'KellyCoinflip-v0'.
# The id must be a plain ASCII-quoted string; typographic quotes are not
# valid string delimiters in Python.
env = gym.make('KellyCoinflip-v0')
from scipy.stats import binom
import numpy as np
from repoze.lru import lru_cache
def V(w, b, m=250):
    """Return a closed-form value estimate for wealth ``w`` with ``b`` bets left.

    The boundary cases are handled first; everything else goes through a
    formula built from the Binomial(b, 0.5) distribution.
    NOTE(review): presumably this approximates the expected final wealth of
    the capped coin-flip game -- confirm against the source of the formula.

    Args:
        w: Current wealth.
        b: Number of bets remaining.
        m: Wealth cap. Defaults to 250.

    Returns:
        ``m`` if ``w >= m``, ``0`` if ``w <= 0``, ``w`` if ``b == 0``,
        otherwise the value of the formula. Returns ``None`` (after printing
        the offending arguments) if the formula raises ``ValueError``, e.g.
        when the quantile is NaN and cannot be converted to an int.
    """
    # Use the cap parameter rather than a hard-coded 250 so that a
    # non-default ``m`` is honoured consistently.
    if w >= m:
        return m
    if w <= 0:
        return 0
    if b == 0:
        return w
    try:
        # Quantile of Binomial(b, 0.5) at probability w/m.
        j = binom.ppf(float(w) / float(m), b, 0.5)
        terms = int(j + 1)
        # CDF evaluated at -1, 0, ..., terms - 2.
        cdf_values = binom.cdf([k - 1 for k in range(terms)], b, 0.5)
        # Weights 1.5**(terms - 1), ..., 1.5**1, 1.5**0.
        weights = [1.5 ** k for k in reversed(range(terms))]
        weighted_sum = sum(np.multiply(cdf_values, weights))
        return 1.2 ** b * 1.5 ** -j * (w + m / 2 * weighted_sum)
    except ValueError:
        print("Error:", (w, b, m))
        return None
@lru_cache(None)
def VPplan(w, b):
    """Choose a bet for wealth ``w`` with ``b`` bets left via one-step lookahead.

    Every bet from 0 up to the current wealth, in steps of 0.01, is scored as
    ``0.6 * V(w + bet, b - 1) + 0.4 * V(w - bet, b - 1)``; the first bet with
    the highest score is returned.

    Returns 0 when ``w <= 0`` or ``w >= 250``, and ``w`` when ``b == 0``.
    """
    # Short-circuit: nothing to evaluate once wealth is at or past either
    # boundary.
    if w <= 0 or w >= 250:
        return 0
    if b == 0:
        return w

    # Candidate stakes 0.00, 0.01, ... enumerated in hundredths.
    candidate_bets = [
        float(hundredths) / 100.0
        for hundredths in range(0, int((w * 100) + 1))
    ]
    scores = [
        0.6 * V(w + stake, b - 1) + 0.4 * V(w - stake, b - 1)
        for stake in candidate_bets
    ]
    # Position of the first maximum, converted back from hundredths.
    best_position = scores.index(max(scores))
    return float(best_position) / 100.0
# Play 500 games with the VPplan policy and print the mean total reward.
rewards = []
for n in range(500):
    reward = 0
    done = False
    while not done:
        # Read the current state from the environment; presumably
        # (wealth array, bets remaining) -- confirm against the env.
        w = env._get_obs()[0][0]
        b = env._get_obs()[1]
        bet = VPplan(w, b)
        # The bet is scaled by 100 before being passed as the action.
        results = env.step(bet * 100)
        print(n, w, b, bet, "results:", results)
        # results[1] is accumulated as the reward, results[2] ends the game.
        reward += results[1]
        done = results[2]
    rewards.append(reward)
    env.reset()
print(sum(rewards) / len(rewards))
# Original article: https://www.cnblogs.com/dzqdzq/p/15008589.html