# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from collections import OrderedDict
import tensorflow as tf
from tensorforce import TensorforceError, util
from tensorforce.core import distribution_modules, layer_modules, Module, network_modules
from tensorforce.core.networks import Network
from tensorforce.core.policies import Stochastic, ActionValue
class ParametrizedDistributions(Stochastic, ActionValue):
    """
    Policy which parametrizes independent distributions per action conditioned on the output of a
    central states-processing neural network (supports both stochastic and action-value-based
    policy interface) (specification key: `parametrized_distributions`).

    Args:
        name (string): Module name
            (<span style="color:#0000C0"><b>internal use</b></span>).
        network ('auto' | specification): Policy network configuration, see
            [networks](../modules/networks.html)
            (<span style="color:#00C000"><b>default</b></span>: 'auto', automatically configured
            network).
        distributions (dict[specification]): Distributions configuration, see
            [distributions](../modules/distributions.html), specified per
            action-type or -name
            (<span style="color:#00C000"><b>default</b></span>: per action-type, Bernoulli
            distribution for binary boolean actions, categorical distribution for discrete integer
            actions, Gaussian distribution for unbounded continuous actions, Beta distribution for
            bounded continuous actions).
        temperature (parameter | dict[parameter], float >= 0.0): Sampling temperature, global or
            per action (<span style="color:#00C000"><b>default</b></span>: 0.0).
        use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous
            actions by default.
            (<span style="color:#00C000"><b>default</b></span>: true).
        infer_state_value (False | "action-values" | "distribution"): Whether to infer the state
            value from either the action values or (experimental) the distribution parameters
            (<span style="color:#00C000"><b>default</b></span>: false).
        device (string): Device name
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        summary_labels ('all' | iter[string]): Labels of summaries to record
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        l2_regularization (float >= 0.0): Scalar controlling L2 regularization
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        states_spec (specification): States specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        actions_spec (specification): Actions specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
    """

    def __init__(
        self, name, network='auto', distributions=None, temperature=0.0, use_beta_distribution=True,
        infer_state_value=False, device=None, summary_labels=None, l2_regularization=None,
        states_spec=None, actions_spec=None
    ):
        # Network first: when a pre-built Network instance is passed, adopt its device as the
        # policy device (which must not also be set explicitly) and clear it on the network so
        # the device setting is not applied twice.
        if isinstance(network, Network):
            assert device is None
            device = network.device
            network.device = None

        super().__init__(
            name=name, states_spec=states_spec, actions_spec=actions_spec, temperature=temperature,
            device=device, summary_labels=summary_labels, l2_regularization=l2_regularization
        )

        # Network: central states-processing network; its output embedding must be float-typed
        # since it feeds the distribution parametrizations.
        self.network = self.add_module(
            name=(self.name + '-network'), module=network, modules=network_modules,
            inputs_spec=self.states_spec
        )
        output_spec = self.network.get_output_spec()
        if output_spec['type'] != 'float':
            raise TensorforceError(
                "Invalid output type for network: {}.".format(output_spec['type'])
            )
        Module.register_tensor(name=self.name, spec=output_spec, batched=True)
        embedding_shape = output_spec['shape']

        # Distributions: one independent distribution per action. The default module depends on
        # the action type; the `distributions` argument may override it per action-type and/or
        # per action-name, with a per-name entry taking precedence over a per-type entry.
        self.distributions = OrderedDict()
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'bool':
                default_module = 'bernoulli'
            elif spec['type'] == 'int':
                default_module = 'categorical'
            elif spec['type'] == 'float':
                # NOTE(review): only 'min_value' is checked to detect a bounded action;
                # presumably bounded specs always carry both bounds -- confirm against the
                # actions-spec sanitization upstream.
                if use_beta_distribution and 'min_value' in spec:
                    default_module = 'beta'
                else:
                    default_module = 'gaussian'
            else:
                # Fail fast: previously an unrecognized action type left default_module unbound
                # and crashed later with UnboundLocalError.
                raise TensorforceError(
                    "Invalid action type for distribution: {}.".format(spec['type'])
                )
            if distributions is None:
                module = None
            else:
                module = dict()
                if spec['type'] in distributions:
                    if isinstance(distributions[spec['type']], str):
                        module = distributions[spec['type']]
                    else:
                        module.update(distributions[spec['type']])
                if name in distributions:
                    if isinstance(distributions[name], str):
                        module = distributions[name]
                    else:
                        module.update(distributions[name])
            self.distributions[name] = self.add_module(
                name=(name + '-distribution'), module=module, modules=distribution_modules,
                default_module=default_module, action_spec=spec, embedding_shape=embedding_shape
            )

        # State value: unless the state value is inferred from action values or distribution
        # parameters, add a dedicated linear head on top of the network embedding.
        # NOTE(review): size=0 presumably yields a scalar output -- confirm against the
        # linear layer module.
        assert infer_state_value in (False, 'action-values', 'distribution')
        self.infer_state_value = infer_state_value
        if self.infer_state_value is False:
            self.value = self.add_module(
                name='states-value', module='linear', modules=layer_modules, size=0,
                input_spec=output_spec
            )

    @classmethod
    def internals_spec(cls, network=None, policy=None, name=None, states_spec=None, **kwargs):
        # Internals spec comes entirely from the policy network: either resolve the network
        # class from its specification (no policy instance yet), or query the existing
        # instance's class.
        if policy is None:
            if network is None:
                network = 'auto'
            assert name is not None and states_spec is not None
            network_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name=(name + '-network'), module=network, modules=network_modules,
                inputs_spec=states_spec
            )
            if first_arg is None:
                return network_cls.internals_spec(name=(name + '-network'), **kwargs)
            else:
                return network_cls.internals_spec(first_arg, name=(name + '-network'), **kwargs)
        else:
            assert network is None and name is None and states_spec is None
            return policy.network.__class__.internals_spec(network=policy.network)

    def internals_init(self):
        # Delegate to the network, which owns all internal (recurrent) state.
        return self.network.internals_init()

    def max_past_horizon(self, is_optimization):
        return self.network.max_past_horizon(is_optimization=is_optimization)

    def tf_past_horizon(self, is_optimization):
        return self.network.past_horizon(is_optimization=is_optimization)

    def tf_act(self, states, internals, auxiliaries, return_internals):
        # Explicitly dispatch to the stochastic-policy act (sampling), since both parent
        # interfaces (Stochastic, ActionValue) provide an act pathway.
        return Stochastic.tf_act(
            self=self, states=states, internals=internals, auxiliaries=auxiliaries,
            return_internals=return_internals
        )

    def tf_sample_actions(self, states, internals, auxiliaries, temperature, return_internals):
        # Compute the shared embedding once, then sample each action from its distribution.
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution, temp in util.zip_items(
            self.actions_spec, self.distributions, temperature
        ):
            # Integer actions carry a mask of currently valid choices.
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions[name] = distribution.sample(parameters=parameters, temperature=temp)

        if return_internals:
            return actions, internals
        else:
            return actions

    def tf_log_probabilities(self, states, internals, auxiliaries, actions):
        # Per-action log-probability of the given actions under the current distributions.
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        log_probabilities = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            log_probabilities[name] = distribution.log_probability(
                parameters=parameters, action=action
            )

        return log_probabilities

    def tf_entropies(self, states, internals, auxiliaries):
        # Per-action entropy of the current distributions.
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        entropies = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            entropies[name] = distribution.entropy(parameters=parameters)

        return entropies

    def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
        # Per-action KL divergence between the current distributions and a reference:
        # - other is None: the current parameters themselves, gradient-stopped (self-KL
        #   reference, useful for trust-region-style constraints);
        # - other is another ParametrizedDistributions: its (gradient-stopped) parameters;
        # - other is a dict: pre-computed parameters per action.
        parameters = self.kldiv_reference(
            states=states, internals=internals, auxiliaries=auxiliaries
        )
        if other is None:
            other = util.fmap(function=tf.stop_gradient, xs=parameters)
        elif isinstance(other, ParametrizedDistributions):
            other = other.kldiv_reference(
                states=states, internals=internals, auxiliaries=auxiliaries
            )
            other = util.fmap(function=tf.stop_gradient, xs=other)
        elif isinstance(other, dict):
            if any(name not in other for name in self.actions_spec):
                raise TensorforceError.unexpected()
        else:
            raise TensorforceError.unexpected()

        kl_divergences = OrderedDict()
        for name, distribution in self.distributions.items():
            kl_divergences[name] = distribution.kl_divergence(
                parameters1=parameters[name], parameters2=other[name]
            )

        return kl_divergences

    def tf_kldiv_reference(self, states, internals, auxiliaries):
        # Distribution parameters per action, used as the reference point for KL divergences.
        embedding = self.network.apply(x=states, internals=internals)

        kldiv_reference = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                kldiv_reference[name] = distribution.parametrize(x=embedding, mask=mask)
            else:
                kldiv_reference[name] = distribution.parametrize(x=embedding)

        return kldiv_reference

    def tf_states_values(self, states, internals, auxiliaries):
        # Per-action state values: either derived from the action values (ActionValue
        # interface) or taken from each distribution's own state-value estimate.
        if self.infer_state_value == 'action-values':
            return ActionValue.tf_states_values(
                self=self, states=states, internals=internals, auxiliaries=auxiliaries
            )
        else:
            embedding = self.network.apply(x=states, internals=internals)
            Module.update_tensor(name=self.name, tensor=embedding)

            states_values = OrderedDict()
            for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
                if spec['type'] == 'int':
                    mask = auxiliaries[name + '_mask']
                    parameters = distribution.parametrize(x=embedding, mask=mask)
                else:
                    parameters = distribution.parametrize(x=embedding)
                states_values[name] = distribution.states_value(parameters=parameters)

            return states_values

    def tf_actions_values(self, states, internals, auxiliaries, actions=None):
        # Per-action action values; if actions is None, each distribution evaluates its
        # default (e.g. all) action values.
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        actions_values = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            if actions is None:
                action = None
            else:
                action = actions[name]
            actions_values[name] = distribution.action_value(parameters=parameters, action=action)

        return actions_values

    def tf_states_value(
        self, states, internals, auxiliaries, reduced=True, include_per_action=False
    ):
        # Scalar state value: from the dedicated linear head when no inference mode is
        # configured (only the reduced, aggregate form is available then), otherwise
        # delegated to the ActionValue interface.
        if self.infer_state_value is False:
            if not reduced or include_per_action:
                raise TensorforceError.invalid(name='policy.states_value', argument='reduced')
            embedding = self.network.apply(x=states, internals=internals)
            Module.update_tensor(name=self.name, tensor=embedding)
            states_value = self.value.apply(x=embedding)
            return states_value
        else:
            return ActionValue.tf_states_value(
                self=self, states=states, internals=internals, auxiliaries=auxiliaries,
                reduced=reduced, include_per_action=include_per_action
            )