Source code for tensorforce.core.policies.parametrized_distributions

# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from collections import OrderedDict

import tensorflow as tf

from tensorforce import TensorforceError, util
from tensorforce.core import distribution_modules, Module, network_modules
from tensorforce.core.networks import Network
from tensorforce.core.policies import Stochastic, ActionValue


class ParametrizedDistributions(Stochastic, ActionValue):
    """
    Policy which parametrizes independent distributions per action, conditioned on the output of a
    central states-processing neural network (supports both the stochastic and the
    action-value-based policy interface) (specification key: `parametrized_distributions`).

    Args:
        name (string): Module name
            (<span style="color:#0000C0"><b>internal use</b></span>).
        states_spec (specification): States specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        actions_spec (specification): Actions specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        network ('auto' | specification): Policy network configuration, see
            [networks](../modules/networks.html)
            (<span style="color:#00C000"><b>default</b></span>: 'auto', automatically configured
            network).
        distributions (dict[specification]): Distributions configuration, see
            [distributions](../modules/distributions.html), specified per action-type or -name
            (<span style="color:#00C000"><b>default</b></span>: per action-type, Bernoulli
            distribution for binary boolean actions, categorical distribution for discrete integer
            actions, Gaussian distribution for unbounded continuous actions, Beta distribution for
            bounded continuous actions).
        device (string): Device name
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        summary_labels ('all' | iter[string]): Labels of summaries to record
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        l2_regularization (float >= 0.0): Scalar controlling L2 regularization
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
    """

    def __init__(
        self, name, states_spec, actions_spec, network='auto', distributions=None, device=None,
        summary_labels=None, l2_regularization=None
    ):
        if isinstance(network, Network):
            assert device is None
            device = network.device
            network.device = None

        super().__init__(
            name=name, states_spec=states_spec, actions_spec=actions_spec, device=device,
            summary_labels=summary_labels, l2_regularization=l2_regularization
        )

        # Network
        if isinstance(network, Network):
            self.network = network
        else:
            self.network = self.add_module(
                name=(self.name + '-network'), module=network, modules=network_modules,
                inputs_spec=self.states_spec
            )
        output_spec = self.network.get_output_spec()
        if output_spec['type'] != 'float':
            raise TensorforceError(
                "Invalid output type for network: {}.".format(output_spec['type'])
            )
        elif len(output_spec['shape']) != 1:
            raise TensorforceError(
                "Invalid output rank for network: {}.".format(len(output_spec['shape']))
            )
        Module.register_tensor(name=self.name, spec=output_spec, batched=True)
        embedding_size = output_spec['shape'][0]

        # Distributions
        self.distributions = OrderedDict()
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'bool':
                default_module = 'bernoulli'
            elif spec['type'] == 'int':
                default_module = 'categorical'
            elif spec['type'] == 'float':
                default_module = 'beta' if 'min_value' in spec else 'gaussian'

            if distributions is None:
                module = None
            else:
                module = dict()
                if spec['type'] in distributions:
                    if isinstance(distributions[spec['type']], str):
                        module = distributions[spec['type']]
                    else:
                        module.update(distributions[spec['type']])
                if name in distributions:
                    if isinstance(distributions[name], str):
                        module = distributions[name]
                    else:
                        module.update(distributions[name])

            self.distributions[name] = self.add_module(
                name=(name + '-distribution'), module=module, modules=distribution_modules,
                default_module=default_module, action_spec=spec, embedding_size=embedding_size
            )
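    # The classmethod below re-resolves the network module class from its
    # specification arguments, presumably so that the recurrent internals of the
    # policy network can be queried before a policy instance exists; once an
    # instance (or its network) is available, it is passed via `policy` instead.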
    @classmethod
    def internals_spec(cls, network=None, policy=None, name=None, states_spec=None, **kwargs):
        if policy is None:
            if network is None:
                network = 'auto'
            assert name is not None and states_spec is not None

            network_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name=(name + '-network'), module=network, modules=network_modules,
                inputs_spec=states_spec
            )

            if first_arg is None:
                return network_cls.internals_spec(name=(name + '-network'), **kwargs)
            else:
                return network_cls.internals_spec(first_arg, name=(name + '-network'), **kwargs)

        else:
            assert network is None and name is None and states_spec is None

            return policy.network.__class__.internals_spec(network=policy.network)

    def internals_init(self):
        return self.network.internals_init()

    def tf_dependency_horizon(self, is_optimization=False):
        return self.network.dependency_horizon(is_optimization=is_optimization)

    def tf_act(self, states, internals, auxiliaries):
        return Stochastic.tf_act(
            self=self, states=states, internals=internals, auxiliaries=auxiliaries
        )

    def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            action = distribution.sample(parameters=parameters, deterministic=deterministic)

            entropy = distribution.entropy(parameters=parameters)
            entropy = tf.reshape(tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))
            mean_entropy = tf.reduce_mean(input_tensor=entropy, axis=1)
            actions[name] = self.add_summary(
                label='entropy', name=(name + '-entropy'), tensor=mean_entropy,
                pass_tensors=action
            )

        if return_internals:
            return actions, internals
        else:
            return actions

    def tf_log_probabilities(self, states, internals, auxiliaries, actions):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        log_probabilities = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            log_probabilities[name] = distribution.log_probability(
                parameters=parameters, action=action
            )

        return log_probabilities

    def tf_entropies(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        entropies = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            entropies[name] = distribution.entropy(parameters=parameters)

        return entropies
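    # In tf_kl_divergences below, passing other=None compares the current
    # distribution parameters against a stop-gradient copy of themselves, so the
    # resulting KL term penalizes updates that move the policy away from its
    # current distribution (as used, e.g., by trust-region-style constraints).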
    def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
        assert other is None or isinstance(other, ParametrizedDistributions)

        embedding = self.network.apply(x=states, internals=internals)
        if other is not None:
            other_embedding = other.network.apply(x=states, internals=internals)

        kl_divergences = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            if other is None:
                other_parameters = tuple(tf.stop_gradient(input=value) for value in parameters)
            elif spec['type'] == 'int':
                other_parameters = other.distributions[name].parametrize(
                    x=other_embedding, mask=mask
                )
            else:
                other_parameters = other.distributions[name].parametrize(x=other_embedding)
            kl_divergences[name] = distribution.kl_divergence(
                parameters1=other_parameters, parameters2=parameters  # order????
            )

        return kl_divergences

    def tf_states_values(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        states_values = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            states_values[name] = distribution.states_value(parameters=parameters)

        return states_values

    def tf_actions_values(self, states, internals, auxiliaries, actions=None):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        actions_values = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions_values[name] = distribution.action_value(parameters=parameters, action=action)

        return actions_values
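
# A minimal usage sketch: selecting this policy via its specification key when
# creating an agent. This assumes the Tensorforce 0.5 `Agent`/`Environment` API
# and an installed OpenAI Gym `CartPole-v1` environment; the agent type, memory
# size, and hyperparameter values below are illustrative assumptions, not
# recommendations.
if __name__ == '__main__':
    from tensorforce import Agent, Environment

    environment = Environment.create(environment='gym', level='CartPole-v1')
    agent = Agent.create(
        agent='tensorforce', environment=environment,
        # Policy network plus one categorical distribution per int action
        policy=dict(
            type='parametrized_distributions', network='auto',
            distributions=dict(int='categorical')
        ),
        memory=1000, update=dict(unit='timesteps', batch_size=64),
        objective='policy_gradient', reward_estimation=dict(horizon=20)
    )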