Source code for tensorforce.core.policies.parametrized_distributions

# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from collections import OrderedDict

import tensorflow as tf

from tensorforce import TensorforceError, util
from tensorforce.core import distribution_modules, layer_modules, Module, network_modules
from tensorforce.core.networks import Network
from tensorforce.core.policies import Stochastic, ActionValue


class ParametrizedDistributions(Stochastic, ActionValue):
    """
    Policy which parametrizes independent distributions per action conditioned on the output of a
    central states-processing neural network (supports both stochastic and action-value-based
    policy interface) (specification key: `parametrized_distributions`).

    Args:
        name (string): Module name
            (<span style="color:#0000C0"><b>internal use</b></span>).
        network ('auto' | specification): Policy network configuration, see
            [networks](../modules/networks.html)
            (<span style="color:#00C000"><b>default</b></span>: 'auto', automatically configured
            network).
        distributions (dict[specification]): Distributions configuration, see
            [distributions](../modules/distributions.html), specified per action-type or -name
            (<span style="color:#00C000"><b>default</b></span>: per action-type, Bernoulli
            distribution for binary boolean actions, categorical distribution for discrete integer
            actions, Gaussian distribution for unbounded continuous actions, Beta distribution for
            bounded continuous actions).
        temperature (parameter | dict[parameter], float >= 0.0): Sampling temperature, global or
            per action (<span style="color:#00C000"><b>default</b></span>: 0.0).
        use_beta_distribution (bool): Whether to use the Beta distribution for bounded continuous
            actions by default
            (<span style="color:#00C000"><b>default</b></span>: true).
        infer_state_value (False | "action-values" | "distribution"): Whether to infer the state
            value from either the action values or (experimental) the distribution parameters
            (<span style="color:#00C000"><b>default</b></span>: false).
        device (string): Device name
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        summary_labels ('all' | iter[string]): Labels of summaries to record
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        l2_regularization (float >= 0.0): Scalar controlling L2 regularization
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        states_spec (specification): States specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        actions_spec (specification): Actions specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
    """

    # Network first
    def __init__(
        self, name, network='auto', distributions=None, temperature=0.0,
        use_beta_distribution=True, infer_state_value=False, device=None, summary_labels=None,
        l2_regularization=None, states_spec=None, actions_spec=None
    ):
        if isinstance(network, Network):
            assert device is None
            device = network.device
            network.device = None

        super().__init__(
            name=name, states_spec=states_spec, actions_spec=actions_spec,
            temperature=temperature, device=device, summary_labels=summary_labels,
            l2_regularization=l2_regularization
        )

        # Network
        self.network = self.add_module(
            name=(self.name + '-network'), module=network, modules=network_modules,
            inputs_spec=self.states_spec
        )
        output_spec = self.network.get_output_spec()
        if output_spec['type'] != 'float':
            raise TensorforceError(
                "Invalid output type for network: {}.".format(output_spec['type'])
            )
        Module.register_tensor(name=self.name, spec=output_spec, batched=True)
        embedding_shape = output_spec['shape']

        # Distributions
        self.distributions = OrderedDict()
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'bool':
                default_module = 'bernoulli'
            elif spec['type'] == 'int':
                default_module = 'categorical'
            elif spec['type'] == 'float':
                if use_beta_distribution and 'min_value' in spec:
                    default_module = 'beta'
                else:
                    default_module = 'gaussian'

            if distributions is None:
                module = None
            else:
                module = dict()
                if spec['type'] in distributions:
                    if isinstance(distributions[spec['type']], str):
                        module = distributions[spec['type']]
                    else:
                        module.update(distributions[spec['type']])
                if name in distributions:
                    if isinstance(distributions[name], str):
                        module = distributions[name]
                    else:
                        module.update(distributions[name])

            self.distributions[name] = self.add_module(
                name=(name + '-distribution'), module=module, modules=distribution_modules,
                default_module=default_module, action_spec=spec, embedding_shape=embedding_shape
            )

        # State value
        assert infer_state_value in (False, 'action-values', 'distribution')
        self.infer_state_value = infer_state_value
        if self.infer_state_value is False:
            self.value = self.add_module(
                name='states-value', module='linear', modules=layer_modules, size=0,
                input_spec=output_spec
            )

    @classmethod
    def internals_spec(cls, network=None, policy=None, name=None, states_spec=None, **kwargs):
        if policy is None:
            if network is None:
                network = 'auto'
            assert name is not None and states_spec is not None
            network_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name=(name + '-network'), module=network, modules=network_modules,
                inputs_spec=states_spec
            )
            if first_arg is None:
                return network_cls.internals_spec(name=(name + '-network'), **kwargs)
            else:
                return network_cls.internals_spec(first_arg, name=(name + '-network'), **kwargs)

        else:
            assert network is None and name is None and states_spec is None
            return policy.network.__class__.internals_spec(network=policy.network)

    def internals_init(self):
        return self.network.internals_init()

    def max_past_horizon(self, is_optimization):
        return self.network.max_past_horizon(is_optimization=is_optimization)

    def tf_past_horizon(self, is_optimization):
        return self.network.past_horizon(is_optimization=is_optimization)

    def tf_act(self, states, internals, auxiliaries, return_internals):
        return Stochastic.tf_act(
            self=self, states=states, internals=internals, auxiliaries=auxiliaries,
            return_internals=return_internals
        )

    def tf_sample_actions(self, states, internals, auxiliaries, temperature, return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution, temp in util.zip_items(
            self.actions_spec, self.distributions, temperature
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions[name] = distribution.sample(parameters=parameters, temperature=temp)

        if return_internals:
            return actions, internals
        else:
            return actions

    def tf_log_probabilities(self, states, internals, auxiliaries, actions):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        log_probabilities = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            log_probabilities[name] = distribution.log_probability(
                parameters=parameters, action=action
            )

        return log_probabilities

    def tf_entropies(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        entropies = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            entropies[name] = distribution.entropy(parameters=parameters)

        return entropies

    def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
        parameters = self.kldiv_reference(
            states=states, internals=internals, auxiliaries=auxiliaries
        )

        if other is None:
            other = util.fmap(function=tf.stop_gradient, xs=parameters)
        elif isinstance(other, ParametrizedDistributions):
            other = other.kldiv_reference(
                states=states, internals=internals, auxiliaries=auxiliaries
            )
            other = util.fmap(function=tf.stop_gradient, xs=other)
        elif isinstance(other, dict):
            if any(name not in other for name in self.actions_spec):
                raise TensorforceError.unexpected()
        else:
            raise TensorforceError.unexpected()

        kl_divergences = OrderedDict()
        for name, distribution in self.distributions.items():
            kl_divergences[name] = distribution.kl_divergence(
                parameters1=parameters[name], parameters2=other[name]
            )

        return kl_divergences

    def tf_kldiv_reference(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)

        kldiv_reference = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                kldiv_reference[name] = distribution.parametrize(x=embedding, mask=mask)
            else:
                kldiv_reference[name] = distribution.parametrize(x=embedding)

        return kldiv_reference

    def tf_states_values(self, states, internals, auxiliaries):
        if self.infer_state_value == 'action-values':
            return ActionValue.tf_states_values(
                self=self, states=states, internals=internals, auxiliaries=auxiliaries
            )

        else:
            embedding = self.network.apply(x=states, internals=internals)
            Module.update_tensor(name=self.name, tensor=embedding)

            states_values = OrderedDict()
            for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
                if spec['type'] == 'int':
                    mask = auxiliaries[name + '_mask']
                    parameters = distribution.parametrize(x=embedding, mask=mask)
                else:
                    parameters = distribution.parametrize(x=embedding)
                states_values[name] = distribution.states_value(parameters=parameters)

            return states_values

    def tf_actions_values(self, states, internals, auxiliaries, actions=None):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        actions_values = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            if actions is None:
                action = None
            else:
                action = actions[name]
            actions_values[name] = distribution.action_value(parameters=parameters, action=action)

        return actions_values

    def tf_states_value(
        self, states, internals, auxiliaries, reduced=True, include_per_action=False
    ):
        if self.infer_state_value is False:
            if not reduced or include_per_action:
                raise TensorforceError.invalid(name='policy.states_value', argument='reduced')

            embedding = self.network.apply(x=states, internals=internals)
            Module.update_tensor(name=self.name, tensor=embedding)

            states_value = self.value.apply(x=embedding)

            return states_value

        else:
            return ActionValue.tf_states_value(
                self=self, states=states, internals=internals, auxiliaries=auxiliaries,
                reduced=reduced, include_per_action=include_per_action
            )
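
The specification key noted in the docstring means this policy can be configured from a plain
dictionary rather than instantiated directly. Below is a minimal, hypothetical sketch of such a
specification, mirroring the constructor arguments documented above; the two-layer dense network
and the per-type distribution override are illustrative assumptions, not taken from this module,
and the resulting dict would typically be handed to an agent's policy configuration.

# Sketch only (not part of the module): a possible specification dict for this policy.
# The layer sizes and the 'float' distribution override are illustrative assumptions.
policy_spec = dict(
    type='parametrized_distributions',
    network=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    # Override the default distribution per action-type or per action-name (see docstring).
    distributions=dict(float='gaussian'),
    temperature=0.0,
    use_beta_distribution=True,
    infer_state_value=False
)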