Source code for tensorforce.core.policies.parametrized_distributions

# Copyright 2018 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from collections import OrderedDict

import tensorflow as tf

from tensorforce import TensorforceError, util
from tensorforce.core import distribution_modules, Module, network_modules
from tensorforce.core.networks import Network
from tensorforce.core.policies import Stochastic, ActionValue


class ParametrizedDistributions(Stochastic, ActionValue):
    """
    Policy which parametrizes independent distributions per action, conditioned on the output of a
    central states-processing neural network (supports both the stochastic and the
    action-value-based policy interface) (specification key: `parametrized_distributions`).

    Args:
        name (string): Module name
            (<span style="color:#0000C0"><b>internal use</b></span>).
        states_spec (specification): States specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        actions_spec (specification): Actions specification
            (<span style="color:#0000C0"><b>internal use</b></span>).
        network ('auto' | specification): Policy network configuration, see
            [networks](../modules/networks.html)
            (<span style="color:#00C000"><b>default</b></span>: 'auto', automatically configured
            network).
        distributions (dict[specification]): Distributions configuration, see
            [distributions](../modules/distributions.html), specified per action-type or -name
            (<span style="color:#00C000"><b>default</b></span>: per action-type, Bernoulli
            distribution for binary boolean actions, categorical distribution for discrete integer
            actions, Gaussian distribution for unbounded continuous actions, Beta distribution for
            bounded continuous actions).
        device (string): Device name
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        summary_labels ('all' | iter[string]): Labels of summaries to record
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
        l2_regularization (float >= 0.0): Scalar controlling L2 regularization
            (<span style="color:#00C000"><b>default</b></span>: inherit value of parent module).
    """

    def __init__(
        self, name, states_spec, actions_spec, network='auto', distributions=None, device=None,
        summary_labels=None, l2_regularization=None
    ):
        if isinstance(network, Network):
            assert device is None
            device = network.device
            network.device = None

        super().__init__(
            name=name, states_spec=states_spec, actions_spec=actions_spec, device=device,
            summary_labels=summary_labels, l2_regularization=l2_regularization
        )

        # Network
        if isinstance(network, Network):
            self.network = network
        else:
            self.network = self.add_module(
                name=(self.name + '-network'), module=network, modules=network_modules,
                inputs_spec=self.states_spec
            )
        output_spec = self.network.get_output_spec()
        if output_spec['type'] != 'float':
            raise TensorforceError(
                "Invalid output type for network: {}.".format(output_spec['type'])
            )
        elif len(output_spec['shape']) != 1:
            raise TensorforceError(
                "Invalid output rank for network: {}.".format(len(output_spec['shape']))
            )
        Module.register_tensor(name=self.name, spec=output_spec, batched=True)
        embedding_size = output_spec['shape'][0]

        # Distributions
        self.distributions = OrderedDict()
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'bool':
                default_module = 'bernoulli'
            elif spec['type'] == 'int':
                default_module = 'categorical'
            elif spec['type'] == 'float':
                default_module = 'beta' if 'min_value' in spec else 'gaussian'

            if distributions is None:
                module = None
            else:
                module = dict()
                if spec['type'] in distributions:
                    if isinstance(distributions[spec['type']], str):
                        module = distributions[spec['type']]
                    else:
                        module.update(distributions[spec['type']])
                if name in distributions:
                    if isinstance(distributions[name], str):
                        module = distributions[name]
                    else:
                        module.update(distributions[name])

            self.distributions[name] = self.add_module(
                name=(name + '-distribution'), module=module, modules=distribution_modules,
                default_module=default_module, action_spec=spec, embedding_size=embedding_size
            )
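    # The classmethod below re-resolves the network module class from its
    # specification arguments, presumably so that the recurrent internals of the
    # policy network can be queried before a policy instance exists; once an
    # instance (or its network) is available, it is passed via `policy` instead.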
    @classmethod
    def internals_spec(cls, network=None, policy=None, name=None, states_spec=None, **kwargs):
        if policy is None:
            if network is None:
                network = 'auto'
            assert name is not None and states_spec is not None

            network_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name=(name + '-network'), module=network, modules=network_modules,
                inputs_spec=states_spec
            )

            if first_arg is None:
                return network_cls.internals_spec(name=(name + '-network'), **kwargs)
            else:
                return network_cls.internals_spec(first_arg, name=(name + '-network'), **kwargs)

        else:
            assert network is None and name is None and states_spec is None

            return policy.network.__class__.internals_spec(network=policy.network)

    def internals_init(self):
        return self.network.internals_init()

    def tf_dependency_horizon(self, is_optimization=False):
        return self.network.dependency_horizon(is_optimization=is_optimization)

    def tf_act(self, states, internals, auxiliaries):
        return Stochastic.tf_act(
            self=self, states=states, internals=internals, auxiliaries=auxiliaries
        )

    def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            action = distribution.sample(parameters=parameters, deterministic=deterministic)

            entropy = distribution.entropy(parameters=parameters)
            entropy = tf.reshape(tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))
            mean_entropy = tf.reduce_mean(input_tensor=entropy, axis=1)
            actions[name] = self.add_summary(
                label='entropy', name=(name + '-entropy'), tensor=mean_entropy,
                pass_tensors=action
            )

        if return_internals:
            return actions, internals
        else:
            return actions

    def tf_log_probabilities(self, states, internals, auxiliaries, actions):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        log_probabilities = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            log_probabilities[name] = distribution.log_probability(
                parameters=parameters, action=action
            )

        return log_probabilities

    def tf_entropies(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        entropies = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            entropies[name] = distribution.entropy(parameters=parameters)

        return entropies
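    # In tf_kl_divergences below, passing other=None compares the current
    # distribution parameters against a stop-gradient copy of themselves, so the
    # resulting KL term penalizes updates that move the policy away from its
    # current distribution (as used, e.g., by trust-region-style constraints).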
    def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
        assert other is None or isinstance(other, ParametrizedDistributions)

        embedding = self.network.apply(x=states, internals=internals)
        if other is not None:
            other_embedding = other.network.apply(x=states, internals=internals)

        kl_divergences = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            if other is None:
                other_parameters = tuple(tf.stop_gradient(input=value) for value in parameters)
            elif spec['type'] == 'int':
                other_parameters = other.distributions[name].parametrize(
                    x=other_embedding, mask=mask
                )
            else:
                other_parameters = other.distributions[name].parametrize(x=other_embedding)
            kl_divergences[name] = distribution.kl_divergence(
                parameters1=other_parameters, parameters2=parameters  # order????
            )

        return kl_divergences

    def tf_states_values(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        states_values = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            states_values[name] = distribution.states_value(parameters=parameters)

        return states_values

    def tf_actions_values(self, states, internals, auxiliaries, actions=None):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        actions_values = OrderedDict()
        for name, spec, distribution, action in util.zip_items(
            self.actions_spec, self.distributions, actions
        ):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions_values[name] = distribution.action_value(parameters=parameters, action=action)

        return actions_values
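
# A minimal usage sketch: selecting this policy via its specification key when
# creating an agent. This assumes the Tensorforce 0.5 `Agent`/`Environment` API
# and an installed OpenAI Gym `CartPole-v1` environment; the agent type, memory
# size, and hyperparameter values below are illustrative assumptions, not
# recommendations.
if __name__ == '__main__':
    from tensorforce import Agent, Environment

    environment = Environment.create(environment='gym', level='CartPole-v1')
    agent = Agent.create(
        agent='tensorforce', environment=environment,
        # Policy network plus one categorical distribution per int action
        policy=dict(
            type='parametrized_distributions', network='auto',
            distributions=dict(int='categorical')
        ),
        memory=1000, update=dict(unit='timesteps', batch_size=64),
        objective='policy_gradient', reward_estimation=dict(horizon=20)
    )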