base_gru.py
import theano
import theano.tensor as T
import numpy as np

from util import *


class BaseGRULayer(object):
    """
    Implements a single GRU layer: a reset gate, an update gate, and a
    tanh candidate activation.
    """

    def __init__(self, input_width, output_width, activation_shift=0.0, name=None,
                 dropout_keep=1, dropout_input=False, dropout_output=True):
        """
        Params:
            input_width: Width of the input
            output_width: Width of the GRU output
            activation_shift: Initial shift applied to the candidate-activation bias
            name: Optional prefix for the shared-variable names
            dropout_keep: Probability of keeping a unit (1 disables dropout)
            dropout_input: Whether to apply dropout to the input
            dropout_output: Whether to apply dropout to the output state
        """
        self._input_width = input_width
        self._output_width = output_width

        prefix = "" if name is None else name + "_"

        # Each gate reads the concatenated [input, state] vector, so every
        # weight matrix has input_width + output_width rows. The gate biases
        # start at 1.0 so the gates are initially open.
        self._reset_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "reset_W")
        self._reset_b = theano.shared(init_params([output_width], shift=1.0), prefix + "reset_b")
        self._update_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "update_W")
        self._update_b = theano.shared(init_params([output_width], shift=1.0), prefix + "update_b")
        self._activation_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "activation_W")
        self._activation_b = theano.shared(init_params([output_width], shift=activation_shift), prefix + "activation_b")

        self._dropout_keep = dropout_keep
        self._dropout_input = dropout_input
        self._dropout_output = dropout_output

    @property
    def input_width(self):
        return self._input_width

    @property
    def output_width(self):
        return self._output_width

    @property
    def params(self):
        return [self._reset_W, self._reset_b,
                self._update_W, self._update_b,
                self._activation_W, self._activation_b]

    def initial_state(self, batch_size):
        """
        The initial (all-zero) state of the network

        Params:
            batch_size: The batch size to construct the initial state for
        """
        return T.zeros([batch_size, self.output_width])

    def dropout_masks(self, srng, use_output=None):
        """
        Sample the dropout masks this layer needs, in the order that
        step() consumes them: the input mask (if enabled), then the
        output mask (if enabled).
        """
        if self._dropout_keep == 1:
            return []
        masks = []
        if self._dropout_input:
            masks.append(make_dropout_mask((self._input_width,), self._dropout_keep, srng))
        if self._dropout_output:
            if use_output is not None:
                # Reuse a caller-provided output mask instead of sampling one
                masks.append(use_output)
            else:
                masks.append(make_dropout_mask((self._output_width,), self._dropout_keep, srng))
        return masks

    def split_dropout_masks(self, dropout_masks):
        """
        Split a combined mask list into (this layer's masks, the rest).
        """
        if dropout_masks is None:
            return [], None
        # This layer owns zero masks if dropout is disabled, otherwise one
        # per enabled input/output mask
        idx = (self._dropout_keep != 1) * (self._dropout_input + self._dropout_output)
        return dropout_masks[:idx], dropout_masks[idx:]

    def step(self, ipt, state, dropout_masks=Ellipsis):
        """
        Perform a single step of the network

        Params:
            ipt: The current input. Should be a float tensor of shape
                (n_batch, self.input_width)
            state: The previous state. Should be a float tensor of shape
                (n_batch, self.output_width)
            dropout_masks: Masks from dropout_masks()

        Returns: The next output state, followed by the unconsumed masks
            if masks were passed in
        """
        # Ellipsis marks "not provided" so that an explicit None (dropout
        # disabled) can still be passed through without appending masks
        if dropout_masks is Ellipsis:
            dropout_masks = None
            append_masks = False
        else:
            append_masks = True

        if self._dropout_keep != 1 and self._dropout_input and dropout_masks is not None:
            ipt_masks = dropout_masks[0]
            ipt = apply_dropout(ipt, ipt_masks)
            dropout_masks = dropout_masks[1:]

        # Standard GRU equations:
        #   r  = sigmoid([x, h] W_r + b_r)
        #   u  = sigmoid([x, h] W_u + b_u)
        #   c  = tanh([x, r * h] W_a + b_a)
        #   h' = u * h + (1 - u) * c
        cat_ipt_state = T.concatenate([ipt, state], 1)
        reset = do_layer(T.nnet.sigmoid, cat_ipt_state,
                         self._reset_W, self._reset_b)
        update = do_layer(T.nnet.sigmoid, cat_ipt_state,
                          self._update_W, self._update_b)
        candidate_act = do_layer(T.tanh, T.concatenate([ipt, reset * state], 1),
                                 self._activation_W, self._activation_b)
        newstate = update * state + (1 - update) * candidate_act

        if self._dropout_keep != 1 and self._dropout_output and dropout_masks is not None:
            newstate_masks = dropout_masks[0]
            newstate = apply_dropout(newstate, newstate_masks)
            dropout_masks = dropout_masks[1:]

        if append_masks:
            return newstate, dropout_masks
        else:
            return newstate
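

# A minimal usage sketch (not part of the original module): it runs one GRU
# step on random data, first with dropout disabled and then with an output
# dropout mask threaded through step(). The widths, names, and seed below
# are illustrative assumptions; init_params, do_layer, make_dropout_mask,
# and apply_dropout must come from this repo's util module.
if __name__ == "__main__":
    from theano.tensor.shared_randomstreams import RandomStreams

    n_batch, in_w, out_w = 2, 4, 8
    gru = BaseGRULayer(in_w, out_w, name="demo")

    ipt = T.matrix("ipt")                    # (n_batch, input_width)
    state = gru.initial_state(ipt.shape[0])  # zeros, (n_batch, output_width)
    new_state = gru.step(ipt, state)         # default: no dropout masks

    step_fn = theano.function([ipt], new_state)
    x = np.random.randn(n_batch, in_w).astype(theano.config.floatX)
    print(step_fn(x).shape)                  # expect (2, 8)

    # Dropout variant: sample masks once, then pass them to step(), which
    # returns the new state plus any masks it did not consume.
    srng = RandomStreams(seed=12345)
    gru_d = BaseGRULayer(in_w, out_w, dropout_keep=0.9, name="demo_drop")
    masks = gru_d.dropout_masks(srng)        # [output mask] by default
    new_state_d, _ = gru_d.step(ipt, gru_d.initial_state(ipt.shape[0]), masks)
    drop_fn = theano.function([ipt], new_state_d)
    print(drop_fn(x).shape)                  # expect (2, 8)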