Specialized Unit 2 / Yamauchi Lab Seminar (2020/11/18)

Related sites and materials

Linear algebra library used

linear_algebra.py
import math
   
def add(v, w):
    """Adds corresponding elements"""
    assert len(v) == len(w), "vectors must be the same length"
    return [v_i + w_i for v_i, w_i in zip(v, w)]
   
def subtract(v, w):
    """Subtracts corresponding elements"""
    assert len(v) == len(w), "vectors must be the same length"
    return [v_i - w_i for v_i, w_i in zip(v, w)]
  
def vector_sum(vectors):
    """Sums all corresponding elements"""
    # Check that vectors is not empty
    assert vectors, "no vectors provided!"
  
    # Check the vectors are all the same size
    num_elements = len(vectors[0])
    assert all(len(v) == num_elements for v in vectors), "different sizes!"
  
    # the i-th element of the result is the sum of every vector[i]
    return [sum(vector[i] for vector in vectors) for i in range(num_elements)]
  
def scalar_multiply(c, v):
    """Multiplies every element by c"""
    return [c * v_i for v_i in v]
  
def vector_mean(vectors):
    """Computes the element-wise average"""
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))
  
def dot(v, w):
    """Computes v_1 * w_1 + ... + v_n * w_n"""
    assert len(v) == len(w), "vectors must be same length"
  
    return sum(v_i * w_i for v_i, w_i in zip(v, w))
  
def sum_of_squares(v):
    """Returns v_1 * v_1 + ... + v_n * v_n"""
    return dot(v, v)
  
def magnitude(v):
    """Returns the magnitude (or length) of v"""
    return math.sqrt(sum_of_squares(v))
   
def squared_distance(v, w):
    """Computes (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2"""
    return sum_of_squares(subtract(v, w))
   
def distance(v, w):
    """Computes the distance between v and w"""
    return math.sqrt(squared_distance(v, w))
   
def shape(A):
    """Returns (# of rows of A, # of columns of A)"""
    num_rows = len(A)
    num_cols = len(A[0]) if A else 0   # number of elements in first row
    return num_rows, num_cols
  
def get_row(A, i):
    """Returns the i-th row of A (as a Vector)"""
    return A[i]
  
def get_column(A, j):
    """Returns the j-th column of A (as a Vector)"""
    return [A_i[j] for A_i in A]
   
def make_matrix(num_rows, num_cols, entry_fn):
    """
    Returns a num_rows x num_cols matrix
    whose (i,j)-th entry is entry_fn(i, j)
    """
    return [[entry_fn(i, j) for j in range(num_cols)] for i in range(num_rows)]
   
def identity_matrix(n):
    """Returns the n x n identity matrix"""
    return make_matrix(n, n, lambda i, j: 1 if i == j else 0)
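
A few quick checks (not part of linear_algebra.py; a hedged illustration of the helpers above):
# Hypothetical sanity checks for the helpers defined above
from linear_algebra import add, dot, magnitude, identity_matrix

assert add([1, 2, 3], [4, 5, 6]) == [5, 7, 9]
assert dot([1, 2, 3], [4, 5, 6]) == 32             # 1*4 + 2*5 + 3*6
assert magnitude([3, 4]) == 5.0                    # sqrt(3**2 + 4**2)
assert identity_matrix(2) == [[1, 0], [0, 1]]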
    

Probability library used

probability.py
import math
   
def normal_cdf(x, mu = 0, sigma = 1):
    return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2
   
def inverse_normal_cdf(p, mu = 0, sigma = 1, tolerance = 0.00001):
    """Find approximate inverse using binary search"""
  
    # if not standard, compute standard and rescale
    if mu != 0 or sigma != 1:
        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)
  
    low_z = -10.0                      # normal_cdf(-10) is (very close to) 0
    hi_z  =  10.0                      # normal_cdf(10)  is (very close to) 1
    while hi_z - low_z > tolerance:
        mid_z = (low_z + hi_z) / 2     # Consider the midpoint
        mid_p = normal_cdf(mid_z)      # and the cdf's value there
        if mid_p < p:
            low_z = mid_z              # Midpoint too low, search above it
        else:
            hi_z = mid_z               # Midpoint too high, search below it

    return mid_z
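
A rough check (an added illustration, not part of probability.py): inverse_normal_cdf should approximately invert normal_cdf, up to the binary-search tolerance.
# Hypothetical check of the inverse CDF
from probability import normal_cdf, inverse_normal_cdf

z = inverse_normal_cdf(0.975)              # roughly 1.96 for the standard normal
assert abs(normal_cdf(z) - 0.975) < 1e-4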
    

Neural network library used

neural_networks.py
import math
  
def sigmoid(t):
    return 1 / (1 + math.exp(-t))
  
def binary_encode(x):
    binary = []

    for i in range(10):
        binary.append(x % 2)
        x = x // 2
  
    return binary
  
def fizz_buzz_encode(x):
    if x % 15 == 0:
        return [0, 0, 0, 1]
    elif x % 5 == 0:
        return [0, 0, 1, 0]
    elif x % 3 == 0:
        return [0, 1, 0, 0]
    else:
        return [1, 0, 0, 0]
   
def argmax(xs):
    """Returns the index of the largest value"""
    return max(range(len(xs)), key=lambda i: xs[i])
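
A few illustrative calls (added here as a hedged example; not part of neural_networks.py):
# Hypothetical usage of the helpers above
from neural_networks import binary_encode, fizz_buzz_encode, argmax

assert binary_encode(3) == [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]   # lowest bit first
assert fizz_buzz_encode(15) == [0, 0, 0, 1]                 # "fizzbuzz" class
assert argmax([0.1, 0.7, 0.2]) == 1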
    

19.1 Tensors

The shape of a tensor
def shape(tensor):
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        tensor = tensor[0]
    return sizes
   
print(shape([1, 2, 3]))
print(shape([[1, 2], [3, 4], [5, 6]]))
    

Checking whether a tensor is one-dimensional
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    """
    return not isinstance(tensor[0], list)
   
print(is_1d([1, 2, 3]))
print(is_1d([[1, 2], [3, 4]]))
    

Summing all the values in a tensor
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
   
print(tensor_sum([1, 2, 3]))
print(tensor_sum([[1, 2], [3, 4]]))
    

Applying a function to the values in a tensor
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
   
print(tensor_apply(lambda x: x + 1, [1, 2, 3]))
print(tensor_apply(lambda x: 2 * x, [[1, 2], [3, 4]]))
    

Creating an all-zero tensor with the same shape as a given tensor
def zeros_like(tensor):
    return tensor_apply(lambda _: 0.0, tensor)
   
print(zeros_like([1, 2, 3]))
print(zeros_like([[1, 2], [3, 4]]))
    

Applying a function to corresponding elements of two tensors
import operator
   
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
print(tensor_combine(operator.add, [1, 2, 3], [4, 5, 6]))
print(tensor_combine(operator.mul, [1, 2, 3], [4, 5, 6]))
    

19.2 The Layer Abstraction

Implementation
import operator
from neural_networks import sigmoid
  
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
   
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
   
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sigmoid(Layer):
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
   
    def backward(self, gradient):
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)
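
A minimal usage sketch (assuming is_1d from 19.1 is in scope; the numbers are for illustration only):
# Hypothetical forward/backward pass through a Sigmoid layer
layer = Sigmoid()
output = layer.forward([0.0, 2.0, -2.0])            # elementwise sigmoid; sigmoid(0) == 0.5
input_gradient = layer.backward([1.0, 1.0, 1.0])    # sig * (1 - sig) * grad per element
print(output)
print(input_gradient)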
    

19.3 The Linear Layer

Implementation (1)
def shape(tensor):
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        tensor = tensor[0]
    return sizes
   
import random
from probability import inverse_normal_cdf
   
def random_uniform(*dims):
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]
  
def random_normal(*dims, mean = 0.0, variance = 1.0):
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]
   
def random_tensor(*dims, init = 'normal'):
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")
   
print(shape(random_uniform(2, 3, 4)))
print(shape(random_normal(5, 6, mean=10)))
    

Implementation (2)
from linear_algebra import dot
  
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
  
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Linear(Layer):
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
   
        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)
   
        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)
   
    def forward(self, input):
        # Save the input to use in the backward pass.
        self.input = input
   
        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]
   
    def backward(self, gradient):
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
  
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]
  
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]
   
    def params(self):
        return [self.w, self.b]
   
    def grads(self):
        return [self.w_grad, self.b_grad]
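
A hedged example of one forward/backward pass through a Linear layer (assuming random_tensor and its helpers from Implementation (1) are in scope; the weights are random, so the exact numbers will vary):
# Hypothetical usage of the Linear layer defined above
random.seed(0)
linear = Linear(input_dim=3, output_dim=2)

output = linear.forward([1.0, 2.0, 3.0])       # one value per output neuron
print(output)

input_gradient = linear.backward([1.0, 1.0])   # pretend the upstream gradient is all ones
print(input_gradient)                          # gradient with respect to the 3 inputs
print(linear.grads())                          # [w_grad, b_grad], same order as params()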
    

19.4 Neural Networks as a Sequence of Layers

Implementation
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers
  
    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input
  
    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient
  
    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())
   
    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())
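
A short sketch (assuming the Linear and Sigmoid layers from 19.2 and 19.3 are in scope) showing that Sequential just chains forward calls in order and backward calls in reverse:
# Hypothetical usage of Sequential
net = Sequential([
    Linear(input_dim=2, output_dim=2),
    Sigmoid(),
    Linear(input_dim=2, output_dim=1),
])

output = net.forward([1.0, 0.0])    # forward through each layer in order
gradient = net.backward([1.0])      # backward through each layer in reverse
print(output, gradient)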
    

19.5 Loss and Optimization

Implementation (1)
import operator
   
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
   
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
class Loss:
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError
   
class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)

        # And just add them up
        return tensor_sum(squared_errors)
   
    def gradient(self, predicted, actual):
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)
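
A small worked example (added for illustration, assuming is_1d from 19.1 is in scope): with predictions [1, 2, 3] and targets [10, 20, 30], the squared errors are 81, 324 and 729, so the loss is 1134 and each gradient entry is 2 * (predicted - actual).
# Hypothetical check of the SSE loss
sse = SSE()
assert sse.loss([1, 2, 3], [10, 20, 30]) == 81 + 324 + 729        # 1134
assert sse.gradient([1, 2, 3], [10, 20, 30]) == [-18, -36, -54]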
    

Implementation (2)
class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        raise NotImplementedError
   
class GradientDescent(Optimizer):
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate
  
    def step(self, layer):
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)
   
class Momentum(Optimizer):
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average
   
    def step(self, layer):
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]
   
        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)
   
            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)
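
As a hedged illustration of a single optimizer step (assuming the Linear layer and its helpers from 19.3 are in scope), the optimizer overwrites each parameter in place after forward/backward:
# Hypothetical single training step with GradientDescent
layer = Linear(input_dim=2, output_dim=1)
optimizer = GradientDescent(learning_rate=0.1)

layer.forward([1.0, 2.0])
layer.backward([1.0])              # pretend the loss gradient is 1.0
before = list(layer.w[0])          # copy the single neuron's weights
optimizer.step(layer)              # each w[0][i] moves by -0.1 * input[i]
print(before, layer.w[0])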
    

19.6 Example: XOR Revisited

Implementation
import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math
  
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensonal (that is, a vector).
    """
    return not isinstance(tensor[0], list)
  
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
   
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
   
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
def random_uniform(*dims):
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]
   
def random_normal(*dims, mean = 0.0, variance = 1.0):
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]
   
def random_tensor(*dims, init = 'normal'):
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")
   
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sigmoid(Layer):
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
   
    def backward(self, gradient):
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)
   
class Linear(Layer):
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
   
        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)
   
        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)
   
    def forward(self, input):
        # Save the input to use in the backward pass.
        self.input = input
   
        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]
   
    def backward(self, gradient):
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
  
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]
  
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]
   
    def params(self):
        return [self.w, self.b]
   
    def grads(self):
        return [self.w_grad, self.b_grad]
   
class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers
  
    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input
  
    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient
  
    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())
   
    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())
   
class Loss:
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError
    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError
   
class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)
   
    def gradient(self, predicted, actual):
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)
    
class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        raise NotImplementedError
   
class GradientDescent(Optimizer):
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate
   
    def step(self, layer):
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)
    

Training
random.seed(0)
   
xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
ys = [[0.], [1.], [1.], [0.]]
  
net = Sequential([
        Linear(input_dim=2, output_dim=2),
        Sigmoid(),
        Linear(input_dim=2, output_dim=1)
    ])
    
import tqdm
    
optimizer = GradientDescent(learning_rate=0.1)
loss = SSE()
    
with tqdm.trange(3000) as t:
    for epoch in t:
        epoch_loss = 0.0
    
        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)
    
            optimizer.step(net)
    
        t.set_description(f"xor loss {epoch_loss:.3f}")
   
print()
   
for param in net.params():
    print(param)
    

Verification
for x in xs:
    predicted = net.forward(x)
    print('input={0} output={1}'.format(x, predicted))
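
Since the final Linear layer has no sigmoid, the outputs are real numbers close to 0 or 1 rather than exact labels; a hedged way to read off the learned XOR function is to threshold at 0.5:
# Hypothetical thresholding of the trained network's outputs
for x, y in zip(xs, ys):
    predicted = net.forward(x)[0]
    label = 1 if predicted > 0.5 else 0
    print('input={0} target={1} predicted={2:.3f} label={3}'.format(x, y[0], predicted, label))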
    

19.7 Other Activation Functions

Other activation functions
import math
   
def tanh(x):
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:  return -1
    elif x > 100: return 1
  
    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)
  
class Tanh(Layer):
    def forward(self, input):
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh
  
    def backward(self, gradient):
        return tensor_combine(
            lambda tanh, grad: (1 - tanh ** 2) * grad,
            self.tanh,
            gradient)
  
class Relu(Layer):
    def forward(self, input):
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)
  
    def backward(self, gradient):
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)
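
A quick hedged comparison of the two activations on a small input (assuming tensor_apply and tensor_combine are in scope):
# Hypothetical forward/backward through Tanh and Relu
t = Tanh()
print(t.forward([-1.0, 0.0, 1.0]))    # roughly [-0.762, 0.0, 0.762]
print(t.backward([1.0, 1.0, 1.0]))    # (1 - tanh(x) ** 2) per element

r = Relu()
print(r.forward([-1.0, 0.0, 1.0]))    # negative inputs are clipped to 0
print(r.backward([1.0, 1.0, 1.0]))    # gradient passes only where the input was positive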
    

19.8 Example: Fizz Buzz Revisited

Implementation
import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math
  
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    """
    return not isinstance(tensor[0], list)
  
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
  
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
   
def zeros_like(tensor):
    return tensor_apply(lambda _: 0.0, tensor)
   
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
def random_uniform(*dims):
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]
   
def random_normal(*dims, mean = 0.0, variance = 1.0):
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]
   
def random_tensor(*dims, init = 'normal'):
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")
   
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sigmoid(Layer):
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
   
    def backward(self, gradient):
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)
   
class Linear(Layer):
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
   
        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)
   
        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)
   
    def forward(self, input):
        # Save the input to use in the backward pass.
        self.input = input
   
        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]
   
    def backward(self, gradient):
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
  
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]
  
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]
   
    def params(self):
        return [self.w, self.b]
   
    def grads(self):
        return [self.w_grad, self.b_grad]
   
class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers
  
    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input
  
    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient
  
    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())
   
    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())
  
class Loss:
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError
    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError
   
class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)
   
    def gradient(self, predicted, actual):
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)
   
class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        raise NotImplementedError
   
class GradientDescent(Optimizer):
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate
  
    def step(self, layer):
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)
   
class Momentum(Optimizer):
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average
   
    def step(self, layer):
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]
   
        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)
   
            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)
   
def tanh(x):
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:  return -1
    elif x > 100: return 1
  
    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)
  
class Tanh(Layer):
    def forward(self, input):
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh
  
    def backward(self, gradient):
        return tensor_combine(
            lambda tanh, grad: (1 - tanh ** 2) * grad,
            self.tanh,
            gradient)
  
class Relu(Layer):
    def forward(self, input):
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)
  
    def backward(self, gradient):
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)
    

Training
from neural_networks import binary_encode, fizz_buzz_encode, argmax
import tqdm
   
def fizzbuzz_accuracy(low, hi, net):
    num_correct = 0
    for n in range(low, hi):
        x = binary_encode(n)
        predicted = argmax(net.forward(x))
        actual = argmax(fizz_buzz_encode(n))
        if predicted == actual:
            num_correct += 1

    return num_correct / (hi - low)
   
xs = [binary_encode(n) for n in range(101, 1024)]
ys = [fizz_buzz_encode(n) for n in range(101, 1024)]
   
NUM_HIDDEN = 25
   
random.seed(0)
   
net = Sequential([
        Linear(input_dim=10, output_dim=NUM_HIDDEN, init='uniform'),
        Tanh(),
        Linear(input_dim=NUM_HIDDEN, output_dim=4, init='uniform'),
        Sigmoid()
    ])
   
optimizer = Momentum(learning_rate=0.1, momentum=0.9)
loss = SSE()
   
with tqdm.trange(1000) as t:
    for epoch in t:
        epoch_loss = 0.0
        
        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)
            
            optimizer.step(net)
        
        accuracy = fizzbuzz_accuracy(101, 1024, net)
        t.set_description(f"fb loss: {epoch_loss:.2f} acc: {accuracy:.2f}")
    
# Now check results on the test set
print()
print("test results", fizzbuzz_accuracy(1, 101, net))
    

19.9 Softmax and Cross-Entropy

Softmax
def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract the largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]
   
        sum_of_exps = sum(exps)
        return [exp_i / sum_of_exps for exp_i in exps]
    else:
        return [softmax(tensor_i) for tensor_i in tensor]
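
A hedged check that softmax yields a probability distribution (subtracting the largest value only prevents overflow; it does not change the result):
# Hypothetical check of softmax
probs = softmax([1.0, 2.0, 3.0])
print(probs)                               # roughly [0.090, 0.245, 0.665]
print(sum(probs))                          # 1.0 up to floating-point error
print(softmax([[1.0, 2.0], [3.0, 4.0]]))   # applied to each row separately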
    

Cross-entropy
class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(lambda p, act: math.log(p + 1e-30) * act, probabilities, actual)

        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)
   
    def gradient(self, predicted, actual):
        probabilities = softmax(predicted)

        # Isn't this a pleasant equation?
        return tensor_combine(lambda p, actual: p - actual, probabilities, actual)
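
A small hedged example: when the one-hot target is the class the network already favors the loss is small, and when it is a low-probability class the loss is large.
# Hypothetical comparison of two targets for the same logits
loss_fn = SoftmaxCrossEntropy()
logits = [1.0, 2.0, 3.0]                    # softmax gives roughly [0.09, 0.24, 0.67]
print(loss_fn.loss(logits, [0, 0, 1]))      # about -log(0.67), roughly 0.41
print(loss_fn.loss(logits, [1, 0, 0]))      # about -log(0.09), roughly 2.41
print(loss_fn.gradient(logits, [0, 0, 1]))  # probabilities minus the one-hot target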
    

Benefits of cross-entropy (implementation)
import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math
   
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    """
    return not isinstance(tensor[0], list)
   
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
  
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
  
def zeros_like(tensor):
    return tensor_apply(lambda _: 0.0, tensor)
  
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
   
def random_uniform(*dims):
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]
  
def random_normal(*dims, mean = 0.0, variance = 1.0):
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]
  
def random_tensor(*dims, init = 'normal'):
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")
  
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sigmoid(Layer):
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
   
    def backward(self, gradient):
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)
  
class Linear(Layer):
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
   
        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)
   
        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)
   
    def forward(self, input):
        # Save the input to use in the backward pass.
        self.input = input
   
        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]
   
    def backward(self, gradient):
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
  
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]
  
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]
   
    def params(self):
        return [self.w, self.b]
   
    def grads(self):
        return [self.w_grad, self.b_grad]
  
class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers
  
    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input
  
    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient
  
    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())
   
    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())
  
class Loss:
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError
    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError
   
class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)
   
    def gradient(self, predicted, actual):
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)
  
class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        raise NotImplementedError
   
class GradientDescent(Optimizer):
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate
  
    def step(self, layer):
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)
   
class Momentum(Optimizer):
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average
   
    def step(self, layer):
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]
   
        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)
   
            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)
  
def tanh(x):
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:  return -1
    elif x > 100: return 1
  
    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)
  
class Tanh(Layer):
    def forward(self, input):
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh
  
    def backward(self, gradient):
        return tensor_combine(
            lambda tanh, grad: (1 - tanh ** 2) * grad,
            self.tanh,
            gradient)
  
class Relu(Layer):
    def forward(self, input):
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)
  
    def backward(self, gradient):
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)
   
def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract the largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]
   
        sum_of_exps = sum(exps)
        return [exp_i / sum_of_exps for exp_i in exps]
    else:
        return [softmax(tensor_i) for tensor_i in tensor]
   
class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(lambda p, act: math.log(p + 1e-30) * act, probabilities, actual)
   
        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)
   
    def gradient(self, predicted, actual):
        probabilities = softmax(predicted)
   
        # Isn't this a pleasant equation?
        return tensor_combine(lambda p, actual: p - actual, probabilities, actual)
    

Benefits of cross-entropy (training)
from neural_networks import binary_encode, fizz_buzz_encode, argmax
import tqdm
  
def fizzbuzz_accuracy(low, hi, net):
    num_correct = 0
    for n in range(low, hi):
        x = binary_encode(n)
        predicted = argmax(net.forward(x))
        actual = argmax(fizz_buzz_encode(n))
        if predicted == actual:
            num_correct += 1

    return num_correct / (hi - low)
   
xs = [binary_encode(n) for n in range(101, 1024)]
ys = [fizz_buzz_encode(n) for n in range(101, 1024)]
  
NUM_HIDDEN = 25
  
random.seed(0)
    
net = Sequential([
        Linear(input_dim=10, output_dim=NUM_HIDDEN, init='uniform'),
        Tanh(),
        Linear(input_dim=NUM_HIDDEN, output_dim=4, init='uniform')
        # No final sigmoid layer now
    ])
  
optimizer = Momentum(learning_rate=0.1, momentum=0.9)
loss = SoftmaxCrossEntropy()
   
with tqdm.trange(100) as t:
    for epoch in t:
        epoch_loss = 0.0
        
        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)
            
            optimizer.step(net)
        
        accuracy = fizzbuzz_accuracy(101, 1024, net)
        t.set_description(f"fb loss: {epoch_loss:.3f} acc: {accuracy:.2f}")
       
# Again check results on the test set
print()
print("test results", fizzbuzz_accuracy(1, 101, net))
    

19.10 Dropout

Dropout
class Dropout(Layer):
    def __init__(self, p):
        self.p = p
        self.train = True
   
    def forward(self, input):
        if self.train:
            # Create a mask of 0s and 1s shaped like the input
            # using the specified probability.
            self.mask = tensor_apply(lambda _: 0 if random.random() < self.p else 1, input)
            # Multiply by the mask to dropout inputs.
            return tensor_combine(operator.mul, input, self.mask)
        else:
            # During evaluation just scale down the outputs uniformly.
            return tensor_apply(lambda x: x * (1 - self.p), input)
   
    def backward(self, gradient):
        if self.train:
            # Only propagate the gradients where mask == 1
            return tensor_combine(operator.mul, gradient, self.mask)
        else:
            raise RuntimeError("don't call backward when not in train mode")
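
A hedged usage example (assuming random, operator and the tensor helpers are in scope), showing the difference between training and evaluation mode:
# Hypothetical usage of Dropout
random.seed(0)
dropout = Dropout(p=0.5)

dropout.train = True
print(dropout.forward([1.0, 2.0, 3.0, 4.0]))   # roughly half the inputs are zeroed at random

dropout.train = False
print(dropout.forward([1.0, 2.0, 3.0, 4.0]))   # every input scaled by (1 - p) = 0.5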
    

19.11 Example: MNIST

Install mnist in the same way as the "Installing matplotlib into the virtual environment labo2020" item on the 2020/10/21 support page. The command is "pip install mnist".
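
Once the package is installed, the data can be loaded roughly as follows (a hedged sketch using the mnist package's train_images/train_labels helpers; the dataset is downloaded on first use):
# Hypothetical sketch of loading MNIST with the pip-installed "mnist" package
import mnist

train_images = mnist.train_images()    # numpy array of shape (60000, 28, 28)
train_labels = mnist.train_labels()
test_images = mnist.test_images()
test_labels = mnist.test_labels()

print(train_images.shape, train_labels.shape)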

Implementation
import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math
   
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    """
    return not isinstance(tensor[0], list)
  
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)
    else:
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)
  
def tensor_apply(f, tensor):
    """Applies f elementwise"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]
  
def zeros_like(tensor):
    return tensor_apply(lambda _: 0.0, tensor)
  
def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]
  
def random_uniform(*dims):
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]
  
def random_normal(*dims, mean = 0.0, variance = 1.0):
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]
  
def random_tensor(*dims, init = 'normal'):
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")
  
class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError
  
    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError
   
    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()
  
    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()
   
class Sigmoid(Layer):
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids
   
    def backward(self, gradient):
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad, self.sigmoids, gradient)
  
class Linear(Layer):
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
   
        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)
   
        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)
   
    def forward(self, input):
        # Save the input to use in the backward pass.
        self.input = input
   
        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]
   
    def backward(self, gradient):
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient
  
        # Each w[o][i] multiplies input[i] and gets added to output[o].
        # So its gradient is input[i] * gradient[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]
  
        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]
   
    def params(self):
        return [self.w, self.b]
   
    def grads(self):
        return [self.w_grad, self.b_grad]
  
class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers
  
    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input
  
    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient
  

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())
   
    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())
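
# Illustrative usage (added): compose layers into a tiny network; each layer's
# output is fed directly to the next, and params() chains the layers' parameters.
_tiny_net = Sequential([
    Linear(input_dim=2, output_dim=2),
    Sigmoid(),
    Linear(input_dim=2, output_dim=1),
    Sigmoid()
])
assert len(_tiny_net.forward([0.0, 1.0])) == 1
assert len(list(_tiny_net.params())) == 4    # w and b from each of the two Linear layers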
  
class Loss:
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError
    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError
   
class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)
   
    def gradient(self, predicted, actual):
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)
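
# Illustrative check (added): with predicted [1, 2, 3] and actual [10, 20, 30]
# the errors are -9, -18, -27, so the loss is 81 + 324 + 729 and the gradient
# is 2 * (predicted - actual) elementwise.
_sse = SSE()
assert _sse.loss([1, 2, 3], [10, 20, 30]) == 9 ** 2 + 18 ** 2 + 27 ** 2
assert _sse.gradient([1, 2, 3], [10, 20, 30]) == [-18, -36, -54]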
  
class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        raise NotImplementedError
   
class GradientDescent(Optimizer):
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate
  
    def step(self, layer):
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)
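
# Illustrative check (added): step() modifies the layer's parameters in place,
# which is why it assigns through param[:] instead of rebinding the name.
_gd_layer = Linear(input_dim=2, output_dim=1)
_gd_layer.forward([1.0, -1.0])
_gd_layer.backward([1.0])                       # pretend the loss gradient is [1.0]
_w_before = [row[:] for row in _gd_layer.w]     # copy the current weights
GradientDescent(learning_rate=0.1).step(_gd_layer)
assert _gd_layer.w != _w_before                 # same list object, updated values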
  
class Momentum(Optimizer):
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average
   
    def step(self, layer):
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]
   
        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)
   
            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)
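
# Illustrative sketch (added): Momentum keeps an exponentially weighted running
# average of the gradients in self.updates and steps along that average instead
# of the raw gradient.
_mo_layer = Linear(input_dim=2, output_dim=1)
_mo_optimizer = Momentum(learning_rate=0.1, momentum=0.9)
_mo_layer.forward([1.0, 2.0])
_mo_layer.backward([1.0])
_mo_optimizer.step(_mo_layer)    # first step: the running average is 10% of the gradient
_mo_optimizer.step(_mo_layer)    # later steps blend the stored average with the new gradient
assert len(_mo_optimizer.updates) == 2    # one running average per parameter tensor (w and b)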
  
def tanh(x):
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:  return -1
    elif x > 100: return 1
  
    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)
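
# Illustrative check (added): this formulation should agree with math.tanh for
# ordinary inputs and saturate to +/-1 for huge |x| instead of overflowing.
assert all(abs(tanh(x) - math.tanh(x)) < 1e-9 for x in [-5.0, -1.0, 0.0, 1.0, 5.0])
assert tanh(1000) == 1 and tanh(-1000) == -1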
  
class Tanh(Layer):
    def forward(self, input):
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh
  
    def backward(self, gradient):
        return tensor_combine(
            lambda tanh, grad: (1 - tanh ** 2) * grad,
            self.tanh,
            gradient)
  
class Relu(Layer):
    def forward(self, input):
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)
  
    def backward(self, gradient):
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)
  
def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract the largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]
  
        sum_of_exps = sum(exps)
        return [exp_i / sum_of_exps for exp_i in exps]
    else:
        return [softmax(tensor_i) for tensor_i in tensor]
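
# Illustrative check (added): the outputs are positive and sum to 1 along the last
# dimension, and shifting every input by a constant does not change the result.
_probs = softmax([1.0, 2.0, 3.0])
assert abs(sum(_probs) - 1.0) < 1e-9 and all(p > 0 for p in _probs)
assert all(abs(a - b) < 1e-9 for a, b in zip(_probs, softmax([101.0, 102.0, 103.0])))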
  
class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(
            lambda p, act: math.log(p + 1e-30) * act,
            probabilities,
            actual)

        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)
  
    def gradient(self, predicted, actual):
        probabilities = softmax(predicted)
   
        # Isn't this a pleasant equation?
        return tensor_combine(
            lambda p, actual: p - actual,
            probabilities,
            actual)
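
# Illustrative check (added): a confidently correct prediction should cost much
# less than a confidently wrong one, and since softmax(predicted) and the one-hot
# label both sum to 1, the gradient sums to (approximately) 0.
_sce = SoftmaxCrossEntropy()
assert _sce.loss([10.0, 0.0, 0.0], [1, 0, 0]) < _sce.loss([10.0, 0.0, 0.0], [0, 1, 0])
assert abs(sum(_sce.gradient([10.0, 0.0, 0.0], [1, 0, 0]))) < 1e-9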
   
class Dropout(Layer):
    def __init__(self, p):
        self.p = p
        self.train = True
   
    def forward(self, input):
        if self.train:
            # Create a mask of 0s and 1s shaped like the input
            # using the specified probability.
            self.mask = tensor_apply(
                lambda _: 0 if random.random() < self.p else 1,
                input)
            # Multiply by the mask to dropout inputs.
            return tensor_combine(operator.mul, input, self.mask)
        else:
            # During evaluation just scale down the outputs uniformly.
            return tensor_apply(lambda x: x * (1 - self.p), input)
  
    def backward(self, gradient):
        if self.train:
            # Only propagate the gradients where mask == 1
            return tensor_combine(operator.mul, gradient, self.mask)
        else:
            raise RuntimeError("don't call backward when not in train mode")
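
# Illustrative check (added): in evaluation mode every input is just scaled by
# (1 - p); in train mode roughly a fraction p of the inputs is zeroed at random.
_dropout = Dropout(0.5)
_dropout.train = False
assert _dropout.forward([1.0, 1.0, 1.0, 1.0]) == [0.5, 0.5, 0.5, 0.5]
_dropout.train = True
_masked = _dropout.forward([1.0] * 1000)
assert 300 < sum(1 for x in _masked if x == 0) < 700   # about half, with overwhelming probability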
    

Loading the data
import mnist
  
def shape(tensor):
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        tensor = tensor[0]
    return sizes
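
# Illustrative example (added): shape walks down the first element at each level,
# so a list of 2 rows of 3 numbers has shape [2, 3].
assert shape([[1, 2, 3], [4, 5, 6]]) == [2, 3]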
   
train_images = mnist.train_images().tolist()
train_labels = mnist.train_labels().tolist()
   
print(shape(train_images))
print(shape(train_labels))
    

Plotting the first 100 training images
%matplotlib inline
import matplotlib.pyplot as plt
    
fig, ax = plt.subplots(10, 10)
    
for i in range(10):
    for j in range(10):
        # Plot each image in black and white and hide the axes.
        ax[i][j].imshow(train_images[10 * i + j], cmap='Greys')
        ax[i][j].xaxis.set_visible(False)
        ax[i][j].yaxis.set_visible(False)
    

Preprocessing
def one_hot_encode(i, num_labels = 10):
    return [1.0 if j == i else 0.0 for j in range(num_labels)]
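
# Illustrative examples (added): the label index gets a 1.0, everything else 0.0.
assert one_hot_encode(3) == [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
assert one_hot_encode(2, num_labels=5) == [0.0, 0.0, 1.0, 0.0, 0.0]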
  
test_images = mnist.test_images().tolist()
test_labels = mnist.test_labels().tolist()
  
# Compute the average pixel value
avg = tensor_sum(train_images) / 60000 / 28 / 28
    
# Recenter, rescale, and flatten
train_images = [[(pixel - avg) / 256 for row in image for pixel in row]
                for image in train_images]
test_images = [[(pixel - avg) / 256 for row in image for pixel in row]
               for image in test_images]
   
train_labels = [one_hot_encode(label) for label in train_labels]
test_labels = [one_hot_encode(label) for label in test_labels]
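
# Sanity checks (added), assuming the standard MNIST split of 60,000 training and
# 10,000 test images, each flattened to 28 * 28 = 784 pixels.
assert shape(train_images) == [60000, 784] and shape(test_images) == [10000, 784]
assert shape(train_labels) == [60000, 10] and shape(test_labels) == [10000, 10]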
    

Training
from neural_networks import argmax
import tqdm
    
def loop(model, images, labels, loss, optimizer = None):
    correct = 0         # Track number of correct predictions.
    total_loss = 0.0    # Track total loss.
    
    with tqdm.trange(len(images)) as t:
        for i in t:
            predicted = model.forward(images[i])             # Predict.
            if argmax(predicted) == argmax(labels[i]):       # Check for
                correct += 1                                 # correctness.
            total_loss += loss.loss(predicted, labels[i])    # Compute loss.
    
            # If we're training, backpropagate gradient and update weights.
            if optimizer is not None:
                gradient = loss.gradient(predicted, labels[i])
                model.backward(gradient)
                optimizer.step(model)
    
            # And update our metrics in the progress bar.
            avg_loss = total_loss / (i + 1)
            acc = correct / (i + 1)
            t.set_description(f"mnist loss: {avg_loss:.3f} acc: {acc:.3f}")
  
random.seed(0)
  
# Name them so we can turn train on and off
dropout1 = Dropout(0.1)
dropout2 = Dropout(0.1)
  
model = Sequential([
        Linear(784, 30),  # Hidden layer 1: size 30
        dropout1,
        Tanh(),
        Linear(30, 10),   # Hidden layer 2: size 10
        dropout2,
        Tanh(),
        Linear(10, 10)    # Output layer: size 10
    ])
  
optimizer = Momentum(learning_rate=0.01, momentum=0.99)
loss = SoftmaxCrossEntropy()
    
# Enable dropout and train (takes > 20 minutes on my laptop!)
dropout1.train = dropout2.train = True
loop(model, train_images, train_labels, loss, optimizer)
    
# Disable dropout and evaluate
dropout1.train = dropout2.train = False
loop(model, test_images, test_labels, loss)