# 専門ユニット2/山内研セミナー(2020/11/18)

## 使用する線形代数ライブラリ

linear_algebra.py
import math

def add(v, w):
    """
    Adds corresponding elements.

    v and w must have the same length; the result is a new list whose
    i-th entry is v[i] + w[i].
    """
    assert len(v) == len(w), "vectors must be the same length"
    return [v_i + w_i for v_i, w_i in zip(v, w)]

def subtract(v, w):
    """
    Subtracts corresponding elements.

    v and w must have the same length; the result is a new list whose
    i-th entry is v[i] - w[i].
    """
    assert len(v) == len(w), "vectors must be the same length"
    return [v_i - w_i for v_i, w_i in zip(v, w)]

def vector_sum(vectors):
    """
    Sums all corresponding elements.

    vectors is a non-empty list of equal-length vectors; the result is a
    single vector whose i-th entry is the sum of every vector[i].
    """
    # Check that vectors is not empty
    assert vectors, "no vectors provided!"

    # Check the vectors are all the same size. The reference size is the
    # length of the first vector, not the number of vectors.
    num_elements = len(vectors[0])
    assert all(len(v) == num_elements for v in vectors), "different sizes!"

    # the i-th element of the result is the sum of every vector[i]
    return [sum(vector[i] for vector in vectors) for i in range(num_elements)]

def scalar_multiply(c, v):
    """Multiplies every element of the vector v by the scalar c"""
    return [c * v_i for v_i in v]

def vector_mean(vectors):
    """
    Computes the element-wise average.

    Sums the vectors with vector_sum and scales the result by
    1 / len(vectors).
    """
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))

def dot(v, w):
    """
    Computes v_1 * w_1 + ... + v_n * w_n.

    v and w must have the same length.
    """
    assert len(v) == len(w), "vectors must be same length"

    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def sum_of_squares(v):
    """Returns v_1 * v_1 + ... + v_n * v_n (the dot product of v with itself)"""
    return dot(v, v)

def magnitude(v):
    """Returns the magnitude (or length) of v: the square root of its sum of squares"""
    return math.sqrt(sum_of_squares(v))

def squared_distance(v, w):
    """Computes (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2"""
    return sum_of_squares(subtract(v, w))

def distance(v, w):
    """Computes the distance between v and w (square root of squared_distance)"""
    return math.sqrt(squared_distance(v, w))

def shape(A):
    """
    Returns (# of rows of A, # of columns of A).

    A is a list of rows; an empty matrix has shape (0, 0).
    """
    num_rows = len(A)
    # The column count is the length of the first row, not of A itself.
    num_cols = len(A[0]) if A else 0   # number of elements in first row
    return num_rows, num_cols

def get_row(A, i):
    """Returns the i-th row of A (as a Vector)"""
    return A[i]

def get_column(A, j):
    """Returns the j-th column of A (as a Vector), taking element j of every row"""
    return [A_i[j] for A_i in A]

def make_matrix(num_rows, num_cols, entry_fn):
    """
    Returns a num_rows x num_cols matrix
    whose (i,j)-th entry is entry_fn(i, j)
    """
    return [[entry_fn(i, j) for j in range(num_cols)] for i in range(num_rows)]

def identity_matrix(n):
    """Returns the n x n identity matrix (1 on the diagonal, 0 elsewhere)"""
    return make_matrix(n, n, lambda i, j: 1 if i == j else 0)


## 使用する確率ライブラリ

probability.py
import math

def normal_cdf(x, mu = 0, sigma = 1):
    """
    Cumulative distribution function of the normal distribution with
    mean mu and standard deviation sigma, evaluated at x via math.erf.
    """
    return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2

def inverse_normal_cdf(p, mu = 0, sigma = 1, tolerance = 0.00001):
    """
    Find approximate inverse using binary search.

    Returns a z such that normal_cdf(z, mu, sigma) is approximately p.
    The search runs over the standard normal on [-10, 10] and stops once
    the bracket is narrower than tolerance.
    """

    # if not standard, compute standard and rescale
    if mu != 0 or sigma != 1:
        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)

    low_z = -10.0                      # normal_cdf(-10) is (very close to) 0
    hi_z  =  10.0                      # normal_cdf(10)  is (very close to) 1
    # Start from the bracket midpoint so a tolerance wider than the whole
    # bracket still returns a value instead of raising UnboundLocalError.
    mid_z = (low_z + hi_z) / 2
    while hi_z - low_z > tolerance:
        mid_z = (low_z + hi_z) / 2     # Consider the midpoint
        mid_p = normal_cdf(mid_z)      # and the cdf's value there
        if mid_p < p:
            low_z = mid_z              # Midpoint too low, search above it
        else:
            hi_z = mid_z               # Midpoint too high, search below it

    return mid_z


## 使用するニューラルネットワークライブラリ

neural_networks.py
import math

def sigmoid(t):
    """
    Logistic function 1 / (1 + e^-t).

    Evaluated in a form that never exponentiates a large positive number,
    so very negative t returns (essentially) 0 instead of raising
    OverflowError from math.exp.
    """
    if t >= 0:
        return 1 / (1 + math.exp(-t))
    # For negative t use the algebraically equal e^t / (1 + e^t).
    exp_t = math.exp(t)
    return exp_t / (1 + exp_t)

def binary_encode(x):
    """
    Returns the 10 lowest binary digits of x, least significant first.
    """
    binary = []

    for i in range(10):
        binary.append(x % 2)
        x = x // 2

    return binary

def fizz_buzz_encode(x):
    """
    One-hot encodes the Fizz Buzz class of x as a 4-element list.

    Position 0: divisible by neither 3 nor 5; position 1: by 3 only;
    position 2: by 5 only; position 3: by 15.
    """
    if x % 15 == 0:
        return [0, 0, 0, 1]
    elif x % 5 == 0:
        return [0, 0, 1, 0]
    elif x % 3 == 0:
        return [0, 1, 0, 0]
    else:
        return [1, 0, 0, 0]

def argmax(xs):
    """Returns the index of the largest value (the first one on ties)"""
    return max(range(len(xs)), key=lambda i: xs[i])


## 19.1 テンソル

テンソルの形状
def shape(tensor):
    """
    Returns the size of each dimension of a nested-list tensor,
    outermost dimension first.
    """
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        if not tensor:
            break                # an empty list has no first element to descend into
        # Descend into the first element; without this step the loop
        # would examine the same list forever.
        tensor = tensor[0]
    return sizes

print(shape([1, 2, 3]))
print(shape([[1, 2], [3, 4], [5, 6]]))


1次元のテンソルか判定
def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    An empty list is treated as a vector.
    """
    # Inspect the first element: the tensor itself is always a list, so
    # testing the tensor would report every vector as higher-order.
    return not tensor or not isinstance(tensor[0], list)

print(is_1d([1, 2, 3]))
print(is_1d([[1, 2], [3, 4]]))


テンソル中の値を合計
def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)  # a plain vector: use Python's sum
    else:
        # Recurse into each sub-tensor and add up the partial sums.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

print(tensor_sum([1, 2, 3]))
print(tensor_sum([[1, 2], [3, 4]]))


テンソル中の値に関数を適用
def tensor_apply(f, tensor):
    """Applies f elementwise and returns a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        # Recurse into each sub-tensor so the nesting is preserved.
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

print(tensor_apply(lambda x: x + 1, [1, 2, 3]))
print(tensor_apply(lambda x: 2 * x, [[1, 2], [3, 4]]))


def zeros_like(tensor):
    """Returns a tensor with the same nesting as tensor, filled with 0.0"""
    return tensor_apply(lambda _: 0.0, tensor)

print(zeros_like([1, 2, 3]))
print(zeros_like([[1, 2], [3, 4]]))


2つのテンソルの対応する要素に、関数を適用する
import operator

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

print(tensor_combine(operator.add, [1, 2, 3], [4, 5, 6]))
print(tensor_combine(operator.mul, [1, 2, 3], [4, 5, 6]))


## 19.2 層の抽象化

import operator
from neural_networks import sigmoid

def tensor_apply(f, tensor):
    """Applies f elementwise and returns a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        # Recurse into each sub-tensor so the nesting is preserved.
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sigmoid(Layer):
    """Layer that applies the sigmoid function to every element of its input."""
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids

    def backward(self, gradient):
        """
        Scale the incoming gradient elementwise by the derivative of the
        sigmoid, sig * (1 - sig), using the outputs saved by forward().
        """
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad,
                              self.sigmoids,
                              gradient)



## 19.3 線形層

def shape(tensor):
    """
    Returns the size of each dimension of a nested-list tensor,
    outermost dimension first.
    """
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        if not tensor:
            break                # an empty list has no first element to descend into
        # Descend into the first element; without this step the loop
        # would examine the same list forever.
        tensor = tensor[0]
    return sizes

import random
from probability import inverse_normal_cdf

def random_uniform(*dims):
    """
    Returns a nested-list tensor of shape dims filled with independent
    draws from random.random().
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]

def random_normal(*dims, mean = 0.0, variance = 1.0):
    """
    Returns a nested-list tensor of shape dims filled with normal draws.

    Each value is mean + variance * z, where z is obtained by passing a
    uniform random number through inverse_normal_cdf.
    NOTE: despite its name, `variance` scales z linearly, i.e. it acts
    as a standard deviation.
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]

def random_tensor(*dims, init = 'normal'):
    """
    Returns a random tensor of shape dims using the named scheme.

    init is 'normal' (random_normal defaults), 'uniform' (random_uniform)
    or 'xavier' (random_normal with variance = len(dims) / sum(dims)).
    Raises ValueError for any other name.
    """
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")

# Sanity check: print the shapes of freshly generated random tensors.
# The intent is that each printed shape matches the requested dims,
# i.e. [2, 3, 4] and [5, 6].
print(shape(random_uniform(2, 3, 4)))
print(shape(random_normal(5, 6, mean=10)))


from linear_algebra import dot

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Linear(Layer):
    """Fully connected layer computing output[o] = dot(input, w[o]) + b[o]."""
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)

        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)

    def forward(self, input):
        """Returns one output per neuron and remembers the input for backward()."""
        # Save the input to use in the backward pass.
        self.input = input

        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]

    def backward(self, gradient):
        """
        Stores the gradients for w and b given the gradient with respect
        to the outputs, and returns the gradient with respect to the input.
        """
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient

        # Each w[o][i] multiplies input[i] and gets added to output[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]

        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]

    def params(self):
        """Returns [w, b]."""
        return [self.w, self.b]

    def grads(self):
        """Returns [w_grad, b_grad], in the same order as params()."""
        return [self.w_grad, self.b_grad]



## 19.4 層のシーケンスで表現するニューラルネットワーク

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        """Stores the list of layers, in forward order."""
        self.layers = layers

    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())

    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())


## 19.5 損失の最適化

import operator

def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)  # a plain vector: use Python's sum
    else:
        # Recurse into each sub-tensor and add up the partial sums.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

class Loss:
    """Interface for loss functions: a loss value and its gradient."""
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        """Returns the sum over all elements of (predicted - actual) ** 2."""
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)

        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted, actual):
        """Returns the elementwise derivative 2 * (predicted - actual)."""
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)


class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        """Performs one update of the layer's parameters; subclasses must override."""
        raise NotImplementedError

class GradientDescent(Optimizer):
    """Plain gradient descent: moves each parameter against its gradient."""
    def __init__(self, learning_rate = 0.1):
        """learning_rate is the step size applied to every gradient."""
        self.lr = learning_rate

    def step(self, layer):
        """Updates every parameter tensor of layer in place."""
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step. Slice assignment keeps
            # the layer's own list object, so the layer sees the new values.
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)

class Momentum(Optimizer):
    """Gradient descent that steps along a running average of past gradients."""
    def __init__(self, learning_rate, momentum = 0.9):
        """momentum is the weight kept from the previous running average."""
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average

    def step(self, layer):
        """Updates the running averages and then every parameter, in place."""
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]

        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)

            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)


## 19.6 事例：修正版XOR

import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math

def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    An empty list is treated as a vector.
    """
    # Inspect the first element: the tensor itself is always a list, so
    # testing the tensor would report every vector as higher-order.
    return not tensor or not isinstance(tensor[0], list)

def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)  # a plain vector: use Python's sum
    else:
        # Recurse into each sub-tensor and add up the partial sums.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

def tensor_apply(f, tensor):
    """Applies f elementwise and returns a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        # Recurse into each sub-tensor so the nesting is preserved.
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

def random_uniform(*dims):
    """
    Returns a nested-list tensor of shape dims filled with independent
    draws from random.random().
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]

def random_normal(*dims, mean = 0.0, variance = 1.0):
    """
    Returns a nested-list tensor of shape dims filled with normal draws.

    Each value is mean + variance * z, where z is obtained by passing a
    uniform random number through inverse_normal_cdf.
    NOTE: despite its name, `variance` scales z linearly, i.e. it acts
    as a standard deviation.
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]

def random_tensor(*dims, init = 'normal'):
    """
    Returns a random tensor of shape dims using the named scheme.

    init is 'normal' (random_normal defaults), 'uniform' (random_uniform)
    or 'xavier' (random_normal with variance = len(dims) / sum(dims)).
    Raises ValueError for any other name.
    """
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sigmoid(Layer):
    """Layer that applies the sigmoid function to every element of its input."""
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids

    def backward(self, gradient):
        """
        Scale the incoming gradient elementwise by the derivative of the
        sigmoid, sig * (1 - sig), using the outputs saved by forward().
        """
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad,
                              self.sigmoids,
                              gradient)

class Linear(Layer):
    """Fully connected layer computing output[o] = dot(input, w[o]) + b[o]."""
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)

        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)

    def forward(self, input):
        """Returns one output per neuron and remembers the input for backward()."""
        # Save the input to use in the backward pass.
        self.input = input

        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]

    def backward(self, gradient):
        """
        Stores the gradients for w and b given the gradient with respect
        to the outputs, and returns the gradient with respect to the input.
        """
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient

        # Each w[o][i] multiplies input[i] and gets added to output[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]

        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]

    def params(self):
        """Returns [w, b]."""
        return [self.w, self.b]

    def grads(self):
        """Returns [w_grad, b_grad], in the same order as params()."""
        return [self.w_grad, self.b_grad]

class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        """Stores the list of layers, in forward order."""
        self.layers = layers

    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())

    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())

class Loss:
    """Interface for loss functions: a loss value and its gradient."""
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        """Returns the sum over all elements of (predicted - actual) ** 2."""
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted, actual):
        """Returns the elementwise derivative 2 * (predicted - actual)."""
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)

class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        """Performs one update of the layer's parameters; subclasses must override."""
        raise NotImplementedError

class GradientDescent(Optimizer):
    """Plain gradient descent: moves each parameter against its gradient."""
    def __init__(self, learning_rate = 0.1):
        """learning_rate is the step size applied to every gradient."""
        self.lr = learning_rate

    def step(self, layer):
        """Updates every parameter tensor of layer in place."""
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step. Slice assignment keeps
            # the layer's own list object, so the layer sees the new values.
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)


random.seed(0)

# Training data: the four XOR input pairs and their expected outputs.
xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
ys = [[0.], [1.], [1.], [0.]]

# Two inputs -> two hidden sigmoid units -> one linear output.
net = Sequential([
    Linear(input_dim=2, output_dim=2),
    Sigmoid(),
    Linear(input_dim=2, output_dim=1)
])

import tqdm

optimizer = GradientDescent(learning_rate=0.1)
loss = SSE()

with tqdm.trange(3000) as t:
    for epoch in t:
        epoch_loss = 0.0

        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)

            # Backpropagate the loss gradient so every layer has fresh
            # gradients before the optimizer reads them.
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)

            optimizer.step(net)

        t.set_description(f"xor loss {epoch_loss:.3f}")

print()

# Show the learned parameters.
for param in net.params():
    print(param)


# Show the network's output for each training input.
for x in xs:
    predicted = net.forward(x)
    print('input={0} output={1}'.format(x, predicted))


## 19.7 その他の活性化関数

その他の活性化関数
import math

def tanh(x):
    """Hyperbolic tangent of x, clamped to -1 / 1 when |x| exceeds 100."""
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:
        return -1
    elif x > 100:
        return 1

    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)

class Tanh(Layer):
    """Layer that applies tanh to every element of its input."""
    def forward(self, input):
        """Applies tanh elementwise and keeps the outputs for backward()."""
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh

    def backward(self, gradient):
        """
        Scales the incoming gradient elementwise by the derivative of
        tanh, 1 - tanh ** 2, using the outputs saved by forward().
        """
        return tensor_combine(
            lambda out, grad: (1 - out ** 2) * grad,
            self.tanh,
            gradient)

class Relu(Layer):
    """Layer that applies max(x, 0) to every element of its input."""
    def forward(self, input):
        """Applies max(x, 0) elementwise and keeps the input for backward()."""
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)

    def backward(self, gradient):
        """
        Passes the gradient through where the saved input was positive
        and returns 0 elsewhere.
        """
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)


## 19.8 事例：修正版 Fizz Buzz

import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math

def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    An empty list is treated as a vector.
    """
    # Inspect the first element: the tensor itself is always a list, so
    # testing the tensor would report every vector as higher-order.
    return not tensor or not isinstance(tensor[0], list)

def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)  # a plain vector: use Python's sum
    else:
        # Recurse into each sub-tensor and add up the partial sums.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

def tensor_apply(f, tensor):
    """Applies f elementwise and returns a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        # Recurse into each sub-tensor so the nesting is preserved.
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

def zeros_like(tensor):
    """Returns a tensor with the same nesting as tensor, filled with 0.0"""
    return tensor_apply(lambda _: 0.0, tensor)

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

def random_uniform(*dims):
    """
    Returns a nested-list tensor of shape dims filled with independent
    draws from random.random().
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]

def random_normal(*dims, mean = 0.0, variance = 1.0):
    """
    Returns a nested-list tensor of shape dims filled with normal draws.

    Each value is mean + variance * z, where z is obtained by passing a
    uniform random number through inverse_normal_cdf.
    NOTE: despite its name, `variance` scales z linearly, i.e. it acts
    as a standard deviation.
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]

def random_tensor(*dims, init = 'normal'):
    """
    Returns a random tensor of shape dims using the named scheme.

    init is 'normal' (random_normal defaults), 'uniform' (random_uniform)
    or 'xavier' (random_normal with variance = len(dims) / sum(dims)).
    Raises ValueError for any other name.
    """
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sigmoid(Layer):
    """Layer that applies the sigmoid function to every element of its input."""
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids

    def backward(self, gradient):
        """
        Scale the incoming gradient elementwise by the derivative of the
        sigmoid, sig * (1 - sig), using the outputs saved by forward().
        """
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad,
                              self.sigmoids,
                              gradient)

class Linear(Layer):
    """Fully connected layer computing output[o] = dot(input, w[o]) + b[o]."""
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)

        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)

    def forward(self, input):
        """Returns one output per neuron and remembers the input for backward()."""
        # Save the input to use in the backward pass.
        self.input = input

        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]

    def backward(self, gradient):
        """
        Stores the gradients for w and b given the gradient with respect
        to the outputs, and returns the gradient with respect to the input.
        """
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient

        # Each w[o][i] multiplies input[i] and gets added to output[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]

        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]

    def params(self):
        """Returns [w, b]."""
        return [self.w, self.b]

    def grads(self):
        """Returns [w_grad, b_grad], in the same order as params()."""
        return [self.w_grad, self.b_grad]

class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        """Stores the list of layers, in forward order."""
        self.layers = layers

    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())

    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())

class Loss:
    """Interface for loss functions: a loss value and its gradient."""
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        """Returns the sum over all elements of (predicted - actual) ** 2."""
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted, actual):
        """Returns the elementwise derivative 2 * (predicted - actual)."""
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)

class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        """Performs one update of the layer's parameters; subclasses must override."""
        raise NotImplementedError

class GradientDescent(Optimizer):
    """Plain gradient descent: moves each parameter against its gradient."""
    def __init__(self, learning_rate = 0.1):
        """learning_rate is the step size applied to every gradient."""
        self.lr = learning_rate

    def step(self, layer):
        """Updates every parameter tensor of layer in place."""
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step. Slice assignment keeps
            # the layer's own list object, so the layer sees the new values.
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)

class Momentum(Optimizer):
    """Gradient descent that steps along a running average of past gradients."""
    def __init__(self, learning_rate, momentum = 0.9):
        """momentum is the weight kept from the previous running average."""
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average

    def step(self, layer):
        """Updates the running averages and then every parameter, in place."""
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]

        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)

            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)

def tanh(x):
    """Hyperbolic tangent of x, clamped to -1 / 1 when |x| exceeds 100."""
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:
        return -1
    elif x > 100:
        return 1

    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)

class Tanh(Layer):
    """Layer that applies tanh to every element of its input."""
    def forward(self, input):
        """Applies tanh elementwise and keeps the outputs for backward()."""
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh

    def backward(self, gradient):
        """
        Scales the incoming gradient elementwise by the derivative of
        tanh, 1 - tanh ** 2, using the outputs saved by forward().
        """
        return tensor_combine(
            lambda out, grad: (1 - out ** 2) * grad,
            self.tanh,
            gradient)

class Relu(Layer):
    """Layer that applies max(x, 0) to every element of its input."""
    def forward(self, input):
        """Applies max(x, 0) elementwise and keeps the input for backward()."""
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)

    def backward(self, gradient):
        """
        Passes the gradient through where the saved input was positive
        and returns 0 elsewhere.
        """
        return tensor_combine(lambda x, grad: grad if x > 0 else 0,
                              self.input,
                              gradient)


from neural_networks import binary_encode, fizz_buzz_encode, argmax
import tqdm

def fizzbuzz_accuracy(low, hi, net):
    """
    Returns the fraction of integers n in range(low, hi) for which the
    argmax of net.forward(binary_encode(n)) equals the argmax of
    fizz_buzz_encode(n).
    """
    num_correct = 0
    for n in range(low, hi):
        x = binary_encode(n)
        predicted = argmax(net.forward(x))
        actual = argmax(fizz_buzz_encode(n))
        if predicted == actual:
            num_correct += 1

    return num_correct / (hi - low)

# Train on 101..1023 so that 1..100 can serve as the test set below.
xs = [binary_encode(n) for n in range(101, 1024)]
ys = [fizz_buzz_encode(n) for n in range(101, 1024)]

NUM_HIDDEN = 25

random.seed(0)

net = Sequential([
    Linear(input_dim=10, output_dim=NUM_HIDDEN, init='uniform'),
    Tanh(),
    Linear(input_dim=NUM_HIDDEN, output_dim=4, init='uniform'),
    Sigmoid()
])

optimizer = Momentum(learning_rate=0.1, momentum=0.9)
loss = SSE()

with tqdm.trange(1000) as t:
    for epoch in t:
        epoch_loss = 0.0

        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)

            # Backpropagate the loss gradient so every layer has fresh
            # gradients before the optimizer reads them.
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)

            optimizer.step(net)

        accuracy = fizzbuzz_accuracy(101, 1024, net)
        t.set_description(f"fb loss: {epoch_loss:.2f} acc: {accuracy:.2f}")

# Now check results on the test set
print()
print("test results", fizzbuzz_accuracy(1, 101, net))


## 19.9 ソフトマックスと交差エントロピー

ソフトマックス
def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]

        sum_of_exps = sum(exps)                 # This is the total "weight".
        return [exp_i / sum_of_exps             # Probability is the fraction
                for exp_i in exps]              # of the total weight.
    else:
        # Apply softmax independently to each sub-tensor.
        return [softmax(tensor_i) for tensor_i in tensor]


class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        """Returns minus the sum of actual * log(softmax(predicted))."""
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(lambda p, act: math.log(p + 1e-30) * act, probabilities, actual)

        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)

    def gradient(self, predicted, actual):
        """Returns softmax(predicted) - actual, elementwise."""
        probabilities = softmax(predicted)

        # Isn't this a pleasant equation?
        return tensor_combine(lambda p, actual: p - actual, probabilities, actual)


import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math

def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    An empty list is treated as a vector.
    """
    # Inspect the first element: the tensor itself is always a list, so
    # testing the tensor would report every vector as higher-order.
    return not tensor or not isinstance(tensor[0], list)

def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        return sum(tensor)  # a plain vector: use Python's sum
    else:
        # Recurse into each sub-tensor and add up the partial sums.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

def tensor_apply(f, tensor):
    """Applies f elementwise and returns a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        # Recurse into each sub-tensor so the nesting is preserved.
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

def zeros_like(tensor):
    """Returns a tensor with the same nesting as tensor, filled with 0.0"""
    return tensor_apply(lambda _: 0.0, tensor)

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        # Walk both tensors in lockstep, one sub-tensor pair at a time.
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

def random_uniform(*dims):
    """
    Returns a nested-list tensor of shape dims filled with independent
    draws from random.random().
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]

def random_normal(*dims, mean = 0.0, variance = 1.0):
    """
    Returns a nested-list tensor of shape dims filled with normal draws.

    Each value is mean + variance * z, where z is obtained by passing a
    uniform random number through inverse_normal_cdf.
    NOTE: despite its name, `variance` scales z linearly, i.e. it acts
    as a standard deviation.
    """
    # dims is a tuple, so each level must use its first entry as the
    # length; range(dims) would raise TypeError.
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance) for _ in range(dims[0])]

def random_tensor(*dims, init = 'normal'):
    """
    Returns a random tensor of shape dims using the named scheme.

    init is 'normal' (random_normal defaults), 'uniform' (random_uniform)
    or 'xavier' (random_normal with variance = len(dims) / sum(dims)).
    Raises ValueError for any other name.
    """
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sigmoid(Layer):
    """Layer that applies the sigmoid function to every element of its input."""
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids

    def backward(self, gradient):
        """
        Scale the incoming gradient elementwise by the derivative of the
        sigmoid, sig * (1 - sig), using the outputs saved by forward().
        """
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad,
                              self.sigmoids,
                              gradient)

class Linear(Layer):
    """Fully connected layer computing output[o] = dot(input, w[o]) + b[o]."""
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)

        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)

    def forward(self, input):
        """Returns one output per neuron and remembers the input for backward()."""
        # Save the input to use in the backward pass.
        self.input = input

        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]

    def backward(self, gradient):
        """
        Stores the gradients for w and b given the gradient with respect
        to the outputs, and returns the gradient with respect to the input.
        """
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient

        # Each w[o][i] multiplies input[i] and gets added to output[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)] for o in range(self.output_dim)]

        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim)) for i in range(self.input_dim)]

    def params(self):
        """Returns [w, b]."""
        return [self.w, self.b]

    def grads(self):
        """Returns [w_grad, b_grad], in the same order as params()."""
        return [self.w_grad, self.b_grad]

class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers

    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())

    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())

class Loss:
    """Interface for loss functions; subclasses implement both methods."""
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        """Return the sum over all elements of (predicted - actual) ** 2."""
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted, actual):
        """Return the elementwise derivative 2 * (predicted - actual)."""
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)

class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        """Apply one update to `layer`; subclasses must override this."""
        raise NotImplementedError

class GradientDescent(Optimizer):
    """Plain gradient descent: param <- param - learning_rate * grad."""
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate

    def step(self, layer):
        """Update every parameter tensor of `layer` in place."""
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step.  Slice assignment keeps
            # the same list object, so the layer sees the new values.
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)

class Momentum(Optimizer):
    """Gradient descent that steps along a running average of gradients."""
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average

    def step(self, layer):
        """Refresh the running averages and update `layer`'s params in place."""
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]

        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)

            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)

def tanh(x):
    """
    Return the hyperbolic tangent of x as a float.

    Inputs below -100 or above 100 are clamped to -1.0 / 1.0 without
    evaluating the exponential.
    """
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:
        return -1.0
    if x > 100:
        return 1.0

    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)

class Tanh(Layer):
    """Layer that applies tanh elementwise."""
    def forward(self, input):
        """Apply tanh to every element and remember the result."""
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh

    def backward(self, gradient):
        """Propagate the gradient using d(tanh)/dx = 1 - tanh ** 2."""
        return tensor_combine(
            lambda t, grad: (1 - t ** 2) * grad,
            self.tanh,
            gradient)

class Relu(Layer):
    """Layer that applies max(x, 0) elementwise."""
    def forward(self, input):
        """Clamp negative elements to 0 and remember the raw input."""
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)

    def backward(self, gradient):
        """Pass the gradient through where the saved input was positive, else 0."""
        return tensor_combine(
            lambda x, grad: grad if x > 0 else 0,
            self.input,
            gradient)

def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]

        sum_of_exps = sum(exps)
        return [exp_i / sum_of_exps for exp_i in exps]
    else:
        # Higher-order tensor: apply softmax to each sub-tensor.
        return [softmax(tensor_i) for tensor_i in tensor]

class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        """Return -sum(actual * log(softmax(predicted)))."""
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(lambda p, act: math.log(p + 1e-30) * act,
                                     probabilities,
                                     actual)

        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)

    def gradient(self, predicted, actual):
        """Return softmax(predicted) - actual, elementwise."""
        probabilities = softmax(predicted)

        # Isn't this a pleasant equation?
        return tensor_combine(lambda p, actual: p - actual, probabilities, actual)


from neural_networks import binary_encode, fizz_buzz_encode, argmax
import tqdm

def fizzbuzz_accuracy(low, hi, net):
    """
    Return the fraction of n in range(low, hi) for which the network's
    most likely class (argmax of net.forward) matches the argmax of
    fizz_buzz_encode(n).
    """
    num_correct = 0
    for n in range(low, hi):
        x = binary_encode(n)
        predicted = argmax(net.forward(x))
        actual = argmax(fizz_buzz_encode(n))
        if predicted == actual:
            num_correct += 1

    return num_correct / (hi - low)

# Training data: inputs and targets for every n in 101..1023.
xs = list(map(binary_encode, range(101, 1024)))
ys = list(map(fizz_buzz_encode, range(101, 1024)))

NUM_HIDDEN = 25

# Seed before building the network so the random weights are reproducible.
random.seed(0)

net = Sequential([
    Linear(10, NUM_HIDDEN, init='uniform'),
    Tanh(),
    Linear(NUM_HIDDEN, 4, init='uniform'),
    # No final sigmoid layer now
])

optimizer = Momentum(learning_rate=0.1, momentum=0.9)
loss = SoftmaxCrossEntropy()

with tqdm.trange(100) as t:
    for epoch in t:
        epoch_loss = 0.0

        for x, y in zip(xs, ys):
            predicted = net.forward(x)
            epoch_loss += loss.loss(predicted, y)

            # Backpropagate so the layers hold fresh gradients
            # before the optimizer reads them.
            gradient = loss.gradient(predicted, y)
            net.backward(gradient)

            optimizer.step(net)

        # Report progress on the training range after each epoch.
        accuracy = fizzbuzz_accuracy(101, 1024, net)
        t.set_description(f"fb loss: {epoch_loss:.3f} acc: {accuracy:.2f}")

# Again check results on the test set
print()
print("test results", fizzbuzz_accuracy(1, 101, net))


## 19.10 ドロップアウト

ドロップアウト
class Dropout(Layer):
    """
    Layer that zeroes each input element with probability p while
    self.train is True, and scales inputs by (1 - p) otherwise.
    """
    def __init__(self, p):
        self.p = p
        self.train = True

    def forward(self, input):
        if self.train:
            # Create a mask of 0s and 1s shaped like the input
            # using the specified probability.
            self.mask = tensor_apply(lambda _: 0 if random.random() < self.p else 1, input)
            # Multiply by the mask to dropout inputs.
            return tensor_combine(lambda x, m: x * m, input, self.mask)
        else:
            # During evaluation just scale down the outputs uniformly.
            return tensor_apply(lambda x: x * (1 - self.p), input)

    def backward(self, gradient):
        if self.train:
            # Only propagate the gradients where mask == 1.
            return tensor_combine(lambda g, m: g * m, gradient, self.mask)
        else:
            raise RuntimeError("don't call backward when not in train mode")


## 19.11 事例：MNIST

2020/10/21のサポートページの「仮想環境labo2020へのmatplotlibのインストール」の項目と同じように、mnistをインストールしてください。 コマンドは「pip install mnist」となります。

import operator
from linear_algebra import dot
import random
from probability import inverse_normal_cdf
from neural_networks import sigmoid
import math

def is_1d(tensor):
    """
    If tensor[0] is a list, it's a higher-order tensor.
    Otherwise, tensor is 1-dimensional (that is, a vector).
    An empty tensor is treated as 1-dimensional.
    """
    # Inspect the first element, not the tensor itself: every tensor
    # is a list, so testing `tensor` would always report "not 1-d".
    return not tensor or not isinstance(tensor[0], list)

def tensor_sum(tensor):
    """Sums up all the values in the tensor"""
    if is_1d(tensor):
        # Just a flat list of numbers, so the builtin sum suffices.
        return sum(tensor)
    else:
        # Sum each sub-tensor recursively, then add those results.
        return sum(tensor_sum(tensor_i) for tensor_i in tensor)

def tensor_apply(f, tensor):
    """Applies f elementwise, returning a new tensor of the same shape"""
    if is_1d(tensor):
        return [f(x) for x in tensor]
    else:
        return [tensor_apply(f, tensor_i) for tensor_i in tensor]

def zeros_like(tensor):
    """Returns a tensor shaped like `tensor` with every element 0.0"""
    return tensor_apply(lambda _: 0.0, tensor)

def tensor_combine(f, t1, t2):
    """Applies f to corresponding elements of t1 and t2"""
    if is_1d(t1):
        return [f(x, y) for x, y in zip(t1, t2)]
    else:
        return [tensor_combine(f, t1_i, t2_i) for t1_i, t2_i in zip(t1, t2)]

def random_uniform(*dims):
    """
    Return a tensor of shape `dims` filled with random.random() draws.

    dims[0] is the size of the outermost dimension; the remaining
    entries describe each sub-tensor.
    """
    if len(dims) == 1:
        return [random.random() for _ in range(dims[0])]
    else:
        return [random_uniform(*dims[1:]) for _ in range(dims[0])]

def random_normal(*dims, mean = 0.0, variance = 1.0):
    """
    Return a tensor of shape `dims` whose elements are
    mean + variance * inverse_normal_cdf(random.random()).

    NOTE(review): `variance` is applied as a direct multiplier of the
    draw, i.e. it acts as a scale factor rather than being square-rooted
    first. Behaviour is kept as-is so existing results do not change.
    """
    if len(dims) == 1:
        return [mean + variance * inverse_normal_cdf(random.random())
                for _ in range(dims[0])]
    else:
        return [random_normal(*dims[1:], mean=mean, variance=variance)
                for _ in range(dims[0])]

def random_tensor(*dims, init = 'normal'):
    """
    Create a randomly initialised tensor with the given dimensions.

    `init` selects the scheme:
      * 'normal'  -> random_normal(*dims)
      * 'uniform' -> random_uniform(*dims)
      * 'xavier'  -> random_normal with variance = len(dims) / sum(dims)

    Raises ValueError for any other value of `init`.
    """
    if init == 'normal':
        return random_normal(*dims)
    elif init == 'uniform':
        return random_uniform(*dims)
    elif init == 'xavier':
        variance = len(dims) / sum(dims)
        return random_normal(*dims, variance=variance)
    else:
        raise ValueError(f"unknown init: {init}")

class Layer:
    """
    Our neural networks will be composed of Layers, each of which
    knows how to do some computation on its inputs in the "forward"
    direction and propagate gradients in the "backward" direction.
    """
    def forward(self, input):
        """
        Note the lack of types. We're not going to be prescriptive
        about what kinds of inputs layers can take and what kinds
        of outputs they can return.
        """
        raise NotImplementedError

    def backward(self, gradient):
        """
        Similarly, we're not going to be prescriptive about what the
        gradient looks like. It's up to you the user to make sure
        that you're doing things sensibly.
        """
        raise NotImplementedError

    def params(self):
        """
        Returns the parameters of this layer. The default implementation
        returns nothing, so that if you have a layer with no parameters
        you don't have to implement this.
        """
        return ()

    def grads(self):
        """
        Returns the gradients, in the same order as params()
        """
        return ()

class Sigmoid(Layer):
    """Layer that applies the sigmoid function elementwise."""
    def forward(self, input):
        """
        Apply sigmoid to each element of the input tensor,
        and save the results to use in backpropagation.
        """
        self.sigmoids = tensor_apply(sigmoid, input)
        return self.sigmoids

    def backward(self, gradient):
        """
        Propagate the gradient using d(sigmoid)/dx = sig * (1 - sig),
        evaluated on the outputs saved by the last forward() call.
        """
        return tensor_combine(lambda sig, grad: sig * (1 - sig) * grad,
                              self.sigmoids,
                              gradient)

class Linear(Layer):
    """Fully connected layer: output[o] = dot(input, w[o]) + b[o]."""
    def __init__(self, input_dim, output_dim, init = 'xavier'):
        """
        A layer of output_dim neurons, each with input_dim weights
        (and a bias).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        # self.w[o] is the weights for the o-th neuron
        self.w = random_tensor(output_dim, input_dim, init=init)

        # self.b[o] is the bias term for the o-th neuron
        self.b = random_tensor(output_dim, init=init)

    def forward(self, input):
        """Compute the neuron outputs for a single input vector."""
        # Save the input to use in the backward pass.
        self.input = input

        # Return the vector of neuron outputs.
        return [dot(input, self.w[o]) + self.b[o] for o in range(self.output_dim)]

    def backward(self, gradient):
        """
        Store the parameter gradients (w_grad, b_grad) for the most recent
        forward() input and return the gradient with respect to that input.
        """
        # Each b[o] gets added to output[o], which means
        # the gradient of b is the same as the output gradient.
        self.b_grad = gradient

        # Each w[o][i] multiplies input[i] and gets added to output[o].
        self.w_grad = [[self.input[i] * gradient[o] for i in range(self.input_dim)]
                       for o in range(self.output_dim)]

        # Each input[i] multiplies every w[o][i] and gets added to every
        # output[o]. So its gradient is the sum of w[o][i] * gradient[o]
        # across all the outputs.
        return [sum(self.w[o][i] * gradient[o] for o in range(self.output_dim))
                for i in range(self.input_dim)]

    def params(self):
        """Return [weights, biases]."""
        return [self.w, self.b]

    def grads(self):
        """Return [weight gradients, bias gradients], matching params()."""
        return [self.w_grad, self.b_grad]

class Sequential(Layer):
    """
    A layer consisting of a sequence of other layers.
    It's up to you to make sure that the output of each layer
    makes sense as the input to the next layer.
    """
    def __init__(self, layers):
        self.layers = layers

    def forward(self, input):
        """Just forward the input through the layers in order."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, gradient):
        """Just backpropagate the gradient through the layers in reverse."""
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)
        return gradient

    def params(self):
        """Just return the params from each layer."""
        return (param for layer in self.layers for param in layer.params())

    def grads(self):
        """Just return the grads from each layer."""
        return (grad for layer in self.layers for grad in layer.grads())

class Loss:
    """Interface for loss functions; subclasses implement both methods."""
    def loss(self, predicted, actual):
        """How good are our predictions? (Larger numbers are worse.)"""
        raise NotImplementedError

    def gradient(self, predicted, actual):
        """How does the loss change as the predictions change?"""
        raise NotImplementedError

class SSE(Loss):
    """Loss function that computes the sum of the squared errors."""
    def loss(self, predicted, actual):
        """Return the sum over all elements of (predicted - actual) ** 2."""
        # Compute the tensor of squared differences
        squared_errors = tensor_combine(
            lambda predicted, actual: (predicted - actual) ** 2,
            predicted,
            actual)
        # And just add them up
        return tensor_sum(squared_errors)

    def gradient(self, predicted, actual):
        """Return the elementwise derivative 2 * (predicted - actual)."""
        return tensor_combine(
            lambda predicted, actual: 2 * (predicted - actual),
            predicted,
            actual)

class Optimizer:
    """
    An optimizer updates the weights of a layer (in place) using information
    known by either the layer or the optimizer (or by both).
    """
    def step(self, layer):
        """Apply one update to `layer`; subclasses must override this."""
        raise NotImplementedError

class GradientDescent(Optimizer):
    """Plain gradient descent: param <- param - learning_rate * grad."""
    def __init__(self, learning_rate = 0.1):
        self.lr = learning_rate

    def step(self, layer):
        """Update every parameter tensor of `layer` in place."""
        for param, grad in zip(layer.params(), layer.grads()):
            # Update param using a gradient step.  Slice assignment keeps
            # the same list object, so the layer sees the new values.
            param[:] = tensor_combine(
                lambda param, grad: param - grad * self.lr,
                param,
                grad)

class Momentum(Optimizer):
    """Gradient descent that steps along a running average of gradients."""
    def __init__(self, learning_rate, momentum = 0.9):
        self.lr = learning_rate
        self.mo = momentum
        self.updates = []  # running average

    def step(self, layer):
        """Refresh the running averages and update `layer`'s params in place."""
        # If we have no previous updates, start with all zeros.
        if not self.updates:
            self.updates = [zeros_like(grad) for grad in layer.grads()]

        for update, param, grad in zip(self.updates, layer.params(), layer.grads()):
            # Apply momentum
            update[:] = tensor_combine(
                lambda u, g: self.mo * u + (1 - self.mo) * g,
                update,
                grad)

            # Then take a gradient step
            param[:] = tensor_combine(
                lambda p, u: p - self.lr * u,
                param,
                update)

def tanh(x):
    """
    Return the hyperbolic tangent of x as a float.

    Inputs below -100 or above 100 are clamped to -1.0 / 1.0 without
    evaluating the exponential.
    """
    # If x is very large or very small, tanh is (essentially) 1 or -1.
    # We check for this because e.g. math.exp(1000) raises an error.
    if x < -100:
        return -1.0
    if x > 100:
        return 1.0

    em2x = math.exp(-2 * x)
    return (1 - em2x) / (1 + em2x)

class Tanh(Layer):
    """Layer that applies tanh elementwise."""
    def forward(self, input):
        """Apply tanh to every element and remember the result."""
        # Save tanh output to use in backward pass.
        self.tanh = tensor_apply(tanh, input)
        return self.tanh

    def backward(self, gradient):
        """Propagate the gradient using d(tanh)/dx = 1 - tanh ** 2."""
        return tensor_combine(
            lambda t, grad: (1 - t ** 2) * grad,
            self.tanh,
            gradient)

class Relu(Layer):
    """Layer that applies max(x, 0) elementwise."""
    def forward(self, input):
        """Clamp negative elements to 0 and remember the raw input."""
        self.input = input
        return tensor_apply(lambda x: max(x, 0), input)

    def backward(self, gradient):
        """Pass the gradient through where the saved input was positive, else 0."""
        return tensor_combine(
            lambda x, grad: grad if x > 0 else 0,
            self.input,
            gradient)

def softmax(tensor):
    """Softmax along the last dimension"""
    if is_1d(tensor):
        # Subtract largest value for numerical stability.
        largest = max(tensor)
        exps = [math.exp(x - largest) for x in tensor]

        sum_of_exps = sum(exps)
        return [exp_i / sum_of_exps for exp_i in exps]
    else:
        # Higher-order tensor: apply softmax to each sub-tensor.
        return [softmax(tensor_i) for tensor_i in tensor]

class SoftmaxCrossEntropy(Loss):
    """
    This is the negative-log-likelihood of the observed values, given the
    neural net model. So if we choose weights to minimize it, our model will
    be maximizing the likelihood of the observed data.
    """
    def loss(self, predicted, actual):
        """Return -sum(actual * log(softmax(predicted)))."""
        # Apply softmax to get probabilities
        probabilities = softmax(predicted)

        # This will be log p_i for the actual class i and 0 for the other
        # classes. We add a tiny amount to p to avoid taking log(0).
        likelihoods = tensor_combine(lambda p, act: math.log(p + 1e-30) * act,
                                     probabilities,
                                     actual)

        # And then we just sum up the negatives.
        return -tensor_sum(likelihoods)

    def gradient(self, predicted, actual):
        """Return softmax(predicted) - actual, elementwise."""
        probabilities = softmax(predicted)

        # Isn't this a pleasant equation?
        return tensor_combine(lambda p, actual: p - actual, probabilities, actual)

class Dropout(Layer):
    """
    Layer that zeroes each input element with probability p while
    self.train is True, and scales inputs by (1 - p) otherwise.
    """
    def __init__(self, p):
        self.p = p
        self.train = True

    def forward(self, input):
        if self.train:
            # Create a mask of 0s and 1s shaped like the input
            # using the specified probability.
            self.mask = tensor_apply(lambda _: 0 if random.random() < self.p else 1, input)
            # Multiply by the mask to dropout inputs.
            return tensor_combine(operator.mul, input, self.mask)
        else:
            # During evaluation just scale down the outputs uniformly.
            return tensor_apply(lambda x: x * (1 - self.p), input)

    def backward(self, gradient):
        if self.train:
            # Only propagate the gradients where mask == 1.
            return tensor_combine(operator.mul, gradient, self.mask)
        else:
            raise RuntimeError("don't call backward when not in train mode")


データのロード
import mnist

def shape(tensor):
    """
    Return the size of each dimension of a nested-list tensor,
    outermost first, e.g. [[1, 2], [3, 4], [5, 6]] -> [3, 2].

    Only the first element at each level is inspected, so the tensor is
    assumed to be rectangular. A non-list argument yields [].
    """
    sizes = []
    while isinstance(tensor, list):
        sizes.append(len(tensor))
        if not tensor:
            # Empty dimension: nothing further to descend into.
            break
        # Descend one level; without this step the loop never terminates.
        tensor = tensor[0]
    return sizes

# Load the MNIST training images and labels and convert them with
# .tolist() (assumes the mnist loaders return array-like objects that
# expose .tolist() — TODO confirm against the mnist package).
train_images = mnist.train_images().tolist()
train_labels = mnist.train_labels().tolist()

# Print the dimensions of each tensor as reported by shape().
print(shape(train_images))
print(shape(train_labels))


%matplotlib inline
import matplotlib.pyplot as plt

# Show the first 100 training images in a 10 x 10 grid.
fig, ax = plt.subplots(10, 10)

for i in range(10):
    for j in range(10):
        # Plot each image in black and white and hide the axes.
        ax[i][j].imshow(train_images[10 * i + j], cmap='Greys')
        ax[i][j].xaxis.set_visible(False)
        ax[i][j].yaxis.set_visible(False)


def one_hot_encode(i, num_labels = 10):
    """
    Return a list of `num_labels` floats that is 1.0 at index i and
    0.0 everywhere else. If i is outside range(num_labels) the result
    is all zeros.
    """
    return [1.0 if j == i else 0.0 for j in range(num_labels)]

test_images = mnist.test_images().tolist()
test_labels = mnist.test_labels().tolist()

# Compute the average pixel value. The divisors are taken from the data
# itself (images x rows x columns) instead of being hard-coded.
num_images = len(train_images)
num_rows = len(train_images[0])
num_cols = len(train_images[0][0])
avg = tensor_sum(train_images) / num_images / num_rows / num_cols

# Recenter, rescale, and flatten
train_images = [[(pixel - avg) / 256 for row in image for pixel in row]
                for image in train_images]
test_images = [[(pixel - avg) / 256 for row in image for pixel in row]
               for image in test_images]

# Replace each integer label with its one-hot vector.
train_labels = [one_hot_encode(label) for label in train_labels]
test_labels = [one_hot_encode(label) for label in test_labels]


from neural_networks import argmax
import tqdm

def loop(model, images, labels, loss, optimizer = None):
    """
    Run `model` over every (image, label) pair once, showing running
    average loss and accuracy in a tqdm progress bar.

    When `optimizer` is given, each example is also backpropagated and
    followed by an optimizer step (training). With optimizer=None the
    model is only evaluated.
    """
    correct = 0         # Track number of correct predictions.
    total_loss = 0.0    # Track total loss.

    with tqdm.trange(len(images)) as t:
        for i in t:
            predicted = model.forward(images[i])             # Predict.
            if argmax(predicted) == argmax(labels[i]):       # Check for
                correct += 1                                 # correctness.
            total_loss += loss.loss(predicted, labels[i])    # Compute loss.

            # If we're training, backpropagate gradient and update weights.
            if optimizer is not None:
                gradient = loss.gradient(predicted, labels[i])
                model.backward(gradient)
                optimizer.step(model)

            # And update our metrics in the progress bar.
            avg_loss = total_loss / (i + 1)
            acc = correct / (i + 1)
            t.set_description(f"mnist loss: {avg_loss:.3f} acc: {acc:.3f}")

# Seed first so the randomly initialised weights are reproducible.
random.seed(0)

# Keep references to the dropout layers so train mode can be toggled.
dropout1 = Dropout(0.1)
dropout2 = Dropout(0.1)

model = Sequential([
    Linear(784, 30),  # Hidden layer 1: size 30
    dropout1,
    Tanh(),
    Linear(30, 10),   # Hidden layer 2: size 10
    dropout2,
    Tanh(),
    Linear(10, 10),   # Output layer: size 10
])

optimizer = Momentum(learning_rate=0.01, momentum=0.99)
loss = SoftmaxCrossEntropy()

# Training pass with dropout switched on (takes > 20 minutes on my laptop!)
dropout1.train = True
dropout2.train = True
loop(model, train_images, train_labels, loss, optimizer)

# Evaluation pass on the test set with dropout switched off
dropout1.train = False
dropout2.train = False
loop(model, test_images, test_labels, loss)