# 専門ユニット2/山内研セミナー(2020/11/11)

## 使用する線形代数ライブラリ

linear_algebra.py
import math

def add(v, w):
    """Adds corresponding elements of two equal-length vectors."""
    # FIX: the `def add(v, w):` header was lost in extraction, leaving
    # these statements orphaned at module level; restored to match the
    # parallel `subtract` definition below.
    assert len(v) == len(w), "vectors must be the same length"
    return [v_i + w_i for v_i, w_i in zip(v, w)]

def subtract(v, w):
    """Subtracts corresponding elements"""
    assert len(v) == len(w), "vectors must be the same length"
    differences = []
    for left, right in zip(v, w):
        differences.append(left - right)
    return differences

def vector_sum(vectors):
    """Sums all corresponding elements"""
    # Check that vectors is not empty
    assert vectors, "no vectors provided!"

    # Check the vectors are all the same size.
    # BUG FIX: the element count comes from the first vector, not from the
    # number of vectors (`len(vectors)`); the original failed whenever the
    # number of vectors differed from their dimension.
    num_elements = len(vectors[0])
    assert all(len(v) == num_elements for v in vectors), "different sizes!"

    # the i-th element of the result is the sum of every vector[i]
    return [sum(vector[i] for vector in vectors) for i in range(num_elements)]

def scalar_multiply(c, v):
    """Multiplies every element by c"""
    scaled = []
    for component in v:
        scaled.append(c * component)
    return scaled

def vector_mean(vectors):
    """Computes the element-wise average"""
    count = len(vectors)
    totals = vector_sum(vectors)
    return scalar_multiply(1 / count, totals)

def dot(v, w):
    """Computes v_1 * w_1 + ... + v_n * w_n"""
    assert len(v) == len(w), "vectors must be same length"

    total = 0
    for a, b in zip(v, w):
        total += a * b
    return total

def sum_of_squares(v):
    """Returns v_1 * v_1 + ... + v_n * v_n"""
    # Inlined: dot(v, v) is just the sum of squared components.
    return sum(x * x for x in v)

def magnitude(v):
    """Returns the magnitude (or length) of v"""
    # Inlined sum_of_squares: sqrt of the sum of squared components.
    return math.sqrt(sum(x * x for x in v))

def squared_distance(v, w):
    """Computes (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2"""
    # Inlined subtract + sum_of_squares; the length check (from subtract)
    # is preserved.
    assert len(v) == len(w), "vectors must be the same length"
    return sum((a - b) ** 2 for a, b in zip(v, w))

def distance(v, w):
    """Computes the distance between v and w"""
    # Inlined squared_distance; the length check is preserved.
    assert len(v) == len(w), "vectors must be the same length"
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v, w)))

def shape(A):
    """Returns (# of rows of A, # of columns of A)"""
    num_rows = len(A)
    # BUG FIX: the column count is the length of the first ROW, not the
    # number of rows (`len(A)`); the inline comment already said
    # "number of elements in first row".
    num_cols = len(A[0]) if A else 0   # number of elements in first row
    return num_rows, num_cols

def get_row(A, i):
    """Returns the i-th row of A (as a Vector)"""
    row = A[i]
    return row

def get_column(A, j):
    """Returns the j-th column of A (as a Vector)"""
    column = []
    for row in A:
        column.append(row[j])
    return column

def make_matrix(num_rows, num_cols, entry_fn):
    """
    Returns a num_rows x num_cols matrix
    whose (i,j)-th entry is entry_fn(i, j)
    """
    matrix = []
    for i in range(num_rows):
        row = [entry_fn(i, j) for j in range(num_cols)]
        matrix.append(row)
    return matrix

def identity_matrix(n):
    """Returns the n x n identity matrix"""
    # Inlined make_matrix: 1 on the diagonal, 0 elsewhere.
    return [[1 if i == j else 0 for j in range(n)] for i in range(n)]


## 使用する機械学習ライブラリ

machine_learning.py
import random

def split_data(data, prob):
    """Split data into fractions [prob, 1 - prob]"""
    # Shuffle a copy so the caller's list is left untouched.
    shuffled = data[:]
    random.shuffle(shuffled)
    split_point = int(len(shuffled) * prob)
    return shuffled[:split_point], shuffled[split_point:]

def train_test_split(xs, ys, test_pct):
    """Split paired data (xs, ys) into x_train, x_test, y_train, y_test."""
    # Split the index list so xs and ys stay aligned.
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    x_train = [xs[i] for i in train_idxs]
    x_test = [xs[i] for i in test_idxs]
    y_train = [ys[i] for i in train_idxs]
    y_test = [ys[i] for i in test_idxs]
    return (x_train, x_test, y_train, y_test)


## 12.1 モデル

k個の近傍データから多数決で結果を求める
from collections import Counter

def raw_majority_vote(labels):
    """Returns the single most common label (ties broken arbitrarily)."""
    # FIX: the function body was lost in extraction, leaving `winner`
    # undefined; reconstructed as a Counter-based plurality vote.
    votes = Counter(labels)
    winner, _ = votes.most_common(1)[0]
    return winner

# Demo: 'b' appears twice, so it wins the raw vote.
print(raw_majority_vote(['a', 'b', 'c', 'b']))


k個の近傍データから多数決で結果を求める(同数一位を勘案)
from collections import Counter

def majority_vote(labels):
    """Assumes that labels are ordered from nearest to farthest."""
    vote_counts = Counter(labels)
    # BUG FIX: most_common(1) returns a LIST of (label, count) pairs,
    # so index [0] is required before unpacking.
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count
                       for count in vote_counts.values()
                       if count == winner_count])

    if num_winners == 1:
        return winner                     # unique winner, so return it
    else:
        return majority_vote(labels[:-1]) # try again without the farthest

# Demo: 'a' and 'b' tie at 2 votes; the farthest label ('a') is dropped,
# so 'b' wins on the retry.
print(majority_vote(['a', 'b', 'c', 'b', 'a']))


k近傍による分類器
from linear_algebra import distance

class LabeledPoint:
    """A data point (a vector of measurements) together with its class label."""

    def __init__(self, point, label):
        self.point = point  # the feature vector
        self.label = label  # the class label, e.g. "virginica"

    def __repr__(self):
        # Added for debuggability: without it, printed points are opaque
        # `<__main__.LabeledPoint object at 0x...>` strings.
        return f"LabeledPoint(point={self.point!r}, label={self.label!r})"

def knn_classify(k, labeled_points, new_point):
    """Predicts a label for new_point by majority vote of its k nearest neighbors."""
    # Sort every labeled point by how close it is to new_point.
    nearest_first = sorted(labeled_points,
                           key=lambda lp: distance(lp.point, new_point))

    # Take the labels of the k closest points...
    nearest_labels = [lp.label for lp in nearest_first[:k]]

    # ...and let them vote.
    return majority_vote(nearest_labels)


## 12.2 事例：Irisデータセット

データセットのダウンロード
import requests

def get_iris_data(proxy=False):
    """Download the UCI Iris dataset and save it to 'iris.data'.

    If proxy is True, route the request through the campus proxy.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    if proxy:
        proxies = {"http": "http://ccproxyz.kanagawa-it.ac.jp:10080",
                   "https": "http://ccproxyz.kanagawa-it.ac.jp:10080"}
        data = requests.get(url, proxies=proxies)
    else:
        data = requests.get(url)

    # FIX: fail loudly on an HTTP error instead of silently writing an
    # error page to disk for the parser to choke on later.
    data.raise_for_status()

    with open('iris.data', 'w') as f:
        f.write(data.text)

# Fetch the dataset through the campus proxy (set proxy=False off-campus).
get_iris_data(proxy=True)


データの変換
import csv
from collections import defaultdict

def parse_iris_row(row):
    """
    sepal_length, sepal_width, petal_length, petal_width, class
    """
    # The last column is the class name; everything before it is numeric.
    *raw_measurements, raw_label = row
    measurements = [float(value) for value in raw_measurements]
    # class is e.g. "Iris-virginica"; we just want "virginica"
    label = raw_label.split("-")[-1]

    return LabeledPoint(measurements, label)

with open('iris.data') as f:
    # BUG FIX: `reader` was never defined; build a csv reader over the file.
    reader = csv.reader(f)
    # Skip blank rows — the UCI file ends with an empty line, which would
    # otherwise crash float('') inside parse_iris_row.
    iris_data = [parse_iris_row(row) for row in reader if row]

# Group the raw point vectors by species label, for plotting.
points_by_species = defaultdict(list)
for iris in iris_data:
    points_by_species[iris.label].append(iris.point)


グラフの描画
%matplotlib inline
from matplotlib import pyplot as plt

# Draw a 2x3 grid of scatterplots: one subplot per pair of the four
# measurements, with one marker per species.
# (Indentation reconstructed — it was flattened in this copy.)
metrics = ['sepal length', 'sepal width', 'petal length', 'petal width']
pairs = [(i, j) for i in range(4) for j in range(4) if i < j]
marks = ['+', '.', 'x']  # we have 3 classes, so 3 markers

fig, ax = plt.subplots(2, 3)

for row in range(2):
    for col in range(3):
        # Pick which measurement pair this subplot shows.
        i, j = pairs[3 * row + col]
        ax[row][col].set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8)
        ax[row][col].set_xticks([])
        ax[row][col].set_yticks([])

        # One scatter series per species, each with its own marker.
        for mark, (species, points) in zip(marks, points_by_species.items()):
            xs = [point[i] for point in points]
            ys = [point[j] for point in points]
            ax[row][col].scatter(xs, ys, marker=mark, label=species)

# One shared legend, placed in the last subplot.
ax[-1][-1].legend(loc='lower right', prop={'size': 6})


テスト用データと学習用データへの分割
import random
from machine_learning import split_data

# Split 70% train / 30% test; the fixed seed makes the split reproducible.
random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)


# Evaluate 5-NN on the held-out test set, counting each
# (predicted, actual) pair. (Indentation reconstructed — it was
# flattened in this copy.)
confusion_matrix = defaultdict(int)
num_correct = 0

for iris in iris_test:
    predicted = knn_classify(5, iris_train, iris.point)
    actual = iris.label

    if predicted == actual:
        num_correct += 1

    confusion_matrix[(predicted, actual)] += 1

# Overall accuracy plus the full confusion matrix.
pct_correct = num_correct / len(iris_test)
print(pct_correct, confusion_matrix)


## 12.3 次元の呪い

2020/10/21のサポートページの「仮想環境labo2020へのmatplotlibのインストール」の項目と同じように、tqdmをインストールしてください。 コマンドは「pip install tqdm」となります。

import random
from linear_algebra import distance

def random_point(dim):
    """Returns a point with `dim` coordinates drawn uniformly from [0, 1)."""
    coords = []
    for _ in range(dim):
        coords.append(random.random())
    return coords

def random_distances(dim, num_pairs):
    """Returns num_pairs distances between random pairs of dim-dimensional points."""
    dists = []
    for _ in range(num_pairs):
        dists.append(distance(random_point(dim), random_point(dim)))
    return dists


1次元から100次元までランダムな2点間の平均距離と最短距離
import tqdm

# For each dimensionality 1..100, sample 10,000 random point pairs and
# record the average and minimum pairwise distance. (Indentation
# reconstructed — it was flattened in this copy.)
dimensions = range(1, 101)

avg_distances = []
min_distances = []

random.seed(0)
for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"):
    distances = random_distances(dim, 10000)      # 10,000 random pairs
    avg_distances.append(sum(distances) / 10000)  # track the average
    min_distances.append(min(distances))          # track the minimum


グラフの描画(1)
%matplotlib inline
from matplotlib import pyplot as plt

# Plot how average and minimum pairwise distances grow with dimension.
plt.plot(dimensions, avg_distances, label='average distance')
plt.plot(dimensions, min_distances, label='minimum distance')
plt.legend()
plt.xlabel("# of dimensions")
plt.title("10,000 Random Distances")


グラフの描画(2)
%matplotlib inline
from matplotlib import pyplot as plt

# Plot the min/avg distance ratio: as dimension grows, the closest pair
# is barely closer than the average pair.
min_avg_ratio = [min_dist / avg_dist for min_dist, avg_dist in zip(min_distances, avg_distances)]

plt.plot(dimensions, min_avg_ratio)
plt.xlabel("# of dimensions")
plt.title("Minimum Distance / Average Distance")