専門ユニット2/山内研セミナー(2020/11/11)

関連サイトと資料

使用する線形代数ライブラリ

linear_algebra.py
import math
   
def add(v, w):
    """Return the element-wise sum of two equal-length vectors.

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be the same length"
    totals = []
    for left, right in zip(v, w):
        totals.append(left + right)
    return totals
   
def subtract(v, w):
    """Return the element-wise difference v - w of two equal-length vectors.

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be the same length"
    differences = []
    for left, right in zip(v, w):
        differences.append(left - right)
    return differences
  
def vector_sum(vectors):
    """Return the vector whose i-th element is the sum of every vector's i-th element.

    Raises AssertionError when no vectors are given or when their
    lengths differ.
    """
    # An empty collection has no well-defined element count.
    assert vectors, "no vectors provided!"

    # Every vector must match the length of the first one.
    num_elements = len(vectors[0])
    assert all(len(v) == num_elements for v in vectors), "different sizes!"

    # zip(*vectors) walks the vectors column by column, so each column
    # holds the i-th element of every vector, in input order.
    return [sum(column) for column in zip(*vectors)]
  
def scalar_multiply(c, v):
    """Return a new list holding each element of v multiplied by the scalar c."""
    scaled = []
    for element in v:
        scaled.append(c * element)
    return scaled
  
def vector_mean(vectors):
    """Return the element-wise average of the given vectors.

    The element-wise sum is scaled by 1 / len(vectors).
    """
    weight = 1 / len(vectors)
    totals = vector_sum(vectors)
    return scalar_multiply(weight, totals)
  
def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n.

    Raises AssertionError when the two vectors differ in length.
    """
    assert len(v) == len(w), "vectors must be same length"

    products = [left * right for left, right in zip(v, w)]
    return sum(products)
  
def sum_of_squares(v):
    """Return v_1 ** 2 + ... + v_n ** 2, computed as the dot product of v with itself."""
    squares_total = dot(v, v)
    return squares_total
  
def magnitude(v):
    """Return the Euclidean length of v: the square root of its sum of squares."""
    squared_length = sum_of_squares(v)
    return math.sqrt(squared_length)
   
def squared_distance(v, w):
    """Return (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2."""
    difference = subtract(v, w)
    return sum_of_squares(difference)
   
def distance(v, w):
    """Return the Euclidean distance between v and w."""
    squared = squared_distance(v, w)
    return math.sqrt(squared)
   
def shape(A):
    """Return (number of rows, number of columns) of the matrix A.

    The column count is taken from the first row; an empty matrix has
    zero columns.
    """
    if not A:
        return len(A), 0
    return len(A), len(A[0])
  
def get_row(A, i):
    """Return row i of A; this is the row object itself, not a copy."""
    row = A[i]
    return row
  
def get_column(A, j):
    """Return column j of A as a new list, collected from the top row down."""
    column = []
    for row in A:
        column.append(row[j])
    return column
   
def make_matrix(num_rows, num_cols, entry_fn):
    """
    Build a num_rows x num_cols matrix (a list of row lists)
    whose entry at (i, j) is entry_fn(i, j).

    entry_fn is called row by row, left to right within each row.
    """
    matrix = []
    for i in range(num_rows):
        row = []
        for j in range(num_cols):
            row.append(entry_fn(i, j))
        matrix.append(row)
    return matrix
   
def identity_matrix(n):
    """Return the n x n identity matrix: 1 on the diagonal, 0 everywhere else."""
    def diagonal_indicator(i, j):
        # 1 exactly where the row index equals the column index.
        return 1 if i == j else 0

    return make_matrix(n, n, diagonal_indicator)
    

使用する機械学習ライブラリ

machine_learning.py
import random
   
def split_data(data, prob):
    """Randomly split data into two lists holding fractions [prob, 1 - prob].

    The input is copied before shuffling, so the caller's sequence is
    left untouched. The first part holds int(len(data) * prob) items.
    """
    shuffled = data[:]                 # shallow copy; shuffle works in place
    random.shuffle(shuffled)
    boundary = int(len(shuffled) * prob)
    first_part = shuffled[:boundary]
    second_part = shuffled[boundary:]
    return first_part, second_part
   
def train_test_split(xs, ys, test_pct):
    """Split paired xs and ys into train and test parts using the same random indices.

    Returns the tuple (x_train, x_test, y_train, y_test); the train
    part is the 1 - test_pct fraction produced by split_data.
    """
    # Split index positions rather than the values so that xs and ys
    # stay aligned with each other.
    positions = list(range(len(xs)))
    train_idxs, test_idxs = split_data(positions, 1 - test_pct)

    x_train = [xs[i] for i in train_idxs]
    x_test = [xs[i] for i in test_idxs]
    y_train = [ys[i] for i in train_idxs]
    y_test = [ys[i] for i in test_idxs]
    return (x_train, x_test, y_train, y_test)
    

12.1 モデル

k個の近傍データから多数決で結果を求める
from collections import Counter
  
def raw_majority_vote(labels):
    """Return the most common label, with no special handling of ties.

    Raises IndexError when labels is empty.
    """
    top_entries = Counter(labels).most_common(1)
    # Each entry is a (label, count) pair; only the label is needed.
    return top_entries[0][0]
   
# Demo: 'b' appears twice and every other label once, so this prints b.
print(raw_majority_vote(['a', 'b', 'c', 'b']))
    

k個の近傍データから多数決で結果を求める(同数一位を勘案)
from collections import Counter
   
def majority_vote(labels):
    """Return the most common label, breaking ties by dropping the farthest labels.

    Assumes that labels are ordered from nearest to farthest. While
    several labels share the highest count, the last (farthest) label
    is discarded and the vote is repeated, until one label leads alone.

    This is done iteratively: the counts are updated in place instead
    of recursing on a fresh copy of the list, so a long run of ties
    cannot exceed the interpreter's recursion limit.

    Raises IndexError when labels is empty. The input is not modified.
    """
    remaining = list(labels)           # private copy; the caller's data stays intact
    vote_counts = Counter(remaining)

    while True:
        winner, winner_count = vote_counts.most_common(1)[0]
        num_winners = list(vote_counts.values()).count(winner_count)

        if num_winners == 1:
            return winner              # unique winner, so return it

        # Tie: discard the farthest label and vote again.
        farthest = remaining.pop()
        vote_counts[farthest] -= 1
        if vote_counts[farthest] == 0:
            # Drop exhausted labels so they cannot take part in later ties.
            del vote_counts[farthest]
   
# Demo: 'a' and 'b' tie with two votes each, so the last label ('a') is
# dropped and 'b' wins; this prints b.
print(majority_vote(['a', 'b', 'c', 'b', 'a']))
    

k近傍による分類器
from linear_algebra import distance
  
class LabeledPoint:
    """A data point stored together with its label."""

    def __init__(self, point, label):
        # point: the coordinates (passed to distance() by knn_classify)
        # label: the value that knn_classify votes on
        self.point, self.label = point, label
  
def knn_classify(k, labeled_points, new_point):
    """Predict the label of new_point by a majority vote among its k nearest labeled points."""

    def distance_to_new_point(labeled_point):
        return distance(labeled_point.point, new_point)

    # Nearest first; majority_vote relies on this ordering to break ties.
    nearest_first = sorted(labeled_points, key=distance_to_new_point)

    # Labels of the k closest points, still ordered nearest to farthest.
    k_nearest_labels = [neighbor.label for neighbor in nearest_first[:k]]

    return majority_vote(k_nearest_labels)
    

12.2 事例:Irisデータセット

データセットのダウンロード
import requests
   
def get_iris_data(proxy=False):
    """Download the Iris dataset and save it as 'iris.data' in the current directory.

    Args:
        proxy: when true, route the request through the hard-coded
            HTTP proxy; otherwise connect directly.

    Raises:
        requests.HTTPError: the server answered with an error status.
            Nothing is written in that case, so an error page never
            ends up in 'iris.data'.
        requests.RequestException: connection problems or a timeout.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

    proxies = None
    if proxy:
        proxy_url = "http://ccproxyz.kanagawa-it.ac.jp:10080"
        proxies = {"http": proxy_url, "https": proxy_url}

    # Without a timeout the call could block forever on a dead connection.
    data = requests.get(url, proxies=proxies, timeout=30)
    data.raise_for_status()

    with open('iris.data', 'w') as f:
        f.write(data.text)
  
# Download through the proxy; this writes 'iris.data' into the current directory.
get_iris_data(proxy=True)
    

データの変換
import csv
from collections import defaultdict
   
def parse_iris_row(row):
    """
    Convert one CSV row into a LabeledPoint.

    Expected column order:
    sepal_length, sepal_width, petal_length, petal_width, class
    """
    # Every column except the last is a numeric measurement.
    measurements = []
    for value in row[:-1]:
        measurements.append(float(value))

    # The class looks like "Iris-virginica"; keep only the text after the last "-".
    label = row[-1].rpartition("-")[2]

    return LabeledPoint(measurements, label)
  
# Parse every data row of the downloaded file into a LabeledPoint.
# newline='' is what the csv module recommends for files handed to csv.reader.
with open('iris.data', newline='') as f:
    reader = csv.reader(f)
    # csv.reader yields an empty list for a blank line (the downloaded file
    # may end with one); parse_iris_row would fail on it, so skip such rows.
    iris_data = [parse_iris_row(row) for row in reader if row]

# Group the measurement vectors by species so each species can be plotted separately.
points_by_species = defaultdict(list)
for iris in iris_data:
    points_by_species[iris.label].append(iris.point)
    

グラフの描画
%matplotlib inline
from matplotlib import pyplot as plt
   
# Names of the four measurements, in the order they are stored in each point.
metrics = ['sepal length', 'sepal width', 'petal length', 'petal width']
# All 6 unordered pairs of measurement indices: (0,1), (0,2), ..., (2,3).
pairs = [(i, j) for i in range(4) for j in range(i + 1, 4)]
marks = ['+', '.', 'x']  # we have 3 classes, so 3 markers

# A 2 x 3 grid gives one panel per pair.
fig, ax = plt.subplots(2, 3)

for panel_index, (i, j) in enumerate(pairs):
    row, col = divmod(panel_index, 3)
    panel = ax[row][col]
    panel.set_title(f"{metrics[i]} vs {metrics[j]}", fontsize=8)
    panel.set_xticks([])
    panel.set_yticks([])

    # One scatter series per species, each with its own marker.
    for mark, (species, points) in zip(marks, points_by_species.items()):
        xs = [point[i] for point in points]
        ys = [point[j] for point in points]
        panel.scatter(xs, ys, marker=mark, label=species)

# The legend is drawn only on the bottom-right panel.
ax[-1][-1].legend(loc='lower right', prop={'size': 6})
    

テスト用データと学習用データへの分割
import random
from machine_learning import split_data
   
# Seed the random module before splitting: split_data shuffles with
# random.shuffle, so a fixed seed makes the split repeatable.
random.seed(12)
# 0.70 is split_data's `prob`: the fraction of the data that goes into the
# first returned part (iris_train); the rest becomes iris_test.
iris_train, iris_test = split_data(iris_data, 0.70)
    

分類
# Count how often each (predicted, actual) pair of labels occurs on the test set.
confusion_matrix = defaultdict(int)

for iris in iris_test:
    actual = iris.label
    predicted = knn_classify(5, iris_train, iris.point)   # vote among the 5 nearest
    confusion_matrix[(predicted, actual)] += 1

# Correct predictions are the entries whose predicted label equals the actual one.
num_correct = sum(
    count
    for (predicted_label, actual_label), count in confusion_matrix.items()
    if predicted_label == actual_label
)

pct_correct = num_correct / len(iris_test)
print(pct_correct, confusion_matrix)
    

12.3 次元の呪い

2020/10/21のサポートページの「仮想環境labo2020へのmatplotlibのインストール」の項目と同じように、tqdmをインストールしてください。 コマンドは「pip install tqdm」となります。

次元を増やした時の距離
import random
from linear_algebra import distance
  
def random_point(dim):
    """Return a list of dim values, each drawn with random.random()."""
    coordinates = []
    for _ in range(dim):
        coordinates.append(random.random())
    return coordinates
  
def random_distances(dim, num_pairs):
    """Return the distances between num_pairs pairs of random dim-dimensional points."""
    distances = []
    for _ in range(num_pairs):
        # Draw the two points in a fixed order so that seeded runs are repeatable.
        first = random_point(dim)
        second = random_point(dim)
        distances.append(distance(first, second))
    return distances
    

1次元から100次元までランダムな2点間の平均距離と最短距離
import tqdm
  
# Dimensions 1 through 100.
dimensions = range(1, 101)

# One entry per dimension, in the same order as `dimensions`.
avg_distances = []
min_distances = []

random.seed(0)   # fixed seed so the sweep is reproducible
for dim in tqdm.tqdm(dimensions, desc="Curse of Dimensionality"):
    distances = random_distances(dim, 10000)                 # 10,000 random pairs
    avg_distances.append(sum(distances) / len(distances))    # average distance
    min_distances.append(min(distances))                     # smallest distance
    

グラフの描画(1)
%matplotlib inline
from matplotlib import pyplot as plt
   
# Draw both curves against the number of dimensions.
for series, series_label in ((avg_distances, 'average distance'),
                             (min_distances, 'minimum distance')):
    plt.plot(dimensions, series, label=series_label)

plt.xlabel("# of dimensions")
plt.legend()
plt.title("10,000 Random Distances")
    

グラフの描画(2)
%matplotlib inline
from matplotlib import pyplot as plt
   
# Ratio of the minimum distance to the average distance, one value per dimension.
min_avg_ratio = []
for min_dist, avg_dist in zip(min_distances, avg_distances):
    min_avg_ratio.append(min_dist / avg_dist)

plt.plot(dimensions, min_avg_ratio)
plt.xlabel("# of dimensions")
plt.title("Minimum Distance / Average Distance")