신경망 학습

deep learning
공개

2025년 7월 30일

손실함수

오차 제곱합

  • \(E = \frac{1}{2} \sum_{i=1}^{n} (y_i - t_i)^2\)
  • \(y_i\): 예측값
  • \(t_i\): 정답값
  • \(\frac{1}{2}\)는 미분을 쉽게 하기 위해서 곱해주는 상수
import numpy as np

def sum_squared_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``.

    Args:
        y: Predicted values (NumPy array or scalar).
        t: Target values, broadcastable against ``y``.

    Returns:
        ``0.5 * sum((y - t) ** 2)`` as a scalar.
    """
    residual = y - t
    squared = residual ** 2
    return 0.5 * np.sum(squared)

교차 엔트로피

  • \(E = -\sum_{i=1}^{n} t_i \log(y_i)\)
  • 일반적으로 정답값인 \(t_i\)는 0 또는 1이기 때문에(one hot encoding), \(t_i = 1\)인 경우에만 계산된다.
  • 정답 클래스(\(t_i = 1\))에 대한 예측값 \(y_i\)가 1에 가까울수록 손실이 작아진다.
def cross_entropy_error(y, t):
    """Return the cross-entropy ``-sum(t * log(y))`` for one sample.

    Args:
        y: Predicted values.
        t: Target values, broadcastable against ``y``.

    Returns:
        The summed cross-entropy as a scalar.
    """
    # Small offset so that log(0) is never evaluated.
    eps = 1e-7
    log_pred = np.log(y + eps)
    return -np.sum(t * log_pred)

미니배치 학습

  • 모든 데이터를 한 번에 학습하는 것이 아니라, 일부 데이터만을 사용하여 학습하는 방법
import numpy as np
from dl_dataset.mnist import load_mnist

# Load the MNIST splits; normalize/one_hot_label are forwarded to load_mnist
# (presumably scaling pixels and one-hot encoding labels -- confirm in dl_dataset).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Number of training samples (length of the first axis).
train_size = len(x_train)
batch_size = 10

# Draw batch_size random indices from [0, train_size); np.random.choice
# samples with replacement by default, so an index can repeat.
batch_mask = np.random.choice(train_size, size=batch_size)

# Pick the same rows from inputs and labels so they stay aligned.
x_batch, t_batch = x_train[batch_mask], t_train[batch_mask]
def cross_entropy_error(y, t):
    """Return the mean cross-entropy when ``t`` holds class indices.

    Args:
        y: Predicted values, either one sample (1-D) or a batch (2-D,
            one row per sample).
        t: Integer class index for each sample; used to index into ``y``.

    Returns:
        The cross-entropy summed over the samples and divided by the
        number of samples.
    """
    # Promote a single sample to a batch of one row.
    if y.ndim == 1:
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    n_samples = y.shape[0]
    rows = np.arange(n_samples)
    # For each row, select the predicted value at that row's label index.
    picked = y[rows, t]
    return -np.sum(np.log(picked + 1e-7)) / n_samples
def cross_entropy_error(y, t):
    """Return the mean cross-entropy when ``t`` has the same layout as ``y``.

    Args:
        y: Predicted values, either one sample (1-D) or a batch (2-D,
            one row per sample).
        t: Target values multiplied element-wise with ``log(y)``
            (e.g. one-hot rows).

    Returns:
        ``-sum(t * log(y + 1e-7))`` divided by the number of samples.
    """
    # Promote a single sample to a batch of one row.
    if y.ndim == 1:
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    n_samples = y.shape[0]
    # The 1e-7 offset keeps log() away from zero.
    weighted_log = t * np.log(y + 1e-7)
    return -np.sum(weighted_log) / n_samples

경사법

  • 기울기에 마이너스를 붙인 방향(\(-\nabla f\))이 각 장소에서 함수의 출력 값을 가장 크게 줄이는 방향이다.
  • 그 방향에 반드시 최솟값이 있는 것은 아니지만, 그 방향으로 가야 함수의 값을 줄일 수 있다.
import numpy as np
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Every element of ``x`` is temporarily shifted by ``+h`` and ``-h``
    in place, ``f(x)`` is evaluated at both points, and the element is
    restored afterwards.

    Args:
        f: Callable invoked as ``f(x)``; its result is stored as one
            gradient element, so it should be a scalar.
        x: NumPy array of any number of dimensions. It is modified in
            place during the computation and restored before returning.
            A floating dtype is expected: the ``h`` shift would be
            truncated when written into an integer array.

    Returns:
        An array with the same shape and dtype as ``x`` holding
        ``(f(x + h) - f(x - h)) / (2 * h)`` for each element.
    """
    h = 1e-4
    grad = np.zeros_like(x)

    # np.ndindex yields one full index tuple per element, so matrices
    # (e.g. weight arrays) are handled as well as 1-D vectors.
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]

        x[idx] = tmp_val + h
        fxh1 = f(x)

        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        # Put the original value back before moving to the next element.
        x[idx] = tmp_val

    return grad
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Run ``step_num`` gradient-descent updates on ``f`` from ``init_x``.

    Args:
        f: Function to minimise; passed on to ``numerical_gradient``.
        init_x: Starting point. It is copied, so the caller's array is
            not modified.
        lr: Learning rate multiplied with the gradient at each step.
        step_num: Number of update steps.

    Returns:
        The point reached after the final update.
    """
    x = init_x.copy()

    for _step in range(step_num):
        # Step against the gradient, updating the working copy in place.
        x -= lr * numerical_gradient(f, x)

    return x
from dl_common.functions import softmax, cross_entropy_error
from dl_common.gradient import numerical_gradient

class simpleNet:
    """Single-layer network with a 2x3 weight matrix ``W`` and no bias."""

    def __init__(self):
        """Draw the 2x3 weight matrix from ``np.random.randn``."""
        self.W = np.random.randn(2, 3)

    def predict(self, x):
        """Return the dot product of ``x`` and ``W`` (no softmax applied)."""
        scores = np.dot(x, self.W)
        return scores

    def loss(self, x, t):
        """Return ``cross_entropy_error(softmax(predict(x)), t)``."""
        probs = softmax(self.predict(x))
        return cross_entropy_error(probs, t)
# Build a network; its weights are freshly sampled on construction.
net = simpleNet()
x = np.array([0.6, 0.9])
# Raw scores from predict(), which applies no softmax.
p = net.predict(x)
# Index of the largest score (trailing expression, shown as the cell output).
np.argmax(p)
0
# Target vector with a 1 in the last position (index 2).
t = np.array([0, 0, 1])
# Loss of the network on (x, t) (trailing expression, shown as the cell output).
net.loss(x, t)
1.5687375483483121
def f(w):
    # The argument is ignored: the loss is read from net directly.
    # Presumably numerical_gradient perturbs net.W in place, so each call
    # sees the shifted weights -- confirm against dl_common.gradient.
    return net.loss(x, t)


dW = numerical_gradient(f, net.W)
print(dW)
[[ 0.39309067  0.08192436 -0.47501503]
 [ 0.58963601  0.12288654 -0.71252255]]

학습 알고리즘 구현

from dl_common.functions import *
from dl_common.gradient import numerical_gradient

class TwoLayerNet:
    """Two-layer fully connected network with parameters in ``self.params``.

    ``params`` is a dict with the keys ``'W1'``, ``'b1'``, ``'W2'`` and
    ``'b2'``. The forward pass is ``softmax(sigmoid(x @ W1 + b1) @ W2 + b2)``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the weights and zero biases.

        Args:
            input_size: Number of input features (rows of ``W1``).
            hidden_size: Number of hidden units.
            output_size: Number of outputs (columns of ``W2``).
            weight_init_std: Factor applied to the ``np.random.randn``
                samples of both weight matrices.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        # W2 is scaled like W1; without the factor, weight_init_std would be
        # ignored for the second layer.
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        """Return the network output for ``x``.

        Args:
            x: Input array whose last axis matches ``input_size``.

        Returns:
            The result of ``softmax`` applied to the second layer's output.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        return softmax(a2)

    def loss(self, x, t):
        """Return ``cross_entropy_error`` between ``predict(x)`` and ``t``."""
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows in ``x`` whose arg-max prediction equals ``t``.

        Args:
            x: Batch of inputs, one row per sample.
            t: Labels, either 1-D class indices or one row per sample
                (reduced with ``argmax`` along axis 1).
        """
        y = self.predict(x)
        if t.ndim != 1:  # one-hot encoding
            t = np.argmax(t, axis=1)
        predicted = np.argmax(y, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return numerical gradients of the loss for every parameter.

        Args:
            x: Input batch.
            t: Labels for the batch.

        Returns:
            Dict with the keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``, each
            holding the result of the module-level ``numerical_gradient``
            for that parameter.
        """
        def loss_w(w):
            # The argument is ignored: the loss is read from self.params.
            # Presumably the module-level numerical_gradient perturbs those
            # arrays in place -- confirm against dl_common.gradient.
            return self.loss(x, t)

        # Inside this method the bare name resolves to the module-level
        # function, not to this method.
        return {
            key: numerical_gradient(loss_w, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }
from dl_dataset.mnist import load_mnist

# Load the MNIST splits; normalize/one_hot_label are forwarded to load_mnist
# (presumably scaling pixels and one-hot encoding labels -- confirm in dl_dataset).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Histories: loss after every iteration, accuracies once per epoch.
train_loss_list = []
train_acc_list = []
test_acc_list = []

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
# Whole number of iterations per epoch. Floor division keeps this an int so
# the `i % iter_per_epoch == 0` check below also fires when train_size is
# not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Random mini-batch; np.random.choice samples with replacement by default.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients of the loss for every parameter on this mini-batch.
    grads = network.numerical_gradient(x_batch, t_batch)

    # Gradient-descent update, applied in place to each parameter array.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grads[key]

    # Record the loss on the same mini-batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch (including iteration 0), evaluate on the full splits.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
맨 위로