๋ณธ๋ฌธ ๋ฐ”๋กœ๊ฐ€๊ธฐ

[๋ฐ‘๋ฐ”๋‹ฅ๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ๋”ฅ๋Ÿฌ๋‹ 1] chap5(์˜ค์ฐจ์—ญ์ „ํŒŒ๋ฒ•)

by ์ œ๋ฃฝ 2023. 7. 8.

5-1) Computational Graphs
  • A graph representation of a computation process
  • Node / edge: the computation is expressed as nodes, and the straight lines connecting nodes are called edges

⇒ ์ˆœ์ „ํŒŒ๋ผ๊ณ  ํ•จ(์™ผ์ชฝ์—์„œ ์˜ค๋ฅธ์ชฝ์œผ๋กœ ์ง„ํ–‰)

๊ตญ์†Œ์  ๊ณ„์‚ฐ
  • ์ „์ฒด์ ์œผ๋กœ ๋ณด๋ฉด ๋ณต์žกํ•œ ๊ณ„์‚ฐ์‹์ด์ง€๋งŒ ํŒŒ๊ณ ๋“ค๋ฉด ๊ฐ„๋‹จํ•œ ์ˆ˜์‹์œผ๋กœ ์ด๋ฃจ์–ด์ ธ ์žˆ๋Š” ⇒ ๊ตญ์†Œ์  ๊ณ„์‚ฐ์ด๋ผ๊ณ  ํ•จ.( ์ „์ฒด ์‹์€ ๊ฐœ๋ณต์žก but, ์•ˆ์œผ๋กœ ๋“ค์–ด๊ฐ€๋ฉด ๋ง์…ˆ ๋บ„์…ˆ์œผ๋กœ ์ด๋ฃจ์–ด์ง)
๊ณ„์‚ฐ ๊ทธ๋ž˜ํ”„์˜ ์ด์ 
  • ๊ตญ์†Œ์  ๊ณ„์‚ฐ
  • ์ „์ฒด๊ฐ€ ๋ณต์žกํ•ด๋„ ์•ˆ์—์„œ๋Š” ๊ฐ„๋‹จํ•˜๊ฒŒ ๋ฌธ์ œ ๋‹จ์ˆœํ™”ํ•ด์„œ ํ’€ ์ˆ˜ ์žˆ์Œ
  • ์ค‘๊ฐ„ ์ค‘๊ฐ„์˜ ๊ณ„์‚ฐ ๊ฒฐ๊ณผ ์ €์žฅ ๊ฐ€๋Šฅ
  • ์—ญ์ „ํŒŒ๋ฅผ ํ†ตํ•ด ‘๋ฏธ๋ถ„’์„ ํšจ์œจ์ ์œผ๋กœ ๊ณ„์‚ฐ ๊ฐ€๋Šฅ
5-2) Chain Rule
  • A property of the derivative of composite functions
  • The derivative of a composite function can be expressed as the product of the derivatives of the functions that make it up
5-3) Backpropagation
5-3-1) Backpropagation at an Addition Node
  • Simply passes the upstream value on to the next node unchanged (for addition, the local derivative is 1), e.g. z = x + y
5-3-2) Backpropagation at a Multiplication Node
  • Multiplies the upstream derivative by the value of the other forward-pass input (the two inputs are swapped)
5-4) Implementing Simple Layers
  • Multiplication layer (MulLayer)
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y

        return out

    def backward(self, dout):
        dx = dout * self.y  # swap x and y
        dy = dout * self.x

        return dx, dy
from layer_naive import *

apple = 100
apple_num = 2
tax = 1.1

# layers
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)  # 220

# backward
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(dapple, dapple_num, dtax)  # 2.2 110 200
  • ๋ง์…ˆ ๊ณ„์ธต
class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y

        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1

        return dx, dy
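A usage sketch combining AddLayer with MulLayer, in the same style as the apple example above but extended with oranges (the exact printed floats will carry small rounding noise, so the expected values in the comments are approximate):

apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layers
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)                # 200
orange_price = mul_orange_layer.forward(orange, orange_num)            # 450
all_price = add_apple_orange_layer.forward(apple_price, orange_price)  # 650
price = mul_tax_layer.forward(all_price, tax)                          # ~715

# backward (run the layers in reverse order)
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)                      # 1.1, 650
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)            # ~2.2, ~110
dorange, dorange_num = mul_orange_layer.backward(dorange_price)        # ~3.3, ~165

print(price)                                           # ~715
print(dapple_num, dapple, dorange, dorange_num, dtax)  # ~110 2.2 3.3 165 650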

 

5-5) Implementing the Activation Function Layers (ReLU, Sigmoid)
  • ReLU layer
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)  # boolean mask, True where the input is <= 0
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0  # block the gradient where the input was <= 0
        dx = dout

        return dx
    • x๊ฐ€ 0๋ณด๋‹ค ํฌ๋ฉด ์—ญ์ „ํŒŒ์˜ ๊ฒฝ์šฐ, ์ƒ๋ฅ˜ ๊ฐ’์„ ํ•˜๋ฅ˜๋กœ ๊ทธ๋Œ€๋กœ ์ „๋‹ฌ
    • x๊ฐ€ 0๋ณด๋‹ค ์ž‘์œผ๋ฉด 0์œผ๋กœ ์ „๋‹ฌ
  • Sigmoid layer
    • Step-by-step version (equation): write y = 1 / (1 + exp(-x)) as a computational graph of "/", "+", "exp", and "×" nodes and backpropagate through each node, which gives ∂L/∂x = (∂L/∂y) · y² · exp(-x)
    • Simplified version: (∂L/∂y) · y² · exp(-x) = (∂L/∂y) · y(1 - y)
    • In the backward pass, the gradient can therefore be computed from the forward output y alone
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        # sigmoid(x) = 1 / (1 + np.exp(-x)); keep the forward output so the
        # backward pass can reuse it
        out = sigmoid(x)
        self.out = out

        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out

        return dx
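A quick numeric check of both activation layers above (a minimal sketch: it assumes NumPy and defines the sigmoid() helper that the Sigmoid class relies on):

import numpy as np

def sigmoid(x):
    # helper assumed by the Sigmoid class above
    return 1.0 / (1.0 + np.exp(-x))

x = np.array([[1.0, -0.5],
              [-2.0, 3.0]])

relu = Relu()
print(relu.forward(x))                 # [[1. 0.] [0. 3.]]  negatives are clipped to 0
print(relu.backward(np.ones_like(x)))  # [[1. 0.] [0. 1.]]  gradient blocked where x <= 0

sig = Sigmoid()
y = sig.forward(x)
print(sig.backward(np.ones_like(x)))   # same as y * (1 - y), computed from the forward output alone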
5-6) Implementing the Affine/Softmax Layers
Affine Layer
  • The matrix product performed in the forward pass is called an affine transformation
  • The layer that performs this affine transformation is called the Affine layer
  • Backpropagation through the Affine layer (forward pass Y = X·W + B):
    ∂L/∂X = ∂L/∂Y · Wᵀ
    ∂L/∂W = Xᵀ · ∂L/∂Y
๋ฐฐ์น˜์šฉ Affine ๊ณ„์ธต
  • ๋ฐ์ดํ„ฐ N๊ฐœ๋ฅผ ๋ฌถ์€ ๊ฒฝ์šฐ
import numpy as np

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b

        self.x = None
        self.original_x_shape = None
        # gradients of the weight and bias parameters
        self.dW = None
        self.db = None

    def forward(self, x):
        # tensor support: flatten everything after the batch axis
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        dx = dx.reshape(*self.original_x_shape)  # restore the original input shape (tensor support)
        return dx
  • ์ˆœ์ „ํŒŒ์˜ ํŽธํ–ฅ ๋ง์…ˆ: ๊ฐ ๋ฐ์ดํ„ฐ์— ํŽธํ–ฅ์ด ๋”ํ•ด์ง
  • ์—ญ์ „ํŒŒ์˜ ํŽธํ–ฅ: ๋ฐ์ดํ„ฐ์˜ ํ•ฉ์„ ๊ตฌํ•œ ๋’ค ๋ฏธ๋ถ„์œผ๋กœ ํŽธํ–ฅ ๊ตฌํ•จ
Softmax-with-Loss Layer
  • Softmax: normalization, used in the output layer, the outputs sum to 1
  • Converts scores into probabilities (normalized to the range 0 to 1)
  • At inference time, however, it is enough to pick the class with the highest score, so converting to probabilities is unnecessary ⇒ softmax is not used (it is used during training)
  • output - target (the difference between the Softmax layer's output and the one-hot target label)
  • The goal of the neural network is to adjust the weight parameters so that its output gets closer to the target.
  • This error (output - target) is what gets propagated to the preceding layers ⇒ the error must be passed to them efficiently.
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None  # loss value
        self.y = None     # output of softmax
        self.t = None     # target labels (one-hot encoded)

    def forward(self, x, t):
        # assumes the softmax() and cross_entropy_error() helpers (the book's common module)
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # when the target labels are one-hot encoded
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx

 

5-7) Implementing Backpropagation
  • Implementation steps
    • Premise: learning is the process of adjusting the weights and biases so that they fit the training data
    1. Step 1: mini-batch
      • Randomly sample a subset of the training data (a mini-batch); the goal is to reduce the value of the loss function on this mini-batch
    2. Step 2: compute the gradients
      • To reduce the loss, compute the gradient of the loss function with respect to each weight parameter ⇒ the gradient indicates the direction that makes the loss smaller
    3. Step 3: update the parameters
      • Update the weight parameters a small step in that gradient direction
    4. Step 4: repeat steps 1~3
  • Code (a minimal training loop that runs these steps end to end is sketched at the end of this section)
import sys, os
sys.path.append(os.pardir)  # make files in the parent directory importable
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict


class TwoLayerNet:

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # initialize the weights
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # build the layers
        self.layers = OrderedDict()  # keeps insertion order so the backward pass can traverse the layers in reverse
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    # x: input data, t: target labels
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: target labels
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # store the results
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db

        return grads
  • ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆํ•˜๊ธฐ
    • ์ˆ˜์น˜ ๋ฏธ๋ถ„ vs (ํ•ด์„์  ์ˆ˜์‹)์˜ค์ฐจ์—ญ์ „ํŒŒ๋ฒ•
    • ์ˆ˜์น˜๋ฏธ๋ถ„๊ณผ ์˜ค์ฐจ์—ญ์ „ํŒŒ๋ฒ•์˜ ๊ฒฐ๊ณผ๋ฅผ ๋น„๊ต ⇒ ์˜ค์ฐจ์—ญ์ „ํŒŒ๋ฒ•์„ ์ œ๋Œ€๋กœ ๊ตฌํ˜„ํ–ˆ๋Š”๊ฐ€์— ๋Œ€ํ•œ ๊ฒ€์ฆ ⇒ ๊ธฐ์šธ๊ธฐ ํ™•์ธ
import sys, os
sys.path.append(os.pardir)  # make files in the parent directory importable
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# compute the mean absolute difference for each weight
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))
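To close the section, here is a minimal mini-batch SGD training loop that runs steps 1~3 above using TwoLayerNet.gradient(); the hyperparameters (10,000 iterations, batch size 100, learning rate 0.1) are illustrative and roughly follow the book's own training script:

import sys, os
sys.path.append(os.pardir)  # make files in the parent directory importable
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000       # illustrative hyperparameters
batch_size = 100
learning_rate = 0.1
train_size = x_train.shape[0]

for i in range(iters_num):
    # step 1: sample a mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # step 2: compute the gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # step 3: nudge each parameter against its gradient
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    if i % 1000 == 0:
        print(network.loss(x_batch, t_batch))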

     
