Implementing Neural Network Layers Step by Step with NumPy

Author: admin · Published: 2024-06-17

1. Preparing the data

import numpy as np

if __name__ == "__main__":
    # Each row is (feature_1, feature_2, label)
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])

    x = data[:, :-1]   # features, shape (10, 2)
    y = data[:, -1]    # labels, shape (10,)

    for epoch in range(1000):
        ...

2. The fused Softmax + CrossEntropy layer

Differentiating the softmax layer on its own is a bit fiddly; it is more convenient to differentiate softmax and cross-entropy together.

For an input vector $(x_1, x_2, x_3)$, the corresponding loss is:

$$
\begin{align*}
L &= -\sum_{i=1}^{C} y_i \ln p_i \\
  &= -(y_1 \ln p_1 + y_2 \ln p_2 + y_3 \ln p_3)
\end{align*}
$$

where $y_i$ is the ground-truth label (a one-hot vector) and $p_i$ is the output probability:

$$
p_1=\frac{e^{x_1}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad
p_2=\frac{e^{x_2}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad
p_3=\frac{e^{x_3}}{e^{x_1}+e^{x_2}+e^{x_3}}
$$
The partial derivative with respect to $x_1$ is then:

$$
\begin{align*}
\frac{\partial L}{\partial x_1}
&= -y_1\frac{1}{p_1}\frac{\partial p_1}{\partial x_1}
 - y_2\frac{1}{p_2}\frac{\partial p_2}{\partial x_1}
 - y_3\frac{1}{p_3}\frac{\partial p_3}{\partial x_1} \\
&= -y_1\frac{1}{p_1}\cdot\frac{e^{x_1}(e^{x_1}+e^{x_2}+e^{x_3}) - e^{x_1}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2}
 - y_2\frac{1}{p_2}\cdot\frac{-e^{x_2}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2}
 - y_3\frac{1}{p_3}\cdot\frac{-e^{x_3}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} \\
&= -y_1\frac{1}{p_1}(p_1 p_2 + p_1 p_3)
 - y_2\frac{1}{p_2}(-p_1 p_2)
 - y_3\frac{1}{p_3}(-p_1 p_3) \\
&= -y_1(p_2 + p_3) + y_2 p_1 + y_3 p_1 \\
&= -y_1(1 - p_1) + y_2 p_1 + y_3 p_1 \\
&= y_1(p_1 - 1) + y_2 p_1 + y_3 p_1
\end{align*}
$$

Similarly:

$$
\begin{align*}
\frac{\partial L}{\partial x_2} &= y_1 p_2 + y_2(p_2 - 1) + y_3 p_2 \\
\frac{\partial L}{\partial x_3} &= y_1 p_3 + y_2 p_3 + y_3(p_3 - 1)
\end{align*}
$$

When $y_1 = 1$, the corresponding gradient is $(p_1 - 1,\ p_2,\ p_3)$; when $y_2 = 1$, it is $(p_1,\ p_2 - 1,\ p_3)$. In other words, the gradient of the fused softmax + cross-entropy layer is simply $p - y$.

For example, if the predicted probabilities are $(0.2, 0.3, 0.5)$ and the label is $(0, 0, 1)$, the gradient is $(0.2, 0.3, -0.5)$.
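
This $p - y$ form is easy to verify numerically. The sketch below (plain NumPy, with arbitrary made-up logits) compares the closed-form gradient against a central finite-difference estimate of the loss:

import numpy as np

def softmax_ce(x, y_onehot):
    # x: logits (c,), y_onehot: one-hot label (c,) -> (scalar loss, probabilities)
    p = np.exp(x - np.max(x))
    p /= p.sum()
    return -np.sum(y_onehot * np.log(p)), p

x = np.array([0.5, -1.0, 2.0])    # arbitrary logits
y = np.array([0.0, 0.0, 1.0])     # one-hot label

loss, p = softmax_ce(x, y)
analytic = p - y                  # gradient derived above

eps = 1e-6
numeric = np.zeros_like(x)
for i in range(len(x)):
    d = np.zeros_like(x)
    d[i] = eps
    numeric[i] = (softmax_ce(x + d, y)[0] - softmax_ce(x - d, y)[0]) / (2 * eps)

print(analytic)   # close to...
print(numeric)    # ...the finite-difference estimate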

The Python code is:

Note that when computing the softmax you should use np.exp(x - np.max(x, axis=1, keepdims=True)) to keep the exponentiation from overflowing.

class Softmax:
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, x, y):
        # Subtract the row-wise max for numerical stability
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)

        # Pick out the probabilities at the positions where y == 1
        loss = -np.sum(np.log(prob[np.arange(len(y)), y])) / len(y)

        self.grad = prob.copy()
        self.grad[np.arange(len(y)), y] -= 1
        # The layers below use np.sum rather than np.mean when accumulating
        # gradients, so averaging over the batch once here is enough.
        self.grad /= len(y)

        return prob, loss

    def backward(self):
        return self.grad
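
A minimal usage sketch of this layer, reproducing the worked example above (the logits are chosen so that their softmax is exactly (0.2, 0.3, 0.5)):

layer = Softmax(n_classes=3)
logits = np.log(np.array([[0.2, 0.3, 0.5]]))  # softmax of these logits is (0.2, 0.3, 0.5)
labels = np.array([2])                        # class index 2, i.e. one-hot (0, 0, 1)

prob, loss = layer.forward(logits, labels)
print(prob)             # [[0.2 0.3 0.5]]
print(loss)             # -ln(0.5), about 0.693
print(layer.backward()) # [[ 0.2  0.3 -0.5]], matching the example above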

3. A standalone CrossEntropy layer

The Python code is:

class Entropy:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.grad = None

    def forward(self, x, y):
        # x: probabilities (b, c), y: class indices (b,)
        b = y.shape[0]
        one_hot_y = np.zeros((b, self.n_classes))
        one_hot_y[range(len(y)), y] = 1
        # dL/dx = -y / x, averaged over the batch to match the mean loss below
        self.grad = -one_hot_y / x / b
        # mean over the batch of the per-sample loss -sum_i y_i * ln(x_i)
        return np.mean(np.sum(-one_hot_y * np.log(x), axis=1))

    def backward(self):
        return self.grad
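
A short usage sketch, feeding the probabilities from the earlier worked example directly into this loss layer:

ce = Entropy(n_classes=3)
probs = np.array([[0.2, 0.3, 0.5]])   # already-normalized probabilities
labels = np.array([2])

loss = ce.forward(probs, labels)
print(loss)          # -ln(0.5), about 0.693
print(ce.backward()) # [[ 0.  0. -2.]], i.e. -1/p at the true class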

4. A standalone Softmax layer

from einops import repeat, rearrange, einsum

class Softmax:
    def __init__(self):
        self.output = None

    def forward(self, x):
        # x: (b, c); subtract the row-wise max for numerical stability
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
        return self.output

    def backward(self, prev_grad):
        # Jacobian of the softmax: J[b, i, j] = p_i * (delta_ij - p_j)
        b, c = self.output.shape
        o = repeat(self.output, 'b c -> b c r', r=c)
        I = repeat(np.eye(c), 'c1 c2 -> b c1 c2', b=b)
        self.grad = o * (I - rearrange(o, 'b c1 c2 -> b c2 c1'))
        # Chain rule: dL/dx_j = sum_i prev_grad_i * J[i, j]
        return einsum(self.grad, prev_grad, 'b c1 c2, b c1 -> b c2')
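
Chaining this standalone Softmax with the Entropy layer above should reproduce the fused gradient $(p - y)/b$ derived in section 2. A minimal sketch of that check, assuming both classes from this post are in scope:

np.random.seed(0)
logits = np.random.randn(4, 3)
labels = np.array([0, 2, 1, 2])

sm = Softmax()                 # the standalone softmax defined just above
ce = Entropy(n_classes=3)

p = sm.forward(logits)
loss = ce.forward(p, labels)
grad_chained = sm.backward(ce.backward())

one_hot = np.zeros_like(p)
one_hot[np.arange(len(labels)), labels] = 1
grad_fused = (p - one_hot) / len(labels)

print(np.allclose(grad_chained, grad_fused))  # True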

5. The Linear layer

Note that $w$ is updated with $dw$, but what gets passed back to the previous layer is $dx$. The previous layer needs $\partial L/\partial \mathrm{out}$, and since this layer's input $x$ is exactly the previous layer's output, we have $\partial L/\partial \mathrm{out} = \partial L/\partial x$.

class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.lr = lr
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)

    def forward(self, x):
        self.x = x
        return x @ self.w + self.b

    def backward(self, prev_grad):
        # dL/dx = prev_grad @ w^T, dL/dw = x^T @ prev_grad, dL/db = column sums of prev_grad
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')
        db = np.sum(prev_grad, axis=0)

        self.w -= self.lr * dw
        self.b -= self.lr * db

        """
            Note that dx is what gets passed to the previous layer: it needs dL/d_out,
            and this layer's input x is exactly that layer's output, so dL/d_out = dL/dx.
        """
        return dx
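
A quick gradient check for dx, reusing the Linear class above. The quadratic loss is arbitrary, and lr is set to 0 so that backward leaves the weights untouched while the loss is probed numerically:

np.random.seed(1)
layer = Linear(in_channels=2, out_channels=3, lr=0.0)
x = np.random.randn(4, 2)

out = layer.forward(x)
dx = layer.backward(2 * out)       # upstream gradient of loss = sum(out**2)

eps = 1e-6
numeric = np.zeros_like(x)
for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        d = np.zeros_like(x)
        d[i, j] = eps
        numeric[i, j] = (np.sum(layer.forward(x + d) ** 2)
                         - np.sum(layer.forward(x - d) ** 2)) / (2 * eps)

print(np.allclose(dx, numeric))    # True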

6. Full training code

from einops import repeat, rearrange, einsum
import numpy as np


class Softmax:
    def __init__(self, train=True):
        self.grad = None
        self.train = train

    def forward(self, x, y):
        # Subtract the row-wise max for numerical stability
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        if self.train:
            loss = -np.sum(np.log(prob[range(len(y)), y])) / len(y)
            self.grad = prob.copy()
            self.grad[range(len(y)), y] -= 1
            self.grad /= len(y)
            return prob, loss
        else:
            return prob

    def backward(self):
        return self.grad


class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)
        self.lr = lr

    def forward(self, x):
        self.x = x
        output = einsum(x, self.w, 'b c1, c1 c2 -> b c2') + self.b
        return output

    def backward(self, prev_grad):
        # dx is propagated to the previous layer; dw and db update this layer's parameters
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')
        self.w -= self.lr * dw
        self.b -= self.lr * np.sum(prev_grad, axis=0)
        return dx


class Network:
    def __init__(self, in_channels, out_channels, n_classes, lr):
        self.lr = lr
        self.linear = Linear(in_channels, out_channels, lr)
        self.softmax = Softmax()

    def forward(self, x, y=None):
        out = self.linear.forward(x)
        out = self.softmax.forward(out, y)
        return out

    def backward(self):
        grad = self.softmax.backward()
        grad = self.linear.backward(grad)
        return grad


if __name__ == "__main__":
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])
    # x = np.concatenate([np.array([[1]] * data.shape[0]), data[:, :2]], axis=1)
    x = data[:, :-1]
    y = data[:, -1:].flatten()
    net = Network(2, 2, 2, 0.1)
    # loss_fn = CrossEntropy(n_classes=2)
    for epoch in range(500):
        prob, loss = net.forward(x, y)
        # loss = loss_fn.forward(out, y)
        # grad_ = loss_fn.backward()
        grad = net.backward()
        print(loss)
    net.softmax.train = False
    print(net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]])), y)