Chapter 7: machine learning

7.2 deep learning

7.2.1 我妻幸長

Esc = Einstein summation convention

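\begin{aligned}
W\boldsymbol{x} & =\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu}\\
\boldsymbol{y} & =\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x}\\
\boldsymbol{y}^{\intercal} & =\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}
\end{aligned}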


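\begin{aligned}
W\boldsymbol{x} & =\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =W\boldsymbol{x}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}b_{0}+\sum\limits_{j=1}^{n}w_{0j}x_{j}\\ b_{1}+\sum\limits_{j=1}^{n}w_{1j}x_{j}\\ \vdots\\ b_{m}+\sum\limits_{j=1}^{n}w_{mj}x_{j}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}+w_{\mu j}x_{j}\\
\boldsymbol{y} & =\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\boldsymbol{y}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{y}^{\intercal} & =\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}
\end{aligned}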


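\begin{aligned}
W\boldsymbol{x} & =\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =W\boldsymbol{x}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}b_{0}+\sum\limits_{j=1}^{n}w_{0j}x_{j}\\ b_{1}+\sum\limits_{j=1}^{n}w_{1j}x_{j}\\ \vdots\\ b_{m}+\sum\limits_{j=1}^{n}w_{mj}x_{j}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}+w_{\mu j}x_{j}\\
\boldsymbol{y} & =\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\boldsymbol{y}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{y}^{\intercal} & =\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=x_{\nu}^{\intercal}w_{\nu\mu}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
 & =b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=x_{\nu}^{\intercal}w_{\nu\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}
\end{aligned}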


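\begin{aligned}
\boldsymbol{y} & =\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\boldsymbol{y}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal} & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=x_{\nu}^{\intercal}w_{\nu\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}\\
 & =\boldsymbol{x}^{\intercal}W^{\intercal}=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}=\boldsymbol{y}^{\intercal}
\end{aligned}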


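\begin{aligned}
\sigma\left(\boldsymbol{y}\right) & =\sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu\nu}x_{\nu}\right)=\sigma\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\left(\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\sigma\left(\boldsymbol{y}\right)=\sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\left(\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\varsigma\\
 & =\left(x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\right)\varsigma=\left(x_{\nu}^{\intercal}w_{\nu\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
 & =\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}\varsigma\\
 & =\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma
\end{aligned}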


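\begin{aligned}
\sigma\left(\boldsymbol{y}\right) & =\sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu\nu}x_{\nu}\right)=\sigma\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\left(\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\sigma\boldsymbol{y}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\sigma\left(\boldsymbol{y}\right)=\sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\left(\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
 & =\sigma\boldsymbol{y}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\varsigma\\
 & =\left(x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\right)\varsigma=\left(x_{\nu}^{\intercal}w_{\nu\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
 & =\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma=\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\varsigma\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma\\
 & =\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}\varsigma\\
 & =\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
 & =\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}\varsigma\\
 & =\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma
\end{aligned}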


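\begin{aligned}
\sigma\boldsymbol{y} & =\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\sigma\boldsymbol{y}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\varsigma\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma\\
 & =\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}\varsigma\\
 & =\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma
\end{aligned}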


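\begin{aligned}
\boldsymbol{z} & =\sigma\boldsymbol{y}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
 & =\boldsymbol{z}=z_{\mu}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma & =\begin{pmatrix}x_{0} & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu} & w_{1\nu}x_{\nu} & \cdots & w_{m\nu}x_{\nu}\end{pmatrix}\varsigma\\
 & =x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma=\boldsymbol{z}^{\intercal}\\
 & =\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma=\begin{pmatrix}1 & x_{1} & \cdots & x_{n}\end{pmatrix}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j} & b_{1}+w_{1j}x_{j} & \cdots & b_{m}+w_{mj}x_{j}\end{pmatrix}\varsigma\\
 & =\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0} & y_{1} & \cdots & y_{m}\end{pmatrix}\varsigma_{\mu}=z_{\mu}^{\intercal}=\boldsymbol{z}^{\intercal}
\end{aligned}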


\begin{aligned} \boldsymbol{z}= & \sigma\boldsymbol{y}=\sigma_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}x_{{\scriptscriptstyle \nu}}=\sigma W\boldsymbol{x},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle \mu0}}=b_{{\scriptscriptstyle \mu}} \end{cases}\\ =z_{{\scriptscriptstyle \mu}}= & \sigma_{{\scriptscriptstyle \mu}}y_{{\scriptscriptstyle \mu}}=\sigma_{{\scriptscriptstyle \mu}}\left(w_{{\scriptscriptstyle \mu j}}x_{{\scriptscriptstyle j}}+b_{{\scriptscriptstyle \mu}}\right),\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \mu}}=w_{{\scriptscriptstyle \mu0}} \end{cases}\\ \boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & x_{{\scriptscriptstyle \nu}}^{\intercal}w_{{\scriptscriptstyle \mu\nu}}^{\intercal}\varsigma_{{\scriptscriptstyle \mu}}=x_{{\scriptscriptstyle \nu}}^{\intercal}w_{{\scriptscriptstyle \nu\mu}}\varsigma_{{\scriptscriptstyle \mu}}=\boldsymbol{y}^{\intercal}\varsigma=\boldsymbol{z}^{\intercal}\\ = & \left(b_{{\scriptscriptstyle \mu}}^{\intercal}+x_{{\scriptscriptstyle j}}^{\intercal}w_{{\scriptscriptstyle j\mu}}\right)\varsigma_{{\scriptscriptstyle \mu}}=y_{{\scriptscriptstyle \mu}}^{\intercal}\varsigma_{{\scriptscriptstyle \mu}}=z_{{\scriptscriptstyle \mu}}^{\intercal} \end{aligned}


matrix calculus[57]

4-15

wrong or incompatible transpose

\begin{aligned} \boldsymbol{x}^{\intercal}W= & \begin{pmatrix}x_{{\scriptscriptstyle 0}} & x_{{\scriptscriptstyle 1}} & \cdots & x_{{\scriptscriptstyle m}}\end{pmatrix}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\\ = & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}\sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ \sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ \sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\\ \overset{\text{Einstein summation convention}}{=} & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\\ = & x_{{\scriptscriptstyle \mu}}^{\intercal}w_{{\scriptscriptstyle \mu\nu}}=\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}\right)^{\intercal}? \end{aligned}

4-18

wrong or incompatible transpose

\begin{aligned} \boldsymbol{x}^{\intercal}W= & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle 0\nu}}=b_{{\scriptscriptstyle \nu}} \end{cases}\\ = & \begin{pmatrix}1\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}b_{{\scriptscriptstyle 0}} & b_{{\scriptscriptstyle 1}} & \cdots & b_{{\scriptscriptstyle n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i0}}+b_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i1}}+b_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{i}w_{{\scriptscriptstyle in}}+b_{{\scriptscriptstyle n}} \end{pmatrix}^{\intercal},\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \nu}}=w_{{\scriptscriptstyle 0\nu}} \end{cases} \end{aligned}

wrong or incompatible transpose

\begin{aligned} \sigma\left(\boldsymbol{x}^{\intercal}W\right)= & \sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\right)=\sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\right)=\begin{pmatrix}\sigma_{{\scriptscriptstyle 0}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\right)\\ \sigma_{{\scriptscriptstyle 1}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\right)\\ \vdots\\ \sigma_{{\scriptscriptstyle n}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}}\right) \end{pmatrix}^{\intercal},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle 0\nu}}=b_{{\scriptscriptstyle \nu}} \end{cases}\\ = & \sigma\left(\begin{pmatrix}1\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}b_{{\scriptscriptstyle 0}} & b_{{\scriptscriptstyle 1}} & \cdots & b_{{\scriptscriptstyle n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\right)=\sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i0}}+b_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i1}}+b_{{\scriptscriptstyle 1}}\\ \vdots\\ 
x_{i}w_{{\scriptscriptstyle in}}+b_{{\scriptscriptstyle n}} \end{pmatrix}^{\intercal}\right)=\sigma_{{\scriptscriptstyle \nu}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}\right),\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \nu}}=w_{{\scriptscriptstyle 0\nu}} \end{cases} \end{aligned}