Chapter 7: machine learning
7.1 Shai Ben-David
https://www.youtube.com/playlist?list=PLPW2keNyw-usgvmR7FTQ3ZRjfLs5jT4BO
7.2 deep learning
Esc = Einstein summation convention
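\begin{aligned}
W\boldsymbol{x}= & \begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu}\\
\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x}\\
\boldsymbol{y}^{\intercal}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
= & x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}
\end{aligned}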
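\begin{aligned}
W\boldsymbol{x}= & \begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=W\boldsymbol{x}= & \begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}b_{0}+\sum\limits_{j=1}^{n}w_{0j}x_{j}\\ b_{1}+\sum\limits_{j=1}^{n}w_{1j}x_{j}\\ \vdots\\ b_{m}+\sum\limits_{j=1}^{n}w_{mj}x_{j}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}+w_{\mu j}x_{j}\\
\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{y}^{\intercal}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
= & x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
= & b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}=\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}
\end{aligned}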
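\begin{aligned}
W\boldsymbol{x}= & \begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}\sum\limits_{\nu=0}^{n}w_{0\nu}x_{\nu}\\ \sum\limits_{\nu=0}^{n}w_{1\nu}x_{\nu}\\ \vdots\\ \sum\limits_{\nu=0}^{n}w_{m\nu}x_{\nu}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=w_{\mu\nu}x_{\nu},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=W\boldsymbol{x}= & \begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\begin{pmatrix}b_{0}+\sum\limits_{j=1}^{n}w_{0j}x_{j}\\ b_{1}+\sum\limits_{j=1}^{n}w_{1j}x_{j}\\ \vdots\\ b_{m}+\sum\limits_{j=1}^{n}w_{mj}x_{j}\end{pmatrix}\overset{\text{Esc}}{=}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=b_{\mu}+w_{\mu j}x_{j}\\
\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{y}^{\intercal}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=y_{\mu}^{\intercal}=\left(w_{\mu\nu}x_{\nu}\right)^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\left[\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right]^{\intercal}=\left[W\boldsymbol{x}\right]^{\intercal}\\
= & x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=x_{\nu}^{\intercal}w_{\nu\mu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\boldsymbol{x}^{\intercal}W^{\intercal}\\
= & b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{\mu j}^{\intercal}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}=\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=x_{\nu}^{\intercal}w_{\nu\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}
\end{aligned}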
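\begin{aligned}
\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu\nu}x_{\nu}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\boldsymbol{y}= & \begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=y_{\mu}=w_{\mu j}x_{j}+b_{\mu}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}=x_{\nu}^{\intercal}w_{\nu\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}\\
=\boldsymbol{x}^{\intercal}W^{\intercal}= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}=b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}=y_{\mu}^{\intercal}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}=\boldsymbol{y}^{\intercal}
\end{aligned}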
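\begin{aligned}
\sigma\left(\boldsymbol{y}\right)= & \sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu\nu}x_{\nu}\right)=\sigma\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\left(\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\sigma\left(\boldsymbol{y}\right)= & \sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\left(\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}\varsigma=\left(x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\right)\varsigma=\left(x_{\nu}^{\intercal}w_{\nu\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
=\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}\varsigma=\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma
\end{aligned}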
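\begin{aligned}
\sigma\left(\boldsymbol{y}\right)= & \sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu\nu}x_{\nu}\right)=\sigma\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\left(\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\sigma\boldsymbol{y}= & \sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\sigma\left(\boldsymbol{y}\right)= & \sigma\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma\left(y_{\mu}\right)=\sigma\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\left(\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}\right)=\sigma\left(W\boldsymbol{x}\right),\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
=\sigma\boldsymbol{y}= & \sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}\varsigma=\left(x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\right)\varsigma=\left(x_{\nu}^{\intercal}w_{\nu\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
=\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}\varsigma=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma\\
=\left(\boldsymbol{x}^{\intercal}W^{\intercal}\right)\varsigma= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}\varsigma=\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma=\left(y_{\mu}^{\intercal}\right)\varsigma=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma=\left(\boldsymbol{y}^{\intercal}\right)\varsigma\\
=\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}\varsigma=\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma
\end{aligned}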
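\begin{aligned}
\sigma\boldsymbol{y}= & \sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\sigma\boldsymbol{y}= & \sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}\varsigma=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma\\
=\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}\varsigma=\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma
\end{aligned}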
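\begin{aligned}
\boldsymbol{z}= & \sigma\boldsymbol{y}=\sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}w_{\mu\nu}x_{\nu}=\sigma_{\mu}\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}=\sigma\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}x_{0}=1\\ w_{\mu0}=b_{\mu}\end{cases}\\
=\boldsymbol{z}=z_{\mu}= & \sigma_{\mu}\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}=\sigma_{\mu}y_{\mu}=\sigma_{\mu}\left(w_{\mu j}x_{j}+b_{\mu}\right)=\sigma_{\mu}\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}=\sigma\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}\begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}=\sigma W\boldsymbol{x},\begin{cases}1=x_{0}\\ b_{\mu}=w_{\mu0}\end{cases}\\
\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}x_{0}\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}w_{00} & w_{01} & \cdots & w_{0n}\\ w_{10} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ w_{m0} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}w_{0\nu}x_{\nu}\\ w_{1\nu}x_{\nu}\\ \vdots\\ w_{m\nu}x_{\nu}\end{pmatrix}^{\intercal}\varsigma=x_{\nu}^{\intercal}w_{\mu\nu}^{\intercal}\varsigma_{\mu}=x_{\nu}^{\intercal}w_{\nu\mu}\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=\boldsymbol{y}^{\intercal}\varsigma=\boldsymbol{z}^{\intercal}\\
=\boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & \begin{pmatrix}1\\ x_{1}\\ \vdots\\ x_{n}\end{pmatrix}^{\intercal}\begin{pmatrix}b_{0} & w_{01} & \cdots & w_{0n}\\ b_{1} & w_{11} & \cdots & w_{1n}\\ \vdots & \vdots & \ddots & \vdots\\ b_{m} & w_{m1} & \cdots & w_{mn}\end{pmatrix}^{\intercal}\varsigma=\begin{pmatrix}b_{0}+w_{0j}x_{j}\\ b_{1}+w_{1j}x_{j}\\ \vdots\\ b_{m}+w_{mj}x_{j}\end{pmatrix}^{\intercal}\varsigma=\left(b_{\mu}^{\intercal}+x_{j}^{\intercal}w_{j\mu}\right)\varsigma_{\mu}=y_{\mu}^{\intercal}\varsigma_{\mu}=\begin{pmatrix}y_{0}\\ y_{1}\\ \vdots\\ y_{m}\end{pmatrix}^{\intercal}\varsigma_{\mu}=z_{\mu}^{\intercal}=\boldsymbol{z}^{\intercal}
\end{aligned}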
\begin{aligned} \boldsymbol{z}= & \sigma\boldsymbol{y}=\sigma_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}x_{{\scriptscriptstyle \nu}}=\sigma W\boldsymbol{x},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle \mu0}}=b_{{\scriptscriptstyle \mu}} \end{cases}\\ =z_{{\scriptscriptstyle \mu}}= & \sigma_{{\scriptscriptstyle \mu}}y_{{\scriptscriptstyle \mu}}=\sigma_{{\scriptscriptstyle \mu}}\left(w_{{\scriptscriptstyle \mu j}}x_{{\scriptscriptstyle j}}+b_{{\scriptscriptstyle \mu}}\right),\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \mu}}=w_{{\scriptscriptstyle \mu0}} \end{cases}\\ \boldsymbol{x}^{\intercal}W^{\intercal}\varsigma= & x_{{\scriptscriptstyle \nu}}^{\intercal}w_{{\scriptscriptstyle \mu\nu}}^{\intercal}\varsigma_{{\scriptscriptstyle \mu}}=x_{{\scriptscriptstyle \nu}}^{\intercal}w_{{\scriptscriptstyle \nu\mu}}\varsigma_{{\scriptscriptstyle \mu}}=\boldsymbol{y}^{\intercal}\varsigma=\boldsymbol{z}^{\intercal}\\ = & \left(b_{{\scriptscriptstyle \mu}}^{\intercal}+x_{{\scriptscriptstyle j}}^{\intercal}w_{{\scriptscriptstyle j\mu}}\right)\varsigma_{{\scriptscriptstyle \mu}}=y_{{\scriptscriptstyle \mu}}^{\intercal}\varsigma_{{\scriptscriptstyle \mu}}=z_{{\scriptscriptstyle \mu}}^{\intercal} \end{aligned}
4-15
wrong or incompatible transpose
\begin{aligned} \boldsymbol{x}^{\intercal}W= & \begin{pmatrix}x_{{\scriptscriptstyle 0}} & x_{{\scriptscriptstyle 1}} & \cdots & x_{{\scriptscriptstyle m}}\end{pmatrix}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\\ = & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}\sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ \sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ \sum\limits _{\mu=0}^{m}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\\ \overset{\text{Einstein summation convention}}{=} & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle 
\mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\\ = & x_{{\scriptscriptstyle \mu}}^{\intercal}w_{{\scriptscriptstyle \mu\nu}}=\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}\right)^{\intercal}? \end{aligned}
4-18
wrong or incompatible transpose
\begin{aligned} \boldsymbol{x}^{\intercal}W= & \begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle 0\nu}}=b_{{\scriptscriptstyle \nu}} \end{cases}\\ = & \begin{pmatrix}1\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}b_{{\scriptscriptstyle 0}} & b_{{\scriptscriptstyle 1}} & \cdots & b_{{\scriptscriptstyle n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}=\begin{pmatrix}x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i0}}+b_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i1}}+b_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{i}w_{{\scriptscriptstyle in}}+b_{{\scriptscriptstyle n}} \end{pmatrix}^{\intercal},\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \nu}}=w_{{\scriptscriptstyle 0\nu}} \end{cases} \end{aligned}
wrong or incompatible transpose
\begin{aligned} \sigma\left(\boldsymbol{x}^{\intercal}W\right)= & \sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}w_{{\scriptscriptstyle 00}} & w_{{\scriptscriptstyle 01}} & \cdots & w_{{\scriptscriptstyle 0n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\right)=\sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\\ \vdots\\ x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}} \end{pmatrix}^{\intercal}\right)=\begin{pmatrix}\sigma_{{\scriptscriptstyle 0}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu0}}\right)\\ \sigma_{{\scriptscriptstyle 1}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu1}}\right)\\ \vdots\\ \sigma_{{\scriptscriptstyle n}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu n}}\right) \end{pmatrix}^{\intercal},\begin{cases} x_{{\scriptscriptstyle 0}}=1\\ w_{{\scriptscriptstyle 0\nu}}=b_{{\scriptscriptstyle \nu}} \end{cases}\\ = & \sigma\left(\begin{pmatrix}1\\ x_{{\scriptscriptstyle 1}}\\ \vdots\\ x_{{\scriptscriptstyle m}} \end{pmatrix}^{\intercal}\begin{pmatrix}b_{{\scriptscriptstyle 0}} & b_{{\scriptscriptstyle 1}} & \cdots & b_{{\scriptscriptstyle n}}\\ w_{{\scriptscriptstyle 10}} & w_{{\scriptscriptstyle 11}} & \cdots & w_{{\scriptscriptstyle 1n}}\\ \vdots & \vdots & \ddots & \vdots\\ w_{{\scriptscriptstyle m0}} & w_{{\scriptscriptstyle m1}} & \cdots & w_{{\scriptscriptstyle mn}} \end{pmatrix}\right)=\sigma\left(\begin{pmatrix}x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i0}}+b_{{\scriptscriptstyle 0}}\\ x_{{\scriptscriptstyle i}}w_{{\scriptscriptstyle i1}}+b_{{\scriptscriptstyle 1}}\\ \vdots\\ 
x_{i}w_{{\scriptscriptstyle in}}+b_{{\scriptscriptstyle n}} \end{pmatrix}^{\intercal}\right)=\sigma_{{\scriptscriptstyle \nu}}\left(x_{{\scriptscriptstyle \mu}}w_{{\scriptscriptstyle \mu\nu}}\right),\begin{cases} 1=x_{{\scriptscriptstyle 0}}\\ b_{{\scriptscriptstyle \nu}}=w_{{\scriptscriptstyle 0\nu}} \end{cases} \end{aligned}