Chapter 7: machine learning
7.1 Shai Ben-David
https://www.youtube.com/playlist?list=PLPW2keNyw-usgvmR7FTQ3ZRjfLs5jT4BO
7.2 deep learning
Esc = Einstein summation convention
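$$
\begin{aligned}
W x &= \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \begin{pmatrix} \sum_{\nu=0}^{n} w_{0\nu} x_\nu \\ \sum_{\nu=0}^{n} w_{1\nu} x_\nu \\ \vdots \\ \sum_{\nu=0}^{n} w_{m\nu} x_\nu \end{pmatrix} \overset{\text{Esc}}{=} \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = w_{\mu\nu} x_\nu \\
y &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu\nu} x_\nu = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x \\
y^{\intercal} &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y_\mu^{\intercal} = \left( w_{\mu\nu} x_\nu \right)^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \left[ \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right]^{\intercal} = \left[ W x \right]^{\intercal} \\
&= x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = x^{\intercal} W^{\intercal} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal}
\end{aligned}
$$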
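$$
\begin{aligned}
W x &= \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \begin{pmatrix} \sum_{\nu=0}^{n} w_{0\nu} x_\nu \\ \sum_{\nu=0}^{n} w_{1\nu} x_\nu \\ \vdots \\ \sum_{\nu=0}^{n} w_{m\nu} x_\nu \end{pmatrix} \overset{\text{Esc}}{=} \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = w_{\mu\nu} x_\nu, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= W x = \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \begin{pmatrix} b_0 + \sum_{j=1}^{n} w_{0j} x_j \\ b_1 + \sum_{j=1}^{n} w_{1j} x_j \\ \vdots \\ b_m + \sum_{j=1}^{n} w_{mj} x_j \end{pmatrix} \overset{\text{Esc}}{=} \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = b_\mu + w_{\mu j} x_j \\
y &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu\nu} x_\nu = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= y = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu j} x_j + b_\mu = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
y^{\intercal} &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y_\mu^{\intercal} = \left( w_{\mu\nu} x_\nu \right)^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \left[ \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right]^{\intercal} = \left[ W x \right]^{\intercal} \\
&= x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = x^{\intercal} W^{\intercal}, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= b_\mu^{\intercal} + x_j^{\intercal} w_{\mu j}^{\intercal} = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = x^{\intercal} W^{\intercal} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} = b_\mu^{\intercal} + x_j^{\intercal} w_{\mu j}^{\intercal} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal}
\end{aligned}
$$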
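$$
\begin{aligned}
W x &= \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \begin{pmatrix} \sum_{\nu=0}^{n} w_{0\nu} x_\nu \\ \sum_{\nu=0}^{n} w_{1\nu} x_\nu \\ \vdots \\ \sum_{\nu=0}^{n} w_{m\nu} x_\nu \end{pmatrix} \overset{\text{Esc}}{=} \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = w_{\mu\nu} x_\nu, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= W x = \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \begin{pmatrix} b_0 + \sum_{j=1}^{n} w_{0j} x_j \\ b_1 + \sum_{j=1}^{n} w_{1j} x_j \\ \vdots \\ b_m + \sum_{j=1}^{n} w_{mj} x_j \end{pmatrix} \overset{\text{Esc}}{=} \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = b_\mu + w_{\mu j} x_j \\
y &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu\nu} x_\nu = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= y = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu j} x_j + b_\mu = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
y^{\intercal} &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y_\mu^{\intercal} = \left( w_{\mu\nu} x_\nu \right)^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \left[ \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right]^{\intercal} = \left[ W x \right]^{\intercal} \\
&= x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = x_\nu^{\intercal} w_{\nu\mu} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = x^{\intercal} W^{\intercal} \\
&= b_\mu^{\intercal} + x_j^{\intercal} w_{\mu j}^{\intercal} = b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = x_\nu^{\intercal} w_{\nu\mu} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} = b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal}
\end{aligned}
$$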
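$$
\begin{aligned}
y &= \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu\nu} x_\nu = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= y = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = y_\mu = w_{\mu j} x_j + b_\mu = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
x^{\intercal} W^{\intercal} &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} = x_\nu^{\intercal} w_{\nu\mu} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal} \\
&= x^{\intercal} W^{\intercal} = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} = b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} = y_\mu^{\intercal} = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} = y^{\intercal}
\end{aligned}
$$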
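$$
\begin{aligned}
\sigma(y) &= \sigma \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma(y_\mu) = \sigma(w_{\mu\nu} x_\nu) = \sigma \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \sigma\left( \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right) = \sigma(W x), \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= \sigma(y) = \sigma \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma(y_\mu) = \sigma(w_{\mu j} x_j + b_\mu) = \sigma \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \sigma\left( \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right) = \sigma(W x), \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
\left( x^{\intercal} W^{\intercal} \right) \varsigma &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} \varsigma = \left( x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \right) \varsigma = \left( x_\nu^{\intercal} w_{\nu\mu} \right) \varsigma = \left( y_\mu^{\intercal} \right) \varsigma = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma = \left( y^{\intercal} \right) \varsigma \\
&= \left( x^{\intercal} W^{\intercal} \right) \varsigma = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} \varsigma = \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma = \left( y_\mu^{\intercal} \right) \varsigma = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma = \left( y^{\intercal} \right) \varsigma
\end{aligned}
$$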
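$$
\begin{aligned}
\sigma(y) &= \sigma \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma(y_\mu) = \sigma(w_{\mu\nu} x_\nu) = \sigma \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \sigma\left( \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right) = \sigma(W x), \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= \sigma y = \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu w_{\mu\nu} x_\nu = \sigma_\mu \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \sigma \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= \sigma(y) = \sigma \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma(y_\mu) = \sigma(w_{\mu j} x_j + b_\mu) = \sigma \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \sigma\left( \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} \right) = \sigma(W x), \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
&= \sigma y = \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu \left( w_{\mu j} x_j + b_\mu \right) = \sigma_\mu \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \sigma \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
\left( x^{\intercal} W^{\intercal} \right) \varsigma &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} \varsigma = \left( x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \right) \varsigma = \left( x_\nu^{\intercal} w_{\nu\mu} \right) \varsigma = \left( y_\mu^{\intercal} \right) \varsigma = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma = \left( y^{\intercal} \right) \varsigma \\
&= x^{\intercal} W^{\intercal} \varsigma = \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} \varsigma = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \varsigma_\mu = x_\nu^{\intercal} w_{\nu\mu} \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = y^{\intercal} \varsigma \\
&= \left( x^{\intercal} W^{\intercal} \right) \varsigma = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} \varsigma = \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma = \left( y_\mu^{\intercal} \right) \varsigma = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma = \left( y^{\intercal} \right) \varsigma \\
&= x^{\intercal} W^{\intercal} \varsigma = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} \varsigma = \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = y^{\intercal} \varsigma
\end{aligned}
$$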
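$$
\begin{aligned}
\sigma y &= \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu w_{\mu\nu} x_\nu = \sigma_\mu \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \sigma \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= \sigma y = \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu \left( w_{\mu j} x_j + b_\mu \right) = \sigma_\mu \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \sigma \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
x^{\intercal} W^{\intercal} \varsigma &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} \varsigma = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \varsigma_\mu = x_\nu^{\intercal} w_{\nu\mu} \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = y^{\intercal} \varsigma \\
&= x^{\intercal} W^{\intercal} \varsigma = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} \varsigma = \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = y^{\intercal} \varsigma
\end{aligned}
$$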
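$$
\begin{aligned}
z &= \sigma y = \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu w_{\mu\nu} x_\nu = \sigma_\mu \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix} = \sigma \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= z = z_\mu = \sigma_\mu \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix} = \sigma_\mu y_\mu = \sigma_\mu \left( w_{\mu j} x_j + b_\mu \right) = \sigma_\mu \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix} = \sigma \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix} \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix} = \sigma W x, \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
x^{\intercal} W^{\intercal} \varsigma &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} w_{0\nu} x_\nu \\ w_{1\nu} x_\nu \\ \vdots \\ w_{m\nu} x_\nu \end{pmatrix}^{\intercal} \varsigma = x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \varsigma_\mu = x_\nu^{\intercal} w_{\nu\mu} \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = y^{\intercal} \varsigma = z^{\intercal} \\
&= x^{\intercal} W^{\intercal} \varsigma = \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_n \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & w_{01} & \cdots & w_{0n} \\ b_1 & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ b_m & w_{m1} & \cdots & w_{mn} \end{pmatrix}^{\intercal} \varsigma = \begin{pmatrix} b_0 + w_{0j} x_j \\ b_1 + w_{1j} x_j \\ \vdots \\ b_m + w_{mj} x_j \end{pmatrix}^{\intercal} \varsigma = \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = \begin{pmatrix} y_0 \\ y_1 \\ \vdots \\ y_m \end{pmatrix}^{\intercal} \varsigma_\mu = z_\mu^{\intercal} = z^{\intercal}
\end{aligned}
$$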
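$$
\begin{aligned}
z &= \sigma y = \sigma_\mu w_{\mu\nu} x_\nu = \sigma W x, \quad \begin{cases} x_0 = 1 \\ w_{\mu 0} = b_\mu \end{cases} \\
&= z_\mu = \sigma_\mu y_\mu = \sigma_\mu \left( w_{\mu j} x_j + b_\mu \right), \quad \begin{cases} 1 = x_0 \\ b_\mu = w_{\mu 0} \end{cases} \\
x^{\intercal} W^{\intercal} \varsigma &= x_\nu^{\intercal} w_{\mu\nu}^{\intercal} \varsigma_\mu = x_\nu^{\intercal} w_{\nu\mu} \varsigma_\mu = y^{\intercal} \varsigma = z^{\intercal} \\
&= \left( b_\mu^{\intercal} + x_j^{\intercal} w_{j\mu} \right) \varsigma_\mu = y_\mu^{\intercal} \varsigma_\mu = z_\mu^{\intercal}
\end{aligned}
$$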
4-15
wrong or incompatible transpose
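$$
\begin{aligned}
x^{\intercal} W &= \begin{pmatrix} x_0 & x_1 & \cdots & x_m \end{pmatrix} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} = \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} = \begin{pmatrix} \sum_{\mu=0}^{m} x_\mu w_{\mu 0} \\ \sum_{\mu=0}^{m} x_\mu w_{\mu 1} \\ \vdots \\ \sum_{\mu=0}^{m} x_\mu w_{\mu n} \end{pmatrix}^{\intercal} \\
&\overset{\text{Einstein summation convention}}{=} \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} = \begin{pmatrix} x_\mu w_{\mu 0} \\ x_\mu w_{\mu 1} \\ \vdots \\ x_\mu w_{\mu n} \end{pmatrix}^{\intercal} = x_\mu^{\intercal} w_{\mu\nu} = \left( x_\mu w_{\mu\nu} \right)^{\intercal} \; ?
\end{aligned}
$$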
4-18
wrong or incompatible transpose
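$$
\begin{aligned}
x^{\intercal} W &= \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} = \begin{pmatrix} x_\mu w_{\mu 0} \\ x_\mu w_{\mu 1} \\ \vdots \\ x_\mu w_{\mu n} \end{pmatrix}^{\intercal}, \quad \begin{cases} x_0 = 1 \\ w_{0\nu} = b_\nu \end{cases} \\
&= \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & b_1 & \cdots & b_n \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} = \begin{pmatrix} x_i w_{i0} + b_0 \\ x_i w_{i1} + b_1 \\ \vdots \\ x_i w_{in} + b_n \end{pmatrix}^{\intercal}, \quad \begin{cases} 1 = x_0 \\ b_\nu = w_{0\nu} \end{cases}
\end{aligned}
$$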
wrong or incompatible transpose
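$$
\begin{aligned}
\sigma\left( x^{\intercal} W \right) &= \sigma\left( \begin{pmatrix} x_0 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} w_{00} & w_{01} & \cdots & w_{0n} \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \right) = \sigma\left( \begin{pmatrix} x_\mu w_{\mu 0} \\ x_\mu w_{\mu 1} \\ \vdots \\ x_\mu w_{\mu n} \end{pmatrix}^{\intercal} \right) = \begin{pmatrix} \sigma_0\left( x_\mu w_{\mu 0} \right) \\ \sigma_1\left( x_\mu w_{\mu 1} \right) \\ \vdots \\ \sigma_n\left( x_\mu w_{\mu n} \right) \end{pmatrix}^{\intercal}, \quad \begin{cases} x_0 = 1 \\ w_{0\nu} = b_\nu \end{cases} \\
&= \sigma\left( \begin{pmatrix} 1 \\ x_1 \\ \vdots \\ x_m \end{pmatrix}^{\intercal} \begin{pmatrix} b_0 & b_1 & \cdots & b_n \\ w_{10} & w_{11} & \cdots & w_{1n} \\ \vdots & \vdots & \ddots & \vdots \\ w_{m0} & w_{m1} & \cdots & w_{mn} \end{pmatrix} \right) = \sigma\left( \begin{pmatrix} x_i w_{i0} + b_0 \\ x_i w_{i1} + b_1 \\ \vdots \\ x_i w_{in} + b_n \end{pmatrix}^{\intercal} \right) = \sigma_\nu\left( x_\mu w_{\mu\nu} \right), \quad \begin{cases} 1 = x_0 \\ b_\nu = w_{0\nu} \end{cases}
\end{aligned}
$$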