using LinearAlgebra, Statistics, Roots, Optim, Plots, Random; Random.seed!(1234)
n = 1000 # number of samples to draw
# the true regression function
r(x) = 2 + 1/50*x*(30-x)
# the true density function
σy = 3/2
f(x,y) = 3/4000 * 1/sqrt(2π*σy^2) * x*(20-x)*exp(-1/(2σy^2)*(y-r(x))^2)
# x values and y values for a grid
xs = 0:1/2^3:20
ys = 0:1/2^3:10
# F(t) is the CDF of X
F(t) = -t^3/4000 + 3t^2/400
# Inverse CDF sampling
function sampleX()
    U = rand()
    find_zero(t->F(t)-U,(0,20),Bisection())
end
function sampleXY(r,σ)
    X = sampleX()
    Y = r(X) + σ*randn()
    (X,Y)
end
samples = [sampleXY(r,σy) for i=1:n]
D(u) = abs(u) < 1 ? 70/81*(1-abs(u)^3)^3 : 0 # tri-cube function
D(λ,u) = 1/λ*D(u/λ) # scaled tri-cube
K(λ,x,y) = D(λ,x) * D(λ,y) # kernel
kde(λ,x,y,samples) = sum(K(λ,x-Xi,y-Yi) for (Xi,Yi) in samples)/length(samples)
# `optimize` takes functions which accept vector arguments, so we treat
# λ as a one-entry vector
L(λ) = sum((f(x,y) - kde(λ,x,y,samples))^2 for x=xs,y=ys)*step(xs)*step(ys)
# minimize L using the BFGS method
λ_best = optimize(λ->L(first(λ)),[1.0],BFGS())
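As a quick visual check (a sketch using the names defined above; λ_opt is a new, assumed name), the bandwidth found by minimizing L can be plugged back into the estimator and the result compared with the true density:

λ_opt = first(λ_best.minimizer)  # bandwidth chosen against the true density
plot(heatmap(xs, ys, [kde(λ_opt, x, y, samples) for y=ys, x=xs], title="estimate"),
     heatmap(xs, ys, [f(x, y) for y=ys, x=xs], title="truth"))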
"Evaluate the summation Σᵢ f⁽⁻ⁱ⁾(Xᵢ,Yᵢ) in J(λ)'s second term" function kdeCV(λ,i,samples) x,y = samples[i] newsamples = copy(samples) deleteat!(newsamples,i) kde(λ,x,y,newsamples) end # first line approximates ∫f̂², the second line approximates -(2/n)∫f̂f J(λ) = sum([kde(λ,x,y,samples)^2 for x=xs,y=ys])*step(xs)*step(ys) - 2/length(samples)*sum(kdeCV(λ,i,samples) for i=1:length(samples)) λ_best_cv = optimize(λ->J(first(λ)),[1.0],BFGS())
λ = first(λ_best_cv.minimizer)
r̂(x) = sum(D(λ,x-Xi)*Yi for (Xi,Yi) in samples)/sum(D(λ,x-Xi) for (Xi,Yi) in samples)
plot(r̂,0,20)
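To judge the fit by eye, the sample scatter and the true regression function can be overlaid on the kernel-regression curve (sketch):

scatter!(samples, markersize=2, label="samples")
plot!(r, 0, 20, label="true regression function")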
y = [Y for (X,Y) in samples]
X = [ones(n) [X for (X,Y) in samples]]
β = (X'*X) \ X'*y # (computes (X'X)⁻¹X'y)
scatter(samples)
plot!(xs,[β⋅[1,x] for x in xs])
y = [Y for (X,Y) in samples]
X = [ones(n) [X for (X,Y) in samples] [X^2 for (X,Y) in samples]]
βq = (X'*X) \ X'*y
scatter(samples)
plot!(xs,[βq⋅[1,x,x^2] for x in xs])
quaderr = sum((r(x)-βq⋅[1,x,x^2])^2 for x in xs)*step(xs)
linerr = sum((r(x)-β⋅[1,x])^2 for x in xs)*step(xs)
using LinearAlgebra, Plots, StatsBase, Random; Random.seed!(1234) # LinearAlgebra provides det and inv
struct Normal
    μ::Vector
    Σ::Array
end
struct Flower
    X::Vector
    color::String
end
# density function for the normal distribution N
f(x,N::Normal) = 1/(2π*sqrt(det(N.Σ))) * exp(-1/2*((x-N.μ)'*inv(N.Σ)*(x-N.μ)))
xs = 0:1/2^4:15
ys = 0:1/2^4:15
As = [[1.5 -1; 0 1],[1/2 1/4; 0 1/2],[2 0; 0 2]]
μs = [[9,5],[4,10],[7,9]]
Ns = [Normal(μ,A*A') for (μ,A) in zip(μs,As)]
p = Weights([1/3,1/6,1/2])
colors = ["red","green","blue"]
function randflower(μs,As)
    i = sample(p)
    Flower(As[i]*randn(2)+μs[i],colors[i])
end
flowers = [randflower(μs,As) for i=1:300]
classify(x,p,Ns) = argmax([p[i]*f(x,Ns[i]) for i=1:3])
rgb = [:red,:green,:blue] # defined at top level so the scatter call below can use it too
function classificationplot(flowers,p,Ns)
    P = heatmap(xs,ys,[classify([x,y],p,Ns) for x=xs,y=ys],
                fillcolor=cgrad(rgb),opacity=0.4,transpose=true)
    for c in ["red","green","blue"]
        scatter!(P,[(F.X[1],F.X[2]) for F in flowers if F.color == c],color=c)
    end
    plot!(aspect_ratio=:equal,legend=false)
    P
end
scatter([(F.X[1],F.X[2]) for F in flowers],
        group=[F.color for F in flowers],
        color_palette=rgb,aspect_ratio=:equal,legend=false)
correct(flowers,p,Ns) = count(colors[classify(F.X,p,Ns)] == F.color for F in flowers)
classificationplot(flowers,p,Ns)
function mvn_estimate(flowers,color)
    flowers_subset = [F.X for F in flowers if F.color == color]
    m̂ = mean(flowers_subset)
    Ŝ = mean([(X - m̂)*(X - m̂)' for X in flowers_subset])
    Normal(m̂,Ŝ)
end
colorcounts = countmap([F.color for F in flowers])
p̂ = [colorcounts[c]/length(flowers) for c in colors]
N̂s = [mvn_estimate(flowers,c) for c in colors]
classificationplot(flowers,p̂,N̂s)
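As a rough sanity check (illustrative, reusing `correct` from above), the proportions of flowers classified correctly under the true parameters and under the estimated ones can be compared:

correct(flowers,p,Ns)/length(flowers), correct(flowers,p̂,N̂s)/length(flowers)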
using LinearAlgebra, Plots, Distributions, Optim
mycgrad = cgrad([:MidnightBlue,:SeaGreen,:Gold,:Tomato])
gr(aspect_ratio=1,fillcolor=mycgrad) # Plots.jl default settings
A = MvNormal([0,0],[1.0 0; 0 1])
B = MvNormal([1,1],[1.0 0; 0 1])
xs = -5:1/2^5:5
ys = -5:1/2^5:5
r(x,y) = 0.5pdf(B,[x,y])/(0.5pdf(A,[x,y])+0.5pdf(B,[x,y]))
rs = [r(x,y) for x=xs,y=ys]
heatmap(xs,ys,rs)
samples = [rand(Bool) ? (rand(A),0) : (rand(B),1) for i=1:1000]
cs = [c for ((x,y),c) in samples]
scatter([(x,y) for ((x,y),c) in samples],group=cs)
σ(u) = 1/(1 + exp(-u))
r(β,x) = σ(β⋅[1;x])
C(β,xᵢ,yᵢ) = yᵢ*log(1/r(β,xᵢ))+(1-yᵢ)*log(1/(1-r(β,xᵢ)))
L(β) = sum(C(β,xᵢ,yᵢ) for (xᵢ,yᵢ) in samples)
β̂ = optimize(L,ones(3),BFGS()).minimizer
r̂s = [r(β̂,[x,y]) for x=xs,y=ys]
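The fitted probabilities can be displayed the same way as the exact posterior probabilities rs computed earlier, and the two heatmaps should look broadly similar (sketch):

heatmap(xs,ys,r̂s)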
using LinearAlgebra, Statistics, Plots, JuMP, Ipopt, Random; Random.seed!(1234); gr(aspect_ratio=1)
Plots.GRBackend()
function samplepoint(μ=[3,3])
    class = rand([-1,1])
    if class == -1
        X = randn(2)
    else
        X = μ + [1 -1/2; -1/2 1] * randn(2)
    end
    (X,class)
end
n = 100
samples = [samplepoint() for i=1:n]
ys = [y for (x,y) in samples]
scatter([(x₁,x₂) for ((x₁,x₂),y) in samples],group=ys)
m = Model(solver=IpoptSolver(print_level=0))
@variable(m,β[1:2]) # adds β[1] and β[2] at the same time
@variable(m,α)
for (x,y) in samples
    @constraint(m,y*(β⋅x - α) ≥ 1)
end
@objective(m,Min,β[1]^2+β[2]^2)
β[1]² + β[2]²
solve(m)
β,α = getvalue(β), getvalue(α)
l(x₁) = (α - β[1]*x₁)/β[2]
xs = [-2,5]
plot!(xs,[l(x₁) for x₁ in xs],label="")
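Since this training set is linearly separable, the optimal-margin constraints should all be satisfied; a quick check (sketch) is that the smallest value of y(β⋅x - α) over the samples is at least 1, up to solver tolerance:

minimum(y*(β⋅x - α) for (x,y) in samples)  # should be ≥ 1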
samples = [samplepoint([1,1]) for i=1:n]
ys = [y for (x,y) in samples]
scatter([(x₁,x₂) for ((x₁,x₂),y) in samples],group=ys)
L(λ,β,α,samples) = λ*norm(β)^2 + 1/n*sum(max(0,1-y*(β⋅x - α)) for (x,y) in samples)
L(λ,params,samples) = L(λ,params[1:end-1],params[end],samples)
using Optim
function SVM(λ,samples)
    params = optimize(params->L(λ,params,samples),ones(3),BFGS()).minimizer
    params[1:end-1], params[end]
end
SVM (generic function with 1 method)
# number of misclassified samples
function errorrate(β,α,samples)
    count(y*(β⋅x-α) < 0 for (x,y) in samples)
end
# hold out sample i, train on the rest, and check whether sample i is misclassified
function CV(λ,samples,i)
    β,α = SVM(λ,[samples[1:i-1];samples[i+1:end]])
    x,y = samples[i]
    y*(β⋅x - α) < 0
end
# leave-one-out estimate of the misclassification probability
function CV(λ,samples)
    mean(CV(λ,samples,i) for i=1:length(samples))
end
CV (generic function with 2 methods)
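For example (the two λ values below are illustrative), the leave-one-out misclassification estimates for a weakly and a strongly regularized classifier can be compared directly:

CV(0.01, samples), CV(0.25, samples)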
λ₁, λ₂ = 0.001, 0.25
λ = optimize(λ->CV(first(λ),samples), λ₁,λ₂).minimizer
β,α = SVM(λ,samples)
l(x₁) = (α - β[1]x₁)/β[2]
xs = [-1.5,2]
plot!(xs,[l(x₁) for x₁ in xs],label="")
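With `errorrate` from above, the in-sample misclassification rate of the fitted soft-margin classifier is (illustrative check):

errorrate(β,α,samples)/length(samples)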
using LinearAlgebra, Statistics, StatsBase # for Diagonal, mean, norm, and sample below

struct AffineMap
    W::Matrix
    b::Vector
end
struct NeuralNet
    maps::Vector{AffineMap}
    K::Function # activation function
    K̇::Function # derivative of K
end
(A::AffineMap)(x) = A.W * x + A.b
(N::NeuralNet)(x) = forwardprop(N,x)[end]
architecture(N::NeuralNet) = [[size(A.W,2) for A in N.maps]; size(last(N.maps).W,1)]
function forwardprop(N::NeuralNet,x)
    vectors = [x]
    for (j,A) in enumerate(N.maps)
        push!(vectors,A(vectors[end]))
        push!(vectors,(j < length(N.maps) ? N.K : identity).(vectors[end]))
    end
    vectors
end
forwardprop (generic function with 1 method)
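A tiny worked example (the weights and activation below are hypothetical, chosen only to illustrate the bookkeeping): for a network with two affine maps, `forwardprop` returns five vectors, namely the input, the first affine output, its activation, the second affine output, and, since it is the last layer, the identity applied to it.

toy = NeuralNet([AffineMap([1.0 0.0; 0.0 1.0], [0.0, 0.0]),
                 AffineMap([1.0 1.0], [0.0])],
                u -> max(u, 0), u -> u > 0 ? 1.0 : 0.0)  # ReLU and its derivative
forwardprop(toy, [1.0, -2.0])  # [[1.0,-2.0], [1.0,-2.0], [1.0,0.0], [1.0], [1.0]]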
greennode(j) = 2j-1 # index in `vectors` of the input to the j-th affine map
purplenode(j) = 2j  # index in `vectors` of the j-th affine map's output
function backprop(N::NeuralNet,vectors,yᵢ)
    grads = [2(vectors[end]-yᵢ)'] # derivative of the squared-error loss at the output
    push!(grads,grads[end])
    for j = length(N.maps) : -1 : 1
        push!(grads,grads[end] * N.maps[j].W)
        j > 1 && push!(grads,grads[end] * Diagonal(N.K̇.(vectors[purplenode(j-1)])))
    end
    reverse(grads)
end
backprop (generic function with 1 method)
function weight_gradients(N::NeuralNet,vectors,grads)
    [grads[purplenode(j)]' * vectors[greennode(j)]' for j=1:length(N.maps)]
end
function bias_gradients(N::NeuralNet,vectors,grads)
    [grads[purplenode(j)]' for j=1:length(N.maps)]
end
bias_gradients (generic function with 1 method)
function averageweightΔ(N::NeuralNet,samples,batch,ϵ)
    arch = architecture(N)
    layers = length(arch)-1
    sum_Δweight = [zeros(arch[i+1],arch[i]) for i=1:layers]
    sum_Δbias = [zeros(arch[i+1]) for i=1:layers]
    for k in batch
        x, y = samples[k]
        vectors = forwardprop(N,x)
        grads = backprop(N,vectors,y)
        ΔWs = weight_gradients(N,vectors,grads)
        Δbs = bias_gradients(N,vectors,grads)
        for i=1:layers
            sum_Δweight[i] -= ϵ*ΔWs[i]
            sum_Δbias[i] -= ϵ*Δbs[i]
        end
    end
    (sum_Δweight, sum_Δbias) ./ length(batch)
end
averageweightΔ (generic function with 1 method)
function train(arch,K,K̇,samples,batchsize,ϵ = 0.1,iterations=1000)
    random_maps = [AffineMap(randn(arch[i+1],arch[i]),
                             fill(0.1,arch[i+1])) for i=1:length(arch)-1]
    N = NeuralNet(random_maps,K,K̇)
    for i=1:iterations
        batch = sample(1:length(samples),batchsize)
        meanΔweight, meanΔbias = averageweightΔ(N,samples,batch,ϵ)
        N = NeuralNet([AffineMap(A.W .+ ΔW, A.b .+ Δb)
                       for (A,ΔW,Δb) in zip(N.maps,meanΔweight,meanΔbias)],K,K̇)
    end
    N
end
train (generic function with 3 methods)
K(x) = x > 0 ? x : 0
K̇(x) = x > 0 ? 1 : 0
cost(N::NeuralNet,samples) = mean(norm(N(x)-y)^2 for (x,y) in samples)
using Random; Random.seed!(123)
xs = [rand(2) for i=1:1000]
ys = [[1-norm(x)^2] for x in xs]
samples = collect(zip(xs,ys));
arch = [2,5,1]
batchsize = 100
ϵ = 0.005
N = train(arch,K,K̇,samples,batchsize,ϵ,10_000)
cost(N,samples) # returns 0.00343
xgrid = 0:1/2^8:1
ygrid = 0:1/2^8:1
zs = [first(N([x,y])) for x=xgrid,y=ygrid]
plotly(); surface(xgrid,ygrid,zs)
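As a final spot check (illustrative), the trained network's output at a test point can be compared with the target value 1 - |x|²:

x_test = [0.3, 0.4]
first(N(x_test)), 1 - norm(x_test)^2  # the two values should be close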