using LinearAlgebra, Statistics, Roots, Optim, Plots, Random; Random.seed!(1234)
n = 1000 # number of samples to draw

# the true regression function
r(x) = 2 + 1/50*x*(30-x)
# the true density function
σy = 3/2
f(x,y) = 3/4000 * 1/sqrt(2π*σy^2) * x*(20-x)*exp(-1/(2σy^2)*(y-r(x))^2)

# x values and y values for a grid
xs = 0:1/2^3:20
ys = 0:1/2^3:10

# F(t) is the CDF of X
F(t) = -t^3/4000 + 3t^2/400
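# F comes from integrating the marginal density of X, f_X(x) = ∫f(x,y)dy = 3/4000*x*(20-x):
# F(t) = ∫₀ᵗ 3/4000*x*(20-x) dx = 3t²/400 - t³/4000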
# Inverse CDF sampling
function sampleX()
    U = rand()
    find_zero(t->F(t)-U,(0,20),Bisection())
end

function sampleXY(r,σ)
    X = sampleX()
    Y = r(X) + σ*randn()
    (X,Y)
end

samples = [sampleXY(r,σy) for i=1:n]

D(u) = abs(u) < 1 ? 70/81*(1-abs(u)^3)^3 : 0 # tri-cube function
D(λ,u) = 1/λ*D(u/λ) # scaled tri-cube
K(λ,x,y) = D(λ,x) * D(λ,y) # kernel
kde(λ,x,y,samples) = sum(K(λ,x-Xi,y-Yi) for (Xi,Yi) in samples)/length(samples)

# `optimize` takes functions which accept vector arguments, so we treat
# λ as a one-entry vector
L(λ) = sum((f(x,y) - kde(λ,x,y,samples))^2 for x=xs,y=ys)*step(xs)*step(ys)

# minimize L using the BFGS method
λ_best = optimize(λ->L(first(λ)),[1.0],BFGS())
"Evaluate the summation Σᵢ f⁽⁻ⁱ⁾(Xᵢ,Yᵢ) in J(λ)'s second term"
function kdeCV(λ,i,samples)
    x,y = samples[i]
    newsamples = copy(samples)
    deleteat!(newsamples,i)
    kde(λ,x,y,newsamples)
end

# the first line approximates ∫f̂²; the second is a leave-one-out estimate of 2∫f̂f,
# so J(λ) differs from the loss ∫(f̂-f)² only by the λ-independent constant ∫f²
J(λ) = sum([kde(λ,x,y,samples)^2 for x=xs,y=ys])*step(xs)*step(ys) -
                2/length(samples)*sum(kdeCV(λ,i,samples) for i=1:length(samples))
λ_best_cv = optimize(λ->J(first(λ)),[1.0],BFGS())
λ = first(λ_best_cv.minimizer)
# Nadaraya–Watson estimate: a locally weighted average of the Yᵢ's, with tri-cube
# weights centered at x
r̂(x) = sum(D(λ,x-Xi)*Yi for (Xi,Yi) in samples)/sum(D(λ,x-Xi) for (Xi,Yi) in samples)
plot(r̂,0,20)
y = [Y for (X,Y) in samples]
X = [ones(n) [X for (X,Y) in samples]]
β = (X'*X) \ X'*y # (computes (X'X)⁻¹X'y)
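# equivalently, β = X \ y solves the same least-squares problem via a QR factorization,
# which is more numerically stable than forming X'X explicitly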
scatter(samples)
plot!(xs,[β⋅[1,x] for x in xs])
y = [Y for (X,Y) in samples]
X = [ones(n) [X for (X,Y) in samples] [X^2 for (X,Y) in samples]]
βq = (X'*X) \ X'*y
scatter(samples)
plot!(xs,[βq⋅[1,x,x^2] for x in xs])
quaderr = sum((r(x)-βq⋅[1,x,x^2])^2 for x in xs)*step(xs)
linerr = sum((r(x)-β⋅[1,x])^2 for x in xs)*step(xs)
using Plots, StatsBase, LinearAlgebra, Statistics, Random; Random.seed!(1234)
struct Normal
    μ::Vector
    Σ::Array
end
struct Flower
    X::Vector
    color::String
end
# density function for the normal distribution N
f(x,N::Normal) = 1/(2π*sqrt(det(N.Σ))) * exp(-1/2*((x-N.μ)'*inv(N.Σ)*(x-N.μ)))
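# the general d-dimensional normal density has prefactor 1/((2π)^(d/2)*sqrt(det(Σ)));
# the expression above is the d = 2 special case, since (2π)^(2/2) = 2π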
xs = 0:1/2^4:15
ys = 0:1/2^4:15
As = [[1.5 -1; 0 1],[1/2 1/4; 0 1/2], [2 0; 0 2]]
μs = [[9,5],[4,10],[7,9]]
Ns = [Normal(μ,A*A') for (μ,A) in zip(μs,As)]
p = Weights([1/3,1/6,1/2])
colors = ["red","green","blue"]
function randflower(μs,As)
    i = sample(p)
    Flower(As[i]*randn(2)+μs[i],colors[i])
end
flowers = [randflower(μs,As) for i=1:300]
classify(x,p,Ns) = argmax([p[i]*f(x,Ns[i]) for i=1:3])
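# classify is the Bayes classifier for known class densities: pick the class whose
# posterior, proportional to (prior weight) × (class density at x), is largest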
function classificationplot(flowers,p,Ns)
    rgb = [:red,:green,:blue]
    P = heatmap(xs,ys,[classify([x,y],p,Ns) for x=xs,y=ys],
                fillcolor=cgrad(rgb),opacity=0.4,transpose=true)
    for c in ["red","green","blue"]
        scatter!(P,[(F.X[1],F.X[2]) for F in flowers if F.color ==c],color=c)
    end
    plot!(aspect_ratio=:equal,legend=false)
    P
end
scatter([(F.X[1],F.X[2]) for F in flowers],
        group=[F.color for F in flowers],
        color_palette=[:red,:green,:blue],aspect_ratio=:equal,legend=false)
correct(flowers,p,Ns) = count(colors[classify(F.X,p,Ns)] == F.color for F in flowers)
classificationplot(flowers,p,Ns)
function mvn_estimate(flowers,color)
    flowers_subset = [F.X for F in flowers if F.color == color]
    μ̂ = mean(flowers_subset)
    Σ̂ = mean([(X - μ̂)*(X - μ̂)' for X in flowers_subset])
    Normal(μ̂,Σ̂)
end
colorcounts = countmap([F.color for F in flowers])
p̂ = [colorcounts[c]/length(flowers) for c in colors]
N̂s = [mvn_estimate(flowers,c) for c in colors]
classificationplot(flowers,p̂,N̂s)
using Plots, Distributions, Optim, LinearAlgebra
mycgrad = cgrad([:MidnightBlue,:SeaGreen,:Gold,:Tomato])
gr(aspect_ratio=1,fillcolor=mycgrad) # Plots.jl default settings
A = MvNormal([0,0],[1.0 0; 0 1])
B = MvNormal([1,1],[1.0 0; 0 1])
xs = -5:1/2^5:5
ys = -5:1/2^5:5
r(x,y) = 0.5pdf(B,[x,y])/(0.5pdf(A,[x,y])+0.5pdf(B,[x,y]))
rs = [r(x,y) for x=xs,y=ys]
heatmap(xs,ys,rs)
samples = [rand(Bool) ? (rand(A),0) : (rand(B),1) for i=1:1000]
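# rand(Bool) assigns each point to class A or B with probability 1/2, matching the
# equal prior weights of 0.5 used in the regression function r above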
cs =  [c for ((x,y),c) in samples]
scatter([(x,y) for ((x,y),c) in samples],group=cs)
σ(u) = 1/(1 + exp(-u))
r(β,x) = σ(β⋅[1;x])
C(β,xᵢ,yᵢ) = yᵢ*log(1/r(β,xᵢ))+(1-yᵢ)*log(1/(1-r(β,xᵢ)))
L(β) = sum(C(β,xᵢ,yᵢ) for (xᵢ,yᵢ) in samples)
β̂ = optimize(L,ones(3),BFGS()).minimizer
r̂s = [r(β̂,[x,y]) for x=xs,y=ys]
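# a quick sanity check (not part of the listing above): training accuracy of the fitted
# logistic model, labeling a point as class 1 whenever the estimated probability exceeds 1/2
mean((r(β̂,x) > 0.5) == (y == 1) for (x,y) in samples)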
using Plots, JuMP, Ipopt, LinearAlgebra, Statistics, Random; Random.seed!(1234);
gr(aspect_ratio=1)
function samplepoint(μ=[3,3])
    class = rand([-1,1])
    if class == -1
        X = randn(2)
    else
        X = μ + [1 -1/2; -1/2 1] * randn(2)
    end
    (X,class)
end
n = 100
samples = [samplepoint() for i=1:n]
ys = [y for (x,y) in samples]
scatter([(x₁,x₂) for ((x₁,x₂),y) in samples],group=ys)
m = Model(solver=IpoptSolver(print_level=0))
@variable(m,β[1:2]) # adds β[1] and β[2] at the same time
@variable(m,α)
for (x,y) in samples
    @constraint(m,y*(β⋅x - α) ≥ 1)
end
@objective(m,Min,β[1]^2+β[2]^2)
solve(m)
β,α = getvalue(β), getvalue(α)
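# the decision boundary is the line β⋅x = α; solving β[1]x₁ + β[2]x₂ = α for x₂ gives
# the function l plotted below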
l(x₁) = (α - β[1]*x₁)/β[2]
xs = [-2,5]
plot!(xs,[l(x₁) for x₁ in xs],label="")
samples = [samplepoint([1,1]) for i=1:n]
ys = [y for (x,y) in samples]
scatter([(x₁,x₂) for ((x₁,x₂),y) in samples],group=ys)
L(λ,β,α,samples) = λ*norm(β)^2 + 1/n*sum(max(0,1-y*(β⋅x - α)) for (x,y) in samples)
L(λ,params,samples) = L(λ,params[1:end-1],params[end],samples)
using Optim
function SVM(λ,samples)
    params = optimize(params->L(λ,params,samples),ones(3),BFGS()).minimizer
    params[1:end-1], params[end]
end
function errorrate(β,α,samples)
    count(y*(β⋅x - α) < 0 for (x,y) in samples)
end
function CV(λ,samples,i)
    β,α = SVM(λ,[samples[1:i-1];samples[i+1:end]])
    x,y = samples[i]
    y*(β⋅x - α) < 0
end
function CV(λ,samples)
    mean(CV(λ,samples,i) for i=1:length(samples))
end
λ₁, λ₂ = 0.001, 0.25
λ = optimize(λ->CV(first(λ),samples),
             λ₁,λ₂).minimizer
β,α = SVM(λ,samples)
l(x₁) = (α - β[1]*x₁)/β[2]
xs = [-1.5,2]
plot!(xs,[l(x₁) for x₁ in xs],label="")
using LinearAlgebra, Statistics, StatsBase

struct AffineMap
    W::Matrix
    b::Vector
end
struct NeuralNet
    maps::Vector{AffineMap}
    K::Function # activation function
    K′::Function # derivative of K
end
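# functor syntax: instances become callable, so A(x) applies the affine map W*x + b
# and N(x) evaluates the whole network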
(A::AffineMap)(x) = A.W * x + A.b
(N::NeuralNet)(x) = forwardprop(N,x)[end]
architecture(N::NeuralNet) = [[size(A.W,2) for A in N.maps]; size(last(N.maps).W,1)]
function forwardprop(N::NeuralNet,x)
    vectors = [x]
    for (j,A) in enumerate(N.maps)
        push!(vectors,A(vectors[end]))
        push!(vectors,(j < length(N.maps) ? N.K : identity).(vectors[end]))
    end
    vectors
end
# indices into the list returned by forwardprop: vectors[greennode(j)] is the input
# to map j (a post-activation value), vectors[purplenode(j)] is its affine output
greennode(j) = 2j-1
purplenode(j) = 2j
function backprop(N::NeuralNet,vectors,yᵢ)
    # gradient of the cost ‖v - yᵢ‖² with respect to the network output
    grads = [2(vectors[end]-yᵢ)']
    # the final activation is the identity, so the last purple node has the same gradient
    push!(grads,grads[end])
    for j = length(N.maps) : -1 : 1
        # gradient with respect to the input of map j
        push!(grads,grads[end] * N.maps[j].W)
        # chain through the activation to reach the previous affine output
        j > 1 && push!(grads,grads[end] * Diagonal(N.K′.(vectors[purplenode(j-1)])))
    end
    reverse(grads)
end
function weight_gradients(N::NeuralNet,vectors,grads)
    [grads[purplenode(j)]' * vectors[greennode(j)]' for j=1:length(N.maps)]
end

function bias_gradients(N::NeuralNet,vectors,grads)
    [grads[purplenode(j)]' for j=1:length(N.maps)]
end
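# a minimal sanity check (not part of the original code; relu, Ncheck, x₀, y₀, costW₁,
# E, and h are names introduced here): compare the backprop gradient of the per-sample
# cost ‖N(x₀)-y₀‖² with respect to one weight against a centered finite difference
relu(u) = max(u,0.0)
relu′(u) = u > 0 ? 1.0 : 0.0
Ncheck = NeuralNet([AffineMap(randn(3,2),randn(3)), AffineMap(randn(1,3),randn(1))],
                   relu, relu′)
x₀, y₀ = randn(2), randn(1)
vectors₀ = forwardprop(Ncheck,x₀)
ΔW = weight_gradients(Ncheck,vectors₀,backprop(Ncheck,vectors₀,y₀))
# the same cost viewed as a function of the first weight matrix only
costW₁(W) = sum((NeuralNet([AffineMap(W,Ncheck.maps[1].b),Ncheck.maps[2]],relu,relu′)(x₀) .- y₀).^2)
h = 1e-6
E = zeros(3,2); E[1,1] = h
# the centered difference should typically agree with the backprop entry to several digits
isapprox((costW₁(Ncheck.maps[1].W + E) - costW₁(Ncheck.maps[1].W - E))/(2h), ΔW[1][1,1], rtol=1e-4)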
function averageweightΔ(N::NeuralNet,samples,batch,ϵ)
    arch = architecture(N)
    layers = length(arch)-1
    sum_Δweight = [zeros(arch[i+1],arch[i]) for i=1:layers]
    sum_Δbias = [zeros(arch[i+1]) for i=1:layers]
    for k in batch
        x, y = samples[k]
        vectors = forwardprop(N,x)
        grads = backprop(N,vectors,y)
        ΔWs = weight_gradients(N,vectors,grads)
        Δbs = bias_gradients(N,vectors,grads)
        for i=1:layers
            sum_Δweight[i] -= ϵ*ΔWs[i]
            sum_Δbias[i] -= ϵ*Δbs[i]
        end
    end
    (sum_Δweight, sum_Δbias) ./ length(batch)
end
# train the network by stochastic gradient descent on randomly sampled mini-batches
function train(arch,K,K′,samples,batchsize,ϵ = 0.1,iterations=1000)
    random_maps = [AffineMap(randn(arch[i+1],arch[i]),
                             fill(0.1,arch[i+1])) for i=1:length(arch)-1]
    N = NeuralNet(random_maps,K,K′)
    for i=1:iterations
        batch = sample(1:length(samples),batchsize)
        meanΔweight, meanΔbias = averageweightΔ(N,samples,batch,ϵ)
        N = NeuralNet([AffineMap(A.W .+ ΔW, A.b .+ Δb)
                       for (A,ΔW,Δb) in zip(N.maps,meanΔweight,meanΔbias)],K,K′)
    end
    N
end
K(x) = x > 0 ? x : 0
K′(x) = x > 0 ? 1 : 0
cost(N::NeuralNet,samples) = mean(norm(N(x)-y)^2 for (x,y) in samples)
using Random; Random.seed!(123)
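# training data: random points in the unit square with target z = 1 - x₁² - x₂²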
xs = [rand(2) for i=1:1000]
ys = [[1-norm(x)^2] for x in xs]
samples = collect(zip(xs,ys));
arch = [2,5,1]
batchsize = 100
ϵ = 0.005
N = train(arch,K,K′,samples,batchsize,ϵ,10_000)
cost(N,samples) # returns 0.00343
xgrid = 0:1/2^8:1
ygrid = 0:1/2^8:1
zs = [first(N([x,y])) for x=xgrid,y=ygrid]
plotly(); surface(xgrid,ygrid,zs)