From abb90ab9f15fe6b4522e35021660d97e9aa770e1 Mon Sep 17 00:00:00 2001 From: tndoan Date: Thu, 10 Dec 2020 09:52:57 -0500 Subject: [PATCH] Chapter 5 + 6 --- Chapter_05.ipynb | 649 +++++++++++++++++++++++++++++++++++++++++++++++ Chapter_06.ipynb | 359 ++++++++++++++++++++++++++ 2 files changed, 1008 insertions(+) create mode 100644 Chapter_05.ipynb create mode 100644 Chapter_06.ipynb diff --git a/Chapter_05.ipynb b/Chapter_05.ipynb new file mode 100644 index 0000000..cf07b6f --- /dev/null +++ b/Chapter_05.ipynb @@ -0,0 +1,649 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "using CSV\n", + "using DataFrames\n", + "using Plots\n", + "using StatsPlots\n", + "using GLM\n", + "using Statistics\n", + "using Distributions\n", + "using Random\n", + "using MultivariateStats" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "autoData = CSV.File(\"dataset/Auto.csv\"; missingstring=\"?\") |> DataFrame;" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "getMSEOfValidation (generic function with 1 method)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "function getMSEOfValidation(train, validation)\n", + " ols_1 = lm(@formula(mpg ~ horsepower), train);\n", + " ols_2 = lm(@formula(mpg ~ horsepower + horsepower^2), train);\n", + " ols_3 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3), train);\n", + " ols_4 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4), train);\n", + " ols_5 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5), train);\n", + " ols_6 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5 + horsepower^6), train);\n", + " ols_7 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5 + horsepower^6 + horsepower^7), train);\n", + " ols_8 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5 + horsepower^6 + horsepower^7 + horsepower^8), train);\n", + " ols_9 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5 + horsepower^6 + horsepower^7 + horsepower^8 + horsepower^9), train);\n", + " ols_10 = lm(@formula(mpg ~ horsepower + horsepower^2 + horsepower^3 + horsepower^4 + horsepower^5 + horsepower^6 + horsepower^7 + horsepower^8 + horsepower^9 + horsepower^10), train);\n", + "\n", + " all_models = [ols_1, ols_2, ols_3, ols_4, ols_5, ols_6, ols_7, ols_8, ols_9, ols_10];\n", + " result = []\n", + " for model in all_models\n", + " p = predict(model, validation)\n", + " mse = sum((validation[:mpg] - p).^2) / length(p)\n", + " push!(result, mse)\n", + " end\n", + " return result\n", + "end" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Figure 5.2\n", + "clean_autoData = dropmissing(autoData, :horsepower)\n", + "\n", + "indices = collect(1: nrow(clean_autoData))\n", + "shuffle!(indices)\n", + "\n", + "train = clean_autoData[indices[1:196], :]\n", + "validation = clean_autoData[indices[197:end], :]\n", + "\n", + "mse = getMSEOfValidation(train, validation)\n", + "p1 = plot(mse, xlabel=\"Degree of Polynomial\", ylabel=\"Mean Squared Error\", legend=false, markershape = :hexagon, lc=\"red\", markercolor = :red)\n", + "\n", + "p2 = plot(mse)\n", + "for i=1:10\n", + " shuffle!(indices)\n", + " train = clean_autoData[indices[1:196], :]\n", + " validation = clean_autoData[indices[197:end], :]\n", + " mse = getMSEOfValidation(train, validation)\n", + "\n", + " p2 = plot!(mse, xlabel=\"Degree of Polynomial\", ylabel=\"Mean Squared Error\", legend=false)\n", + "end\n", + "\n", + "plot(p1, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Figure 5.4\n", + "mse = zeros(10)\n", + "for i in 1:nrow(clean_autoData)\n", + " indices = collect(1:nrow(clean_autoData))\n", + " popat!(indices, i)\n", + " train = clean_autoData[indices, :]\n", + " validation = clean_autoData[[i], :]\n", + " mse = mse + getMSEOfValidation(train, validation)\n", + "end\n", + "\n", + "mse = mse / nrow(clean_autoData)\n", + "p1 = plot(mse, xlabel=\"Degree of Polynomial\", ylabel=\"Mean Squared Error\", legend=false, markershape = :hexagon, lc=\"blue\", markercolor = :blue, title=\"LOOCV\")\n", + "\n", + "############################\n", + "indices = collect(1:nrow(clean_autoData))\n", + "batch = nrow(clean_autoData) ÷ 10\n", + "n = nrow(clean_autoData)\n", + "\n", + "mse_k = zeros(10, 9)\n", + "for i in 1:9\n", + " shuffle!(indices)\n", + " for k in 1:10\n", + " sInd = (k - 1) * batch + 1\n", + " eInd = k * batch\n", + " if k == 10\n", + " eInd = n\n", + " end\n", + " validation = clean_autoData[indices[sInd:eInd], :]\n", + " rest = collect(1:n)\n", + " splice!(rest, collect(sInd:eInd))\n", + " train = clean_autoData[indices[rest], :]\n", + " mse_k[:, i] += getMSEOfValidation(train, validation)\n", + " end\n", + "end\n", + "mse_k = mse_k ./ 10\n", + "p2 = plot(mse_k, xlabel=\"Degree of Polynomial\", ylabel=\"Mean Squared Error\", legend=false, title=\"10-fold CV\")\n", + "\n", + "plot(p1, p2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.5.1", + "language": "julia", + "name": "julia-1.5" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.5.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Chapter_06.ipynb b/Chapter_06.ipynb new file mode 100644 index 0000000..52c953a --- /dev/null +++ b/Chapter_06.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "using CSV\n", + "using DataFrames\n", + "using Plots\n", + "using StatsPlots\n", + "using GLM\n", + "using Statistics\n", + "using Distributions\n", + "using Random\n", + "using MultivariateStats" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FullNormal(\n", + "dim: 2\n", + "μ: [0.0, 0.0]\n", + "Σ: [1.0 1.0; 1.0 2.0]\n", + ")\n" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "μ = [0, 0]\n", + "σ = [1.0 1.0; 1.0 2.0]\n", + "dist = Distributions.MvNormal(μ, σ)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2×100 Array{Float64,2}:\n", + " 1.90728 0.417492 0.514817 0.453653 … 0.991181 1.40246 0.892847\n", + " 0.710246 0.31368 -0.209396 -0.669405 1.54026 1.47478 1.03204" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample = rand(dist, 100) # the random samples are close to Figure 6.14" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PCA(indim = 2, outdim = 2, principalratio = 1.0)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = fit(PCA, sample)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proj = projection(p) # get principle component\n", + "x = sample[1, :]\n", + "y = sample[2, :]\n", + "\n", + "scatter(x, y)\n", + "first_pca = proj[:, 2] # principle components are sorted in descending order\n", + "second_pca = proj[:, 1]\n", + "plot!([-2, 2], [(2 * first_pca[1] / first_pca[2]), (-2 * first_pca[1] / first_pca[2])])\n", + "plot!([-2, 2], [(2 * second_pca[1] / second_pca[2]), (-2 * second_pca[1] / second_pca[2])])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.5.1", + "language": "julia", + "name": "julia-1.5" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.5.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}