Codes for chapters 1, 2, and 3

2021-12-29 14:18:24 +01:00 · 2021-12-29 14:18:24 +01:00 · b82be9e882
commit b82be9e882
parent f4c3f0f754
6 changed files with 1899 additions and 2 deletions
--- a/Manifest.toml
+++ b/Manifest.toml
--- a/Project.toml
+++ b/Project.toml
@ -0,0 +1,8 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
--- a/README.md
+++ b/README.md
@ -1,2 +1,35 @@
-# JuliaForDataAnalysis
-Codes for the book "Julia for Data Analysis"
+This repository contains source codes for the "Julia for Data Analysis" book
+that is written by Bogumił Kamiński and is planned to be published in 2022 by
+[Manning Publications Co.](https://www.manning.com/).
+
+In order to prepare the Julia environment before working with the materials
+presented in the book please perform the following setup steps:
+* [download](https://julialang.org/downloads/) and
+  [install](https://julialang.org/downloads/platform/)
+  [Julia](https://julialang.org/);
+  all the codes were tested under Julia 1.7;
+* make sure you can start Julia by running the `julia` command in your system
+  shell (alternative ways to use Julia are described in Appendix A to the book);
+* download [this repository](https://github.com/bkamins/JuliaForDataAnalysis)
+  to a local folder on your computer;
+* start Julia in a folder containing the downloaded material using the command
+  `julia --project`; the folder must
+  contain the Project.toml and Manifest.toml files prepared for this book
+  (an explanation of what these files do and why they are required is given in
+   Appendix A to the book);
+* press *]*, write `instantiate` and press *Enter* (this process will ensure
+  that Julia properly configures the working environment for working with
+  the codes from the book);
+* press *Backspace*, write `exit()` and press *Enter*; this exits Julia, and
+  everything is now set up to work with the materials presented in the book.
+
+The codes for each chapter are stored in files named *chXX.jl*, where *XX* is
+the chapter number.
+
+To work with codes from some given chapter:
+* start a fresh Julia session using the `julia --project` command in a folder
+  containing the downloaded material;
+* execute the commands sequentially as they appear in the file;
+  the codes were prepared in a way that you do not need to restart Julia
+  when working with material from a single chapter, unless it is explicitly
+  written in the instructions to restart Julia (some of the codes require this).
--- a/ch01.jl
+++ b/ch01.jl
@ -0,0 +1,25 @@
+# Bogumił Kamiński, 2021
+
+# Codes for chapter 1
+
+# Code from section 1.2.1
+
+# Add up the values of the range 1:n one at a time and return the total
+# (the total stays 0 when the range is empty).
+function f(n)
+    total = 0
+    for k in 1:n
+        total = total + k
+    end
+    return total
+end
+
+@time f(1_000_000_000)  # report the time taken by one call to f
+
+# Code reproducing the data frame presented in section 1.3
+
+using DataFrames
+
+DataFrame(a=1:3, name=["Alice", "Bob", "Clyde"],  # each keyword argument defines one column
+          age=[19, 24, 21], friends=[[2], [1, 3], [2]],  # friends holds a vector of integers per row
+          location=[(city="Atlanta", state="GA"),  # location holds a named tuple per row
+                    (city="Boston", state="MA"),
+                    (city="Austin", state="TX")])
--- a/ch02.jl
+++ b/ch02.jl
@ -0,0 +1,442 @@
+# Bogumił Kamiński, 2021
+
+# Codes for chapter 2
+
+# Code for listing 2.1 (literals: an integer, a Boolean, a string, a float, a vector)
+
+1
+true
+"Hello world!"
+0.1
+[1, 2, 3]
+
+# Code for listing 2.2 (types of the literals from listing 2.1)
+
+typeof(1)
+typeof(true)
+typeof("Hello world!")
+typeof(0.1)
+typeof([1, 2, 3])
+
+# Code for showing bit representation of numbers
+
+bitstring(1)
+bitstring(1.0)
+bitstring(Int8(1))
+
+# Code showing to what Int alias expands
+
+Int
+
+# Code for checking if value is of some type
+
+[1, 2, 3] isa Vector{Int}
+[1, 2, 3] isa Array{Int64, 1}  # Vector{T} is an alias for Array{T, 1}
+
+# Code for section 2.2
+
+x = 1
+y = [1, 2, 3]
+
+x = 1
+x
+typeof(x)
+x = 0.1  # the same name is rebound to a value of a different type
+x
+typeof(x)
+
+Kamiński = 1  # variable names may contain non-ASCII characters
+x₁ = 0.5
+ε = 0.0001
+
+?₁
+?ε
+
+# Code for listing 2.3
+
+x = -7
+if x > 0
+    println("positive")
+elseif x < 0
+    println("negative")
+elseif x == 0
+    println("zero")
+else
+    println("unexpected condition")
+end
+
+# Code showing that logical condition must be Bool
+
+x = -7
+if x  # x is the integer -7 here, not a Bool, so this statement fails
+    println("condition was true")
+end
+
+# Code showing comparisons against NaN
+
+NaN > 0
+NaN >= 0
+NaN < 0
+NaN <= 0
+NaN == 0
+
+NaN != 0
+NaN != NaN  # true: NaN is not equal even to itself
+
+# Code showing that floating point arithmetic is only approximate
+
+0.1 + 0.2 == 0.3
+
+0.1 + 0.2
+
+isapprox(0.1 + 0.2, 0.3)
+
+0.1 + 0.2 ≈ 0.3  # ≈ is the operator form of isapprox
+
+# Code showing combining conditions
+
+x = -7
+x > 0 && x < 10
+x < 0 || log(x) > 10  # x < 0 holds, so the right-hand side is never evaluated
+
+x = -7
+log(x)  # evaluated on its own, the logarithm of a negative number fails
+
+# Code showing typical one-line conditional execution expressions
+
+x = -7
+x < 0 && println(x^2)  # println runs only when x < 0
+iseven(x) || println("x is odd")  # println runs only when x is not even
+
+x = -7
+if x < 0  # the same two checks written as if blocks
+    println(x^2)
+end
+if !iseven(x)
+    println("x is odd")
+end
+
+x = -7
+if x < 0 && x^2  # x^2 is an integer, so the whole condition is not a Bool and this fails
+    println("inside if")
+end
+
+# Code showing ternary operator
+
+x = -7
+x > 0 ? println("x is positive") : println("x is not positive")
+
+# Code from listing 2.4
+
+for i in [1, 2, 3]
+    println(i, " is ", isodd(i) ? "odd" : "even")
+end
+
+# Code from listing 2.5
+
+i = 1
+while i < 4
+    println(i, " is ", isodd(i) ? "odd" : "even")
+    global i += 1  # i is a global variable here, hence the global keyword
+end
+
+# Code showing break and continue keywords
+
+i = 0
+while true
+    global i += 1
+    i > 6 && break  # leave the loop once i exceeds 6
+    isodd(i) && continue  # skip the println below for odd i
+    println(i, " is even")
+end
+
+# Code from listing 2.6
+
+x = -7
+x < 0 && begin  # begin-end groups several expressions into one
+    println(x)
+    x += 1
+    println(x)
+    2 * x
+end
+x > 0 ? (println(x); x) : (x += 1; println(x); x)  # (a; b; c) chains expressions on one line
+
+# Code from section 2.3.4
+
+x = [8, 3, 1, 5, 7]
+k = 1
+
+y = sort(x)
+
+for i in 1:k
+    y[i] = y[k + 1]  # overwrite the i-th smallest value of the sorted vector
+    y[end - i + 1] = y[end - k]  # overwrite the i-th largest value of the sorted vector
+end
+y
+
+s = 0
+for v in y
+    s += v
+end
+s
+s / length(y)  # mean of the adjusted vector
+
+# Code from listing 2.7
+
+# Return the argument multiplied by two.
+function times_two(x)
+    doubled = 2 * x
+    return doubled
+end
+times_two(10)
+
+# Code from listing 2.8
+
+# Gather a required positional argument (x), an optional positional argument
+# (y), a required keyword argument (a) and an optional keyword argument (b)
+# into a tuple.
+function compose(x, y=10; a, b=10)
+    collected = (x, y, a, b)
+    return collected
+end
+compose(1, 2; a=3, b=4)
+compose(1, 2; a=3)  # b takes its default value
+compose(1; a=3)  # y and b take their default values
+compose(1)  # fails: the keyword argument a has no default value and is not passed
+compose(; a=3)  # fails: the positional argument x is not passed
+
+# Code from listing 2.9 (the same two functions written in the short, one-line form)
+
+times_two(x) = 2 * x
+compose(x, y=10; a, b=10) = x, y, a, b
+
+# Code showing the use of map function
+
+map(times_two, [1, 2, 3])
+
+# Code from listing 2.10
+
+map(x -> 2 * x, [1, 2, 3])  # x -> 2 * x is an anonymous function
+
+# Code showing sum taking a function as a first argument
+
+sum(x -> x ^ 2, [1, 2, 3])
+
+# Code showing do-end syntax
+
+sum([1, 2, 3]) do x  # the do-end block is passed to sum as its first argument
+    println("processing ", x)
+    return x ^ 2
+end
+
+# Code showing the difference between sort and sort!
+
+x = [5, 1, 3, 2]
+sort(x)  # returns a sorted copy
+x  # still in the original order
+sort!(x)  # sorts x in place
+x
+
+# Code showing a simple implementation of winsorized_mean function
+
+# Winsorized mean of x: in a sorted copy of x the k smallest entries are
+# overwritten with the (k+1)-th smallest one and the k largest entries with
+# the (k+1)-th largest one; the mean of the adjusted copy is returned.
+# No validation of k is performed.
+function winsorized_mean(x, k)
+    sorted = sort(x)
+    for offset in 1:k
+        sorted[offset] = sorted[k + 1]
+        sorted[end - offset + 1] = sorted[end - k]
+    end
+    # left-to-right accumulation starting from the integer 0
+    total = foldl(+, sorted; init=0)
+    return total / length(sorted)
+end
+winsorized_mean([8, 3, 1, 5, 7], 1)
+
+# Code from section 2.5 (examples of variable scope inside functions)
+
+function fun1()
+    x = 1  # x is local to fun1
+    return x + 1
+end
+fun1()
+x  # looked up as a global variable; it is not the x assigned inside fun1
+
+function fun2()
+    if true
+        x = 10  # an if block does not open a new scope, so x is a local of fun2
+    end
+    return x
+end
+fun2()
+
+function fun3()
+    x = 0  # x is defined before the loop, so the loop body assigns to this variable
+    for i in [1, 2, 3]
+        if i == 2
+            x = 2
+        end
+    end
+    return x
+end
+fun3()
+
+function fun4()
+    for i in [1, 2, 3]
+        if i == 2
+            x = 2  # x is first assigned inside the loop, so it is local to the loop body
+        end
+    end
+    return x  # not the loop-local x: here x is looked up as a global variable
+end
+fun4()
+
+function fun5()
+    for i in [1, 2, 3]
+        if i == 1
+            x = 1  # x is local to the loop body and is fresh in every iteration
+        else
+            x += 1  # fails in the second iteration: x has no value in this iteration
+        end
+        println(x)
+    end
+end
+fun5()
+
+function fun6()
+    x = 0  # defining x before the loop lets its value carry over between iterations
+    for i in [1, 2, 3]
+        if i == 1
+            x = 1
+        else
+            x += 1
+        end
+        println(x)
+    end
+end
+fun6()
+
+# Code from section 2.6
+
+methods(cd)  # list the methods defined for the cd function
+
+sum isa Function
+
+typeof(sum)
+typeof(sum) == Function  # compares the type of sum with Function itself, unlike the isa check
+
+supertype(typeof(sum))
+
+# Print T followed by each of its successive supertypes, stopping after Any.
+function traverse(T)
+    current = T
+    while true
+        println(current)
+        current == Any && break
+        current = supertype(current)
+    end
+    return nothing
+end
+traverse(Int64)
+
+# Print T indented by indent_level spaces, then the tree of its subtypes,
+# with every level indented by two more spaces than its parent.
+function print_subtypes(T, indent_level=0)
+    padding = " " ^ indent_level
+    println(padding, T)
+    foreach(S -> print_subtypes(S, indent_level + 2), subtypes(T))
+    return nothing
+end
+print_subtypes(Integer)
+
+traverse(typeof([1.0, 2.0, 3.0]))
+traverse(typeof(1:3))
+
+AbstractVector
+
+typejoin(typeof([1.0, 2.0, 3.0]), typeof(1:3))  # closest common ancestor of the two types
+
+# Code from section 2.7
+
+fun(x) = println("unsupported type")  # fallback method: the type of x is not restricted
+fun(x::Number) = println("a number was passed")
+fun(x::Float64) = println("a Float64 value")  # the most specific of the three methods
+methods(fun)
+
+fun("hello!")  # a String is not a Number, so the fallback method is used
+fun(1)  # an Int is a Number but not a Float64
+fun(1.0)  # the Float64 method is the most specific match
+
+bar(x, y) = "no numbers passed"
+bar(x::Number, y) = "first argument is a number"
+bar(x, y::Number) = "second argument is a number"
+bar("hello", "world")
+bar(1, "world")
+bar("hello", 2)
+bar(1, 2)  # fails as ambiguous: both one-Number methods match and neither is more specific
+
+bar(x::Number, y::Number) = "both arguments are numbers"  # removes the ambiguity
+bar(1, 2)
+methods(bar)
+
+# Winsorized mean of the vector x with argument validation: in a sorted copy
+# of x the k smallest values are replaced by the (k+1)-th smallest one and the
+# k largest values by the (k+1)-th largest one; the mean of the result is
+# returned. Throws ArgumentError when k is negative or length(x) <= 2 * k.
+function winsorized_mean(x::AbstractVector, k::Integer)
+    if k < 0
+        throw(ArgumentError("k must be non-negative"))
+    end
+    if length(x) <= 2 * k
+        throw(ArgumentError("k is too large"))
+    end
+    y = collect(x)  # work on a fresh vector so that x is left untouched
+    sort!(y)
+    # the checks above guarantee that neither range reaches the source element
+    y[1:k] .= y[k + 1]
+    y[(end - k + 1):end] .= y[end - k]
+    return sum(y) / length(y)
+end
+
+winsorized_mean([8, 3, 1, 5, 7], 1)
+winsorized_mean(1:10, 2)
+winsorized_mean(1:10, "a")  # a String is not an Integer, so the typed method above does not apply
+winsorized_mean(10, 1)  # 10 is not an AbstractVector, so the typed method above does not apply
+
+winsorized_mean(1:10, -1)  # rejected by the k >= 0 check
+winsorized_mean(1:10, 5)  # rejected: length(x) > 2 * k does not hold
+
+# Code from section 2.8
+
+import Statistics
+x = [1, 2, 3]
+mean(x)  # import makes only the name Statistics available, so mean has to be qualified
+Statistics.mean(x)
+
+using Statistics
+mean(x)  # using also makes the names exported by Statistics available
+
+# start a fresh Julia session before running this code
+mean = 1  # mean is defined as a variable before Statistics is loaded
+using Statistics
+mean
+
+# start a fresh Julia session before running this code
+using Statistics
+mean([1, 2, 3])
+mean = 1  # assignment attempted after mean from Statistics has already been used
+
+# start a fresh Julia session before running this code
+using Statistics
+mean = 1  # assignment made before mean from Statistics has been used
+mean([1, 2, 3])
+
+# start a fresh Julia session before running this code
+using Statistics
+using StatsBase
+?winsor
+mean(winsor([8, 3, 1, 5, 7], count=1))
+
+# Code from section 2.9
+
+@time 1 + 2
+
+@time(1 + 2)  # the same macro call written with parentheses
+
+@assert 1 == 2 "1 is not equal 2"  # the condition is false, so the assertion fails with this message
+@assert(1 == 2, "1 is not equal 2")
+
+@macroexpand @assert(1 == 2, "1 is not equal 2")  # show the code the macro call expands to
+
+@macroexpand @time 1 + 2
+
+# before running these codes
+# define the winsorized_mean function using the code from section 2.7
+
+using BenchmarkTools
+x = rand(10^6);
+@benchmark winsorized_mean($x, 10^5)  # $x interpolates the value of the global x into the benchmark
+using Statistics, StatsBase
+@benchmark mean(winsor($x; count=10^5))
+
+@edit winsor(x, count=10^5)
--- a/ch03.jl
+++ b/ch03.jl
@ -0,0 +1,359 @@
+# Bogumił Kamiński, 2021
+
+# Codes for chapter 3
+
+# Code for listing 3.1 (aq is a matrix with 11 rows and 8 columns)
+
+aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
+      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
+       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
+      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
+      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
+       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
+       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
+      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
+       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
+       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
+
+# Code for checking size of a matrix
+
+size(aq)
+size(aq, 1)  # number of rows
+size(aq, 2)  # number of columns
+
+# Code comparing tuple to a vector
+
+v = [1, 2, 3]
+t = (1, 2, 3)
+v[1]
+t[1]
+v[1] = 10
+v
+t[1] = 10  # fails: elements of a tuple cannot be replaced
+
+# Code for figure 3.2
+
+using BenchmarkTools
+@benchmark (1, 2, 3)
+@benchmark [1, 2, 3]
+
+# Code for section 3.1.2
+
+using Statistics
+mean(aq; dims=1)  # dims=1 reduces over rows, giving one value per column
+std(aq; dims=1)
+
+map(mean, eachcol(aq))
+map(std, eachcol(aq))
+
+map(eachcol(aq)) do col  # do-block form of map(mean, eachcol(aq))
+    mean(col)
+end
+
+[mean(col) for col in eachcol(aq)]
+[std(col) for col in eachcol(aq)]
+
+# Code for section 3.1.3
+
+[mean(aq[:, j]) for j in axes(aq, 2)]  # aq[:, j] makes a copy of column j
+[std(aq[:, j]) for j in axes(aq, 2)]
+
+axes(aq, 2)
+?Base.OneTo
+
+[mean(view(aq, :, j)) for j in axes(aq, 2)]  # view refers to column j without copying it
+[std(@view aq[:, j]) for j in axes(aq, 2)]  # @view turns the indexing expression into a view
+
+# Code for section 3.1.4
+
+using BenchmarkTools
+x = ones(10^7, 10)
+@benchmark [mean(@view $x[:, j]) for j in axes($x, 2)]
+@benchmark [mean($x[:, j]) for j in axes($x, 2)]
+@benchmark mean($x, dims=1)
+
+# Code for section 3.1.5
+
+[cor(aq[:, i], aq[:, i+1]) for i in 1:2:7]  # column pairs (1, 2), (3, 4), (5, 6), (7, 8)
+collect(1:2:7)
+
+# Code for section 3.1.6
+
+y = aq[:, 2]
+X = [ones(11) aq[:, 1]]  # a column of 11 ones placed next to column 1 of aq
+X \ y  # least-squares solution of X * b ≈ y
+[[ones(11) aq[:, i]] \ aq[:, i+1] for i in 1:2:7]
+
+# Coefficient of determination of the least-squares line y ≈ a + b * x.
+#
+# x and y are vectors of equal length. Returns 1 - SS_res / SS_tot, where
+# SS_res is the sum of squared residuals of the fitted line and SS_tot is the
+# sum of squared deviations of y from its mean.
+# Requires mean from the Statistics module to be in scope.
+function R²(x, y)
+    # the column of ones is sized from x, so any number of observations works
+    X = [ones(length(x)) x]
+    model = X \ y  # least-squares coefficients
+    prediction = X * model
+    residuals = y - prediction  # named so that Base.error is not shadowed
+    SS_res = sum(v -> v ^ 2, residuals)
+    mean_y = mean(y)
+    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
+    return 1 - SS_res / SS_tot
+end
+[R²(aq[:, i], aq[:, i+1]) for i in 1:2:7]  # one value per column pair (1, 2), (3, 4), (5, 6), (7, 8)
+
+?²
+
+# Code for section 3.1.7
+
+using Plots
+scatter(aq[:, 1], aq[:, 2]; legend=false)
+
+plot(scatter(aq[:, 1], aq[:, 2]; legend=false),  # four scatter plots combined into one figure
+     scatter(aq[:, 3], aq[:, 4]; legend=false),
+     scatter(aq[:, 5], aq[:, 6]; legend=false),
+     scatter(aq[:, 7], aq[:, 8]; legend=false))
+
+plot([scatter(aq[:, i], aq[:, i+1]; legend=false)
+      for i in 1:2:7]...)  # ... passes the plots from the vector as separate arguments
+
+# Code for section 3.2
+
+two_standard = Dict{Int, Int}()  # maps each sum of two dice to the number of ways to obtain it
+for i in [1, 2, 3, 4, 5, 6]
+    for j in [1, 2, 3, 4, 5, 6]
+        s = i + j
+        if haskey(two_standard, s)
+            two_standard[s] += 1
+        else
+            two_standard[s] = 1  # first time this sum is seen
+        end
+    end
+end
+two_standard
+
+keys(two_standard)
+values(two_standard)
+
+using Plots
+scatter(collect(keys(two_standard)), collect(values(two_standard));
+        legend=false, xaxis=2:12)
+
+all_dice = [[1, x2, x3, x4, x5, x6]  # six-sided dice: first face 1, remaining faces non-decreasing within 2:11
+            for x2 in 2:11
+            for x3 in x2:11
+            for x4 in x3:11
+            for x5 in x4:11
+            for x6 in x5:11]
+
+for d1 in all_dice, d2 in all_dice
+    test = Dict{Int, Int}()  # distribution of sums for the pair (d1, d2)
+    for i in d1, j in d2
+        s = i + j
+        if haskey(test, s)
+            test[s] += 1
+        else
+            test[s] = 1
+        end
+    end
+    if test == two_standard  # same distribution of sums as the one stored in two_standard
+        println(d1, " ", d2)
+    end
+end
+
+# Code for section 3.3
+
+aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
+      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
+       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
+      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
+      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
+       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
+       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
+      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
+       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
+       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
+
+dataset1 = (x=aq[:, 1], y=aq[:, 2])  # a named tuple with fields x and y
+
+dataset1[1]  # access a field by position
+dataset1.x  # access the same field by name
+
+# Code for listing 3.2
+
+data = (set1=(x=aq[:, 1], y=aq[:, 2]),  # a named tuple of four named tuples, one per column pair
+        set2=(x=aq[:, 3], y=aq[:, 4]),
+        set3=(x=aq[:, 5], y=aq[:, 6]),
+        set4=(x=aq[:, 7], y=aq[:, 8]))
+
+# Code for section 3.3.2
+
+using Statistics
+map(s -> mean(s.x), data)
+
+map(s -> cor(s.x, s.y), data)
+
+using GLM
+model = lm(@formula(y ~ x), data.set1)
+
+r2(model)
+
+# Code for section 3.3.3
+
+model.mm
+
+x = [3, 1, 2]
+sort(x)  # returns a sorted copy
+x
+sort!(x)  # sorts x in place
+x
+
+empty_field!(nt, i) = empty!(nt[i])  # empties, in place, the collection stored in the i-th field of nt
+nt = (dict = Dict("a" => 1, "b" => 2), int=10)
+empty_field!(nt, 1)
+nt
+
+# Code for section 3.4.1
+
+x = [1 2 3]
+y = [1, 2, 3]
+x * y  # a 1×3 matrix multiplied by a 3-element vector
+
+a = [1, 2, 3]
+b = [4, 5, 6]
+a * b  # fails: * is not defined for two vectors
+
+a .* b  # element-wise multiplication
+
+map(*, a, b)
+[a[i] * b[i] for i in eachindex(a, b)]
+
+eachindex(a, b)
+
+eachindex([1, 2, 3], [4, 5])  # fails: the two vectors do not have matching indices
+
+map(*, [1, 2, 3], [4, 5])  # map stops when the shorter collection is exhausted
+
+[1, 2, 3] .* [4, 5]  # fails: the lengths 3 and 2 cannot be broadcast together
+
+# Code for section 3.4.2
+
+[1, 2, 3] .* [4]  # a dimension of length 1 is expanded to match the other argument
+
+[1, 2, 3] .^ 2
+
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] .* [1 2 3 4 5 6 7 8 9 10]  # a column times a row gives a 10×10 table
+
+["x", "y", "z"] .=> [sum minimum maximum]
+
+abs.([1, -2, 3, -4])
+
+abs([1, 2, 3])  # fails: without the dot abs is applied to the vector itself
+
+string(1, 2, 3)
+
+string.("x", 1:10)
+
+f(i::Int) = string("got integer ", i)
+f(s::String) = string("got string ", s)
+f.([1, "1"])  # the method of f is selected separately for each element
+
+# Code for section 3.4.3
+
+in(1, [1, 2, 3])
+in(4, [1, 2, 3])
+
+in([1, 3, 5, 7, 9], [1, 2, 3, 4])  # checks whether the whole first vector is an element of the second
+
+in.([1, 3, 5, 7, 9], [1, 2, 3, 4])  # fails: the lengths 5 and 4 cannot be broadcast together
+
+in.([1, 3, 5, 7, 9], Ref([1, 2, 3, 4]))  # Ref keeps the second vector from being broadcast over
+
+# Code for section 3.4.4
+
+aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
+      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
+       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
+      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
+      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
+       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
+       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
+      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
+       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
+       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
+using Statistics
+
+mean.(eachcol(aq))  # mean applied to every column separately
+
+mean(eachcol(aq))  # without the dot: the mean of the column vectors themselves
+
+# Coefficient of determination of the least-squares line y ≈ a + b * x,
+# with the sums of squares computed by passing a function to sum.
+#
+# x and y are vectors of equal length. Returns 1 - SS_res / SS_tot, where
+# SS_res is the sum of squared residuals of the fitted line and SS_tot is the
+# sum of squared deviations of y from its mean.
+function R²(x, y)
+    # the column of ones is sized from x, so any number of observations works
+    X = [ones(length(x)) x]
+    model = X \ y  # least-squares coefficients
+    prediction = X * model
+    residuals = y - prediction  # named so that Base.error is not shadowed
+    SS_res = sum(v -> v ^ 2, residuals)
+    mean_y = mean(y)
+    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
+    return 1 - SS_res / SS_tot
+end
+
+# Coefficient of determination of the least-squares line y ≈ a + b * x,
+# with the sums of squares computed using broadcasting.
+#
+# x and y are vectors of equal length. Returns 1 - SS_res / SS_tot, where
+# SS_res is the sum of squared residuals of the fitted line and SS_tot is the
+# sum of squared deviations of y from its mean.
+function R²(x, y)
+    # the column of ones is sized from x, so any number of observations works
+    X = [ones(length(x)) x]
+    model = X \ y  # least-squares coefficients
+    prediction = X * model
+    SS_res = sum((y .- prediction) .^ 2)
+    SS_tot = sum((y .- mean(y)) .^ 2)
+    return 1 - SS_res / SS_tot
+end
+
+# Code for section 3.5
+
+[]  # no element type given
+Dict()  # no key or value type given
+
+Float64[1, 2, 3]  # the prefix sets the element type of the vector
+
+Dict{UInt8, Float64}(0 => 0, 1 => 1)  # keys and values are converted to the declared types
+
+UInt32(200)
+
+Real[1, 1.0, 0x3]  # values of different concrete types stored under an abstract element type
+
+v1 = Any[1, 2, 3]
+eltype(v1)
+v2 = Float64[1, 2, 3]
+eltype(v2)
+v3 = [1, 2, 3]
+eltype(v3)  # inspect v3, the vector defined on the previous line
+d1 = Dict()
+eltype(d1)
+d2 = Dict(1 => 2, 3 => 4)
+eltype(d2)  # the element type of a Dict is a Pair type
+
+p = 1 => 2
+typeof(p)
+
+# Code for section 3.5.1
+
+[1, 2, 3] isa AbstractVector{Int}
+[1, 2, 3] isa AbstractVector{Real}  # the element type parameter has to match exactly
+
+AbstractVector{<:Real}  # vectors whose element type is any subtype of Real
+
+# Code for section 3.5.2
+
+using Statistics
+# Sample covariance of x and y: the sum of products of deviations from the
+# respective means, divided by length(x) - 1.
+# Both arguments must be vectors whose element type is a subtype of Real; the
+# assertion requires them to have equal, non-zero length.
+function ourcov(x::AbstractVector{<:Real},
+                y::AbstractVector{<:Real})
+    len = length(x)
+    @assert len == length(y) > 0
+    x_dev = x .- mean(x)
+    y_dev = y .- mean(y)
+    return sum(x_dev .* y_dev) / (len - 1)
+end
+
+ourcov(1:4, [1.0, 3.0, 2.0, 4.0])
+cov(1:4, [1.0, 3.0, 2.0, 4.0])  # the same computation done by cov from Statistics
+
+ourcov(1:4, Any[1.0, 3.0, 2.0, 4.0])  # fails: a Vector{Any} is not an AbstractVector{<:Real}
+
+x = Any[1, 2, 3]
+identity.(x)  # broadcasting identity builds a new vector with a narrower element type than Any
+y = Any[1, 2.0]
+identity.(y)