reorganize chapters

2022-02-08 20:58:33 +01:00
parent c44c4f1609
commit 55045fc5dd
12 changed files with 1662 additions and 1596 deletions
--- a/ch02.jl
+++ b/ch02.jl
@@ -329,142 +329,3 @@ function fun6()
    end
 end
 fun6()
 # Code from section 2.6
 methods(cd)
 sum isa Function
 typeof(sum)
 typeof(sum) == Function
 supertype(typeof(sum))
 # Print `T` followed by each of its successive supertypes, one per line,
 # stopping after `Any` has been printed. Always returns `nothing`.
 function traverse(T)
    current = T
    while true
        println(current)
        # `Any` is the top of the type hierarchy, so the walk ends here
        current == Any && break
        current = supertype(current)
    end
    return nothing
 end
 traverse(Int64)
 # Print the subtype tree rooted at `T`. `T` itself is printed after
 # `indent_level` spaces, and each of its subtypes is printed recursively
 # with the indentation increased by two. Always returns `nothing`.
 function print_subtypes(T, indent_level=0)
    padding = repeat(" ", indent_level)
    println(padding, T)
    foreach(subtypes(T)) do S
        print_subtypes(S, indent_level + 2)
    end
    return nothing
 end
 print_subtypes(Integer)
 traverse(typeof([1.0, 2.0, 3.0]))
 traverse(typeof(1:3))
 AbstractVector
 typejoin(typeof([1.0, 2.0, 3.0]), typeof(1:3))
 # Code from section 2.7
 fun(x) = println("unsupported type")
 fun(x::Number) = println("a number was passed")
 fun(x::Float64) = println("a Float64 value")
 methods(fun)
 fun("hello!")
 fun(1)
 fun(1.0)
 bar(x, y) = "no numbers passed"
 bar(x::Number, y) = "first argument is a number"
 bar(x, y::Number) = "second argument is a number"
 bar("hello", "world")
 bar(1, "world")
 bar("hello", 2)
 bar(1, 2)
 bar(x::Number, y::Number) = "both arguments are numbers"
 bar(1, 2)
 methods(bar)
 # Compute the k-times winsorized mean of `x`: in a sorted copy of `x` the
 # `k` smallest values are replaced by the (k+1)-th smallest value and the
 # `k` largest values by the (k+1)-th largest value; the mean of the
 # result is returned. `x` itself is not modified.
 # Throws an ArgumentError when `k` is negative or when `x` does not have
 # more than `2 * k` elements.
 function winsorized_mean(x::AbstractVector, k::Integer)
    if k < 0
        throw(ArgumentError("k must be non-negative"))
    end
    if length(x) <= 2 * k
        throw(ArgumentError("k is too large"))
    end
    sorted = sort!(collect(x))
    # neither clamp value lies inside a replaced range, so read them once
    low = sorted[k + 1]
    high = sorted[end - k]
    sorted[1:k] .= low
    sorted[end - k + 1:end] .= high
    return sum(sorted) / length(sorted)
 end
 winsorized_mean([8, 3, 1, 5, 7], 1)
 winsorized_mean(1:10, 2)
 winsorized_mean(1:10, "a")
 winsorized_mean(10, 1)
 winsorized_mean(1:10, -1)
 winsorized_mean(1:10, 5)
 # Code from section 2.8
 # Minimal module holding a single function, `example`, that prints "Hello".
 module ExampleModule
 example() = println("Hello")
 end # ExampleModule
 import Statistics
 x = [1, 2, 3]
 mean(x)
 Statistics.mean(x)
 using Statistics
 mean(x)
 # start a fresh Julia session before running this code
 mean = 1
 using Statistics
 mean
 # start a fresh Julia session before running this code
 using Statistics
 mean([1, 2, 3])
 mean = 1
 # start a fresh Julia session before running this code
 using Statistics
 mean = 1
 mean([1, 2, 3])
 # start a fresh Julia session before running this code
 using Statistics
 using StatsBase
 ?winsor
 mean(winsor([8, 3, 1, 5, 7], count=1))
 # Code from section 2.9
@time 1 + 2
@time(1 + 2)
@assert 1 == 2 "1 is not equal 2"
@assert(1 == 2, "1 is not equal 2")
@macroexpand @assert(1 == 2, "1 is not equal 2")
@macroexpand @time 1 + 2
 # before running these codes
 # define the winsorized_mean function using the code from section 2.7
 using BenchmarkTools
 x = rand(10^6);
@benchmark winsorized_mean($x, 10^5)
 using Statistics, StatsBase
@benchmark mean(winsor($x; count=10^5))
@edit winsor(x, count=10^5)
--- a/ch03.jl
+++ b/ch03.jl
@@ -2,358 +2,141 @@
 # Codes for chapter 3
-# Code for listing 3.1
+# Code from section 3.1
-aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+methods(cd)
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
-# Code for checking size of a matrix
+sum isa Function
-size(aq)
+typeof(sum)
-size(aq, 1)
+typeof(sum) == Function
 size(aq, 2)
-# Code comparing tuple to a vector
+supertype(typeof(sum))
-v = [1, 2, 3]
+function traverse(T)
-t = (1, 2, 3)
+    println(T)
-v[1]
+    T == Any || traverse(supertype(T))
-t[1]
+    return nothing
-v[1] = 10
+end
-v
+traverse(Int64)
 t[1] = 10
-# Code for figure 3.2
+function print_subtypes(T, indent_level=0)
    println(" " ^ indent_level, T)
    for S in subtypes(T)
        print_subtypes(S, indent_level + 2)
    end
    return nothing
 end
 print_subtypes(Integer)
 traverse(typeof([1.0, 2.0, 3.0]))
 traverse(typeof(1:3))
 AbstractVector
 typejoin(typeof([1.0, 2.0, 3.0]), typeof(1:3))
 # Code from section 3.2
 fun(x) = println("unsupported type")
 fun(x::Number) = println("a number was passed")
 fun(x::Float64) = println("a Float64 value")
 methods(fun)
 fun("hello!")
 fun(1)
 fun(1.0)
 bar(x, y) = "no numbers passed"
 bar(x::Number, y) = "first argument is a number"
 bar(x, y::Number) = "second argument is a number"
 bar("hello", "world")
 bar(1, "world")
 bar("hello", 2)
 bar(1, 2)
 bar(x::Number, y::Number) = "both arguments are numbers"
 bar(1, 2)
 methods(bar)
 # Compute the k-times winsorized mean of `x`: in a sorted copy of `x` the
 # `k` smallest values are replaced by the (k+1)-th smallest value and the
 # `k` largest values by the (k+1)-th largest value; the mean of the
 # result is returned. `x` itself is not modified.
 # Throws an ArgumentError when `k` is negative or when `x` does not have
 # more than `2 * k` elements.
 function winsorized_mean(x::AbstractVector, k::Integer)
    if k < 0
        throw(ArgumentError("k must be non-negative"))
    end
    if length(x) <= 2 * k
        throw(ArgumentError("k is too large"))
    end
    sorted = sort!(collect(x))
    # neither clamp value lies inside a replaced range, so read them once
    low = sorted[k + 1]
    high = sorted[end - k]
    sorted[1:k] .= low
    sorted[end - k + 1:end] .= high
    return sum(sorted) / length(sorted)
 end
 winsorized_mean([8, 3, 1, 5, 7], 1)
 winsorized_mean(1:10, 2)
 winsorized_mean(1:10, "a")
 winsorized_mean(10, 1)
 winsorized_mean(1:10, -1)
 winsorized_mean(1:10, 5)
 # Code from section 3.3
 # Minimal module holding a single function, `example`, that prints "Hello".
 module ExampleModule
 example() = println("Hello")
 end # ExampleModule
 import Statistics
 x = [1, 2, 3]
 mean(x)
 Statistics.mean(x)
 using Statistics
 mean(x)
 # start a fresh Julia session before running this code
 mean = 1
 using Statistics
 mean
 # start a fresh Julia session before running this code
 using Statistics
 mean([1, 2, 3])
 mean = 1
 # start a fresh Julia session before running this code
 using Statistics
 mean = 1
 mean([1, 2, 3])
 # start a fresh Julia session before running this code
 using Statistics
 using StatsBase
 ?winsor
 mean(winsor([8, 3, 1, 5, 7], count=1))
 # Code from section 3.4
@time 1 + 2
@time(1 + 2)
@assert 1 == 2 "1 is not equal 2"
@assert(1 == 2, "1 is not equal 2")
@macroexpand @assert(1 == 2, "1 is not equal 2")
@macroexpand @time 1 + 2
 # before running these codes
 # define the winsorized_mean function using the code from section 3.2
 using BenchmarkTools
-@benchmark (1, 2, 3)
+x = rand(10^6);
-@benchmark [1, 2, 3]
+@benchmark winsorized_mean($x, 10^5)
 using Statistics, StatsBase
@benchmark mean(winsor($x; count=10^5))
-# Code for section 3.1.2
+@edit winsor(x, count=10^5)
 using Statistics
 mean(aq; dims=1)
 std(aq; dims=1)
 map(mean, eachcol(aq))
 map(std, eachcol(aq))
 map(eachcol(aq)) do col
    mean(col)
 end
 [mean(col) for col in eachcol(aq)]
 [std(col) for col in eachcol(aq)]
 # Code for section 3.1.3
 [mean(aq[:, j]) for j in axes(aq, 2)]
 [std(aq[:, j]) for j in axes(aq, 2)]
 axes(aq, 2)
 ?Base.OneTo
 [mean(view(aq, :, j)) for j in axes(aq, 2)]
 [std(@view aq[:, j]) for j in axes(aq, 2)]
 # Code for section 3.1.4
 using BenchmarkTools
 x = ones(10^7, 10)
@benchmark [mean(@view $x[:, j]) for j in axes($x, 2)]
@benchmark [mean($x[:, j]) for j in axes($x, 2)]
@benchmark mean($x, dims=1)
 # Code for section 3.1.5
 [cor(aq[:, i], aq[:, i+1]) for i in 1:2:7]
 collect(1:2:7)
 # Code for section 3.1.6
 y = aq[:, 2]
 X = [ones(11) aq[:, 1]]
 X \ y
 [[ones(11) aq[:, i]] \ aq[:, i+1] for i in 1:2:7]
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term.
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    residuals = y - prediction   # avoid a local called `error`, which shadows Base.error
    SS_res = sum(v -> v ^ 2, residuals)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
 end
 [R²(aq[:, i], aq[:, i+1]) for i in 1:2:7]
 ?²
 # Code for section 3.1.7
 using Plots
 scatter(aq[:, 1], aq[:, 2]; legend=false)
 plot(scatter(aq[:, 1], aq[:, 2]; legend=false),
     scatter(aq[:, 3], aq[:, 4]; legend=false),
     scatter(aq[:, 5], aq[:, 6]; legend=false),
     scatter(aq[:, 7], aq[:, 8]; legend=false))
 plot([scatter(aq[:, i], aq[:, i+1]; legend=false)
      for i in 1:2:7]...)
 # Code for section 3.2
 two_standard = Dict{Int, Int}()
 for i in [1, 2, 3, 4, 5, 6]
    for j in [1, 2, 3, 4, 5, 6]
        s = i + j
        if haskey(two_standard, s)
            two_standard[s] += 1
        else
            two_standard[s] = 1
        end
    end
 end
 two_standard
 keys(two_standard)
 values(two_standard)
 using Plots
 scatter(collect(keys(two_standard)), collect(values(two_standard));
        legend=false, xaxis=2:12)
 all_dice = [[1, x2, x3, x4, x5, x6]
            for x2 in 2:11
            for x3 in x2:11
            for x4 in x3:11
            for x5 in x4:11
            for x6 in x5:11]
 for d1 in all_dice, d2 in all_dice
    test = Dict{Int, Int}()
    for i in d1, j in d2
        s = i + j
        if haskey(test, s)
            test[s] += 1
        else
            test[s] = 1
        end
    end
    if test == two_standard
        println(d1, " ", d2)
    end
 end
 # Code for section 3.3
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
 dataset1 = (x=aq[:, 1], y=aq[:, 2])
 dataset1[1]
 dataset1.x
 # Code for listing 3.2
 data = (set1=(x=aq[:, 1], y=aq[:, 2]),
        set2=(x=aq[:, 3], y=aq[:, 4]),
        set3=(x=aq[:, 5], y=aq[:, 6]),
        set4=(x=aq[:, 7], y=aq[:, 8]))
 # Code for section 3.3.2
 using Statistics
 map(s -> mean(s.x), data)
 map(s -> cor(s.x, s.y), data)
 using GLM
 model = lm(@formula(y ~ x), data.set1)
 r2(model)
 # Code for section 3.3.3
 model.mm
 x = [3, 1, 2]
 sort(x)
 x
 sort!(x)
 x
 empty_field!(nt, i) = empty!(nt[i])
 nt = (dict = Dict("a" => 1, "b" => 2), int=10)
 empty_field!(nt, 1)
 nt
 # Code for section 3.4.1
 x = [1 2 3]
 y = [1, 2, 3]
 x * y
 a = [1, 2, 3]
 b = [4, 5, 6]
 a * b
 a .* b
 map(*, a, b)
 [a[i] * b[i] for i in eachindex(a, b)]
 eachindex(a, b)
 eachindex([1, 2, 3], [4, 5])
 map(*, [1, 2, 3], [4, 5])
 [1, 2, 3] .* [4, 5]
 # Code for section 3.4.2
 [1, 2, 3] .* [4]
 [1, 2, 3] .^ 2
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] .* [1 2 3 4 5 6 7 8 9 10]
 ["x", "y", "z"] .=> [sum minimum maximum]
 abs.([1, -2, 3, -4])
 abs([1, 2, 3])
 string(1, 2, 3)
 string.("x", 1:10)
 f(i::Int) = string("got integer ", i)
 f(s::String) = string("got string ", s)
 f.([1, "1"])
 # Code for section 3.4.3
 in(1, [1, 2, 3])
 in(4, [1, 2, 3])
 in([1, 3, 5, 7, 9], [1, 2, 3, 4])
 in.([1, 3, 5, 7, 9], [1, 2, 3, 4])
 in.([1, 3, 5, 7, 9], Ref([1, 2, 3, 4]))
 # Code for section 3.4.4
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
 using Statistics
 mean.(eachcol(aq))
 mean(eachcol(aq))
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term (version using anonymous functions in `sum`).
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    residuals = y - prediction   # avoid a local called `error`, which shadows Base.error
    SS_res = sum(v -> v ^ 2, residuals)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
 end
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term (version using broadcasting).
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    SS_res = sum((y .- prediction) .^ 2)
    SS_tot = sum((y .- mean(y)) .^ 2)
    return 1 - SS_res / SS_tot
 end
 # Code for section 3.5
 []
 Dict()
 Float64[1, 2, 3]
 Dict{UInt8, Float64}(0 => 0, 1 => 1)
 UInt32(200)
 Real[1, 1.0, 0x3]
 # Three vectors holding the same numbers but created with different
 # element-type specifications; `eltype` is queried for each of them.
 v1 = Any[1, 2, 3]
 eltype(v1)
 v2 = Float64[1, 2, 3]
 eltype(v2)
 v3 = [1, 2, 3]
 eltype(v3)
 d1 = Dict()
 eltype(d1)
 d2 = Dict(1 => 2, 3 => 4)
 eltype(d2)
 p = 1 => 2
 typeof(p)
 # Code for section 3.5.1
 [1, 2, 3] isa AbstractVector{Int}
 [1, 2, 3] isa AbstractVector{Real}
 AbstractVector{<:Real}
 # Code for section 3.5.2
 using Statistics
 # Sample covariance of `x` and `y`: the sum of the products of their
 # deviations from the respective means, divided by `length(x) - 1`.
 # Both arguments must be vectors whose element type is a subtype of Real;
 # the assertion requires them to have equal, non-zero length.
 function ourcov(x::AbstractVector{<:Real},
                 y::AbstractVector{<:Real})
    len = length(x)
    @assert len == length(y) > 0
    x_dev = x .- mean(x)
    y_dev = y .- mean(y)
    return sum(x_dev .* y_dev) / (len - 1)
 end
 ourcov(1:4, [1.0, 3.0, 2.0, 4.0])
 cov(1:4, [1.0, 3.0, 2.0, 4.0])
 ourcov(1:4, Any[1.0, 3.0, 2.0, 4.0])
 x = Any[1, 2, 3]
 identity.(x)
 y = Any[1, 2.0]
 identity.(y)
--- a/ch04.jl
+++ b/ch04.jl
@@ -1,224 +0,0 @@
 # Bogumił Kamiński, 2022
 # Codes for chapter 4
 # Code for listing 4.1
 import Downloads
 Downloads.download("https://raw.githubusercontent.com/" *
                   "sidooms/MovieTweetings/" *
                   "44c525d0c766944910686c60697203cda39305d6/" *
                   "snapshots/10K/movies.dat",
                   "movies.dat")
 # Code for string interpolation examples
 x = 10
 "I have $x apples"
 "I have \$100."
 "I have $100."
 # Code for multiline strings
 Downloads.download("https://raw.githubusercontent.com/\
                    sidooms/MovieTweetings/\
                    44c525d0c766944910686c60697203cda39305d6/\
                    snapshots/10K/movies.dat",
                   "movies.dat")
 "a\
 b\
 c"
 # Code for raw strings
 "C:\my_folder\my_file.txt"
 raw"C:\my_folder\my_file.txt"
 # Code for listing 4.2
 movies = readlines("movies.dat")
 # Code for section 4.2
 movie1 = first(movies)
 movie1_parts = split(movie1, "::")
 supertype(String)
 supertype(SubString{String})
 # Code for section 4.3
 movie1_parts[2]
 rx = r"(.*) \((\d{4})\)$"
 m = match(rx, movie1_parts[2])
 m[1]
 m[2]
 parse(Int, m[2])
 # Code for listing 4.3
 # Parse one line of the movies file into a named tuple.
 # The line is split on "::"; the first part is the id, the second a title
 # of the form "name (year)", and the third a "|"-separated genre list.
 # Returns (id, name, year, genres), where `year` is an Int, `id` and
 # `name` are substrings of `line`, and `genres` is a vector of substrings.
 # Accepts any AbstractString (for example a SubString), not only String.
 # Throws an ArgumentError when the title part contains no four-digit year
 # in parentheses.
 function parseline(line::AbstractString)
    parts = split(line, "::")
    m = match(r"(.*) \((\d{4})\)", parts[2])
    # `match` returns `nothing` when the pattern is not found
    m === nothing &&
        throw(ArgumentError("cannot find \"name (year)\" in line: $line"))
    return (id=parts[1],
            name=m[1],
            year=parse(Int, m[2]),
            genres=split(parts[3], "|"))
 end
 # Code for parsing one line of movies data
 record1 = parseline(movie1)
 # Code for listing 4.4
 codeunits("a")
 codeunits("ε")
 codeunits("∀")
 # Codes for different patterns of string subsetting
 word = first(record1.name, 8)
 record1.name[1:8]
 for i in eachindex(word)
    println(i, ": ", word[i])
 end
 codeunits("ô")
 codeunits("Fantômas")
 isascii("Hello world!")
 isascii("∀ x: x≥0")
 word[1]
 word[5]
 # Code for section 4.5
 records = parseline.(movies)
 genres = String[]
 for record in records
    append!(genres, record.genres)
 end
 genres
 using FreqTables
 table = freqtable(genres)
 sort!(table)
 years = [record.year for record in records]
 has_drama = ["Drama" in record.genres for record in records]
 drama_prop = proptable(years, has_drama; margins=1)
 # Code for listing 4.5
 using Plots
 plot(names(drama_prop, 1), drama_prop[:, 2]; legend=false,
     xlabel="year", ylabel="Drama probability")
 # Code for section 4.6.1
 s1 = Symbol("x")
 s2 = Symbol("hello world!")
 s3 = Symbol("x", 1)
 typeof(s1)
 typeof(s2)
 typeof(s3)
 Symbol("1")
 :x
 :x1
 :hello world
 :1
 # Code for section 4.6.2
 supertype(Symbol)
 :x == :x
 :x == :y
 # Code for listing 4.6
 using BenchmarkTools
 str = string.("x", 1:10^6)
 symb = Symbol.(str)
@benchmark "x" in $str
@benchmark :x in $symb
 # Code for section 4.7
 using InlineStrings
 s1 = InlineString("x")
 typeof(s1)
 s2 = InlineString("∀")
 typeof(s2)
 sv = inlinestrings(["The", "quick", "brown", "fox", "jumps",
                    "over", "the", "lazy", "dog"])
 # Code for listing 4.7
 using Random
 using BenchmarkTools
 Random.seed!(1234);
 s1 = [randstring(3) for i in 1:10^6]
 s2 = inlinestrings(s1)
 # Code for analyzing properties of InlineStrings.jl
 Base.summarysize(s1)
 Base.summarysize(s2)
@benchmark sort($s1)
@benchmark sort($s2)
 # Code for listing 4.8
 open("iris.txt", "w") do io
    for i in 1:10^6
        println(io, "Iris setosa")
        println(io, "Iris virginica")
        println(io, "Iris versicolor")
    end
 end
 # Code for section 4.8.2
 uncompressed = readlines("iris.txt")
 using PooledArrays
 compressed = PooledArray(uncompressed)
 Base.summarysize(uncompressed)
 Base.summarysize(compressed)
 # Code for section 4.8.3
 compressed.invpool
 compressed.pool
 compressed[10]
 compressed.pool[compressed.refs[10]]
 Base.summarysize.(compressed.pool)
 v1 = string.("x", 1:10^6)
 v2 = PooledArray(v1)
 Base.summarysize(v1)
 Base.summarysize(v2)
--- a/ch045.jl
+++ b/ch045.jl
@@ -0,0 +1,359 @@
 # Bogumił Kamiński, 2021
 # Codes for chapter 3
 # Code for listing 3.1
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
 # Code for checking size of a matrix
 size(aq)
 size(aq, 1)
 size(aq, 2)
 # Code comparing tuple to a vector
 v = [1, 2, 3]
 t = (1, 2, 3)
 v[1]
 t[1]
 v[1] = 10
 v
 t[1] = 10
 # Code for figure 3.2
 using BenchmarkTools
@benchmark (1, 2, 3)
@benchmark [1, 2, 3]
 # Code for section 3.1.2
 using Statistics
 mean(aq; dims=1)
 std(aq; dims=1)
 map(mean, eachcol(aq))
 map(std, eachcol(aq))
 map(eachcol(aq)) do col
    mean(col)
 end
 [mean(col) for col in eachcol(aq)]
 [std(col) for col in eachcol(aq)]
 # Code for section 3.1.3
 [mean(aq[:, j]) for j in axes(aq, 2)]
 [std(aq[:, j]) for j in axes(aq, 2)]
 axes(aq, 2)
 ?Base.OneTo
 [mean(view(aq, :, j)) for j in axes(aq, 2)]
 [std(@view aq[:, j]) for j in axes(aq, 2)]
 # Code for section 3.1.4
 using BenchmarkTools
 x = ones(10^7, 10)
@benchmark [mean(@view $x[:, j]) for j in axes($x, 2)]
@benchmark [mean($x[:, j]) for j in axes($x, 2)]
@benchmark mean($x, dims=1)
 # Code for section 3.1.5
 [cor(aq[:, i], aq[:, i+1]) for i in 1:2:7]
 collect(1:2:7)
 # Code for section 3.1.6
 y = aq[:, 2]
 X = [ones(11) aq[:, 1]]
 X \ y
 [[ones(11) aq[:, i]] \ aq[:, i+1] for i in 1:2:7]
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term.
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    residuals = y - prediction   # avoid a local called `error`, which shadows Base.error
    SS_res = sum(v -> v ^ 2, residuals)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
 end
 [R²(aq[:, i], aq[:, i+1]) for i in 1:2:7]
 ?²
 # Code for section 3.1.7
 using Plots
 scatter(aq[:, 1], aq[:, 2]; legend=false)
 plot(scatter(aq[:, 1], aq[:, 2]; legend=false),
     scatter(aq[:, 3], aq[:, 4]; legend=false),
     scatter(aq[:, 5], aq[:, 6]; legend=false),
     scatter(aq[:, 7], aq[:, 8]; legend=false))
 plot([scatter(aq[:, i], aq[:, i+1]; legend=false)
      for i in 1:2:7]...)
 # Code for section 3.2
 two_standard = Dict{Int, Int}()
 for i in [1, 2, 3, 4, 5, 6]
    for j in [1, 2, 3, 4, 5, 6]
        s = i + j
        if haskey(two_standard, s)
            two_standard[s] += 1
        else
            two_standard[s] = 1
        end
    end
 end
 two_standard
 keys(two_standard)
 values(two_standard)
 using Plots
 scatter(collect(keys(two_standard)), collect(values(two_standard));
        legend=false, xaxis=2:12)
 all_dice = [[1, x2, x3, x4, x5, x6]
            for x2 in 2:11
            for x3 in x2:11
            for x4 in x3:11
            for x5 in x4:11
            for x6 in x5:11]
 for d1 in all_dice, d2 in all_dice
    test = Dict{Int, Int}()
    for i in d1, j in d2
        s = i + j
        if haskey(test, s)
            test[s] += 1
        else
            test[s] = 1
        end
    end
    if test == two_standard
        println(d1, " ", d2)
    end
 end
 # Code for section 3.3
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
 dataset1 = (x=aq[:, 1], y=aq[:, 2])
 dataset1[1]
 dataset1.x
 # Code for listing 3.2
 data = (set1=(x=aq[:, 1], y=aq[:, 2]),
        set2=(x=aq[:, 3], y=aq[:, 4]),
        set3=(x=aq[:, 5], y=aq[:, 6]),
        set4=(x=aq[:, 7], y=aq[:, 8]))
 # Code for section 3.3.2
 using Statistics
 map(s -> mean(s.x), data)
 map(s -> cor(s.x, s.y), data)
 using GLM
 model = lm(@formula(y ~ x), data.set1)
 r2(model)
 # Code for section 3.3.3
 model.mm
 x = [3, 1, 2]
 sort(x)
 x
 sort!(x)
 x
 empty_field!(nt, i) = empty!(nt[i])
 nt = (dict = Dict("a" => 1, "b" => 2), int=10)
 empty_field!(nt, 1)
 nt
 # Code for section 3.4.1
 x = [1 2 3]
 y = [1, 2, 3]
 x * y
 a = [1, 2, 3]
 b = [4, 5, 6]
 a * b
 a .* b
 map(*, a, b)
 [a[i] * b[i] for i in eachindex(a, b)]
 eachindex(a, b)
 eachindex([1, 2, 3], [4, 5])
 map(*, [1, 2, 3], [4, 5])
 [1, 2, 3] .* [4, 5]
 # Code for section 3.4.2
 [1, 2, 3] .* [4]
 [1, 2, 3] .^ 2
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] .* [1 2 3 4 5 6 7 8 9 10]
 ["x", "y", "z"] .=> [sum minimum maximum]
 abs.([1, -2, 3, -4])
 abs([1, 2, 3])
 string(1, 2, 3)
 string.("x", 1:10)
 f(i::Int) = string("got integer ", i)
 f(s::String) = string("got string ", s)
 f.([1, "1"])
 # Code for section 3.4.3
 in(1, [1, 2, 3])
 in(4, [1, 2, 3])
 in([1, 3, 5, 7, 9], [1, 2, 3, 4])
 in.([1, 3, 5, 7, 9], [1, 2, 3, 4])
 in.([1, 3, 5, 7, 9], Ref([1, 2, 3, 4]))
 # Code for section 3.4.4
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]
 using Statistics
 mean.(eachcol(aq))
 mean(eachcol(aq))
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term (version using anonymous functions in `sum`).
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    residuals = y - prediction   # avoid a local called `error`, which shadows Base.error
    SS_res = sum(v -> v ^ 2, residuals)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
 end
 # Coefficient of determination (R²) of the least-squares fit of `y` on
 # `x` with an intercept term (version using broadcasting).
 # The intercept column is sized from the number of rows of `x`, so the
 # function works for any number of observations.
 # Returns 1 - SS_res / SS_tot, where SS_res is the sum of squared
 # residuals and SS_tot the sum of squared deviations of `y` from its mean.
 function R²(x, y)
    X = [ones(size(x, 1)) x]
    model = X \ y                # least-squares coefficients
    prediction = X * model
    SS_res = sum((y .- prediction) .^ 2)
    SS_tot = sum((y .- mean(y)) .^ 2)
    return 1 - SS_res / SS_tot
 end
 # Code for section 3.5
 []
 Dict()
 Float64[1, 2, 3]
 Dict{UInt8, Float64}(0 => 0, 1 => 1)
 UInt32(200)
 Real[1, 1.0, 0x3]
 # Three vectors holding the same numbers but created with different
 # element-type specifications; `eltype` is queried for each of them.
 v1 = Any[1, 2, 3]
 eltype(v1)
 v2 = Float64[1, 2, 3]
 eltype(v2)
 v3 = [1, 2, 3]
 eltype(v3)
 d1 = Dict()
 eltype(d1)
 d2 = Dict(1 => 2, 3 => 4)
 eltype(d2)
 p = 1 => 2
 typeof(p)
 # Code for section 3.5.1
 [1, 2, 3] isa AbstractVector{Int}
 [1, 2, 3] isa AbstractVector{Real}
 AbstractVector{<:Real}
 # Code for section 3.5.2
 using Statistics
 # Sample covariance of `x` and `y`: the sum of the products of their
 # deviations from the respective means, divided by `length(x) - 1`.
 # Both arguments must be vectors whose element type is a subtype of Real;
 # the assertion requires them to have equal, non-zero length.
 function ourcov(x::AbstractVector{<:Real},
                 y::AbstractVector{<:Real})
    len = length(x)
    @assert len == length(y) > 0
    x_dev = x .- mean(x)
    y_dev = y .- mean(y)
    return sum(x_dev .* y_dev) / (len - 1)
 end
 ourcov(1:4, [1.0, 3.0, 2.0, 4.0])
 cov(1:4, [1.0, 3.0, 2.0, 4.0])
 ourcov(1:4, Any[1.0, 3.0, 2.0, 4.0])
 x = Any[1, 2, 3]
 identity.(x)
 y = Any[1, 2.0]
 identity.(y)
--- a/ch05.jl
+++ b/ch05.jl
@@ -1,214 +0,0 @@
 # Bogumił Kamiński, 2022
 # Codes for chapter 5
 # Code for listing 5.1
 using HTTP
 using JSON3
 query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
        "2020-06-01/?format=json"
 response = HTTP.get(query)
 json = JSON3.read(response.body)
 # Code for the remainder of section 5.1.2
 response.body
 String(response.body)
 response.body
 json.table
 json.currency
 json.code
 json.rates
 json.rates[1].mid
 only(json.rates).mid
 only([])
 only([1, 2])
 # Code for listing 5.2
 query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
        "2020-06-06/?format=json"
 response = HTTP.get(query)
 # Code for listing 5.3
 query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
        "2020-06-01/?format=json"
 try
    response = HTTP.get(query)
    json = JSON3.read(response.body)
    only(json.rates).mid
 catch e
    if e isa HTTP.ExceptionRequest.StatusError
        missing
    else
        rethrow(e)
    end
 end
 query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
        "2020-06-06/?format=json"
 try
    response = HTTP.get(query)
    json = JSON3.read(response.body)
    only(json.rates).mid
 catch e
    if e isa HTTP.ExceptionRequest.StatusError
        missing
    else
        rethrow(e)
    end
 end
 # Code for section 5.2
 ismissing(missing)
 ismissing(1)
 1 + missing
 sin(missing)
 1 == missing
 1 > missing
 1 < missing
 if missing
    print("this is not printed")
 end
 missing && true
 coalesce(missing, true)
 coalesce(missing, false)
 isequal(1, missing)
 isequal(missing, missing)
 isless(1, missing)
 isless(missing, missing)
 isless(Inf, missing)
 a = [1]
 b = [1]
 isequal(a, b)
 a === b
 x = [1, missing, 3, 4, missing]
 coalesce.(x, 0)
 sum(x)
 y = skipmissing(x)
 sum(y)
 sum(skipmissing(x))
 fun(x::Int, y::Int) = x + y
 fun(1, 2)
 fun(1, missing)
 using Missings
 fun2 = passmissing(fun)
 fun2(1, 2)
 fun2(1, missing)
 # Code for section 5.3
 using Dates
 d = Date("2020-06-01")
 typeof(d)
 year(d)
 month(d)
 day(d)
 dayofweek(d)
 dayname(d)
 Date(2020, 6, 1)
 dates = Date.(2020, 6, 1:30)
 Day(1)
 d
 d + Day(1)
 Date(2020, 5, 20):Day(1):Date(2020, 7, 5)
 collect(Date(2020, 5, 20):Day(1):Date(2020, 7, 5))
 # Code for listing 5.6
 # Query the NBP exchange-rate API for the USD rate on `date`.
 # Returns the `mid` field of the single entry in the `rates` field of the
 # JSON response. Returns `missing` when the request fails with an HTTP
 # status error; any other exception is rethrown.
 function get_rate(date::Date)
    url = "https://api.nbp.pl/api/exchangerates/rates/" *
          "a/usd/$date/?format=json"
    try
        payload = JSON3.read(HTTP.get(url).body)
        return only(payload.rates).mid
    catch err
        # only HTTP status errors are translated to `missing`
        err isa HTTP.ExceptionRequest.StatusError || rethrow(err)
        return missing
    end
 end
 # Code for showing how string interpolation works
 "https://api.nbp.pl/api/exchangerates/rates/" *
 "a/usd/$(dates[1])/?format=json"
 "https://api.nbp.pl/api/exchangerates/rates/" *
 "a/usd/$dates[1]/?format=json"
 # Code for listing 5.7
 rates = get_rate.(dates)
 # Code for section 5.4
 using Statistics
 mean(rates)
 std(rates)
 mean(skipmissing(rates))
 std(skipmissing(rates))
 # Code for listing 5.8
 using FreqTables
 proptable(dayname.(dates), ismissing.(rates); margins=1)
 # Code showing how to specify a complex condition using broadcasting
 dayname.(dates) .== "Thursday" .&& ismissing.(rates)
 # Code for listing 5.9
 dates[dayname.(dates) .== "Thursday" .&& ismissing.(rates)]
 # Codes for plotting exchange rate data
 using Plots
 plot(dates, rates; xlabel="day", ylabel="PLN/USD", legend=false)
 rates_ok = .!ismissing.(rates)
 plot(dates[rates_ok], rates[rates_ok];
     xlabel="day", ylabel="PLN/USD", legend=false)
 using Impute
 rates_filled = Impute.interp(rates)
 scatter!(dates, rates_filled)
--- a/ch06.jl
+++ b/ch06.jl
@@ -1,248 +1,224 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 6
+# Codes for chapter 4
-# Code for section 6.1
+# Code for listing 4.1
-if isfile("puzzles.csv.bz2")
+import Downloads
-    @info "file already present"
+Downloads.download("https://raw.githubusercontent.com/" *
-else
+                   "sidooms/MovieTweetings/" *
-    @info "fetching file"
+                   "44c525d0c766944910686c60697203cda39305d6/" *
-    download("https://database.lichess.org/" *
+                   "snapshots/10K/movies.dat",
-            "lichess_db_puzzle.csv.bz2",
+                   "movies.dat")
-            "puzzles.csv.bz2")
+
 # Code for string interpolation examples
 x = 10
 "I have $x apples"
 "I have \$100."
 "I have $100."
 # Code for multiline strings
 Downloads.download("https://raw.githubusercontent.com/\
                    sidooms/MovieTweetings/\
                    44c525d0c766944910686c60697203cda39305d6/\
                    snapshots/10K/movies.dat",
                   "movies.dat")
 "a\
 b\
 c"
 # Code for raw strings
 "C:\my_folder\my_file.txt"
 raw"C:\my_folder\my_file.txt"
 # Code for listing 4.2
 movies = readlines("movies.dat")
 # Code for section 4.2
 movie1 = first(movies)
 movie1_parts = split(movie1, "::")
 supertype(String)
 supertype(SubString{String})
 # Code for section 4.3
 movie1_parts[2]
 rx = r"(.*) \((\d{4})\)$"
 m = match(rx, movie1_parts[2])
 m[1]
 m[2]
 parse(Int, m[2])
 # Code for listing 4.3
 # Parse one line of the movies file into a named tuple.
 # The line is split on "::"; the first part is the id, the second a title
 # of the form "name (year)", and the third a "|"-separated genre list.
 # Returns (id, name, year, genres), where `year` is an Int, `id` and
 # `name` are substrings of `line`, and `genres` is a vector of substrings.
 # Accepts any AbstractString (for example a SubString), not only String.
 # Throws an ArgumentError when the title part contains no four-digit year
 # in parentheses.
 function parseline(line::AbstractString)
    parts = split(line, "::")
    m = match(r"(.*) \((\d{4})\)", parts[2])
    # `match` returns `nothing` when the pattern is not found
    m === nothing &&
        throw(ArgumentError("cannot find \"name (year)\" in line: $line"))
    return (id=parts[1],
            name=m[1],
            year=parse(Int, m[2]),
            genres=split(parts[3], "|"))
 end
-using CodecBzip2
+# Code for parsing one line of movies data
 compressed = read("puzzles.csv.bz2")
 plain = transcode(Bzip2Decompressor, compressed)
-open("puzzles.csv", "w") do io
+record1 = parseline(movie1)
-    println(io, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
+
-                "Popularity,NbPlays,Themes,GameUrl")
+# Code for listing 4.4
-    write(io, plain)
+
 codeunits("a")
 codeunits("ε")
 codeunits("∀")
 # Codes for different patterns of string subsetting
 word = first(record1.name, 8)
 record1.name[1:8]
 for i in eachindex(word)
    println(i, ": ", word[i])
 end
-readlines("puzzles.csv")
+codeunits("ô")
-# Code for section 6.2
+codeunits("Fantômas")
-using CSV
+isascii("Hello world!")
-using DataFrames
+isascii("∀ x: x≥0")
 puzzles = CSV.read("puzzles.csv", DataFrame);
-CSV.read(plain, DataFrame);
+word[1]
 word[5]
-compressed = nothing
+# Code for section 4.5
 plain = nothing
-# Code for listing 6.1
+records = parseline.(movies)
-puzzles
+genres = String[]
 for record in records
    append!(genres, record.genres)
 end
 genres
-# Code for listing 6.2
+using FreqTables
 table = freqtable(genres)
 sort!(table)
-describe(puzzles)
+years = [record.year for record in records]
 has_drama = ["Drama" in record.genres for record in records]
 drama_prop = proptable(years, has_drama; margins=1)
-# Code for getting basic information about a data frame
+# Code for listing 4.5
 ncol(puzzles)
 nrow(puzzles)
 names(puzzles)
 # Code for section 6.3
 puzzles.Rating
 using BenchmarkTools
@benchmark $puzzles.Rating
 puzzles.Rating == copy(puzzles.Rating)
 puzzles.Rating === copy(puzzles.Rating)
 puzzles.Rating === puzzles.Rating
 copy(puzzles.Rating) === copy(puzzles.Rating)
 puzzles."Rating"
 col = "Rating"
 data_frame_name[selected_rows, selected_columns]
 puzzles[:, "Rating"]
 puzzles[:, :Rating]
 puzzles[:, 4]
 puzzles[:, col]
 columnindex(puzzles, "Rating")
 columnindex(puzzles, "Some fancy column name")
 hasproperty(puzzles, "Rating")
 hasproperty(puzzles, "Some fancy column name")
@benchmark $puzzles[:, :Rating]
 puzzles[!, "Rating"]
 puzzles[!, :Rating]
 puzzles[!, 4]
 puzzles[!, col]
 using Plots
 plot(histogram(puzzles.Rating, label="Rating"),
     histogram(puzzles.RatingDeviation, label="RatingDeviation"),
     histogram(puzzles.Popularity, label="Popularity"),
     histogram(puzzles.NbPlays, label="NbPlays"))
-plot([histogram(puzzles[!, col]; label=col) for
+plot(names(drama_prop, 1), drama_prop[:, 2]; legend=false,
-      col in ["Rating", "RatingDeviation",
+     xlabel="year", ylabel="Drama probability")
              "Popularity", "NbPlays"]]...)
-# Code for section 6.4
+# Code for section 4.6.1
-using Statistics
+s1 = Symbol("x")
-plays_lo = median(puzzles.NbPlays)
+s2 = Symbol("hello world!")
-puzzles.NbPlays .> plays_lo
+s3 = Symbol("x", 1)
-puzzles.NbPlays > plays_lo
+typeof(s1)
 typeof(s2)
 typeof(s3)
-rating_lo = 1500
+Symbol("1")
 rating_hi = quantile(puzzles.Rating, 0.99)
 rating_lo .< puzzles.Rating .< rating_hi
-row_selector = (puzzles.NbPlays .> plays_lo) .&&
+:x
-               (rating_lo .< puzzles.Rating .< rating_hi)
+:x1
-sum(row_selector)
+:hello world
-count(row_selector)
+:1
-# Code for listing 6.3
+# Code for section 4.6.2
-good = puzzles[row_selector, ["Rating", "Popularity"]]
+supertype(Symbol)
-# Code for plotting histograms
+:x == :x
 :x == :y
-plot(histogram(good.Rating; label="Rating"),
+# Code for listing 4.6
     histogram(good.Popularity; label="Popularity"))
-# Code for column selectors
+using BenchmarkTools
 str = string.("x", 1:10^6)
 symb = Symbol.(str)
@benchmark "x" in $str
@benchmark :x in $symb
-puzzles[1, "Rating"]
+# Code for section 4.7
-puzzles[:, "Rating"]
+using InlineStrings
 s1 = InlineString("x")
 typeof(s1)
 s2 = InlineString("∀")
 typeof(s2)
 sv = inlinestrings(["The", "quick", "brown", "fox", "jumps",
                    "over", "the", "lazy", "dog"])
-row1 = puzzles[1, ["Rating", "Popularity"]]
+# Code for listing 4.7
-row1["Rating"]
+using Random
-row1[:Rating]
+using BenchmarkTools
-row1[1]
+Random.seed!(1234);
-row1.Rating
+s1 = [randstring(3) for i in 1:10^6]
-row1."Rating"
+s2 = inlinestrings(s1)
-good = puzzles[row_selector, ["Rating", "Popularity"]]
+# Code for analyzing properties of InlineStrings.jl
-good[1, "Rating"]
+Base.summarysize(s1)
-good[1, :]
+Base.summarysize(s2)
 good[:, "Rating"]
 good[:, :]
-names(puzzles, ["Rating", "Popularity"])
+@benchmark sort($s1)
-names(puzzles, [:Rating, :Popularity])
+@benchmark sort($s2)
 names(puzzles, [4, 6])
 names(puzzles, [false, false, false, true, false, true, false, false, false])
 names(puzzles, r"Rating")
 names(puzzles, Not([4, 6]))
 names(puzzles, Not(r"Rating"))
 names(puzzles, Between("Rating", "Popularity"))
 names(puzzles, :)
 names(puzzles, All())
 names(puzzles, Cols(r"Rating", "NbPlays"))
 names(puzzles, Cols(startswith("P")))
-names(puzzles, startswith("P"))
+# Code for listing 4.8
-names(puzzles, Real)
+open("iris.txt", "w") do io
-
+    for i in 1:10^6
-names(puzzles, AbstractString)
+        println(io, "Iris setosa")
-
+        println(io, "Iris virginica")
-puzzles[:, names(puzzles, Real)]
+        println(io, "Iris versicolor")
 # Code for row subsetting
 df1 = puzzles[:, ["Rating", "Popularity"]];
 df2 = puzzles[!, ["Rating", "Popularity"]];
 df1 == df2
 df1 == puzzles
 df2 == puzzles
 df1.Rating === puzzles.Rating
 df1.Popularity === puzzles.Popularity
 df2.Rating === puzzles.Rating
 df2.Popularity === puzzles.Popularity
@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
 puzzles[1, 1]
 puzzles[[1], 1]
 puzzles[1, [1]]
 puzzles[[1], [1]]
 # Code for making views
@view puzzles[1, 1]
@view puzzles[[1], 1]
@view puzzles[1, [1]]
@view puzzles[[1], [1]]
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
 parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
 # Code for section 6.5
 describe(good)
 rating_mapping = Dict{Int, Vector{Int}}()
 for (i, rating) in enumerate(good.Rating)
    if haskey(rating_mapping, rating)
        push!(rating_mapping[rating], i)
    else
        rating_mapping[rating] = [i]
    end
 end
 rating_mapping
-good[rating_mapping[2108], :]
+# Code for section 4.8.2
-unique(good[rating_mapping[2108], :].Rating)
+uncompressed = readlines("iris.txt")
-using Statistics
+using PooledArrays
-mean(good[rating_mapping[2108], "Popularity"])
+compressed = PooledArray(uncompressed)
-ratings = unique(good.Rating)
+Base.summarysize(uncompressed)
 Base.summarysize(compressed)
-mean_popularities = map(ratings) do rating
+# Code for section 4.8.3
    indices = rating_mapping[rating]
    popularities = good[indices, "Popularity"]
    return mean(popularities)
 end
-scatter(ratings, mean_popularities;
+compressed.invpool
-        xlabel="rating", ylabel="mean popularity", legend=false)
+compressed.pool
-import Loess
+compressed[10]
-model = Loess.loess(ratings, mean_popularities);
+compressed.pool[compressed.refs[10]]
 ratings_predict = float.(sort(ratings))
 popularity_predict = Loess.predict(model, ratings_predict)
-plot!(ratings_predict, popularity_predict; width=5, color="black")
+Base.summarysize.(compressed.pool)
 v1 = string.("x", 1:10^6)
 v2 = PooledArray(v1)
 Base.summarysize(v1)
 Base.summarysize(v2)
--- a/ch07.jl
+++ b/ch07.jl
@@ -1,279 +1,214 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 7
+# Codes for chapter 5
-# Code for section 7.1
+# Code for listing 5.1
-aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+using HTTP
-       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
+using JSON3
-      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
+query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
-       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
+        "2020-06-01/?format=json"
-      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
+response = HTTP.get(query)
-      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
+json = JSON3.read(response.body)
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
-data = (set1=(x=aq[:, 1], y=aq[:, 2]),
+# Code for the remainder of section 5.1.2
        set2=(x=aq[:, 3], y=aq[:, 4]),
        set3=(x=aq[:, 5], y=aq[:, 6]),
        set4=(x=aq[:, 7], y=aq[:, 8]));
-using DataFrames
+response.body
-# Code for listing 7.1
+String(response.body)
-aq1 = ataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
+response.body
 DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
-# Code for creating DataFrame with automatic column names
+json.table
 json.currency
 json.code
 json.rates
-DataFrame(aq, :auto)
+json.rates[1].mid
-# Codes for creating DataFrame from vector of vectors
+only(json.rates).mid
-aq_vec = collect(eachcol(aq))
+only([])
-DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
+only([1, 2])
 DataFrame(aq_vec, :auto)
-# Codes for section 7.1.2
+# Code for listing 5.2
-data.set1.x
+query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
        "2020-06-06/?format=json"
 response = HTTP.get(query)
-DataFrame(x1=data.set1.x, y1=data.set1.y,
+# Code for listing 5.3
          x2=data.set2.x, y2=data.set2.y,
          x3=data.set3.x, y3=data.set3.y,
          x4=data.set4.x, y4=data.set4.y)
-DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
+query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
-          :x2 => data.set2.x, :y2 => data.set2.y,
+        "2020-06-01/?format=json"
-          :x3 => data.set3.x, :y3 => data.set3.y,
+try
-          :x4 => data.set4.x, :y4 => data.set4.y)
+    response = HTTP.get(query)
-
+    json = JSON3.read(response.body)
-DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
+    only(json.rates).mid
-           :x2 => data.set2.x, :y2 => data.set2.y,
+catch e
-           :x3 => data.set3.x, :y3 => data.set3.y,
+    if e isa HTTP.ExceptionRequest.StatusError
-           :x4 => data.set4.x, :y4 => data.set4.y]);
+        missing
-
+    else
-[(i, v) for i in 1:4 for v in [:x, :y]]
+        rethrow(e)
-
+    end
 [string(v, i) for i in 1:4 for v in [:x, :y]]
 [string(v, i) => getproperty(data[i], v)
        for i in 1:4 for v in [:x, :y]]
 DataFrame([string(v, i) => getproperty(data[i], v)
           for i in 1:4 for v in [:x, :y]]);
 data_dict = Dict([string(v, i) => getproperty(data[i], v)
                         for i in 1:4 for v in [:x, :y]])
 collect(data_dict)
 DataFrame(data_dict)
 df1 = DataFrame(x1=data.set1.x)
 df1.x1 === data.set1.x
 df2 = DataFrame(x1=data.set1.x; copycols=false)
 df2.x1 === data.set1.x
 df = DataFrame(x=1:3, y=1)
 df.x
 DataFrame(x=[1], y=[1, 2, 3])
 # Codes for section 7.1.3
 data.set1
 DataFrame(data.set1)
 DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
 data
 # Code for listing 7.2
 aq2 = DataFrame(data)
 # Codes for listing 7.3
 data_dfs = map(DataFrame, data)
 # Codes for vertical concatenation examples
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id")
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id"=>string.("set", 1:4))
 reduce(vcat, collect(data_dfs);
       source="source_id"=>string.("set", 1:4))
 # Code for listing 7.4
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, c=24:26)
 vcat(df1, df2)
 vcat(df1, df2; cols=:union)
 # Code for listing 7.5
 df_agg = DataFrame()
 append!(df_agg, data_dfs.set1)
 append!(df_agg, data_dfs.set2)
 # Code for appending tables to a data frame
 df_agg = DataFrame()
 append!(df_agg, data.set1)
 append!(df_agg, data.set2)
 # Code for promote keyword argument
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, b=[14, missing, 16])
 append!(df1, df2)
 append!(df1, df2; promote=true)
 # Code for section 7.2.3
 df = DataFrame()
 push!(df, (a=1, b=2))
 push!(df, (a=3, b=4))
 df = DataFrame(a=Int[], b=Int[])
 push!(df, [1, 2])
 push!(df, [3, 4])
 # Advance a two-dimensional random walk by one step.
 # One of the four unit moves along the x or y axis is drawn at random and
 # added to the `x` and `y` properties of `current`; the new position is
 # returned as a named tuple `(x=..., y=...)`.
 function sim_step(current)
    move = rand(((1,0), (-1,0), (0,1), (0,-1)))
    new_x = current.x + first(move)
    new_y = current.y + last(move)
    return (x=new_x, y=new_y)
 end
-using BenchmarkTools
+query = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
-@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
+        "2020-06-06/?format=json"
 try
    response = HTTP.get(query)
    json = JSON3.read(response.body)
    only(json.rates).mid
 catch e
    if e isa HTTP.ExceptionRequest.StatusError
        missing
    else
        rethrow(e)
    end
 end
-dx, dy = (10, 20)
+# Code for section 5.2
-dx
+
-dy
+ismissing(missing)
 ismissing(1)
 1 + missing
 sin(missing)
 1 == missing
 1 > missing
 1 < missing
 if missing
    print("this is not printed")
 end
 missing && true
 coalesce(missing, true)
 coalesce(missing, false)
 isequal(1, missing)
 isequal(missing, missing)
 isless(1, missing)
 isless(missing, missing)
 isless(Inf, missing)
 a = [1]
 b = [1]
 isequal(a, b)
 a === b
 x = [1, missing, 3, 4, missing]
 coalesce.(x, 0)
 sum(x)
 y = skipmissing(x)
 sum(y)
 sum(skipmissing(x))
 fun(x::Int, y::Int) = x + y
 fun(1, 2)
 fun(1, missing)
 using Missings
 fun2 = passmissing(fun)
 fun2(1, 2)
 fun2(1, missing)
 # Code for section 5.3
 using Dates
 d = Date("2020-06-01")
 typeof(d)
 year(d)
 month(d)
 day(d)
 dayofweek(d)
 dayname(d)
 Date(2020, 6, 1)
 dates = Date.(2020, 6, 1:30)
 Day(1)
 d
 d + Day(1)
 Date(2020, 5, 20):Day(1):Date(2020, 7, 5)
 collect(Date(2020, 5, 20):Day(1):Date(2020, 7, 5))
 # Code for listing 5.6
 # Query the api.nbp.pl exchange-rate endpoint for USD on the given `date`.
 # `date` is interpolated into the request URL. The response body is parsed
 # as JSON and the `mid` field of the single entry in `rates` is returned
 # (`only` throws unless there is exactly one entry).
 # When the request fails with an HTTP status error `missing` is returned;
 # every other exception is passed on to the caller.
 function get_rate(date::Date)
    query = "https://api.nbp.pl/api/exchangerates/rates/" *
            "a/usd/$date/?format=json"
    try
        body = HTTP.get(query).body
        parsed = JSON3.read(body)
        return only(parsed.rates).mid
    catch err
        # a status error is treated as "no rate available for this date"
        err isa HTTP.ExceptionRequest.StatusError && return missing
        # anything else is unexpected: propagate the exception being handled
        rethrow()
    end
 end
 # Code for showing how string interpolation works
 "https://api.nbp.pl/api/exchangerates/rates/" *
 "a/usd/$(dates[1])/?format=json"
 "https://api.nbp.pl/api/exchangerates/rates/" *
 "a/usd/$dates[1]/?format=json"
 # Code for listing 5.7
 rates = get_rate.(dates)
 # Code for section 5.4
 using Statistics
 mean(rates)
 std(rates)
 mean(skipmissing(rates))
 std(skipmissing(rates))
 # Code for listing 5.8
 using FreqTables
-using Random
+proptable(dayname.(dates), ismissing.(rates); margins=1)
 Random.seed!(1234);
 proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
-using Random
+# Code showing how to specify a complex condition using broadcasting
 Random.seed!(6);
 walk = DataFrame(x=0, y=0)
 for _ in 1:10
    current = walk[end, :]
    push!(walk, sim_step(current))
 end
 walk
-plot(walk.x, walk.y;
+dayname.(dates) .== "Thursday" .&& ismissing.(rates)
     legend=false,
     series_annotations=1:11,
     xticks=range(extrema(walk.x)...),
     yticks=range(extrema(walk.y)...))
-extrema(walk.y)
+# Code for listing 5.9
-range(1, 5)
+dates[dayname.(dates) .== "Thursday" .&& ismissing.(rates)]
-(3/4)^9
+# Codes for plotting exchange rate data
-# Code for listing 7.6
+using Plots
 plot(dates, rates; xlabel="day", ylabel="PLN/USD", legend=false)
-function walk_unique() #A
+rates_ok = .!ismissing.(rates)
    walk = DataFrame(x=0, y=0)
    for _ in 1:10
        current = walk[end, :]
        push!(walk, sim_step(current))
    end
    return nrow(unique(walk)) == nrow(walk) #B
 end
 Random.seed!(2);
 proptable([walk_unique() for _ in 1:10^5])
-# Code for a note on conversion
+plot(dates[rates_ok], rates[rates_ok];
     xlabel="day", ylabel="PLN/USD", legend=false)
-x = [1.5]
+using Impute
-x[1] = 1
+rates_filled = Impute.interp(rates)
 x
-# Code from section 7.3.1
+scatter!(dates, rates_filled)
 Matrix(walk)
 Matrix{Any}(walk)
 Matrix{String}(walk)
 plot(walk)
 plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
 # Code from section 7.3.2
 Tables.columntable(walk)
 using BenchmarkTools
 # Sum the values stored in the `x` property of `table`, starting from the
 # integer 0. The column is fetched and looped over inside this one function;
 # the surrounding script times it with @btime and inspects it with
 # @code_warntype for both a data frame and a column table.
 function mysum(table)
    total = 0
    for value in table.x
        total += value
    end
    return total
 end
 df = DataFrame(x=1:1_000_000);
@btime mysum($df)
 tab = Tables.columntable(df);
@btime mysum($tab)
@code_warntype mysum(df)
@code_warntype mysum(tab)
 typeof(tab)
 # Add up the elements of `x` from left to right, starting from the integer 0.
 # It receives the already extracted collection, so it serves as the kernel
 # that `mysum2` hands the column to.
 function barrier_mysum2(x)
    return foldl(+, x; init=0)
 end
 # Sum the `x` property of `table` by passing it to `barrier_mysum2`.
 function mysum2(table)
    return barrier_mysum2(table.x)
 end
@btime mysum2($df)
 df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
 unique(df)
 tab = Tables.columntable(df)
 unique(tab)
 # Code from section 7.3.3
 Tables.rowtable(walk)
 nti = Tables.namedtupleiterator(walk)
 for v in nti
    println(v)
 end
 er = eachrow(walk)
 er[1]
 er[end]
 ec = eachcol(walk)
 ec[1]
 ec[end]
 identity.(eachcol(walk))
 df = DataFrame(x=1:2, b=["a", "b"])
 identity.(eachcol(df))
--- a/ch08.jl
+++ b/ch08.jl
@@ -1,284 +1,248 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 8
+# Codes for chapter 6
-# Codes for section 8.1
+# Code for section 6.1
-# Code for listing 8.1
+if isfile("puzzles.csv.bz2")
-
+    @info "file already present"
-import Downloads
+else
-using SHA
+    @info "fetching file"
-git_zip = "git_web_ml.zip"
+    download("https://database.lichess.org/" *
-if !isfile(git_zip)
+            "lichess_db_puzzle.csv.bz2",
-    Downloads.download("https://snap.stanford.edu/data/" *
+            "puzzles.csv.bz2")
                       "git_web_ml.zip",
                       git_zip)
 end
 isfile(git_zip)
 open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
                          0xc4, 0x60, 0xdc, 0x4c,
                          0x7b, 0xf8, 0x93, 0x57,
                          0xb1, 0xfe, 0xc0, 0x20,
                          0xf4, 0x5e, 0x2e, 0xce,
                          0xba, 0xb8, 0x1d, 0x13,
                          0x1d, 0x07, 0x3b, 0x10,
                          0xe2, 0x8e, 0xc0, 0x31]
 # Code for opening a zip archive
 import ZipFile
 git_archive = ZipFile.Reader(git_zip)
 # Code for listing 8.2
 # Load one CSV member of an opened zip archive into a data frame.
 # `filename` has to match the `name` of exactly one entry of `archive.files`;
 # `only` throws otherwise. The entry's contents are read in full and then
 # parsed by `CSV.read`.
 function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
    entry = only(filter(file -> file.name == filename, archive.files))
    return CSV.read(read(entry), DataFrame)
 end
-# Code for working with zip archive
+using CodecBzip2
 compressed = read("puzzles.csv.bz2")
 plain = transcode(Bzip2Decompressor, compressed)
-git_archive.files
+open("puzzles.csv", "w") do io
    println(io, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
                "Popularity,NbPlays,Themes,GameUrl")
    write(io, plain)
 end
-git_archive.files[2].name
+readlines("puzzles.csv")
-findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
+# Code for section 6.2
 findall(x -> x.name == "", git_archive.files)
 only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
 only(findall(x -> x.name == "", git_archive.files))
 # Code for listing 8.3
 using CSV
 using DataFrames
-edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
+puzzles = CSV.read("puzzles.csv", DataFrame);
 classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
 close(git_archive)
 summary(edges_df)
 describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
 summary(classes_df)
 describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
-# Code for updating data frame columns using broadcasting
+CSV.read(plain, DataFrame);
-edges_df .+= 1
+compressed = nothing
-classes_df.id .+= 1
+plain = nothing
-# Code for examples of data frame broadcasting
+# Code for listing 6.1
-df = DataFrame(a=1:3, b=[4, missing, 5])
+puzzles
 df .^ 2
 coalesce.(df, 0)
 df .+ [10, 11, 12]
-# Code for checking the order of :id column in a data frame
+# Code for listing 6.2
-classes_df.id == axes(classes_df, 1)
+describe(puzzles)
-# Code for the difference between ! and : in broadcasting assignment
+# Code for getting basic information about a data frame
-df = DataFrame(a=1:3, b=1:3)
+ncol(puzzles)
 df[!, :a] .= "x"
 df[:, :b] .= "x"
 df
-# Code for the difference between ! and : in assignment
+nrow(puzzles)
-df = DataFrame(a=1:3, b=1:3, c=1:3)
+names(puzzles)
 df[!, :a] = ["x", "y", "z"]
 df[:, :b] = ["x", "y", "z"]
 df[:, :c] = [11, 12, 13]
 df
-# Codes for section 8.2
+# Code for section 6.3
-# Code from listing 8.4
+puzzles.Rating
-using Graphs
+using BenchmarkTools
-gh = SimpleGraph(nrow(classes_df))
+@benchmark $puzzles.Rating
 for (from, to) in eachrow(edges_df)
    add_edge!(gh, from, to)
 end
 gh
 ne(gh)
 nv(gh)
-# Code for iterator destruction in iteration specification
+puzzles.Rating == copy(puzzles.Rating)
-mat = [1 2; 3 4; 5 6]
+puzzles.Rating === copy(puzzles.Rating)
 for (x1, x2) in eachrow(mat)
    @show x1, x2
 end
-# Code for getting degrees of nodes in the graph
+puzzles.Rating === puzzles.Rating
-degree(gh)
+copy(puzzles.Rating) === copy(puzzles.Rating)
-# Code for adding a column to a data frame
+puzzles."Rating"
-classes_df.deg = degree(gh)
+col = "Rating"
-# Code for the difference between ! and : when adding a column
+data_frame_name[selected_rows, selected_columns]
-df = DataFrame()
+puzzles[:, "Rating"]
-x = [1, 2, 3]
+puzzles[:, :Rating]
-df[!, :x1] = x
+puzzles[:, 4]
-df[:, :x2] = x
+puzzles[:, col]
 df
 df.x1 === x
 df.x2 === x
 df.x2 == x
-# Code for creating a column using broadcasting
+columnindex(puzzles, "Rating")
-df.x3 .= 1
+columnindex(puzzles, "Some fancy column name")
 df
-# Code for edge iterator of a graph
+hasproperty(puzzles, "Rating")
 hasproperty(puzzles, "Some fancy column name")
-edges(gh)
+@benchmark $puzzles[:, :Rating]
-e1 = first(edges(gh))
+puzzles[!, "Rating"]
-dump(e1)
+puzzles[!, :Rating]
-e1.src
+puzzles[!, 4]
-e1.dst
+puzzles[!, col]
 # Code for listing 8.5
 # Count, for every node, how many of its neighbors have class 1 and how many
 # have any other class.
 # `gh` supplies the edges (each with `src` and `dst` fields) and `class` is
 # indexed by node number. Returns the tuple `(deg_ml, deg_web)` of two integer
 # vectors of length `length(class)`: `deg_ml[i]` counts neighbors of node `i`
 # whose class equals 1, `deg_web[i]` counts the remaining neighbors.
 function deg_class(gh, class)
    n = length(class)
    deg_ml, deg_web = zeros(Int, n), zeros(Int, n)
    for edge in edges(gh)
        src, dst = edge.src, edge.dst
        # every edge contributes once to each endpoint, classified by the
        # class of the opposite endpoint
        for (node, neighbor) in ((src, dst), (dst, src))
            counts = class[neighbor] == 1 ? deg_ml : deg_web
            counts[node] += 1
        end
    end
    return (deg_ml, deg_web)
 end
 # Code for computing machine learning and web neighbors for gh graph
 classes_df.deg_ml, classes_df.deg_web =
 deg_class(gh, classes_df.ml_target)
 # Code for checking type stability of deg_class function
@time deg_class(gh, classes_df.ml_target);
@code_warntype deg_class(gh, classes_df.ml_target)
 # Code for checking the classes_df summary statistics
 describe(classes_df, :min, :max, :mean, :std)
 # Code for average degree of node in the graph
 2 * ne(gh) / nv(gh)
 # Code for checking correctness of computations
 classes_df.deg_ml + classes_df.deg_web == classes_df.deg
 # Code for showing that DataFrames.jl checks consistency of stored objects
 df = DataFrame(a=1, b=11)
 push!(df.a, 2)
 df
 # Codes for section 8.3
 # Code for computing groupwise means of columns
 using Statistics
 for type in [0, 1], col in ["deg_ml", "deg_web"]
    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
 end
 gdf = groupby(classes_df, :ml_target)
 combine(gdf,
        :deg_ml => mean => :mean_deg_ml,
        :deg_web => mean => :mean_deg_web)
 using DataFramesMeta
@combine(gdf,
         :mean_deg_ml = mean(:deg_ml),
         :mean_deg_web = mean(:deg_web))
 # Code for simple plotting of relationship between developer degree and type
 using Plots
-scatter(classes_df.deg_ml, classes_df.deg_web;
+plot(histogram(puzzles.Rating, label="Rating"),
-        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
+     histogram(puzzles.RatingDeviation, label="RatingDeviation"),
-        xlabel="degree ml", ylabel="degree web", labels=false)
+     histogram(puzzles.Popularity, label="Popularity"),
     histogram(puzzles.NbPlays, label="NbPlays"))
-# Code for aggregation of degree data
+plot([histogram(puzzles[!, col]; label=col) for
      col in ["Rating", "RatingDeviation",
              "Popularity", "NbPlays"]]...)
-agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
+# Code for section 6.4
                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
-# Code for comparison how Julia parses expressions
+using Statistics
 plays_lo = median(puzzles.NbPlays)
 puzzles.NbPlays .> plays_lo
-:ml_target => (x -> 1 - mean(x)) => :web_mean
+puzzles.NbPlays > plays_lo
 :ml_target => x -> 1 - mean(x) => :web_mean
-# Code for aggregation using DataFramesMeta.jl
+rating_lo = 1500
 rating_hi = quantile(puzzles.Rating, 0.99)
 rating_lo .< puzzles.Rating .< rating_hi
-@combine(groupby(classes_df, [:deg_ml, :deg_web]),
+row_selector = (puzzles.NbPlays .> plays_lo) .&&
-         :web_mean = 1 - mean(:ml_target))
+               (rating_lo .< puzzles.Rating .< rating_hi)
-# Code for getting summary information about the aggregated data frame
+sum(row_selector)
 count(row_selector)
-describe(agg_df)
+# Code for listing 6.3
-# Code for log1p function
+good = puzzles[row_selector, ["Rating", "Popularity"]]
-log1p(0)
+# Code for plotting histograms
-# Code for listing 8.6
+plot(histogram(good.Rating; label="Rating"),
     histogram(good.Popularity; label="Popularity"))
-function gen_ticks(maxv)
+# Code for column selectors
-    max2 = round(Int, log2(maxv))
+
-    tick = [0; 2 .^ (0:max2)]
+puzzles[1, "Rating"]
-    return (log1p.(tick), tick)
+
 puzzles[:, "Rating"]
 row1 = puzzles[1, ["Rating", "Popularity"]]
 row1["Rating"]
 row1[:Rating]
 row1[1]
 row1.Rating
 row1."Rating"
 good = puzzles[row_selector, ["Rating", "Popularity"]]
 good[1, "Rating"]
 good[1, :]
 good[:, "Rating"]
 good[:, :]
 names(puzzles, ["Rating", "Popularity"])
 names(puzzles, [:Rating, :Popularity])
 names(puzzles, [4, 6])
 names(puzzles, [false, false, false, true, false, true, false, false, false])
 names(puzzles, r"Rating")
 names(puzzles, Not([4, 6]))
 names(puzzles, Not(r"Rating"))
 names(puzzles, Between("Rating", "Popularity"))
 names(puzzles, :)
 names(puzzles, All())
 names(puzzles, Cols(r"Rating", "NbPlays"))
 names(puzzles, Cols(startswith("P")))
 names(puzzles, startswith("P"))
 names(puzzles, Real)
 names(puzzles, AbstractString)
 puzzles[:, names(puzzles, Real)]
 # Code for row subsetting
 df1 = puzzles[:, ["Rating", "Popularity"]];
 df2 = puzzles[!, ["Rating", "Popularity"]];
 df1 == df2
 df1 == puzzles
 df2 == puzzles
 df1.Rating === puzzles.Rating
 df1.Popularity === puzzles.Popularity
 df2.Rating === puzzles.Rating
 df2.Popularity === puzzles.Popularity
@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
 puzzles[1, 1]
 puzzles[[1], 1]
 puzzles[1, [1]]
 puzzles[[1], [1]]
 # Code for making views
@view puzzles[1, 1]
@view puzzles[[1], 1]
@view puzzles[1, [1]]
@view puzzles[[1], [1]]
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
 parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
 # Code for section 6.5
 describe(good)
 rating_mapping = Dict{Int, Vector{Int}}()
 for (i, rating) in enumerate(good.Rating)
    if haskey(rating_mapping, rating)
        push!(rating_mapping[rating], i)
    else
        rating_mapping[rating] = [i]
    end
 end
 rating_mapping
 good[rating_mapping[2108], :]
 unique(good[rating_mapping[2108], :].Rating)
 using Statistics
 mean(good[rating_mapping[2108], "Popularity"])
 ratings = unique(good.Rating)
 mean_popularities = map(ratings) do rating
    indices = rating_mapping[rating]
    popularities = good[indices, "Popularity"]
    return mean(popularities)
 end
-log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
+scatter(ratings, mean_popularities;
        xlabel="rating", ylabel="mean popularity", legend=false)
-using Random
+import Loess
-Random.seed!(1234);
+model = Loess.loess(ratings, mean_popularities);
-scatter(log1pjitter.(agg_df.deg_ml),
+ratings_predict = float.(sort(ratings))
-        log1pjitter.(agg_df.deg_web);
+popularity_predict = Loess.predict(model, ratings_predict)
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0, markeralpha=0.8,
        legend=:topleft, labels = "fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))
-# Code for fitting logistic regression model
+plot!(ratings_predict, popularity_predict; width=5, color="black")
 using GLM
 glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
 # Code for inspecting @formula result
@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
 # Code for inserting columns to a data frame
 df = DataFrame(x=1:3)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :z => 1)
 insertcols!(df, 1, :a => 0)
 insertcols!(df, :x, :pre_x => 2)
 insertcols!(df, :x, :post_x => 3, after=true)
--- a/ch09.jl
+++ b/ch09.jl
@@ -0,0 +1,279 @@
 # Bogumił Kamiński, 2022
 # Codes for chapter 7
 # Code for section 7.1
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
 data = (set1=(x=aq[:, 1], y=aq[:, 2]),
        set2=(x=aq[:, 3], y=aq[:, 4]),
        set3=(x=aq[:, 5], y=aq[:, 6]),
        set4=(x=aq[:, 7], y=aq[:, 8]));
 using DataFrames
 # Code for listing 7.1
 # Build a data frame from the matrix `aq`, giving the column names as strings
 aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
 # The same constructor call with the column names given as symbols
 DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
 # Code for creating DataFrame with automatic column names
 DataFrame(aq, :auto)
 # Codes for creating DataFrame from vector of vectors
 aq_vec = collect(eachcol(aq))
 DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
 DataFrame(aq_vec, :auto)
 # Codes for section 7.1.2
 data.set1.x
 DataFrame(x1=data.set1.x, y1=data.set1.y,
          x2=data.set2.x, y2=data.set2.y,
          x3=data.set3.x, y3=data.set3.y,
          x4=data.set4.x, y4=data.set4.y)
 DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
          :x2 => data.set2.x, :y2 => data.set2.y,
          :x3 => data.set3.x, :y3 => data.set3.y,
          :x4 => data.set4.x, :y4 => data.set4.y)
 DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
           :x2 => data.set2.x, :y2 => data.set2.y,
           :x3 => data.set3.x, :y3 => data.set3.y,
           :x4 => data.set4.x, :y4 => data.set4.y]);
 [(i, v) for i in 1:4 for v in [:x, :y]]
 [string(v, i) for i in 1:4 for v in [:x, :y]]
 [string(v, i) => getproperty(data[i], v)
        for i in 1:4 for v in [:x, :y]]
 DataFrame([string(v, i) => getproperty(data[i], v)
           for i in 1:4 for v in [:x, :y]]);
 data_dict = Dict([string(v, i) => getproperty(data[i], v)
                         for i in 1:4 for v in [:x, :y]])
 collect(data_dict)
 DataFrame(data_dict)
 df1 = DataFrame(x1=data.set1.x)
 df1.x1 === data.set1.x
 df2 = DataFrame(x1=data.set1.x; copycols=false)
 df2.x1 === data.set1.x
 df = DataFrame(x=1:3, y=1)
 df.x
 DataFrame(x=[1], y=[1, 2, 3])
 # Codes for section 7.1.3
 data.set1
 DataFrame(data.set1)
 DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
 data
 # Code for listing 7.2
 aq2 = DataFrame(data)
 # Codes for listing 7.3
 data_dfs = map(DataFrame, data)
 # Codes for vertical concatenation examples
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id")
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id"=>string.("set", 1:4))
 reduce(vcat, collect(data_dfs);
       source="source_id"=>string.("set", 1:4))
 # Code for listing 7.4
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, c=24:26)
 vcat(df1, df2)
 vcat(df1, df2; cols=:union)
 # Code for listing 7.5
 df_agg = DataFrame()
 append!(df_agg, data_dfs.set1)
 append!(df_agg, data_dfs.set2)
 # Code for appending tables to a data frame
 df_agg = DataFrame()
 append!(df_agg, data.set1)
 append!(df_agg, data.set2)
 # Code for promote keyword argument
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, b=[14, missing, 16])
 append!(df1, df2)
 append!(df1, df2; promote=true)
 # Code for section 7.2.3
 df = DataFrame()
 push!(df, (a=1, b=2))
 push!(df, (a=3, b=4))
 df = DataFrame(a=Int[], b=Int[])
 push!(df, [1, 2])
 push!(df, [3, 4])
 # Advance a two-dimensional random walk by one step.
 # One of the four unit moves along the x or y axis is drawn at random and
 # added to the `x` and `y` properties of `current`; the new position is
 # returned as a named tuple `(x=..., y=...)`.
 function sim_step(current)
    move = rand(((1,0), (-1,0), (0,1), (0,-1)))
    new_x = current.x + first(move)
    new_y = current.y + last(move)
    return (x=new_x, y=new_y)
 end
 using BenchmarkTools
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
 dx, dy = (10, 20)
 dx
 dy
 using FreqTables
 using Random
 Random.seed!(1234);
 proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
 using Random
 Random.seed!(6);
 walk = DataFrame(x=0, y=0)
 for _ in 1:10
    current = walk[end, :]
    push!(walk, sim_step(current))
 end
 walk
 plot(walk.x, walk.y;
     legend=false,
     series_annotations=1:11,
     xticks=range(extrema(walk.x)...),
     yticks=range(extrema(walk.y)...))
 extrema(walk.y)
 range(1, 5)
 (3/4)^9
 # Code for listing 7.6
 # Simulate a ten-step random walk starting at (0, 0) and report whether it
 # visited every position only once.
 # Returns `true` when dropping duplicate rows from the recorded path leaves
 # the number of rows unchanged, and `false` otherwise.
 function walk_unique()
    path = DataFrame(x=0, y=0)
    for step in 1:10
        # the next position is derived from the most recently recorded row
        push!(path, sim_step(path[end, :]))
    end
    visited = nrow(path)
    distinct = nrow(unique(path))
    return distinct == visited
 end
 Random.seed!(2);
 proptable([walk_unique() for _ in 1:10^5])
 # Code for a note on conversion
 x = [1.5]
 x[1] = 1
 x
 # Code from section 7.3.1
 Matrix(walk)
 Matrix{Any}(walk)
 Matrix{String}(walk)
 plot(walk)
 plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
 # Code from section 7.3.2
 Tables.columntable(walk)
 using BenchmarkTools
 # Sum the values stored in the `x` property of `table`, starting from the
 # integer 0. The column is fetched and looped over inside this one function;
 # the surrounding script times it with @btime and inspects it with
 # @code_warntype for both a data frame and a column table.
 function mysum(table)
    total = 0
    for value in table.x
        total += value
    end
    return total
 end
 df = DataFrame(x=1:1_000_000);
@btime mysum($df)
 tab = Tables.columntable(df);
@btime mysum($tab)
@code_warntype mysum(df)
@code_warntype mysum(tab)
 typeof(tab)
 # Kernel: add up the elements of the iterable `x`, starting from the integer 0.
 function barrier_mysum2(x)
    total = 0
    for elem in x
        total += elem
    end
    return total
 end
 # Entry point: extract the `x` column from `table` and hand it to the kernel,
 # so the summation loop runs in a function that receives the column itself.
 function mysum2(table)
    return barrier_mysum2(table.x)
 end
@btime mysum2($df)
 df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
 unique(df)
 tab = Tables.columntable(df)
 unique(tab)
 # Code from section 7.3.3
 Tables.rowtable(walk)
 nti = Tables.namedtupleiterator(walk)
 for v in nti
    println(v)
 end
 er = eachrow(walk)
 er[1]
 er[end]
 ec = eachcol(walk)
 ec[1]
 ec[end]
 identity.(eachcol(walk))
 df = DataFrame(x=1:2, b=["a", "b"])
 identity.(eachcol(df))
--- a/ch10.jl
+++ b/ch10.jl
@@ -0,0 +1,284 @@
 # Bogumił Kamiński, 2022
 # Codes for chapter 8
 # Codes for section 8.1
 # Code for listing 8.1
 import Downloads
 using SHA
 git_zip = "git_web_ml.zip"
 if !isfile(git_zip)
    Downloads.download("https://snap.stanford.edu/data/" *
                       "git_web_ml.zip",
                       git_zip)
 end
 isfile(git_zip)
 open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
                          0xc4, 0x60, 0xdc, 0x4c,
                          0x7b, 0xf8, 0x93, 0x57,
                          0xb1, 0xfe, 0xc0, 0x20,
                          0xf4, 0x5e, 0x2e, 0xce,
                          0xba, 0xb8, 0x1d, 0x13,
                          0x1d, 0x07, 0x3b, 0x10,
                          0xe2, 0x8e, 0xc0, 0x31]
 # Code for opening a zip archive
 import ZipFile
 git_archive = ZipFile.Reader(git_zip)
 # Code for listing 8.2
 # Load one CSV file stored inside an opened zip archive into a DataFrame.
 #
 # `filename` must match the `name` of exactly one entry in `archive.files`;
 # `only` throws if there is no match or more than one.
 function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
    matches = findall(entry -> entry.name == filename, archive.files)
    entry = archive.files[only(matches)]
    # read the whole entry and let CSV.jl parse the result
    contents = read(entry)
    return CSV.read(contents, DataFrame)
 end
 # Code for working with zip archive
 git_archive.files
 git_archive.files[2].name
 findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
 findall(x -> x.name == "", git_archive.files)
 only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
 only(findall(x -> x.name == "", git_archive.files))
 # Code for listing 8.3
 using CSV
 using DataFrames
 edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
 classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
 close(git_archive)
 summary(edges_df)
 describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
 summary(classes_df)
 describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
 # Code for updating data frame columns using broadcasting
 edges_df .+= 1
 classes_df.id .+= 1
 # Code for examples of data frame broadcasting
 df = DataFrame(a=1:3, b=[4, missing, 5])
 df .^ 2
 coalesce.(df, 0)
 df .+ [10, 11, 12]
 # Code for checking the order of :id column in a data frame
 classes_df.id == axes(classes_df, 1)
 # Code for the difference between ! and : in broadcasting assignment
 df = DataFrame(a=1:3, b=1:3)
 df[!, :a] .= "x"
 df[:, :b] .= "x"
 df
 # Code for the difference between ! and : in assignment
 df = DataFrame(a=1:3, b=1:3, c=1:3)
 df[!, :a] = ["x", "y", "z"]
 df[:, :b] = ["x", "y", "z"]
 df[:, :c] = [11, 12, 13]
 df
 # Codes for section 8.2
 # Code from listing 8.4
 using Graphs
 # Create the graph, sized by the number of rows of classes_df, and then add
 # one edge per row of edges_df; each row is unpacked into its two endpoints.
 gh = SimpleGraph(nrow(classes_df))
 for (from, to) in eachrow(edges_df)
    add_edge!(gh, from, to)
 end
 gh
 ne(gh)
 nv(gh)
 # Code for iterator destructuring in iteration specification
 mat = [1 2; 3 4; 5 6]
 for (x1, x2) in eachrow(mat)
    @show x1, x2
 end
 # Code for getting degrees of nodes in the graph
 degree(gh)
 # Code for adding a column to a data frame
 classes_df.deg = degree(gh)
 # Code for the difference between ! and : when adding a column
 df = DataFrame()
 x = [1, 2, 3]
 df[!, :x1] = x
 df[:, :x2] = x
 df
 df.x1 === x
 df.x2 === x
 df.x2 == x
 # Code for creating a column using broadcasting
 df.x3 .= 1
 df
 # Code for edge iterator of a graph
 edges(gh)
 e1 = first(edges(gh))
 dump(e1)
 e1.src
 e1.dst
 # Code for listing 8.5
 # Count, for every node, how many of its neighbors have class 1 (machine
 # learning) and how many have any other class (web).
 #
 # `gh` must support `edges(gh)` yielding objects with `src` and `dst` fields;
 # `class` is indexed by node number. Returns the tuple `(deg_ml, deg_web)` of
 # two integer vectors, each of length `length(class)`.
 function deg_class(gh, class)
    n = length(class)
    deg_ml = zeros(Int, n)
    deg_web = zeros(Int, n)
    for e in edges(gh)
        # every edge is counted once for each of its two endpoints
        for (node, neighbor) in ((e.src, e.dst), (e.dst, e.src))
            counter = class[neighbor] == 1 ? deg_ml : deg_web
            counter[node] += 1
        end
    end
    return (deg_ml, deg_web)
 end
 # Code for computing machine learning and web neighbors for gh graph
 classes_df.deg_ml, classes_df.deg_web =
 deg_class(gh, classes_df.ml_target)
 # Code for checking type stability of deg_class function
@time deg_class(gh, classes_df.ml_target);
@code_warntype deg_class(gh, classes_df.ml_target)
 # Code for checking the classes_df summary statistics
 describe(classes_df, :min, :max, :mean, :std)
 # Code for average degree of node in the graph
 2 * ne(gh) / nv(gh)
 # Code for checking correctness of computations
 classes_df.deg_ml + classes_df.deg_web == classes_df.deg
 # Code for showing that DataFrames.jl checks consistency of stored objects
 df = DataFrame(a=1, b=11)
 push!(df.a, 2)
 df
 # Codes for section 8.3
 # Code for computing groupwise means of columns
 using Statistics
 for type in [0, 1], col in ["deg_ml", "deg_web"]
    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
 end
 gdf = groupby(classes_df, :ml_target)
 combine(gdf,
        :deg_ml => mean => :mean_deg_ml,
        :deg_web => mean => :mean_deg_web)
 using DataFramesMeta
@combine(gdf,
         :mean_deg_ml = mean(:deg_ml),
         :mean_deg_web = mean(:deg_web))
 # Code for simple plotting of relationship between developer degree and type
 using Plots
 scatter(classes_df.deg_ml, classes_df.deg_web;
        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
        xlabel="degree ml", ylabel="degree web", labels=false)
 # Code for aggregation of degree data
 agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
 # Code for comparing how Julia parses expressions
 :ml_target => (x -> 1 - mean(x)) => :web_mean
 :ml_target => x -> 1 - mean(x) => :web_mean
 # Code for aggregation using DataFramesMeta.jl
@combine(groupby(classes_df, [:deg_ml, :deg_web]),
         :web_mean = 1 - mean(:ml_target))
 # Code for getting summary information about the aggregated data frame
 describe(agg_df)
 # Code for log1p function
 log1p(0)
 # Code for listing 8.6
 # Build axis ticks for a plot whose coordinates are log1p-transformed.
 #
 # The tick labels are 0 followed by the powers of two 2^0, 2^1, ..., 2^p,
 # where p is log2(maxv) rounded to the nearest integer. Returns a tuple
 # `(positions, labels)` in which the positions are `log1p` of the labels.
 function gen_ticks(maxv)
    top_exp = round(Int, log2(maxv))
    labels = vcat(0, [2^p for p in 0:top_exp])
    positions = map(log1p, labels)
    return (positions, labels)
 end
 # Return log1p(x) shifted by a uniform random offset drawn from [-0.05, 0.05),
 # so that overlapping points are spread out when plotted.
 function log1pjitter(x)
    lower = log1p(x) - 0.05
    return lower + rand() / 10
 end
 using Random
 Random.seed!(1234);
 scatter(log1pjitter.(agg_df.deg_ml),
        log1pjitter.(agg_df.deg_web);
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0, markeralpha=0.8,
        legend=:topleft, labels = "fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))
 # Code for fitting logistic regression model
 using GLM
 glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
 # Code for inspecting @formula result
@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
 # Code for inserting columns to a data frame
 df = DataFrame(x=1:3)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :z => 1)
 insertcols!(df, 1, :a => 0)
 insertcols!(df, :x, :pre_x => 2)
 insertcols!(df, :x, :post_x => 3, after=true)
--- a/chXXX_client.jl
+++ b/chXXX_client.jl
@@ -0,0 +1,18 @@
 using HTTP
 using JSON3
 using DataFrames
 using Plots
 # One request per row: K runs over 30, 32, ..., 80 and every request carries
 # the same max_time value.
 df = DataFrame(K=30:2:80, max_time=0.25)
 # POST each (K, max_time) pair as JSON to the local server and keep the parsed
 # JSON response; K and the request time are printed as progress output.
 df.data = map(df.K, df.max_time) do K, max_time
    @show K
    @time req = HTTP.request("POST", "http://127.0.0.1:8000",
                            ["Content-Type" => "application/json"],
                            JSON3.write((;K, max_time)))
    return JSON3.read(req.body)
 end
 # Validate the responses with an explicit check rather than @assert, which is
 # meant for internal invariants and may be disabled; report the failing K values.
 statuses = getproperty.(df.data, :status)
 all(==("OK"), statuses) ||
    error("server did not return status OK for K = ", df.K[statuses .!= "OK"])
 # Expand the fields of each response's `value` object into separate columns.
 df2 = select(df, :K, :data => ByRow(x -> x.value) => AsTable)
 plot(plot(df2.K, df2.mv; legend=false, xlabel="K", ylabel="expected value"),
     plot(df2.K, df2.zero; legend=false, xlabel="K", ylabel="probability of zero"))
--- a/chXXX_server.jl
+++ b/chXXX_server.jl
@@ -0,0 +1,45 @@
 using Genie
 using Statistics
 using ThreadsX
 # Draw one simulated discounted payoff from a single random price path.
 #
 # The path starts at `X0` and takes `m` steps of length `T / m`; each step
 # multiplies the price by exp((r - sd^2/2)*dt + sd*sqrt(dt)*z) with z drawn
 # from `randn()`. The payoff is max(average - K, 0), where the average is
 # taken over the m + 1 prices on the path (including `X0`), discounted by
 # exp(-r*T). The result is always converted to Float64.
 function v_asian_sample(T, X0, K, r, sd, m)::Float64
    dt = T / m
    price = X0
    total = price
    for _ in 1:m
        growth = exp((r-sd^2/2)*dt + sd*sqrt(dt)*randn())
        price *= growth
        total += price
    end
    average = total / (m + 1)
    return exp(-r*T) * max(average - K, 0)
 end
 # Estimate the option value by Monte Carlo simulation within a time budget.
 #
 # Batches of 10_000 samples of `v_asian_sample(T, X0, K, r, sd, m)` are
 # generated with `ThreadsX.map` for as long as the elapsed wall-clock time is
 # below `max_time` seconds (checked before each batch).
 #
 # Returns a named tuple with fields:
 # `n` (number of samples), `mv` (sample mean), `sdv` (sample standard
 # deviation), `lo95` and `hi95` (mv -/+ 1.96 * sdv / sqrt(n)), and `zero`
 # (fraction of samples equal to 0).
 function v_asian_value(T, X0, K, r, sd, m, max_time)
    samples = Float64[]
    t0 = time()
    while time() - t0 < max_time
        batch = ThreadsX.map(_ -> v_asian_sample(T, X0, K, r, sd, m), 1:10_000)
        append!(samples, batch)
    end
    n = length(samples)
    mv = mean(samples)
    sdv = std(samples)
    halfwidth = 1.96 * sdv / sqrt(n)
    lo95 = mv - halfwidth
    hi95 = mv + halfwidth
    zero_share = mean(==(0), samples)
    return (; n, mv, sdv, lo95, hi95, zero=zero_share)
 end
 Genie.config.run_as_server = true
 # POST "/" handler: reads "K" and "max_time" from the JSON payload, runs the
 # valuation with the remaining parameters fixed (T=1.0, X0=50.0, r=0.05,
 # sd=0.3, m=200) and returns a JSON object with `status` and `value` fields.
 # Any failure (missing or non-numeric fields, errors during computation or
 # rendering) yields status "ERROR" with an empty value.
 Genie.Router.route("/", method=POST) do
  message = Genie.Requests.jsonpayload()
  return try
      K = float(message["K"])
      max_time = float(message["max_time"])
      value = v_asian_value(1.0, 50.0, K, 0.05, 0.3, 200, max_time)
      Genie.Renderer.Json.json((status="OK", value=value))
  catch err
      # The client still gets the generic ERROR payload, but the cause is
      # logged on the server instead of being silently discarded.
      @error "request to / failed" exception=(err, catch_backtrace())
      Genie.Renderer.Json.json((status="ERROR", value=""))
  end
 end
 Genie.startup()