add chapter 6

2022-01-17 00:22:06 +01:00 · 2022-01-17 00:22:06 +01:00 · 8b39fdb427
commit 8b39fdb427
parent 2ca7eb6737
4 changed files with 267 additions and 1 deletions
--- a/Manifest.toml
+++ b/Manifest.toml
@ -77,6 +77,12 @@ git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
 version = "0.1.2"

+[[deps.CodecBzip2]]
+deps = ["Bzip2_jll", "Libdl", "TranscodingStreams"]
+git-tree-sha1 = "2e62a725210ce3c3c2e1a3080190e7ca491f18d7"
+uuid = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
+version = "0.7.2"
+
 [[deps.CodecZlib]]
 deps = ["TranscodingStreams", "Zlib_jll"]
 git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
@ -508,6 +514,12 @@ version = "2.36.0+0"
 deps = ["Libdl", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

+[[deps.Loess]]
+deps = ["Distances", "LinearAlgebra", "Statistics"]
+git-tree-sha1 = "46efcea75c890e5d820e670516dc156689851722"
+uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612"
+version = "0.5.4"
+
 [[deps.LogExpFunctions]]
 deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
 git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"
--- a/Project.toml
+++ b/Project.toml
@ -1,6 +1,7 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
@ -8,6 +9,7 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
 InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+Loess = "4345ca2d-374a-55d4-8d30-97f9976e7612"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
--- a/README.md
+++ b/README.md
@ -38,4 +38,8 @@ To work with codes from some given chapter:
 * execute the commands sequentially as they appear in the file;
  the codes were prepared in a way that you do not need to restart Julia
  when working with material from a single chapter, unless it is explicitly
-  written in the instructions to restart Julia (some of the codes require this).
+  written in the instructions to restart Julia (some of the codes require this);
+* each code snippet is preceded by a comment that lets you locate the part
+  of the book where it is used; a blank line between consecutive code
+  sections means that in the book these snippets are separated by text
+  explaining what the code does.
--- a/ch06.jl
+++ b/ch06.jl
@ -0,0 +1,248 @@
+# Bogumił Kamiński, 2022
+
+# Codes for chapter 6
+
+# Code for section 6.1
+
+# Fetch the compressed puzzle database only when it is not already on disk.
+# Downloads.download is used because Base.download is deprecated since Julia 1.6.
+import Downloads
+if isfile("puzzles.csv.bz2")
+    @info "file already present"
+else
+    @info "fetching file"
+    Downloads.download("https://database.lichess.org/" *
+                       "lichess_db_puzzle.csv.bz2",
+                       "puzzles.csv.bz2")
+end
+
+using CodecBzip2
+# read the whole archive into memory as bytes and decompress it in one go
+compressed = read("puzzles.csv.bz2")
+plain = transcode(Bzip2Decompressor, compressed)
+
+# the header line is written first, then the decompressed bytes
+# (presumably the upstream file ships without a header row -- confirm)
+open("puzzles.csv", "w") do io
+    println(io, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
+                "Popularity,NbPlays,Themes,GameUrl")
+    write(io, plain)
+end
+
+# load all lines of the written file as a vector of strings
+readlines("puzzles.csv")
+
+# Code for section 6.2
+
+using CSV
+using DataFrames
+# parse the CSV file from disk into a DataFrame; `;` suppresses REPL output
+puzzles = CSV.read("puzzles.csv", DataFrame);
+
+# CSV.read is also given the in-memory decompressed bytes as its source;
+# NOTE(review): `plain` holds the data without the header line written to
+# puzzles.csv, so presumably the first data row becomes the header here
+CSV.read(plain, DataFrame);
+
+# rebind both names to `nothing`, presumably so the large byte buffers
+# can be garbage collected
+compressed = nothing
+plain = nothing
+
+# Code for listing 6.1
+
+puzzles  # display the data frame
+
+# Code for listing 6.2
+
+describe(puzzles)  # per-column summary statistics
+
+# Code for getting basic information about a data frame
+
+ncol(puzzles)  # number of columns
+
+nrow(puzzles)  # number of rows
+
+names(puzzles)  # column names as a vector of strings
+
+# Code for section 6.3
+
+puzzles.Rating  # property access returns the stored column
+
+using BenchmarkTools
+# `$` interpolates the global variable into the benchmarked expression
+@benchmark $puzzles.Rating
+
+# `==` compares contents, `===` tests whether both sides are the same object
+puzzles.Rating == copy(puzzles.Rating)
+
+puzzles.Rating === copy(puzzles.Rating)
+
+puzzles.Rating === puzzles.Rating
+
+copy(puzzles.Rating) === copy(puzzles.Rating)
+
+puzzles."Rating"  # the column name may also be given as a string literal
+
+col = "Rating"
+
+# General indexing pattern (a syntax template, not runnable code, because
+# these placeholder names are not defined anywhere in this file):
+#     data_frame_name[selected_rows, selected_columns]
+
+# the column can be chosen by string, Symbol, position, or a variable;
+# `:` as the row selector returns a copy of the column
+puzzles[:, "Rating"]
+puzzles[:, :Rating]
+puzzles[:, 4]
+puzzles[:, col]
+
+columnindex(puzzles, "Rating")  # position of the column
+
+# a name that is not among the columns written to the header
+columnindex(puzzles, "Some fancy column name")
+
+# check whether a column with the given name exists
+hasproperty(puzzles, "Rating")
+hasproperty(puzzles, "Some fancy column name")
+
+@benchmark $puzzles[:, :Rating]
+
+# same four column selectors as with `:`, but `!` returns the stored column
+# without copying
+puzzles[!, "Rating"]
+puzzles[!, :Rating]
+puzzles[!, 4]
+puzzles[!, col]
+
+# Plots has to be loaded before the first call to plot/histogram
+using Plots
+# four histograms combined as subplots of one figure
+plot(histogram(puzzles.Rating, label="Rating"),
+     histogram(puzzles.RatingDeviation, label="RatingDeviation"),
+     histogram(puzzles.Popularity, label="Popularity"),
+     histogram(puzzles.NbPlays, label="NbPlays"))
+
+# the same figure built with a comprehension splatted into plot
+plot([histogram(puzzles[!, col], label=col) for
+      col in ["Rating", "RatingDeviation",
+              "Popularity", "NbPlays"]]...)
+
+# Code for section 6.4
+
+using Statistics
+plays_lo = median(puzzles.NbPlays)
+# broadcasted comparison: one Bool per row
+puzzles.NbPlays .> plays_lo
+
+# NOTE(review): without the dot this compares a whole vector with a scalar,
+# which is expected to throw -- presumably shown deliberately; confirm
+puzzles.NbPlays > plays_lo
+
+rating_lo = 1500
+rating_hi = quantile(puzzles.Rating, 0.99)  # 99th percentile of Rating
+rating_lo .< puzzles.Rating .< rating_hi
+
+# rows satisfying both conditions; `.&&` requires Julia 1.7 or later
+row_selector = (puzzles.NbPlays .> plays_lo) .&&
+               (rating_lo .< puzzles.Rating .< rating_hi)
+
+# both count the number of `true` entries
+sum(row_selector)
+count(row_selector)
+
+# Code for listing 6.3
+
+# keep the selected rows and only the two listed columns
+good = puzzles[row_selector, ["Rating", "Popularity"]]
+
+# Code for plotting histograms
+
+plot(histogram(good.Rating, label="Rating"),
+     histogram(good.Popularity, label="Popularity"))
+
+# Code for column selectors
+
+puzzles[1, "Rating"]  # single cell
+
+puzzles[:, "Rating"]  # whole column, copied
+
+# an integer row selector with a vector of columns gives a single-row object
+row1 = puzzles[1, ["Rating", "Popularity"]]
+
+# its fields can be read by string, Symbol, position, or property
+row1["Rating"]
+row1[:Rating]
+row1[1]
+row1.Rating
+row1."Rating"
+
+good = puzzles[row_selector, ["Rating", "Popularity"]]
+
+# cell, single row, single column, and full copy of the data frame
+good[1, "Rating"]
+good[1, :]
+good[:, "Rating"]
+good[:, :]
+
+# names(df, selector) lists the column names matched by the selector;
+# the Bool vector below has one entry per column (nine in total)
+names(puzzles, ["Rating", "Popularity"])
+names(puzzles, [:Rating, :Popularity])
+names(puzzles, [4, 6])
+names(puzzles, [false, false, false, true, false, true, false, false, false])
+names(puzzles, r"Rating")
+names(puzzles, Not([4, 6]))
+names(puzzles, Not(r"Rating"))
+names(puzzles, Between("Rating", "Popularity"))
+names(puzzles, :)
+names(puzzles, All())
+names(puzzles, Cols(r"Rating", "NbPlays"))
+names(puzzles, Cols(startswith("P")))
+
+# a predicate on the column name is accepted directly
+names(puzzles, startswith("P"))
+
+# a type selects columns by their element type
+names(puzzles, Real)
+
+names(puzzles, AbstractString)
+
+# the resulting vector of names is itself a valid column selector
+puzzles[:, names(puzzles, Real)]
+
+# Code for row subsetting
+
+# `:` copies the selected columns, `!` reuses the stored ones
+df1 = puzzles[:, ["Rating", "Popularity"]];
+df2 = puzzles[!, ["Rating", "Popularity"]];
+
+# content comparisons between the two subsets and the full data frame
+df1 == df2
+df1 == puzzles
+df2 == puzzles
+
+# identity checks against the columns stored in puzzles
+df1.Rating === puzzles.Rating
+df1.Popularity === puzzles.Popularity
+df2.Rating === puzzles.Rating
+df2.Popularity === puzzles.Popularity
+
+# cost of selecting with a copy versus without one
+@benchmark $puzzles[:, ["Rating", "Popularity"]]
+@benchmark $puzzles[!, ["Rating", "Popularity"]]
+
+# scalar versus one-element vector selectors change the kind of result
+puzzles[1, 1]
+puzzles[[1], 1]
+puzzles[1, [1]]
+puzzles[[1], [1]]
+
+# Code for making views
+
+# the same four selector combinations as for regular indexing, taken as views
+@view puzzles[1, 1]
+
+@view puzzles[[1], 1]
+
+@view puzzles[1, [1]]
+
+@view puzzles[[1], [1]]
+
+# compare the cost of materializing the selection with taking a view of it
+@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
+@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
+
+# indices into the parent data frame that the view refers to
+parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
+
+# Code for section 6.5
+
+describe(good)
+
+# map every rating to the row numbers of `good` in which it occurs;
+# get! inserts an empty vector the first time a rating is seen
+rating_mapping = Dict{Int, Vector{Int}}()
+for (row, r) in enumerate(good.Rating)
+    push!(get!(Vector{Int}, rating_mapping, r), row)
+end
+rating_mapping
+
+good[rating_mapping[2108], :]  # rows whose numbers are stored under key 2108
+
+unique(good[rating_mapping[2108], :].Rating)
+
+using Statistics
+mean(good[rating_mapping[2108], "Popularity"])
+
+ratings = unique(good.Rating)
+
+# for each distinct rating, average Popularity over the rows mapped to it
+mean_popularities = map(ratings) do rating
+    indices = rating_mapping[rating]
+    popularities = good[indices, "Popularity"]
+    return mean(popularities)
+end
+
+using Plots
+scatter(ratings, mean_popularities;
+        xlabel="rating", ylabel="mean popularity", legend=false)
+
+import Loess
+# fit a local regression of mean popularity on rating
+model = Loess.loess(ratings, mean_popularities);
+# sorted, floating-point x values at which the fit is evaluated
+ratings_predict = float.(sort(ratings))
+popularity_predict = Loess.predict(model, ratings_predict)
+
+# draw the fitted curve on top of the existing scatter plot
+plot!(ratings_predict, popularity_predict, width=5, color="black")