add chapter 6

This commit is contained in:
Bogumił Kamiński 2022-01-17 00:22:06 +01:00
parent 2ca7eb6737
commit 8b39fdb427
4 changed files with 267 additions and 1 deletions

View File

@ -77,6 +77,12 @@ git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.2"
[[deps.CodecBzip2]]
deps = ["Bzip2_jll", "Libdl", "TranscodingStreams"]
git-tree-sha1 = "2e62a725210ce3c3c2e1a3080190e7ca491f18d7"
uuid = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
version = "0.7.2"
[[deps.CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
@ -508,6 +514,12 @@ version = "2.36.0+0"
deps = ["Libdl", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[deps.Loess]]
deps = ["Distances", "LinearAlgebra", "Statistics"]
git-tree-sha1 = "46efcea75c890e5d820e670516dc156689851722"
uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612"
version = "0.5.4"
[[deps.LogExpFunctions]]
deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "e5718a00af0ab9756305a0392832c8952c7426c1"

View File

@ -1,6 +1,7 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
@ -8,6 +9,7 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Loess = "4345ca2d-374a-55d4-8d30-97f9976e7612"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"

View File

@ -38,4 +38,8 @@ To work with codes from some given chapter:
* execute the commands sequentially as they appear in the file;
the codes were prepared in a way that you do not need to restart Julia
when working with material from a single chapter, unless it is explicitly
written in the instructions to restart Julia (some of the codes require this).
written in the instructions to restart Julia (some of the codes require this);
* each piece of code is preceded by a comment that lets you locate the part
of the book where it is used; a blank line between consecutive pieces of
code means that in the book they are separated by text explaining what
the code does.

248
ch06.jl Normal file
View File

@ -0,0 +1,248 @@
# Bogumił Kamiński, 2022
# Codes for chapter 6
# Code for section 6.1
# Fetch the compressed puzzle database once; the download is skipped when
# puzzles.csv.bz2 is already present in the working directory.
# Downloads.download is called directly because Base.download is deprecated
# since Julia 1.6 and is only a thin wrapper around it.
import Downloads
if isfile("puzzles.csv.bz2")
    @info "file already present"
else
    @info "fetching file"
    Downloads.download("https://database.lichess.org/" *
                       "lichess_db_puzzle.csv.bz2",
                       "puzzles.csv.bz2")
end
# Decompress the whole archive in memory: `compressed` holds the bytes of the
# .bz2 file and `plain` holds the decompressed bytes.
using CodecBzip2
compressed = read("puzzles.csv.bz2")
plain = transcode(Bzip2Decompressor, compressed)
# Write an uncompressed CSV file: first a header line with the column names,
# then the decompressed data.
open("puzzles.csv", "w") do io
    println(io, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
                "Popularity,NbPlays,Themes,GameUrl")
    write(io, plain)
end
# Read the file back as a vector of lines to inspect its contents.
readlines("puzzles.csv")
# Code for section 6.2
using CSV
using DataFrames
# Parse the CSV file written to disk; the trailing `;` suppresses printing of
# the result in the REPL.
puzzles = CSV.read("puzzles.csv", DataFrame);
# CSV.read also accepts in-memory bytes. When puzzles.csv was written, a header
# line was put in front of `plain`, so `plain` itself is assumed to carry no
# header. The column names are therefore passed explicitly; otherwise the
# first puzzle record would be consumed as the header.
puzzles2 = CSV.read(plain, DataFrame;
                    header=["PuzzleId", "FEN", "Moves",
                            "Rating", "RatingDeviation",
                            "Popularity", "NbPlays",
                            "Themes", "GameUrl"]);
# Reading from disk and reading from memory are expected to give equal results.
puzzles == puzzles2
# Drop the references to the large byte buffers so they can be garbage collected.
compressed = nothing
plain = nothing
# Code for listing 6.1
# Evaluating the variable displays the data frame in the REPL.
puzzles
# Code for listing 6.2
# Per-column summary of the data frame.
describe(puzzles)
# Code for getting basic information about a data frame
# Number of columns, number of rows, and the column names (as strings).
ncol(puzzles)
nrow(puzzles)
names(puzzles)
# Code for section 6.3
# Property access returns the column as it is stored in the data frame.
puzzles.Rating
using BenchmarkTools
@benchmark $puzzles.Rating
# `==` compares contents, `===` compares identity: a copy has equal contents
# but is a different object, while repeated property access gives the very
# same vector.
puzzles.Rating == copy(puzzles.Rating)
puzzles.Rating === copy(puzzles.Rating)
puzzles.Rating === puzzles.Rating
copy(puzzles.Rating) === copy(puzzles.Rating)
# The column name can also be given as a string literal after the dot.
puzzles."Rating"
col = "Rating"
# General form of data frame indexing. This is a syntax template only; it is
# kept as a comment because the names in it are not defined and evaluating it
# would raise an UndefVarError:
#     data_frame_name[selected_rows, selected_columns]
# `:` as the row selector returns a copy of the column; the column can be
# chosen by string, Symbol, position, or a variable holding its name.
puzzles[:, "Rating"]
puzzles[:, :Rating]
puzzles[:, 4]
puzzles[:, col]
# Look up a column position and test whether a column exists.
columnindex(puzzles, "Rating")
columnindex(puzzles, "Some fancy column name")
hasproperty(puzzles, "Rating")
hasproperty(puzzles, "Some fancy column name")
@benchmark $puzzles[:, :Rating]
# `!` as the row selector returns the stored column without copying it.
puzzles[!, "Rating"]
puzzles[!, :Rating]
puzzles[!, 4]
puzzles[!, col]
# Plots must be loaded before the first call to plot/histogram.
using Plots
# Histograms of the four numeric columns, written out explicitly ...
plot(histogram(puzzles.Rating, label="Rating"),
     histogram(puzzles.RatingDeviation, label="RatingDeviation"),
     histogram(puzzles.Popularity, label="Popularity"),
     histogram(puzzles.NbPlays, label="NbPlays"))
# ... and the same figure built with a comprehension over the column names.
plot([histogram(puzzles[!, col], label=col) for
      col in ["Rating", "RatingDeviation",
              "Popularity", "NbPlays"]]...)
# Code for section 6.4
using Statistics
# Median number of plays; used below as the lower threshold for NbPlays.
plays_lo = median(puzzles.NbPlays)
# Broadcast comparison: produces one Bool per row.
puzzles.NbPlays .> plays_lo
# NOTE(review): without the broadcasting dot a vector is compared with a
# scalar, which is expected to throw; presumably kept on purpose to show the
# error - confirm against the book text.
puzzles.NbPlays > plays_lo
# Bounds for Rating: strictly above 1500 and strictly below the 0.99 quantile.
rating_lo = 1500
rating_hi = quantile(puzzles.Rating, 0.99)
rating_lo .< puzzles.Rating .< rating_hi
# Combine both conditions element-wise into a single Boolean row mask.
row_selector = (puzzles.NbPlays .> plays_lo) .&&
(rating_lo .< puzzles.Rating .< rating_hi)
# Two ways of counting the rows for which the mask is true.
sum(row_selector)
count(row_selector)
# Code for listing 6.3
# Keep only the rows picked by row_selector and the two columns of interest.
good = puzzles[row_selector, ["Rating", "Popularity"]]
# Code for plotting histograms
# NOTE(review): plot and histogram are presumably provided by Plots; make sure
# it has been loaded before running this part.
plot(histogram(good.Rating, label="Rating"),
histogram(good.Popularity, label="Popularity"))
# Code for column selectors
# A single row together with a single column selects one cell.
puzzles[1, "Rating"]
# All rows of a single column give a vector.
puzzles[:, "Rating"]
# A single row together with a list of columns gives one row of the data frame.
row1 = puzzles[1, ["Rating", "Popularity"]]
# The row can be indexed by string, Symbol or position, and its fields are
# also available through property access.
row1["Rating"]
row1[:Rating]
row1[1]
row1.Rating
row1."Rating"
# Re-create `good` and index it with different row/column selector combinations.
good = puzzles[row_selector, ["Rating", "Popularity"]]
good[1, "Rating"]
good[1, :]
good[:, "Rating"]
good[:, :]
# names(df, selector) returns the names of the columns matched by a selector.
# Explicit lists: strings, Symbols, positions, or a Bool mask with one entry
# per column.
names(puzzles, ["Rating", "Popularity"])
names(puzzles, [:Rating, :Popularity])
names(puzzles, [4, 6])
names(puzzles, [false, false, false, true, false, true, false, false, false])
# Regular expression matched against the column names.
names(puzzles, r"Rating")
# Not(...) negates a selector.
names(puzzles, Not([4, 6]))
names(puzzles, Not(r"Rating"))
# A contiguous range of columns given by its first and last column.
names(puzzles, Between("Rating", "Popularity"))
# All columns.
names(puzzles, :)
names(puzzles, All())
# Cols(...) combines several selectors, or takes a predicate on column names.
names(puzzles, Cols(r"Rating", "NbPlays"))
names(puzzles, Cols(startswith("P")))
# A predicate on the column name can also be passed directly.
names(puzzles, startswith("P"))
# A type selects the columns whose element type is a subtype of it.
names(puzzles, Real)
names(puzzles, AbstractString)
# The resulting names can be used as a column selector in indexing.
puzzles[:, names(puzzles, Real)]
# Code for row subsetting
# `:` as the row selector copies the selected columns into the new data frame,
# `!` reuses the stored columns without copying.
df1 = puzzles[:, ["Rating", "Popularity"]];
df2 = puzzles[!, ["Rating", "Popularity"]];
# Compare the contents of both results with each other and with the source
# data frame, which has more columns.
df1 == df2
df1 == puzzles
df2 == puzzles
# Identity checks show which of the two results shares columns with puzzles.
df1.Rating === puzzles.Rating
df1.Popularity === puzzles.Popularity
df2.Rating === puzzles.Rating
df2.Popularity === puzzles.Popularity
# Timing of the copying and of the non-copying variant.
@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
# The kind of result depends on whether each selector is a scalar or a
# collection: cell value, vector, single row, data frame.
puzzles[1, 1]
puzzles[[1], 1]
puzzles[1, [1]]
puzzles[[1], [1]]
# Code for making views
# The same four selector combinations as in row subsetting, but wrapped in
# @view so that a view into puzzles is created instead of new data.
@view puzzles[1, 1]
@view puzzles[[1], 1]
@view puzzles[1, [1]]
@view puzzles[[1], [1]]
# Compare the cost of materializing the selected subset with creating a view
# of it.
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
# Indices of the parent data frame that the view refers to.
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
# Code for section 6.5
describe(good)
# Build a lookup table: rating value => row numbers of `good` having that
# rating. get! returns the vector stored under the key, inserting an empty
# one first when the key is not present yet.
rating_mapping = Dict{Int, Vector{Int}}()
for (row, value) in enumerate(good.Rating)
    push!(get!(Vector{Int}, rating_mapping, value), row)
end
rating_mapping
# Inspect the group of rows sharing the rating 2108.
good[rating_mapping[2108], :]
unique(good[rating_mapping[2108], :].Rating)
using Statistics
mean(good[rating_mapping[2108], "Popularity"])
# Mean popularity for every distinct rating, in the order of `ratings`.
ratings = unique(good.Rating)
mean_popularities = [mean(good[rating_mapping[r], "Popularity"]) for r in ratings]
using Plots
scatter(ratings, mean_popularities;
        xlabel="rating", ylabel="mean popularity", legend=false)
# Fit a LOESS model and overlay its predictions, evaluated at the sorted
# ratings, on the scatter plot.
import Loess
model = Loess.loess(ratings, mean_popularities);
ratings_predict = map(float, sort(ratings))
popularity_predict = Loess.predict(model, ratings_predict)
plot!(ratings_predict, popularity_predict; width=5, color="black")