2022-01-27 15:28:42 +01:00
|
|
|
# Bogumił Kamiński, 2022
|
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for chapter 6
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for section 6.1
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
import Downloads

# Fetch the compressed Lichess puzzle archive only when it is not already
# in the working directory. Downloads.download is the supported
# replacement for the deprecated Base.download.
if isfile("puzzles.csv.bz2")
    @info "file already present"
else
    @info "fetching file"
    Downloads.download("https://database.lichess.org/" *
                       "lichess_db_puzzle.csv.bz2",
                       "puzzles.csv.bz2")
end
|
2022-02-08 20:58:33 +01:00
|
|
|
|
|
|
|
using CodecBzip2
|
|
|
|
# Pull the whole archive into memory and decompress it in one step.
compressed = open(read, "puzzles.csv.bz2")
plain = transcode(Bzip2Decompressor, compressed)

# Write the CSV file: a header line with the column names first, followed
# by the decompressed bytes unchanged.
open("puzzles.csv", "w") do out
    print(out, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
               "Popularity,NbPlays,Themes,GameUrl", '\n')
    write(out, plain)
end
|
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
readlines("puzzles.csv")
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for section 6.2
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
using CSV
|
|
|
|
using DataFrames
|
|
|
|
puzzles = CSV.read("puzzles.csv", DataFrame);
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# `plain` is the raw decompressed data; the header line is only added when
# puzzles.csv is written, so the column names are passed explicitly here.
# Without `header`, the first puzzle record would be taken as column names.
CSV.read(plain, DataFrame;
         header=["PuzzleId", "FEN", "Moves", "Rating", "RatingDeviation",
                 "Popularity", "NbPlays", "Themes", "GameUrl"]);
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Rebind both buffers to `nothing` so the compressed and decompressed bytes
# are no longer referenced from these globals.
compressed = plain = nothing
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for listing 6.1
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for listing 6.2
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
describe(puzzles)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for getting basic information about a data frame
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
ncol(puzzles)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
nrow(puzzles)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
names(puzzles)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for section 6.3
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles.Rating
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
using BenchmarkTools
|
|
|
|
@benchmark $puzzles.Rating
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles.Rating == copy(puzzles.Rating)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles.Rating === copy(puzzles.Rating)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles.Rating === puzzles.Rating
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
copy(puzzles.Rating) === copy(puzzles.Rating)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles."Rating"
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
col = "Rating"
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Schematic form of data frame indexing (placeholder names, not runnable):
# data_frame_name[selected_rows, selected_columns]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[:, "Rating"]
|
|
|
|
puzzles[:, :Rating]
|
|
|
|
puzzles[:, 4]
|
|
|
|
puzzles[:, col]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
columnindex(puzzles, "Rating")
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
columnindex(puzzles, "Some fancy column name")
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
hasproperty(puzzles, "Rating")
|
|
|
|
hasproperty(puzzles, "Some fancy column name")
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@benchmark $puzzles[:, :Rating]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[!, "Rating"]
|
|
|
|
puzzles[!, :Rating]
|
|
|
|
puzzles[!, 4]
|
|
|
|
puzzles[!, col]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
using Plots
|
|
|
|
# One histogram per numeric column, combined into a single figure.
plot(
    histogram(puzzles.Rating; label="Rating"),
    histogram(puzzles.RatingDeviation; label="RatingDeviation"),
    histogram(puzzles.Popularity; label="Popularity"),
    histogram(puzzles.NbPlays; label="NbPlays"),
)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Same figure as the explicit version, with the histograms generated from
# the list of column names and splatted into `plot`.
plot((histogram(puzzles[!, name]; label=name)
      for name in ("Rating", "RatingDeviation", "Popularity", "NbPlays"))...)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for section 6.4
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
using Statistics
|
|
|
|
plays_lo = median(puzzles.NbPlays)
|
|
|
|
puzzles.NbPlays .> plays_lo
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Non-broadcast comparison of the whole column against a scalar, kept as a
# counter-example to the `.>` form; expected to throw a MethodError.
puzzles.NbPlays > plays_lo
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
rating_lo = 1500
|
|
|
|
rating_hi = quantile(puzzles.Rating, 0.99)
|
|
|
|
rating_lo .< puzzles.Rating .< rating_hi
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
row_selector = (puzzles.NbPlays .> plays_lo) .&&
|
|
|
|
(rating_lo .< puzzles.Rating .< rating_hi)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
sum(row_selector)
|
|
|
|
count(row_selector)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for listing 6.3
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for plotting histograms
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
plot(histogram(good.Rating; label="Rating"),
|
|
|
|
histogram(good.Popularity; label="Popularity"))
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for column selectors
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[1, "Rating"]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[:, "Rating"]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
row1 = puzzles[1, ["Rating", "Popularity"]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
row1["Rating"]
|
|
|
|
row1[:Rating]
|
|
|
|
row1[1]
|
|
|
|
row1.Rating
|
|
|
|
row1."Rating"
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
good[1, "Rating"]
|
|
|
|
good[1, :]
|
|
|
|
good[:, "Rating"]
|
|
|
|
good[:, :]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
names(puzzles, ["Rating", "Popularity"])
|
|
|
|
names(puzzles, [:Rating, :Popularity])
|
|
|
|
names(puzzles, [4, 6])
|
|
|
|
names(puzzles, [false, false, false, true, false, true, false, false, false])
|
|
|
|
names(puzzles, r"Rating")
|
|
|
|
names(puzzles, Not([4, 6]))
|
|
|
|
names(puzzles, Not(r"Rating"))
|
|
|
|
names(puzzles, Between("Rating", "Popularity"))
|
|
|
|
names(puzzles, :)
|
|
|
|
names(puzzles, All())
|
|
|
|
names(puzzles, Cols(r"Rating", "NbPlays"))
|
|
|
|
names(puzzles, Cols(startswith("P")))
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
names(puzzles, startswith("P"))
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
names(puzzles, Real)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
names(puzzles, AbstractString)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[:, names(puzzles, Real)]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for row subsetting
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
df1 = puzzles[:, ["Rating", "Popularity"]];
|
|
|
|
df2 = puzzles[!, ["Rating", "Popularity"]];
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
df1 == df2
|
|
|
|
df1 == puzzles
|
|
|
|
df2 == puzzles
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
df1.Rating === puzzles.Rating
|
|
|
|
df1.Popularity === puzzles.Popularity
|
|
|
|
df2.Rating === puzzles.Rating
|
|
|
|
df2.Popularity === puzzles.Popularity
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@benchmark $puzzles[:, ["Rating", "Popularity"]]
|
|
|
|
@benchmark $puzzles[!, ["Rating", "Popularity"]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
puzzles[1, 1]
|
|
|
|
puzzles[[1], 1]
|
|
|
|
puzzles[1, [1]]
|
|
|
|
puzzles[[1], [1]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for making views
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@view puzzles[1, 1]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@view puzzles[[1], 1]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@view puzzles[1, [1]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@view puzzles[[1], [1]]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
|
|
|
|
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Code for section 6.5
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
describe(good)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
# Build a lookup from each rating value to the row numbers of `good`
# that have that rating.
rating_mapping = Dict{Int, Vector{Int}}()
for (i, rating) in enumerate(good.Rating)
    # get! creates the empty index vector the first time a rating is seen,
    # so each row needs a single dictionary lookup.
    push!(get!(() -> Int[], rating_mapping, rating), i)
end
|
|
|
|
rating_mapping
|
|
|
|
|
|
|
|
good[rating_mapping[2108], :]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
unique(good[rating_mapping[2108], :].Rating)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
using Statistics
|
|
|
|
mean(good[rating_mapping[2108], "Popularity"])
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
ratings = unique(good.Rating)
|
|
|
|
|
|
|
|
# Mean popularity of the puzzles sharing each rating, in the same order as
# the entries of `ratings`.
mean_popularities = [mean(good[rating_mapping[r], "Popularity"])
                     for r in ratings]
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
scatter(ratings, mean_popularities;
|
|
|
|
xlabel="rating", ylabel="mean popularity", legend=false)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
import Loess
|
|
|
|
# Fit a LOESS model of mean popularity against rating, then evaluate it on
# the ratings sorted in ascending order and converted to floats.
model = Loess.loess(ratings, mean_popularities);
ratings_predict = sort!(float.(ratings))
popularity_predict = Loess.predict(model, ratings_predict)
|
2022-01-27 15:28:42 +01:00
|
|
|
|
2022-02-08 20:58:33 +01:00
|
|
|
plot!(ratings_predict, popularity_predict; width=5, color="black")
|