JuliaForDataAnalysis/ch09.jl

163 lines
3.4 KiB
Julia
Raw Normal View History

2022-02-08 20:58:33 +01:00
# Bogumił Kamiński, 2022
2022-02-13 11:59:23 +01:00
# Code for chapter 9
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for section 9.1
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
using DataFrames
using CSV
using Plots
# Read puzzles.csv (a relative path) into a DataFrame.
# The trailing semicolon keeps the REPL from printing the table.
puzzles = CSV.read("puzzles.csv", DataFrame);
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
using Statistics
# Median of the NbPlays column; used further down as the play-count threshold.
plays_lo = median(puzzles.NbPlays)
# Broadcasted comparison: one Boolean per row, true where NbPlays exceeds the median.
puzzles.NbPlays .> plays_lo
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# NOTE(review): same comparison as above but with plain `>` instead of `.>`.
# Comparing a whole vector with a scalar this way raises an error, so this line
# presumably exists to illustrate the mistake; it will stop a non-interactive
# run of the script — confirm it is intentional.
puzzles.NbPlays > plays_lo
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Rating bounds: a fixed lower limit and the 99th percentile of Rating as the upper limit.
rating_lo = 1500
rating_hi = quantile(puzzles.Rating, 0.99)
# Chained broadcasted comparison: true for rows whose Rating lies strictly between the bounds.
rating_lo .< puzzles.Rating .< rating_hi
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Boolean mask of the rows to keep: played more often than `plays_lo` and
# rated strictly between `rating_lo` and `rating_hi`. `@.` dots every
# operator in the expression, including `&&` and the chained comparison.
row_selector = @. (puzzles.NbPlays > plays_lo) &&
    (rating_lo < puzzles.Rating < rating_hi)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Two ways to get the number of selected rows: summing the Boolean mask
# and counting its true entries.
sum(row_selector)
count(row_selector)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for listing 9.1
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Keep only the rows flagged by row_selector and only the Rating and Popularity columns.
good = puzzles[row_selector, ["Rating", "Popularity"]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for plotting histograms
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Build one histogram per column (Rating first, then Popularity) and hand
# both to `plot` so they end up in a single figure.
let rating_hist = histogram(good.Rating; label="Rating"),
    popularity_hist = histogram(good.Popularity; label="Popularity")

    plot(rating_hist, popularity_hist)
end
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for column selectors
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Scalar row and scalar column selector: picks the Rating entry of the first row.
puzzles[1, "Rating"]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# All rows (`:`) of the single column Rating.
puzzles[:, "Rating"]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# First row restricted to two columns; note the column selector is a vector here.
row1 = puzzles[1, ["Rating", "Popularity"]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Five ways to read the Rating entry of row1: by string name, by Symbol,
# by position (Rating is the first of the two selected columns),
# by property access, and by property access with a string literal.
row1["Rating"]
row1[:Rating]
row1[1]
row1.Rating
row1."Rating"
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Rebuild `good` with the same expression used in listing 9.1.
good = puzzles[row_selector, ["Rating", "Popularity"]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# The four scalar / colon combinations of row and column selectors:
# one cell, one whole row, one whole column, and everything.
good[1, "Rating"]
good[1, :]
good[:, "Rating"]
good[:, :]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Column selectors accepted by `names`, one form per call.
# Explicit lists: strings, Symbols, integer positions, and a Boolean mask.
# The mask has nine entries with `true` at positions 4 and 6, so it assumes
# puzzles has exactly nine columns — TODO confirm against the CSV.
names(puzzles, ["Rating", "Popularity"])
names(puzzles, [:Rating, :Popularity])
names(puzzles, [4, 6])
names(puzzles, [false, false, false, true, false, true, false, false, false])
# A regular expression as the selector.
names(puzzles, r"Rating")
# `Not` wraps another selector (positions or a regex here).
names(puzzles, Not([4, 6]))
names(puzzles, Not(r"Rating"))
# `Between` is given a first and a last column.
names(puzzles, Between("Rating", "Popularity"))
# `:` and `All()` — presumably both select every column; verify.
names(puzzles, :)
names(puzzles, All())
# `Cols` wraps several selectors at once, or a predicate function.
names(puzzles, Cols(r"Rating", "NbPlays"))
names(puzzles, Cols(startswith("P")))
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# The same predicate as in the `Cols(startswith("P"))` call, passed to `names` directly.
names(puzzles, startswith("P"))
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# A type as the selector — presumably picks columns whose element type is a
# subtype of Real; confirm in the DataFrames.jl documentation of `names`.
names(puzzles, Real)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Same type-based selection, this time with AbstractString.
names(puzzles, AbstractString)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Feed the names returned for Real back in as the column selector.
puzzles[:, names(puzzles, Real)]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for row subsetting
2022-02-08 20:58:33 +01:00
2022-03-05 16:44:32 +01:00
# Small four-row frame used to illustrate row selectors.
df_small = DataFrame(id=1:4)
# Rows 1 and 3, chosen by position and then by Boolean mask.
df_small[[1, 3], :]
df_small[[true, false, true, false], :]
# The same two rows expressed by excluding rows 2 and 4 with `Not`,
# again by position and then by Boolean mask.
df_small[Not([2, 4]), :]
df_small[Not([false, true, false, true]), :]
2022-02-13 11:59:23 +01:00
# The same two columns taken with `:` and with `!` as the row selector.
# Per the DataFrames.jl indexing rules `:` copies the selected columns while
# `!` reuses the stored ones; the `===` checks and timings further down probe
# exactly that difference.
df1 = puzzles[:, ["Rating", "Popularity"]];
df2 = puzzles[!, ["Rating", "Popularity"]];
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Value equality (`==`): the two selections against each other,
# then each of them against the full table.
df1 == df2
df1 == puzzles
df2 == puzzles
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Identity (`===`): is each selected column the very same object as the
# corresponding column of puzzles, rather than merely equal to it?
df1.Rating === puzzles.Rating
df1.Popularity === puzzles.Popularity
df2.Rating === puzzles.Rating
df2.Popularity === puzzles.Popularity
2022-02-08 20:58:33 +01:00
2022-03-26 07:10:18 +01:00
# Time the `:` and `!` selections. `$puzzles` interpolates the global into the
# benchmarked expression; the trailing `;` hides the returned value.
using BenchmarkTools
@btime $puzzles[:, ["Rating", "Popularity"]];
@btime $puzzles[!, ["Rating", "Popularity"]];
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# The four combinations of a scalar versus a one-element vector as the
# row selector and as the column selector.
puzzles[1, 1]
puzzles[[1], 1]
puzzles[1, [1]]
puzzles[[1], [1]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for making views
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# View counterpart of puzzles[1, 1]: scalar row, scalar column.
@view puzzles[1, 1]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# View with a vector row selector and a scalar column.
@view puzzles[[1], 1]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# View with a scalar row and a vector column selector.
@view puzzles[1, [1]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# View with vector selectors for both rows and columns.
@view puzzles[[1], [1]]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Time the plain indexing expression against the `@view` of the same selection.
# Both globals are `$`-interpolated into the benchmarked expression.
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Ask the view which indices of its parent table it refers to.
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Code for section 9.2
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Per-column summary of `good`.
describe(good)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Index from each distinct rating to the row numbers of `good` that carry it,
# in order of appearance.
rating_mapping = Dict{Int, Vector{Int}}()
for (i, rating) in enumerate(good.Rating)
    # `get!` returns the vector stored under `rating`, first inserting a fresh
    # empty one if the key is absent, so a single lookup covers both the
    # "seen before" and the "first occurrence" case.
    push!(get!(() -> Int[], rating_mapping, rating), i)
end
2022-02-13 11:59:23 +01:00
# Evaluate the dictionary on its own so the REPL displays it.
rating_mapping
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Rows of `good` recorded in the index under rating 2108.
good[rating_mapping[2108], :]
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Sanity check: the distinct Rating values among those rows (expected to be just 2108).
unique(good[rating_mapping[2108], :].Rating)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Statistics was already loaded earlier in this script; repeating `using` is harmless.
using Statistics
# Mean Popularity of the rows indexed under rating 2108.
mean(good[rating_mapping[2108], "Popularity"])
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Distinct rating values present in `good`.
ratings = unique(good.Rating)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# For every distinct rating, look up its row numbers in `rating_mapping`,
# pull the matching Popularity values out of `good`, and average them.
# The result is aligned element-by-element with `ratings`.
mean_popularities = [
    mean(good[rating_mapping[r], "Popularity"]) for r in ratings
]
2022-02-13 11:59:23 +01:00
# Scatter plot of mean popularity against rating, with axis labels and no legend.
scatter(ratings, mean_popularities;
xlabel="rating", ylabel="mean popularity", legend=false)
2022-02-08 20:58:33 +01:00
2022-03-05 16:44:32 +01:00
# Fit a Loess model with the distinct ratings as x and their mean popularity as y.
# The trailing `;` hides the returned model in the REPL.
using Loess
model = loess(ratings, mean_popularities);
2022-02-13 11:59:23 +01:00
# Prediction grid: the distinct ratings in ascending order, converted to floating point.
ratings_predict = map(float, sort(ratings))
2022-03-05 16:44:32 +01:00
# Evaluate the fitted model at every point of the sorted grid.
popularity_predict = predict(model, ratings_predict)
2022-02-08 20:58:33 +01:00
2022-02-13 11:59:23 +01:00
# Add the fitted curve to the existing plot (`plot!` mutates it) as a black line of width 5.
plot!(ratings_predict, popularity_predict; width=5, color="black")
2022-03-24 20:09:55 +01:00
# Group `good` by Rating and reduce the Popularity column of every group with `mean`.
let by_rating = groupby(good, :Rating)
    combine(by_rating, :Popularity => mean)
end