update up to chapter 9

This commit is contained in:
Bogumił Kamiński
2022-02-13 11:59:23 +01:00
parent e1d5277f8c
commit ab6b8f18f3
4 changed files with 637 additions and 618 deletions

326
ch09.jl
View File

@@ -1,279 +1,153 @@
# Bogumił Kamiński, 2022
# Code for chapter 7
# Code for chapter 9
# Code for section 7.1
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89];
data = (set1=(x=aq[:, 1], y=aq[:, 2]),
set2=(x=aq[:, 3], y=aq[:, 4]),
set3=(x=aq[:, 5], y=aq[:, 6]),
set4=(x=aq[:, 7], y=aq[:, 8]));
# Code for section 9.1
using DataFrames
using CSV
using Plots
puzzles = CSV.read("puzzles.csv", DataFrame);
# Code for listing 7.1
using Statistics
plays_lo = median(puzzles.NbPlays)
puzzles.NbPlays .> plays_lo
# Build a data frame from the matrix `aq`, passing the eight column names as strings.
aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
puzzles.NbPlays > plays_lo
# Code for creating a DataFrame with automatic column names
rating_lo = 1500
rating_hi = quantile(puzzles.Rating, 0.99)
rating_lo .< puzzles.Rating .< rating_hi
DataFrame(aq, :auto)
row_selector = (puzzles.NbPlays .> plays_lo) .&&
(rating_lo .< puzzles.Rating .< rating_hi)
# Code for creating a DataFrame from a vector of vectors
sum(row_selector)
count(row_selector)
aq_vec = collect(eachcol(aq))
DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
DataFrame(aq_vec, :auto)
# Code for listing 9.1
# Code for section 7.1.2
good = puzzles[row_selector, ["Rating", "Popularity"]]
data.set1.x
# Code for plotting histograms
DataFrame(x1=data.set1.x, y1=data.set1.y,
x2=data.set2.x, y2=data.set2.y,
x3=data.set3.x, y3=data.set3.y,
x4=data.set4.x, y4=data.set4.y)
plot(histogram(good.Rating; label="Rating"),
histogram(good.Popularity; label="Popularity"))
DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
:x2 => data.set2.x, :y2 => data.set2.y,
:x3 => data.set3.x, :y3 => data.set3.y,
:x4 => data.set4.x, :y4 => data.set4.y)
# Code for column selectors
DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
:x2 => data.set2.x, :y2 => data.set2.y,
:x3 => data.set3.x, :y3 => data.set3.y,
:x4 => data.set4.x, :y4 => data.set4.y]);
puzzles[1, "Rating"]
[(i, v) for i in 1:4 for v in [:x, :y]]
puzzles[:, "Rating"]
[string(v, i) for i in 1:4 for v in [:x, :y]]
row1 = puzzles[1, ["Rating", "Popularity"]]
[string(v, i) => getproperty(data[i], v)
for i in 1:4 for v in [:x, :y]]
row1["Rating"]
row1[:Rating]
row1[1]
row1.Rating
row1."Rating"
DataFrame([string(v, i) => getproperty(data[i], v)
for i in 1:4 for v in [:x, :y]]);
good = puzzles[row_selector, ["Rating", "Popularity"]]
data_dict = Dict([string(v, i) => getproperty(data[i], v)
for i in 1:4 for v in [:x, :y]])
collect(data_dict)
good[1, "Rating"]
good[1, :]
good[:, "Rating"]
good[:, :]
DataFrame(data_dict)
names(puzzles, ["Rating", "Popularity"])
names(puzzles, [:Rating, :Popularity])
names(puzzles, [4, 6])
names(puzzles, [false, false, false, true, false, true, false, false, false])
names(puzzles, r"Rating")
names(puzzles, Not([4, 6]))
names(puzzles, Not(r"Rating"))
names(puzzles, Between("Rating", "Popularity"))
names(puzzles, :)
names(puzzles, All())
names(puzzles, Cols(r"Rating", "NbPlays"))
names(puzzles, Cols(startswith("P")))
df1 = DataFrame(x1=data.set1.x)
df1.x1 === data.set1.x
names(puzzles, startswith("P"))
df2 = DataFrame(x1=data.set1.x; copycols=false)
df2.x1 === data.set1.x
names(puzzles, Real)
df = DataFrame(x=1:3, y=1)
df.x
names(puzzles, AbstractString)
DataFrame(x=[1], y=[1, 2, 3])
puzzles[:, names(puzzles, Real)]
# Code for section 7.1.3
# Code for row subsetting
data.set1
DataFrame(data.set1)
df1 = puzzles[:, ["Rating", "Popularity"]];
df2 = puzzles[!, ["Rating", "Popularity"]];
DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
df1 == df2
df1 == puzzles
df2 == puzzles
data
df1.Rating === puzzles.Rating
df1.Popularity === puzzles.Popularity
df2.Rating === puzzles.Rating
df2.Popularity === puzzles.Popularity
# Code for listing 7.2
@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
aq2 = DataFrame(data)
puzzles[1, 1]
puzzles[[1], 1]
puzzles[1, [1]]
puzzles[[1], [1]]
# Code for listing 7.3
# Code for making views
data_dfs = map(DataFrame, data)
@view puzzles[1, 1]
# Code for vertical concatenation examples
@view puzzles[[1], 1]
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
@view puzzles[1, [1]]
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
source="source_id")
@view puzzles[[1], [1]]
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
source="source_id"=>string.("set", 1:4))
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
reduce(vcat, collect(data_dfs);
source="source_id"=>string.("set", 1:4))
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
# Code for listing 7.4
# Code for section 9.2
df1 = DataFrame(a=1:3, b=11:13)
df2 = DataFrame(a=4:6, c=24:26)
vcat(df1, df2)
vcat(df1, df2; cols=:union)
describe(good)
# Code for listing 7.5
df_agg = DataFrame()
append!(df_agg, data_dfs.set1)
append!(df_agg, data_dfs.set2)
# Code for appending tables to a data frame
df_agg = DataFrame()
append!(df_agg, data.set1)
append!(df_agg, data.set2)
# Code for promote keyword argument
df1 = DataFrame(a=1:3, b=11:13)
df2 = DataFrame(a=4:6, b=[14, missing, 16])
append!(df1, df2)
append!(df1, df2; promote=true)
# Code for section 7.2.3
df = DataFrame()
push!(df, (a=1, b=2))
push!(df, (a=3, b=4))
df = DataFrame(a=Int[], b=Int[])
push!(df, [1, 2])
push!(df, [3, 4])
"""
    sim_step(current)

Return a named tuple `(x=..., y=...)` obtained by moving `current` by one
randomly drawn unit step.

`current` must expose `x` and `y` properties. The step is drawn with `rand`
from the four tuples `(1,0)`, `(-1,0)`, `(0,1)` and `(0,-1)`.
"""
function sim_step(current)
    moves = ((1,0), (-1,0), (0,1), (0,-1))
    shift = rand(moves)
    return (x=current.x + shift[1], y=current.y + shift[2])
end
using BenchmarkTools
# Time a single random draw from the four unit-step tuples.
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
# Tuple destructuring: dx is bound to 10 and dy to 20.
dx, dy = (10, 20)
dx
dy
using FreqTables
using Random
Random.seed!(1234);
# Tabulate 10^7 draws from the four unit-step tuples; `proptable` comes from
# FreqTables and presumably reports the share of each outcome (confirm in its docs).
proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
using Random
Random.seed!(6);
# Start with the single row (x=0, y=0) and append ten rows, each computed by
# `sim_step` from the current last row.
walk = DataFrame(x=0, y=0)
for _ in 1:10
current = walk[end, :]
push!(walk, sim_step(current))
end
walk
# Plot y against x; the annotations 1:11 label the initial row plus the ten
# appended rows, and the ticks span the extrema of each coordinate.
plot(walk.x, walk.y;
legend=false,
series_annotations=1:11,
xticks=range(extrema(walk.x)...),
yticks=range(extrema(walk.y)...))
extrema(walk.y)
range(1, 5)
(3/4)^9
# Code for listing 7.6
function walk_unique() #A
walk = DataFrame(x=0, y=0)
for _ in 1:10
current = walk[end, :]
push!(walk, sim_step(current))
rating_mapping = Dict{Int, Vector{Int}}()
for (i, rating) in enumerate(good.Rating)
if haskey(rating_mapping, rating)
push!(rating_mapping[rating], i)
else
rating_mapping[rating] = [i]
end
return nrow(unique(walk)) == nrow(walk) #B
end
Random.seed!(2);
proptable([walk_unique() for _ in 1:10^5])
rating_mapping
# Code for a note on conversion
good[rating_mapping[2108], :]
x = [1.5]
x[1] = 1
x
unique(good[rating_mapping[2108], :].Rating)
# Code from section 7.3.1
using Statistics
mean(good[rating_mapping[2108], "Popularity"])
Matrix(walk)
Matrix{Any}(walk)
Matrix{String}(walk)
ratings = unique(good.Rating)
plot(walk)
plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
# Code from section 7.3.2
Tables.columntable(walk)
using BenchmarkTools
"""
    mysum(table)

Add up the elements of `table.x` and return the total.

The running total starts from the integer `0`, which is therefore also the
result when `table.x` has no elements.
"""
function mysum(table)
    total = 0
    # The column is looked up and iterated inside this same function body.
    for item in table.x
        total += item
    end
    return total
end
df = DataFrame(x=1:1_000_000);
@btime mysum($df)
tab = Tables.columntable(df);
@btime mysum($tab)
@code_warntype mysum(df)
@code_warntype mysum(tab)
typeof(tab)
"""
    barrier_mysum2(x)

Return the left-to-right sum of the elements of `x`, starting from the integer `0`.
"""
barrier_mysum2(x) = foldl(+, x; init=0)

"""
    mysum2(table)

Sum column `x` of `table` by extracting `table.x` and passing it to
`barrier_mysum2`.
"""
function mysum2(table)
    column = table.x
    return barrier_mysum2(column)
end
@btime mysum2($df)
df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
unique(df)
tab = Tables.columntable(df)
unique(tab)
# Code from section 7.3.3
Tables.rowtable(walk)
nti = Tables.namedtupleiterator(walk)
for v in nti
println(v)
mean_popularities = map(ratings) do rating
indices = rating_mapping[rating]
popularities = good[indices, "Popularity"]
return mean(popularities)
end
er = eachrow(walk)
er[1]
er[end]
ec = eachcol(walk)
ec[1]
ec[end]
scatter(ratings, mean_popularities;
xlabel="rating", ylabel="mean popularity", legend=false)
identity.(eachcol(walk))
import Loess
model = Loess.loess(ratings, mean_popularities);
ratings_predict = float.(sort(ratings))
popularity_predict = Loess.predict(model, ratings_predict)
df = DataFrame(x=1:2, b=["a", "b"])
identity.(eachcol(df))
plot!(ratings_predict, popularity_predict; width=5, color="black")