update up to chapter 9

2022-02-13 11:59:23 +01:00
parent e1d5277f8c
commit ab6b8f18f3
4 changed files with 637 additions and 618 deletions
--- a/ch09.jl
+++ b/ch09.jl
@@ -1,279 +1,153 @@
 # Bogumił Kamiński, 2022

-# Codes for chapter 7
+# Codes for chapter 9

-# Code for section 7.1
-
-aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
-       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
-      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
-       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
-      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
-      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
-       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
-       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
-      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
-       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
-       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
-
-data = (set1=(x=aq[:, 1], y=aq[:, 2]),
-        set2=(x=aq[:, 3], y=aq[:, 4]),
-        set3=(x=aq[:, 5], y=aq[:, 6]),
-        set4=(x=aq[:, 7], y=aq[:, 8]));
+# Code for section 9.1

 using DataFrames
+using CSV
+using Plots
+puzzles = CSV.read("puzzles.csv", DataFrame);

-# Code for listing 7.1
+using Statistics
+plays_lo = median(puzzles.NbPlays)
+puzzles.NbPlays .> plays_lo

-aq1 = ataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
-DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
+puzzles.NbPlays > plays_lo # no dot: vector > scalar has no method and throws MethodError (presumably shown on purpose; cf. `.>` above)

-# Code for creating DataFrame with automatic column names
+rating_lo = 1500
+rating_hi = quantile(puzzles.Rating, 0.99)
+rating_lo .< puzzles.Rating .< rating_hi

-DataFrame(aq, :auto)
+row_selector = (puzzles.NbPlays .> plays_lo) .&&
+               (rating_lo .< puzzles.Rating .< rating_hi)

-# Codes for creating DataFrame from vector of vectors
+sum(row_selector) # adds up the Boolean mask, so each `true` contributes 1
+count(row_selector) # counts the `true` entries directly; same number as `sum` above

-aq_vec = collect(eachcol(aq))
-DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
-DataFrame(aq_vec, :auto)
+# Code for listing 9.1

-# Codes for section 7.1.2
+good = puzzles[row_selector, ["Rating", "Popularity"]]

-data.set1.x
+# Code for plotting histograms

-DataFrame(x1=data.set1.x, y1=data.set1.y,
-          x2=data.set2.x, y2=data.set2.y,
-          x3=data.set3.x, y3=data.set3.y,
-          x4=data.set4.x, y4=data.set4.y)
+plot(histogram(good.Rating; label="Rating"),
+     histogram(good.Popularity; label="Popularity"))

-DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
-          :x2 => data.set2.x, :y2 => data.set2.y,
-          :x3 => data.set3.x, :y3 => data.set3.y,
-          :x4 => data.set4.x, :y4 => data.set4.y)
+# Code for column selectors

-DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
-           :x2 => data.set2.x, :y2 => data.set2.y,
-           :x3 => data.set3.x, :y3 => data.set3.y,
-           :x4 => data.set4.x, :y4 => data.set4.y]);
+puzzles[1, "Rating"]

-[(i, v) for i in 1:4 for v in [:x, :y]]
+puzzles[:, "Rating"]

-[string(v, i) for i in 1:4 for v in [:x, :y]]
+row1 = puzzles[1, ["Rating", "Popularity"]]

-[string(v, i) => getproperty(data[i], v)
-        for i in 1:4 for v in [:x, :y]]
+row1["Rating"]
+row1[:Rating]
+row1[1]
+row1.Rating
+row1."Rating"

-DataFrame([string(v, i) => getproperty(data[i], v)
-           for i in 1:4 for v in [:x, :y]]);
+good = puzzles[row_selector, ["Rating", "Popularity"]]

-data_dict = Dict([string(v, i) => getproperty(data[i], v)
-                         for i in 1:4 for v in [:x, :y]])
-collect(data_dict)
+good[1, "Rating"]
+good[1, :]
+good[:, "Rating"]
+good[:, :]

-DataFrame(data_dict)
+names(puzzles, ["Rating", "Popularity"])
+names(puzzles, [:Rating, :Popularity])
+names(puzzles, [4, 6])
+names(puzzles, [false, false, false, true, false, true, false, false, false]) # Boolean mask with `true` at positions 4 and 6; assumes `puzzles` has exactly 9 columns
+names(puzzles, r"Rating")
+names(puzzles, Not([4, 6]))
+names(puzzles, Not(r"Rating"))
+names(puzzles, Between("Rating", "Popularity"))
+names(puzzles, :)
+names(puzzles, All())
+names(puzzles, Cols(r"Rating", "NbPlays"))
+names(puzzles, Cols(startswith("P")))

-df1 = DataFrame(x1=data.set1.x)
-df1.x1 === data.set1.x
+names(puzzles, startswith("P"))

-df2 = DataFrame(x1=data.set1.x; copycols=false)
-df2.x1 === data.set1.x
+names(puzzles, Real)

-df = DataFrame(x=1:3, y=1)
-df.x
+names(puzzles, AbstractString)

-DataFrame(x=[1], y=[1, 2, 3])
+puzzles[:, names(puzzles, Real)]

-# Codes for section 7.1.3
+# Code for row subsetting

-data.set1
-DataFrame(data.set1)
+df1 = puzzles[:, ["Rating", "Popularity"]]; # `:` as row selector: the selected columns are copied
+df2 = puzzles[!, ["Rating", "Popularity"]]; # `!` as row selector: the columns are reused without copying

-DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
+df1 == df2
+df1 == puzzles
+df2 == puzzles

-data
+df1.Rating === puzzles.Rating
+df1.Popularity === puzzles.Popularity
+df2.Rating === puzzles.Rating
+df2.Popularity === puzzles.Popularity

-# Code for listing 7.2
+using BenchmarkTools; @benchmark $puzzles[:, ["Rating", "Popularity"]] # BenchmarkTools defines @benchmark and @btime; load it before first use
+@benchmark $puzzles[!, ["Rating", "Popularity"]]

-aq2 = DataFrame(data)
+puzzles[1, 1] # single row, single column: the value stored in that cell
+puzzles[[1], 1] # vector of rows, single column: a 1-element vector
+puzzles[1, [1]] # single row, vector of columns: a DataFrameRow
+puzzles[[1], [1]] # vector of rows, vector of columns: a DataFrame

-# Codes for listing 7.3
+# Code for making views

-data_dfs = map(DataFrame, data)
+@view puzzles[1, 1] # 0-dimensional view of a single cell

-# Codes for vertical concatenation examples
+@view puzzles[[1], 1] # view of the selected rows of one column (a vector view)

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
+@view puzzles[1, [1]] # a DataFrameRow (a view of one row of the data frame)

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
-     source="source_id")
+@view puzzles[[1], [1]] # a SubDataFrame (a view of the selected rows and columns)

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
-     source="source_id"=>string.("set", 1:4))
+@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
+@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];

-reduce(vcat, collect(data_dfs);
-       source="source_id"=>string.("set", 1:4))
+parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])

-# Code for listing 7.4
+# Code for section 9.2

-df1 = DataFrame(a=1:3, b=11:13)
-df2 = DataFrame(a=4:6, c=24:26)
-vcat(df1, df2)
-vcat(df1, df2; cols=:union)
+describe(good)

-# Code for listing 7.5
-
-df_agg = DataFrame()
-append!(df_agg, data_dfs.set1)
-append!(df_agg, data_dfs.set2)
-
-# Code for appending tables to a data frame
-
-df_agg = DataFrame()
-append!(df_agg, data.set1)
-append!(df_agg, data.set2)
-
-# Code for promote keyword argument
-
-df1 = DataFrame(a=1:3, b=11:13)
-df2 = DataFrame(a=4:6, b=[14, missing, 16])
-append!(df1, df2)
-append!(df1, df2; promote=true)
-
-# Code for section 7.2.3
-
-df = DataFrame()
-push!(df, (a=1, b=2))
-push!(df, (a=3, b=4))
-
-df = DataFrame(a=Int[], b=Int[])
-push!(df, [1, 2])
-push!(df, [3, 4])
-
-function sim_step(current)
-    dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
-    return (x=current.x + dx, y=current.y + dy)
-end
-
-using BenchmarkTools
-@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
-
-dx, dy = (10, 20)
-dx
-dy
-
-using FreqTables
-using Random
-Random.seed!(1234);
-proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
-
-using Random
-Random.seed!(6);
-walk = DataFrame(x=0, y=0)
-for _ in 1:10
-    current = walk[end, :]
-    push!(walk, sim_step(current))
-end
-walk
-
-plot(walk.x, walk.y;
-     legend=false,
-     series_annotations=1:11,
-     xticks=range(extrema(walk.x)...),
-     yticks=range(extrema(walk.y)...))
-
-extrema(walk.y)
-
-range(1, 5)
-
-(3/4)^9
-
-# Code for listing 7.6
-
-function walk_unique() #A
-    walk = DataFrame(x=0, y=0)
-    for _ in 1:10
-        current = walk[end, :]
-        push!(walk, sim_step(current))
+rating_mapping = Dict{Int, Vector{Int}}() # maps a rating to the row numbers of `good` that have this rating
+for (i, rating) in enumerate(good.Rating)
+    if haskey(rating_mapping, rating)
+        push!(rating_mapping[rating], i)
+    else
+        rating_mapping[rating] = [i]
    end
-    return nrow(unique(walk)) == nrow(walk) #B
 end
-Random.seed!(2);
-proptable([walk_unique() for _ in 1:10^5])
+rating_mapping

-# Code for a note on conversion
+good[rating_mapping[2108], :]

-x = [1.5]
-x[1] = 1
-x
+unique(good[rating_mapping[2108], :].Rating)

-# Code from section 7.3.1
+using Statistics
+mean(good[rating_mapping[2108], "Popularity"])

-Matrix(walk)
-Matrix{Any}(walk)
-Matrix{String}(walk)
+ratings = unique(good.Rating)

-plot(walk)
-
-plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
-
-# Code from section 7.3.2
-
-Tables.columntable(walk)
-
-using BenchmarkTools
-function mysum(table)
-           s = 0 #A
-           for v in table.x #B
-               s += v
-           end
-           return s
-       end
-df = DataFrame(x=1:1_000_000);
-@btime mysum($df)
-
-tab = Tables.columntable(df);
-@btime mysum($tab)
-
-@code_warntype mysum(df)
-
-@code_warntype mysum(tab)
-
-typeof(tab)
-
-function barrier_mysum2(x)
-    s = 0
-    for v in x
-        s += v
-    end
-    return s
-end
-mysum2(table) = barrier_mysum2(table.x)
-@btime mysum2($df)
-
-df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
-unique(df)
-
-tab = Tables.columntable(df)
-unique(tab)
-
-# Code from section 7.3.3
-
-Tables.rowtable(walk)
-
-nti = Tables.namedtupleiterator(walk)
-for v in nti
-    println(v)
+mean_popularities = map(ratings) do rating
+    indices = rating_mapping[rating]
+    popularities = good[indices, "Popularity"]
+    return mean(popularities)
 end

-er = eachrow(walk)
-er[1]
-er[end]
-ec = eachcol(walk)
-ec[1]
-ec[end]
+scatter(ratings, mean_popularities;
+        xlabel="rating", ylabel="mean popularity", legend=false)

-identity.(eachcol(walk))
+import Loess # `import` rather than `using`, so its functions are called with the `Loess.` prefix
+model = Loess.loess(ratings, mean_popularities); # fit a LOESS model of mean popularity against rating
+ratings_predict = float.(sort(ratings)) # sorted ratings converted to floats; presumably sorted so the curve is drawn left to right
+popularity_predict = Loess.predict(model, ratings_predict) # model predictions at the sorted ratings

-df = DataFrame(x=1:2, b=["a", "b"])
-identity.(eachcol(df))
+plot!(ratings_predict, popularity_predict; width=5, color="black")