update up to chapter 9

2022-02-13 11:59:23 +01:00 · 2022-02-13 11:59:23 +01:00 · ab6b8f18f3
commit ab6b8f18f3
parent e1d5277f8c
4 changed files with 637 additions and 618 deletions
--- a/ch08.jl
+++ b/ch08.jl
@ -1,8 +1,8 @@
 # Bogumił Kamiński, 2022

-# Codes for chapter 6
+# Codes for chapter 8

-# Code for section 6.1
+# Code for section 8.1

 if isfile("puzzles.csv.bz2")
    @info "file already present"
@ -25,22 +25,27 @@ end

 readlines("puzzles.csv")

-# Code for section 6.2
+# Code for section 8.2

 using CSV
 using DataFrames
 puzzles = CSV.read("puzzles.csv", DataFrame);

-CSV.read(plain, DataFrame);
+puzzles2 = CSV.read(plain, DataFrame;
+                    header=["PuzzleId", "FEN", "Moves",
+                            "Rating","RatingDeviation",
+                            "Popularity", "NbPlays",
+                            "Themes","GameUrl"]);
+puzzles == puzzles2

 compressed = nothing
 plain = nothing

-# Code for listing 6.1
+# Code for listing 8.1

 puzzles

-# Code for listing 6.2
+# Code for listing 8.2

 describe(puzzles)

@ -52,7 +57,13 @@ nrow(puzzles)

 names(puzzles)

-# Code for section 6.3
+CSV.write("puzzles2.csv", puzzles)
+
+read("puzzles2.csv")
+
+read("puzzles2.csv") == read("puzzles.csv")
+
+# Code for section 8.3

 puzzles.Rating

@ -101,148 +112,3 @@ plot(histogram(puzzles.Rating, label="Rating"),
 plot([histogram(puzzles[!, col]; label=col) for
      col in ["Rating", "RatingDeviation",
              "Popularity", "NbPlays"]]...)
-
-# Code for section 6.4
-
-using Statistics
-plays_lo = median(puzzles.NbPlays)
-puzzles.NbPlays .> plays_lo
-
-puzzles.NbPlays > plays_lo
-
-rating_lo = 1500
-rating_hi = quantile(puzzles.Rating, 0.99)
-rating_lo .< puzzles.Rating .< rating_hi
-
-row_selector = (puzzles.NbPlays .> plays_lo) .&&
-               (rating_lo .< puzzles.Rating .< rating_hi)
-
-sum(row_selector)
-count(row_selector)
-
-# Code for listing 6.3
-
-good = puzzles[row_selector, ["Rating", "Popularity"]]
-
-# Code for plotting histograms
-
-plot(histogram(good.Rating; label="Rating"),
-     histogram(good.Popularity; label="Popularity"))
-
-# Code for column selectors
-
-puzzles[1, "Rating"]
-
-puzzles[:, "Rating"]
-
-row1 = puzzles[1, ["Rating", "Popularity"]]
-
-row1["Rating"]
-row1[:Rating]
-row1[1]
-row1.Rating
-row1."Rating"
-
-good = puzzles[row_selector, ["Rating", "Popularity"]]
-
-good[1, "Rating"]
-good[1, :]
-good[:, "Rating"]
-good[:, :]
-
-names(puzzles, ["Rating", "Popularity"])
-names(puzzles, [:Rating, :Popularity])
-names(puzzles, [4, 6])
-names(puzzles, [false, false, false, true, false, true, false, false, false])
-names(puzzles, r"Rating")
-names(puzzles, Not([4, 6]))
-names(puzzles, Not(r"Rating"))
-names(puzzles, Between("Rating", "Popularity"))
-names(puzzles, :)
-names(puzzles, All())
-names(puzzles, Cols(r"Rating", "NbPlays"))
-names(puzzles, Cols(startswith("P")))
-
-names(puzzles, startswith("P"))
-
-names(puzzles, Real)
-
-names(puzzles, AbstractString)
-
-puzzles[:, names(puzzles, Real)]
-
-# Code for row subsetting
-
-df1 = puzzles[:, ["Rating", "Popularity"]];
-df2 = puzzles[!, ["Rating", "Popularity"]];
-
-df1 == df2
-df1 == puzzles
-df2 == puzzles
-
-df1.Rating === puzzles.Rating
-df1.Popularity === puzzles.Popularity
-df2.Rating === puzzles.Rating
-df2.Popularity === puzzles.Popularity
-
-@benchmark $puzzles[:, ["Rating", "Popularity"]]
-@benchmark $puzzles[!, ["Rating", "Popularity"]]
-
-puzzles[1, 1]
-puzzles[[1], 1]
-puzzles[1, [1]]
-puzzles[[1], [1]]
-
-# Code for making views
-
-@view puzzles[1, 1]
-
-@view puzzles[[1], 1]
-
-@view puzzles[1, [1]]
-
-@view puzzles[[1], [1]]
-
-@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
-@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
-
-parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
-
-# Code for section 6.5
-
-describe(good)
-
-rating_mapping = Dict{Int, Vector{Int}}()
-for (i, rating) in enumerate(good.Rating)
-    if haskey(rating_mapping, rating)
-        push!(rating_mapping[rating], i)
-    else
-        rating_mapping[rating] = [i]
-    end
-end
-rating_mapping
-
-good[rating_mapping[2108], :]
-
-unique(good[rating_mapping[2108], :].Rating)
-
-using Statistics
-mean(good[rating_mapping[2108], "Popularity"])
-
-ratings = unique(good.Rating)
-
-mean_popularities = map(ratings) do rating
-    indices = rating_mapping[rating]
-    popularities = good[indices, "Popularity"]
-    return mean(popularities)
-end
-
-scatter(ratings, mean_popularities;
-        xlabel="rating", ylabel="mean popularity", legend=false)
-
-import Loess
-model = Loess.loess(ratings, mean_popularities);
-ratings_predict = float.(sort(ratings))
-popularity_predict = Loess.predict(model, ratings_predict)
-
-plot!(ratings_predict, popularity_predict; width=5, color="black")
--- a/ch09.jl
+++ b/ch09.jl
@ -1,279 +1,153 @@
 # Bogumił Kamiński, 2022

-# Codes for chapter 7
+# Codes for chapter 9

-# Code for section 7.1
-
-aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
-       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
-      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
-       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
-      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
-      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
-       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
-       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
-      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
-       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
-       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
-
-data = (set1=(x=aq[:, 1], y=aq[:, 2]),
-        set2=(x=aq[:, 3], y=aq[:, 4]),
-        set3=(x=aq[:, 5], y=aq[:, 6]),
-        set4=(x=aq[:, 7], y=aq[:, 8]));
+# Code for section 9.1

 using DataFrames
+using CSV
+using Plots
+puzzles = CSV.read("puzzles.csv", DataFrame);

-# Code for listing 7.1
+using Statistics
+plays_lo = median(puzzles.NbPlays)
+puzzles.NbPlays .> plays_lo

-aq1 = ataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
-DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
+puzzles.NbPlays > plays_lo

-# Code for creating DataFrame with automatic column names
+rating_lo = 1500
+rating_hi = quantile(puzzles.Rating, 0.99)
+rating_lo .< puzzles.Rating .< rating_hi

-DataFrame(aq, :auto)
+row_selector = (puzzles.NbPlays .> plays_lo) .&&
+               (rating_lo .< puzzles.Rating .< rating_hi)

-# Codes for creating DataFrame from vector of vectors
+sum(row_selector)
+count(row_selector)

-aq_vec = collect(eachcol(aq))
-DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
-DataFrame(aq_vec, :auto)
+# Code for listing 9.1

-# Codes for section 7.1.2
+good = puzzles[row_selector, ["Rating", "Popularity"]]

-data.set1.x
+# Code for plotting histograms

-DataFrame(x1=data.set1.x, y1=data.set1.y,
-          x2=data.set2.x, y2=data.set2.y,
-          x3=data.set3.x, y3=data.set3.y,
-          x4=data.set4.x, y4=data.set4.y)
+plot(histogram(good.Rating; label="Rating"),
+     histogram(good.Popularity; label="Popularity"))

-DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
-          :x2 => data.set2.x, :y2 => data.set2.y,
-          :x3 => data.set3.x, :y3 => data.set3.y,
-          :x4 => data.set4.x, :y4 => data.set4.y)
+# Code for column selectors

-DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
-           :x2 => data.set2.x, :y2 => data.set2.y,
-           :x3 => data.set3.x, :y3 => data.set3.y,
-           :x4 => data.set4.x, :y4 => data.set4.y]);
+puzzles[1, "Rating"]

-[(i, v) for i in 1:4 for v in [:x, :y]]
+puzzles[:, "Rating"]

-[string(v, i) for i in 1:4 for v in [:x, :y]]
+row1 = puzzles[1, ["Rating", "Popularity"]]

-[string(v, i) => getproperty(data[i], v)
-        for i in 1:4 for v in [:x, :y]]
+row1["Rating"]
+row1[:Rating]
+row1[1]
+row1.Rating
+row1."Rating"

-DataFrame([string(v, i) => getproperty(data[i], v)
-           for i in 1:4 for v in [:x, :y]]);
+good = puzzles[row_selector, ["Rating", "Popularity"]]

-data_dict = Dict([string(v, i) => getproperty(data[i], v)
-                         for i in 1:4 for v in [:x, :y]])
-collect(data_dict)
+good[1, "Rating"]
+good[1, :]
+good[:, "Rating"]
+good[:, :]

-DataFrame(data_dict)
+names(puzzles, ["Rating", "Popularity"])
+names(puzzles, [:Rating, :Popularity])
+names(puzzles, [4, 6])
+names(puzzles, [false, false, false, true, false, true, false, false, false])
+names(puzzles, r"Rating")
+names(puzzles, Not([4, 6]))
+names(puzzles, Not(r"Rating"))
+names(puzzles, Between("Rating", "Popularity"))
+names(puzzles, :)
+names(puzzles, All())
+names(puzzles, Cols(r"Rating", "NbPlays"))
+names(puzzles, Cols(startswith("P")))

-df1 = DataFrame(x1=data.set1.x)
-df1.x1 === data.set1.x
+names(puzzles, startswith("P"))

-df2 = DataFrame(x1=data.set1.x; copycols=false)
-df2.x1 === data.set1.x
+names(puzzles, Real)

-df = DataFrame(x=1:3, y=1)
-df.x
+names(puzzles, AbstractString)

-DataFrame(x=[1], y=[1, 2, 3])
+puzzles[:, names(puzzles, Real)]

-# Codes for section 7.1.3
+# Code for row subsetting

-data.set1
-DataFrame(data.set1)
+df1 = puzzles[:, ["Rating", "Popularity"]];
+df2 = puzzles[!, ["Rating", "Popularity"]];

-DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
+df1 == df2
+df1 == puzzles
+df2 == puzzles

-data
+df1.Rating === puzzles.Rating
+df1.Popularity === puzzles.Popularity
+df2.Rating === puzzles.Rating
+df2.Popularity === puzzles.Popularity

-# Code for listing 7.2
+@benchmark $puzzles[:, ["Rating", "Popularity"]]
+@benchmark $puzzles[!, ["Rating", "Popularity"]]

-aq2 = DataFrame(data)
+puzzles[1, 1]
+puzzles[[1], 1]
+puzzles[1, [1]]
+puzzles[[1], [1]]

-# Codes for listing 7.3
+# Code for making views

-data_dfs = map(DataFrame, data)
+@view puzzles[1, 1]

-# Codes for vertical concatenation examples
+@view puzzles[[1], 1]

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
+@view puzzles[1, [1]]

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
-     source="source_id")
+@view puzzles[[1], [1]]

-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
-     source="source_id"=>string.("set", 1:4))
+@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
+@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];

-reduce(vcat, collect(data_dfs);
-       source="source_id"=>string.("set", 1:4))
+parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])

-# Code for listing 7.4
+# Code for section 9.2

-df1 = DataFrame(a=1:3, b=11:13)
-df2 = DataFrame(a=4:6, c=24:26)
-vcat(df1, df2)
-vcat(df1, df2; cols=:union)
+describe(good)

-# Code for listing 7.5
-
-df_agg = DataFrame()
-append!(df_agg, data_dfs.set1)
-append!(df_agg, data_dfs.set2)
-
-# Code for appending tables to a data frame
-
-df_agg = DataFrame()
-append!(df_agg, data.set1)
-append!(df_agg, data.set2)
-
-# Code for promote keyword argument
-
-df1 = DataFrame(a=1:3, b=11:13)
-df2 = DataFrame(a=4:6, b=[14, missing, 16])
-append!(df1, df2)
-append!(df1, df2; promote=true)
-
-# Code for section 7.2.3
-
-df = DataFrame()
-push!(df, (a=1, b=2))
-push!(df, (a=3, b=4))
-
-df = DataFrame(a=Int[], b=Int[])
-push!(df, [1, 2])
-push!(df, [3, 4])
-
-function sim_step(current)
-    dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
-    return (x=current.x + dx, y=current.y + dy)
-end
-
-using BenchmarkTools
-@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
-
-dx, dy = (10, 20)
-dx
-dy
-
-using FreqTables
-using Random
-Random.seed!(1234);
-proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
-
-using Random
-Random.seed!(6);
-walk = DataFrame(x=0, y=0)
-for _ in 1:10
-    current = walk[end, :]
-    push!(walk, sim_step(current))
-end
-walk
-
-plot(walk.x, walk.y;
-     legend=false,
-     series_annotations=1:11,
-     xticks=range(extrema(walk.x)...),
-     yticks=range(extrema(walk.y)...))
-
-extrema(walk.y)
-
-range(1, 5)
-
-(3/4)^9
-
-# Code for listing 7.6
-
-function walk_unique() #A
-    walk = DataFrame(x=0, y=0)
-    for _ in 1:10
-        current = walk[end, :]
-        push!(walk, sim_step(current))
+rating_mapping = Dict{Int, Vector{Int}}()
+for (i, rating) in enumerate(good.Rating)
+    if haskey(rating_mapping, rating)
+        push!(rating_mapping[rating], i)
+    else
+        rating_mapping[rating] = [i]
    end
-    return nrow(unique(walk)) == nrow(walk) #B
 end
-Random.seed!(2);
-proptable([walk_unique() for _ in 1:10^5])
+rating_mapping

-# Code for a note on conversion
+good[rating_mapping[2108], :]

-x = [1.5]
-x[1] = 1
-x
+unique(good[rating_mapping[2108], :].Rating)

-# Code from section 7.3.1
+using Statistics
+mean(good[rating_mapping[2108], "Popularity"])

-Matrix(walk)
-Matrix{Any}(walk)
-Matrix{String}(walk)
+ratings = unique(good.Rating)

-plot(walk)
-
-plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
-
-# Code from section 7.3.2
-
-Tables.columntable(walk)
-
-using BenchmarkTools
-function mysum(table)
-           s = 0 #A
-           for v in table.x #B
-               s += v
-           end
-           return s
-       end
-df = DataFrame(x=1:1_000_000);
-@btime mysum($df)
-
-tab = Tables.columntable(df);
-@btime mysum($tab)
-
-@code_warntype mysum(df)
-
-@code_warntype mysum(tab)
-
-typeof(tab)
-
-function barrier_mysum2(x)
-    s = 0
-    for v in x
-        s += v
-    end
-    return s
-end
-mysum2(table) = barrier_mysum2(table.x)
-@btime mysum2($df)
-
-df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
-unique(df)
-
-tab = Tables.columntable(df)
-unique(tab)
-
-# Code from section 7.3.3
-
-Tables.rowtable(walk)
-
-nti = Tables.namedtupleiterator(walk)
-for v in nti
-    println(v)
+mean_popularities = map(ratings) do rating
+    indices = rating_mapping[rating]
+    popularities = good[indices, "Popularity"]
+    return mean(popularities)
 end

-er = eachrow(walk)
-er[1]
-er[end]
-ec = eachcol(walk)
-ec[1]
-ec[end]
+scatter(ratings, mean_popularities;
+        xlabel="rating", ylabel="mean popularity", legend=false)

-identity.(eachcol(walk))
+import Loess
+model = Loess.loess(ratings, mean_popularities);
+ratings_predict = float.(sort(ratings))
+popularity_predict = Loess.predict(model, ratings_predict)

-df = DataFrame(x=1:2, b=["a", "b"])
-identity.(eachcol(df))
+plot!(ratings_predict, popularity_predict; width=5, color="black")
--- a/ch10.jl
+++ b/ch10.jl
@ -1,284 +1,279 @@
 # Bogumił Kamiński, 2022

-# Codes for chapter 8
+# Codes for chapter 7

-# Codes for section 8.1
+# Code for section 7.1

-# Code for listing 8.1
+aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
+       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
+      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
+       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
+      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
+      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
+       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
+       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
+      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
+       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
+       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];

-import Downloads
-using SHA
-git_zip = "git_web_ml.zip"
-if !isfile(git_zip)
-    Downloads.download("https://snap.stanford.edu/data/" *
-                       "git_web_ml.zip",
-                       git_zip)
-end
-isfile(git_zip)
-open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
-                          0xc4, 0x60, 0xdc, 0x4c,
-                          0x7b, 0xf8, 0x93, 0x57,
-                          0xb1, 0xfe, 0xc0, 0x20,
-                          0xf4, 0x5e, 0x2e, 0xce,
-                          0xba, 0xb8, 0x1d, 0x13,
-                          0x1d, 0x07, 0x3b, 0x10,
-                          0xe2, 0x8e, 0xc0, 0x31]
+data = (set1=(x=aq[:, 1], y=aq[:, 2]),
+        set2=(x=aq[:, 3], y=aq[:, 4]),
+        set3=(x=aq[:, 5], y=aq[:, 6]),
+        set4=(x=aq[:, 7], y=aq[:, 8]));

-# Code for opeining a zip archive
-
-import ZipFile
-git_archive = ZipFile.Reader(git_zip)
-
-# Code for listing 8.2
-
-function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
-    idx = only(findall(x -> x.name == filename, archive.files))
-    return CSV.read(read(archive.files[idx]), DataFrame)
-end
-
-# Code for working with zip archive
-
-git_archive.files
-
-git_archive.files[2].name
-
-findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
-findall(x -> x.name == "", git_archive.files)
-
-only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
-only(findall(x -> x.name == "", git_archive.files))
-
-# Code for listing 8.3
-
-using CSV
 using DataFrames
-edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
-classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
-close(git_archive)
-summary(edges_df)
-describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
-summary(classes_df)
-describe(classes_df, :min, :max, :mean, :nmissing, :eltype)

-# Code for updating data frame columns using broadcasting
+# Code for listing 7.1

-edges_df .+= 1
-classes_df.id .+= 1
+aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
+DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])

-# Code for examples of data frame broadcasting
+# Code for creating DataFrame with automatic column names

-df = DataFrame(a=1:3, b=[4, missing, 5])
-df .^ 2
-coalesce.(df, 0)
-df .+ [10, 11, 12]
+DataFrame(aq, :auto)

-# Code for checking the order of :id column in a data frame
+# Codes for creating DataFrame from vector of vectors

-classes_df.id == axes(classes_df, 1)
+aq_vec = collect(eachcol(aq))
+DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
+DataFrame(aq_vec, :auto)

-# Code for the difference between ! and : in broadcasting assignment
+# Codes for section 7.1.2

-df = DataFrame(a=1:3, b=1:3)
-df[!, :a] .= "x"
-df[:, :b] .= "x"
-df
+data.set1.x

-# Code for the difference between ! and : in assignment
+DataFrame(x1=data.set1.x, y1=data.set1.y,
+          x2=data.set2.x, y2=data.set2.y,
+          x3=data.set3.x, y3=data.set3.y,
+          x4=data.set4.x, y4=data.set4.y)

-df = DataFrame(a=1:3, b=1:3, c=1:3)
-df[!, :a] = ["x", "y", "z"]
-df[:, :b] = ["x", "y", "z"]
-df[:, :c] = [11, 12, 13]
-df
+DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
+          :x2 => data.set2.x, :y2 => data.set2.y,
+          :x3 => data.set3.x, :y3 => data.set3.y,
+          :x4 => data.set4.x, :y4 => data.set4.y)

-# Codes for section 8.2
+DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
+           :x2 => data.set2.x, :y2 => data.set2.y,
+           :x3 => data.set3.x, :y3 => data.set3.y,
+           :x4 => data.set4.x, :y4 => data.set4.y]);

-# Code from listing 8.4
+[(i, v) for i in 1:4 for v in [:x, :y]]

-using Graphs
-gh = SimpleGraph(nrow(classes_df))
-for (from, to) in eachrow(edges_df)
-    add_edge!(gh, from, to)
-end
-gh
-ne(gh)
-nv(gh)
+[string(v, i) for i in 1:4 for v in [:x, :y]]

-# Code for iterator destruction in iteration specification
+[string(v, i) => getproperty(data[i], v)
+        for i in 1:4 for v in [:x, :y]]

-mat = [1 2; 3 4; 5 6]
-for (x1, x2) in eachrow(mat)
-    @show x1, x2
-end
+DataFrame([string(v, i) => getproperty(data[i], v)
+           for i in 1:4 for v in [:x, :y]]);

-# Code for getting degrees of nodes in the graph
+data_dict = Dict([string(v, i) => getproperty(data[i], v)
+                         for i in 1:4 for v in [:x, :y]])
+collect(data_dict)

-degree(gh)
+DataFrame(data_dict)

-# Code for adding a column to a data frame
+df1 = DataFrame(x1=data.set1.x)
+df1.x1 === data.set1.x

-classes_df.deg = degree(gh)
+df2 = DataFrame(x1=data.set1.x; copycols=false)
+df2.x1 === data.set1.x

-# Code for the difference between ! and : when adding a column
+df = DataFrame(x=1:3, y=1)
+df.x
+
+DataFrame(x=[1], y=[1, 2, 3])
+
+# Codes for section 7.1.3
+
+data.set1
+DataFrame(data.set1)
+
+DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
+
+data
+
+# Code for listing 7.2
+
+aq2 = DataFrame(data)
+
+# Codes for listing 7.3
+
+data_dfs = map(DataFrame, data)
+
+# Codes for vertical concatenation examples
+
+vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
+
+vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
+     source="source_id")
+
+vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
+     source="source_id"=>string.("set", 1:4))
+
+reduce(vcat, collect(data_dfs);
+       source="source_id"=>string.("set", 1:4))
+
+# Code for listing 7.4
+
+df1 = DataFrame(a=1:3, b=11:13)
+df2 = DataFrame(a=4:6, c=24:26)
+vcat(df1, df2)
+vcat(df1, df2; cols=:union)
+
+# Code for listing 7.5
+
+df_agg = DataFrame()
+append!(df_agg, data_dfs.set1)
+append!(df_agg, data_dfs.set2)
+
+# Code for appending tables to a data frame
+
+df_agg = DataFrame()
+append!(df_agg, data.set1)
+append!(df_agg, data.set2)
+
+# Code for promote keyword argument
+
+df1 = DataFrame(a=1:3, b=11:13)
+df2 = DataFrame(a=4:6, b=[14, missing, 16])
+append!(df1, df2)
+append!(df1, df2; promote=true)
+
+# Code for section 7.2.3

 df = DataFrame()
-x = [1, 2, 3]
-df[!, :x1] = x
-df[:, :x2] = x
-df
-df.x1 === x
-df.x2 === x
-df.x2 == x
+push!(df, (a=1, b=2))
+push!(df, (a=3, b=4))

-# Code for creating a column using broadcasting
+df = DataFrame(a=Int[], b=Int[])
+push!(df, [1, 2])
+push!(df, [3, 4])

-df.x3 .= 1
-df
-
-# Code for edge iterator of a graph
-
-edges(gh)
-
-e1 = first(edges(gh))
-dump(e1)
-e1.src
-e1.dst
-
-# Code for listing 8.5
-
-function deg_class(gh, class)
-    deg_ml = zeros(Int, length(class))
-    deg_web = zeros(Int, length(class))
-    for edge in edges(gh)
-        a, b = edge.src, edge.dst
-        if class[b] == 1
-            deg_ml[a] += 1
-        else
-            deg_web[a] += 1
-        end
-        if class[a] == 1
-            deg_ml[b] += 1
-        else
-            deg_web[b] += 1
-        end
-    end
-    return (deg_ml, deg_web)
+function sim_step(current)
+    dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
+    return (x=current.x + dx, y=current.y + dy)
 end

-# Code for computing machine learning and web neighbors for gh graph
+using BenchmarkTools
+@btime rand(((1,0), (-1,0), (0,1), (0,-1)));

-classes_df.deg_ml, classes_df.deg_web =
-deg_class(gh, classes_df.ml_target)
-
-# Code for checking type stability of deg_class function
-
-@time deg_class(gh, classes_df.ml_target);
-@code_warntype deg_class(gh, classes_df.ml_target)
-
-# Code for checking the classes_df summary statistics
-
-describe(classes_df, :min, :max, :mean, :std)
-
-# Code for average degree of node in the graph
-
-2 * ne(gh) / nv(gh)
-
-# Code for checking correctness of computations
-
-classes_df.deg_ml + classes_df.deg_web == classes_df.deg
-
-# Code for showing that DataFrames.jl checks consistency of stored objects
-
-df = DataFrame(a=1, b=11)
-push!(df.a, 2)
-df
-
-# Codes for section 8.3
-
-# Code for computing groupwise means of columns
-
-using Statistics
-for type in [0, 1], col in ["deg_ml", "deg_web"]
-    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
-end
-
-gdf = groupby(classes_df, :ml_target)
-combine(gdf,
-        :deg_ml => mean => :mean_deg_ml,
-        :deg_web => mean => :mean_deg_web)
-
-using DataFramesMeta
-@combine(gdf,
-         :mean_deg_ml = mean(:deg_ml),
-         :mean_deg_web = mean(:deg_web))
-
-# Code for simple plotting of relationship between developer degree and type
-
-using Plots
-scatter(classes_df.deg_ml, classes_df.deg_web;
-        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
-        xlabel="degree ml", ylabel="degree web", labels=false)
-
-# Code for aggregation of degree data
-
-agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
-                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
-
-# Code for comparison how Julia parses expressions
-
-:ml_target => (x -> 1 - mean(x)) => :web_mean
-:ml_target => x -> 1 - mean(x) => :web_mean
-
-# Code for aggregation using DataFramesMeta.jl
-
-@combine(groupby(classes_df, [:deg_ml, :deg_web]),
-         :web_mean = 1 - mean(:ml_target))
-
-# Code for getting summary information about the aggregated data frame
-
-describe(agg_df)
-
-# Code for log1p function
-
-log1p(0)
-
-# Code for listing 8.6
-
-function gen_ticks(maxv)
-    max2 = round(Int, log2(maxv))
-    tick = [0; 2 .^ (0:max2)]
-    return (log1p.(tick), tick)
-end
-
-log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
+dx, dy = (10, 20)
+dx
+dy

+using FreqTables
 using Random
 Random.seed!(1234);
-scatter(log1pjitter.(agg_df.deg_ml),
-        log1pjitter.(agg_df.deg_web);
-        zcolor=agg_df.web_mean,
-        xlabel="degree ml", ylabel="degree web",
-        markersize=2, markerstrokewidth=0, markeralpha=0.8,
-        legend=:topleft, labels = "fraction web",
-        xticks=gen_ticks(maximum(classes_df.deg_ml)),
-        yticks=gen_ticks(maximum(classes_df.deg_web)))
+proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])

-# Code for fitting logistic regression model
+using Random
+Random.seed!(6);
+walk = DataFrame(x=0, y=0)
+for _ in 1:10
+    current = walk[end, :]
+    push!(walk, sim_step(current))
+end
+walk

-using GLM
-glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
+plot(walk.x, walk.y;
+     legend=false,
+     series_annotations=1:11,
+     xticks=range(extrema(walk.x)...),
+     yticks=range(extrema(walk.y)...))

-# Code for inspecting @formula result
+extrema(walk.y)

-@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
+range(1, 5)

-# Code for inserting columns to a data frame
+(3/4)^9

-df = DataFrame(x=1:3)
-insertcols!(df, :y => 4:6)
-insertcols!(df, :y => 4:6)
-insertcols!(df, :z => 1)
+# Code for listing 7.6

-insertcols!(df, 1, :a => 0)
-insertcols!(df, :x, :pre_x => 2)
-insertcols!(df, :x, :post_x => 3, after=true)
+function walk_unique() #A
+    walk = DataFrame(x=0, y=0)
+    for _ in 1:10
+        current = walk[end, :]
+        push!(walk, sim_step(current))
+    end
+    return nrow(unique(walk)) == nrow(walk) #B
+end
+Random.seed!(2);
+proptable([walk_unique() for _ in 1:10^5])
+
+# Code for a note on conversion
+
+x = [1.5]
+x[1] = 1
+x
+
+# Code from section 7.3.1
+
+Matrix(walk)
+Matrix{Any}(walk)
+Matrix{String}(walk)
+
+plot(walk)
+
+plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
+
+# Code from section 7.3.2
+
+Tables.columntable(walk)
+
+using BenchmarkTools
+function mysum(table)
+           s = 0 #A
+           for v in table.x #B
+               s += v
+           end
+           return s
+       end
+df = DataFrame(x=1:1_000_000);
+@btime mysum($df)
+
+tab = Tables.columntable(df);
+@btime mysum($tab)
+
+@code_warntype mysum(df)
+
+@code_warntype mysum(tab)
+
+typeof(tab)
+
+function barrier_mysum2(x)
+    s = 0
+    for v in x
+        s += v
+    end
+    return s
+end
+mysum2(table) = barrier_mysum2(table.x)
+@btime mysum2($df)
+
+df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
+unique(df)
+
+tab = Tables.columntable(df)
+unique(tab)
+
+# Code from section 7.3.3
+
+Tables.rowtable(walk)
+
+nti = Tables.namedtupleiterator(walk)
+for v in nti
+    println(v)
+end
+
+er = eachrow(walk)
+er[1]
+er[end]
+ec = eachcol(walk)
+ec[1]
+ec[end]
+
+identity.(eachcol(walk))
+
+df = DataFrame(x=1:2, b=["a", "b"])
+identity.(eachcol(df))
--- a/ch12.jl
+++ b/ch12.jl
@ -0,0 +1,284 @@
+# Bogumił Kamiński, 2022
+
+# Codes for chapter 8
+
+# Codes for section 8.1
+
+# Code for listing 8.1
+
+import Downloads
+using SHA
+git_zip = "git_web_ml.zip"
+if !isfile(git_zip)
+    Downloads.download("https://snap.stanford.edu/data/" *
+                       "git_web_ml.zip",
+                       git_zip)
+end
+isfile(git_zip)
+open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
+                          0xc4, 0x60, 0xdc, 0x4c,
+                          0x7b, 0xf8, 0x93, 0x57,
+                          0xb1, 0xfe, 0xc0, 0x20,
+                          0xf4, 0x5e, 0x2e, 0xce,
+                          0xba, 0xb8, 0x1d, 0x13,
+                          0x1d, 0x07, 0x3b, 0x10,
+                          0xe2, 0x8e, 0xc0, 0x31]
+
+# Code for opening a zip archive
+
+import ZipFile
+git_archive = ZipFile.Reader(git_zip)
+
+# Code for listing 8.2
+
+function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
+    idx = only(findall(x -> x.name == filename, archive.files))
+    return CSV.read(read(archive.files[idx]), DataFrame)
+end
+
+# Code for working with zip archive
+
+git_archive.files
+
+git_archive.files[2].name
+
+findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
+findall(x -> x.name == "", git_archive.files)
+
+only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
+only(findall(x -> x.name == "", git_archive.files))
+
+# Code for listing 8.3
+
+using CSV
+using DataFrames
+edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
+classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
+close(git_archive)
+summary(edges_df)
+describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
+summary(classes_df)
+describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
+
+# Code for updating data frame columns using broadcasting
+
+edges_df .+= 1
+classes_df.id .+= 1
+
+# Code for examples of data frame broadcasting
+
+df = DataFrame(a=1:3, b=[4, missing, 5])
+df .^ 2
+coalesce.(df, 0)
+df .+ [10, 11, 12]
+
+# Code for checking the order of :id column in a data frame
+
+classes_df.id == axes(classes_df, 1)
+
+# Code for the difference between ! and : in broadcasting assignment
+
+df = DataFrame(a=1:3, b=1:3)
+df[!, :a] .= "x"
+df[:, :b] .= "x"
+df
+
+# Code for the difference between ! and : in assignment
+
+df = DataFrame(a=1:3, b=1:3, c=1:3)
+df[!, :a] = ["x", "y", "z"]
+df[:, :b] = ["x", "y", "z"]
+df[:, :c] = [11, 12, 13]
+df
+
+# Codes for section 8.2
+
+# Code from listing 8.4
+
+using Graphs
+gh = SimpleGraph(nrow(classes_df))
+for (from, to) in eachrow(edges_df)
+    add_edge!(gh, from, to)
+end
+gh
+ne(gh)
+nv(gh)
+
+# Code for iterator destructuring in iteration specification
+
+mat = [1 2; 3 4; 5 6]
+for (x1, x2) in eachrow(mat)
+    @show x1, x2
+end
+
+# Code for getting degrees of nodes in the graph
+
+degree(gh)
+
+# Code for adding a column to a data frame
+
+classes_df.deg = degree(gh)
+
+# Code for the difference between ! and : when adding a column
+
+df = DataFrame()
+x = [1, 2, 3]
+df[!, :x1] = x
+df[:, :x2] = x
+df
+df.x1 === x
+df.x2 === x
+df.x2 == x
+
+# Code for creating a column using broadcasting
+
+df.x3 .= 1
+df
+
+# Code for edge iterator of a graph
+
+edges(gh)
+
+e1 = first(edges(gh))
+dump(e1)
+e1.src
+e1.dst
+
+# Code for listing 8.5
+
+function deg_class(gh, class)
+    deg_ml = zeros(Int, length(class))
+    deg_web = zeros(Int, length(class))
+    for edge in edges(gh)
+        a, b = edge.src, edge.dst
+        if class[b] == 1
+            deg_ml[a] += 1
+        else
+            deg_web[a] += 1
+        end
+        if class[a] == 1
+            deg_ml[b] += 1
+        else
+            deg_web[b] += 1
+        end
+    end
+    return (deg_ml, deg_web)
+end
+
+# Code for computing machine learning and web neighbors for gh graph
+
+classes_df.deg_ml, classes_df.deg_web =
+deg_class(gh, classes_df.ml_target)
+
+# Code for checking type stability of deg_class function
+
+@time deg_class(gh, classes_df.ml_target);
+@code_warntype deg_class(gh, classes_df.ml_target)
+
+# Code for checking the classes_df summary statistics
+
+describe(classes_df, :min, :max, :mean, :std)
+
+# Code for average degree of node in the graph
+
+2 * ne(gh) / nv(gh)
+
+# Code for checking correctness of computations
+
+classes_df.deg_ml + classes_df.deg_web == classes_df.deg
+
+# Code for showing that DataFrames.jl checks consistency of stored objects
+
+df = DataFrame(a=1, b=11)
+push!(df.a, 2)
+df
+
+# Codes for section 8.3
+
+# Code for computing groupwise means of columns
+
+using Statistics
+for type in [0, 1], col in ["deg_ml", "deg_web"]
+    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
+end
+
+gdf = groupby(classes_df, :ml_target)
+combine(gdf,
+        :deg_ml => mean => :mean_deg_ml,
+        :deg_web => mean => :mean_deg_web)
+
+using DataFramesMeta
+@combine(gdf,
+         :mean_deg_ml = mean(:deg_ml),
+         :mean_deg_web = mean(:deg_web))
+
+# Code for simple plotting of relationship between developer degree and type
+
+using Plots
+scatter(classes_df.deg_ml, classes_df.deg_web;
+        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
+        xlabel="degree ml", ylabel="degree web", labels=false)
+
+# Code for aggregation of degree data
+
+agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
+                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
+
+# Code for comparison how Julia parses expressions
+
+:ml_target => (x -> 1 - mean(x)) => :web_mean
+:ml_target => x -> 1 - mean(x) => :web_mean
+
+# Code for aggregation using DataFramesMeta.jl
+
+@combine(groupby(classes_df, [:deg_ml, :deg_web]),
+         :web_mean = 1 - mean(:ml_target))
+
+# Code for getting summary information about the aggregated data frame
+
+describe(agg_df)
+
+# Code for log1p function
+
+log1p(0)
+
+# Code for listing 8.6
+
+function gen_ticks(maxv)
+    max2 = round(Int, log2(maxv))
+    tick = [0; 2 .^ (0:max2)]
+    return (log1p.(tick), tick)
+end
+
+log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
+
+using Random
+Random.seed!(1234);
+scatter(log1pjitter.(agg_df.deg_ml),
+        log1pjitter.(agg_df.deg_web);
+        zcolor=agg_df.web_mean,
+        xlabel="degree ml", ylabel="degree web",
+        markersize=2, markerstrokewidth=0, markeralpha=0.8,
+        legend=:topleft, labels = "fraction web",
+        xticks=gen_ticks(maximum(classes_df.deg_ml)),
+        yticks=gen_ticks(maximum(classes_df.deg_web)))
+
+# Code for fitting logistic regression model
+
+using GLM
+glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
+
+# Code for inspecting @formula result
+
+@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
+
+# Code for inserting columns to a data frame
+
+df = DataFrame(x=1:3)
+insertcols!(df, :y => 4:6)
+insertcols!(df, :y => 4:6)
+insertcols!(df, :z => 1)
+
+insertcols!(df, 1, :a => 0)
+insertcols!(df, :x, :pre_x => 2)
+insertcols!(df, :x, :post_x => 3, after=true)