update up to chapter 9

2022-02-13 11:59:23 +01:00 · 2022-02-13 11:59:23 +01:00 · ab6b8f18f3
commit ab6b8f18f3
parent e1d5277f8c
4 changed files with 637 additions and 618 deletions
--- a/ch08.jl
+++ b/ch08.jl
@ -1,8 +1,8 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 6
+# Codes for chapter 8
-# Code for section 6.1
+# Code for section 8.1
 if isfile("puzzles.csv.bz2")
    @info "file already present"
@ -25,22 +25,27 @@ end
 readlines("puzzles.csv")
-# Code for section 6.2
+# Code for section 8.2
 using CSV
 using DataFrames
 puzzles = CSV.read("puzzles.csv", DataFrame);
-CSV.read(plain, DataFrame);
+puzzles2 = CSV.read(plain, DataFrame;
                    header=["PuzzleId", "FEN", "Moves",
                            "Rating","RatingDeviation",
                            "Popularity", "NbPlays",
                            "Themes","GameUrl"]);
 puzzles == puzzles2
 compressed = nothing
 plain = nothing
-# Code for listing 6.1
+# Code for listing 8.1
 puzzles
-# Code for listing 6.2
+# Code for listing 8.2
 describe(puzzles)
@ -52,7 +57,13 @@ nrow(puzzles)
 names(puzzles)
-# Code for section 6.3
+CSV.write("puzzles2.csv", puzzles)
 read("puzzles2.csv")
 read("puzzles2.csv") == read("puzzles.csv")
 # Code for section 8.3
 puzzles.Rating
@ -101,148 +112,3 @@ plot(histogram(puzzles.Rating, label="Rating"),
 plot([histogram(puzzles[!, col]; label=col) for
      col in ["Rating", "RatingDeviation",
              "Popularity", "NbPlays"]]...)
 # Code for section 6.4
 using Statistics
 plays_lo = median(puzzles.NbPlays)
 puzzles.NbPlays .> plays_lo
 puzzles.NbPlays > plays_lo
 rating_lo = 1500
 rating_hi = quantile(puzzles.Rating, 0.99)
 rating_lo .< puzzles.Rating .< rating_hi
 row_selector = (puzzles.NbPlays .> plays_lo) .&&
               (rating_lo .< puzzles.Rating .< rating_hi)
 sum(row_selector)
 count(row_selector)
 # Code for listing 6.3
 good = puzzles[row_selector, ["Rating", "Popularity"]]
 # Code for plotting histograms
 plot(histogram(good.Rating; label="Rating"),
     histogram(good.Popularity; label="Popularity"))
 # Code for column selectors
 puzzles[1, "Rating"]
 puzzles[:, "Rating"]
 row1 = puzzles[1, ["Rating", "Popularity"]]
 row1["Rating"]
 row1[:Rating]
 row1[1]
 row1.Rating
 row1."Rating"
 good = puzzles[row_selector, ["Rating", "Popularity"]]
 good[1, "Rating"]
 good[1, :]
 good[:, "Rating"]
 good[:, :]
 names(puzzles, ["Rating", "Popularity"])
 names(puzzles, [:Rating, :Popularity])
 names(puzzles, [4, 6])
 names(puzzles, [false, false, false, true, false, true, false, false, false])
 names(puzzles, r"Rating")
 names(puzzles, Not([4, 6]))
 names(puzzles, Not(r"Rating"))
 names(puzzles, Between("Rating", "Popularity"))
 names(puzzles, :)
 names(puzzles, All())
 names(puzzles, Cols(r"Rating", "NbPlays"))
 names(puzzles, Cols(startswith("P")))
 names(puzzles, startswith("P"))
 names(puzzles, Real)
 names(puzzles, AbstractString)
 puzzles[:, names(puzzles, Real)]
 # Code for row subsetting
 df1 = puzzles[:, ["Rating", "Popularity"]];
 df2 = puzzles[!, ["Rating", "Popularity"]];
 df1 == df2
 df1 == puzzles
 df2 == puzzles
 df1.Rating === puzzles.Rating
 df1.Popularity === puzzles.Popularity
 df2.Rating === puzzles.Rating
 df2.Popularity === puzzles.Popularity
@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
 puzzles[1, 1]
 puzzles[[1], 1]
 puzzles[1, [1]]
 puzzles[[1], [1]]
 # Code for making views
@view puzzles[1, 1]
@view puzzles[[1], 1]
@view puzzles[1, [1]]
@view puzzles[[1], [1]]
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
 parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
 # Code for section 6.5
 describe(good)
 # Map every distinct rating in `good.Rating` to the row positions where it occurs.
 rating_mapping = Dict{Int, Vector{Int}}()
 for (row, rating) in enumerate(good.Rating)
    # get! stores a fresh empty Vector{Int} the first time a rating is seen,
    # then the row position is appended to whichever vector is registered.
    push!(get!(Vector{Int}, rating_mapping, rating), row)
 end
 rating_mapping
 good[rating_mapping[2108], :]
 unique(good[rating_mapping[2108], :].Rating)
 using Statistics
 mean(good[rating_mapping[2108], "Popularity"])
 ratings = unique(good.Rating)
 # For each rating, average the "Popularity" values of the rows recorded
 # for that rating in `rating_mapping`.
 mean_popularities = [mean(good[rating_mapping[rating], "Popularity"])
                      for rating in ratings]
 scatter(ratings, mean_popularities;
        xlabel="rating", ylabel="mean popularity", legend=false)
 import Loess
 model = Loess.loess(ratings, mean_popularities);
 ratings_predict = float.(sort(ratings))
 popularity_predict = Loess.predict(model, ratings_predict)
 plot!(ratings_predict, popularity_predict; width=5, color="black")
--- a/ch09.jl
+++ b/ch09.jl
@ -1,279 +1,153 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 7
+# Codes for chapter 9
-# Code for section 7.1
+# Code for section 9.1
 aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
 data = (set1=(x=aq[:, 1], y=aq[:, 2]),
        set2=(x=aq[:, 3], y=aq[:, 4]),
        set3=(x=aq[:, 5], y=aq[:, 6]),
        set4=(x=aq[:, 7], y=aq[:, 8]));
 using DataFrames
 using CSV
 using Plots
 puzzles = CSV.read("puzzles.csv", DataFrame);
-# Code for listing 7.1
+using Statistics
 plays_lo = median(puzzles.NbPlays)
 puzzles.NbPlays .> plays_lo
-aq1 = ataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
+puzzles.NbPlays > plays_lo
 DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
-# Code for creating DataFrame with automatic column names
+rating_lo = 1500
 rating_hi = quantile(puzzles.Rating, 0.99)
 rating_lo .< puzzles.Rating .< rating_hi
-DataFrame(aq, :auto)
+row_selector = (puzzles.NbPlays .> plays_lo) .&&
               (rating_lo .< puzzles.Rating .< rating_hi)
-# Codes for creating DataFrame from vector of vectors
+sum(row_selector)
 count(row_selector)
-aq_vec = collect(eachcol(aq))
+# Code for listing 9.1
 DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
 DataFrame(aq_vec, :auto)
-# Codes for section 7.1.2
+good = puzzles[row_selector, ["Rating", "Popularity"]]
-data.set1.x
+# Code for plotting histograms
-DataFrame(x1=data.set1.x, y1=data.set1.y,
+plot(histogram(good.Rating; label="Rating"),
-          x2=data.set2.x, y2=data.set2.y,
+     histogram(good.Popularity; label="Popularity"))
          x3=data.set3.x, y3=data.set3.y,
          x4=data.set4.x, y4=data.set4.y)
-DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
+# Code for column selectors
          :x2 => data.set2.x, :y2 => data.set2.y,
          :x3 => data.set3.x, :y3 => data.set3.y,
          :x4 => data.set4.x, :y4 => data.set4.y)
-DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
+puzzles[1, "Rating"]
           :x2 => data.set2.x, :y2 => data.set2.y,
           :x3 => data.set3.x, :y3 => data.set3.y,
           :x4 => data.set4.x, :y4 => data.set4.y]);
-[(i, v) for i in 1:4 for v in [:x, :y]]
+puzzles[:, "Rating"]
-[string(v, i) for i in 1:4 for v in [:x, :y]]
+row1 = puzzles[1, ["Rating", "Popularity"]]
-[string(v, i) => getproperty(data[i], v)
+row1["Rating"]
-        for i in 1:4 for v in [:x, :y]]
+row1[:Rating]
 row1[1]
 row1.Rating
 row1."Rating"
-DataFrame([string(v, i) => getproperty(data[i], v)
+good = puzzles[row_selector, ["Rating", "Popularity"]]
           for i in 1:4 for v in [:x, :y]]);
-data_dict = Dict([string(v, i) => getproperty(data[i], v)
+good[1, "Rating"]
-                         for i in 1:4 for v in [:x, :y]])
+good[1, :]
-collect(data_dict)
+good[:, "Rating"]
 good[:, :]
-DataFrame(data_dict)
+names(puzzles, ["Rating", "Popularity"])
 names(puzzles, [:Rating, :Popularity])
 names(puzzles, [4, 6])
 names(puzzles, [false, false, false, true, false, true, false, false, false])
 names(puzzles, r"Rating")
 names(puzzles, Not([4, 6]))
 names(puzzles, Not(r"Rating"))
 names(puzzles, Between("Rating", "Popularity"))
 names(puzzles, :)
 names(puzzles, All())
 names(puzzles, Cols(r"Rating", "NbPlays"))
 names(puzzles, Cols(startswith("P")))
-df1 = DataFrame(x1=data.set1.x)
+names(puzzles, startswith("P"))
 df1.x1 === data.set1.x
-df2 = DataFrame(x1=data.set1.x; copycols=false)
+names(puzzles, Real)
 df2.x1 === data.set1.x
-df = DataFrame(x=1:3, y=1)
+names(puzzles, AbstractString)
 df.x
-DataFrame(x=[1], y=[1, 2, 3])
+puzzles[:, names(puzzles, Real)]
-# Codes for section 7.1.3
+# Code for row subsetting
-data.set1
+df1 = puzzles[:, ["Rating", "Popularity"]];
-DataFrame(data.set1)
+df2 = puzzles[!, ["Rating", "Popularity"]];
-DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
+df1 == df2
 df1 == puzzles
 df2 == puzzles
-data
+df1.Rating === puzzles.Rating
 df1.Popularity === puzzles.Popularity
 df2.Rating === puzzles.Rating
 df2.Popularity === puzzles.Popularity
-# Code for listing 7.2
+@benchmark $puzzles[:, ["Rating", "Popularity"]]
@benchmark $puzzles[!, ["Rating", "Popularity"]]
-aq2 = DataFrame(data)
+puzzles[1, 1]
 puzzles[[1], 1]
 puzzles[1, [1]]
 puzzles[[1], [1]]
-# Codes for listing 7.3
+# Code for making views
-data_dfs = map(DataFrame, data)
+@view puzzles[1, 1]
-# Codes for vertical concatenation examples
+@view puzzles[[1], 1]
-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
+@view puzzles[1, [1]]
-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
+@view puzzles[[1], [1]]
     source="source_id")
-vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
+@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
-     source="source_id"=>string.("set", 1:4))
+@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
-reduce(vcat, collect(data_dfs);
+parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
       source="source_id"=>string.("set", 1:4))
-# Code for listing 7.4
+# Code for section 9.2
-df1 = DataFrame(a=1:3, b=11:13)
+describe(good)
 df2 = DataFrame(a=4:6, c=24:26)
 vcat(df1, df2)
 vcat(df1, df2; cols=:union)
-# Code for listing 7.5
+rating_mapping = Dict{Int, Vector{Int}}()
-
+for (i, rating) in enumerate(good.Rating)
-df_agg = DataFrame()
+    if haskey(rating_mapping, rating)
-append!(df_agg, data_dfs.set1)
+        push!(rating_mapping[rating], i)
-append!(df_agg, data_dfs.set2)
+    else
-
+        rating_mapping[rating] = [i]
 # Code for appending tables to a data frame
 df_agg = DataFrame()
 append!(df_agg, data.set1)
 append!(df_agg, data.set2)
 # Code for promote keyword argument
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, b=[14, missing, 16])
 append!(df1, df2)
 append!(df1, df2; promote=true)
 # Code for section 7.2.3
 df = DataFrame()
 push!(df, (a=1, b=2))
 push!(df, (a=3, b=4))
 df = DataFrame(a=Int[], b=Int[])
 push!(df, [1, 2])
 push!(df, [3, 4])
 # Move `current` (anything with `x` and `y` properties) by one randomly chosen
 # unit step along a single axis and return the new position as a NamedTuple.
 function sim_step(current)
    step = rand(((1,0), (-1,0), (0,1), (0,-1)))
    return (x=current.x + step[1], y=current.y + step[2])
 end
 using BenchmarkTools
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
 dx, dy = (10, 20)
 dx
 dy
 using FreqTables
 using Random
 Random.seed!(1234);
 proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
 using Random
 Random.seed!(6);
 walk = DataFrame(x=0, y=0)
 for _ in 1:10
    current = walk[end, :]
    push!(walk, sim_step(current))
 end
 walk
 plot(walk.x, walk.y;
     legend=false,
     series_annotations=1:11,
     xticks=range(extrema(walk.x)...),
     yticks=range(extrema(walk.y)...))
 extrema(walk.y)
 range(1, 5)
 (3/4)^9
 # Code for listing 7.6
 # Simulate a 10-step walk starting at (0, 0) and report whether all
 # visited points (including the start) are distinct.
 function walk_unique()
    path = DataFrame(x=0, y=0)
    for _ in 1:10
        # the last row is the current position; append the next one
        push!(path, sim_step(path[end, :]))
    end
    # dropping duplicate rows leaves the row count unchanged only if no point repeats
    return nrow(path) == nrow(unique(path))
 end
-Random.seed!(2);
+rating_mapping
 proptable([walk_unique() for _ in 1:10^5])
-# Code for a note on conversion
+good[rating_mapping[2108], :]
-x = [1.5]
+unique(good[rating_mapping[2108], :].Rating)
 x[1] = 1
 x
-# Code from section 7.3.1
+using Statistics
 mean(good[rating_mapping[2108], "Popularity"])
-Matrix(walk)
+ratings = unique(good.Rating)
 Matrix{Any}(walk)
 Matrix{String}(walk)
-plot(walk)
+mean_popularities = map(ratings) do rating
-
+    indices = rating_mapping[rating]
-plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
+    popularities = good[indices, "Popularity"]
-
+    return mean(popularities)
 # Code from section 7.3.2
 Tables.columntable(walk)
 using BenchmarkTools
 # Add up the elements of `table.x`, starting from the integer 0.
 # NOTE(review): the explicit accumulator loop is kept as-is in shape; the
 # function is benchmarked and inspected with @code_warntype right below,
 # so presumably its exact form matters - confirm before restructuring.
 function mysum(table)
    acc = 0
    for item in table.x
        acc += item
    end
    return acc
 end
 df = DataFrame(x=1:1_000_000);
@btime mysum($df)
 tab = Tables.columntable(df);
@btime mysum($tab)
@code_warntype mysum(df)
@code_warntype mysum(tab)
 typeof(tab)
 # Kernel: left-to-right sum of the elements of `x`, seeded with the integer 0.
 barrier_mysum2(x) = foldl(+, x; init=0)
 # Entry point: extract `table.x` and pass it to the kernel as a plain argument.
 mysum2(table) = barrier_mysum2(table.x)
@btime mysum2($df)
 df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
 unique(df)
 tab = Tables.columntable(df)
 unique(tab)
 # Code from section 7.3.3
 Tables.rowtable(walk)
 nti = Tables.namedtupleiterator(walk)
 for v in nti
    println(v)
 end
-er = eachrow(walk)
+scatter(ratings, mean_popularities;
-er[1]
+        xlabel="rating", ylabel="mean popularity", legend=false)
 er[end]
 ec = eachcol(walk)
 ec[1]
 ec[end]
-identity.(eachcol(walk))
+import Loess
 model = Loess.loess(ratings, mean_popularities);
 ratings_predict = float.(sort(ratings))
 popularity_predict = Loess.predict(model, ratings_predict)
-df = DataFrame(x=1:2, b=["a", "b"])
+plot!(ratings_predict, popularity_predict; width=5, color="black")
 identity.(eachcol(df))
--- a/ch10.jl
+++ b/ch10.jl
@ -1,284 +1,279 @@
 # Bogumił Kamiński, 2022
-# Codes for chapter 8
+# Codes for chapter 7
-# Codes for section 8.1
+# Code for section 7.1
-# Code for listing 8.1
+aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
-import Downloads
+data = (set1=(x=aq[:, 1], y=aq[:, 2]),
-using SHA
+        set2=(x=aq[:, 3], y=aq[:, 4]),
-git_zip = "git_web_ml.zip"
+        set3=(x=aq[:, 5], y=aq[:, 6]),
-if !isfile(git_zip)
+        set4=(x=aq[:, 7], y=aq[:, 8]));
    Downloads.download("https://snap.stanford.edu/data/" *
                       "git_web_ml.zip",
                       git_zip)
 end
 isfile(git_zip)
 open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
                          0xc4, 0x60, 0xdc, 0x4c,
                          0x7b, 0xf8, 0x93, 0x57,
                          0xb1, 0xfe, 0xc0, 0x20,
                          0xf4, 0x5e, 0x2e, 0xce,
                          0xba, 0xb8, 0x1d, 0x13,
                          0x1d, 0x07, 0x3b, 0x10,
                          0xe2, 0x8e, 0xc0, 0x31]
 # Code for opening a zip archive
 import ZipFile
 git_archive = ZipFile.Reader(git_zip)
 # Code for listing 8.2
 # Read the archive member called `filename` and parse its bytes as CSV
 # into a DataFrame. `only` raises unless exactly one member has that name.
 function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
    matches = findall(f -> f.name == filename, archive.files)
    entry = archive.files[only(matches)]
    return CSV.read(read(entry), DataFrame)
 end
 # Code for working with zip archive
 git_archive.files
 git_archive.files[2].name
 findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
 findall(x -> x.name == "", git_archive.files)
 only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
 only(findall(x -> x.name == "", git_archive.files))
 # Code for listing 8.3
 using CSV
 using DataFrames
 edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
 classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
 close(git_archive)
 summary(edges_df)
 describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
 summary(classes_df)
 describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
-# Code for updating data frame columns using broadcasting
+# Code for listing 7.1
-edges_df .+= 1
+aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
-classes_df.id .+= 1
+DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
-# Code for examples of data frame broadcasting
+# Code for creating DataFrame with automatic column names
-df = DataFrame(a=1:3, b=[4, missing, 5])
+DataFrame(aq, :auto)
 df .^ 2
 coalesce.(df, 0)
 df .+ [10, 11, 12]
-# Code for checking the order of :id column in a data frame
+# Codes for creating DataFrame from vector of vectors
-classes_df.id == axes(classes_df, 1)
+aq_vec = collect(eachcol(aq))
 DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
 DataFrame(aq_vec, :auto)
-# Code for the difference between ! and : in broadcasting assignment
+# Codes for section 7.1.2
-df = DataFrame(a=1:3, b=1:3)
+data.set1.x
 df[!, :a] .= "x"
 df[:, :b] .= "x"
 df
-# Code for the difference between ! and : in assignment
+DataFrame(x1=data.set1.x, y1=data.set1.y,
          x2=data.set2.x, y2=data.set2.y,
          x3=data.set3.x, y3=data.set3.y,
          x4=data.set4.x, y4=data.set4.y)
-df = DataFrame(a=1:3, b=1:3, c=1:3)
+DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
-df[!, :a] = ["x", "y", "z"]
+          :x2 => data.set2.x, :y2 => data.set2.y,
-df[:, :b] = ["x", "y", "z"]
+          :x3 => data.set3.x, :y3 => data.set3.y,
-df[:, :c] = [11, 12, 13]
+          :x4 => data.set4.x, :y4 => data.set4.y)
 df
-# Codes for section 8.2
+DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
           :x2 => data.set2.x, :y2 => data.set2.y,
           :x3 => data.set3.x, :y3 => data.set3.y,
           :x4 => data.set4.x, :y4 => data.set4.y]);
-# Code from listing 8.4
+[(i, v) for i in 1:4 for v in [:x, :y]]
-using Graphs
+[string(v, i) for i in 1:4 for v in [:x, :y]]
 gh = SimpleGraph(nrow(classes_df))
 for (from, to) in eachrow(edges_df)
    add_edge!(gh, from, to)
 end
 gh
 ne(gh)
 nv(gh)
-# Code for iterator destruction in iteration specification
+[string(v, i) => getproperty(data[i], v)
        for i in 1:4 for v in [:x, :y]]
-mat = [1 2; 3 4; 5 6]
+DataFrame([string(v, i) => getproperty(data[i], v)
-for (x1, x2) in eachrow(mat)
+           for i in 1:4 for v in [:x, :y]]);
    @show x1, x2
 end
-# Code for getting degrees of nodes in the graph
+data_dict = Dict([string(v, i) => getproperty(data[i], v)
                         for i in 1:4 for v in [:x, :y]])
 collect(data_dict)
-degree(gh)
+DataFrame(data_dict)
-# Code for adding a column to a data frame
+df1 = DataFrame(x1=data.set1.x)
 df1.x1 === data.set1.x
-classes_df.deg = degree(gh)
+df2 = DataFrame(x1=data.set1.x; copycols=false)
 df2.x1 === data.set1.x
-# Code for the difference between ! and : when adding a column
+df = DataFrame(x=1:3, y=1)
 df.x
 DataFrame(x=[1], y=[1, 2, 3])
 # Codes for section 7.1.3
 data.set1
 DataFrame(data.set1)
 DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
 data
 # Code for listing 7.2
 aq2 = DataFrame(data)
 # Codes for listing 7.3
 data_dfs = map(DataFrame, data)
 # Codes for vertical concatenation examples
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id")
 vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
     source="source_id"=>string.("set", 1:4))
 reduce(vcat, collect(data_dfs);
       source="source_id"=>string.("set", 1:4))
 # Code for listing 7.4
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, c=24:26)
 vcat(df1, df2)
 vcat(df1, df2; cols=:union)
 # Code for listing 7.5
 df_agg = DataFrame()
 append!(df_agg, data_dfs.set1)
 append!(df_agg, data_dfs.set2)
 # Code for appending tables to a data frame
 df_agg = DataFrame()
 append!(df_agg, data.set1)
 append!(df_agg, data.set2)
 # Code for promote keyword argument
 df1 = DataFrame(a=1:3, b=11:13)
 df2 = DataFrame(a=4:6, b=[14, missing, 16])
 append!(df1, df2)
 append!(df1, df2; promote=true)
 # Code for section 7.2.3
 df = DataFrame()
-x = [1, 2, 3]
+push!(df, (a=1, b=2))
-df[!, :x1] = x
+push!(df, (a=3, b=4))
 df[:, :x2] = x
 df
 df.x1 === x
 df.x2 === x
 df.x2 == x
-# Code for creating a column using broadcasting
+df = DataFrame(a=Int[], b=Int[])
 push!(df, [1, 2])
 push!(df, [3, 4])
-df.x3 .= 1
+function sim_step(current)
-df
+    dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
-
+    return (x=current.x + dx, y=current.y + dy)
 # Code for edge iterator of a graph
 edges(gh)
 e1 = first(edges(gh))
 dump(e1)
 e1.src
 e1.dst
 # Code for listing 8.5
 # For every node, count its neighbors whose `class` entry equals 1 (`deg_ml`)
 # and its neighbors with any other entry (`deg_web`). Returns both count
 # vectors, each of length `length(class)`.
 function deg_class(gh, class)
    n = length(class)
    deg_ml = zeros(Int, n)
    deg_web = zeros(Int, n)
    for e in edges(gh)
        u, v = e.src, e.dst
        # an edge counts once for each endpoint, classified by the opposite end
        for (node, nbr) in ((u, v), (v, u))
            if class[nbr] == 1
                deg_ml[node] += 1
            else
                deg_web[node] += 1
            end
        end
    end
    return (deg_ml, deg_web)
 end
-# Code for computing machine learning and web neighbors for gh graph
+using BenchmarkTools
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
-classes_df.deg_ml, classes_df.deg_web =
+dx, dy = (10, 20)
-deg_class(gh, classes_df.ml_target)
+dx
-
+dy
 # Code for checking type stability of deg_class function
@time deg_class(gh, classes_df.ml_target);
@code_warntype deg_class(gh, classes_df.ml_target)
 # Code for checking the classes_df summary statistics
 describe(classes_df, :min, :max, :mean, :std)
 # Code for average degree of node in the graph
 2 * ne(gh) / nv(gh)
 # Code for checking correctness of computations
 classes_df.deg_ml + classes_df.deg_web == classes_df.deg
 # Code for showing that DataFrames.jl checks consistency of stored objects
 df = DataFrame(a=1, b=11)
 push!(df.a, 2)
 df
 # Codes for section 8.3
 # Code for computing groupwise means of columns
 using Statistics
 for type in [0, 1], col in ["deg_ml", "deg_web"]
    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
 end
 gdf = groupby(classes_df, :ml_target)
 combine(gdf,
        :deg_ml => mean => :mean_deg_ml,
        :deg_web => mean => :mean_deg_web)
 using DataFramesMeta
@combine(gdf,
         :mean_deg_ml = mean(:deg_ml),
         :mean_deg_web = mean(:deg_web))
 # Code for simple plotting of relationship between developer degree and type
 using Plots
 scatter(classes_df.deg_ml, classes_df.deg_web;
        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
        xlabel="degree ml", ylabel="degree web", labels=false)
 # Code for aggregation of degree data
 agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
 # Code for comparison how Julia parses expressions
 :ml_target => (x -> 1 - mean(x)) => :web_mean
 :ml_target => x -> 1 - mean(x) => :web_mean
 # Code for aggregation using DataFramesMeta.jl
@combine(groupby(classes_df, [:deg_ml, :deg_web]),
         :web_mean = 1 - mean(:ml_target))
 # Code for getting summary information about the aggregated data frame
 describe(agg_df)
 # Code for log1p function
 log1p(0)
 # Code for listing 8.6
 # Build axis ticks for a log1p-scaled plot: labels are 0 followed by the powers
 # of two up to 2^round(log2(maxv)); positions are log1p of those labels.
 # Returns the tuple (positions, labels).
 function gen_ticks(maxv)
    top_exp = round(Int, log2(maxv))
    labels = vcat(0, 2 .^ (0:top_exp))
    positions = map(log1p, labels)
    return (positions, labels)
 end
 # log1p(x) shifted by a random offset: rand() / 10 - 0.05 lies in [-0.05, 0.05).
 log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
 using FreqTables
 using Random
 Random.seed!(1234);
-scatter(log1pjitter.(agg_df.deg_ml),
+proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
        log1pjitter.(agg_df.deg_web);
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0, markeralpha=0.8,
        legend=:topleft, labels = "fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))
-# Code for fitting logistic regression model
+using Random
 Random.seed!(6);
 walk = DataFrame(x=0, y=0)
 for _ in 1:10
    current = walk[end, :]
    push!(walk, sim_step(current))
 end
 walk
-using GLM
+plot(walk.x, walk.y;
-glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
+     legend=false,
     series_annotations=1:11,
     xticks=range(extrema(walk.x)...),
     yticks=range(extrema(walk.y)...))
-# Code for inspecting @formula result
+extrema(walk.y)
-@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
+range(1, 5)
-# Code for inserting columns to a data frame
+(3/4)^9
-df = DataFrame(x=1:3)
+# Code for listing 7.6
 insertcols!(df, :y => 4:6)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :z => 1)
-insertcols!(df, 1, :a => 0)
+function walk_unique() #A
-insertcols!(df, :x, :pre_x => 2)
+    walk = DataFrame(x=0, y=0)
-insertcols!(df, :x, :post_x => 3, after=true)
+    for _ in 1:10
        current = walk[end, :]
        push!(walk, sim_step(current))
    end
    return nrow(unique(walk)) == nrow(walk) #B
 end
 Random.seed!(2);
 proptable([walk_unique() for _ in 1:10^5])
 # Code for a note on conversion
 x = [1.5]
 x[1] = 1
 x
 # Code from section 7.3.1
 Matrix(walk)
 Matrix{Any}(walk)
 Matrix{String}(walk)
 plot(walk)
 plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
 # Code from section 7.3.2
 Tables.columntable(walk)
 using BenchmarkTools
 # Add up the elements of `table.x`, starting from the integer 0.
 # NOTE(review): the explicit accumulator loop is kept as-is in shape; the
 # function is benchmarked and inspected with @code_warntype right below,
 # so presumably its exact form matters - confirm before restructuring.
 function mysum(table)
    acc = 0
    for item in table.x
        acc += item
    end
    return acc
 end
 df = DataFrame(x=1:1_000_000);
@btime mysum($df)
 tab = Tables.columntable(df);
@btime mysum($tab)
@code_warntype mysum(df)
@code_warntype mysum(tab)
 typeof(tab)
 # Kernel: left-to-right sum of the elements of `x`, seeded with the integer 0.
 barrier_mysum2(x) = foldl(+, x; init=0)
 # Entry point: extract `table.x` and pass it to the kernel as a plain argument.
 mysum2(table) = barrier_mysum2(table.x)
@btime mysum2($df)
 df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
 unique(df)
 tab = Tables.columntable(df)
 unique(tab)
 # Code from section 7.3.3
 Tables.rowtable(walk)
 nti = Tables.namedtupleiterator(walk)
 for v in nti
    println(v)
 end
 er = eachrow(walk)
 er[1]
 er[end]
 ec = eachcol(walk)
 ec[1]
 ec[end]
 identity.(eachcol(walk))
 df = DataFrame(x=1:2, b=["a", "b"])
 identity.(eachcol(df))
--- a/ch12.jl
+++ b/ch12.jl
@ -0,0 +1,284 @@
 # Bogumił Kamiński, 2022
 # Codes for chapter 8
 # Codes for section 8.1
 # Code for listing 8.1
 import Downloads
 using SHA
 git_zip = "git_web_ml.zip"
 if !isfile(git_zip)
    Downloads.download("https://snap.stanford.edu/data/" *
                       "git_web_ml.zip",
                       git_zip)
 end
 isfile(git_zip)
 open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
                          0xc4, 0x60, 0xdc, 0x4c,
                          0x7b, 0xf8, 0x93, 0x57,
                          0xb1, 0xfe, 0xc0, 0x20,
                          0xf4, 0x5e, 0x2e, 0xce,
                          0xba, 0xb8, 0x1d, 0x13,
                          0x1d, 0x07, 0x3b, 0x10,
                          0xe2, 0x8e, 0xc0, 0x31]
 # Code for opening a zip archive
 import ZipFile
 git_archive = ZipFile.Reader(git_zip)
 # Code for listing 8.2
 # Read the archive member called `filename` and parse its bytes as CSV
 # into a DataFrame. `only` raises unless exactly one member has that name.
 function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
    matches = findall(f -> f.name == filename, archive.files)
    entry = archive.files[only(matches)]
    return CSV.read(read(entry), DataFrame)
 end
 # Code for working with zip archive
 git_archive.files
 git_archive.files[2].name
 findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
 findall(x -> x.name == "", git_archive.files)
 only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
 only(findall(x -> x.name == "", git_archive.files))
 # Code for listing 8.3
 using CSV
 using DataFrames
 edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
 classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
 close(git_archive)
 summary(edges_df)
 describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
 summary(classes_df)
 describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
 # Code for updating data frame columns using broadcasting
 edges_df .+= 1
 classes_df.id .+= 1
 # Code for examples of data frame broadcasting
 df = DataFrame(a=1:3, b=[4, missing, 5])
 df .^ 2
 coalesce.(df, 0)
 df .+ [10, 11, 12]
 # Code for checking the order of :id column in a data frame
 classes_df.id == axes(classes_df, 1)
 # Code for the difference between ! and : in broadcasting assignment
 df = DataFrame(a=1:3, b=1:3)
 df[!, :a] .= "x"
 df[:, :b] .= "x"
 df
 # Code for the difference between ! and : in assignment
 df = DataFrame(a=1:3, b=1:3, c=1:3)
 df[!, :a] = ["x", "y", "z"]
 df[:, :b] = ["x", "y", "z"]
 df[:, :c] = [11, 12, 13]
 df
 # Codes for section 8.2
 # Code from listing 8.4
 using Graphs
 gh = SimpleGraph(nrow(classes_df))
 for (from, to) in eachrow(edges_df)
    add_edge!(gh, from, to)
 end
 gh
 ne(gh)
 nv(gh)
 # Code for iterator destructuring in iteration specification
 mat = [1 2; 3 4; 5 6]
 for (x1, x2) in eachrow(mat)
    @show x1, x2
 end
 # Code for getting degrees of nodes in the graph
 degree(gh)
 # Code for adding a column to a data frame
 classes_df.deg = degree(gh)
 # Code for the difference between ! and : when adding a column
 df = DataFrame()
 x = [1, 2, 3]
 df[!, :x1] = x
 df[:, :x2] = x
 df
 df.x1 === x
 df.x2 === x
 df.x2 == x
 # Code for creating a column using broadcasting
 df.x3 .= 1
 df
 # Code for edge iterator of a graph
 edges(gh)
 e1 = first(edges(gh))
 dump(e1)
 e1.src
 e1.dst
 # Code for listing 8.5
 # For every node, count its neighbors whose `class` entry equals 1 (`deg_ml`)
 # and its neighbors with any other entry (`deg_web`). Returns both count
 # vectors, each of length `length(class)`.
 function deg_class(gh, class)
    n = length(class)
    deg_ml = zeros(Int, n)
    deg_web = zeros(Int, n)
    for e in edges(gh)
        u, v = e.src, e.dst
        # an edge counts once for each endpoint, classified by the opposite end
        for (node, nbr) in ((u, v), (v, u))
            if class[nbr] == 1
                deg_ml[node] += 1
            else
                deg_web[node] += 1
            end
        end
    end
    return (deg_ml, deg_web)
 end
 # Code for computing machine learning and web neighbors for gh graph
 classes_df.deg_ml, classes_df.deg_web =
 deg_class(gh, classes_df.ml_target)
 # Code for checking type stability of deg_class function
@time deg_class(gh, classes_df.ml_target);
@code_warntype deg_class(gh, classes_df.ml_target)
 # Code for checking the classes_df summary statistics
 describe(classes_df, :min, :max, :mean, :std)
 # Code for average degree of node in the graph
 2 * ne(gh) / nv(gh)
 # Code for checking correctness of computations
 classes_df.deg_ml + classes_df.deg_web == classes_df.deg
 # Code for showing that DataFrames.jl checks consistency of stored objects
 df = DataFrame(a=1, b=11)
 push!(df.a, 2)
 df
 # Codes for section 8.3
 # Code for computing groupwise means of columns
 using Statistics
 for type in [0, 1], col in ["deg_ml", "deg_web"]
    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
 end
 gdf = groupby(classes_df, :ml_target)
 combine(gdf,
        :deg_ml => mean => :mean_deg_ml,
        :deg_web => mean => :mean_deg_web)
 using DataFramesMeta
@combine(gdf,
         :mean_deg_ml = mean(:deg_ml),
         :mean_deg_web = mean(:deg_web))
 # Code for simple plotting of relationship between developer degree and type
 using Plots
 scatter(classes_df.deg_ml, classes_df.deg_web;
        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
        xlabel="degree ml", ylabel="degree web", labels=false)
 # Code for aggregation of degree data
 agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
                 :ml_target => (x -> 1 - mean(x)) => :web_mean)
 # Code for comparison how Julia parses expressions
 :ml_target => (x -> 1 - mean(x)) => :web_mean
 :ml_target => x -> 1 - mean(x) => :web_mean
 # Code for aggregation using DataFramesMeta.jl
@combine(groupby(classes_df, [:deg_ml, :deg_web]),
         :web_mean = 1 - mean(:ml_target))
 # Code for getting summary information about the aggregated data frame
 describe(agg_df)
 # Code for log1p function
 log1p(0)
 # Code for listing 8.6
 # Build axis ticks for a log1p-scaled plot: labels are 0 followed by the powers
 # of two up to 2^round(log2(maxv)); positions are log1p of those labels.
 # Returns the tuple (positions, labels).
 function gen_ticks(maxv)
    top_exp = round(Int, log2(maxv))
    labels = vcat(0, 2 .^ (0:top_exp))
    positions = map(log1p, labels)
    return (positions, labels)
 end
 # log1p(x) shifted by a random offset: rand() / 10 - 0.05 lies in [-0.05, 0.05).
 log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
 using Random
 Random.seed!(1234);
 scatter(log1pjitter.(agg_df.deg_ml),
        log1pjitter.(agg_df.deg_web);
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0, markeralpha=0.8,
        legend=:topleft, labels = "fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))
 # Code for fitting logistic regression model
 using GLM
 glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
 # Code for inspecting @formula result
@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
 # Code for inserting columns to a data frame
 df = DataFrame(x=1:3)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :y => 4:6)
 insertcols!(df, :z => 1)
 insertcols!(df, 1, :a => 0)
 insertcols!(df, :x, :pre_x => 2)
 insertcols!(df, :x, :post_x => 3, after=true)