update up to chapter 9
This commit is contained in:
parent
e1d5277f8c
commit
ab6b8f18f3
170
ch08.jl
170
ch08.jl
@ -1,8 +1,8 @@
|
||||
# Bogumił Kamiński, 2022
|
||||
|
||||
# Codes for chapter 6
|
||||
# Codes for chapter 8
|
||||
|
||||
# Code for section 6.1
|
||||
# Code for section 8.1
|
||||
|
||||
if isfile("puzzles.csv.bz2")
|
||||
@info "file already present"
|
||||
@ -25,22 +25,27 @@ end
|
||||
|
||||
readlines("puzzles.csv")
|
||||
|
||||
# Code for section 6.2
|
||||
# Code for section 8.2
|
||||
|
||||
using CSV
|
||||
using DataFrames
|
||||
puzzles = CSV.read("puzzles.csv", DataFrame);
|
||||
|
||||
CSV.read(plain, DataFrame);
|
||||
puzzles2 = CSV.read(plain, DataFrame;
|
||||
header=["PuzzleId", "FEN", "Moves",
|
||||
"Rating","RatingDeviation",
|
||||
"Popularity", "NbPlays",
|
||||
"Themes","GameUrl"]);
|
||||
puzzles == puzzles2
|
||||
|
||||
compressed = nothing
|
||||
plain = nothing
|
||||
|
||||
# Code for listing 6.1
|
||||
# Code for listing 8.1
|
||||
|
||||
puzzles
|
||||
|
||||
# Code for listing 6.2
|
||||
# Code for listing 8.2
|
||||
|
||||
describe(puzzles)
|
||||
|
||||
@ -52,7 +57,13 @@ nrow(puzzles)
|
||||
|
||||
names(puzzles)
|
||||
|
||||
# Code for section 6.3
|
||||
CSV.write("puzzles2.csv", puzzles)
|
||||
|
||||
read("puzzles2.csv")
|
||||
|
||||
read("puzzles2.csv") == read("puzzles.csv")
|
||||
|
||||
# Code for section 8.3
|
||||
|
||||
puzzles.Rating
|
||||
|
||||
@ -101,148 +112,3 @@ plot(histogram(puzzles.Rating, label="Rating"),
|
||||
plot([histogram(puzzles[!, col]; label=col) for
|
||||
col in ["Rating", "RatingDeviation",
|
||||
"Popularity", "NbPlays"]]...)
|
||||
|
||||
# Code for section 6.4
|
||||
|
||||
using Statistics
|
||||
plays_lo = median(puzzles.NbPlays)
|
||||
puzzles.NbPlays .> plays_lo
|
||||
|
||||
puzzles.NbPlays > plays_lo
|
||||
|
||||
rating_lo = 1500
|
||||
rating_hi = quantile(puzzles.Rating, 0.99)
|
||||
rating_lo .< puzzles.Rating .< rating_hi
|
||||
|
||||
row_selector = (puzzles.NbPlays .> plays_lo) .&&
|
||||
(rating_lo .< puzzles.Rating .< rating_hi)
|
||||
|
||||
sum(row_selector)
|
||||
count(row_selector)
|
||||
|
||||
# Code for listing 6.3
|
||||
|
||||
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
||||
|
||||
# Code for plotting histograms
|
||||
|
||||
plot(histogram(good.Rating; label="Rating"),
|
||||
histogram(good.Popularity; label="Popularity"))
|
||||
|
||||
# Code for column selectors
|
||||
|
||||
puzzles[1, "Rating"]
|
||||
|
||||
puzzles[:, "Rating"]
|
||||
|
||||
row1 = puzzles[1, ["Rating", "Popularity"]]
|
||||
|
||||
row1["Rating"]
|
||||
row1[:Rating]
|
||||
row1[1]
|
||||
row1.Rating
|
||||
row1."Rating"
|
||||
|
||||
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
||||
|
||||
good[1, "Rating"]
|
||||
good[1, :]
|
||||
good[:, "Rating"]
|
||||
good[:, :]
|
||||
|
||||
names(puzzles, ["Rating", "Popularity"])
|
||||
names(puzzles, [:Rating, :Popularity])
|
||||
names(puzzles, [4, 6])
|
||||
names(puzzles, [false, false, false, true, false, true, false, false, false])
|
||||
names(puzzles, r"Rating")
|
||||
names(puzzles, Not([4, 6]))
|
||||
names(puzzles, Not(r"Rating"))
|
||||
names(puzzles, Between("Rating", "Popularity"))
|
||||
names(puzzles, :)
|
||||
names(puzzles, All())
|
||||
names(puzzles, Cols(r"Rating", "NbPlays"))
|
||||
names(puzzles, Cols(startswith("P")))
|
||||
|
||||
names(puzzles, startswith("P"))
|
||||
|
||||
names(puzzles, Real)
|
||||
|
||||
names(puzzles, AbstractString)
|
||||
|
||||
puzzles[:, names(puzzles, Real)]
|
||||
|
||||
# Code for row subsetting
|
||||
|
||||
df1 = puzzles[:, ["Rating", "Popularity"]];
|
||||
df2 = puzzles[!, ["Rating", "Popularity"]];
|
||||
|
||||
df1 == df2
|
||||
df1 == puzzles
|
||||
df2 == puzzles
|
||||
|
||||
df1.Rating === puzzles.Rating
|
||||
df1.Popularity === puzzles.Popularity
|
||||
df2.Rating === puzzles.Rating
|
||||
df2.Popularity === puzzles.Popularity
|
||||
|
||||
@benchmark $puzzles[:, ["Rating", "Popularity"]]
|
||||
@benchmark $puzzles[!, ["Rating", "Popularity"]]
|
||||
|
||||
puzzles[1, 1]
|
||||
puzzles[[1], 1]
|
||||
puzzles[1, [1]]
|
||||
puzzles[[1], [1]]
|
||||
|
||||
# Code for making views
|
||||
|
||||
@view puzzles[1, 1]
|
||||
|
||||
@view puzzles[[1], 1]
|
||||
|
||||
@view puzzles[1, [1]]
|
||||
|
||||
@view puzzles[[1], [1]]
|
||||
|
||||
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
|
||||
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
|
||||
|
||||
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
|
||||
|
||||
# Code for section 6.5
|
||||
|
||||
describe(good)
|
||||
|
||||
rating_mapping = Dict{Int, Vector{Int}}()
|
||||
for (i, rating) in enumerate(good.Rating)
|
||||
if haskey(rating_mapping, rating)
|
||||
push!(rating_mapping[rating], i)
|
||||
else
|
||||
rating_mapping[rating] = [i]
|
||||
end
|
||||
end
|
||||
rating_mapping
|
||||
|
||||
good[rating_mapping[2108], :]
|
||||
|
||||
unique(good[rating_mapping[2108], :].Rating)
|
||||
|
||||
using Statistics
|
||||
mean(good[rating_mapping[2108], "Popularity"])
|
||||
|
||||
ratings = unique(good.Rating)
|
||||
|
||||
mean_popularities = map(ratings) do rating
|
||||
indices = rating_mapping[rating]
|
||||
popularities = good[indices, "Popularity"]
|
||||
return mean(popularities)
|
||||
end
|
||||
|
||||
scatter(ratings, mean_popularities;
|
||||
xlabel="rating", ylabel="mean popularity", legend=false)
|
||||
|
||||
import Loess
|
||||
model = Loess.loess(ratings, mean_popularities);
|
||||
ratings_predict = float.(sort(ratings))
|
||||
popularity_predict = Loess.predict(model, ratings_predict)
|
||||
|
||||
plot!(ratings_predict, popularity_predict; width=5, color="black")
|
||||
|
326
ch09.jl
326
ch09.jl
@ -1,279 +1,153 @@
|
||||
# Bogumił Kamiński, 2022
|
||||
|
||||
# Codes for chapter 7
|
||||
# Codes for chapter 9
|
||||
|
||||
# Code for section 7.1
|
||||
|
||||
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
|
||||
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
|
||||
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
|
||||
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
|
||||
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
|
||||
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
|
||||
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
|
||||
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
|
||||
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
|
||||
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
|
||||
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89];
|
||||
|
||||
data = (set1=(x=aq[:, 1], y=aq[:, 2]),
|
||||
set2=(x=aq[:, 3], y=aq[:, 4]),
|
||||
set3=(x=aq[:, 5], y=aq[:, 6]),
|
||||
set4=(x=aq[:, 7], y=aq[:, 8]));
|
||||
# Code for section 9.1
|
||||
|
||||
using DataFrames
|
||||
using CSV
|
||||
using Plots
|
||||
puzzles = CSV.read("puzzles.csv", DataFrame);
|
||||
|
||||
# Code for listing 7.1
|
||||
using Statistics
|
||||
plays_lo = median(puzzles.NbPlays)
|
||||
puzzles.NbPlays .> plays_lo
|
||||
|
||||
# Build a data frame from the matrix `aq`, naming its eight columns x1, y1, ..., x4, y4.
aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
|
||||
DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
|
||||
puzzles.NbPlays > plays_lo
|
||||
|
||||
# Code for creating DataFrame with automatic column names
|
||||
rating_lo = 1500
|
||||
rating_hi = quantile(puzzles.Rating, 0.99)
|
||||
rating_lo .< puzzles.Rating .< rating_hi
|
||||
|
||||
DataFrame(aq, :auto)
|
||||
row_selector = (puzzles.NbPlays .> plays_lo) .&&
|
||||
(rating_lo .< puzzles.Rating .< rating_hi)
|
||||
|
||||
# Codes for creating DataFrame from vector of vectors
|
||||
sum(row_selector)
|
||||
count(row_selector)
|
||||
|
||||
aq_vec = collect(eachcol(aq))
|
||||
DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
|
||||
DataFrame(aq_vec, :auto)
|
||||
# Code for listing 9.1
|
||||
|
||||
# Codes for section 7.1.2
|
||||
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
||||
|
||||
data.set1.x
|
||||
# Code for plotting histograms
|
||||
|
||||
DataFrame(x1=data.set1.x, y1=data.set1.y,
|
||||
x2=data.set2.x, y2=data.set2.y,
|
||||
x3=data.set3.x, y3=data.set3.y,
|
||||
x4=data.set4.x, y4=data.set4.y)
|
||||
plot(histogram(good.Rating; label="Rating"),
|
||||
histogram(good.Popularity; label="Popularity"))
|
||||
|
||||
DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
|
||||
:x2 => data.set2.x, :y2 => data.set2.y,
|
||||
:x3 => data.set3.x, :y3 => data.set3.y,
|
||||
:x4 => data.set4.x, :y4 => data.set4.y)
|
||||
# Code for column selectors
|
||||
|
||||
DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
|
||||
:x2 => data.set2.x, :y2 => data.set2.y,
|
||||
:x3 => data.set3.x, :y3 => data.set3.y,
|
||||
:x4 => data.set4.x, :y4 => data.set4.y]);
|
||||
puzzles[1, "Rating"]
|
||||
|
||||
[(i, v) for i in 1:4 for v in [:x, :y]]
|
||||
puzzles[:, "Rating"]
|
||||
|
||||
[string(v, i) for i in 1:4 for v in [:x, :y]]
|
||||
row1 = puzzles[1, ["Rating", "Popularity"]]
|
||||
|
||||
[string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]]
|
||||
row1["Rating"]
|
||||
row1[:Rating]
|
||||
row1[1]
|
||||
row1.Rating
|
||||
row1."Rating"
|
||||
|
||||
DataFrame([string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]]);
|
||||
good = puzzles[row_selector, ["Rating", "Popularity"]]
|
||||
|
||||
data_dict = Dict([string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]])
|
||||
collect(data_dict)
|
||||
good[1, "Rating"]
|
||||
good[1, :]
|
||||
good[:, "Rating"]
|
||||
good[:, :]
|
||||
|
||||
DataFrame(data_dict)
|
||||
names(puzzles, ["Rating", "Popularity"])
|
||||
names(puzzles, [:Rating, :Popularity])
|
||||
names(puzzles, [4, 6])
|
||||
names(puzzles, [false, false, false, true, false, true, false, false, false])
|
||||
names(puzzles, r"Rating")
|
||||
names(puzzles, Not([4, 6]))
|
||||
names(puzzles, Not(r"Rating"))
|
||||
names(puzzles, Between("Rating", "Popularity"))
|
||||
names(puzzles, :)
|
||||
names(puzzles, All())
|
||||
names(puzzles, Cols(r"Rating", "NbPlays"))
|
||||
names(puzzles, Cols(startswith("P")))
|
||||
|
||||
df1 = DataFrame(x1=data.set1.x)
|
||||
df1.x1 === data.set1.x
|
||||
names(puzzles, startswith("P"))
|
||||
|
||||
df2 = DataFrame(x1=data.set1.x; copycols=false)
|
||||
df2.x1 === data.set1.x
|
||||
names(puzzles, Real)
|
||||
|
||||
df = DataFrame(x=1:3, y=1)
|
||||
df.x
|
||||
names(puzzles, AbstractString)
|
||||
|
||||
DataFrame(x=[1], y=[1, 2, 3])
|
||||
puzzles[:, names(puzzles, Real)]
|
||||
|
||||
# Codes for section 7.1.3
|
||||
# Code for row subsetting
|
||||
|
||||
data.set1
|
||||
DataFrame(data.set1)
|
||||
df1 = puzzles[:, ["Rating", "Popularity"]];
|
||||
df2 = puzzles[!, ["Rating", "Popularity"]];
|
||||
|
||||
DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
|
||||
df1 == df2
|
||||
df1 == puzzles
|
||||
df2 == puzzles
|
||||
|
||||
data
|
||||
df1.Rating === puzzles.Rating
|
||||
df1.Popularity === puzzles.Popularity
|
||||
df2.Rating === puzzles.Rating
|
||||
df2.Popularity === puzzles.Popularity
|
||||
|
||||
# Code for listing 7.2
|
||||
@benchmark $puzzles[:, ["Rating", "Popularity"]]
|
||||
@benchmark $puzzles[!, ["Rating", "Popularity"]]
|
||||
|
||||
aq2 = DataFrame(data)
|
||||
puzzles[1, 1]
|
||||
puzzles[[1], 1]
|
||||
puzzles[1, [1]]
|
||||
puzzles[[1], [1]]
|
||||
|
||||
# Codes for listing 7.3
|
||||
# Code for making views
|
||||
|
||||
data_dfs = map(DataFrame, data)
|
||||
@view puzzles[1, 1]
|
||||
|
||||
# Codes for vertical concatenation examples
|
||||
@view puzzles[[1], 1]
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
|
||||
@view puzzles[1, [1]]
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
|
||||
source="source_id")
|
||||
@view puzzles[[1], [1]]
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
|
||||
source="source_id"=>string.("set", 1:4))
|
||||
@btime $puzzles[$row_selector, ["Rating", "Popularity"]];
|
||||
@btime @view $puzzles[$row_selector, ["Rating", "Popularity"]];
|
||||
|
||||
reduce(vcat, collect(data_dfs);
|
||||
source="source_id"=>string.("set", 1:4))
|
||||
parentindices(@view puzzles[row_selector, ["Rating", "Popularity"]])
|
||||
|
||||
# Code for listing 7.4
|
||||
# Code for section 9.2
|
||||
|
||||
df1 = DataFrame(a=1:3, b=11:13)
|
||||
df2 = DataFrame(a=4:6, c=24:26)
|
||||
vcat(df1, df2)
|
||||
vcat(df1, df2; cols=:union)
|
||||
describe(good)
|
||||
|
||||
# Code for listing 7.5
|
||||
|
||||
df_agg = DataFrame()
|
||||
append!(df_agg, data_dfs.set1)
|
||||
append!(df_agg, data_dfs.set2)
|
||||
|
||||
# Code for appending tables to a data frame
|
||||
|
||||
df_agg = DataFrame()
|
||||
append!(df_agg, data.set1)
|
||||
append!(df_agg, data.set2)
|
||||
|
||||
# Code for promote keyword argument
|
||||
|
||||
df1 = DataFrame(a=1:3, b=11:13)
|
||||
df2 = DataFrame(a=4:6, b=[14, missing, 16])
|
||||
append!(df1, df2)
|
||||
append!(df1, df2; promote=true)
|
||||
|
||||
# Code for section 7.2.3
|
||||
|
||||
df = DataFrame()
|
||||
push!(df, (a=1, b=2))
|
||||
push!(df, (a=3, b=4))
|
||||
|
||||
df = DataFrame(a=Int[], b=Int[])
|
||||
push!(df, [1, 2])
|
||||
push!(df, [3, 4])
|
||||
|
||||
function sim_step(current)
|
||||
dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
|
||||
return (x=current.x + dx, y=current.y + dy)
|
||||
end
|
||||
|
||||
using BenchmarkTools
|
||||
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
|
||||
|
||||
dx, dy = (10, 20)
|
||||
dx
|
||||
dy
|
||||
|
||||
using FreqTables
|
||||
using Random
|
||||
Random.seed!(1234);
|
||||
proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
|
||||
|
||||
using Random
|
||||
Random.seed!(6);
|
||||
walk = DataFrame(x=0, y=0)
|
||||
for _ in 1:10
|
||||
current = walk[end, :]
|
||||
push!(walk, sim_step(current))
|
||||
end
|
||||
walk
|
||||
|
||||
plot(walk.x, walk.y;
|
||||
legend=false,
|
||||
series_annotations=1:11,
|
||||
xticks=range(extrema(walk.x)...),
|
||||
yticks=range(extrema(walk.y)...))
|
||||
|
||||
extrema(walk.y)
|
||||
|
||||
range(1, 5)
|
||||
|
||||
(3/4)^9
|
||||
|
||||
# Code for listing 7.6
|
||||
|
||||
function walk_unique() #A
|
||||
walk = DataFrame(x=0, y=0)
|
||||
for _ in 1:10
|
||||
current = walk[end, :]
|
||||
push!(walk, sim_step(current))
|
||||
rating_mapping = Dict{Int, Vector{Int}}()
|
||||
for (i, rating) in enumerate(good.Rating)
|
||||
if haskey(rating_mapping, rating)
|
||||
push!(rating_mapping[rating], i)
|
||||
else
|
||||
rating_mapping[rating] = [i]
|
||||
end
|
||||
return nrow(unique(walk)) == nrow(walk) #B
|
||||
end
|
||||
Random.seed!(2);
|
||||
proptable([walk_unique() for _ in 1:10^5])
|
||||
rating_mapping
|
||||
|
||||
# Code for a note on conversion
|
||||
good[rating_mapping[2108], :]
|
||||
|
||||
x = [1.5]
|
||||
x[1] = 1
|
||||
x
|
||||
unique(good[rating_mapping[2108], :].Rating)
|
||||
|
||||
# Code from section 7.3.1
|
||||
using Statistics
|
||||
mean(good[rating_mapping[2108], "Popularity"])
|
||||
|
||||
Matrix(walk)
|
||||
Matrix{Any}(walk)
|
||||
Matrix{String}(walk)
|
||||
ratings = unique(good.Rating)
|
||||
|
||||
plot(walk)
|
||||
|
||||
plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
|
||||
|
||||
# Code from section 7.3.2
|
||||
|
||||
Tables.columntable(walk)
|
||||
|
||||
using BenchmarkTools
|
||||
function mysum(table)
|
||||
s = 0 #A
|
||||
for v in table.x #B
|
||||
s += v
|
||||
end
|
||||
return s
|
||||
end
|
||||
df = DataFrame(x=1:1_000_000);
|
||||
@btime mysum($df)
|
||||
|
||||
tab = Tables.columntable(df);
|
||||
@btime mysum($tab)
|
||||
|
||||
@code_warntype mysum(df)
|
||||
|
||||
@code_warntype mysum(tab)
|
||||
|
||||
typeof(tab)
|
||||
|
||||
function barrier_mysum2(x)
|
||||
s = 0
|
||||
for v in x
|
||||
s += v
|
||||
end
|
||||
return s
|
||||
end
|
||||
mysum2(table) = barrier_mysum2(table.x)
|
||||
@btime mysum2($df)
|
||||
|
||||
df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
|
||||
unique(df)
|
||||
|
||||
tab = Tables.columntable(df)
|
||||
unique(tab)
|
||||
|
||||
# Code from section 7.3.3
|
||||
|
||||
Tables.rowtable(walk)
|
||||
|
||||
nti = Tables.namedtupleiterator(walk)
|
||||
for v in nti
|
||||
println(v)
|
||||
mean_popularities = map(ratings) do rating
|
||||
indices = rating_mapping[rating]
|
||||
popularities = good[indices, "Popularity"]
|
||||
return mean(popularities)
|
||||
end
|
||||
|
||||
er = eachrow(walk)
|
||||
er[1]
|
||||
er[end]
|
||||
ec = eachcol(walk)
|
||||
ec[1]
|
||||
ec[end]
|
||||
scatter(ratings, mean_popularities;
|
||||
xlabel="rating", ylabel="mean popularity", legend=false)
|
||||
|
||||
identity.(eachcol(walk))
|
||||
import Loess
|
||||
model = Loess.loess(ratings, mean_popularities);
|
||||
ratings_predict = float.(sort(ratings))
|
||||
popularity_predict = Loess.predict(model, ratings_predict)
|
||||
|
||||
df = DataFrame(x=1:2, b=["a", "b"])
|
||||
identity.(eachcol(df))
|
||||
plot!(ratings_predict, popularity_predict; width=5, color="black")
|
||||
|
475
ch10.jl
475
ch10.jl
@ -1,284 +1,279 @@
|
||||
# Bogumił Kamiński, 2022
|
||||
|
||||
# Codes for chapter 8
|
||||
# Codes for chapter 7
|
||||
|
||||
# Codes for section 8.1
|
||||
# Code for section 7.1
|
||||
|
||||
# Code for listing 8.1
|
||||
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
|
||||
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
|
||||
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
|
||||
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
|
||||
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
|
||||
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
|
||||
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
|
||||
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
|
||||
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
|
||||
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
|
||||
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89];
|
||||
|
||||
import Downloads
|
||||
using SHA
|
||||
git_zip = "git_web_ml.zip"
|
||||
if !isfile(git_zip)
|
||||
Downloads.download("https://snap.stanford.edu/data/" *
|
||||
"git_web_ml.zip",
|
||||
git_zip)
|
||||
end
|
||||
isfile(git_zip)
|
||||
open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
|
||||
0xc4, 0x60, 0xdc, 0x4c,
|
||||
0x7b, 0xf8, 0x93, 0x57,
|
||||
0xb1, 0xfe, 0xc0, 0x20,
|
||||
0xf4, 0x5e, 0x2e, 0xce,
|
||||
0xba, 0xb8, 0x1d, 0x13,
|
||||
0x1d, 0x07, 0x3b, 0x10,
|
||||
0xe2, 0x8e, 0xc0, 0x31]
|
||||
data = (set1=(x=aq[:, 1], y=aq[:, 2]),
|
||||
set2=(x=aq[:, 3], y=aq[:, 4]),
|
||||
set3=(x=aq[:, 5], y=aq[:, 6]),
|
||||
set4=(x=aq[:, 7], y=aq[:, 8]));
|
||||
|
||||
# Code for opening a zip archive
|
||||
|
||||
import ZipFile
|
||||
git_archive = ZipFile.Reader(git_zip)
|
||||
|
||||
# Code for listing 8.2
|
||||
|
||||
function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
|
||||
idx = only(findall(x -> x.name == filename, archive.files))
|
||||
return CSV.read(read(archive.files[idx]), DataFrame)
|
||||
end
|
||||
|
||||
# Code for working with zip archive
|
||||
|
||||
git_archive.files
|
||||
|
||||
git_archive.files[2].name
|
||||
|
||||
findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
|
||||
findall(x -> x.name == "", git_archive.files)
|
||||
|
||||
only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
|
||||
only(findall(x -> x.name == "", git_archive.files))
|
||||
|
||||
# Code for listing 8.3
|
||||
|
||||
using CSV
|
||||
using DataFrames
|
||||
edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
|
||||
classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
|
||||
close(git_archive)
|
||||
summary(edges_df)
|
||||
describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
|
||||
summary(classes_df)
|
||||
describe(classes_df, :min, :max, :mean, :nmissing, :eltype)
|
||||
|
||||
# Code for updating data frame columns using broadcasting
|
||||
# Code for listing 7.1
|
||||
|
||||
edges_df .+= 1
|
||||
classes_df.id .+= 1
|
||||
# Build a data frame from the matrix `aq`, naming its eight columns x1, y1, ..., x4, y4.
aq1 = DataFrame(aq, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
|
||||
DataFrame(aq, [:x1, :y1, :x2, :y2, :x3, :y3, :x4, :y4])
|
||||
|
||||
# Code for examples of data frame broadcasting
|
||||
# Code for creating DataFrame with automatic column names
|
||||
|
||||
df = DataFrame(a=1:3, b=[4, missing, 5])
|
||||
df .^ 2
|
||||
coalesce.(df, 0)
|
||||
df .+ [10, 11, 12]
|
||||
DataFrame(aq, :auto)
|
||||
|
||||
# Code for checking the order of :id column in a data frame
|
||||
# Codes for creating DataFrame from vector of vectors
|
||||
|
||||
classes_df.id == axes(classes_df, 1)
|
||||
aq_vec = collect(eachcol(aq))
|
||||
DataFrame(aq_vec, ["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"])
|
||||
DataFrame(aq_vec, :auto)
|
||||
|
||||
# Code for the difference between ! and : in broadcasting assignment
|
||||
# Codes for section 7.1.2
|
||||
|
||||
df = DataFrame(a=1:3, b=1:3)
|
||||
df[!, :a] .= "x"
|
||||
df[:, :b] .= "x"
|
||||
df
|
||||
data.set1.x
|
||||
|
||||
# Code for the difference between ! and : in assignment
|
||||
DataFrame(x1=data.set1.x, y1=data.set1.y,
|
||||
x2=data.set2.x, y2=data.set2.y,
|
||||
x3=data.set3.x, y3=data.set3.y,
|
||||
x4=data.set4.x, y4=data.set4.y)
|
||||
|
||||
df = DataFrame(a=1:3, b=1:3, c=1:3)
|
||||
df[!, :a] = ["x", "y", "z"]
|
||||
df[:, :b] = ["x", "y", "z"]
|
||||
df[:, :c] = [11, 12, 13]
|
||||
df
|
||||
DataFrame(:x1 => data.set1.x, :y1 => data.set1.y,
|
||||
:x2 => data.set2.x, :y2 => data.set2.y,
|
||||
:x3 => data.set3.x, :y3 => data.set3.y,
|
||||
:x4 => data.set4.x, :y4 => data.set4.y)
|
||||
|
||||
# Codes for section 8.2
|
||||
DataFrame([:x1 => data.set1.x, :y1 => data.set1.y,
|
||||
:x2 => data.set2.x, :y2 => data.set2.y,
|
||||
:x3 => data.set3.x, :y3 => data.set3.y,
|
||||
:x4 => data.set4.x, :y4 => data.set4.y]);
|
||||
|
||||
# Code from listing 8.4
|
||||
[(i, v) for i in 1:4 for v in [:x, :y]]
|
||||
|
||||
using Graphs
|
||||
gh = SimpleGraph(nrow(classes_df))
|
||||
for (from, to) in eachrow(edges_df)
|
||||
add_edge!(gh, from, to)
|
||||
end
|
||||
gh
|
||||
ne(gh)
|
||||
nv(gh)
|
||||
[string(v, i) for i in 1:4 for v in [:x, :y]]
|
||||
|
||||
# Code for iterator destructuring in iteration specification
|
||||
[string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]]
|
||||
|
||||
mat = [1 2; 3 4; 5 6]
|
||||
for (x1, x2) in eachrow(mat)
|
||||
@show x1, x2
|
||||
end
|
||||
DataFrame([string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]]);
|
||||
|
||||
# Code for getting degrees of nodes in the graph
|
||||
data_dict = Dict([string(v, i) => getproperty(data[i], v)
|
||||
for i in 1:4 for v in [:x, :y]])
|
||||
collect(data_dict)
|
||||
|
||||
degree(gh)
|
||||
DataFrame(data_dict)
|
||||
|
||||
# Code for adding a column to a data frame
|
||||
df1 = DataFrame(x1=data.set1.x)
|
||||
df1.x1 === data.set1.x
|
||||
|
||||
classes_df.deg = degree(gh)
|
||||
df2 = DataFrame(x1=data.set1.x; copycols=false)
|
||||
df2.x1 === data.set1.x
|
||||
|
||||
# Code for the difference between ! and : when adding a column
|
||||
df = DataFrame(x=1:3, y=1)
|
||||
df.x
|
||||
|
||||
DataFrame(x=[1], y=[1, 2, 3])
|
||||
|
||||
# Codes for section 7.1.3
|
||||
|
||||
data.set1
|
||||
DataFrame(data.set1)
|
||||
|
||||
DataFrame([(a=1, b=2), (a=3, b=4), (a=5, b=6)])
|
||||
|
||||
data
|
||||
|
||||
# Code for listing 7.2
|
||||
|
||||
aq2 = DataFrame(data)
|
||||
|
||||
# Codes for listing 7.3
|
||||
|
||||
data_dfs = map(DataFrame, data)
|
||||
|
||||
# Codes for vertical concatenation examples
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4)
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
|
||||
source="source_id")
|
||||
|
||||
vcat(data_dfs.set1, data_dfs.set2, data_dfs.set3, data_dfs.set4;
|
||||
source="source_id"=>string.("set", 1:4))
|
||||
|
||||
reduce(vcat, collect(data_dfs);
|
||||
source="source_id"=>string.("set", 1:4))
|
||||
|
||||
# Code for listing 7.4
|
||||
|
||||
df1 = DataFrame(a=1:3, b=11:13)
|
||||
df2 = DataFrame(a=4:6, c=24:26)
|
||||
vcat(df1, df2)
|
||||
vcat(df1, df2; cols=:union)
|
||||
|
||||
# Code for listing 7.5
|
||||
|
||||
df_agg = DataFrame()
|
||||
append!(df_agg, data_dfs.set1)
|
||||
append!(df_agg, data_dfs.set2)
|
||||
|
||||
# Code for appending tables to a data frame
|
||||
|
||||
df_agg = DataFrame()
|
||||
append!(df_agg, data.set1)
|
||||
append!(df_agg, data.set2)
|
||||
|
||||
# Code for promote keyword argument
|
||||
|
||||
df1 = DataFrame(a=1:3, b=11:13)
|
||||
df2 = DataFrame(a=4:6, b=[14, missing, 16])
|
||||
append!(df1, df2)
|
||||
append!(df1, df2; promote=true)
|
||||
|
||||
# Code for section 7.2.3
|
||||
|
||||
df = DataFrame()
|
||||
x = [1, 2, 3]
|
||||
df[!, :x1] = x
|
||||
df[:, :x2] = x
|
||||
df
|
||||
df.x1 === x
|
||||
df.x2 === x
|
||||
df.x2 == x
|
||||
push!(df, (a=1, b=2))
|
||||
push!(df, (a=3, b=4))
|
||||
|
||||
# Code for creating a column using broadcasting
|
||||
df = DataFrame(a=Int[], b=Int[])
|
||||
push!(df, [1, 2])
|
||||
push!(df, [3, 4])
|
||||
|
||||
df.x3 .= 1
|
||||
df
|
||||
|
||||
# Code for edge iterator of a graph
|
||||
|
||||
edges(gh)
|
||||
|
||||
e1 = first(edges(gh))
|
||||
dump(e1)
|
||||
e1.src
|
||||
e1.dst
|
||||
|
||||
# Code for listing 8.5
|
||||
|
||||
function deg_class(gh, class)
|
||||
deg_ml = zeros(Int, length(class))
|
||||
deg_web = zeros(Int, length(class))
|
||||
for edge in edges(gh)
|
||||
a, b = edge.src, edge.dst
|
||||
if class[b] == 1
|
||||
deg_ml[a] += 1
|
||||
else
|
||||
deg_web[a] += 1
|
||||
end
|
||||
if class[a] == 1
|
||||
deg_ml[b] += 1
|
||||
else
|
||||
deg_web[b] += 1
|
||||
end
|
||||
end
|
||||
return (deg_ml, deg_web)
|
||||
# Take one random step from `current` and return the new position.
# `current` must expose `x` and `y` properties; the result is a NamedTuple `(x=..., y=...)`.
function sim_step(current)
|
||||
# `rand` picks one of the four (dx, dy) tuples; the tuple is destructured into dx and dy.
dx, dy = rand(((1,0), (-1,0), (0,1), (0,-1)))
|
||||
return (x=current.x + dx, y=current.y + dy)
|
||||
end
|
||||
|
||||
# Code for computing machine learning and web neighbors for gh graph
|
||||
using BenchmarkTools
|
||||
@btime rand(((1,0), (-1,0), (0,1), (0,-1)));
|
||||
|
||||
classes_df.deg_ml, classes_df.deg_web =
|
||||
deg_class(gh, classes_df.ml_target)
|
||||
|
||||
# Code for checking type stability of deg_class function
|
||||
|
||||
@time deg_class(gh, classes_df.ml_target);
|
||||
@code_warntype deg_class(gh, classes_df.ml_target)
|
||||
|
||||
# Code for checking the classes_df summary statistics
|
||||
|
||||
describe(classes_df, :min, :max, :mean, :std)
|
||||
|
||||
# Code for average degree of node in the graph
|
||||
|
||||
2 * ne(gh) / nv(gh)
|
||||
|
||||
# Code for checking correctness of computations
|
||||
|
||||
classes_df.deg_ml + classes_df.deg_web == classes_df.deg
|
||||
|
||||
# Code for showing that DataFrames.jl checks consistency of stored objects
|
||||
|
||||
df = DataFrame(a=1, b=11)
|
||||
push!(df.a, 2)
|
||||
df
|
||||
|
||||
# Codes for section 8.3
|
||||
|
||||
# Code for computing groupwise means of columns
|
||||
|
||||
using Statistics
|
||||
for type in [0, 1], col in ["deg_ml", "deg_web"]
|
||||
println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
|
||||
end
|
||||
|
||||
gdf = groupby(classes_df, :ml_target)
|
||||
combine(gdf,
|
||||
:deg_ml => mean => :mean_deg_ml,
|
||||
:deg_web => mean => :mean_deg_web)
|
||||
|
||||
using DataFramesMeta
|
||||
@combine(gdf,
|
||||
:mean_deg_ml = mean(:deg_ml),
|
||||
:mean_deg_web = mean(:deg_web))
|
||||
|
||||
# Code for simple plotting of relationship between developer degree and type
|
||||
|
||||
using Plots
|
||||
scatter(classes_df.deg_ml, classes_df.deg_web;
|
||||
color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
|
||||
xlabel="degree ml", ylabel="degree web", labels=false)
|
||||
|
||||
# Code for aggregation of degree data
|
||||
|
||||
agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
|
||||
:ml_target => (x -> 1 - mean(x)) => :web_mean)
|
||||
|
||||
# Code for comparison how Julia parses expressions
|
||||
|
||||
:ml_target => (x -> 1 - mean(x)) => :web_mean
|
||||
:ml_target => x -> 1 - mean(x) => :web_mean
|
||||
|
||||
# Code for aggregation using DataFramesMeta.jl
|
||||
|
||||
@combine(groupby(classes_df, [:deg_ml, :deg_web]),
|
||||
:web_mean = 1 - mean(:ml_target))
|
||||
|
||||
# Code for getting summary information about the aggregated data frame
|
||||
|
||||
describe(agg_df)
|
||||
|
||||
# Code for log1p function
|
||||
|
||||
log1p(0)
|
||||
|
||||
# Code for listing 8.6
|
||||
|
||||
# Build tick values 0, 1, 2, 4, ..., 2^max2 for a given maximum `maxv`.
# Returns a tuple: the `log1p` of each tick value, and the tick values themselves.
# The script passes this tuple as `xticks`/`yticks` to `scatter`.
function gen_ticks(maxv)
|
||||
# Largest exponent: log2 of the maximum, rounded to the nearest integer.
max2 = round(Int, log2(maxv))
|
||||
# Prepend 0 to the powers of two 2^0 ... 2^max2.
tick = [0; 2 .^ (0:max2)]
|
||||
return (log1p.(tick), tick)
|
||||
end
|
||||
|
||||
log1pjitter(x) = log1p(x) - 0.05 + rand() / 10
|
||||
dx, dy = (10, 20)
|
||||
dx
|
||||
dy
|
||||
|
||||
using FreqTables
|
||||
using Random
|
||||
Random.seed!(1234);
|
||||
scatter(log1pjitter.(agg_df.deg_ml),
|
||||
log1pjitter.(agg_df.deg_web);
|
||||
zcolor=agg_df.web_mean,
|
||||
xlabel="degree ml", ylabel="degree web",
|
||||
markersize=2, markerstrokewidth=0, markeralpha=0.8,
|
||||
legend=:topleft, labels = "fraction web",
|
||||
xticks=gen_ticks(maximum(classes_df.deg_ml)),
|
||||
yticks=gen_ticks(maximum(classes_df.deg_web)))
|
||||
proptable([rand(((1,0), (-1,0), (0,1), (0,-1))) for _ in 1:10^7])
|
||||
|
||||
# Code for fitting logistic regression model
|
||||
using Random
|
||||
Random.seed!(6);
|
||||
walk = DataFrame(x=0, y=0)
|
||||
for _ in 1:10
|
||||
current = walk[end, :]
|
||||
push!(walk, sim_step(current))
|
||||
end
|
||||
walk
|
||||
|
||||
using GLM
|
||||
glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())
|
||||
plot(walk.x, walk.y;
|
||||
legend=false,
|
||||
series_annotations=1:11,
|
||||
xticks=range(extrema(walk.x)...),
|
||||
yticks=range(extrema(walk.y)...))
|
||||
|
||||
# Code for inspecting @formula result
|
||||
extrema(walk.y)
|
||||
|
||||
@formula(ml_target~log1p(deg_ml)+log1p(deg_web))
|
||||
range(1, 5)
|
||||
|
||||
# Code for inserting columns to a data frame
|
||||
(3/4)^9
|
||||
|
||||
df = DataFrame(x=1:3)
|
||||
insertcols!(df, :y => 4:6)
|
||||
insertcols!(df, :y => 4:6)
|
||||
insertcols!(df, :z => 1)
|
||||
# Code for listing 7.6
|
||||
|
||||
insertcols!(df, 1, :a => 0)
|
||||
insertcols!(df, :x, :pre_x => 2)
|
||||
insertcols!(df, :x, :post_x => 3, after=true)
|
||||
# Simulate a 10-step walk starting from the row (x=0, y=0) and report whether
# every row of the resulting data frame is distinct.
# Returns `true` when `unique(walk)` has as many rows as `walk`, `false` otherwise.
function walk_unique() #A
|
||||
walk = DataFrame(x=0, y=0)
|
||||
for _ in 1:10
|
||||
# The last row of the data frame is the current position.
current = walk[end, :]
|
||||
# Append the position produced by `sim_step` as a new row.
push!(walk, sim_step(current))
|
||||
end
|
||||
return nrow(unique(walk)) == nrow(walk) #B
|
||||
end
|
||||
Random.seed!(2);
|
||||
proptable([walk_unique() for _ in 1:10^5])
|
||||
|
||||
# Code for a note on conversion
|
||||
|
||||
x = [1.5]
|
||||
x[1] = 1
|
||||
x
|
||||
|
||||
# Code from section 7.3.1
|
||||
|
||||
Matrix(walk)
|
||||
Matrix{Any}(walk)
|
||||
Matrix{String}(walk)
|
||||
|
||||
plot(walk)
|
||||
|
||||
plot(Matrix(walk); labels=["x" "y"] , legend=:topleft)
|
||||
|
||||
# Code from section 7.3.2
|
||||
|
||||
Tables.columntable(walk)
|
||||
|
||||
using BenchmarkTools
|
||||
# Sum the elements of `table.x` with an explicit loop and return the total.
# `table` can be any object with an `x` property; the script calls it with a
# DataFrame and with the result of `Tables.columntable`, then compares them
# using `@btime` and `@code_warntype`.
function mysum(table)
|
||||
# The accumulator starts as the integer literal 0.
s = 0 #A
|
||||
for v in table.x #B
|
||||
s += v
|
||||
end
|
||||
return s
|
||||
end
|
||||
df = DataFrame(x=1:1_000_000);
|
||||
@btime mysum($df)
|
||||
|
||||
tab = Tables.columntable(df);
|
||||
@btime mysum($tab)
|
||||
|
||||
@code_warntype mysum(df)
|
||||
|
||||
@code_warntype mysum(tab)
|
||||
|
||||
typeof(tab)
|
||||
|
||||
# Sum the elements of the iterable `x` with an explicit loop and return the total.
# `mysum2` calls this with `table.x`, so the column is extracted before the loop runs.
# NOTE(review): presumably a function barrier, going by the name; confirm against the book text.
function barrier_mysum2(x)
|
||||
# The accumulator starts as the integer literal 0.
s = 0
|
||||
for v in x
|
||||
s += v
|
||||
end
|
||||
return s
|
||||
end
|
||||
mysum2(table) = barrier_mysum2(table.x)
|
||||
@btime mysum2($df)
|
||||
|
||||
df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
|
||||
unique(df)
|
||||
|
||||
tab = Tables.columntable(df)
|
||||
unique(tab)
|
||||
|
||||
# Code from section 7.3.3
|
||||
|
||||
Tables.rowtable(walk)
|
||||
|
||||
nti = Tables.namedtupleiterator(walk)
|
||||
for v in nti
|
||||
println(v)
|
||||
end
|
||||
|
||||
er = eachrow(walk)
|
||||
er[1]
|
||||
er[end]
|
||||
ec = eachcol(walk)
|
||||
ec[1]
|
||||
ec[end]
|
||||
|
||||
identity.(eachcol(walk))
|
||||
|
||||
df = DataFrame(x=1:2, b=["a", "b"])
|
||||
identity.(eachcol(df))
|
||||
|
284
ch12.jl
Normal file
284
ch12.jl
Normal file
@ -0,0 +1,284 @@
|
||||
# Bogumił Kamiński, 2022
|
||||
|
||||
# Codes for chapter 8
|
||||
|
||||
# Codes for section 8.1
|
||||
|
||||
# Code for listing 8.1
|
||||
|
||||
import Downloads
|
||||
using SHA
|
||||
git_zip = "git_web_ml.zip"
# Download the archive only when it is not already present locally.
if !isfile(git_zip)
    Downloads.download("https://snap.stanford.edu/data/" *
                       "git_web_ml.zip",
                       git_zip)
end
isfile(git_zip)
# Compare the SHA-256 digest of the file with the expected bytes.
open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2,
                          0xc4, 0x60, 0xdc, 0x4c,
                          0x7b, 0xf8, 0x93, 0x57,
                          0xb1, 0xfe, 0xc0, 0x20,
                          0xf4, 0x5e, 0x2e, 0xce,
                          0xba, 0xb8, 0x1d, 0x13,
                          0x1d, 0x07, 0x3b, 0x10,
                          0xe2, 0x8e, 0xc0, 0x31]

# Code for opening a zip archive

import ZipFile
git_archive = ZipFile.Reader(git_zip)
|
||||
|
||||
# Code for listing 8.2
|
||||
|
||||
"""
    ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)

Read the entry of `archive` whose `name` equals `filename` as CSV and return
the result as a `DataFrame`.

The entry is located with `only(findall(...))`, so the call throws unless
exactly one entry of `archive.files` has the requested name.
"""
function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString)
    # Index of the single archive entry with a matching name.
    idx = only(findall(x -> x.name == filename, archive.files))
    # Read the whole entry into memory and parse those bytes as CSV.
    return CSV.read(read(archive.files[idx]), DataFrame)
end
|
||||
|
||||
# Code for working with zip archive
|
||||
|
||||
git_archive.files

git_archive.files[2].name

findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files)
findall(x -> x.name == "", git_archive.files)

only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files))
# NOTE(review): presumably demonstrates the error `only` raises when no entry
# matches; confirm against the book text.
only(findall(x -> x.name == "", git_archive.files))

# Code for listing 8.3

using CSV
using DataFrames
edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv");
classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv");
close(git_archive)
summary(edges_df)
describe(edges_df, :min, :max, :mean, :nmissing, :eltype)
summary(classes_df)
describe(classes_df, :min, :max, :mean, :nmissing, :eltype)

# Code for updating data frame columns using broadcasting

# Add 1 to every cell of edges_df and to the id column of classes_df.
edges_df .+= 1
classes_df.id .+= 1

# Code for examples of data frame broadcasting

df = DataFrame(a=1:3, b=[4, missing, 5])
df .^ 2
coalesce.(df, 0)
df .+ [10, 11, 12]

# Code for checking the order of :id column in a data frame

classes_df.id == axes(classes_df, 1)

# Code for the difference between ! and : in broadcasting assignment

df = DataFrame(a=1:3, b=1:3)
df[!, :a] .= "x"
# NOTE(review): presumably expected to fail because `:` assigns in place into
# the existing integer column; confirm against the DataFrames.jl indexing rules.
df[:, :b] .= "x"
df

# Code for the difference between ! and : in assignment

df = DataFrame(a=1:3, b=1:3, c=1:3)
df[!, :a] = ["x", "y", "z"]
# NOTE(review): presumably expected to fail for the same in-place reason; confirm.
df[:, :b] = ["x", "y", "z"]
df[:, :c] = [11, 12, 13]
df
|
||||
|
||||
# Codes for section 8.2
|
||||
|
||||
# Code from listing 8.4
|
||||
|
||||
using Graphs
|
||||
# Create a graph with one node per row of classes_df, then add one edge per
# row of edges_df.
gh = SimpleGraph(nrow(classes_df))
for (from, to) in eachrow(edges_df)
    add_edge!(gh, from, to)
end
gh
ne(gh)
nv(gh)

# Code for iterator destructuring in iteration specification

mat = [1 2; 3 4; 5 6]
for (x1, x2) in eachrow(mat)
    @show x1, x2
end

# Code for getting degrees of nodes in the graph

degree(gh)

# Code for adding a column to a data frame

classes_df.deg = degree(gh)

# Code for the difference between ! and : when adding a column

df = DataFrame()
x = [1, 2, 3]
df[!, :x1] = x
df[:, :x2] = x
df
# Check whether each stored column is the very same object as x (===)
# or merely has equal contents (==).
df.x1 === x
df.x2 === x
df.x2 == x

# Code for creating a column using broadcasting

df.x3 .= 1
df

# Code for edge iterator of a graph

edges(gh)

e1 = first(edges(gh))
dump(e1)
e1.src
e1.dst
|
||||
|
||||
# Code for listing 8.5
|
||||
|
||||
"""
    deg_class(gh, class)

For every node count its neighbors separately by class and return the tuple
`(deg_ml, deg_web)` of two integer vectors of length `length(class)`.

For each edge of `gh` with endpoints `a` and `b`, the counter of `a` is
incremented in `deg_ml` when `class[b] == 1` and in `deg_web` otherwise, and
symmetrically for `b` based on `class[a]`. Edge endpoints are used directly
as indices into `class` and into the returned vectors.
"""
function deg_class(gh, class)
    deg_ml = zeros(Int, length(class))
    deg_web = zeros(Int, length(class))
    for edge in edges(gh)
        a, b = edge.src, edge.dst
        # Update the counters of `a` according to the class of `b` ...
        if class[b] == 1
            deg_ml[a] += 1
        else
            deg_web[a] += 1
        end
        # ... and the counters of `b` according to the class of `a`.
        if class[a] == 1
            deg_ml[b] += 1
        else
            deg_web[b] += 1
        end
    end
    return (deg_ml, deg_web)
end
|
||||
|
||||
# Code for computing machine learning and web neighbors for gh graph
|
||||
|
||||
# Store both vectors returned by deg_class as new columns of classes_df.
classes_df.deg_ml, classes_df.deg_web =
    deg_class(gh, classes_df.ml_target)

# Code for checking type stability of deg_class function

@time deg_class(gh, classes_df.ml_target);
@code_warntype deg_class(gh, classes_df.ml_target)

# Code for checking the classes_df summary statistics

describe(classes_df, :min, :max, :mean, :std)

# Code for average degree of node in the graph

2 * ne(gh) / nv(gh)

# Code for checking correctness of computations

classes_df.deg_ml + classes_df.deg_web == classes_df.deg

# Code for showing that DataFrames.jl checks consistency of stored objects

df = DataFrame(a=1, b=11)
# Grow only column a, leaving the columns with different lengths.
push!(df.a, 2)
df
|
||||
|
||||
# Codes for section 8.3
|
||||
|
||||
# Code for computing groupwise means of columns
|
||||
|
||||
using Statistics
|
||||
# Manual computation: mean of each degree column for each value of ml_target.
for type in [0, 1], col in ["deg_ml", "deg_web"]
    println((type, col, mean(classes_df[classes_df.ml_target .== type, col])))
end

# The same means computed with groupby and combine.
gdf = groupby(classes_df, :ml_target)
combine(gdf,
        :deg_ml => mean => :mean_deg_ml,
        :deg_web => mean => :mean_deg_web)

using DataFramesMeta
@combine(gdf,
         :mean_deg_ml = mean(:deg_ml),
         :mean_deg_web = mean(:deg_web))

# Code for simple plotting of relationship between developer degree and type

using Plots
scatter(classes_df.deg_ml, classes_df.deg_web;
        color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target],
        xlabel="degree ml", ylabel="degree web", labels=false)

# Code for aggregation of degree data

agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]),
                 :ml_target => (x -> 1 - mean(x)) => :web_mean)

# Code for comparing how Julia parses expressions

:ml_target => (x -> 1 - mean(x)) => :web_mean
:ml_target => x -> 1 - mean(x) => :web_mean

# Code for aggregation using DataFramesMeta.jl

@combine(groupby(classes_df, [:deg_ml, :deg_web]),
         :web_mean = 1 - mean(:ml_target))

# Code for getting summary information about the aggregated data frame

describe(agg_df)

# Code for log1p function

log1p(0)
|
||||
|
||||
# Code for listing 8.6
|
||||
|
||||
"""
    gen_ticks(maxv)

Return a tuple `(positions, labels)` of axis ticks for a `log1p`-transformed
axis: `labels` is `0` followed by the powers of two `2^0, 2^1, ..., 2^k`,
where `k = round(Int, log2(maxv))`, and `positions` is `log1p.(labels)`.

When `maxv` is zero only the tick `0` is returned, which matches the result
already produced for values of `maxv` whose rounded `log2` is negative.
"""
function gen_ticks(maxv)
    # log2(0) is -Inf and cannot be rounded to Int, so treat zero explicitly;
    # -1 makes the range 0:max2 empty, leaving only the tick 0.
    max2 = iszero(maxv) ? -1 : round(Int, log2(maxv))
    tick = [0; 2 .^ (0:max2)]
    return (log1p.(tick), tick)
end
|
||||
|
||||
# Return log1p(x) shifted by a random offset: 0.05 is subtracted and a value
# drawn with rand() divided by 10 is added.
function log1pjitter(x)
    noise = rand() / 10
    return log1p(x) - 0.05 + noise
end
|
||||
|
||||
using Random
|
||||
# Fix the seed so the random jitter in the plot is reproducible.
Random.seed!(1234);
scatter(log1pjitter.(agg_df.deg_ml),
        log1pjitter.(agg_df.deg_web);
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0, markeralpha=0.8,
        legend=:topleft, labels="fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))

# Code for fitting logistic regression model

using GLM
glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink())

# Code for inspecting @formula result

@formula(ml_target~log1p(deg_ml)+log1p(deg_web))

# Code for inserting columns to a data frame

df = DataFrame(x=1:3)
insertcols!(df, :y => 4:6)
# NOTE(review): the same column name is inserted a second time; presumably
# this demonstrates the resulting error. Confirm against the book text.
insertcols!(df, :y => 4:6)
insertcols!(df, :z => 1)

# Insert at a given position, or relative to an existing column.
insertcols!(df, 1, :a => 0)
insertcols!(df, :x, :pre_x => 2)
insertcols!(df, :x, :post_x => 3, after=true)
|
Loading…
Reference in New Issue
Block a user