add comments to code

This commit is contained in:
Bogumił Kamiński 2022-02-26 12:39:55 +01:00
parent 4b1f7bb3ed
commit f955d30252

87
ch13.jl
View File

@ -1,3 +1,9 @@
# Bogumił Kamiński, 2022
# Code for chapter 13
# Code for section 13.1
using CSV
using CategoricalArrays
using DataFrames
@ -38,12 +44,37 @@ close(archive)
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype)
# Code for listing 13.1
# Keep only the :date, :type, :arrest_made, and :violation columns. `select!`
# updates `owensboro` in place; the trailing `;` suppresses printing in the REPL.
select!(owensboro, :date, :type, :arrest_made, :violation);
# Re-inspect the data frame after dropping columns: its summary, then per-column
# number of unique values, number of missing values, and element type.
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype)
# Code for listing 13.2
# Compare combine, transform, and select: first on a data frame, then on its groups.
df = DataFrame(id=[1, 2, 1, 2], v=1:4)
# combine: the result holds only the :sum column, with the single row `sum` returns
combine(df, :v => sum => :sum)
# transform: keeps all source columns and adds :sum, repeating the scalar in every row
transform(df, :v => sum => :sum)
# select: keeps the source's number of rows, but only the requested :sum column
select(df, :v => sum => :sum)
# The same three operations, now applied per group of :id
gdf = groupby(df, :id)
# one row per group: the grouping column :id and that group's :sum
combine(gdf, :v => sum => :sum)
# all source rows and columns, plus :sum holding the total of each row's group
transform(gdf, :v => sum => :sum)
# all source rows, but only the grouping column :id and :sum
select(gdf, :v => sum => :sum)
# Code for listing 13.3
# Four forms of an operation specification, each passing column :v through unchanged:
#   :v => identity => :v1   source => function => explicit target column name
#   :v => identity          source => function; the target name is auto-generated
#   :v => :v2               source => target name, i.e. a rename
#   :v                      bare column selector, kept under its own name
select(df,
:v => identity => :v1,
:v => identity,
:v => :v2,
:v)
# Code for section 13.2
owensboro.violation
# Code for listing 13.4
# Treat ";" as the separator inside each :violation entry: split every entry on it
# and strip surrounding whitespace, giving one vector of pieces per row.
violation_list = [strip.(split(x, ";"))
for x in owensboro.violation]
# Concatenate the per-row vectors into one flat vector of individual violations.
violation_flat = reduce(vcat, violation_list)
@ -51,6 +82,8 @@ violation_flat_clean = [contains(x, "SPEEDING") ?
"SPEEDING" : x for x in violation_flat]
sort(freqtable(violation_flat_clean), rev=true)
# Code for listing 13.5
agg_violation = @chain owensboro begin
select(:violation =>
ByRow(x -> strip.(split(x, ";"))) =>
@ -64,27 +97,39 @@ agg_violation = @chain owensboro begin
sort(:count, rev=true)
end
# Code explaining ByRow
# `sqrt` works on a scalar...
sqrt(4)
# NOTE: ...but this call is expected to throw a MethodError, because `sqrt` has no
# method for a vector. Run the lines one at a time in the REPL; `include`-ing the
# file would stop here.
sqrt([4, 9, 16])
# ByRow(sqrt) is a callable that applies `sqrt` to each element of its argument
ByRow(sqrt)([4, 9, 16])
# The wrapper can be stored in a variable and called like any other function
f = ByRow(sqrt)
f([4, 9, 16])
# Code explaining flatten
# Column :v holds one vector per row (of lengths 2 and 3).
df = DataFrame(id=1:2, v=[[11, 12], [13, 14, 15]])
# flatten gives every element of :v its own row, repeating the matching :id value,
# so the 2-row data frame becomes a 5-row one.
flatten(df, :v)
# Code explaining nrow
# Count rows per group of :id. Passing bare `nrow` stores the count in a column
# with the default name :nrow; `nrow => :rows` stores the same count under :rows.
@chain DataFrame(id=[1, 1, 2, 2, 2]) begin
groupby(:id)
combine(nrow, nrow => :rows)
end
# Code explaining sort
df = DataFrame(a=[2, 1, 2, 1, 2], b=5:-1:1)
# Sort rows by a single column
sort(df, :b)
# Sort rows by :a first, breaking ties with :b
sort(df, [:a, :b])
# Code showing an example transformation
df = DataFrame(x=[4, 9, 16])
# Apply `sqrt` to each element of :x. No target name is given, so the new column
# gets an auto-generated name; :x itself is kept because `transform` is used.
transform(df, :x => ByRow(sqrt))
# Code showing usage of DataFramesMeta.jl
@chain owensboro begin
@rselect(:v=strip.(split(:violation, ";")))
flatten(:v)
@ -95,11 +140,17 @@ transform(df, :x => ByRow(sqrt))
sort(:count, rev=true)
end
# Code showing comparison of DataFramesMeta.jl vs DataFrames.jl
# Three equivalent ways to build column :s as the element-wise square root of :x.
df = DataFrame(x=[4, 9, 16])
# @select works on whole columns, so `sqrt` has to be broadcast explicitly
@select(df, :s = sqrt.(:x))
# @rselect works row by row, so the plain scalar call is enough
@rselect(df, :s = sqrt(:x))
# the same operation written in the DataFrames.jl source => function => target form
select(df, :x => ByRow(sqrt) => :s)
# Code for section 13.3
# Code for listing 13.6
owensboro2 = select(owensboro,
:arrest_made => :arrest,
:date => ByRow(dayofweek) => :day,
@ -108,6 +159,8 @@ owensboro2 = select(owensboro,
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4])
# Code explaining programmatic generation of transformations
# Build a vector of four operation specifications with a comprehension. For i in 1:4,
# the i-th one checks row by row whether :violation contains the i-th entry of
# `agg_violation.v` and stores the result in a column named "v1" through "v4".
[:violation =>
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4]
@ -115,42 +168,74 @@ owensboro2 = select(owensboro,
# `[:date :arrest_made]` is a 1x2 row matrix and `[minimum, maximum]` is a 2-element
# vector, so broadcasting `=>` over them gives a 2x2 matrix of pairs: every column
# paired with every function. `combine` accepts this matrix directly.
combine(owensboro, [:date :arrest_made] .=> [minimum, maximum])
# The same broadcast on its own, to inspect the generated pairs
[:date :arrest_made] .=> [minimum, maximum]
# Code for listing 13.7
# Lookup table mapping day numbers 1:7 to day names, stored as an ordered
# categorical column.
weekdays = DataFrame(day=1:7,
dayname=categorical(dayname.(1:7); ordered=true))
# Check that the column is ordered and inspect the current order of its levels
isordered(weekdays.dayname)
levels(weekdays.dayname)
# Reset the level order to the order in which names appear in the column (that of
# day numbers 1:7), replacing the default order shown by `levels` above.
levels!(weekdays.dayname, weekdays.dayname)
# Code showing join operation
# Add the columns of `weekdays` to `owensboro2` in place, matching rows on :day
leftjoin!(owensboro2, weekdays; on=:day)
# Code for listing 13.8
# Count rows for every (:day, :dayname) combination, with groups in sorted order.
# The count lands in a column named :nrow.
@chain owensboro2 begin
groupby([:day, :dayname]; sort=true)
combine(nrow)
end
# Code showing creation of frequency table from a data frame
# Cross-tabulate :dayname (rows) against :day (columns)
freqtable(owensboro2, :dayname, :day)
# Code for listing 13.9
# Same counts as in listing 13.8, reshaped to wide format: one row per :dayname,
# one column per :day value, cells taken from :nrow, and 0 for absent combinations.
@chain owensboro2 begin
groupby([:day, :dayname]; sort=true)
combine(nrow)
unstack(:dayname, :day, :nrow; fill=0)
end
# Code for dropping rows with missing data
# Remove, in place, every row that has a missing value in any column
dropmissing!(owensboro2)
# Code for dropping unwanted column
# Keep every column except :day, in place
select!(owensboro2, Not(:day))
# Code for section 13.4
# Code for listing 13.10
# Seed the random number generator so the split below is reproducible
Random.seed!(1234);
# Flag each row as training data with probability 0.7 (one Bernoulli draw per row)
owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2));
# Fraction of rows that actually ended up flagged as training data
mean(owensboro2.train)
# Code for listing 13.11
# Training set: rows where :train is true
train = subset(owensboro2, :train)
# Test set: rows where :train is false (`!` applied row by row)
test = subset(owensboro2, :train => ByRow(!))
# Code showing subsetting with DataFramesMeta.jl
# Same condition as used for `test` above, written with the row-wise macro
@rsubset(owensboro2, !(:train))
# Code building a predictive model
# Fit a GLM with Binomial response and logit link (logistic regression) on the
# training set, explaining :arrest by :dayname, :type, and indicators :v1 to :v4.
model = glm(@formula(arrest~dayname+type+v1+v2+v3+v4),
train, Binomial(), LogitLink())
# Code for making predictions
# With no data argument, `predict` returns predictions for the data used in fitting
train.predict = predict(model)
# Predictions for the held-out test set
test.predict = predict(model, test)
# Code for producing histograms showing predictions
test_groups = groupby(test, :arrest);
histogram(test_groups[(false,)].predict;
bins=10, normalize=:probability,
@ -159,6 +244,8 @@ histogram!(test_groups[(true,)].predict;
bins=10, normalize=:probability,
fillalpha=0.5, label="true")
# Code for listing 13.12
test_roc = roc(test, score=:predict, target=:arrest)
plot(test_roc.pfa, 1 .- test_roc.pmiss;
legend=:bottomright,