diff --git a/ch13.jl b/ch13.jl index fb388a4..37e6325 100644 --- a/ch13.jl +++ b/ch13.jl @@ -1,3 +1,9 @@ +# Bogumił Kamiński, 2022 + +# Codes for chapter 13 + +# Codes for section 13.1 + using CSV using CategoricalArrays using DataFrames @@ -38,12 +44,37 @@ close(archive) summary(owensboro) describe(owensboro, :nunique, :nmissing, :eltype) +# Code for listing 13.1 + select!(owensboro, :date, :type, :arrest_made, :violation); summary(owensboro) describe(owensboro, :nunique, :nmissing, :eltype) +# Code for listing 13.2 + +df = DataFrame(id=[1, 2, 1, 2], v=1:4) +combine(df, :v => sum => :sum) +transform(df, :v => sum => :sum) +select(df, :v => sum => :sum) +gdf = groupby(df, :id) +combine(gdf, :v => sum => :sum) +transform(gdf, :v => sum => :sum) +select(gdf, :v => sum => :sum) + +# Code for listing 13.3 + +select(df, + :v => identity => :v1, + :v => identity, + :v => :v2, + :v) + +# Codes for section 13.2 + owensboro.violation +# Code for listing 13.4 + violation_list = [strip.(split(x, ";")) for x in owensboro.violation] violation_flat = reduce(vcat, violation_list) @@ -51,6 +82,8 @@ violation_flat_clean = [contains(x, "SPEEDING") ? "SPEEDING" : x for x in violation_flat] sort(freqtable(violation_flat_clean), rev=true) +# Code for listing 13.5 + agg_violation = @chain owensboro begin select(:violation => ByRow(x -> strip.(split(x, ";"))) => @@ -64,27 +97,39 @@ agg_violation = @chain owensboro begin sort(:count, rev=true) end +# Code explaining ByRow + sqrt(4) sqrt([4, 9, 16]) ByRow(sqrt)([4, 9, 16]) f = ByRow(sqrt) f([4, 9, 16]) +# Code explaining flatten + df = DataFrame(id=1:2, v=[[11, 12], [13, 14, 15]]) flatten(df, :v) +# Code explaining nrow + @chain DataFrame(id=[1, 1, 2, 2, 2]) begin groupby(:id) combine(nrow, nrow => :rows) end +# Code explaining sort + df = DataFrame(a=[2, 1, 2, 1, 2], b=5:-1:1) sort(df, :b) sort(df, [:a, :b]) +# Code showing an example transformation + df = DataFrame(x=[4, 9, 16]) transform(df, :x => ByRow(sqrt)) +# Code showing usage of DataFramesMeta.jl + @chain owensboro begin @rselect(:v=strip.(split(:violation, ";"))) flatten(:v) @@ -95,11 +140,17 @@ transform(df, :x => ByRow(sqrt)) sort(:count, rev=true) end +# Code showing comparison of DataFramesMeta.jl vs DataFrames.jl + df = DataFrame(x=[4, 9, 16]) @select(df, :s = sqrt.(:x)) @rselect(df, :s = sqrt(:x)) select(df, :x => ByRow(sqrt) => :s) +# Codes for section 13.3 + +# Code for listing 13.6 + owensboro2 = select(owensboro, :arrest_made => :arrest, :date => ByRow(dayofweek) => :day, @@ -108,6 +159,8 @@ owensboro2 = select(owensboro, ByRow(x -> contains(x, agg_violation.v[i])) => "v$i" for i in 1:4]) +# Code explaining programmatic generation of transformations + [:violation => ByRow(x -> contains(x, agg_violation.v[i])) => "v$i" for i in 1:4] @@ -115,42 +168,74 @@ owensboro2 = select(owensboro, combine(owensboro, [:date :arrest_made] .=> [minimum, maximum]) [:date :arrest_made] .=> [minimum, maximum] +# Code for listing 13.7 + weekdays = DataFrame(day=1:7, dayname=categorical(dayname.(1:7); ordered=true)) isordered(weekdays.dayname) levels(weekdays.dayname) levels!(weekdays.dayname, weekdays.dayname) +# Code showing join operation + leftjoin!(owensboro2, weekdays; on=:day) +# Code for listing 13.8 + @chain owensboro2 begin groupby([:day, :dayname]; sort=true) combine(nrow) end +# Code showing creation of frequency table from a data frame + freqtable(owensboro2, :dayname, :day) +# Code for listing 13.9 + @chain owensboro2 begin groupby([:day, :dayname]; sort=true) combine(nrow) unstack(:dayname, :day, :nrow; fill=0) end +# Code for dropping rows with missing data + dropmissing!(owensboro2) + +# Code for dropping unwanted column + select!(owensboro2, Not(:day)) +# Codes for section 13.4 + +# Code for listing 13.10 + Random.seed!(1234); owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2)); mean(owensboro2.train) +# Code for listing 13.11 + train = subset(owensboro2, :train) test = subset(owensboro2, :train => ByRow(!)) +# Code showing subsetting with DataFramesMeta.jl + +@rsubset(owensboro2, !(:train)) + +# Code building a predictive model + model = glm(@formula(arrest~dayname+type+v1+v2+v3+v4), train, Binomial(), LogitLink()) + +# Code for making predictions + train.predict = predict(model) test.predict = predict(model, test) +# Code for producing histograms showing predictions + test_groups = groupby(test, :arrest); histogram(test_groups[(false,)].predict; bins=10, normalize=:probability, @@ -159,6 +244,8 @@ histogram!(test_groups[(true,)].predict; bins=10, normalize=:probability, fillalpha=0.5, label="true") +# Code for listing 13.12 + test_roc = roc(test, score=:predict, target=:arrest) plot(test_roc.pfa, 1 .- test_roc.pmiss; legend=:bottomright,