done chapter 13

This commit is contained in:
Bogumił Kamiński
2022-02-26 12:27:53 +01:00
parent 25da8d0c00
commit 4b1f7bb3ed
3 changed files with 147 additions and 66 deletions

View File

@@ -491,9 +491,9 @@ version = "1.3.0"
[[deps.Latexify]] [[deps.Latexify]]
deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"] deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"]
git-tree-sha1 = "2a8650452c07a9c89e6a58f296fd638fadaca021" git-tree-sha1 = "a6552bfeab40de157a297d84e03ade4b8177677f"
uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
version = "0.15.11" version = "0.15.12"
[[deps.LibCURL]] [[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"] deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@@ -885,9 +885,9 @@ version = "1.8.3"
[[deps.StaticArrays]] [[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"] deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "95c6a5d0e8c69555842fc4a927fc485040ccc31c" git-tree-sha1 = "6354dfaf95d398a1a70e0b28238321d5d17b2530"
uuid = "90137ffa-7385-5640-81b9-e52037218182" uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.3.5" version = "1.4.0"
[[deps.Statistics]] [[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"] deps = ["LinearAlgebra", "SparseArrays"]

41
appB.jl
View File

@@ -204,3 +204,44 @@ df.b === df.a
df.b == df.a df.b == df.a
df[1:2, "a"] .= 10 df[1:2, "a"] .= 10
df df
# Code for exercise 13.1
# Row-wise DataFramesMeta.jl selection: every expression below is evaluated
# once per row of `owensboro`.
@rselect owensboro begin
    :arrest = :arrest_made
    :day = dayofweek(:date)
    :type
    # One indicator per entry 1..4 of `agg_violation.v`: does this row's
    # violation text contain that entry?
    :v1 = contains(:violation, agg_violation.v[1])
    :v2 = contains(:violation, agg_violation.v[2])
    :v3 = contains(:violation, agg_violation.v[3])
    :v4 = contains(:violation, agg_violation.v[4])
end
# Code for exercise 13.2
# Feature table built with plain DataFrames.jl `select`, including an extra
# `dayname` column derived from `date`.
select(
    owensboro,
    :arrest_made => :arrest,
    :date => ByRow(dayofweek) => :day,
    :type,
    # Vector of four `source => function => target` specifications, one per
    # indicator column v1..v4.
    map(1:4) do i
        :violation => ByRow(txt -> contains(txt, agg_violation.v[i])) => "v$i"
    end,
    :date => ByRow(dayname) => :dayname,
)
# Code for exercise 13.3
# Mean of `arrest` for each `dayname` group (groups sorted by key).
combine(groupby(owensboro2, :dayname, sort=true), :arrest => mean)
# Mean of `arrest` for each (`dayname`, `type`) group, reshaped to wide form:
# one row per `dayname`, one column per `type`, filled from `arrest_mean`.
unstack(
    combine(groupby(owensboro2, [:dayname, :type], sort=true), :arrest => mean),
    :dayname, :type, :arrest_mean,
)
# Code for exercise 13.4
# Variant 1: split by row indexing with the `train` column and its
# element-wise negation (assumes `train` holds Bool values).
train2 = owensboro2[owensboro2.train, :]
test2 = owensboro2[.!owensboro2.train, :]
# Variant 2: split by grouping on `train`. With sort=true the groups are
# ordered by key, so the `false` group (test) is destructured before the
# `true` group (train).
test3, train3 = groupby(owensboro2, :train, sort=true)

164
ch13.jl
View File

@@ -1,20 +1,23 @@
using CSV using CSV
using CategoricalArrays
using DataFrames using DataFrames
using DataFramesMeta using DataFramesMeta
using Dates using Dates
using Distributions
import Downloads import Downloads
using FreqTables
using GLM using GLM
using Plots using Plots
using Random using Random
using ROCAnalysis
using SHA using SHA
using Statistics using Statistics
import ZipFile import ZipFile
url_zip = "https://stacks.stanford.edu/file/druid:yg821jf8611/" * url_zip = "https://stacks.stanford.edu/file/druid:yg821jf8611/" *
"yg821jf8611_ky_owensboro_2020_04_01.csv.zip" "yg821jf8611_ky_owensboro_2020_04_01.csv.zip";
local_zip = "owensboro.zip" local_zip = "owensboro.zip";
# Fetch the archive only when no local copy exists yet. The guard has to test
# `local_zip` (the destination path): testing `url_zip` checks a URL string as
# if it were a local path, which would not match an existing file, so the
# download would be repeated on every run.
isfile(local_zip) || Downloads.download(url_zip, local_zip)
isfile(local_zip) isfile(local_zip)
open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74, open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
0xbc, 0x15, 0x74, 0xc5, 0xbc, 0x15, 0x74, 0xc5,
@@ -25,100 +28,137 @@ open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
0x02, 0x89, 0xdd, 0x74, 0x02, 0x89, 0xdd, 0x74,
0x3c, 0xb3, 0x5d, 0x56] 0x3c, 0xb3, 0x5d, 0x56]
archive = ZipFile.Reader(local_zip) archive = ZipFile.Reader(local_zip)
owensboro = CSV.read(read(only(archive.files)), DataFrame; owensboro = @chain archive begin
missingstring="NA") only(_.files)
read
CSV.read(DataFrame; missingstring="NA")
end;
close(archive) close(archive)
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype) describe(owensboro, :nunique, :nmissing, :eltype)
# Keep only these four columns, modifying `owensboro` in place.
select!(owensboro, :date, :type, :arrest_made, :violation);
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype)
owensboro.violation
# Split every violation string on ";" and strip surrounding whitespace from
# each piece, giving one vector of pieces per row.
violation_list = [strip.(split(x, ";"))
for x in owensboro.violation]
# Concatenate the per-row vectors into one flat vector.
violation_flat = reduce(vcat, violation_list)
# Replace every entry that contains "SPEEDING" with the single label
# "SPEEDING"; all other entries are kept unchanged.
violation_flat_clean = [contains(x, "SPEEDING") ?
"SPEEDING" : x for x in violation_flat]
# Frequency table of the cleaned labels, sorted in descending order.
sort(freqtable(violation_flat_clean), rev=true)
agg_violation = @chain owensboro begin agg_violation = @chain owensboro begin
@rselect(:violation = strip.(split(:violation, ";"))) select(:violation =>
flatten(:violation) ByRow(x -> strip.(split(x, ";"))) =>
@rselect(:violation = contains(:violation, "SPEEDING") ? "SPEEDING" : :violation) :v)
groupby(:violation) flatten(:v)
combine(nrow) select(:v =>
sort!(:nrow, rev=true) ByRow(x -> contains(x, "SPEEDING") ? "SPEEDING" : x) =>
:v)
groupby(:v)
combine(nrow => :count)
sort(:count, rev=true)
end end
top_violation = first(agg_violation.violation, 4) sqrt(4)
# NOTE: `sqrt` has no method for a Vector, so this call is expected to throw a
# MethodError when run; it is contrasted with the ByRow wrapper below.
sqrt([4, 9, 16])
# ByRow(sqrt) is a callable wrapper that applies `sqrt` to each element of
# the collection it is given.
ByRow(sqrt)([4, 9, 16])
f = ByRow(sqrt)
f([4, 9, 16])
# Column `v` holds one vector per row; flatten expands it so that every
# element gets its own row, repeating the matching `id`.
df = DataFrame(id=1:2, v=[[11, 12], [13, 14, 15]])
flatten(df, :v)
# Count rows per `id` group twice: bare `nrow` uses the default output column
# name, while `nrow => :rows` stores the same count in a column named `rows`.
@chain DataFrame(id=[1, 1, 2, 2, 2]) begin
groupby(:id)
combine(nrow, nrow => :rows)
end
df = DataFrame(a=[2, 1, 2, 1, 2], b=5:-1:1)
# Sort by the single column `b`, then by `a` with ties broken by `b`.
# Both calls return a new data frame; `df` itself is not modified.
sort(df, :b)
sort(df, [:a, :b])
df = DataFrame(x=[4, 9, 16])
# No target column name is given here, so transform chooses the name of the
# new column automatically; the original column `x` is kept.
transform(df, :x => ByRow(sqrt))
# DataFramesMeta.jl pipeline over `owensboro`; the result is not assigned.
@chain owensboro begin
# Row-wise: split the violation string on ";" and strip each piece.
@rselect(:v=strip.(split(:violation, ";")))
# One row per individual violation.
flatten(:v)
# Row-wise: collapse every label containing "SPEEDING" into "SPEEDING".
@rselect(:v=contains(:v, "SPEEDING") ?
"SPEEDING" : :v)
# Count rows per label and order by that count, largest first.
groupby(:v)
combine(nrow => :count)
sort(:count, rev=true)
end
df = DataFrame(x=[4, 9, 16])
# Three spellings of the same transformation, each producing a column `s`
# with the square roots of `x`:
# @select operates on whole columns, hence the broadcast `sqrt.`;
@select(df, :s = sqrt.(:x))
# @rselect operates row by row, so the scalar `sqrt` is used;
@rselect(df, :s = sqrt(:x))
# plain DataFrames.jl select uses ByRow for the per-element application.
select(df, :x => ByRow(sqrt) => :s)
owensboro2 = select(owensboro, owensboro2 = select(owensboro,
:date => ByRow(dayofweek) => :day, :arrest_made => :arrest,
:type, :date => ByRow(dayofweek) => :day,
:arrest_made => :arrest, :type,
:violation => [:violation =>
ByRow(x -> contains.(x, top_violation)) => ByRow(x -> contains(x, agg_violation.v[i])) =>
[:v_belt, :v_ins, :v_plate, :v_speed]) "v$i" for i in 1:4])
# mention rename and rename! [:violation =>
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4]
# Exercise: combine(owensboro, [:date :arrest_made] .=> [minimum, maximum])
# select(owensboro, [:date :arrest_made] .=> [minimum, maximum]
# :date => ByRow(dayname) => :day, :type, :arrest_made => :arrest,
# :violation => ByRow(x -> contains.(x, top_violation)) =>
# [:v_belt, :v_ins, :v_plate, :v_speed])
using CategoricalArrays
weekdays = DataFrame(day=1:7, weekdays = DataFrame(day=1:7,
dayname=categorical(dayname.(1:7), ordered=true)) dayname=categorical(dayname.(1:7); ordered=true))
isordered(weekdays.dayname)
levels(weekdays.dayname) levels(weekdays.dayname)
levels!(weekdays.dayname, weekdays.dayname) levels!(weekdays.dayname, weekdays.dayname)
levels(weekdays.dayname)
leftjoin!(owensboro2, weekdays, on=:day) leftjoin!(owensboro2, weekdays; on=:day)
levels(owensboro2.dayname)
@chain owensboro2 begin @chain owensboro2 begin
groupby([:day, :dayname]) groupby([:day, :dayname]; sort=true)
combine(nrow) combine(nrow)
end end
@chain owensboro2 begin freqtable(owensboro2, :dayname, :day)
groupby([:day, :dayname])
combine(nrow)
unstack(:day, :dayname, :nrow)
end
# Alternative:
# unstack(owensboro2, :day, :dayname, :dayname, valuestransform=>length)
@chain owensboro2 begin @chain owensboro2 begin
combine(AsTable(r"v_") => sum => :total) groupby([:day, :dayname]; sort=true)
groupby(:total)
combine(nrow) combine(nrow)
unstack(:dayname, :day, :nrow; fill=0)
end end
select!(owensboro2, :arrest, :dayname, Not(:day))
mapcols(x -> count(ismissing, x), owensboro2)
dropmissing!(owensboro2) dropmissing!(owensboro2)
mapcols(x -> count(ismissing, x), owensboro2) select!(owensboro2, Not(:day))
@chain owensboro2 begin
groupby(:dayname, sort=true)
combine(:arrest => mean)
bar(_.dayname, _.arrest_mean, legend=false,
xlabel="day of week", ylabel="probability of arrest")
end
using Distributions
Random.seed!(1234); Random.seed!(1234);
owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2)); owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2));
mean(owensboro2.train) mean(owensboro2.train)
test, train = groupby(owensboro2, :train, sort=true);
model = glm(@formula(arrest~dayname+type+v_belt+v_ins+v_plate+v_speed), train = subset(owensboro2, :train)
test = subset(owensboro2, :train => ByRow(!))
model = glm(@formula(arrest~dayname+type+v1+v2+v3+v4),
train, Binomial(), LogitLink()) train, Binomial(), LogitLink())
train.predict = predict(model)
test.predict = predict(model, test)
train.predict = predict(model); test_groups = groupby(test, :arrest);
test.predict = predict(model, test);
test_groups = groupby(test, :arrest)
histogram(test_groups[(false,)].predict; histogram(test_groups[(false,)].predict;
bins=10, normalize=:probability, bins=10, normalize=:probability,
fillalpha=0.5, label="false") fillstyle= :/, label="false")
histogram!(test_groups[(true,)].predict; histogram!(test_groups[(true,)].predict;
bins=10, normalize=:probability, bins=10, normalize=:probability,
fillalpha=0.5, label="true") fillalpha=0.5, label="true")
using ROCAnalysis
test_roc = roc(test, score=:predict, target=:arrest) test_roc = roc(test, score=:predict, target=:arrest)
plot(test_roc.pfa, 1 .- test_roc.pmiss; plot(test_roc.pfa, 1 .- test_roc.pmiss;
legend=:bottomright, legend=:bottomright,
@@ -127,5 +167,5 @@ plot(test_roc.pfa, 1 .- test_roc.pmiss;
xlabel="FPR", ylabel="TPR") xlabel="FPR", ylabel="TPR")
train_roc = roc(train, score=:predict, target=:arrest) train_roc = roc(train, score=:predict, target=:arrest)
plot!(train_roc.pfa, 1 .- train_roc.pmiss; plot!(train_roc.pfa, 1 .- train_roc.pmiss;
color="green", lw=3, color="gold", lw=3,
label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)",) label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)")