done chapter 13

This commit is contained in:
Bogumił Kamiński 2022-02-26 12:27:53 +01:00
parent 25da8d0c00
commit 4b1f7bb3ed
3 changed files with 147 additions and 66 deletions

View File

@ -491,9 +491,9 @@ version = "1.3.0"
[[deps.Latexify]]
deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"]
git-tree-sha1 = "2a8650452c07a9c89e6a58f296fd638fadaca021"
git-tree-sha1 = "a6552bfeab40de157a297d84e03ade4b8177677f"
uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
version = "0.15.11"
version = "0.15.12"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
@ -885,9 +885,9 @@ version = "1.8.3"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "95c6a5d0e8c69555842fc4a927fc485040ccc31c"
git-tree-sha1 = "6354dfaf95d398a1a70e0b28238321d5d17b2530"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.3.5"
version = "1.4.0"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]

41
appB.jl
View File

@ -204,3 +204,44 @@ df.b === df.a
df.b == df.a
df[1:2, "a"] .= 10
df
# Code for exercise 13.1
# Builds the modelling table with DataFramesMeta's row-wise `@rselect`: every
# right-hand side is evaluated once per row, so scalar functions such as
# `dayofweek` and `contains` are written without broadcasting dots.
# The result renames `:arrest_made` to `:arrest`, derives `:day` from `:date`,
# keeps `:type` unchanged, and adds four indicator columns `:v1`..`:v4` that
# tell whether `:violation` contains the i-th entry of `agg_violation.v`.
# NOTE(review): `owensboro` and `agg_violation` are not defined in this
# listing; presumably they are the objects built in ch13.jl -- confirm.
@rselect(owensboro,
:arrest = :arrest_made,
:day = dayofweek(:date),
:type,
:v1 = contains(:violation, agg_violation.v[1]),
:v2 = contains(:violation, agg_violation.v[2]),
:v3 = contains(:violation, agg_violation.v[3]),
:v4 = contains(:violation, agg_violation.v[4]))
# Code for exercise 13.2
# Same kind of table, written with the DataFrames.jl
# `source => function => target` mini-language instead of a macro.
# `ByRow(f)` makes `f` operate on one element at a time rather than on the
# whole column.
# The comprehension produces a vector of four transformation pairs, one per
# `i in 1:4`; each tests `:violation` against `agg_violation.v[i]` and stores
# the result in a column named "v1", "v2", "v3" or "v4".
# The last pair adds a `:dayname` column computed from `:date` with `dayname`.
# NOTE(review): `owensboro` and `agg_violation` come from outside this
# listing; presumably from ch13.jl -- confirm.
select(owensboro,
:arrest_made => :arrest,
:date => ByRow(dayofweek) => :day,
:type,
[:violation =>
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4],
:date => ByRow(dayname) => :dayname)
# Code for exercise 13.3
# First pipeline: mean of `:arrest` per `:dayname` group. `sort=true` asks
# `groupby` to order the groups by the grouping key. `:arrest => mean` without
# an explicit target produces a column named `:arrest_mean` (the name used by
# the `unstack` call in the second pipeline).
# NOTE(review): `owensboro2` is not defined in this listing; presumably the
# table prepared in ch13.jl -- confirm.
@chain owensboro2 begin
groupby(:dayname, sort=true)
combine(:arrest => mean)
end
# Second pipeline: the same mean, but per (`:dayname`, `:type`) combination,
# then reshaped from long to wide: one row per `:dayname`, one column per
# distinct value of `:type`, cells filled from `:arrest_mean`.
@chain owensboro2 begin
groupby([:dayname, :type], sort=true)
combine(:arrest => mean)
unstack(:dayname, :type, :arrest_mean)
end
# Code for exercise 13.4
# Two ways of splitting `owensboro2` on its `:train` column.
# Variant 1: row indexing with a Boolean mask; `.!` negates the mask
# element-wise to pick the complementary rows.
train2 = owensboro2[owensboro2.train, :]
test2 = owensboro2[.!owensboro2.train, :]
# Variant 2: group by `:train` and destructure the groups. With `sort=true`
# the groups are ordered by key, and `false` sorts before `true`, so the first
# group holds the test rows and the second the training rows.
# NOTE(review): assumes `:train` takes both values, otherwise the two-way
# destructuring fails; the groups are presumably views into `owensboro2`
# rather than copies -- confirm against the DataFrames.jl documentation.
test3, train3 = groupby(owensboro2, :train, sort=true)

164
ch13.jl
View File

@ -1,20 +1,23 @@
using CSV
using CategoricalArrays
using DataFrames
using DataFramesMeta
using Dates
using Distributions
import Downloads
using FreqTables
using GLM
using Plots
using Random
using ROCAnalysis
using SHA
using Statistics
import ZipFile
url_zip = "https://stacks.stanford.edu/file/druid:yg821jf8611/" *
"yg821jf8611_ky_owensboro_2020_04_01.csv.zip"
local_zip = "owensboro.zip"
isfile(url_zip) || Downloads.download(url_zip, local_zip)
"yg821jf8611_ky_owensboro_2020_04_01.csv.zip";
local_zip = "owensboro.zip";
isfile(local_zip) || Downloads.download(url_zip, local_zip)
isfile(local_zip)
open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
0xbc, 0x15, 0x74, 0xc5,
@ -25,100 +28,137 @@ open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
0x02, 0x89, 0xdd, 0x74,
0x3c, 0xb3, 0x5d, 0x56]
archive = ZipFile.Reader(local_zip)
owensboro = CSV.read(read(only(archive.files)), DataFrame;
missingstring="NA")
owensboro = @chain archive begin
only(_.files)
read
CSV.read(DataFrame; missingstring="NA")
end;
close(archive)
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype)
select!(owensboro, :date, :type, :arrest_made, :violation);
summary(owensboro)
describe(owensboro, :nunique, :nmissing, :eltype)
owensboro.violation
violation_list = [strip.(split(x, ";"))
for x in owensboro.violation]
violation_flat = reduce(vcat, violation_list)
violation_flat_clean = [contains(x, "SPEEDING") ?
"SPEEDING" : x for x in violation_flat]
sort(freqtable(violation_flat_clean), rev=true)
agg_violation = @chain owensboro begin
@rselect(:violation = strip.(split(:violation, ";")))
flatten(:violation)
@rselect(:violation = contains(:violation, "SPEEDING") ? "SPEEDING" : :violation)
groupby(:violation)
combine(nrow)
sort!(:nrow, rev=true)
select(:violation =>
ByRow(x -> strip.(split(x, ";"))) =>
:v)
flatten(:v)
select(:v =>
ByRow(x -> contains(x, "SPEEDING") ? "SPEEDING" : x) =>
:v)
groupby(:v)
combine(nrow => :count)
sort(:count, rev=true)
end
top_violation = first(agg_violation.violation, 4)
# Small stand-alone demonstrations of the building blocks used by the
# data-frame pipelines in this chapter.

# `sqrt` works on a scalar ...
sqrt(4)
# ... but has no method for a vector, so this call is expected to throw;
# it is kept to motivate `ByRow` below.
sqrt([4, 9, 16])
# `ByRow(sqrt)` wraps `sqrt` into a callable that applies it to each element.
ByRow(sqrt)([4, 9, 16])
# The wrapper is an ordinary value: it can be bound to a name and called later.
f = ByRow(sqrt)
f([4, 9, 16])
# `flatten` expands a column holding collections into one row per element;
# the values of the other columns (`id` here) are repeated accordingly.
df = DataFrame(id=1:2, v=[[11, 12], [13, 14, 15]])
flatten(df, :v)
# `nrow` inside `combine` counts the rows of each group. Bare `nrow` uses the
# default column name `nrow`; `nrow => :rows` stores the same count as `:rows`.
@chain DataFrame(id=[1, 1, 2, 2, 2]) begin
groupby(:id)
combine(nrow, nrow => :rows)
end
# Sorting by one column, then by two columns (`:a` first, ties broken by `:b`).
df = DataFrame(a=[2, 1, 2, 1, 2], b=5:-1:1)
sort(df, :b)
sort(df, [:a, :b])
# `transform` keeps the existing columns and appends the result; no target
# name is given, so the new column gets an automatically generated name.
df = DataFrame(x=[4, 9, 16])
transform(df, :x => ByRow(sqrt))
# Frequency table of violations, written with DataFramesMeta macros.
# Steps, in order:
#   1. split each `:violation` string on ";" and strip whitespace from every
#      piece, storing the resulting collection in `:v`;
#   2. `flatten` so that every piece gets its own row;
#   3. replace any piece containing "SPEEDING" by the single label "SPEEDING",
#      leaving other pieces unchanged;
#   4. group by the label and count rows per group into `:count`;
#   5. sort by `:count` in descending order.
# The result is not assigned to a variable here; it is only displayed.
@chain owensboro begin
@rselect(:v=strip.(split(:violation, ";")))
flatten(:v)
@rselect(:v=contains(:v, "SPEEDING") ?
"SPEEDING" : :v)
groupby(:v)
combine(nrow => :count)
sort(:count, rev=true)
end
# Three spellings of the same operation: build a data frame with a single
# column `:s` holding the square roots of `:x`.
df = DataFrame(x=[4, 9, 16])
# `@select` works on whole columns, so `sqrt` must be broadcast with a dot.
@select(df, :s = sqrt.(:x))
# `@rselect` works row by row, so the scalar call is enough.
@rselect(df, :s = sqrt(:x))
# Plain DataFrames.jl mini-language: `ByRow` plays the role of the `r` prefix.
select(df, :x => ByRow(sqrt) => :s)
owensboro2 = select(owensboro,
:date => ByRow(dayofweek) => :day,
:type,
:arrest_made => :arrest,
:violation =>
ByRow(x -> contains.(x, top_violation)) =>
[:v_belt, :v_ins, :v_plate, :v_speed])
:arrest_made => :arrest,
:date => ByRow(dayofweek) => :day,
:type,
[:violation =>
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4])
# mention rename and rename!
[:violation =>
ByRow(x -> contains(x, agg_violation.v[i])) =>
"v$i" for i in 1:4]
# Exercise:
# select(owensboro,
# :date => ByRow(dayname) => :day, :type, :arrest_made => :arrest,
# :violation => ByRow(x -> contains.(x, top_violation)) =>
# [:v_belt, :v_ins, :v_plate, :v_speed])
using CategoricalArrays
combine(owensboro, [:date :arrest_made] .=> [minimum, maximum])
[:date :arrest_made] .=> [minimum, maximum]
weekdays = DataFrame(day=1:7,
dayname=categorical(dayname.(1:7), ordered=true))
dayname=categorical(dayname.(1:7); ordered=true))
isordered(weekdays.dayname)
levels(weekdays.dayname)
levels!(weekdays.dayname, weekdays.dayname)
levels(weekdays.dayname)
leftjoin!(owensboro2, weekdays, on=:day)
levels(owensboro2.dayname)
leftjoin!(owensboro2, weekdays; on=:day)
@chain owensboro2 begin
groupby([:day, :dayname])
groupby([:day, :dayname]; sort=true)
combine(nrow)
end
@chain owensboro2 begin
groupby([:day, :dayname])
combine(nrow)
unstack(:day, :dayname, :nrow)
end
# Alternative:
# unstack(owensboro2, :day, :dayname, :dayname, valuestransform=length)
freqtable(owensboro2, :dayname, :day)
@chain owensboro2 begin
combine(AsTable(r"v_") => sum => :total)
groupby(:total)
groupby([:day, :dayname]; sort=true)
combine(nrow)
unstack(:dayname, :day, :nrow; fill=0)
end
select!(owensboro2, :arrest, :dayname, Not(:day))
mapcols(x -> count(ismissing, x), owensboro2)
dropmissing!(owensboro2)
mapcols(x -> count(ismissing, x), owensboro2)
@chain owensboro2 begin
groupby(:dayname, sort=true)
combine(:arrest => mean)
bar(_.dayname, _.arrest_mean, legend=false,
xlabel="day of week", ylabel="probability of arrest")
end
select!(owensboro2, Not(:day))
using Distributions
Random.seed!(1234);
owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2));
mean(owensboro2.train)
test, train = groupby(owensboro2, :train, sort=true);
model = glm(@formula(arrest~dayname+type+v_belt+v_ins+v_plate+v_speed),
train = subset(owensboro2, :train)
test = subset(owensboro2, :train => ByRow(!))
model = glm(@formula(arrest~dayname+type+v1+v2+v3+v4),
train, Binomial(), LogitLink())
train.predict = predict(model)
test.predict = predict(model, test)
train.predict = predict(model);
test.predict = predict(model, test);
test_groups = groupby(test, :arrest)
test_groups = groupby(test, :arrest);
histogram(test_groups[(false,)].predict;
bins=10, normalize=:probability,
fillalpha=0.5, label="false")
fillstyle= :/, label="false")
histogram!(test_groups[(true,)].predict;
bins=10, normalize=:probability,
fillalpha=0.5, label="true")
using ROCAnalysis
test_roc = roc(test, score=:predict, target=:arrest)
plot(test_roc.pfa, 1 .- test_roc.pmiss;
legend=:bottomright,
@ -127,5 +167,5 @@ plot(test_roc.pfa, 1 .- test_roc.pmiss;
xlabel="FPR", ylabel="TPR")
train_roc = roc(train, score=:predict, target=:arrest)
plot!(train_roc.pfa, 1 .- train_roc.pmiss;
color="green", lw=3,
label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)",)
color="gold", lw=3,
label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)")