done chapter 13
This commit is contained in:
@@ -491,9 +491,9 @@ version = "1.3.0"
|
|||||||
|
|
||||||
[[deps.Latexify]]
|
[[deps.Latexify]]
|
||||||
deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"]
|
deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"]
|
||||||
git-tree-sha1 = "2a8650452c07a9c89e6a58f296fd638fadaca021"
|
git-tree-sha1 = "a6552bfeab40de157a297d84e03ade4b8177677f"
|
||||||
uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
|
uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
|
||||||
version = "0.15.11"
|
version = "0.15.12"
|
||||||
|
|
||||||
[[deps.LibCURL]]
|
[[deps.LibCURL]]
|
||||||
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
|
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
|
||||||
@@ -885,9 +885,9 @@ version = "1.8.3"
|
|||||||
|
|
||||||
[[deps.StaticArrays]]
|
[[deps.StaticArrays]]
|
||||||
deps = ["LinearAlgebra", "Random", "Statistics"]
|
deps = ["LinearAlgebra", "Random", "Statistics"]
|
||||||
git-tree-sha1 = "95c6a5d0e8c69555842fc4a927fc485040ccc31c"
|
git-tree-sha1 = "6354dfaf95d398a1a70e0b28238321d5d17b2530"
|
||||||
uuid = "90137ffa-7385-5640-81b9-e52037218182"
|
uuid = "90137ffa-7385-5640-81b9-e52037218182"
|
||||||
version = "1.3.5"
|
version = "1.4.0"
|
||||||
|
|
||||||
[[deps.Statistics]]
|
[[deps.Statistics]]
|
||||||
deps = ["LinearAlgebra", "SparseArrays"]
|
deps = ["LinearAlgebra", "SparseArrays"]
|
||||||
|
|||||||
41
appB.jl
41
appB.jl
@@ -204,3 +204,44 @@ df.b === df.a
|
|||||||
df.b == df.a
|
df.b == df.a
|
||||||
df[1:2, "a"] .= 10
|
df[1:2, "a"] .= 10
|
||||||
df
|
df
|
||||||
|
|
||||||
|
# Code for exercise 13.1
|
||||||
|
|
||||||
|
@rselect(owensboro,
|
||||||
|
:arrest = :arrest_made,
|
||||||
|
:day = dayofweek(:date),
|
||||||
|
:type,
|
||||||
|
:v1 = contains(:violation, agg_violation.v[1]),
|
||||||
|
:v2 = contains(:violation, agg_violation.v[2]),
|
||||||
|
:v3 = contains(:violation, agg_violation.v[3]),
|
||||||
|
:v4 = contains(:violation, agg_violation.v[4]))
|
||||||
|
|
||||||
|
# Code for exercise 13.2
|
||||||
|
|
||||||
|
select(owensboro,
|
||||||
|
:arrest_made => :arrest,
|
||||||
|
:date => ByRow(dayofweek) => :day,
|
||||||
|
:type,
|
||||||
|
[:violation =>
|
||||||
|
ByRow(x -> contains(x, agg_violation.v[i])) =>
|
||||||
|
"v$i" for i in 1:4],
|
||||||
|
:date => ByRow(dayname) => :dayname)
|
||||||
|
|
||||||
|
# Code for exercise 13.3
|
||||||
|
|
||||||
|
@chain owensboro2 begin
|
||||||
|
groupby(:dayname, sort=true)
|
||||||
|
combine(:arrest => mean)
|
||||||
|
end
|
||||||
|
|
||||||
|
@chain owensboro2 begin
|
||||||
|
groupby([:dayname, :type], sort=true)
|
||||||
|
combine(:arrest => mean)
|
||||||
|
unstack(:dayname, :type, :arrest_mean)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Code for exercise 13.4
|
||||||
|
|
||||||
|
train2 = owensboro2[owensboro2.train, :]
|
||||||
|
test2 = owensboro2[.!owensboro2.train, :]
|
||||||
|
test3, train3 = groupby(owensboro2, :train, sort=true)
|
||||||
|
|||||||
164
ch13.jl
164
ch13.jl
@@ -1,20 +1,23 @@
|
|||||||
using CSV
|
using CSV
|
||||||
|
using CategoricalArrays
|
||||||
using DataFrames
|
using DataFrames
|
||||||
using DataFramesMeta
|
using DataFramesMeta
|
||||||
using Dates
|
using Dates
|
||||||
|
using Distributions
|
||||||
import Downloads
|
import Downloads
|
||||||
|
using FreqTables
|
||||||
using GLM
|
using GLM
|
||||||
using Plots
|
using Plots
|
||||||
using Random
|
using Random
|
||||||
|
using ROCAnalysis
|
||||||
using SHA
|
using SHA
|
||||||
using Statistics
|
using Statistics
|
||||||
import ZipFile
|
import ZipFile
|
||||||
|
|
||||||
url_zip = "https://stacks.stanford.edu/file/druid:yg821jf8611/" *
|
url_zip = "https://stacks.stanford.edu/file/druid:yg821jf8611/" *
|
||||||
"yg821jf8611_ky_owensboro_2020_04_01.csv.zip"
|
"yg821jf8611_ky_owensboro_2020_04_01.csv.zip";
|
||||||
local_zip = "owensboro.zip"
|
local_zip = "owensboro.zip";
|
||||||
|
isfile(local_zip) || Downloads.download(url_zip, local_zip)
|
||||||
isfile(url_zip) || Downloads.download(url_zip, local_zip)
|
|
||||||
isfile(local_zip)
|
isfile(local_zip)
|
||||||
open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
|
open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
|
||||||
0xbc, 0x15, 0x74, 0xc5,
|
0xbc, 0x15, 0x74, 0xc5,
|
||||||
@@ -25,100 +28,137 @@ open(sha256, local_zip) == [0x14, 0x3b, 0x7d, 0x74,
|
|||||||
0x02, 0x89, 0xdd, 0x74,
|
0x02, 0x89, 0xdd, 0x74,
|
||||||
0x3c, 0xb3, 0x5d, 0x56]
|
0x3c, 0xb3, 0x5d, 0x56]
|
||||||
archive = ZipFile.Reader(local_zip)
|
archive = ZipFile.Reader(local_zip)
|
||||||
owensboro = CSV.read(read(only(archive.files)), DataFrame;
|
owensboro = @chain archive begin
|
||||||
missingstring="NA")
|
only(_.files)
|
||||||
|
read
|
||||||
|
CSV.read(DataFrame; missingstring="NA")
|
||||||
|
end;
|
||||||
close(archive)
|
close(archive)
|
||||||
|
|
||||||
|
summary(owensboro)
|
||||||
describe(owensboro, :nunique, :nmissing, :eltype)
|
describe(owensboro, :nunique, :nmissing, :eltype)
|
||||||
|
|
||||||
|
select!(owensboro, :date, :type, :arrest_made, :violation);
|
||||||
|
summary(owensboro)
|
||||||
|
describe(owensboro, :nunique, :nmissing, :eltype)
|
||||||
|
|
||||||
|
owensboro.violation
|
||||||
|
|
||||||
|
violation_list = [strip.(split(x, ";"))
|
||||||
|
for x in owensboro.violation]
|
||||||
|
violation_flat = reduce(vcat, violation_list)
|
||||||
|
violation_flat_clean = [contains(x, "SPEEDING") ?
|
||||||
|
"SPEEDING" : x for x in violation_flat]
|
||||||
|
sort(freqtable(violation_flat_clean), rev=true)
|
||||||
|
|
||||||
agg_violation = @chain owensboro begin
|
agg_violation = @chain owensboro begin
|
||||||
@rselect(:violation = strip.(split(:violation, ";")))
|
select(:violation =>
|
||||||
flatten(:violation)
|
ByRow(x -> strip.(split(x, ";"))) =>
|
||||||
@rselect(:violation = contains(:violation, "SPEEDING") ? "SPEEDING" : :violation)
|
:v)
|
||||||
groupby(:violation)
|
flatten(:v)
|
||||||
combine(nrow)
|
select(:v =>
|
||||||
sort!(:nrow, rev=true)
|
ByRow(x -> contains(x, "SPEEDING") ? "SPEEDING" : x) =>
|
||||||
|
:v)
|
||||||
|
groupby(:v)
|
||||||
|
combine(nrow => :count)
|
||||||
|
sort(:count, rev=true)
|
||||||
end
|
end
|
||||||
|
|
||||||
top_violation = first(agg_violation.violation, 4)
|
sqrt(4)
|
||||||
|
sqrt([4, 9, 16])
|
||||||
|
ByRow(sqrt)([4, 9, 16])
|
||||||
|
f = ByRow(sqrt)
|
||||||
|
f([4, 9, 16])
|
||||||
|
|
||||||
|
df = DataFrame(id=1:2, v=[[11, 12], [13, 14, 15]])
|
||||||
|
flatten(df, :v)
|
||||||
|
|
||||||
|
@chain DataFrame(id=[1, 1, 2, 2, 2]) begin
|
||||||
|
groupby(:id)
|
||||||
|
combine(nrow, nrow => :rows)
|
||||||
|
end
|
||||||
|
|
||||||
|
df = DataFrame(a=[2, 1, 2, 1, 2], b=5:-1:1)
|
||||||
|
sort(df, :b)
|
||||||
|
sort(df, [:a, :b])
|
||||||
|
|
||||||
|
df = DataFrame(x=[4, 9, 16])
|
||||||
|
transform(df, :x => ByRow(sqrt))
|
||||||
|
|
||||||
|
@chain owensboro begin
|
||||||
|
@rselect(:v=strip.(split(:violation, ";")))
|
||||||
|
flatten(:v)
|
||||||
|
@rselect(:v=contains(:v, "SPEEDING") ?
|
||||||
|
"SPEEDING" : :v)
|
||||||
|
groupby(:v)
|
||||||
|
combine(nrow => :count)
|
||||||
|
sort(:count, rev=true)
|
||||||
|
end
|
||||||
|
|
||||||
|
df = DataFrame(x=[4, 9, 16])
|
||||||
|
@select(df, :s = sqrt.(:x))
|
||||||
|
@rselect(df, :s = sqrt(:x))
|
||||||
|
select(df, :x => ByRow(sqrt) => :s)
|
||||||
|
|
||||||
owensboro2 = select(owensboro,
|
owensboro2 = select(owensboro,
|
||||||
:date => ByRow(dayofweek) => :day,
|
:arrest_made => :arrest,
|
||||||
:type,
|
:date => ByRow(dayofweek) => :day,
|
||||||
:arrest_made => :arrest,
|
:type,
|
||||||
:violation =>
|
[:violation =>
|
||||||
ByRow(x -> contains.(x, top_violation)) =>
|
ByRow(x -> contains(x, agg_violation.v[i])) =>
|
||||||
[:v_belt, :v_ins, :v_plate, :v_speed])
|
"v$i" for i in 1:4])
|
||||||
|
|
||||||
# mention rename and rename!
|
[:violation =>
|
||||||
|
ByRow(x -> contains(x, agg_violation.v[i])) =>
|
||||||
|
"v$i" for i in 1:4]
|
||||||
|
|
||||||
# Exercise:
|
combine(owensboro, [:date :arrest_made] .=> [minimum, maximum])
|
||||||
# select(owensboro,
|
[:date :arrest_made] .=> [minimum, maximum]
|
||||||
# :date => ByRow(dayname) => :day, :type, :arrest_made => :arrest,
|
|
||||||
# :violation => ByRow(x -> contains.(x, top_violation)) =>
|
|
||||||
# [:v_belt, :v_ins, :v_plate, :v_speed])
|
|
||||||
|
|
||||||
using CategoricalArrays
|
|
||||||
|
|
||||||
weekdays = DataFrame(day=1:7,
|
weekdays = DataFrame(day=1:7,
|
||||||
dayname=categorical(dayname.(1:7), ordered=true))
|
dayname=categorical(dayname.(1:7); ordered=true))
|
||||||
|
isordered(weekdays.dayname)
|
||||||
levels(weekdays.dayname)
|
levels(weekdays.dayname)
|
||||||
levels!(weekdays.dayname, weekdays.dayname)
|
levels!(weekdays.dayname, weekdays.dayname)
|
||||||
levels(weekdays.dayname)
|
|
||||||
leftjoin!(owensboro2, weekdays, on=:day)
|
leftjoin!(owensboro2, weekdays; on=:day)
|
||||||
levels(owensboro2.dayname)
|
|
||||||
|
|
||||||
@chain owensboro2 begin
|
@chain owensboro2 begin
|
||||||
groupby([:day, :dayname])
|
groupby([:day, :dayname]; sort=true)
|
||||||
combine(nrow)
|
combine(nrow)
|
||||||
end
|
end
|
||||||
|
|
||||||
@chain owensboro2 begin
|
freqtable(owensboro2, :dayname, :day)
|
||||||
groupby([:day, :dayname])
|
|
||||||
combine(nrow)
|
|
||||||
unstack(:day, :dayname, :nrow)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Alternative:
|
|
||||||
# unstack(owensboro2, :day, :dayname, :dayname, valuestransform=>length)
|
|
||||||
|
|
||||||
@chain owensboro2 begin
|
@chain owensboro2 begin
|
||||||
combine(AsTable(r"v_") => sum => :total)
|
groupby([:day, :dayname]; sort=true)
|
||||||
groupby(:total)
|
|
||||||
combine(nrow)
|
combine(nrow)
|
||||||
|
unstack(:dayname, :day, :nrow; fill=0)
|
||||||
end
|
end
|
||||||
|
|
||||||
select!(owensboro2, :arrest, :dayname, Not(:day))
|
|
||||||
mapcols(x -> count(ismissing, x), owensboro2)
|
|
||||||
dropmissing!(owensboro2)
|
dropmissing!(owensboro2)
|
||||||
mapcols(x -> count(ismissing, x), owensboro2)
|
select!(owensboro2, Not(:day))
|
||||||
@chain owensboro2 begin
|
|
||||||
groupby(:dayname, sort=true)
|
|
||||||
combine(:arrest => mean)
|
|
||||||
bar(_.dayname, _.arrest_mean, legend=false,
|
|
||||||
xlabel="day of week", ylabel="probability of arrest")
|
|
||||||
end
|
|
||||||
|
|
||||||
using Distributions
|
|
||||||
Random.seed!(1234);
|
Random.seed!(1234);
|
||||||
owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2));
|
owensboro2.train = rand(Bernoulli(0.7), nrow(owensboro2));
|
||||||
mean(owensboro2.train)
|
mean(owensboro2.train)
|
||||||
test, train = groupby(owensboro2, :train, sort=true);
|
|
||||||
|
|
||||||
model = glm(@formula(arrest~dayname+type+v_belt+v_ins+v_plate+v_speed),
|
train = subset(owensboro2, :train)
|
||||||
|
test = subset(owensboro2, :train => ByRow(!))
|
||||||
|
|
||||||
|
model = glm(@formula(arrest~dayname+type+v1+v2+v3+v4),
|
||||||
train, Binomial(), LogitLink())
|
train, Binomial(), LogitLink())
|
||||||
|
train.predict = predict(model)
|
||||||
|
test.predict = predict(model, test)
|
||||||
|
|
||||||
train.predict = predict(model);
|
test_groups = groupby(test, :arrest);
|
||||||
test.predict = predict(model, test);
|
|
||||||
|
|
||||||
test_groups = groupby(test, :arrest)
|
|
||||||
histogram(test_groups[(false,)].predict;
|
histogram(test_groups[(false,)].predict;
|
||||||
bins=10, normalize=:probability,
|
bins=10, normalize=:probability,
|
||||||
fillalpha=0.5, label="false")
|
fillstyle= :/, label="false")
|
||||||
histogram!(test_groups[(true,)].predict;
|
histogram!(test_groups[(true,)].predict;
|
||||||
bins=10, normalize=:probability,
|
bins=10, normalize=:probability,
|
||||||
fillalpha=0.5, label="true")
|
fillalpha=0.5, label="true")
|
||||||
|
|
||||||
using ROCAnalysis
|
|
||||||
test_roc = roc(test, score=:predict, target=:arrest)
|
test_roc = roc(test, score=:predict, target=:arrest)
|
||||||
plot(test_roc.pfa, 1 .- test_roc.pmiss;
|
plot(test_roc.pfa, 1 .- test_roc.pmiss;
|
||||||
legend=:bottomright,
|
legend=:bottomright,
|
||||||
@@ -127,5 +167,5 @@ plot(test_roc.pfa, 1 .- test_roc.pmiss;
|
|||||||
xlabel="FPR", ylabel="TPR")
|
xlabel="FPR", ylabel="TPR")
|
||||||
train_roc = roc(train, score=:predict, target=:arrest)
|
train_roc = roc(train, score=:predict, target=:arrest)
|
||||||
plot!(train_roc.pfa, 1 .- train_roc.pmiss;
|
plot!(train_roc.pfa, 1 .- train_roc.pmiss;
|
||||||
color="green", lw=3,
|
color="gold", lw=3,
|
||||||
label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)",)
|
label="train (AUC=$(round(100*(1-auc(train_roc)), digits=2))%)")
|
||||||
|
|||||||
Reference in New Issue
Block a user