Codes for chapters 1, 2, and 3

This commit is contained in:
Bogumił Kamiński 2021-12-29 14:18:24 +01:00
parent f4c3f0f754
commit b82be9e882
6 changed files with 1899 additions and 2 deletions

1030
Manifest.toml Normal file

File diff suppressed because it is too large Load Diff

8
Project.toml Normal file
View File

@ -0,0 +1,8 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

View File

@ -1,2 +1,35 @@
# JuliaForDataAnalysis
Codes for the book "Julia for Data Analysis"
This repository contains the source code for the book "Julia for Data Analysis",
written by Bogumił Kamiński and planned for publication in 2022 by
[Manning Publications Co.](https://www.manning.com/).
In order to prepare the Julia environment before working with the materials
presented in the book please perform the following setup steps:
* [download](https://julialang.org/downloads/) and
[install](https://julialang.org/downloads/platform/)
[Julia](https://julialang.org/);
all the code was tested under Julia 1.7;
* make sure you can start Julia by running the `julia` command in your system shell
(alternative ways to use Julia are described in Appendix A to the book);
* download [this repository](https://github.com/bkamins/JuliaForDataAnalysis)
to a local folder on your computer;
* start Julia in a folder containing the downloaded material using the command
`julia --project`; the folder must
contain the Project.toml and Manifest.toml files prepared for this book
(an explanation of what these files do and why they are required is given in
Appendix A to the book);
* press *]*, write `instantiate` and press *Enter* (this process will ensure
that Julia properly configures the working environment for working with
the codes from the book);
* press *Backspace*, write `exit()` and press *Enter*; now you should exit Julia
and everything is set up to work with the materials presented in the book.
The code for each chapter is stored in a file named *chXX.jl*, where *XX* is
the chapter number.
To work with the code from a given chapter:
* start a fresh Julia session using the `julia --project` command in a folder
containing the downloaded material;
* execute the commands sequentially as they appear in the file;
the code was prepared in such a way that you do not need to restart Julia
when working with material from a single chapter, unless the instructions
explicitly say to restart Julia (some of the code requires this).

25
ch01.jl Normal file
View File

@ -0,0 +1,25 @@
# Bogumił Kamiński, 2021
# Codes for chapter 1
# Code from section 1.2.1
# Add up the integers 1, 2, ..., n with an explicit loop and return the total.
# The accumulator starts as the integer 0, so the result is 0 when 1:n is empty.
function f(n)
    total = 0
    for k in 1:n
        total = total + k
    end
    return total
end
# Time a single call of f for one billion iterations.
# When this is the first call of f in the session, the reported time
# also includes compiling f.
@time f(1_000_000_000)
# Code that reproduces the data frame presented in section 1.3
using DataFrames
# Five columns of different element types: an integer range, strings,
# integers, vectors of integers, and named tuples with city and state fields.
DataFrame(a=1:3, name=["Alice", "Bob", "Clyde"],
age=[19, 24, 21], friends=[[2], [1, 3], [2]],
location=[(city="Atlanta", state="GA"),
(city="Boston", state="MA"),
(city="Austin", state="TX")])

442
ch02.jl Normal file
View File

@ -0,0 +1,442 @@
# Bogumił Kamiński, 2021
# Codes for chapter 2
# Code for listing 2.1
# Literals of five different types; each line evaluates to the value itself.
1
true
"Hello world!"
0.1
[1, 2, 3]
# Code for listing 2.2
# typeof returns the concrete type of each of the literals above.
typeof(1)
typeof(true)
typeof("Hello world!")
typeof(0.1)
typeof([1, 2, 3])
# Code for showing bit representation of numbers
# bitstring returns the bit pattern of a value as a string; the integer 1,
# the float 1.0 and the 8-bit integer 1 each have a different pattern.
bitstring(1)
bitstring(1.0)
bitstring(Int8(1))
# Code showing to what Int alias expands
Int
# Code for checking if value is of some type
# Vector{Int} is an alias of Array{Int, 1}. The second check names Int64
# explicitly, so it is true only where Int is Int64.
[1, 2, 3] isa Vector{Int}
[1, 2, 3] isa Array{Int64, 1}
# Code for section 2.2
x = 1
y = [1, 2, 3]
# A variable can be rebound to a value of another type;
# typeof(x) follows the value currently bound to x.
x = 1
x
typeof(x)
x = 0.1
x
typeof(x)
# Variable names may contain non-ASCII Unicode characters.
Kamiński = 1
x₁ = 0.5
ε = 0.0001
# REPL help-mode queries: type them at the julia> prompt.
# They are not valid statements when the file is run as a script.
?₁
?ε
# Code for listing 2.3
# With x = -7 the elseif x < 0 branch is taken and "negative" is printed.
x = -7
if x > 0
println("positive")
elseif x < 0
println("negative")
elseif x == 0
println("zero")
else
println("unexpected condition")
end
# Code showing that logical condition must be Bool
# x is an integer, not a Bool, so this `if` throws a TypeError
# (intentional demonstration).
x = -7
if x
println("condition was true")
end
# Code showing comparisons against NaN
# Ordering and equality comparisons with NaN are all false;
# only != is true, including NaN != NaN.
NaN > 0
NaN >= 0
NaN < 0
NaN <= 0
NaN == 0
NaN != 0
NaN != NaN
# Code showing that floating point arithmetic is only approximate
# == compares exactly, so this is false: the Float64 sum of 0.1 and 0.2
# is not the same value as the literal 0.3.
0.1 + 0.2 == 0.3
0.1 + 0.2
# isapprox compares within a tolerance and therefore returns true.
isapprox(0.1 + 0.2, 0.3)
# ≈ (typed as \approx followed by Tab) is the infix form of isapprox.
0.1 + 0.2 ≈ 0.3
# Code showing combining conditions
# && and || short-circuit: the right-hand side is evaluated only when needed.
x = -7
x > 0 && x < 10
# x < 0 is true, so log(x) is never evaluated here
x < 0 || log(x) > 10
x = -7
# log of a negative real number throws a DomainError (intentional demonstration)
log(x)
# Code showing typical one-line conditional execution expressions
x = -7
# prints 49 because x < 0 is true
x < 0 && println(x^2)
# prints "x is odd" because iseven(x) is false
iseven(x) || println("x is odd")
# The same two operations written with if blocks
x = -7
if x < 0
println(x^2)
end
if !iseven(x)
println("x is odd")
end
# x < 0 is true, so the condition evaluates to x^2, an integer rather than
# a Bool: the `if` throws a TypeError (intentional demonstration)
x = -7
if x < 0 && x^2
println("inside if")
end
# Code showing ternary operator
# x > 0 is false, so "x is not positive" is printed
x = -7
x > 0 ? println("x is positive") : println("x is not positive")
# Code from listing 2.4
for i in [1, 2, 3]
println(i, " is ", isodd(i) ? "odd" : "even")
end
# Code from listing 2.5
# `global` marks i as the top-level variable, so that the assignment inside
# the loop also works when this code is run from a file.
i = 1
while i < 4
println(i, " is ", isodd(i) ? "odd" : "even")
global i += 1
end
# Code showing break and continue keywords
# Prints the even numbers 2, 4 and 6: `continue` skips odd values of i and
# `break` leaves the loop once i exceeds 6.
i = 0
while true
global i += 1
i > 6 && break
isodd(i) && continue
println(i, " is even")
end
# Code from listing 2.6
# begin ... end groups several expressions into one; its value is the value
# of the last expression. The block runs because x < 0: it prints -7,
# increments x to -6, prints -6 and evaluates to -12.
x = -7
x < 0 && begin
println(x)
x += 1
println(x)
2 * x
end
# (a; b; c) is the one-line form of a compound expression. x is now -6, so
# the second branch runs: x becomes -5, -5 is printed, and the value is -5.
x > 0 ? (println(x); x) : (x += 1; println(x); x)
# Code from section 2.3.4
# Step-by-step computation of the k-times winsorized mean of x for k = 1:
# sort a copy of x, overwrite its k smallest and k largest entries with
# their nearest retained neighbours, then average the result.
x = [8, 3, 1, 5, 7]
k = 1
y = sort(x)
for i in 1:k
    y[i] = y[k + 1]
    y[end - i + 1] = y[end - k]
end
y
s = 0
for v in y
    # `global` makes the assignment update the top-level s also when this
    # code is run from a file, as in the while loops of listing 2.5
    global s += v
end
s
s / length(y)
# Code from listing 2.7
# Return the argument multiplied by two.
function times_two(x)
    doubled = 2x
    return doubled
end

times_two(10)
# Code from listing 2.8
# Return the four arguments as a tuple (x, y, a, b).
# x is a required positional argument and y an optional one (default 10);
# a is a required keyword argument and b an optional one (default 10).
function compose(x, y=10; a, b=10)
    result = (x, y, a, b)
    return result
end
# All arguments given
compose(1, 2; a=3, b=4)
# b takes its default value 10
compose(1, 2; a=3)
# y and b take their default value 10
compose(1; a=3)
# The required keyword argument a is missing: throws UndefKeywordError
# (intentional demonstration)
compose(1)
# The required positional argument x is missing: throws MethodError
# (intentional demonstration)
compose(; a=3)
# Code from listing 2.9
# Short (assignment-form) definitions of times_two and compose
times_two(x) = 2 * x
compose(x, y=10; a, b=10) = x, y, a, b
# Code showing the use of map function
# Applies times_two to every element of the vector
map(times_two, [1, 2, 3])
# Code from listing 2.10
# The same mapping written with an anonymous function
map(x -> 2 * x, [1, 2, 3])
# Code showing sum taking a function as a first argument
# Sums the squares of the elements: 1 + 4 + 9
sum(x -> x ^ 2, [1, 2, 3])
# Code showing do-end syntax
# The do block defines an anonymous function of x that is passed to sum
# as its first argument
sum([1, 2, 3]) do x
println("processing ", x)
return x ^ 2
end
# Code showing the difference between sort and sort!
x = [5, 1, 3, 2]
# sort returns a sorted copy; x itself is left unchanged
sort(x)
x
# sort! sorts x in place
sort!(x)
x
# Code showing a simple implementation of winsorized_mean function
# Compute the k-times winsorized mean of x.
# A sorted copy of x is made (x itself is not modified). In each of the k
# steps one entry at the low end is overwritten with the entry at position
# k + 1 and one entry at the high end with the entry k positions before the
# last one; the arithmetic mean of the result is returned.
# The arguments are not validated here.
function winsorized_mean(x, k)
    sorted = sort(x)
    top = lastindex(sorted)
    for step in 1:k
        sorted[step] = sorted[k + 1]
        sorted[top - step + 1] = sorted[top - k]
    end
    # left-to-right accumulation starting from the integer 0
    total = foldl(+, sorted; init=0)
    return total / length(sorted)
end

winsorized_mean([8, 3, 1, 5, 7], 1)
# Code from section 2.5
# x is a local variable of fun1; fun1() returns 2.
function fun1()
x = 1
return x + 1
end
fun1()
# This x is the top-level variable x (if one is defined),
# not the local x of fun1.
x
# An if block does not introduce a new scope: x assigned inside it is a local
# variable of fun2 and is still visible at the return, so fun2() returns 10.
function fun2()
if true
x = 10
end
return x
end
fun2()
# x is defined before the loop, so the assignment inside the loop updates
# that same local variable; fun3() returns 2.
function fun3()
x = 0
for i in [1, 2, 3]
if i == 2
x = 2
end
end
return x
end
fun3()
# x is first assigned inside the for loop, so it is local to the loop body.
# The x in `return x` is outside the loop and does not see it: it refers to
# a global x, and the call throws UndefVarError when no global x is defined
# (intentional demonstration).
function fun4()
for i in [1, 2, 3]
if i == 2
x = 2
end
end
return x
end
fun4()
# x is local to the loop body and is a fresh variable in every iteration.
# The first iteration prints 1; in the second one `x += 1` reads an x that
# has no value yet and throws UndefVarError (intentional demonstration).
function fun5()
for i in [1, 2, 3]
if i == 1
x = 1
else
x += 1
end
println(x)
end
end
fun5()
# x is defined before the loop, so its value is kept between iterations;
# fun6() prints 1, 2 and 3.
function fun6()
x = 0
for i in [1, 2, 3]
if i == 1
x = 1
else
x += 1
end
println(x)
end
end
fun6()
# Code from section 2.6
# Lists the methods defined for the function cd
methods(cd)
# Each function has its own type, which is a subtype of Function:
# the isa check is true while the == comparison is false.
sum isa Function
typeof(sum)
typeof(sum) == Function
supertype(typeof(sum))
# Print T and then each of its successive supertypes, one per line,
# stopping after Any has been printed. Returns nothing.
function traverse(T)
    current = T
    while true
        println(current)
        current == Any && break
        current = supertype(current)
    end
    return nothing
end

traverse(Int64)
# Print T preceded by indent_level spaces, then recursively print every
# subtype of T with the indentation increased by 2. Returns nothing.
function print_subtypes(T, indent_level=0)
    padding = " " ^ indent_level
    println(padding, T)
    foreach(S -> print_subtypes(S, indent_level + 2), subtypes(T))
    return nothing
end

print_subtypes(Integer)
# Print the chain of supertypes of a Vector{Float64} and of an integer range
traverse(typeof([1.0, 2.0, 3.0]))
traverse(typeof(1:3))
AbstractVector
# typejoin returns the closest common ancestor of the two types
typejoin(typeof([1.0, 2.0, 3.0]), typeof(1:3))
# Code from section 2.7
# Three methods of fun; the most specific method that matches the type of
# the argument is the one that gets called.

# Fallback for an argument of any type not covered below
function fun(x)
    println("unsupported type")
end

# Any Number that is not a Float64
function fun(x::Number)
    println("a number was passed")
end

# Float64 values only
function fun(x::Float64)
    println("a Float64 value")
end

methods(fun)
fun("hello!")
fun(1)
fun(1.0)
# Three methods of bar that differ in which argument is required to be a Number
bar(x, y) = "no numbers passed"
bar(x::Number, y) = "first argument is a number"
bar(x, y::Number) = "second argument is a number"
bar("hello", "world")
bar(1, "world")
bar("hello", 2)
# Both the second and the third method match and neither is more specific
# than the other: the call is ambiguous and throws MethodError
# (intentional demonstration)
bar(1, 2)
# A method that is specific in both arguments resolves the ambiguity
bar(x::Number, y::Number) = "both arguments are numbers"
bar(1, 2)
methods(bar)
# Compute the k-times winsorized mean of the vector x.
# The k smallest values are replaced with the (k + 1)-th smallest value and
# the k largest values with the (k + 1)-th largest one; the arithmetic mean
# of the result is returned. x itself is not modified.
# Throws ArgumentError if k is negative or if length(x) is not greater
# than 2 * k.
function winsorized_mean(x::AbstractVector, k::Integer)
    if k < 0
        throw(ArgumentError("k must be non-negative"))
    end
    if length(x) <= 2 * k
        throw(ArgumentError("k is too large"))
    end
    sorted = sort!(collect(x))
    # the check above guarantees that neither of these two positions is
    # overwritten by the loop, so they can be read once up front
    low = sorted[k + 1]
    high = sorted[end - k]
    for i in 1:k
        sorted[i] = low
        sorted[end - i + 1] = high
    end
    return sum(sorted) / length(sorted)
end

winsorized_mean([8, 3, 1, 5, 7], 1)
winsorized_mean(1:10, 2)
# The four calls below fail on purpose.
# A String is not an Integer: the method with type restrictions does not match
winsorized_mean(1:10, "a")
# 10 is not an AbstractVector: the method with type restrictions does not match
winsorized_mean(10, 1)
# ArgumentError: k must be non-negative
winsorized_mean(1:10, -1)
# ArgumentError: k is too large (length 10 is not greater than 2 * 5)
winsorized_mean(1:10, 5)
# Code from section 2.8
# import brings only the module name Statistics into scope
import Statistics
x = [1, 2, 3]
# mean is not in scope after a plain import: UndefVarError
# (intentional demonstration)
mean(x)
# access through the module name works
Statistics.mean(x)
# using also brings the names exported by Statistics, such as mean, into scope
using Statistics
mean(x)
# start a fresh Julia session before running this code
# mean is already a variable in Main when the module is loaded, so the
# variable is kept (under Julia 1.7 a warning about the conflict is printed)
mean = 1
using Statistics
mean
# start a fresh Julia session before running this code
# mean has already been used as the function from Statistics, so assigning
# to it is an error (intentional demonstration; behaviour under Julia 1.7)
using Statistics
mean([1, 2, 3])
mean = 1
# start a fresh Julia session before running this code
# mean is assigned before it is first used, so it becomes a variable in Main;
# the call then fails because an integer is not callable
# (intentional demonstration; behaviour under Julia 1.7)
using Statistics
mean = 1
mean([1, 2, 3])
# start a fresh Julia session before running this code
using Statistics
using StatsBase
# REPL help-mode query: type it at the julia> prompt
?winsor
# The winsorized mean computed with the winsor function from StatsBase
mean(winsor([8, 3, 1, 5, 7], count=1))
# Code from section 2.9
# A macro can be called with space-separated arguments or with parentheses
@time 1 + 2
@time(1 + 2)
# The condition is false, so both calls throw an AssertionError with the
# given message (intentional demonstration)
@assert 1 == 2 "1 is not equal 2"
@assert(1 == 2, "1 is not equal 2")
# @macroexpand shows the code that a macro call is rewritten to
@macroexpand @assert(1 == 2, "1 is not equal 2")
@macroexpand @time 1 + 2
# before running these codes
# define the winsorized_mean function using the code from section 2.7
using BenchmarkTools
# the trailing semicolon suppresses printing of the result in the REPL
x = rand(10^6);
# $x interpolates the value of the global variable x into the
# benchmarked expression
@benchmark winsorized_mean($x, 10^5)
using Statistics, StatsBase
@benchmark mean(winsor($x; count=10^5))
# @edit opens in an editor the source of the method that this call dispatches to
@edit winsor(x, count=10^5)

359
ch03.jl Normal file
View File

@ -0,0 +1,359 @@
# Bogumił Kamiński, 2021
# Codes for chapter 3
# Code for listing 3.1
# 11×8 matrix of Float64 values (the values match Anscombe's quartet).
# Later code in this file treats columns 1-2, 3-4, 5-6 and 7-8
# as the (x, y) pairs of four data sets.
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89]
# Code for checking size of a matrix
# size(aq) returns the tuple (rows, columns); the second argument selects
# a single dimension
size(aq)
size(aq, 1)
size(aq, 2)
# Code comparing tuple to a vector
v = [1, 2, 3]
t = (1, 2, 3)
# both support reading an element by index
v[1]
t[1]
# a vector is mutable
v[1] = 10
v
# a tuple is immutable: assigning to an element throws MethodError
# (intentional demonstration)
t[1] = 10
# Code for figure 3.2
# Compare the cost of creating a tuple with the cost of creating a vector
using BenchmarkTools
@benchmark (1, 2, 3)
@benchmark [1, 2, 3]
# Code for section 3.1.2
using Statistics
# Mean and standard deviation of every column; dims=1 reduces along the
# first dimension and gives a 1×8 matrix
mean(aq; dims=1)
std(aq; dims=1)
# The same statistics collected into a vector with one entry per column
map(mean, eachcol(aq))
map(std, eachcol(aq))
# do-block form of map(mean, eachcol(aq))
map(eachcol(aq)) do col
mean(col)
end
# comprehension form
[mean(col) for col in eachcol(aq)]
[std(col) for col in eachcol(aq)]
# Code for section 3.1.3
# aq[:, j] creates a copy of column j before it is passed to mean or std
[mean(aq[:, j]) for j in axes(aq, 2)]
[std(aq[:, j]) for j in axes(aq, 2)]
# the valid indices along the second dimension of aq
axes(aq, 2)
# REPL help-mode query: type it at the julia> prompt
?Base.OneTo
# view and @view refer to the column without copying it
[mean(view(aq, :, j)) for j in axes(aq, 2)]
[std(@view aq[:, j]) for j in axes(aq, 2)]
# Code for section 3.1.4
using BenchmarkTools
# 10^7 × 10 matrix of Float64 ones (10^8 values, about 800 MB of memory)
x = ones(10^7, 10)
# column means computed with views, with copies, and with the dims keyword
@benchmark [mean(@view $x[:, j]) for j in axes($x, 2)]
@benchmark [mean($x[:, j]) for j in axes($x, 2)]
@benchmark mean($x, dims=1)
# Code for section 3.1.5
# Correlation of columns i and i+1 for i = 1, 3, 5, 7
[cor(aq[:, i], aq[:, i+1]) for i in 1:2:7]
# the values taken by i in the comprehension above
collect(1:2:7)
# Code for section 3.1.6
# Least-squares fit of column 2 on an intercept and column 1:
# the column of ones in X corresponds to the intercept
y = aq[:, 2]
X = [ones(11) aq[:, 1]]
X \ y
# the same fit for each of the four pairs of columns
[[ones(11) aq[:, i]] \ aq[:, i+1] for i in 1:2:7]
# Coefficient of determination of the least-squares regression of y on an
# intercept and x: one minus the ratio of the residual sum of squares to the
# total sum of squares of y around its mean.
# x and y must have the same number of rows; x may be a vector or a matrix
# with one column per explanatory variable.
function R²(x, y)
    # the column of ones corresponds to the intercept
    X = [ones(size(x, 1)) x]
    model = X \ y
    prediction = X * model
    residual = y - prediction
    SS_res = sum(abs2, residual)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
end

# coefficient of determination for each of the four pairs of columns of aq
[R²(aq[:, i], aq[:, i+1]) for i in 1:2:7]
# Code for section 3.1.7
using Plots
# Scatter plot of column 2 against column 1
scatter(aq[:, 1], aq[:, 2]; legend=false)
# Four scatter plots, one per pair of columns, combined into a single figure
plot(scatter(aq[:, 1], aq[:, 2]; legend=false),
scatter(aq[:, 3], aq[:, 4]; legend=false),
scatter(aq[:, 5], aq[:, 6]; legend=false),
scatter(aq[:, 7], aq[:, 8]; legend=false))
# The same figure: the plots are built in a comprehension and the resulting
# vector is splatted (...) into separate arguments of plot
plot([scatter(aq[:, i], aq[:, i+1]; legend=false)
for i in 1:2:7]...)
# Code for section 3.2
# For two standard six-sided dice, count how many of the 36 ordered outcomes
# give each sum: the keys are the sums and the values are the counts.
two_standard = Dict{Int, Int}()
for i in 1:6, j in 1:6
    s = i + j
    # start from 0 the first time a sum is seen
    two_standard[s] = get(two_standard, s, 0) + 1
end
two_standard
# The keys (sums) and the values (counts) of the dictionary
keys(two_standard)
values(two_standard)
using Plots
# keys and values are collected into vectors before being passed to scatter
scatter(collect(keys(two_standard)), collect(values(two_standard));
legend=false, xaxis=2:12)
# All candidate six-sided dice considered in the search: the first face is
# fixed to 1 and the remaining five faces are non-decreasing values from
# 2 to 11 (each range starts at the value of the previous face).
# Consecutive `for` clauses act as nested loops, with the first one outermost.
all_dice = [[1, x2, x3, x4, x5, x6]
for x2 in 2:11
for x3 in x2:11
for x4 in x3:11
for x5 in x4:11
for x6 in x5:11]
# For every ordered pair of candidate dice, tabulate the distribution of the
# sum of their faces and print the pair when this distribution is equal to
# the one of two standard dice.
for d1 in all_dice
    for d2 in all_dice
        test = Dict{Int, Int}()
        for i in d1, j in d2
            s = i + j
            test[s] = get(test, s, 0) + 1
        end
        test == two_standard && println(d1, " ", d2)
    end
end
# Code for section 3.3
# The same 11×8 matrix as in listing 3.1, repeated so that this section
# can be run on its own
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89]
# A named tuple with fields x and y that hold copies of columns 1 and 2
dataset1 = (x=aq[:, 1], y=aq[:, 2])
# a field can be accessed by position or by name
dataset1[1]
dataset1.x
# Code for listing 3.2
# A named tuple of four named tuples, one per pair of columns of aq
data = (set1=(x=aq[:, 1], y=aq[:, 2]),
set2=(x=aq[:, 3], y=aq[:, 4]),
set3=(x=aq[:, 5], y=aq[:, 6]),
set4=(x=aq[:, 7], y=aq[:, 8]))
# Code for section 3.3.2
using Statistics
# map applies the function to every field of data and returns a named tuple
# with the same field names
map(s -> mean(s.x), data)
map(s -> cor(s.x, s.y), data)
using GLM
# Linear model of y as a function of x fitted to the first data set
model = lm(@formula(y ~ x), data.set1)
r2(model)
# Code for section 3.3.3
# Accesses the field mm of the fitted model object
# (presumably the model matrix; see the GLM documentation)
model.mm
x = [3, 1, 2]
# sort returns a sorted copy; x itself is left unchanged
sort(x)
x
# sort! sorts x in place
sort!(x)
x
# Empty, in place, the collection stored at position i of nt and return it.
# nt itself is not changed: only the mutable object that it holds is modified.
function empty_field!(nt, i)
    return empty!(nt[i])
end

nt = (dict = Dict("a" => 1, "b" => 2), int=10)
empty_field!(nt, 1)
nt
# Code for section 3.4.1
# x is a 1×3 matrix and y a 3-element vector, so x * y is a matrix-vector
# product and gives a 1-element vector
x = [1 2 3]
y = [1, 2, 3]
x * y
a = [1, 2, 3]
b = [4, 5, 6]
# * is not defined for two vectors: MethodError (intentional demonstration)
a * b
# element-wise multiplication, written in three equivalent ways
a .* b
map(*, a, b)
[a[i] * b[i] for i in eachindex(a, b)]
# eachindex checks that all its arguments have the same indices
eachindex(a, b)
# lengths 3 and 2 do not match: DimensionMismatch (intentional demonstration)
eachindex([1, 2, 3], [4, 5])
# map does not perform this check and stops at the end of the shorter vector
map(*, [1, 2, 3], [4, 5])
# broadcasting does check: DimensionMismatch (intentional demonstration)
[1, 2, 3] .* [4, 5]
# Code for section 3.4.2
# a dimension of length 1 is expanded to match the other argument
[1, 2, 3] .* [4]
# a scalar is used with every element
[1, 2, 3] .^ 2
# a 10-element vector times a 1×10 matrix gives the 10×10 multiplication table
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] .* [1 2 3 4 5 6 7 8 9 10]
# a 3×3 matrix of pairs: every string is paired with every function
["x", "y", "z"] .=> [sum minimum maximum]
# a dot after the function name broadcasts the call over the elements
abs.([1, -2, 3, -4])
# abs is not defined for a vector: MethodError (intentional demonstration)
abs([1, 2, 3])
# without the dot all arguments are concatenated into a single string
string(1, 2, 3)
# with the dot "x" is combined with every element of the range
string.("x", 1:10)
# when a function is broadcast, the method is selected for each element separately
f(i::Int) = string("got integer ", i)
f(s::String) = string("got string ", s)
f.([1, "1"])
# Code for section 3.4.3
in(1, [1, 2, 3])
in(4, [1, 2, 3])
# tests whether the whole vector is an element of the second one: false
in([1, 3, 5, 7, 9], [1, 2, 3, 4])
# broadcasts over both arguments, whose lengths 5 and 4 do not match:
# DimensionMismatch (intentional demonstration)
in.([1, 3, 5, 7, 9], [1, 2, 3, 4])
# Ref protects the second vector from being broadcast over, so each element
# of the first vector is tested against the whole second vector
in.([1, 3, 5, 7, 9], Ref([1, 2, 3, 4]))
# Code for section 3.4.4
# The same 11×8 matrix as in listing 3.1, repeated so that this section
# can be run on its own
aq = [10.0 8.04 10.0 9.14 10.0 7.46 8.0 6.58
8.0 6.95 8.0 8.14 8.0 6.77 8.0 5.76
13.0 7.58 13.0 8.74 13.0 12.74 8.0 7.71
9.0 8.81 9.0 8.77 9.0 7.11 8.0 8.84
11.0 8.33 11.0 9.26 11.0 7.81 8.0 8.47
14.0 9.96 14.0 8.1 14.0 8.84 8.0 7.04
6.0 7.24 6.0 6.13 6.0 6.08 8.0 5.25
4.0 4.26 4.0 3.1 4.0 5.39 19.0 12.50
12.0 10.84 12.0 9.13 12.0 8.15 8.0 5.56
7.0 4.82 7.0 7.26 7.0 6.42 8.0 7.91
5.0 5.68 5.0 4.74 5.0 5.73 8.0 6.89]
using Statistics
# with the dot: the mean of every column, one number per column
mean.(eachcol(aq))
# without the dot: the mean of the collection of columns, that is, a single
# vector that is the element-wise average of the eight columns
mean(eachcol(aq))
# Coefficient of determination of the least-squares regression of y on an
# intercept and x: one minus the ratio of the residual sum of squares to the
# total sum of squares of y around its mean.
# x and y must have the same number of rows; x may be a vector or a matrix
# with one column per explanatory variable.
function R²(x, y)
    # the column of ones corresponds to the intercept
    X = [ones(size(x, 1)) x]
    model = X \ y
    prediction = X * model
    residual = y - prediction
    SS_res = sum(abs2, residual)
    mean_y = mean(y)
    SS_tot = sum(v -> (v - mean_y) ^ 2, y)
    return 1 - SS_res / SS_tot
end
# Coefficient of determination of the least-squares regression of y on an
# intercept and x, written with broadcasting: one minus the ratio of the
# residual sum of squares to the total sum of squares of y around its mean.
# x and y must have the same number of rows; x may be a vector or a matrix
# with one column per explanatory variable.
function R²(x, y)
    # the column of ones corresponds to the intercept
    X = [ones(size(x, 1)) x]
    model = X \ y
    prediction = X * model
    SS_res = sum((y .- prediction) .^ 2)
    SS_tot = sum((y .- mean(y)) .^ 2)
    return 1 - SS_res / SS_tot
end
# Code for section 3.5
# An empty vector and an empty dictionary created without type information
# accept values of any type
[]
Dict()
# A type written before the brackets sets the element type; the integer
# literals are converted to Float64
Float64[1, 2, 3]
# Key and value types given explicitly; the integer literals are converted
# to UInt8 keys and Float64 values
Dict{UInt8, Float64}(0 => 0, 1 => 1)
# Converts the integer 200 to a UInt32
UInt32(200)
# An abstract element type: the values keep their own types
# (an Int, a Float64 and a UInt8)
Real[1, 1.0, 0x3]
# eltype returns the element type of a collection.
# The explicit Any prefix keeps the element type Any
v1 = Any[1, 2, 3]
eltype(v1)
# The explicit Float64 prefix converts the values to Float64
v2 = Float64[1, 2, 3]
eltype(v2)
# Without a prefix the element type is taken from the values: Int
v3 = [1, 2, 3]
eltype(v3)
# The element type of a dictionary is a Pair of its key and value types
d1 = Dict()
eltype(d1)
d2 = Dict(1 => 2, 3 => 4)
eltype(d2)
# => creates a Pair
p = 1 => 2
typeof(p)
# Code for section 3.5.1
[1, 2, 3] isa AbstractVector{Int}
# false: a vector whose elements are Int is not an AbstractVector{Real},
# although Int is a subtype of Real
[1, 2, 3] isa AbstractVector{Real}
# matches any AbstractVector whose element type is a subtype of Real
AbstractVector{<:Real}
# Code for section 3.5.2
using Statistics
# Sample covariance of x and y: the sum of the products of the deviations of
# x and y from their means, divided by the number of observations minus one.
# Both arguments must be vectors whose element type is a subtype of Real.
# An AssertionError is thrown unless x and y have the same, non-zero length.
function ourcov(x::AbstractVector{<:Real},
                y::AbstractVector{<:Real})
    len = length(x)
    @assert len == length(y) > 0
    x_dev = x .- mean(x)
    y_dev = y .- mean(y)
    cross = sum(x_dev .* y_dev)
    return cross / (len - 1)
end

ourcov(1:4, [1.0, 3.0, 2.0, 4.0])
cov(1:4, [1.0, 3.0, 2.0, 4.0])
# The element type of the second argument is Any, which is not a subtype of
# Real, so no method of ourcov matches: MethodError (intentional demonstration)
ourcov(1:4, Any[1.0, 3.0, 2.0, 4.0])
x = Any[1, 2, 3]
# broadcasting identity builds a new vector whose element type is narrowed
# to fit the values it actually holds (here Int)
identity.(x)
# with an Int and a Float64 the narrowed element type is Real
y = Any[1, 2.0]
identity.(y)