JuliaForDataAnalysis/ch06.jl

226 lines
3.7 KiB
Julia
Raw Normal View History

2022-01-17 00:22:06 +01:00
# Bogumił Kamiński, 2022
2022-02-12 19:16:05 +01:00
# Code for chapter 6
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.1
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
import Downloads
# Fetch the MovieTweetings 10K snapshot (pinned to a fixed commit hash) and
# save it as movies.dat. The URL is assembled from its path segments so that
# each source line stays short.
Downloads.download(
    string(
        "https://raw.githubusercontent.com/",
        "sidooms/MovieTweetings/",
        "44c525d0c766944910686c60697203cda39305d6/",
        "snapshots/10K/movies.dat",
    ),
    "movies.dat",
)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for string interpolation examples
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
x = 10
"I have $x apples"
2022-01-17 00:22:06 +01:00
2022-04-12 11:21:05 +02:00
"I have $(2 * x) apples"
2022-02-08 20:58:33 +01:00
"I have \$100."
"I have $100."
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for multiline strings
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Download the MovieTweetings snapshot to movies.dat, writing the URL as one
# string literal split across source lines: a backslash immediately before the
# newline continues the literal on the next line, so no newline ends up in the
# URL (string line continuation, available from Julia 1.7).
Downloads.download("https://raw.githubusercontent.com/\
sidooms/MovieTweetings/\
44c525d0c766944910686c60697203cda39305d6/\
snapshots/10K/movies.dat",
"movies.dat")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
"a\
b\
c"
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for raw strings
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
"C:\my_folder\my_file.txt"
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
raw"C:\my_folder\my_file.txt"
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.2
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
movies = readlines("movies.dat")
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.2
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
movie1 = first(movies)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
movie1_parts = split(movie1, "::")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
supertype(String)
supertype(SubString{String})
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.3
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
movie1_parts[2]
2022-01-17 00:22:06 +01:00
2022-04-24 21:13:15 +02:00
rx = r"(.+) \((\d{4})\)$"
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
m = match(rx, movie1_parts[2])
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
m[1]
m[2]
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
parse(Int, m[2])
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.3
2022-01-17 00:22:06 +01:00
2022-04-12 11:23:32 +02:00
"""
    parseline(line::AbstractString)

Parse one movie record whose fields are separated by `::`, in the form
`id::name (year)::genre1|genre2`.

Return a named tuple `(id, name, year, genres)`: `id` is the first field, `name`
is the title text preceding the parenthesised year, `year` is that four-digit
year parsed as an `Int`, and `genres` is the third field split on `|`.

Throw an `ArgumentError` if `line` has fewer than three `::`-separated fields or
if the second field does not contain a title followed by a four-digit year in
parentheses.
"""
function parseline(line::AbstractString)
    parts = split(line, "::")
    if length(parts) < 3
        throw(ArgumentError(
            "expected at least 3 fields separated by \"::\", got $(repr(line))"))
    end
    # `match` returns `nothing` when the pattern is not found; check explicitly
    # so a malformed title is reported clearly instead of failing on `m[1]`.
    m = match(r"(.+) \((\d{4})\)", parts[2])
    if m === nothing
        throw(ArgumentError(
            "cannot find \"name (year)\" in field $(repr(parts[2]))"))
    end
    return (id=parts[1],
            name=m[1],
            year=parse(Int, m[2]),
            genres=split(parts[3], "|"))
end
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for parsing one line of movies data
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
record1 = parseline(movie1)
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.4
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
codeunits("a")
codeunits("ε")
codeunits("")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for different patterns of string subsetting
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
word = first(record1.name, 8)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
record1.name[1:8]
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Print every valid index of `word` next to the character stored at that index.
for (idx, ch) in pairs(word)
    println(idx, ": ", ch)
end
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
codeunits("ô")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
codeunits("Fantômas")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
isascii("Hello world!")
isascii("∀ x: x≥0")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
word[1]
word[5]
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.5
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
records = parseline.(movies)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Flatten the per-record genre lists into a single Vector{String}; each genre
# is converted to String when it is pushed into the typed vector.
genres = String[]
for rec in records, g in rec.genres
    push!(genres, g)
end
genres
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using FreqTables
table = freqtable(genres)
sort!(table)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Year of each record, and a flag telling whether "Drama" is among its genres.
years = map(rec -> rec.year, records)
has_drama = map(rec -> "Drama" in rec.genres, records)
# Cross-tabulate year against the flag as proportions.
# NOTE(review): margins=1 presumably normalizes within each year — confirm
# against the FreqTables documentation.
drama_prop = proptable(years, has_drama; margins=1)
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.5
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using Plots
# Plot the second column of drama_prop against the names of its first
# dimension, with the legend turned off and both axes labelled.
# NOTE(review): the second column is presumably the `true` (has Drama) share —
# confirm against the column order of drama_prop.
plot(names(drama_prop, 1), drama_prop[:, 2]; legend=false,
xlabel="year", ylabel="Drama probability")
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.6.1
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
s1 = Symbol("x")
s2 = Symbol("hello world!")
s3 = Symbol("x", 1)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
typeof(s1)
typeof(s2)
typeof(s3)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
Symbol("1")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
:x
:x1
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# `:hello world` is not a single symbol: the `:name` literal stops at the space,
# so this line is expected to fail to parse. Symbol("hello world!") earlier in
# this file shows the constructor form that accepts such names.
:hello world
# `:1` evaluates to the integer 1, not to a Symbol; Symbol("1") is the way to
# obtain a symbol with that name.
:1
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.6.2
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
supertype(Symbol)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
:x == :x
:x == :y
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.6
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using BenchmarkTools
# One million distinct names "x1", "x2", ..., first as strings and then as the
# corresponding symbols, used as inputs for the lookup benchmark.
str = ["x$i" for i in 1:10^6]
symb = map(Symbol, str)
2022-03-22 22:49:55 +01:00
@btime "x" in $str;
@btime :x in $symb;
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.7
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using InlineStrings
s1 = InlineString("x")
typeof(s1)
s2 = InlineString("")
typeof(s2)
sv = inlinestrings(["The", "quick", "brown", "fox", "jumps",
"over", "the", "lazy", "dog"])
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.7
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using Random
using BenchmarkTools
# Seed the global RNG so the generated strings are reproducible.
Random.seed!(1234);
# One million random 3-character strings, then the same vector passed through
# inlinestrings.
s1 = map(_ -> randstring(3), 1:10^6)
s2 = inlinestrings(s1)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Code for analyzing properties of InlineStrings.jl
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
Base.summarysize(s1)
Base.summarysize(s2)
2022-01-17 00:22:06 +01:00
2022-03-22 22:49:55 +01:00
@btime sort($s1);
@btime sort($s2);
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for listing 6.8
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
# Write iris.txt: the three species names, one per line, repeated 10^6 times
# (3 * 10^6 lines in total). The do-block form of `open` closes the file when
# the block finishes, including when writing raises an error.
open("iris.txt", "w") do io
    for _ in 1:10^6
        println(io, "Iris setosa")
        println(io, "Iris virginica")
        println(io, "Iris versicolor")
    end
end
2022-02-12 19:16:05 +01:00
# Code for section 6.8.2
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
uncompressed = readlines("iris.txt")
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
using PooledArrays
compressed = PooledArray(uncompressed)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
Base.summarysize(uncompressed)
Base.summarysize(compressed)
2022-01-17 00:22:06 +01:00
2022-02-12 19:16:05 +01:00
# Code for section 6.8.3
2022-02-08 20:58:33 +01:00
compressed.invpool
compressed.pool
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
compressed[10]
compressed.pool[compressed.refs[10]]
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
Base.summarysize.(compressed.pool)
2022-01-17 00:22:06 +01:00
2022-02-08 20:58:33 +01:00
v1 = string.("x", 1:10^6)
v2 = PooledArray(v1)
Base.summarysize(v1)
Base.summarysize(v2)