# JuliaForDataAnalysis/appB.jl

# Bogumił Kamiński, 2021

# Code for appendix B

# The solution to each exercise assumes that the user's Julia session is in
# the state reached at the point of the chapter where the exercise is posed:
# the packages loaded, and the variables and functions defined, up to that
# point are expected to be available.

# Code for exercise 3.1

# Benchmark `sort` on a range and on a Vector holding the same values.
using BenchmarkTools
# `x` is a UnitRange; `y` is the same sequence materialized with `collect`
x = 1:10^6;
y = collect(x);
# `$` interpolates the global variables into the benchmarked expression
@btime sort($x);
@btime sort($y);
# open the source of the `sort` method that is invoked for the range `x`
@edit sort(x)

# Code for exercise 4.1

using Statistics
using BenchmarkTools
# 11×8 matrix of Float64 values; the code below treats columns (1, 2), (3, 4),
# (5, 6) and (7, 8) as pairs of variables
# NOTE(review): the values look like Anscombe's quartet -- confirm against the chapter
aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  12.50
      12.0  10.84  12.0  9.13  12.0   8.15   8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89];
# correlation of each column pair; indexing with `aq[:, i]` copies the column
@benchmark [cor($aq[:, i], $aq[:, i+1]) for i in 1:2:7]
# the same computation on views, which do not copy the columns
@benchmark [cor(view($aq, :, i), view($aq, :, i+1)) for i in 1:2:7]
# the same computation written with the `@view` macro (evaluated once, not benchmarked)
[cor(@view(aq[:, i]), @view(aq[:, i+1])) for i in 1:2:7]

# Code for exercise 4.2

"""
    dice_distribution(dice1, dice2)

Return a `Dict{Int, Int}` that maps every sum `i + j`, with `i` taken from
`dice1` and `j` taken from `dice2`, to the number of `(i, j)` pairs producing
that sum.

`dice1` and `dice2` are iterables of integer face values. A face value that
occurs several times in a die is counted once per occurrence. If either die
is empty the returned dictionary is empty.
"""
function dice_distribution(dice1, dice2)
    distribution = Dict{Int, Int}()
    for i in dice1, j in dice2
        s = i + j
        # `get` yields 0 for a sum that has not been recorded yet, so new and
        # already-seen sums share a single update path
        distribution[s] = get(distribution, s, 0) + 1
    end
    return distribution
end

"""
    test_dice()

Search all six-sided dice whose first face is 1 and whose remaining five faces
are non-decreasing values taken from 2 to 11. Print every ordered pair of such
dice for which `dice_distribution` gives the same distribution of sums as two
dice with faces `1:6`. Each pair is printed on its own line, with the two dice
separated by a space.
"""
function test_dice()
    reference = dice_distribution(1:6, 1:6)

    # each face starts at the value of the previous one, so the faces of
    # every candidate are listed in non-decreasing order
    candidates = [[1, f2, f3, f4, f5, f6]
                  for f2 in 2:11
                  for f3 in f2:11
                  for f4 in f3:11
                  for f5 in f4:11
                  for f6 in f5:11]

    for first_die in candidates
        for second_die in candidates
            if dice_distribution(first_die, second_die) == reference
                println(first_die, " ", second_die)
            end
        end
    end
end

# run the search; every matching pair of dice is printed on its own line
test_dice()

# Code for exercise 4.3

# Combine four scatter plots (y against x for `set1` ... `set4` of `data`),
# each without a legend, into one figure.
# `data`, `plot` and `scatter` are expected to come from the session
# (presumably Plots.jl -- confirm against the chapter).
plot(scatter(data.set1.x, data.set1.y; legend=false),
     scatter(data.set2.x, data.set2.y; legend=false),
     scatter(data.set3.x, data.set3.y; legend=false),
     scatter(data.set4.x, data.set4.y; legend=false))

# Code for exercise 5.1

# broadcast `parse` over the vector so that every string is converted to an Int
parse.(Int, ["1", "2", "3"])

# Code for exercise 5.2

# fix the seed of the random number generator for reproducibility
Random.seed!(1234);
# 200×5 matrix: the first 100 rows are normal draws shifted by -0.4,
# the last 100 rows are normal draws shifted by +0.4
data5bis = [randn(100, 5) .- 0.4; randn(100, 5) .+ 0.4];
# `manifold` is expected to be defined in the session
# (presumably scikit-learn's `manifold` module -- confirm against the chapter)
tsne = manifold.TSNE(n_components=2, init="random",
                     learning_rate="auto", random_state=1234);
data2bis = tsne.fit_transform(data5bis);
# plot the two columns of the result; points from the first 100 rows are black,
# points from the last 100 rows are gold
scatter(data2bis[:, 1], data2bis[:, 2];
        color=[fill("black", 100); fill("gold", 100)],
        legend=false)

# Code for exercise 6.1

# frequency table of the values in `years` (`years` comes from the session)
years_table = freqtable(years)
# plot the counts against the names along the first dimension of the table
plot(names(years_table, 1), years_table; legend=false,
           xlabel="year", ylabel="# of movies")


# Code for exercise 6.2

# `s1` and `s2` come from the session; `s3` holds the elements of `s1`
# converted to `Symbol`s
s3 = Symbol.(s1)
# time sorting of the Symbol vector
@btime sort($s3);
# time `unique` on each of the three representations
@btime unique($s1);
@btime unique($s2);
@btime unique($s3);

# Code for exercise 7.1

# Three ways of parsing strings as Int while keeping `missing` entries as `missing`.
v = ["1", "2", missing, "4"]
# 1. comprehension with an explicit `ismissing` check
[ismissing(x) ? missing : parse(Int, x) for x in v]
# 2. `map` with a do-block performing the same check
map(v) do x
    if ismissing(x)
        return missing
    else
        return parse(Int, x)
    end
end
# 3. `passmissing` wraps `parse` so that it returns `missing` when given a
#    `missing` argument; the wrapped function is then broadcast over `v`
using Missings
passmissing(parse).(Int, v)

# Code for exercise 7.2

using Dates
# range of dates from 2021-01-01 to 2021-12-01 in steps of one month
Date(2021, 1, 1):Month(1):Date(2021, 12, 1)
# the same range materialized as a Vector
collect(Date(2021, 1, 1):Month(1):Date(2021, 12, 1))

# Code for exercise 7.3

# request covering the range 2020-06-01 to 2020-06-30 in a single query,
# with the response requested in JSON format
query2 = "https://api.nbp.pl/api/exchangerates/rates/a/usd/" *
         "2020-06-01/2020-06-30/?format=json";
response2 = HTTP.get(query2);
json2 = JSON3.read(response2.body)
# extract the `mid` and `effectiveDate` fields of every entry in `rates`
rates2 = [x.mid for x in json2.rates]
dates2 = [Date(x.effectiveDate) for x in json2.rates]
# `rates` and `dates` come from the session; keep only the positions at which
# `rates` is not `missing` and compare them with the freshly fetched values
has_rate = rates .!== missing
rates2 == rates[has_rate]
dates2 == dates[has_rate]

# Code for exercise 8.1

using BenchmarkTools
# time access to the "Rating" column of `puzzles` (from the session) using
# property-access syntax with a string column name
@btime $puzzles."Rating";

# Code for exercise 9.1

using StatsBase
# summary statistics of "NbPlays" for the rows whose Popularity equals 100 ...
summarystats(puzzles[puzzles.Popularity .== 100, "NbPlays"])
# ... and for the rows whose Popularity equals -100
summarystats(puzzles[puzzles.Popularity .== -100, "NbPlays"])

# Code for exercise 9.2

# total number of elements stored across all values of `rating_mapping` ...
sum(length, values(rating_mapping))
# ... shown next to the number of rows of `good`
# (presumably the two numbers are expected to match -- confirm against the chapter)
nrow(good)

# Code for exercise 9.3

# fit a LOESS model with the `span` parameter set to 0.25
# (`ratings`, `mean_popularities` and `ratings_predict` come from the session)
model2 = loess(ratings, mean_popularities; span=0.25);
popularity_predict2 = predict(model2, ratings_predict);
# `plot!` adds the prediction line to the current plot
plot!(ratings_predict, popularity_predict2; width=5, color="yellow");

# Code for exercise 10.1

using BenchmarkTools
x = rand(10^6);
# construct a data frame with the default handling of the passed column ...
@btime DataFrame(x=$x);
# ... and with `copycols=false`, which asks the constructor not to copy `x`
@btime DataFrame(x=$x; copycols=false);

# Code for exercise 10.2

# two data frames with the same column names listed in a different order
df1 = DataFrame(a=1,b=2)
df2 = DataFrame(b=3, a=4)
# default behavior of `vcat` for these inputs
vcat(df1, df2)
# NOTE(review): `cols=:orderequal` requires the columns to be in the same order,
# so this call is expected to throw -- confirm against the DataFrames.jl documentation
vcat(df1, df2, cols=:orderequal)

# Code for exercise 10.3

"""
    walk_unique_2ahead()

Simulate a walk that starts in the row `(x=0, y=0)` and is extended 10 times
with the result of `sim_step` applied to the current last row. Return `true`
when no row of the walk is equal to the row two positions after it.
"""
function walk_unique_2ahead()
    path = DataFrame(x=0, y=0)
    for _ in 1:10
        # the next row is derived from the last row recorded so far
        push!(path, sim_step(path[end, :]))
    end
    # the walk has 11 rows, so rows 1..9 each have a row two positions ahead
    return all(i -> path[i, :] != path[i+2, :], 1:9)
end
# fix the seed of the random number generator for reproducibility
Random.seed!(2);
# proportions of `true` and `false` results over 10^5 simulated walks
proptable([walk_unique_2ahead() for _ in 1:10^5])

# Code for exercise 11.1

# Each operation is timed twice: the first run includes compilation,
# the second run shows the cost once the code is compiled.
# create a data frame with 1 row and 10,000 automatically named columns
@time wide = DataFrame(ones(1, 10_000), :auto);
@time wide = DataFrame(ones(1, 10_000), :auto);
# convert the wide data frame with `Tables.columntable`
@time Tables.columntable(wide);
@time Tables.columntable(wide);

# Code for exercise 11.2

using Statistics
# dictionary mapping the `city` value of each group key of `gdf_city`
# (from the session) to the mean of the group's `rainfall` column
Dict(key.city => mean(df.rainfall) for (key, df) in pairs(gdf_city))
# the same per-group mean computed with `combine`
combine(gdf_city, :rainfall => mean)

# Code for exercise 12.1

# complete graph on 37700 nodes
# (`complete_graph` presumably comes from Graphs.jl -- confirm against the chapter)
cg = complete_graph(37700)
# memory, in bytes, used by all objects reachable from `cg`
Base.summarysize(cg)
# time `deg_class` (defined in the session) on this graph
@time deg_class(cg, classes_df.ml_target);

# Code for exercise 12.2

# Scatter plot of `log1p` of the two degree columns of `agg_df`, with the
# marker color taken from `agg_df.web_mean`. Tick positions are produced by
# `gen_ticks` (defined in the session) from the maxima of the corresponding
# columns of `classes_df`.
scatter(log1p.(agg_df.deg_ml),
        log1p.(agg_df.deg_web);
        zcolor=agg_df.web_mean,
        xlabel="degree ml", ylabel="degree web",
        markersize=2, markerstrokewidth=0.5, markeralpha=0.8,
        legend=:topleft, labels = "fraction web",
        xticks=gen_ticks(maximum(classes_df.deg_ml)),
        yticks=gen_ticks(maximum(classes_df.deg_web)))

# Code for exercise 12.3

# model of `ml_target` on `log1p` of both degree columns of `classes_df`,
# fitted with a binomial response and a probit link
glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)),
    classes_df, Binomial(), ProbitLink())

# Code for exercise 12.4

# Demonstration of column aliasing; the order of the statements matters.
df = DataFrame()
df.a = [1, 2, 3]
# assign the vector stored as column `a` to column `b` as well ...
df.b = df.a
# ... and check whether both columns are the very same object
df.b === df.a
# `df[:, "b"]` returns a copy of the column, which replaces column `b`
df.b = df[:, "b"]
# identity check followed by an equality check of the contents
df.b === df.a
df.b == df.a
# set the first two elements of column `a` to 10 ...
df[1:2, "a"] .= 10
# ... and display the data frame to see which columns were affected
df

# Code for exercise 13.1

# Row-wise selection from `owensboro` (`@rselect` presumably comes from
# DataFramesMeta.jl -- confirm against the chapter). The result has:
#   arrest  - taken from `arrest_made`,
#   day     - `dayofweek` of `date`,
#   type    - kept as is,
#   v1..v4  - whether `violation` contains the 1st..4th element of `agg_violation.v`.
@rselect(owensboro,
    :arrest = :arrest_made,
    :day = dayofweek(:date),
    :type,
    :v1 = contains(:violation, agg_violation.v[1]),
    :v2 = contains(:violation, agg_violation.v[2]),
    :v3 = contains(:violation, agg_violation.v[3]),
    :v4 = contains(:violation, agg_violation.v[4]))

# Code for exercise 13.2

# Selection written with `select` and `source => function => target` pairs.
# The comprehension generates the four `violation` transformations named
# "v1" .. "v4"; the last pair adds a `dayname` column computed from `date`.
select(owensboro,
    :arrest_made => :arrest,
    :date => ByRow(dayofweek) => :day,
    :type,
    [:violation =>
     ByRow(x -> contains(x, agg_violation.v[i])) =>
     "v$i" for i in 1:4],
    :date => ByRow(dayname) => :dayname)

# Code for exercise 13.3

# mean of `arrest` per `dayname`, with the groups in sorted order
@chain owensboro2 begin
    groupby(:dayname, sort=true)
    combine(:arrest => mean)
end

# mean of `arrest` per (`dayname`, `type`) combination, reshaped to wide
# format: one row per `dayname`, one column per value of `type`, filled with
# the values of `arrest_mean`
@chain owensboro2 begin
    groupby([:dayname, :type], sort=true)
    combine(:arrest => mean)
    unstack(:dayname, :type, :arrest_mean)
end

# Code for exercise 13.4

# split by Boolean indexing: rows where `train` is true / false
train2 = owensboro2[owensboro2.train, :]
test2 = owensboro2[.!owensboro2.train, :]
# split by grouping: with `sort=true` the `false` group precedes the `true`
# group, hence the test set is unpacked first
test3, train3 = groupby(owensboro2, :train, sort=true)

# Code for exercise 14.1

# Each variant is timed three times so that the first run can be compared
# with the repeated ones.
# variant 1: anonymous function written inline in every call
@time mean(x -> x < 0, -10^6:10^6)
@time mean(x -> x < 0, -10^6:10^6)
@time mean(x -> x < 0, -10^6:10^6)
# variant 2: `<(0)`, the partially applied form of `<`
@time mean(<(0), -10^6:10^6)
@time mean(<(0), -10^6:10^6)
@time mean(<(0), -10^6:10^6)

# variant 3: a named function defined once and reused
lt0(x) = x < 0
@time mean(lt0, -10^6:10^6)
@time mean(lt0, -10^6:10^6)
@time mean(lt0, -10^6:10^6)

# Code for exercise 14.2

# web service code

using Genie
# NOTE(review): presumably keeps the process alive while serving requests --
# confirm against the Genie documentation
Genie.config.run_as_server = true
# handler for POST requests sent to the root path
Genie.Router.route("/", method=POST) do
    # the request body parsed as JSON
    message = Genie.Requests.jsonpayload()
    return try
        # respond with `rand(n)` rendered as JSON, where `n` is read from the payload
        n = message["n"]
        Genie.Renderer.Json.json(rand(n))
    catch
        # any failure above (for example a payload without "n") yields status 400
        Genie.Responses.setstatus(400)
    end
end
# start the web server
Genie.Server.up()

# client code

using HTTP
using JSON3
# send a JSON payload with the field `n` set to 3 to the local service
req = HTTP.post("http://127.0.0.1:8000",
                ["Content-Type" => "application/json"],
                JSON3.write((n=3,)))
# parse the JSON body of the response
JSON3.read(req.body)
# send a payload that has the field `x` instead of `n`
# NOTE(review): the service answers such a request with status 400, which
# HTTP.jl presumably reports by throwing an error -- confirm
HTTP.post("http://127.0.0.1:8000",
                ["Content-Type" => "application/json"],
                JSON3.write((x=3,)))