# Bogumił Kamiński, 2022

# Codes for chapter 11

# Code for section 11.1

# Deserialization of the source data frame.
# `walk` is read from "walk.bin", resolved relative to the current working
# directory. DataFrames is loaded before deserializing, presumably because the
# stored object is a data frame (the file contents are not visible here).
# NOTE(review): `deserialize` should only be pointed at trusted files, and a
# file written by a different Julia/package version may fail to load.

using DataFrames
using Serialization
walk = deserialize("walk.bin")

# Code for a note on conversion

# `x` is a Vector{Float64}; storing the integer 1 into it converts the value
# to 1.0 on assignment, so the vector keeps its element type.
x = [1.5]
x[1] = 1
x

# Code for section 11.1.1

# Convert the data frame to a matrix: first with the element type chosen
# automatically, then with an explicitly requested element type.
Matrix(walk)
Matrix{Any}(walk)
# NOTE(review): presumably throws unless every value stored in `walk` can be
# converted to String; the contents of walk.bin are not visible here.
Matrix{String}(walk)

using Plots
# NOTE(review): passes the data frame itself to `plot`; compare with the call
# below, which converts to a matrix first. Confirm whether this call is meant
# to demonstrate a failure.
plot(walk)

# `["x" "y"]` is a 1×2 row matrix, i.e. one label per matrix column.
plot(Matrix(walk); labels=["x" "y"], legend=:topleft)

# Code for section 11.1.2

# Convert the data frame to a Tables.jl column table.
Tables.columntable(walk)

# Provides the `@btime` macro used by the timing comparisons in this section.
using BenchmarkTools
"""
    mysum(table)

Return the sum of the values in `table.x`, accumulated left to right starting
from the integer `0`.

`table` can be any object that has an `x` property holding an iterable of
addable values (the script calls it with a data frame and with a column table).
An empty `x` yields the integer `0`.
"""
function mysum(table)
    s = 0  # integer start value; the result type follows from `0 + values`
    # The column is looked up and iterated inside this same function. This is
    # deliberate: the script times this version against `mysum2`, which hands
    # the column to a separate function before looping.
    for v in table.x
        s += v
    end
    return s
end

# Time `mysum` on a data frame with one million rows ...
df = DataFrame(x=1:1_000_000);
@btime mysum($df)

# ... and on the same data converted to a column table.
tab = Tables.columntable(df);
@btime mysum($tab)

# Show what the compiler infers inside `mysum` for each input type.
@code_warntype mysum(df)

@code_warntype mysum(tab)

# Display the type of the column table.
typeof(tab)

"""
    barrier_mysum2(x)

Return the sum of the elements of `x`, accumulated left to right starting from
the integer `0` (so an empty `x` yields `0`).

This is the kernel called by `mysum2`: it receives the column itself as its
argument instead of the table that holds it.
"""
function barrier_mysum2(x)
    s = 0
    for v in x
        s += v
    end
    return s
end

"""
    mysum2(table)

Return the sum of the values in `table.x`.

The column is extracted here and the summation loop itself runs in
`barrier_mysum2`, which receives the column as its argument.
"""
mysum2(table) = barrier_mysum2(table.x)
# `df` is still the one-million-row data frame at this point.
@btime mysum2($df)

# `unique` on a data frame ...
df = DataFrame(a=[1, 1, 2], b=[1, 1, 2])
unique(df)

# ... versus `unique` on the column table built from it. A column table is a
# NamedTuple of column vectors, so `unique` here works on the columns as
# elements rather than on rows.
tab = Tables.columntable(df)
unique(tab)

# Code for section 11.1.3

# Convert the data frame to a Tables.jl row table.
Tables.rowtable(walk)

# Iterate over the rows through Tables.jl's named-tuple iterator and print
# each one.
nti = Tables.namedtupleiterator(walk)
for v in nti
    println(v)
end

# Row-wise and column-wise iterators of the data frame; both are indexed with
# integers and `end` below.
er = eachrow(walk)
er[1]
er[end]
ec = eachcol(walk)
ec[1]
ec[end]

# Broadcasting `identity` collects the columns into a new vector.
identity.(eachcol(walk))

# The same operation on a data frame whose columns have different element
# types (integers and strings).
df = DataFrame(x=1:2, b=["a", "b"])
identity.(eachcol(df))

# Code for section 11.2

using CSV
# In-memory CSV source with columns city, date, rainfall. The dates are not
# contiguous and the two cities do not cover the same set of dates.
raw_data = """
city,date,rainfall
Olecko,2020-11-16,2.9
Olecko,2020-11-17,4.1
Olecko,2020-11-19,4.3
Olecko,2020-11-20,2.0
Olecko,2020-11-21,0.6
Olecko,2020-11-22,1.0
Ełk,2020-11-16,3.9
Ełk,2020-11-19,1.2
Ełk,2020-11-20,2.0
Ełk,2020-11-22,2.0
""";
# Wrap the string in an IOBuffer so CSV.read can parse it like a file.
rainfall_df = CSV.read(IOBuffer(raw_data), DataFrame)

# Group by the "city" column only.
gdf_city = groupby(rainfall_df, "city")

# Group by every column except "rainfall", i.e. by city and date.
gdf_city_date = groupby(rainfall_df, Not("rainfall"))

# The keys identifying each group.
keys(gdf_city_date)

# Take the first group key and convert it to a tuple, a named tuple, and a
# dictionary.
gk1 = keys(gdf_city_date)[1]
g1_t = Tuple(gk1)
g1_nt = NamedTuple(gk1)
g1_dict = Dict(gk1)

# Look up the same group by integer position and by each key representation.
gdf_city_date[1]
gdf_city_date[gk1]
gdf_city_date[g1_t]
gdf_city_date[g1_nt]
gdf_city_date[g1_dict]

# Look up a group by the value of its grouping column, given as a one-element
# tuple or as a one-element named tuple.
gdf_city[("Olecko",)]
gdf_city[(city="Olecko",)]

using BenchmarkTools
# NOTE: 10^8 integer ids make a column of roughly 0.8 GB on a 64-bit machine,
# before counting what `groupby` allocates; every row is its own group.
bench_df = DataFrame(id=1:10^8);
bench_gdf = groupby(bench_df, :id);
# Time the grouping itself.
@btime groupby($bench_df, :id);
# Prepare one group's identifier in every supported form ...
bench_i = 1_000_000;
bench_gk = keys(bench_gdf)[bench_i];
bench_t = Tuple(bench_gk);
bench_nt = NamedTuple(bench_gk);
bench_dict = Dict(bench_gk);
# ... and time the lookup of that group by integer index, group key, tuple,
# named tuple, and dictionary.
@btime $bench_gdf[$bench_i];
@btime $bench_gdf[$bench_gk];
@btime $bench_gdf[$bench_t];
@btime $bench_gdf[$bench_nt];
@btime $bench_gdf[$bench_dict];

# Indexing with a vector of positions selects (and orders) several groups.
gdf_city[[2, 1]]
gdf_city[[1]]

# Iterate over the groups; `df` here is local to the comprehension and does
# not rebind the global `df`.
[nrow(df) for df in gdf_city]

# `pairs` yields each group key together with its group.
for p in pairs(gdf_city)
    println(p)
end

# Map each city name to the number of rows in its group.
Dict(key.city => nrow(df) for (key, df) in pairs(gdf_city))

# The per-group row count computed with `combine`.
combine(gdf_city, nrow)