From a7f0c90b4638690ce589522bfcc99347172552fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?=
Date: Mon, 14 Feb 2022 16:37:49 +0100
Subject: [PATCH] add chapter 11

---
Review note (not part of the commit message; git ignores the text between
the three-dash separator above and the first file diff below):
  * appB.jl: adds the solution code for exercise 11.2 (mean rainfall per
    city, computed once with a Dict comprehension and once with combine).
    It reads gdf_city, which is created by the section 11.2 code in ch11.jl.
  * ch11.jl: moves the deserialization of walk.bin from section 11.1.1 to
    the top of section 11.1 and loads DataFrames there; appends the section
    11.2 examples (grouping a data frame, group-key lookup, benchmarks of
    the lookup variants, iteration over groups).
  * The benchmark part builds a data frame with 10^8 rows.

 appB.jl |  6 +++++
 ch11.jl | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/appB.jl b/appB.jl
index 42c4ecd..5b0755d 100644
--- a/appB.jl
+++ b/appB.jl
@@ -165,6 +165,12 @@ proptable([walk_unique_2ahead() for _ in 1:10^5])
 @time wide = DataFrame(ones(1, 10_000), :auto);
 @time Tables.columntable(wide);
 
+# Code for exercise 11.2
+
+using Statistics
+Dict(key.city => mean(df.rainfall) for (key, df) in pairs(gdf_city))
+combine(gdf_city, :rainfall => mean)
+
 # Code for exercise 12.1
 
 cg = complete_graph(37700)
diff --git a/ch11.jl b/ch11.jl
index 565b295..24cc580 100644
--- a/ch11.jl
+++ b/ch11.jl
@@ -4,6 +4,12 @@
 
 # Code for section 11.1
 
+# deserialization of source data frame
+
+using DataFrames
+using Serialization
+walk = deserialize("walk.bin")
+
 # Code for a note on conversion
 
 x = [1.5]
@@ -12,9 +18,6 @@ x
 
 # Code from section 11.1.1
 
-using Serialization
-walk = deserialize("walk.bin")
-
 Matrix(walk)
 Matrix{Any}(walk)
 Matrix{String}(walk)
@@ -83,3 +86,69 @@ identity.(eachcol(walk))
 
 df = DataFrame(x=1:2, b=["a", "b"])
 identity.(eachcol(df))
+
+# Code from section 11.2
+
+using CSV
+raw_data = """
+city,date,rainfall
+Olecko,2020-11-16,2.9
+Olecko,2020-11-17,4.1
+Olecko,2020-11-19,4.3
+Olecko,2020-11-20,2.0
+Olecko,2020-11-21,0.6
+Olecko,2020-11-22,1.0
+Ełk,2020-11-16,3.9
+Ełk,2020-11-19,1.2
+Ełk,2020-11-20,2.0
+Ełk,2020-11-22,2.0
+""";
+rainfall_df = CSV.read(IOBuffer(raw_data), DataFrame)
+
+gdf_city = groupby(rainfall_df, "city")
+
+gdf_city_date = groupby(rainfall_df, Not("rainfall"))
+
+keys(gdf_city_date)
+
+gk1 = keys(gdf_city_date)[1]
+g1_t = Tuple(gk1)
+g1_nt = NamedTuple(gk1)
+g1_dict = Dict(gk1)
+
+gdf_city_date[1]
+gdf_city_date[gk1]
+gdf_city_date[g1_t]
+gdf_city_date[g1_nt]
+gdf_city_date[g1_dict]
+
+gdf_city[("Olecko",)]
+gdf_city[(city="Olecko",)]
+
+using BenchmarkTools
+bench_df = DataFrame(id=1:10^8);
+bench_gdf = groupby(bench_df, :id);
+@btime groupby($bench_df, :id);
+bench_i = 1_000_000;
+bench_gk = keys(bench_gdf)[bench_i];
+bench_t = Tuple(bench_gk);
+bench_nt = NamedTuple(bench_gk);
+bench_dict = Dict(bench_gk);
+@btime $bench_gdf[$bench_i];
+@btime $bench_gdf[$bench_gk];
+@btime $bench_gdf[$bench_t];
+@btime $bench_gdf[$bench_nt];
+@btime $bench_gdf[$bench_dict];
+
+gdf_city[[2, 1]]
+gdf_city[[1]]
+
+[nrow(df) for df in gdf_city]
+
+for p in pairs(gdf_city)
+    println(p)
+end
+
+Dict(key.city => nrow(df) for (key, df) in pairs(gdf_city))
+
+combine(gdf_city, nrow)