# JuliaForDataAnalysis/ch08.jl
# Bogumił Kamiński, 2022

# Code for chapter 8

# Code for section 8.1

import Downloads

# Fetch the compressed puzzle archive only when it is not already in the
# working directory, so re-running the script does not download it again.
if isfile("puzzles.csv.bz2")
    @info "file already present"
else
    @info "fetching file"
    Downloads.download("https://database.lichess.org/" *
                       "lichess_db_puzzle.csv.bz2",
                       "puzzles.csv.bz2")
end

using CodecBzip2

# Load the whole archive into memory as raw bytes and decompress it in one call.
compressed = read("puzzles.csv.bz2")
plain = transcode(Bzip2Decompressor, compressed)

# Write a header line naming the nine columns, followed by the decompressed
# bytes, to "puzzles.csv". The `do` block closes the file when it finishes.
open("puzzles.csv", "w") do io
    println(io, "PuzzleId,FEN,Moves,Rating,RatingDeviation," *
                "Popularity,NbPlays,Themes,GameUrl")
    write(io, plain)
end

# Read the file back as a vector of lines to inspect what was written.
readlines("puzzles.csv")

# Code for section 8.2

using CSV
using DataFrames

# Parse the CSV file written in section 8.1 into a data frame; the trailing
# semicolon suppresses printing of the result in the REPL.
puzzles = CSV.read("puzzles.csv", DataFrame);

# Parse the in-memory decompressed bytes instead of the file. These bytes do
# not include the header line that was prepended when writing "puzzles.csv",
# so the same column names are passed explicitly through `header`.
puzzles2 = CSV.read(plain, DataFrame;
                    header=["PuzzleId", "FEN", "Moves",
                            "Rating", "RatingDeviation",
                            "Popularity", "NbPlays",
                            "Themes", "GameUrl"]);

# Compare the data frames produced by the two ways of reading.
puzzles == puzzles2

# Drop the references to the large byte buffers so they can be garbage collected.
compressed = nothing
plain = nothing

# Code for listing 8.1

# Evaluate the data frame on its own so the REPL displays it.
puzzles

# Code for listing 8.2

# Per-column summary of the data frame.
describe(puzzles)

# Code for getting basic information about a data frame

# Number of columns, number of rows, and the column names.
ncol(puzzles)

nrow(puzzles)

names(puzzles)

# Write the data frame back to CSV, read the new file as raw bytes, and check
# whether it is byte-for-byte identical to the file it was originally read from.
CSV.write("puzzles2.csv", puzzles)
read("puzzles2.csv")
read("puzzles2.csv") == read("puzzles.csv")

# Code for section 8.3

# Access a column with property (dot) syntax.
puzzles.Rating

# Time the column access; `$` interpolates the global `puzzles` into the
# benchmark so the lookup of the global itself is not part of the measurement.
using BenchmarkTools
@btime $puzzles.Rating;

# Contrast value equality (`==`) with object identity (`===`) for the column
# itself and for freshly made copies of it.
puzzles.Rating == copy(puzzles.Rating)

puzzles.Rating === copy(puzzles.Rating)

puzzles.Rating === puzzles.Rating

copy(puzzles.Rating) === copy(puzzles.Rating)

# The same column accessed with the name written as a string literal.
puzzles."Rating"

# Column name kept in a variable; it is used by the indexing examples below.
col = "Rating"

# data_frame_name[selected_rows, selected_columns]

# Select one column with `:` as the row selector; the column is identified by
# a string, a Symbol, an integer position, or a variable holding its name.
# NOTE(review): DataFrames.jl documents `:` as returning a copy of the column
# and `!` as returning the stored column without copying -- confirm against
# the indexing documentation.
puzzles[:, "Rating"]
puzzles[:, :Rating]
puzzles[:, 4]
puzzles[:, col]

# Look up the position of a column by name.
columnindex(puzzles, "Rating")

# Same lookup with a name that is not one of the nine column names used above.
columnindex(puzzles, "Some fancy column name")

# Check for the presence of a column by name.
hasproperty(puzzles, "Rating")
hasproperty(puzzles, "Some fancy column name")

# Time `[:, :Rating]` for comparison with the `puzzles.Rating` benchmark.
@btime $puzzles[:, :Rating];

# The same four column selectors, this time with `!` as the row selector.
puzzles[!, "Rating"]
puzzles[!, :Rating]
puzzles[!, 4]
puzzles[!, col]

using Plots

# Combine one histogram per numeric column into a single figure, each labelled
# with the name of the column it shows.
plot(histogram(puzzles.Rating, label="Rating"),
     histogram(puzzles.RatingDeviation, label="RatingDeviation"),
     histogram(puzzles.Popularity, label="Popularity"),
     histogram(puzzles.NbPlays, label="NbPlays"))

# Same figure as above, built by looping over the column names and splatting
# the resulting histograms into `plot`. The loop variable is named `colname`
# so it is not confused with the global `col` defined earlier in the script.
plot([histogram(puzzles[!, colname]; label=colname) for
      colname in ["Rating", "RatingDeviation",
                  "Popularity", "NbPlays"]]...)

# Code for section 8.4
# Code for Arrow examples
using Arrow
# Save the data frame to "puzzles.arrow" and open that file as an Arrow table.
Arrow.write("puzzles.arrow", puzzles)
arrow_table = Arrow.Table("puzzles.arrow")
# Build a data frame from the Arrow table (the trailing `;` suppresses REPL
# output) and compare it with the original data frame.
puzzles_arrow = DataFrame(arrow_table);
puzzles_arrow == puzzles
puzzles_arrow.PuzzleId
# NOTE(review): this in-place write is presumably meant to fail because the
# column still comes from the Arrow table -- confirm against the Arrow.jl docs.
# If it does throw, running this file non-interactively stops at this line.
puzzles_arrow.PuzzleId[1] = "newID"
# Rebind the name to a copy of the data frame and look at the column again.
puzzles_arrow = copy(puzzles_arrow);
puzzles_arrow.PuzzleId
# Code for SQLite examples
using SQLite

# Open the database file "puzzles.db" and store the data frame in it as a
# table named "puzzles".
db = SQLite.DB("puzzles.db")
SQLite.load!(puzzles, db, "puzzles")

# Inspect what the database now contains: its tables and the columns of "puzzles".
SQLite.tables(db)
SQLite.columns(db, "puzzles")

# Read the whole table back and materialize the query result as a data frame
# (the trailing `;` suppresses REPL output), then compare with the original.
query = DBInterface.execute(db, "SELECT * FROM puzzles")
puzzles_db = DataFrame(query);
puzzles_db == puzzles
puzzles_db.PuzzleId

# Close the database handle once the data has been copied into `puzzles_db`,
# so the connection to "puzzles.db" is not left open for the rest of the session.
close(db)