diff --git a/Manifest.toml b/Manifest.toml index ed827c9..ef0f320 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -11,13 +11,19 @@ version = "1.1.0" [[deps.Adapt]] deps = ["LinearAlgebra"] -git-tree-sha1 = "9faf218ea18c51fcccaf956c8d39614c9d30fe8b" +git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.2" +version = "3.3.3" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +[[deps.ArnoldiMethod]] +deps = ["LinearAlgebra", "Random", "StaticArrays"] +git-tree-sha1 = "62e51b39331de8911e4a7ff6f5aaf38a5f4cc0ae" +uuid = "ec485272-7323-5ecc-a04f-4719b315124d" +version = "0.2.0" + [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -49,9 +55,9 @@ version = "1.0.8+0" [[deps.CSV]] deps = ["CodecZlib", "Dates", "FilePathsBase", "InlineStrings", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode", "WeakRefStrings"] -git-tree-sha1 = "49f14b6c56a2da47608fe30aed711b5882264d7a" +git-tree-sha1 = "9519274b50500b8029973d241d32cfbf0b127d97" uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" -version = "0.9.11" +version = "0.10.2" [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] @@ -65,11 +71,16 @@ git-tree-sha1 = "c308f209870fdbd84cb20332b6dfaf14bf3387f8" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" version = "0.10.2" +[[deps.Chain]] +git-tree-sha1 = "339237319ef4712e6e5df7758d0bccddf5c237d9" +uuid = "8be319e6-bccf-4806-a6f7-6fae938471bc" +version = "0.4.10" + [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4c26b4e9e91ca528ea212927326ece5918a04b47" +git-tree-sha1 = "54fc4400de6e5c3e27be6047da2ef6ba355511f8" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.11.2" +version = "1.11.6" [[deps.ChangesOfVariables]] deps = ["ChainRulesCore", 
"LinearAlgebra", "Test"] @@ -91,9 +102,9 @@ version = "0.7.0" [[deps.ColorSchemes]] deps = ["ColorTypes", "Colors", "FixedPointNumbers", "Random"] -git-tree-sha1 = "a851fec56cb73cfdf43762999ec72eff5b86882a" +git-tree-sha1 = "6b6f04f93710c71550ec7e16b650c1b9a612d0b6" uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -version = "3.15.0" +version = "3.16.0" [[deps.ColorTypes]] deps = ["FixedPointNumbers", "Random"] @@ -135,9 +146,9 @@ uuid = "587fd27a-f159-11e8-2dae-1979310e6154" version = "0.2.7" [[deps.Crayons]] -git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.0.4" +version = "4.1.1" [[deps.DataAPI]] git-tree-sha1 = "cc70b17275652eb47bc9e5f81635981f13cea5c8" @@ -152,9 +163,15 @@ version = "0.7.7" [[deps.DataFrames]] deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "cfdfef912b7f93e4b848e80b9befdf9e331bc05a" +git-tree-sha1 = "ae02104e835f219b8930c7664b8012c93475c340" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.1" +version = "1.3.2" + +[[deps.DataFramesMeta]] +deps = ["Chain", "DataFrames", "MacroTools", "OrderedCollections", "Reexport"] +git-tree-sha1 = "ab4768d2cc6ab000cd0cec78e8e1ea6b03c7c3e2" +uuid = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" +version = "0.10.0" [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] @@ -193,9 +210,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[deps.Distributions]] deps = ["ChainRulesCore", "DensityInterface", "FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns", "Test"] -git-tree-sha1 = "6a8dc9f82e5ce28279b6e3e2cea9421154f5bd0d" 
+git-tree-sha1 = "5863b0b10512ed4add2b5ec07e335dc6121065a5" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.37" +version = "0.25.41" [[deps.DocStringExtensions]] deps = ["LibGit2"] @@ -291,21 +308,21 @@ version = "3.3.5+1" [[deps.GLM]] deps = ["Distributions", "LinearAlgebra", "Printf", "Reexport", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns", "StatsModels"] -git-tree-sha1 = "f564ce4af5e79bb88ff1f4488e64363487674278" +git-tree-sha1 = "fb764dacfa30f948d52a6a4269ae293a479bbc62" uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a" -version = "1.5.1" +version = "1.6.1" [[deps.GR]] -deps = ["Base64", "DelimitedFiles", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Printf", "Random", "Serialization", "Sockets", "Test", "UUIDs"] -git-tree-sha1 = "30f2b340c2fff8410d89bfcdc9c0a6dd661ac5f7" +deps = ["Base64", "DelimitedFiles", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Printf", "Random", "RelocatableFolders", "Serialization", "Sockets", "Test", "UUIDs"] +git-tree-sha1 = "4a740db447aae0fbeb3ee730de1afbb14ac798a1" uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" -version = "0.62.1" +version = "0.63.1" [[deps.GR_jll]] deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Pkg", "Qt5Base_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "f97acd98255568c3c9b416c5a3cf246c1315771b" +git-tree-sha1 = "aa22e1ee9e722f1da183eb33370df4c1aeb6c2cd" uuid = "d2c73de3-f751-5644-a686-071e5b155ba9" -version = "0.63.0+0" +version = "0.63.1+0" [[deps.GeometryBasics]] deps = ["EarCut_jll", "IterTools", "LinearAlgebra", "StaticArrays", "StructArrays", "Tables"] @@ -331,6 +348,12 @@ git-tree-sha1 = "344bf40dcab1073aca04aa0df4fb092f920e4011" uuid = "3b182d85-2403-5c21-9c21-1e1f0cc25472" version = "1.3.14+0" +[[deps.Graphs]] +deps = ["ArnoldiMethod", "Compat", "DataStructures", "Distributed", "Inflate", "LinearAlgebra", "Random", 
"SharedArrays", "SimpleTraits", "SparseArrays", "Statistics"] +git-tree-sha1 = "d727758173afef0af878b29ac364a0eca299fc6b" +uuid = "86223c79-3864-5bf0-83f7-82e725a168b6" +version = "1.5.1" + [[deps.Grisu]] git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2" uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" @@ -350,9 +373,14 @@ version = "2.8.1+1" [[deps.Impute]] deps = ["BSON", "CSV", "DataDeps", "Distances", "IterTools", "LinearAlgebra", "Missings", "NamedDims", "NearestNeighbors", "Random", "Statistics", "StatsBase", "TableOperations", "Tables"] -git-tree-sha1 = "4baf7c1120d33f925259c98dce84c35cd107aae3" +git-tree-sha1 = "d3fb6342d94030706ad31f05c23514962c29296c" uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575" -version = "0.6.7" +version = "0.6.8" + +[[deps.Inflate]] +git-tree-sha1 = "f5fc07d4e706b84f72d54eedcc1c13d92fb0871c" +uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +version = "0.1.2" [[deps.IniFile]] deps = ["Test"] @@ -398,9 +426,9 @@ version = "1.0.0" [[deps.JLLWrappers]] deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +git-tree-sha1 = "22df5b96feef82434b07327e2d3c770a9b21e023" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" +version = "1.4.0" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -608,9 +636,9 @@ uuid = "05823500-19ac-5b8b-9628-191a04bc5112" [[deps.OpenSSL_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "15003dcb7d8db3c6c857fda14891a539a8f2705a" +git-tree-sha1 = "648107615c15d4e09f7eca16307bc821c1f718d8" uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" -version = "1.1.10+0" +version = "1.1.13+0" [[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] @@ -643,9 +671,9 @@ version = "0.11.5" [[deps.Parsers]] deps = ["Dates"] -git-tree-sha1 = "d7fa6237da8004be601e19bd6666083056649918" +git-tree-sha1 = "92f91ba9e5941fc781fecf5494ac1da87bdac775" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = 
"2.1.3" +version = "2.2.0" [[deps.Pixman_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -665,15 +693,15 @@ version = "2.0.1" [[deps.PlotUtils]] deps = ["ColorSchemes", "Colors", "Dates", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "68604313ed59f0408313228ba09e79252e4b2da8" +git-tree-sha1 = "6f1b25e8ea06279b5689263cc538f51331d7ca17" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.1.2" +version = "1.1.3" [[deps.Plots]] deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "Unzip"] -git-tree-sha1 = "7eda8e2a61e35b7f553172ef3d9eaa5e4e76d92e" +git-tree-sha1 = "db7393a80d0e5bef70f2b518990835541917a544" uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -version = "1.25.3" +version = "1.25.6" [[deps.PooledArrays]] deps = ["DataAPI", "Future"] @@ -728,20 +756,26 @@ version = "1.2.1" [[deps.RecipesPipeline]] deps = ["Dates", "NaNMath", "PlotUtils", "RecipesBase"] -git-tree-sha1 = "7ad0dfa8d03b7bcf8c597f59f5292801730c55b8" +git-tree-sha1 = "37c1631cb3cc36a535105e6d5557864c82cd8c2b" uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" -version = "0.4.1" +version = "0.5.0" [[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" +[[deps.RelocatableFolders]] +deps = ["SHA", "Scratch"] +git-tree-sha1 = "cdbd3b1338c72ce29d9584fdbe9e9b70eeb5adca" +uuid = "05181044-ff0b-4ac5-8273-598c1e38db00" +version = "0.1.3" + [[deps.Requires]] deps = ["UUIDs"] -git-tree-sha1 = "8f82019e525f4d5c669692772a6f4b0a58b06a6a" +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.2.0" +version = "1.3.0" 
[[deps.Rmath]] deps = ["Random", "Rmath_jll"] @@ -766,9 +800,9 @@ version = "1.1.0" [[deps.SentinelArrays]] deps = ["Dates", "Random"] -git-tree-sha1 = "244586bc07462d22aed0113af9c731f2a518c93e" +git-tree-sha1 = "15dfe6b103c2a993be24404124b8791a09460983" uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" -version = "1.3.10" +version = "1.3.11" [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -788,6 +822,12 @@ git-tree-sha1 = "91eddf657aca81df9ae6ceb20b959ae5653ad1de" uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" version = "1.0.3" +[[deps.SimpleTraits]] +deps = ["InteractiveUtils", "MacroTools"] +git-tree-sha1 = "5d7e3f4e11935503d3ecaf7186eac40602e7d231" +uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" +version = "0.9.4" + [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -803,15 +843,15 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "f0bccf98e16759818ffc5d97ac3ebf87eb950150" +git-tree-sha1 = "e08890d19787ec25029113e88c34ec20cac1c91e" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.8.1" +version = "2.0.0" [[deps.StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "3c76dde64d03699e074ac02eb2e8ba8254d428da" +git-tree-sha1 = "2884859916598f974858ff01df7dfc6c708dd895" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.13" +version = "1.3.3" [[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -842,9 +882,9 @@ version = "0.6.28" [[deps.StructArrays]] deps = ["Adapt", "DataAPI", "StaticArrays", "Tables"] -git-tree-sha1 = "2ce41e0d042c60ecd131e9fb7154a3bfadbf50d3" +git-tree-sha1 = "d21f2c564b21a202f4677c0fba5b5ee431058544" uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" -version = "0.6.3" +version = "0.6.4" [[deps.StructTypes]] deps = ["Dates", "UUIDs"] @@ -1071,6 +1111,12 @@ git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" 
uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" version = "1.4.0+3" +[[deps.ZipFile]] +deps = ["Libdl", "Printf", "Zlib_jll"] +git-tree-sha1 = "3593e69e469d2111389a9bd06bac1f3d730ac6de" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.9.4" + [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" @@ -1105,9 +1151,9 @@ version = "1.6.38+0" [[deps.libvorbis_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"] -git-tree-sha1 = "c45f4e40e7aafe9d086379e5578947ec8b95a8fb" +git-tree-sha1 = "b910cb81ef3fe6e78bf6acee440bda86fd6ae00c" uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" -version = "1.3.7+0" +version = "1.3.7+1" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] diff --git a/Project.toml b/Project.toml index 5b505db..3ec72f1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,13 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" @@ -14,3 +17,4 @@ Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" diff --git a/appB.jl b/appB.jl index 131cdab..2bbfdb1 100644 --- a/appB.jl +++ b/appB.jl @@ -155,3 +155,37 @@ proptable([walk_unique_2ahead() for _ in 1:10^5]) @time wide = DataFrame(ones(1, 10_000), :auto); @time Tables.columntable(wide); + +# Code for 
exercise 8.1 + +cg = complete_graph(37700) +Base.summarysize(cg) +@time deg_class(cg, classes_df.ml_target); + +# Code for exercise 8.2 + +scatter(log1p.(agg_df.deg_ml), + log1p.(agg_df.deg_web); + zcolor=agg_df.web_mean, + xlabel="degree ml", ylabel="degree web", + markersize=2, markerstrokewidth=0, markeralpha=0.8, + legend=:topleft, labels = "fraction web", + xticks=gen_ticks(maximum(classes_df.deg_ml)), + yticks=gen_ticks(maximum(classes_df.deg_web))) + +# Code for exercise 8.3 + +glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), + classes_df, Binomial(), ProbitLink()) + +# Code for exercise 8.4 + +df = DataFrame() +df.a = [1, 2, 3] +df.b = df.a +df.b === df.a +df.b = df[:, "b"] +df.b === df.a +df.b == df.a +df[1:2, "a"] .= 10 +df diff --git a/ch08.jl b/ch08.jl new file mode 100644 index 0000000..a34ccc6 --- /dev/null +++ b/ch08.jl @@ -0,0 +1,284 @@ +# Bogumił Kamiński, 2022 + +# Codes for chapter 8 + +# Codes for section 8.1 + +# Code for listing 8.1 + +import Downloads +using SHA +git_zip = "git_web_ml.zip" +if !isfile(git_zip) + Downloads.download("https://snap.stanford.edu/data/" * + "git_web_ml.zip", + git_zip) +end +isfile(git_zip) +open(sha256, git_zip) == [0x56, 0xc0, 0xc1, 0xc2, + 0xc4, 0x60, 0xdc, 0x4c, + 0x7b, 0xf8, 0x93, 0x57, + 0xb1, 0xfe, 0xc0, 0x20, + 0xf4, 0x5e, 0x2e, 0xce, + 0xba, 0xb8, 0x1d, 0x13, + 0x1d, 0x07, 0x3b, 0x10, + 0xe2, 0x8e, 0xc0, 0x31] + +# Code for opening a zip archive + +import ZipFile +git_archive = ZipFile.Reader(git_zip) + +# Code for listing 8.2 + +function ingest_to_df(archive::ZipFile.Reader, filename::AbstractString) + idx = only(findall(x -> x.name == filename, archive.files)) + return CSV.read(read(archive.files[idx]), DataFrame) +end + +# Code for working with zip archive + +git_archive.files + +git_archive.files[2].name + +findall(x -> x.name == "git_web_ml/musae_git_edges.csv", git_archive.files) +findall(x -> x.name == "", git_archive.files) + +only(findall(x -> x.name == "git_web_ml/musae_git_edges.csv", 
git_archive.files)) +only(findall(x -> x.name == "", git_archive.files)) + +# Code for listing 8.3 + +using CSV +using DataFrames +edges_df = ingest_to_df(git_archive, "git_web_ml/musae_git_edges.csv"); +classes_df = ingest_to_df(git_archive, "git_web_ml/musae_git_target.csv"); +close(git_archive) +summary(edges_df) +describe(edges_df, :min, :max, :mean, :nmissing, :eltype) +summary(classes_df) +describe(classes_df, :min, :max, :mean, :nmissing, :eltype) + +# Code for updating data frame columns using broadcasting + +edges_df .+= 1 +classes_df.id .+= 1 + +# Code for examples of data frame broadcasting + +df = DataFrame(a=1:3, b=[4, missing, 5]) +df .^ 2 +coalesce.(df, 0) +df .+ [10, 11, 12] + +# Code for checking the order of :id column in a data frame + +classes_df.id == axes(classes_df, 1) + +# Code for the difference between ! and : in broadcasting assignment + +df = DataFrame(a=1:3, b=1:3) +df[!, :a] .= "x" +df[:, :b] .= "x" +df + +# Code for the difference between ! and : in assignment + +df = DataFrame(a=1:3, b=1:3, c=1:3) +df[!, :a] = ["x", "y", "z"] +df[:, :b] = ["x", "y", "z"] +df[:, :c] = [11, 12, 13] +df + +# Codes for section 8.2 + +# Code from listing 8.4 + +using Graphs +gh = SimpleGraph(nrow(classes_df)) +for (from, to) in eachrow(edges_df) + add_edge!(gh, from, to) +end +gh +ne(gh) +nv(gh) + +# Code for iterator destructuring in iteration specification + +mat = [1 2; 3 4; 5 6] +for (x1, x2) in eachrow(mat) + @show x1, x2 +end + +# Code for getting degrees of nodes in the graph + +degree(gh) + +# Code for adding a column to a data frame + +classes_df.deg = degree(gh) + +# Code for the difference between ! 
and : when adding a column + +df = DataFrame() +x = [1, 2, 3] +df[!, :x1] = x +df[:, :x2] = x +df +df.x1 === x +df.x2 === x +df.x2 == x + +# Code for creating a column using broadcasting + +df.x3 .= 1 +df + +# Code for edge iterator of a graph + +edges(gh) + +e1 = first(edges(gh)) +dump(e1) +e1.src +e1.dst + +# Code for listing 8.5 + +function deg_class(gh, class) + deg_ml = zeros(Int, length(class)) + deg_web = zeros(Int, length(class)) + for edge in edges(gh) + a, b = edge.src, edge.dst + if class[b] == 1 + deg_ml[a] += 1 + else + deg_web[a] += 1 + end + if class[a] == 1 + deg_ml[b] += 1 + else + deg_web[b] += 1 + end + end + return (deg_ml, deg_web) +end + +# Code for computing machine learning and web neighbors for gh graph + +classes_df.deg_ml, classes_df.deg_web = +deg_class(gh, classes_df.ml_target) + +# Code for checking type stability of deg_class function + +@time deg_class(gh, classes_df.ml_target); +@code_warntype deg_class(gh, classes_df.ml_target) + +# Code for checking the classes_df summary statistics + +describe(classes_df, :min, :max, :mean, :std) + +# Code for average degree of node in the graph + +2 * ne(gh) / nv(gh) + +# Code for checking correctness of computations + +classes_df.deg_ml + classes_df.deg_web == classes_df.deg + +# Code for showing that DataFrames.jl checks consistency of stored objects + +df = DataFrame(a=1, b=11) +push!(df.a, 2) +df + +# Codes for section 8.3 + +# Code for computing groupwise means of columns + +using Statistics +for type in [0, 1], col in ["deg_ml", "deg_web"] + println((type, col, mean(classes_df[classes_df.ml_target .== type, col]))) +end + +gdf = groupby(classes_df, :ml_target) +combine(gdf, + :deg_ml => mean => :mean_deg_ml, + :deg_web => mean => :mean_deg_web) + +using DataFramesMeta +@combine(gdf, + :mean_deg_ml = mean(:deg_ml), + :mean_deg_web = mean(:deg_web)) + +# Code for simple plotting of relationship between developer degree and type + +using Plots +scatter(classes_df.deg_ml, classes_df.deg_web; + 
color=[x == 1 ? "black" : "gray" for x in classes_df.ml_target], + xlabel="degree ml", ylabel="degree web", labels=false) + +# Code for aggregation of degree data + +agg_df = combine(groupby(classes_df, [:deg_ml, :deg_web]), + :ml_target => (x -> 1 - mean(x)) => :web_mean) + +# Code for comparison how Julia parses expressions + +:ml_target => (x -> 1 - mean(x)) => :web_mean +:ml_target => x -> 1 - mean(x) => :web_mean + +# Code for aggregation using DataFramesMeta.jl + +@combine(groupby(classes_df, [:deg_ml, :deg_web]), + :web_mean = 1 - mean(:ml_target)) + +# Code for getting summary information about the aggregated data frame + +describe(agg_df) + +# Code for log1p function + +log1p(0) + +# Code for listing 8.6 + +function gen_ticks(maxv) + max2 = round(Int, log2(maxv)) + tick = [0; 2 .^ (0:max2)] + return (log1p.(tick), tick) +end + +log1pjitter(x) = log1p(x) - 0.05 + rand() / 10 + +using Random +Random.seed!(1234); +scatter(log1pjitter.(agg_df.deg_ml), + log1pjitter.(agg_df.deg_web); + zcolor=agg_df.web_mean, + xlabel="degree ml", ylabel="degree web", + markersize=2, markerstrokewidth=0, markeralpha=0.8, + legend=:topleft, labels = "fraction web", + xticks=gen_ticks(maximum(classes_df.deg_ml)), + yticks=gen_ticks(maximum(classes_df.deg_web))) + +# Code for fitting logistic regression model + +using GLM +glm(@formula(ml_target~log1p(deg_ml)+log1p(deg_web)), classes_df, Binomial(), LogitLink()) + +# Code for inspecting @formula result + +@formula(ml_target~log1p(deg_ml)+log1p(deg_web)) + +# Code for inserting columns to a data frame + +df = DataFrame(x=1:3) +insertcols!(df, :y => 4:6) +insertcols!(df, :y => 4:6) +insertcols!(df, :z => 1) + +insertcols!(df, 1, :a => 0) +insertcols!(df, :x, :pre_x => 2) +insertcols!(df, :x, :post_x => 3, after=true)