From c82e9d506767ba36c453f0cfc9bf880d37785823 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sun, 20 Aug 2023 19:37:25 -0700 Subject: [PATCH] v2.1 of Ch13 --- Ch13-multiple-lab.Rmd | 82 ++++----- Ch13-multiple-lab.ipynb | 392 ++++++++++++++++++++-------------------- 2 files changed, 232 insertions(+), 242 deletions(-) diff --git a/Ch13-multiple-lab.Rmd b/Ch13-multiple-lab.Rmd index ef5db15..d5123f1 100644 --- a/Ch13-multiple-lab.Rmd +++ b/Ch13-multiple-lab.Rmd @@ -1,22 +1,9 @@ ---- -jupyter: - jupytext: - cell_metadata_filter: -all - formats: ipynb,Rmd - main_language: python - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.14.7 ---- - # Chapter 13 # Lab: Multiple Testing - + We include our usual imports seen in earlier labs. @@ -28,7 +15,7 @@ import statsmodels.api as sm from ISLP import load_data ``` - + We also collect the new imports needed for this lab. @@ -60,7 +47,7 @@ true_mean = np.array([0.5]*50 + [0]*50) X += true_mean[None,:] ``` - + To begin, we use `ttest_1samp()` from the `scipy.stats` module to test $H_{0}: \mu_1=0$, the null hypothesis that the first variable has mean zero. @@ -70,7 +57,7 @@ result = ttest_1samp(X[:,0], 0) result.pvalue ``` - + The $p$-value comes out to 0.931, which is not low enough to reject the null hypothesis at level $\alpha=0.05$. In this case, $\mu_1=0.5$, so the null hypothesis is false. Therefore, we have made @@ -121,7 +108,7 @@ amounts to quite a weak signal, and it resulted in a high number of Type II errors. Let’s instead simulate data with a stronger signal, so that the ratio of the mean to the standard deviation for the false null hypotheses equals $1$. We make only 10 Type II errors. - + ```{python} true_mean = np.array([1]*50 + [0]*50) @@ -167,7 +154,7 @@ ax.legend() ax.axhline(0.05, c='k', ls='--'); ``` - + As discussed previously, even for moderate values of $m$ such as $50$, the FWER exceeds $0.05$ unless $\alpha$ is set to a very low value, such as $0.001$. Of course, the problem with setting $\alpha$ to such @@ -189,7 +176,7 @@ for i in range(5): fund_mini_pvals ``` - + The $p$-values are low for Managers One and Three, and high for the other three managers. However, we cannot simply reject $H_{0,1}$ and $H_{0,3}$, since this would fail to account for the multiple testing @@ -219,8 +206,8 @@ reject, bonf = mult_test(fund_mini_pvals, method = "bonferroni")[:2] reject ``` - - + + The $p$-values `bonf` are simply the `fund_mini_pvalues` multiplied by 5 and truncated to be less than or equal to 1. @@ -228,7 +215,7 @@ or equal to 1. bonf, np.minimum(fund_mini_pvals * 5, 1) ``` - + Therefore, using Bonferroni’s method, we are able to reject the null hypothesis only for Manager One while controlling FWER at $0.05$. @@ -240,17 +227,18 @@ hypotheses for Managers One and Three at a FWER of $0.05$. mult_test(fund_mini_pvals, method = "holm", alpha=0.05)[:2] ``` - - + + As discussed previously, Manager One seems to perform particularly well, whereas Manager Two has poor performance. + ```{python} fund_mini.mean() ``` - - + + Is there evidence of a meaningful difference in performance between these two managers? We can check this by performing a paired $t$-test using the `ttest_rel()` function from `scipy.stats`: @@ -260,7 +248,7 @@ ttest_rel(fund_mini['Manager1'], fund_mini['Manager2']).pvalue ``` - + The test results in a $p$-value of 0.038, suggesting a statistically significant difference. @@ -285,8 +273,8 @@ tukey = pairwise_tukeyhsd(returns, managers) print(tukey.summary()) ``` - - + + The `pairwise_tukeyhsd()` function provides confidence intervals for the difference between each pair of managers (`lower` and `upper`), as well as a $p$-value. All of these quantities have @@ -316,7 +304,7 @@ for i, manager in enumerate(Fund.columns): fund_pvalues[i] = ttest_1samp(Fund[manager], 0).pvalue ``` - + There are far too many managers to consider trying to control the FWER. Instead, we focus on controlling the FDR: that is, the expected fraction of rejected null hypotheses that are actually false positives. The `multipletests()` function (abbreviated `mult_test()`) can be used to carry out the Benjamini--Hochberg procedure. @@ -326,7 +314,7 @@ fund_qvalues = mult_test(fund_pvalues, method = "fdr_bh")[1] fund_qvalues[:10] ``` - + The *q-values* output by the Benjamini--Hochberg procedure can be interpreted as the smallest FDR threshold at which we would reject a particular null hypothesis. For @@ -353,8 +341,8 @@ null hypotheses! (fund_pvalues <= 0.1 / 2000).sum() ``` - - + + Figure 13.6 displays the ordered $p$-values, $p_{(1)} \leq p_{(2)} \leq \cdots \leq p_{(2000)}$, for the `Fund` dataset, as well as the threshold for rejection by the @@ -383,7 +371,7 @@ else: sorted_set_ = [] ``` - + We now reproduce the middle panel of Figure 13.6. ```{python} @@ -398,7 +386,7 @@ ax.scatter(sorted_set_+1, sorted_[sorted_set_], c='r', s=20) ax.axline((0, 0), (1,q/m), c='k', ls='--', linewidth=3); ``` - + ## A Re-Sampling Approach Here, we implement the re-sampling approach to hypothesis testing @@ -414,8 +402,8 @@ D['Y'] = pd.concat([Khan['ytrain'], Khan['ytest']]) D['Y'].value_counts() ``` - - + + There are four classes of cancer. For each gene, we compare the mean expression in the second class (rhabdomyosarcoma) to the mean expression in the fourth class (Burkitt’s lymphoma). Performing a @@ -435,8 +423,8 @@ observedT, pvalue = ttest_ind(D2[gene_11], observedT, pvalue ``` - - + + However, this $p$-value relies on the assumption that under the null hypothesis of no difference between the two groups, the test statistic follows a $t$-distribution with $29+25-2=52$ degrees of freedom. @@ -464,8 +452,8 @@ for b in range(B): (np.abs(Tnull) > np.abs(observedT)).mean() ``` - - + + This fraction, 0.0398, is our re-sampling-based $p$-value. It is almost identical to the $p$-value of 0.0412 obtained using the theoretical null distribution. @@ -521,7 +509,7 @@ for j in range(m): Tnull_vals[j,b] = ttest_.statistic ``` - + Next, we compute the number of rejected null hypotheses $R$, the estimated number of false positives $\widehat{V}$, and the estimated FDR, for a range of threshold values $c$ in @@ -539,7 +527,7 @@ for j in range(m): FDRs[j] = V / R ``` - + Now, for any given FDR, we can find the genes that will be rejected. For example, with FDR controlled at 0.1, we reject 15 of the 100 null hypotheses. On average, we would expect about one or two of @@ -555,7 +543,7 @@ the genes whose estimated FDR is less than 0.1. sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.1].min()]) ``` - + At an FDR threshold of 0.2, more genes are selected, at the cost of having a higher expected proportion of false discoveries. @@ -563,7 +551,7 @@ proportion of false discoveries. sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.2].min()]) ``` - + The next line generates Figure 13.11, which is similar to Figure 13.9, except that it is based on only a subset of the genes. @@ -575,5 +563,5 @@ ax.set_xlabel("Number of Rejections") ax.set_ylabel("False Discovery Rate"); ``` - + diff --git a/Ch13-multiple-lab.ipynb b/Ch13-multiple-lab.ipynb index 65c7fd0..cd40464 100644 --- a/Ch13-multiple-lab.ipynb +++ b/Ch13-multiple-lab.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "75b2d75c", + "id": "687e9b80", "metadata": {}, "source": [ "\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "34e410a6", + "id": "9f594a41", "metadata": {}, "source": [ "We include our usual imports seen in earlier labs." @@ -23,13 +23,13 @@ { "cell_type": "code", "execution_count": 1, - "id": "1f928b2d", + "id": "7cc4fbeb", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:48.489971Z", - "iopub.status.busy": "2023-08-07T00:19:48.489727Z", - "iopub.status.idle": "2023-08-07T00:19:50.216508Z", - "shell.execute_reply": "2023-08-07T00:19:50.215573Z" + "iopub.execute_input": "2023-08-21T02:29:16.417394Z", + "iopub.status.busy": "2023-08-21T02:29:16.417287Z", + "iopub.status.idle": "2023-08-21T02:29:17.613483Z", + "shell.execute_reply": "2023-08-21T02:29:17.613156Z" } }, "outputs": [], @@ -43,7 +43,7 @@ }, { "cell_type": "markdown", - "id": "12319e0a", + "id": "08ba7bed", "metadata": {}, "source": [ "We also collect the new imports\n", @@ -53,13 +53,13 @@ { "cell_type": "code", "execution_count": 2, - "id": "eb4b32aa", + "id": "595efc18", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.219989Z", - "iopub.status.busy": "2023-08-07T00:19:50.219429Z", - "iopub.status.idle": "2023-08-07T00:19:50.223239Z", - "shell.execute_reply": "2023-08-07T00:19:50.222392Z" + "iopub.execute_input": "2023-08-21T02:29:17.615551Z", + "iopub.status.busy": "2023-08-21T02:29:17.615375Z", + "iopub.status.idle": "2023-08-21T02:29:17.617379Z", + "shell.execute_reply": "2023-08-21T02:29:17.617087Z" }, "lines_to_next_cell": 2 }, @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "a2747e58", + "id": "69e5023e", "metadata": {}, "source": [ "## Review of Hypothesis Tests\n", @@ -92,13 +92,13 @@ { "cell_type": "code", "execution_count": 3, - "id": "e12ac0cd", + "id": "985d1d6e", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.225454Z", - "iopub.status.busy": "2023-08-07T00:19:50.225335Z", - "iopub.status.idle": "2023-08-07T00:19:50.228651Z", - "shell.execute_reply": "2023-08-07T00:19:50.228301Z" + "iopub.execute_input": "2023-08-21T02:29:17.618995Z", + "iopub.status.busy": "2023-08-21T02:29:17.618887Z", + "iopub.status.idle": "2023-08-21T02:29:17.620921Z", + "shell.execute_reply": "2023-08-21T02:29:17.620629Z" } }, "outputs": [], @@ -111,7 +111,7 @@ }, { "cell_type": "markdown", - "id": "70d37233", + "id": "9ae561c4", "metadata": {}, "source": [ "To begin, we use `ttest_1samp()` from the\n", @@ -122,13 +122,13 @@ { "cell_type": "code", "execution_count": 4, - "id": "04d0f49e", + "id": "753d612a", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.230854Z", - "iopub.status.busy": "2023-08-07T00:19:50.230727Z", - "iopub.status.idle": "2023-08-07T00:19:50.236745Z", - "shell.execute_reply": "2023-08-07T00:19:50.236388Z" + "iopub.execute_input": "2023-08-21T02:29:17.622537Z", + "iopub.status.busy": "2023-08-21T02:29:17.622429Z", + "iopub.status.idle": "2023-08-21T02:29:17.626063Z", + "shell.execute_reply": "2023-08-21T02:29:17.625801Z" } }, "outputs": [ @@ -150,7 +150,7 @@ }, { "cell_type": "markdown", - "id": "cf83426f", + "id": "5d9dc17f", "metadata": {}, "source": [ "The $p$-value comes out to 0.931, which is not low enough to\n", @@ -169,13 +169,13 @@ { "cell_type": "code", "execution_count": 5, - "id": "d1f0c695", + "id": "facd6569", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.238822Z", - "iopub.status.busy": "2023-08-07T00:19:50.238685Z", - "iopub.status.idle": "2023-08-07T00:19:50.263876Z", - "shell.execute_reply": "2023-08-07T00:19:50.263508Z" + "iopub.execute_input": "2023-08-21T02:29:17.627714Z", + "iopub.status.busy": "2023-08-21T02:29:17.627617Z", + "iopub.status.idle": "2023-08-21T02:29:17.651726Z", + "shell.execute_reply": "2023-08-21T02:29:17.651448Z" }, "lines_to_next_cell": 0 }, @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "3d8e0d96", + "id": "4094daa7", "metadata": {}, "source": [ "Since this is a simulated data set, we can create a $2 \\times 2$ table\n", @@ -205,13 +205,13 @@ { "cell_type": "code", "execution_count": 6, - "id": "7a9594a0", + "id": "e89ef3eb", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.266708Z", - "iopub.status.busy": "2023-08-07T00:19:50.266387Z", - "iopub.status.idle": "2023-08-07T00:19:50.280831Z", - "shell.execute_reply": "2023-08-07T00:19:50.280194Z" + "iopub.execute_input": "2023-08-21T02:29:17.653344Z", + "iopub.status.busy": "2023-08-21T02:29:17.653256Z", + "iopub.status.idle": "2023-08-21T02:29:17.662644Z", + "shell.execute_reply": "2023-08-21T02:29:17.662346Z" }, "lines_to_next_cell": 0 }, @@ -282,7 +282,7 @@ }, { "cell_type": "markdown", - "id": "9610c817", + "id": "a10ba423", "metadata": {}, "source": [ "Therefore, at level $\\alpha=0.05$, we reject 15 of the 50 false\n", @@ -299,19 +299,20 @@ "amounts to quite a weak signal, and it resulted in a high number of\n", "Type II errors. Let’s instead simulate data with a stronger signal,\n", "so that the ratio of the mean to the standard deviation for the false\n", - "null hypotheses equals $1$. We make only 10 Type II errors.\n" + "null hypotheses equals $1$. We make only 10 Type II errors.\n", + " " ] }, { "cell_type": "code", "execution_count": 7, - "id": "25f7fc5d", + "id": "ae184aaf", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.283336Z", - "iopub.status.busy": "2023-08-07T00:19:50.283198Z", - "iopub.status.idle": "2023-08-07T00:19:50.317664Z", - "shell.execute_reply": "2023-08-07T00:19:50.317356Z" + "iopub.execute_input": "2023-08-21T02:29:17.664327Z", + "iopub.status.busy": "2023-08-21T02:29:17.664213Z", + "iopub.status.idle": "2023-08-21T02:29:17.690928Z", + "shell.execute_reply": "2023-08-21T02:29:17.690657Z" }, "lines_to_next_cell": 0 }, @@ -394,7 +395,7 @@ }, { "cell_type": "markdown", - "id": "bb70c597", + "id": "7ca15d3f", "metadata": {}, "source": [ " " @@ -402,7 +403,7 @@ }, { "cell_type": "markdown", - "id": "f6953d33", + "id": "9e9f5977", "metadata": {}, "source": [ "## Family-Wise Error Rate\n", @@ -417,13 +418,13 @@ { "cell_type": "code", "execution_count": 8, - "id": "369b5bd3", + "id": "0295fe68", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.319399Z", - "iopub.status.busy": "2023-08-07T00:19:50.319268Z", - "iopub.status.idle": "2023-08-07T00:19:50.674219Z", - "shell.execute_reply": "2023-08-07T00:19:50.673887Z" + "iopub.execute_input": "2023-08-21T02:29:17.692568Z", + "iopub.status.busy": "2023-08-21T02:29:17.692459Z", + "iopub.status.idle": "2023-08-21T02:29:17.899403Z", + "shell.execute_reply": "2023-08-21T02:29:17.899081Z" } }, "outputs": [ @@ -454,7 +455,7 @@ }, { "cell_type": "markdown", - "id": "3a81479e", + "id": "fecaca9e", "metadata": {}, "source": [ "As discussed previously, even for moderate values of $m$ such as $50$,\n", @@ -473,13 +474,13 @@ { "cell_type": "code", "execution_count": 9, - "id": "9ce7a19f", + "id": "406e59a8", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.676029Z", - "iopub.status.busy": "2023-08-07T00:19:50.675909Z", - "iopub.status.idle": "2023-08-07T00:19:50.715451Z", - "shell.execute_reply": "2023-08-07T00:19:50.715145Z" + "iopub.execute_input": "2023-08-21T02:29:17.901146Z", + "iopub.status.busy": "2023-08-21T02:29:17.901041Z", + "iopub.status.idle": "2023-08-21T02:29:17.939312Z", + "shell.execute_reply": "2023-08-21T02:29:17.939019Z" } }, "outputs": [ @@ -505,7 +506,7 @@ }, { "cell_type": "markdown", - "id": "7561e3a3", + "id": "87bab88b", "metadata": {}, "source": [ "The $p$-values are low for Managers One and Three, and high for the\n", @@ -530,7 +531,7 @@ }, { "cell_type": "markdown", - "id": "5b608e46", + "id": "d0c7a2a0", "metadata": {}, "source": [ "The `mult_test()` function takes $p$-values and a `method` argument, as well as an optional\n", @@ -541,13 +542,13 @@ { "cell_type": "code", "execution_count": 10, - "id": "de6cffed", + "id": "d4f6a247", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.717476Z", - "iopub.status.busy": "2023-08-07T00:19:50.717258Z", - "iopub.status.idle": "2023-08-07T00:19:50.719841Z", - "shell.execute_reply": "2023-08-07T00:19:50.719577Z" + "iopub.execute_input": "2023-08-21T02:29:17.941032Z", + "iopub.status.busy": "2023-08-21T02:29:17.940919Z", + "iopub.status.idle": "2023-08-21T02:29:17.943369Z", + "shell.execute_reply": "2023-08-21T02:29:17.943081Z" }, "lines_to_next_cell": 2 }, @@ -570,7 +571,7 @@ }, { "cell_type": "markdown", - "id": "5135c6b9", + "id": "4d5bc7e7", "metadata": {}, "source": [ "The $p$-values `bonf` are simply the `fund_mini_pvalues` multiplied by 5 and truncated to be less than\n", @@ -580,13 +581,13 @@ { "cell_type": "code", "execution_count": 11, - "id": "0de71500", + "id": "01a29d71", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.721450Z", - "iopub.status.busy": "2023-08-07T00:19:50.721342Z", - "iopub.status.idle": "2023-08-07T00:19:50.723962Z", - "shell.execute_reply": "2023-08-07T00:19:50.723691Z" + "iopub.execute_input": "2023-08-21T02:29:17.944859Z", + "iopub.status.busy": "2023-08-21T02:29:17.944760Z", + "iopub.status.idle": "2023-08-21T02:29:17.946888Z", + "shell.execute_reply": "2023-08-21T02:29:17.946639Z" } }, "outputs": [ @@ -608,7 +609,7 @@ }, { "cell_type": "markdown", - "id": "1f0bc112", + "id": "e68c9051", "metadata": {}, "source": [ "Therefore, using Bonferroni’s method, we are able to reject the null hypothesis only for Manager\n", @@ -622,13 +623,13 @@ { "cell_type": "code", "execution_count": 12, - "id": "f7e87bdb", + "id": "95454eb4", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.725450Z", - "iopub.status.busy": "2023-08-07T00:19:50.725349Z", - "iopub.status.idle": "2023-08-07T00:19:50.777055Z", - "shell.execute_reply": "2023-08-07T00:19:50.776612Z" + "iopub.execute_input": "2023-08-21T02:29:17.948474Z", + "iopub.status.busy": "2023-08-21T02:29:17.948372Z", + "iopub.status.idle": "2023-08-21T02:29:17.990740Z", + "shell.execute_reply": "2023-08-21T02:29:17.990464Z" }, "lines_to_next_cell": 2 }, @@ -651,23 +652,24 @@ }, { "cell_type": "markdown", - "id": "f762fecd", + "id": "587b5b48", "metadata": {}, "source": [ "As discussed previously, Manager One seems to perform particularly\n", - "well, whereas Manager Two has poor performance." + "well, whereas Manager Two has poor performance.\n", + " " ] }, { "cell_type": "code", "execution_count": 13, - "id": "e88be376", + "id": "1f1ac764", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.779763Z", - "iopub.status.busy": "2023-08-07T00:19:50.779554Z", - "iopub.status.idle": "2023-08-07T00:19:50.783798Z", - "shell.execute_reply": "2023-08-07T00:19:50.783418Z" + "iopub.execute_input": "2023-08-21T02:29:17.992261Z", + "iopub.status.busy": "2023-08-21T02:29:17.992149Z", + "iopub.status.idle": "2023-08-21T02:29:17.995141Z", + "shell.execute_reply": "2023-08-21T02:29:17.994894Z" }, "lines_to_next_cell": 2 }, @@ -694,7 +696,7 @@ }, { "cell_type": "markdown", - "id": "88dbf0a6", + "id": "e3c4b716", "metadata": {}, "source": [ "Is there evidence of a meaningful difference in performance between\n", @@ -705,13 +707,13 @@ { "cell_type": "code", "execution_count": 14, - "id": "41149af6", + "id": "298d975d", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.786752Z", - "iopub.status.busy": "2023-08-07T00:19:50.786580Z", - "iopub.status.idle": "2023-08-07T00:19:50.791095Z", - "shell.execute_reply": "2023-08-07T00:19:50.790607Z" + "iopub.execute_input": "2023-08-21T02:29:17.996686Z", + "iopub.status.busy": "2023-08-21T02:29:17.996590Z", + "iopub.status.idle": "2023-08-21T02:29:17.999332Z", + "shell.execute_reply": "2023-08-21T02:29:17.999076Z" } }, "outputs": [ @@ -733,7 +735,7 @@ }, { "cell_type": "markdown", - "id": "1aca6122", + "id": "3908d7d2", "metadata": {}, "source": [ "The test results in a $p$-value of 0.038,\n", @@ -757,13 +759,13 @@ { "cell_type": "code", "execution_count": 15, - "id": "61aabda7", + "id": "be117713", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:50.793577Z", - "iopub.status.busy": "2023-08-07T00:19:50.793394Z", - "iopub.status.idle": "2023-08-07T00:19:51.318999Z", - "shell.execute_reply": "2023-08-07T00:19:51.318071Z" + "iopub.execute_input": "2023-08-21T02:29:18.000853Z", + "iopub.status.busy": "2023-08-21T02:29:18.000747Z", + "iopub.status.idle": "2023-08-21T02:29:18.487357Z", + "shell.execute_reply": "2023-08-21T02:29:18.487078Z" }, "lines_to_next_cell": 2 }, @@ -799,7 +801,7 @@ }, { "cell_type": "markdown", - "id": "e0084fc5", + "id": "0fdf963f", "metadata": {}, "source": [ "The `pairwise_tukeyhsd()` function provides confidence intervals\n", @@ -817,13 +819,13 @@ { "cell_type": "code", "execution_count": 16, - "id": "cbcad4de", + "id": "537c4ea8", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.321200Z", - "iopub.status.busy": "2023-08-07T00:19:51.321046Z", - "iopub.status.idle": "2023-08-07T00:19:51.411142Z", - "shell.execute_reply": "2023-08-07T00:19:51.410682Z" + "iopub.execute_input": "2023-08-21T02:29:18.489069Z", + "iopub.status.busy": "2023-08-21T02:29:18.488949Z", + "iopub.status.idle": "2023-08-21T02:29:18.570869Z", + "shell.execute_reply": "2023-08-21T02:29:18.570427Z" } }, "outputs": [ @@ -845,7 +847,7 @@ }, { "cell_type": "markdown", - "id": "6278d13c", + "id": "8590f246", "metadata": {}, "source": [ "## False Discovery Rate\n", @@ -858,13 +860,13 @@ { "cell_type": "code", "execution_count": 17, - "id": "b5842190", + "id": "2c88ec87", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.413331Z", - "iopub.status.busy": "2023-08-07T00:19:51.413176Z", - "iopub.status.idle": "2023-08-07T00:19:51.848427Z", - "shell.execute_reply": "2023-08-07T00:19:51.847956Z" + "iopub.execute_input": "2023-08-21T02:29:18.572454Z", + "iopub.status.busy": "2023-08-21T02:29:18.572341Z", + "iopub.status.idle": "2023-08-21T02:29:19.005707Z", + "shell.execute_reply": "2023-08-21T02:29:19.005387Z" } }, "outputs": [], @@ -876,7 +878,7 @@ }, { "cell_type": "markdown", - "id": "80fc2fcc", + "id": "80e77fab", "metadata": {}, "source": [ "There are far too many managers to consider trying to control the FWER.\n", @@ -887,13 +889,13 @@ { "cell_type": "code", "execution_count": 18, - "id": "7c9d8bed", + "id": "b6d56819", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.850663Z", - "iopub.status.busy": "2023-08-07T00:19:51.850523Z", - "iopub.status.idle": "2023-08-07T00:19:51.854777Z", - "shell.execute_reply": "2023-08-07T00:19:51.854196Z" + "iopub.execute_input": "2023-08-21T02:29:19.007847Z", + "iopub.status.busy": "2023-08-21T02:29:19.007564Z", + "iopub.status.idle": "2023-08-21T02:29:19.010742Z", + "shell.execute_reply": "2023-08-21T02:29:19.010371Z" } }, "outputs": [ @@ -916,7 +918,7 @@ }, { "cell_type": "markdown", - "id": "4f73096d", + "id": "b4662444", "metadata": {}, "source": [ "The *q-values* output by the\n", @@ -932,13 +934,13 @@ { "cell_type": "code", "execution_count": 19, - "id": "bfa39f7c", + "id": "b00da3a1", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.856795Z", - "iopub.status.busy": "2023-08-07T00:19:51.856678Z", - "iopub.status.idle": "2023-08-07T00:19:51.859719Z", - "shell.execute_reply": "2023-08-07T00:19:51.859327Z" + "iopub.execute_input": "2023-08-21T02:29:19.012400Z", + "iopub.status.busy": "2023-08-21T02:29:19.012298Z", + "iopub.status.idle": "2023-08-21T02:29:19.015314Z", + "shell.execute_reply": "2023-08-21T02:29:19.014978Z" }, "lines_to_next_cell": 0 }, @@ -960,7 +962,7 @@ }, { "cell_type": "markdown", - "id": "ccb44c8d", + "id": "fdccb808", "metadata": {}, "source": [ "We find that 146 of the 2,000 fund managers have a $q$-value below\n", @@ -976,13 +978,13 @@ { "cell_type": "code", "execution_count": 20, - "id": "70b69b47", + "id": "1c230117", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.861924Z", - "iopub.status.busy": "2023-08-07T00:19:51.861522Z", - "iopub.status.idle": "2023-08-07T00:19:51.864394Z", - "shell.execute_reply": "2023-08-07T00:19:51.863987Z" + "iopub.execute_input": "2023-08-21T02:29:19.016857Z", + "iopub.status.busy": "2023-08-21T02:29:19.016769Z", + "iopub.status.idle": "2023-08-21T02:29:19.019332Z", + "shell.execute_reply": "2023-08-21T02:29:19.019032Z" }, "lines_to_next_cell": 2 }, @@ -1004,7 +1006,7 @@ }, { "cell_type": "markdown", - "id": "c8a969f4", + "id": "6112239d", "metadata": {}, "source": [ "Figure 13.6 displays the ordered\n", @@ -1026,13 +1028,13 @@ { "cell_type": "code", "execution_count": 21, - "id": "4c0ddea1", + "id": "62289650", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.865978Z", - "iopub.status.busy": "2023-08-07T00:19:51.865869Z", - "iopub.status.idle": "2023-08-07T00:19:51.868792Z", - "shell.execute_reply": "2023-08-07T00:19:51.868357Z" + "iopub.execute_input": "2023-08-21T02:29:19.021112Z", + "iopub.status.busy": "2023-08-21T02:29:19.020904Z", + "iopub.status.idle": "2023-08-21T02:29:19.023622Z", + "shell.execute_reply": "2023-08-21T02:29:19.023338Z" } }, "outputs": [], @@ -1051,7 +1053,7 @@ }, { "cell_type": "markdown", - "id": "ddeb3900", + "id": "c36b13b7", "metadata": {}, "source": [ "We now reproduce the middle panel of Figure 13.6." @@ -1060,13 +1062,13 @@ { "cell_type": "code", "execution_count": 22, - "id": "0314eac9", + "id": "18b3c0ed", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:51.871473Z", - "iopub.status.busy": "2023-08-07T00:19:51.871214Z", - "iopub.status.idle": "2023-08-07T00:19:52.126671Z", - "shell.execute_reply": "2023-08-07T00:19:52.126261Z" + "iopub.execute_input": "2023-08-21T02:29:19.025191Z", + "iopub.status.busy": "2023-08-21T02:29:19.025074Z", + "iopub.status.idle": "2023-08-21T02:29:19.262207Z", + "shell.execute_reply": "2023-08-21T02:29:19.261823Z" }, "lines_to_next_cell": 2 }, @@ -1096,7 +1098,7 @@ }, { "cell_type": "markdown", - "id": "83416f4a", + "id": "d87198e4", "metadata": {}, "source": [ "## A Re-Sampling Approach\n", @@ -1110,13 +1112,13 @@ { "cell_type": "code", "execution_count": 23, - "id": "b59b8137", + "id": "eb79e606", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:52.129167Z", - "iopub.status.busy": "2023-08-07T00:19:52.128842Z", - "iopub.status.idle": "2023-08-07T00:19:52.208320Z", - "shell.execute_reply": "2023-08-07T00:19:52.207936Z" + "iopub.execute_input": "2023-08-21T02:29:19.264174Z", + "iopub.status.busy": "2023-08-21T02:29:19.264030Z", + "iopub.status.idle": "2023-08-21T02:29:19.339232Z", + "shell.execute_reply": "2023-08-21T02:29:19.338912Z" }, "lines_to_next_cell": 2 }, @@ -1145,7 +1147,7 @@ }, { "cell_type": "markdown", - "id": "5534c8d4", + "id": "659ee2b8", "metadata": {}, "source": [ "There are four classes of cancer. For each gene, we compare the mean\n", @@ -1161,13 +1163,13 @@ { "cell_type": "code", "execution_count": 24, - "id": "96fb2f61", + "id": "1afbcf47", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:52.210235Z", - "iopub.status.busy": "2023-08-07T00:19:52.210101Z", - "iopub.status.idle": "2023-08-07T00:19:52.215004Z", - "shell.execute_reply": "2023-08-07T00:19:52.214604Z" + "iopub.execute_input": "2023-08-21T02:29:19.341009Z", + "iopub.status.busy": "2023-08-21T02:29:19.340889Z", + "iopub.status.idle": "2023-08-21T02:29:19.344670Z", + "shell.execute_reply": "2023-08-21T02:29:19.344391Z" }, "lines_to_next_cell": 2 }, @@ -1195,7 +1197,7 @@ }, { "cell_type": "markdown", - "id": "3131124e", + "id": "61f24919", "metadata": {}, "source": [ "However, this $p$-value relies on the assumption that under the null\n", @@ -1214,13 +1216,13 @@ { "cell_type": "code", "execution_count": 25, - "id": "fdc229fa", + "id": "f73f4c6d", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:52.217206Z", - "iopub.status.busy": "2023-08-07T00:19:52.217085Z", - "iopub.status.idle": "2023-08-07T00:19:54.776066Z", - "shell.execute_reply": "2023-08-07T00:19:54.775642Z" + "iopub.execute_input": "2023-08-21T02:29:19.346368Z", + "iopub.status.busy": "2023-08-21T02:29:19.346227Z", + "iopub.status.idle": "2023-08-21T02:29:21.776569Z", + "shell.execute_reply": "2023-08-21T02:29:21.776267Z" }, "lines_to_next_cell": 2 }, @@ -1253,7 +1255,7 @@ }, { "cell_type": "markdown", - "id": "c7fc4557", + "id": "a97f74af", "metadata": {}, "source": [ "This fraction, 0.0398,\n", @@ -1265,13 +1267,13 @@ { "cell_type": "code", "execution_count": 26, - "id": "e3894695", + "id": "062daf19", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:54.778563Z", - "iopub.status.busy": "2023-08-07T00:19:54.778388Z", - "iopub.status.idle": "2023-08-07T00:19:55.017161Z", - "shell.execute_reply": "2023-08-07T00:19:55.016821Z" + "iopub.execute_input": "2023-08-21T02:29:21.778366Z", + "iopub.status.busy": "2023-08-21T02:29:21.778242Z", + "iopub.status.idle": "2023-08-21T02:29:21.990476Z", + "shell.execute_reply": "2023-08-21T02:29:21.989965Z" }, "lines_to_next_cell": 0 }, @@ -1307,7 +1309,7 @@ }, { "cell_type": "markdown", - "id": "3bd21158", + "id": "e81b939b", "metadata": {}, "source": [ "The re-sampling-based null distribution is almost identical to the theoretical null distribution, which is displayed in red.\n", @@ -1325,13 +1327,13 @@ { "cell_type": "code", "execution_count": 27, - "id": "3b7392cb", + "id": "6d14fcad", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:19:55.019036Z", - "iopub.status.busy": "2023-08-07T00:19:55.018920Z", - "iopub.status.idle": "2023-08-07T00:39:19.291005Z", - "shell.execute_reply": "2023-08-07T00:39:19.287314Z" + "iopub.execute_input": "2023-08-21T02:29:21.992665Z", + "iopub.status.busy": "2023-08-21T02:29:21.992515Z", + "iopub.status.idle": "2023-08-21T02:34:05.930300Z", + "shell.execute_reply": "2023-08-21T02:34:05.929181Z" } }, "outputs": [], @@ -1358,7 +1360,7 @@ }, { "cell_type": "markdown", - "id": "1b92df1b", + "id": "06286699", "metadata": {}, "source": [ "Next, we compute the number of rejected null hypotheses $R$, the\n", @@ -1371,13 +1373,13 @@ { "cell_type": "code", "execution_count": 28, - "id": "cac15616", + "id": "8f0ec909", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:39:19.314420Z", - "iopub.status.busy": "2023-08-07T00:39:19.313452Z", - "iopub.status.idle": "2023-08-07T00:39:19.544251Z", - "shell.execute_reply": "2023-08-07T00:39:19.543932Z" + "iopub.execute_input": "2023-08-21T02:34:05.935513Z", + "iopub.status.busy": "2023-08-21T02:34:05.935323Z", + "iopub.status.idle": "2023-08-21T02:34:06.118079Z", + "shell.execute_reply": "2023-08-21T02:34:06.117633Z" } }, "outputs": [], @@ -1394,7 +1396,7 @@ }, { "cell_type": "markdown", - "id": "f6779ea0", + "id": "e26b64c6", "metadata": {}, "source": [ "Now, for any given FDR, we can find the genes that will be\n", @@ -1412,13 +1414,13 @@ { "cell_type": "code", "execution_count": 29, - "id": "9661eb10", + "id": "f11339e5", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:39:19.546693Z", - "iopub.status.busy": "2023-08-07T00:39:19.546543Z", - "iopub.status.idle": "2023-08-07T00:39:19.549970Z", - "shell.execute_reply": "2023-08-07T00:39:19.549697Z" + "iopub.execute_input": "2023-08-21T02:34:06.120138Z", + "iopub.status.busy": "2023-08-21T02:34:06.119994Z", + "iopub.status.idle": "2023-08-21T02:34:06.123846Z", + "shell.execute_reply": "2023-08-21T02:34:06.123478Z" } }, "outputs": [ @@ -1457,7 +1459,7 @@ }, { "cell_type": "markdown", - "id": "001e3fc1", + "id": "e145621b", "metadata": {}, "source": [ "At an FDR threshold of 0.2, more genes are selected, at the cost of having a higher expected\n", @@ -1467,13 +1469,13 @@ { "cell_type": "code", "execution_count": 30, - "id": "18ad4900", + "id": "d2600773", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:39:19.552090Z", - "iopub.status.busy": "2023-08-07T00:39:19.552004Z", - "iopub.status.idle": "2023-08-07T00:39:19.554743Z", - "shell.execute_reply": "2023-08-07T00:39:19.554473Z" + "iopub.execute_input": "2023-08-21T02:34:06.126460Z", + "iopub.status.busy": "2023-08-21T02:34:06.126346Z", + "iopub.status.idle": "2023-08-21T02:34:06.129561Z", + "shell.execute_reply": "2023-08-21T02:34:06.129124Z" } }, "outputs": [ @@ -1524,7 +1526,7 @@ }, { "cell_type": "markdown", - "id": "8767f70c", + "id": "32e600ff", "metadata": {}, "source": [ "The next line generates Figure 13.11, which is similar\n", @@ -1535,13 +1537,13 @@ { "cell_type": "code", "execution_count": 31, - "id": "28c276b6", + "id": "924b7705", "metadata": { "execution": { - "iopub.execute_input": "2023-08-07T00:39:19.556715Z", - "iopub.status.busy": "2023-08-07T00:39:19.556515Z", - "iopub.status.idle": "2023-08-07T00:39:19.650514Z", - "shell.execute_reply": "2023-08-07T00:39:19.650181Z" + "iopub.execute_input": "2023-08-21T02:34:06.131323Z", + "iopub.status.busy": "2023-08-21T02:34:06.131207Z", + "iopub.status.idle": "2023-08-21T02:34:06.216626Z", + "shell.execute_reply": "2023-08-21T02:34:06.216270Z" }, "lines_to_next_cell": 0 }, @@ -1566,18 +1568,18 @@ }, { "cell_type": "markdown", - "id": "e4b5d621", + "id": "b9f54695", "metadata": {}, "source": [ - "\n" + " \n" ] } ], "metadata": { "jupytext": { "cell_metadata_filter": "-all", - "formats": "ipynb,Rmd", - "main_language": "python" + "main_language": "python", + "notebook_metadata_filter": "-all" }, "language_info": { "codemirror_mode": {