Performing bootstrap calculations — PoL workshop on statistical inference, part 1 documentation (2024)

Dataset download

[2]:
import numpy as npimport polars as plimport numbaimport iqplotimport bebi103import bokeh.iobokeh.io.output_notebook()

Loading BokehJS ...

"}}; function display_loaded(error = null) { const el = document.getElementById("b098d865-61e1-406b-b302-722de47be692"); if (el != null) { const html = (() => { if (typeof root.Bokeh === "undefined") { if (error == null) { return "BokehJS is loading ..."; } else { return "BokehJS failed to load."; } } else { const prefix = `BokehJS ${root.Bokeh.version}`; if (error == null) { return `${prefix} successfully loaded.`; } else { return `${prefix} encountered errors while loading and may not function as expected.`; } } })(); el.innerHTML = html; if (error != null) { const wrapper = document.createElement("div"); wrapper.style.overflow = "auto"; wrapper.style.height = "5em"; wrapper.style.resize = "vertical"; const content = document.createElement("div"); content.style.fontFamily = "monospace"; content.style.whiteSpace = "pre-wrap"; content.style.backgroundColor = "rgb(255, 221, 221)"; content.textContent = error.stack ?? error.toString(); wrapper.append(content); el.append(wrapper); } } else if (Date.now() < root._bokeh_timeout) { setTimeout(() => display_loaded(error), 100); } } function run_callbacks() { try { root._bokeh_onload_callbacks.forEach(function(callback) { if (callback != null) callback(); }); } finally { delete root._bokeh_onload_callbacks } console.debug("Bokeh: all callbacks have finished"); } function load_libs(css_urls, js_urls, callback) { if (css_urls == null) css_urls = []; if (js_urls == null) js_urls = []; root._bokeh_onload_callbacks.push(callback); if (root._bokeh_is_loading > 0) { console.debug("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.debug("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); root._bokeh_is_loading = css_urls.length + js_urls.length; function on_load() { root._bokeh_is_loading--; if (root._bokeh_is_loading === 0) { console.debug("Bokeh: all BokehJS libraries/stylesheets loaded"); run_callbacks() } } function on_error(url) { console.error("failed to load " + url); } for (let i = 0; i < css_urls.length; i++) { const url = css_urls[i]; const element = document.createElement("link"); element.onload = on_load; element.onerror = on_error.bind(null, url); element.rel = "stylesheet"; element.type = "text/css"; element.href = url; console.debug("Bokeh: injecting link tag for BokehJS stylesheet: ", url); document.body.appendChild(element); } for (let i = 0; i < js_urls.length; i++) { const url = js_urls[i]; const element = document.createElement('script'); element.onload = on_load; element.onerror = on_error.bind(null, url); element.async = false; element.src = url; console.debug("Bokeh: injecting script tag for BokehJS library: ", url); document.head.appendChild(element); } }; function inject_raw_css(css) { const element = document.createElement("style"); element.appendChild(document.createTextNode(css)); document.body.appendChild(element); } const js_urls = ["https://cdn.bokeh.org/bokeh/release/bokeh-3.4.1.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.4.1.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.1.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.1.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.4.1.min.js", "https://unpkg.com/@holoviz/panel@1.4.4/dist/panel.min.js"]; const css_urls = []; const inline_js = [ function(Bokeh) { Bokeh.set_log_level("info"); },function(Bokeh) { } ]; function run_inline_js() { if (root.Bokeh !== undefined || force === true) { try { for (let i = 0; i < inline_js.length; i++) { inline_js[i].call(root, root.Bokeh); } } catch (error) {display_loaded(error);throw error; }if (force === true) { display_loaded(); }} else if (Date.now() < root._bokeh_timeout) { setTimeout(run_inline_js, 100); } else if (!root._bokeh_failed_load) { console.log("Bokeh: BokehJS failed to load within specified timeout."); root._bokeh_failed_load = true; } else if (force !== true) { const cell = $(document.getElementById("b098d865-61e1-406b-b302-722de47be692")).parents('.cell').data().cell; cell.output_area.append_execute_result(NB_LOAD_WARNING) } } if (root._bokeh_is_loading === 0) { console.debug("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(css_urls, js_urls, function() { console.debug("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); }}(window));

We have discussed the plug-in principle, in which we approximate the distribution function of the generative model, \(F\), with the empirical distribution function, \(\hat{F}\). We then approximate all statistical functionals of \(F\) using \(\hat{F}\); \(T(F)\approx T(\hat{F})\). Using the frequentist perspective, we talked about using the plug-in principle and bootstrapping to compute confidence intervals and do hypothesis testing. In this lesson, we will goover how we implement this in practice. We will work with some real data on the reproductive health of male bees that have been treated with pesticide.

It is important as you go through this lesson to remember that you do not know what the generative distribution is. We are only approximating it using the plug-in principle. Since we do not know what \(F\) is, we do not know what its parameters are, so we therefore cannot estimate them. Rather, we can study summary statistics, which are plug-in estimates for statistical functionals of the generative distribution, such as means, variances, or even the ECDF itself, and how they may varyfrom experiment to experiment. In this sense, we are doing nonparametric inference.

The data set

Neonicotinoid pesticides are thought to have inadvertent effects on service-providing insects such as bees. A study of this was featured in the New York Times. The original paper by Straub and coworkers may be found here. The authors put their data in the Dryad repository, which means that it is free to all to workwith!

(Do you see a trend here? If you want people to think deeply about your results, explore them, learn from them, in general further science with them, make your data publicly available. Strongly encourage the members of your lab to do the same.)

We will look at the sperm quality of drone bees using this data set. First, let’s load in the data set and check it out.

[3]:
df = pl.read_csv( os.path.join(data_path, "bee_sperm.csv"), null_values=["NO SPERM", "No Sperm"], comment_prefix="#",)df.head()
[3]:

shape: (5, 18)

SpecimenTreatmentEnvironmentTreatmentNCSSSample IDColonyCageSampleSperm Volume per 500 ulQuantityViabilityRaw (%)QualityAge (d)InfertilAliveSpermQuantity MillionsAlive Sperm MillionsDead Sperm Millions
i64strstri64stri64i64i64i64i64f64f64i64i64i64f64f64f64
227"Control""Cage"1"C2-1-1"2112150000215000096.72638196.72638114020796172.152.0796170.070383
228"Control""Cage"1"C2-1-2"2122287500228750096.34980896.34980814022040012.28752.2040010.083499
229"Control""Cage"1"C2-1-3"213875008750098.7598.75140864060.08750.0864060.001094
230"Control""Cage"1"C2-1-4"2141875000187500093.28742193.28742114017491391.8751.7491390.125861
231"Control""Cage"1"C2-1-5"2151587500158750097.79250697.79250614015524561.58751.5524560.035044

We are interested in the number of alive sperm in the samples. Let’s first explore the data by making ECDFs for the two groups we will compare, those treated with pesticide (“Pesticide”) and those that are not (“Control”).

The visual inspection of the ECDFs suggests that indeed the control drones have more alive sperm than those treated with pesticide. But how variable would these ECDFs be if we repeated the experiment?

Bootstrap samples and ECDFs

To address this question, we can generate bootstrap samples from the experimental data and make lots of ECDFs. We can then plot them all to see how the ECDF might vary. Recall that a bootstrap sample from a data set of \(n\) repeated measurements is generated by drawing \(n\) data points out of the original data set with replacement. The rng.choice() function enables random draws of elements out of an array. Let’s generate 100 bootstrap samples and plot their ECDFs to visualize howour data set might change as we repeat the experiment.

[5]:
rng = np.random.default_rng()# Set up Numpy arrays for convenience (also much better performance)dfs = df.partition_by('Treatment', as_dict=True)alive_ctrl = dfs[('Control',)].get_column("Alive Sperm Millions").to_numpy()alive_pest = dfs[('Pesticide',)].get_column("Alive Sperm Millions").to_numpy()# ECDF values for plottingctrl_ecdf = np.arange(1, len(alive_ctrl)+1) / len(alive_ctrl)pest_ecdf = np.arange(1, len(alive_pest)+1) / len(alive_pest)# Make 100 bootstrap samples and plot themfor _ in range(100): bs_ctrl = rng.choice(alive_ctrl, size=len(alive_ctrl)) bs_pest = rng.choice(alive_pest, size=len(alive_pest)) # Add semitransparent ECDFs to the plot p.scatter(np.sort(bs_ctrl), ctrl_ecdf, color=bokeh.palettes.Category10_3[0], alpha=0.02) p.scatter(np.sort(bs_pest), pest_ecdf, color=bokeh.palettes.Category10_3[1], alpha=0.02)bokeh.io.show(p)

From this graphical display, we can already see that the ECDFs do not strongly overlap in 100 bootstrap samples, so there is likely a real difference between the two treatments.

Speeding up sampling with Numba

We can make sampling faster (though note that the slowness in generating the above plot is not in resampling, but in adding glyphs to the plot) by employing just-in-time compilation, wherein the Python code is compiled at runtime into machine code. This results in a substantial speed boost. Numba is a powerful tool for this purpose, and we will use it. Bear in mind, though, that not all Python code is Numba-able. Random number generation is supported, but as ofNumba version 0.60, only the Mersenne Twister bit generator is thread safe, so we will use it in Numba’d code.

In order to just-in-time compile a function, we need to decorate its definition with the @numba.njit decorator, which tells the Python interpreter to use Numba to just-in-time compile the function. Note that Numba works on basic Python objects like tuples and Numpy arrays, so be sure to pass Numpy arrays into the respective functions.

[6]:
@numba.njitdef draw_bs_sample(data): """Draw a bootstrap sample from a 1D data set.""" return np.random.choice(data, size=len(data))

Let’s quickly quantify the speed boost. Before we do the timing, we will run the Numba’d version once to give it a chance to do the JIT compilation.

[7]:
# Run once to JITdraw_bs_sample(alive_ctrl)print('Using the Numpy default RNG (PCG64):')%timeit rng.choice(alive_ctrl, size=len(alive_ctrl))print('\nUsing the Numpy Mersenne Twister:')%timeit np.random.choice(alive_ctrl, size=len(alive_ctrl))print("\nUsing a Numba'd Mersenne Twister:")%timeit draw_bs_sample(alive_ctrl)
Using the Numpy default RNG (PCG64):19.2 μs ± 211 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)Using the Numpy Mersenne Twister:19.3 μs ± 157 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)Using a Numba'd Mersenne Twister:3.84 μs ± 69.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

That’s a factor of 6 or so increase. That can make a big difference when generating lots of bootstrap samples.

Bootstrap replicates and confidence intervals

We have plotted the ECDF of the data, which is instructive, but we would also like to get plug-in estimates for the generative distribution. Remember, when doing nonparametric plug-in estimates, we plug in the ECDF for the CDF. We do not need to specify what the distribution (described mathematically by the CDF, or equivalently by the PDF) is, just that we approximate it by the empirical distribution.

We have laid out the procedure to compute a confidence interval.

  1. Generate B independent bootstrap samples.

  2. Compute the plug-in estimate of the statistical functional of interest for each bootstrap sample to get the bootstrap replicates.

  3. The 100(1 – α) percent confidence interval consists of the percentiles 100 α/2 and 100(1 – α/2) of the bootstrap replicates.

A key step here is computing the bootstrap replicate. We will write a function to draw bootstrap replicates of a statistic of interest. To do so, we pass in a function, which we will call stat_fun. Because we do not know what function we will pass in, we cannot just-in-time compile this functions.

[8]:
def draw_bs_reps(data, stat_fun, size=1): """Draw boostrap replicates computed with stat_fun from 1D data set.""" return np.array([stat_fun(draw_bs_sample(data)) for _ in range(size)])

We will also write a few functions for commonly computed statistics, which enables us to use Numba to greatly speed up the process of generating bootstrap replicates. Note that in our decorator, we use the parallel=True keyword argument. Within the for loop to compute the bootstrap replicates, as use numba.prange() as a drop-in replacement for range(). When we do it this way, Numba automatically parallelizes the calculation across available CPUs.

[9]:
@numba.njit(parallel=True)def draw_bs_reps_mean(data, size=1): """Draw boostrap replicates of the mean from 1D data set.""" out = np.empty(size) for i in numba.prange(size): out[i] = np.mean(draw_bs_sample(data)) return out@numba.njit(parallel=True)def draw_bs_reps_median(data, size=1): """Draw boostrap replicates of the median from 1D data set.""" out = np.empty(size) for i in numba.prange(size): out[i] = np.median(draw_bs_sample(data)) return out@numba.njit(parallel=True)def draw_bs_reps_std(data, size=1): """Draw boostrap replicates of the standard deviation from 1D data set.""" out = np.empty(size) for i in numba.prange(size): out[i] = np.std(draw_bs_sample(data)) return out

Now, let’s get bootstrap replicates for the mean of each of the two treatments.

[10]:
bs_reps_mean_ctrl = draw_bs_reps_mean(alive_ctrl, size=10000)bs_reps_mean_pest = draw_bs_reps_mean(alive_pest, size=10000)

We can now compute the confidence intervals by computing the percentiles using the np.percentile() function.

[11]:
# 95% confidence intervalsmean_ctrl_conf_int = np.percentile(bs_reps_mean_ctrl, [2.5, 97.5])mean_pest_conf_int = np.percentile(bs_reps_mean_pest, [2.5, 97.5])print("""Mean alive sperm count 95% conf int control (millions): [{0:.2f}, {1:.2f}]Mean alive sperm count 95% conf int treatment (millions): [{2:.2f}, {3:.2f}]""".format(*(tuple(mean_ctrl_conf_int) + tuple(mean_pest_conf_int))))
Mean alive sperm count 95% conf int control (millions): [1.67, 2.07]Mean alive sperm count 95% conf int treatment (millions): [1.11, 1.49]

Display of confidence intervals

While the textual confidence intervals shown above are useful, it is often desired to display confidence intervals graphically. The bebi103 package has a utility to do this. It requires as input a list of dictionaries where each dictionary contains an estimate, a confidence interval, and a label.

[12]:
summaries = [ dict(label="control", estimate=np.mean(alive_ctrl), conf_int=mean_ctrl_conf_int), dict(label="treated", estimate=np.mean(alive_pest), conf_int=mean_pest_conf_int),]bokeh.io.show(bebi103.viz.confints(summaries, x_axis_label="alive sperm (millions)"))

ECDF of bootstrap replicates

We can also use the bootstrap replicates to plot the probability distribution of mean alive sperm count. Remember: This is not a confidence interval on a parameter value. That does not make sense in frequentist statistics, and furthermore we have not assumed any parametric model. It is the confidence interval describing what we would get as a plug-in estimate for the mean if we did the experiment over and over again.

When we plot the ECDF of the bootstrap replicates, we will thin them so as not to choke the browser with too many points. Since we will do this again, we write a quick function to do it.

[13]:
def plot_bs_reps_ecdf(ctrl, pest, q, thin=1, **kwargs): """Make plot of bootstrap ECDFs for control and pesticide treatment.""" x_ctrl = np.sort(ctrl)[::thin] x_pest = np.sort(pest)[::thin] df = pl.DataFrame( data={ "treatment": ["control"] * len(x_ctrl) + ["pesticide"] * len(x_pest), q: np.concatenate((x_ctrl, x_pest)), } ) p = iqplot.ecdf( df, q=q, cats="treatment", frame_height=200, frame_width=350, **kwargs ) return pp = plot_bs_reps_ecdf( bs_reps_mean_ctrl, bs_reps_mean_pest, "mean alive sperm (millions)", thin=50)bokeh.io.show(p)

These are both nice, Normal distributions with almost no overlap in the tails. We also saw in the computation of the 95% confidence intervals above that there is no overlap. This is expected, the mean alive sperm count should be Normally distributed as per the central limit theorem.

We can do the same procedure for other statistical quantities that do not follow the central limit theorem. The procedure is exactly the same. We will do it for the median.

[14]:
# Get the bootstrap replicatesbs_reps_median_ctrl = draw_bs_reps_median(alive_ctrl, size=10000)bs_reps_median_pest = draw_bs_reps_median(alive_pest, size=10000)# 95% confidence intervalsmedian_ctrl_conf_int = np.percentile(bs_reps_median_ctrl, [2.5, 97.5])median_pest_conf_int = np.percentile(bs_reps_median_pest, [2.5, 97.5])# Plot of confidence intervalsummaries = [ dict(label="control", estimate=np.median(alive_ctrl), conf_int=median_ctrl_conf_int), dict(label="treated", estimate=np.median(alive_pest), conf_int=median_pest_conf_int),]p_conf_int = bebi103.viz.confints(summaries, x_axis_label="alive sperm (millions)")# Plot ECDFs of bootstrap replicatesp_ecdf = plot_bs_reps_ecdf( bs_reps_median_ctrl, bs_reps_median_pest, "median alive sperm (millions)", thin=50)# Show both plotsbokeh.io.show(bokeh.layouts.column([p_conf_int, bokeh.models.Spacer(height=50), p]))

The results are similar, and we clearly see clear non-normality in the ECDFs.

Note that plotting ECDFs of bootstrap replicates is not a normal reporting mechanism for bootstrap confidence intervals. We show them here to illustrate how bootstrapping works.

Computing environment

[15]:
%load_ext watermark%watermark -v -p numpy,polars,numba,bokeh,iqplot,jupyterlab
Python implementation: CPythonPython version : 3.12.4IPython version : 8.25.0numpy : 1.26.4polars : 1.1.0numba : 0.59.1bokeh : 3.4.1iqplot : 0.3.7jupyterlab: 4.0.13
Performing bootstrap calculations — PoL workshop on statistical inference, part 1  documentation (2024)

References

Top Articles
Best horror movies of 2024 to stream on Netflix, Max, Hulu, Shudder and more
Iconic L.A. Horror Houses From Movies and TV To Visit - Wealth of Geeks
Foxy Roxxie Coomer
Diario Las Americas Rentas Hialeah
Brady Hughes Justified
80 For Brady Showtimes Near Marcus Point Cinema
Boomerang Media Group: Quality Media Solutions
Free Atm For Emerald Card Near Me
30 Insanely Useful Websites You Probably Don't Know About
How To Be A Reseller: Heather Hooks Is Hooked On Pickin’ - Seeking Connection: Life Is Like A Crossword Puzzle
Rondale Moore Or Gabe Davis
Is Sportsurge Safe and Legal in 2024? Any Alternatives?
Vanadium Conan Exiles
Tcu Jaggaer
Meritas Health Patient Portal
Kvta Ventura News
Fdny Business
Napa Autocare Locator
Buy Swap Sell Dirt Late Model
623-250-6295
Talbots.dayforce.com
Joann Ally Employee Portal
Moving Sales Craigslist
bode - Bode frequency response of dynamic system
Barber Gym Quantico Hours
Jenna Ortega’s Height, Age, Net Worth & Biography
Crossword Help - Find Missing Letters & Solve Clues
Hannah Palmer Listal
UCLA Study Abroad | International Education Office
Downtown Dispensary Promo Code
Weather Underground Durham
Shia Prayer Times Houston
Parent Management Training (PMT) Worksheet | HappierTHERAPY
Life Insurance Policies | New York Life
6465319333
Workboy Kennel
Phone number detective
Www.craigslist.com Syracuse Ny
Ukg Dimensions Urmc
Are you ready for some football? Zag Alum Justin Lange Forges Career in NFL
Kazwire
Boone County Sheriff 700 Report
Dollar Tree's 1,000 store closure tells the perils of poor acquisitions
St Anthony Hospital Crown Point Visiting Hours
Amc.santa Anita
Craigslist Rooms For Rent In San Fernando Valley
Bf273-11K-Cl
Erespassrider Ual
Sml Wikia
How To Connect To Rutgers Wifi
Predator revo radial owners
Elizabethtown Mesothelioma Legal Question
Latest Posts
Article information

Author: Dr. Pierre Goyette

Last Updated:

Views: 5630

Rating: 5 / 5 (50 voted)

Reviews: 81% of readers found this page helpful

Author information

Name: Dr. Pierre Goyette

Birthday: 1998-01-29

Address: Apt. 611 3357 Yong Plain, West Audra, IL 70053

Phone: +5819954278378

Job: Construction Director

Hobby: Embroidery, Creative writing, Shopping, Driving, Stand-up comedy, Coffee roasting, Scrapbooking

Introduction: My name is Dr. Pierre Goyette, I am a enchanting, powerful, jolly, rich, graceful, colorful, zany person who loves writing and wants to share my knowledge and understanding with you.