Bootstrap Tutorial

This notebook contains a tutorial on how to use the bootstrap functionality provided by estimagic. We start with the simplest possible example of calculating standard errors and confidence intervals for an OLS estimator without as well as with clustering. Then we progress to more advanced examples.

In the example here, we will work with the “exercise” example dataset taken from the seaborn library.

The working example will be a linear regression to investigate the effects of exercise time on pulse.

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

import estimagic as em
Matplotlib is building the font cache; this may take a moment.

Prepare the dataset

df = sns.load_dataset("exercise", index_col=0)
replacements = {"1 min": 1, "15 min": 15, "30 min": 30}
df = df.replace({"time": replacements})
df["constant"] = 1

df.head()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[2], line 3
      1 df = sns.load_dataset("exercise", index_col=0)
      2 replacements = {"1 min": 1, "15 min": 15, "30 min": 30}
----> 3 df = df.replace({"time": replacements})
      4 df["constant"] = 1
      6 df.head()

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/generic.py:7764, in NDFrame.replace(self, to_replace, value, inplace, regex)
   7761     else:
   7762         to_replace, value = keys, values
-> 7764     return self.replace(to_replace, value, inplace=inplace, regex=regex)
   7765 else:
   7766     # need a non-zero len on all axes
   7767     if not self.size:

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/generic.py:7783, in NDFrame.replace(self, to_replace, value, inplace, regex)
   7776     # Note: Checking below for `in foo.keys()` instead of
   7777     #  `in foo` is needed for when we have a Series and not dict
   7778     mapping = {
   7779         col: (to_replace[col], value[col])
   7780         for col in to_replace.keys()
   7781         if col in value.keys() and col in self
   7782     }
-> 7783     return self._replace_columnwise(mapping, inplace, regex)
   7785 # {'A': NA} -> 0
   7786 elif not is_list_like(value):
   7787     # Operate column-wise

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/frame.py:6560, in DataFrame._replace_columnwise(self, mapping, inplace, regex)
   6557         ser = self.iloc[:, i]
   6559         target, value = mapping[ax_value]
-> 6560         newobj = ser.replace(target, value, regex=regex)
   6562         res._iset_item(i, newobj, inplace=inplace)
   6564 return res if inplace else res.__finalize__(self)

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/generic.py:7812, in NDFrame.replace(self, to_replace, value, inplace, regex)
   7807     if len(to_replace) != len(value):
   7808         raise ValueError(
   7809             f"Replacement lists must match in length. "
   7810             f"Expecting {len(to_replace)} got {len(value)} "
   7811         )
-> 7812     new_data = self._mgr.replace_list(
   7813         src_list=to_replace,
   7814         dest_list=value,
   7815         inplace=inplace,
   7816         regex=regex,
   7817     )
   7819 elif to_replace is None:
   7820     if not (
   7821         is_re_compilable(regex)
   7822         or is_list_like(regex)
   7823         or is_dict_like(regex)
   7824     ):

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/internals/managers.py:527, in BaseBlockManager.replace_list(self, src_list, dest_list, inplace, regex)
    524 """do a list replace"""
    525 inplace = validate_bool_kwarg(inplace, "inplace")
--> 527 bm = self.apply(
    528     "replace_list",
    529     src_list=src_list,
    530     dest_list=dest_list,
    531     inplace=inplace,
    532     regex=regex,
    533 )
    534 bm._consolidate_inplace()
    535 return bm

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/internals/managers.py:442, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    440         applied = b.apply(f, **kwargs)
    441     else:
--> 442         applied = getattr(b, f)(**kwargs)
    443     result_blocks = extend_blocks(applied, result_blocks)
    445 out = type(self).from_blocks(result_blocks, [ax.view() for ax in self.axes])

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/internals/blocks.py:852, in Block.replace_list(self, src_list, dest_list, inplace, regex)
    847     m = mib[blk_num : blk_num + 1]
    849 # error: Argument "mask" to "_replace_coerce" of "Block" has
    850 # incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]";
    851 # expected "ndarray[Any, dtype[bool_]]"
--> 852 result = blk._replace_coerce(
    853     to_replace=src,
    854     value=dest,
    855     mask=m,
    856     inplace=inplace,
    857     regex=regex,
    858 )
    860 if i != src_len:
    861     # This is ugly, but we have to get rid of intermediate refs. We
    862     # can simply clear the referenced_blocks if we already copied,
    863     # otherwise we have to remove ourselves
    864     self_blk_ids = {
    865         id(b()): i for i, b in enumerate(self.refs.referenced_blocks)
    866     }

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/internals/blocks.py:932, in Block._replace_coerce(self, to_replace, value, mask, inplace, regex)
    930         return [nb]
    931     return [self.copy(deep=False)]
--> 932 return self.replace(
    933     to_replace=to_replace,
    934     value=value,
    935     inplace=inplace,
    936     mask=mask,
    937 )

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/internals/blocks.py:711, in Block.replace(self, to_replace, value, inplace, mask)
    707 elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)):
    708     # TODO(CoW): Maybe split here as well into columns where mask has True
    709     # and rest?
    710     blk = self._maybe_copy(inplace)
--> 711     putmask_inplace(blk.values, mask, value)
    712     return [blk]
    714 elif self.ndim == 1 or self.shape[0] == 1:

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/array_algos/putmask.py:57, in putmask_inplace(values, mask, value)
     55         values[mask] = value[mask]
     56     else:
---> 57         values[mask] = value
     58 else:
     59     # GH#37833 np.putmask is more performant than __setitem__
     60     np.putmask(values, mask, value)

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/arrays/_mixins.py:272, in NDArrayBackedExtensionArray.__setitem__(self, key, value)
    269     raise ValueError("Cannot modify read-only array")
    271 key = check_array_indexer(self, key)
--> 272 value = self._validate_setitem_value(value)
    273 self._ndarray[key] = value

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/arrays/categorical.py:1653, in Categorical._validate_setitem_value(self, value)
   1651     return self._validate_listlike(value)
   1652 else:
-> 1653     return self._validate_scalar(value)

File ~/checkouts/readthedocs.org/user_builds/estimagic/conda/stable/lib/python3.12/site-packages/pandas/core/arrays/categorical.py:1678, in Categorical._validate_scalar(self, fill_value)
   1676     fill_value = self._unbox_scalar(fill_value)
   1677 else:
-> 1678     raise TypeError(
   1679         "Cannot setitem on a Categorical with a new "
   1680         f"category ({fill_value}), set the categories first"
   1681     ) from None
   1682 return fill_value

TypeError: Cannot setitem on a Categorical with a new category (1), set the categories first

Doing a very simple bootstrap

The first thing we need is a function that calculates the bootstrap outcome, given an empirical or re-sampled dataset. The bootstrap outcome is the quantity for which you want to calculate standard errors and confidence intervals. In most applications those are just parameter estimates.

In our case, we want to regress “pulse” on “time” and a constant. Our outcome function looks as follows:

def ols_fit(data):
    y = data["pulse"]
    x = data[["constant", "time"]]
    params = sm.OLS(y, x).fit().params

    return params

In general, the user-specified outcome function may return any pytree (e.g. numpy.ndarray, pandas.DataFrame, dict etc.). In the example here, it returns a pandas.Series.

Now we are ready to calculate confidence intervals and standard errors.

results_without_cluster = em.bootstrap(data=df, outcome=ols_fit)
results_without_cluster.ci()
results_without_cluster.se()

The above function call represents the minimum that a user has to specify, making full use of the default options, such as drawing a 1_000 bootstrap draws, using the “percentile” bootstrap confidence interval, not making use of parallelization, etc.

If, for example, we wanted to take 10_000 draws, while parallelizing on two cores, and using a “bc” type confidence interval, we would simply call the following:

results_without_cluster2 = em.bootstrap(
    data=df, outcome=ols_fit, n_draws=10_000, n_cores=2
)

results_without_cluster2.ci(ci_method="bc")

Doing a clustered bootstrap

In the cluster robust variant of the bootstrap, the original dataset is divided into clusters according to the values of some user-specified variable, and then clusters are drawn uniformly with replacement in order to create the different bootstrap samples.

In order to use the cluster robust boostrap, we simply specify which variable to cluster by. In the example we are working with, it seems sensible to cluster on individuals, i.e. on the column “id” of our dataset.

results_with_cluster = em.bootstrap(data=df, outcome=ols_fit, cluster_by="id")

results_with_cluster.se()

We can see that the estimated standard errors are indeed of smaller magnitude when we use the cluster robust bootstrap.

Finally, we can compare our bootstrap results to a regression on the full sample using statsmodels’ OLS function. We see that the cluster robust bootstrap yields standard error estimates very close to the ones of the cluster robust regression, while the regular bootstrap seems to overestimate the standard errors of both coefficients.

Note: We would not expect the asymptotic statsmodels standard errors to be exactly the same as the bootstrapped standard errors.

y = df["pulse"]
x = df[["constant", "time"]]


cluster_robust_ols = sm.OLS(y, x).fit(cov_type="cluster", cov_kwds={"groups": df["id"]})

Splitting up the process

In many situations, the above procedure is enough. However, sometimes it may be important to split the bootstrapping process up into smaller steps. Examples for such situations are:

  1. You want to look at the bootstrap estimates

  2. You want to do a bootstrap with a low number of draws first and add more draws later without duplicated calculations

  3. You have more bootstrap outcomes than just the parameters

1. Accessing bootstrap outcomes

The bootstrap outcomes are stored in the results object you get back when calling the bootstrap function.

result = em.bootstrap(data=df, outcome=ols_fit, seed=1234)
my_outcomes = result.outcomes

my_outcomes[:5]

To further compare the cluster bootstrap to the uniform bootstrap, let’s plot the sampling distribution of the parameters on time. We can again see that the standard error is smaller when we cluster on the subject id.

result_clustered = em.bootstrap(data=df, outcome=ols_fit, seed=1234, cluster_by="id")
my_outcomes_clustered = result_clustered.outcomes
# clustered distribution in blue
sns.histplot(
    pd.DataFrame(my_outcomes_clustered)["time"], kde=True, stat="density", linewidth=0
)

# non-clustered distribution in orange
sns.histplot(
    pd.DataFrame(my_outcomes)["time"],
    kde=True,
    stat="density",
    linewidth=0,
    color="orange",
);

Calculating standard errors and confidence intervals from existing bootstrap result

If you’ve already run bootstrap once, you can simply pass the existing result object to a new call of bootstrap. Estimagic reuses the existing bootstrap outcomes and now only draws n_draws - n_existing outcomes instead of drawing entirely new n_draws. Depending on the n_draws you specified (this is set to 1_000 by default), this may save considerable computation time.

We can go on and compute confidence intervals and standard errors, just the same way as before, with several methods (e.g. “percentile” and “bc”), yet without duplicated evaluations of the bootstrap outcome function.

my_results = em.bootstrap(
    data=df,
    outcome=ols_fit,
    existing_result=result,
)
my_results.ci(ci_method="t")

You can use this to calculate confidence intervals with several methods (e.g. “percentile” and “bc”) without duplicated evaluations of the bootstrap outcome function.

2. Extending bootstrap results with more draws

It is often the case that, for speed reasons, you set the number of bootstrap draws quite low, so you can look at the results earlier and later decide that you need more draws.

As an example, we will take an initial sample of 500 draws. We then extend it with another 1500 draws.

Note: It is very important to use a different random seed when you calculate the additional outcomes!!!

initial_result = em.bootstrap(data=df, outcome=ols_fit, seed=5471, n_draws=500)
initial_result.ci(ci_method="t")
combined_result = em.bootstrap(
    data=df, outcome=ols_fit, existing_result=initial_result, seed=2365, n_draws=2000
)
combined_result.ci(ci_method="t")

3. Using less draws than totally available bootstrap outcomes

You have a large sample of bootstrap outcomes but want to compute summary statistics only on a subset? No problem! Estimagic got you covered. You can simply pass any number of n_draws to your next call of bootstrap, regardless of the size of the existing sample you want to use. We already covered the case where n_draws > n_existing above, in which case estimagic draws the remaining bootstrap outcomes for you.

If n_draws <= n_existing, estimagic takes a random subset of the existing outcomes - and voilà!

subset_result = em.bootstrap(
    data=df, outcome=ols_fit, existing_result=combined_result, seed=4632, n_draws=500
)
subset_result.ci(ci_method="t")

Accessing the bootstrap samples

It is also possible to just access the bootstrap samples. You may do so, for example, if you want to calculate your bootstrap outcomes in parallel in a way that is not yet supported by estimagic (e.g. on a large cluster or super-computer).

from estimagic.bootstrap_samples import get_bootstrap_samples

rng = np.random.default_rng(1234)
my_samples = get_bootstrap_samples(data=df, rng=rng)
my_samples[0]