Set data when building LinearModel (#249)
* Create test of fitted parameter values

* Set data when creating model

* Clean up sample_prior_predictive

Since `build_model` now creates the model object and sets its data, data
only needs to be set separately when a model object already exists, and
there is no need to check for a model object before sampling (the
resulting control flow is sketched just after the commit message).

* Apply suggestions from code review

Co-authored-by: Ricardo Vieira <[email protected]>

* Add join="right" to extend calls (see the note after the model_builder.py diff)

* Update test_parameter_fit based on comments from PR review

Use np.testing.assert_allclose instead of pytest.approx (illustrated
after the test_linearmodel.py diff).

Make `fitted_linear_model_instance` use fewer samples for fitting, and do
a separate model and fit for checking parameter recovery.

* Add tests for extend_idata parameters

* Update based on code review comments

Make copies of `fitted_model_instance` to keep tests from interfering
with each other.

Combine `test_sample_prior_extend_idata_param` and
`test_sample_posterior_extend_idata_param` to reduce code repetition.

* Work around `deepcopy` not working for model objects (see the note after the test_model_builder.py diff)

* Mark correct test for skipping on win32

---------

Co-authored-by: Ricardo Vieira <[email protected]>
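
As referenced in the cleanup note above, this is the control flow the commit leaves in `ModelBuilder.sample_prior_predictive`, condensed from the model_builder.py diff below (an excerpt of the method body, not the complete method):

```python
if self.model is None:
    # build_model both creates the model object and sets its data
    self.build_model(X_pred, y_pred)
else:
    # only a pre-existing model needs new data swapped in
    self._data_setter(X_pred, y_pred)
with self.model:  # a model is guaranteed to exist here, so no check is needed
    prior_pred = pm.sample_prior_predictive(samples, **kwargs)
```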
pdb5627 and ricardoV94 authored Nov 13, 2023
1 parent 6d8f569 commit a362ff0
Showing 4 changed files with 119 additions and 33 deletions.
pymc_experimental/linearmodel.py (4 changes: 2 additions & 2 deletions)

@@ -67,8 +67,6 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
         """
         cfg = self.model_config
 
-        # The model is built with placeholder data.
-        # Actual data will be set by _data_setter when fitting or evaluating the model.
         # Data array size can change but number of dimensions must stay the same.
         with pm.Model() as self.model:
             x = pm.MutableData("x", np.zeros((1,)), dims="observation")
@@ -94,6 +92,8 @@ def build_model(self, X: pd.DataFrame, y: pd.Series):
                 dims="observation",
             )
 
+        self._data_setter(X, y)
+
     def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
         with self.model:
             pm.set_data({"x": X.squeeze()})
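
For context on why setting data at build time is safe: pm.MutableData declares a named, resizable container, and pm.set_data (which `_data_setter` uses, as shown above) can later swap in an array of a different length along the same dims. A minimal standalone sketch of that pattern (a toy model, not the LinearModel code; assumes PyMC 5.x, where pm.MutableData is available):

```python
import numpy as np
import pymc as pm

with pm.Model() as model:
    # named, resizable data container with a single labeled dimension
    x = pm.MutableData("x", np.zeros((1,)), dims="observation")
    intercept = pm.Normal("intercept")
    slope = pm.Normal("slope")
    pm.Normal("y", mu=intercept + slope * x, sigma=1.0, dims="observation")

# later: replace the placeholder with real data of a different length;
# the array can grow or shrink, but its number of dimensions must not change
with model:
    pm.set_data({"x": np.linspace(0.0, 1.0, 100)})
    idata = pm.sample_prior_predictive(10)
```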
pymc_experimental/model_builder.py (29 changes: 14 additions & 15 deletions)

@@ -299,8 +299,8 @@ def sample_model(self, **kwargs):
         with self.model:
             sampler_args = {**self.sampler_config, **kwargs}
             idata = pm.sample(**sampler_args)
-            idata.extend(pm.sample_prior_predictive())
-            idata.extend(pm.sample_posterior_predictive(idata))
+            idata.extend(pm.sample_prior_predictive(), join="right")
+            idata.extend(pm.sample_posterior_predictive(idata), join="right")
 
         idata = self.set_idata_attrs(idata)
         return idata
@@ -595,23 +595,22 @@ def sample_prior_predictive(
             Prior predictive samples for each input X_pred
         """
         if y_pred is None:
-            y_pred = np.zeros(len(X_pred))
+            y_pred = pd.Series(np.zeros(len(X_pred)), name=self.output_var)
         if samples is None:
             samples = self.sampler_config.get("draws", 500)
 
         if self.model is None:
             self.build_model(X_pred, y_pred)
-
-        self._data_setter(X_pred, y_pred)
-        if self.model is not None:
-            with self.model:  # sample with new input data
-                prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
-            self.set_idata_attrs(prior_pred)
-            if extend_idata:
-                if self.idata is not None:
-                    self.idata.extend(prior_pred)
-                else:
-                    self.idata = prior_pred
+        else:
+            self._data_setter(X_pred, y_pred)
+        with self.model:  # sample with new input data
+            prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
+        self.set_idata_attrs(prior_pred)
+        if extend_idata:
+            if self.idata is not None:
+                self.idata.extend(prior_pred, join="right")
+            else:
+                self.idata = prior_pred
 
         prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
 
@@ -641,7 +640,7 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined, **kwargs):
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata, **kwargs)
         if extend_idata:
-            self.idata.extend(post_pred)
+            self.idata.extend(post_pred, join="right")
 
         posterior_predictive_samples = az.extract(
             post_pred, "posterior_predictive", combined=combined
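
A note on the join="right" additions: with the default join="left", InferenceData.extend keeps the group already present on the target when both objects contain a group of the same name, so re-running predictive sampling would silently leave stale samples in idata. join="right" lets the incoming group win. A small self-contained illustration of the semantics (toy arrays shaped (chain, draw, obs), not the model's output):

```python
import arviz as az
import numpy as np

old = az.from_dict(prior_predictive={"y": np.zeros((1, 10, 3))})
new = az.from_dict(prior_predictive={"y": np.ones((1, 10, 3))})

old.extend(new)  # join="left" (default): the existing zeros are kept
assert float(old.prior_predictive["y"].sum()) == 0.0

old.extend(new, join="right")  # incoming group replaces the stale one
assert float(old.prior_predictive["y"].sum()) == 30.0
```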
pymc_experimental/tests/test_linearmodel.py (39 changes: 33 additions & 6 deletions)

@@ -36,6 +36,15 @@
     sklearn_available = False
 
 
+@pytest.fixture(scope="module")
+def toy_actual_params():
+    return {
+        "intercept": 3,
+        "slope": 5,
+        "obs_error": 0.5,
+    }
+
+
 @pytest.fixture(scope="module")
 def toy_X():
     x = np.linspace(start=0, stop=1, num=100)
@@ -44,23 +53,24 @@ def toy_X():
 
 
 @pytest.fixture(scope="module")
-def toy_y(toy_X):
-    y = 5 * toy_X["input"] + 3
-    y = y + np.random.normal(0, 1, size=len(toy_X))
+def toy_y(toy_X, toy_actual_params):
+    y = toy_actual_params["slope"] * toy_X["input"] + toy_actual_params["intercept"]
+    rng = np.random.default_rng(427)
+    y = y + rng.normal(0, toy_actual_params["obs_error"], size=len(toy_X))
     y = pd.Series(y, name="output")
     return y
 
 
 @pytest.fixture(scope="module")
 def fitted_linear_model_instance(toy_X, toy_y):
     sampler_config = {
-        "draws": 500,
-        "tune": 300,
+        "draws": 50,
+        "tune": 30,
         "chains": 2,
         "target_accept": 0.95,
     }
     model = LinearModel(sampler_config=sampler_config)
-    model.fit(toy_X, toy_y)
+    model.fit(toy_X, toy_y, random_seed=312)
     return model
 
 
@@ -101,6 +111,23 @@ def test_fit(fitted_linear_model_instance):
     assert isinstance(post_pred, xr.DataArray)
 
 
+def test_parameter_fit(toy_X, toy_y, toy_actual_params):
+    """Check that the fit model recovered the data-generating parameters."""
+    # Fit the model with a sufficient number of samples
+    sampler_config = {
+        "draws": 500,
+        "tune": 300,
+        "chains": 2,
+        "target_accept": 0.95,
+    }
+    model = LinearModel(sampler_config=sampler_config)
+    model.fit(toy_X, toy_y, random_seed=312)
+    fit_params = model.idata.posterior.mean()
+    np.testing.assert_allclose(fit_params["intercept"], toy_actual_params["intercept"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["slope"], toy_actual_params["slope"], rtol=0.1)
+    np.testing.assert_allclose(fit_params["σ_model_fmc"], toy_actual_params["obs_error"], rtol=0.1)
+
+
 def test_predict(fitted_linear_model_instance):
     model = fitted_linear_model_instance
     X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})
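
On the switch to np.testing.assert_allclose: it compares elementwise and prints a detailed mismatch report on failure, and with the default atol=0 the check is |actual - desired| <= rtol * |desired|. A quick illustration of the 10% relative tolerance used above (toy numbers, not the test's values):

```python
import numpy as np

np.testing.assert_allclose(3.2, 3.0, rtol=0.1)  # |3.2 - 3.0| = 0.2 <= 0.3, passes

try:
    np.testing.assert_allclose(3.5, 3.0, rtol=0.1)  # 0.5 > 0.3
except AssertionError:
    print("outside the 10% relative tolerance")
```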
pymc_experimental/tests/test_model_builder.py (80 changes: 70 additions & 10 deletions)

@@ -41,11 +41,13 @@ def toy_y(toy_X):
     return y
 
 
-@pytest.fixture(scope="module")
-def fitted_model_instance(toy_X, toy_y):
+def get_unfitted_model_instance(X, y):
+    """Creates an unfitted model instance to which idata can be copied in
+    and then used as a fitted model instance. That way a fitted model
+    can be used multiple times without having to run `fit` multiple times."""
     sampler_config = {
-        "draws": 100,
-        "tune": 100,
+        "draws": 20,
+        "tune": 10,
         "chains": 2,
         "target_accept": 0.95,
     }
@@ -57,7 +59,30 @@ def fitted_model_instance(toy_X, toy_y):
     model = test_ModelBuilder(
         model_config=model_config, sampler_config=sampler_config, test_parameter="test_paramter"
     )
-    model.fit(toy_X)
+    # Do the things that `model.fit` does except sample to create idata.
+    model._generate_and_preprocess_model_data(X, y.values.flatten())
+    model.build_model(X, y)
+    return model
+
+
+@pytest.fixture(scope="module")
+def fitted_model_instance_base(toy_X, toy_y):
+    """Because fitting takes a relatively long time, this is intended to
+    be used only once and then have new instances created and fit data patched in
+    for tests that use a fitted model instance. Tests should use
+    `fitted_model_instance` instead of this."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.fit(toy_X, toy_y)
+    return model
+
+
+@pytest.fixture
+def fitted_model_instance(toy_X, toy_y, fitted_model_instance_base):
+    """Get a fitted model instance. A new instance is created and fit data is
+    patched in, so tests using this fixture can modify the model object without
+    affecting other tests."""
+    model = get_unfitted_model_instance(toy_X, toy_y)
+    model.idata = fitted_model_instance_base.idata.copy()
     return model
 
 
@@ -131,8 +156,8 @@ def _generate_and_preprocess_model_data(
     @staticmethod
     def get_default_sampler_config() -> Dict:
         return {
-            "draws": 1_000,
-            "tune": 1_000,
+            "draws": 10,
+            "tune": 10,
            "chains": 3,
             "target_accept": 0.95,
         }
@@ -142,6 +167,9 @@ def test_save_input_params(fitted_model_instance):
     assert fitted_model_instance.idata.attrs["test_paramter"] == '"test_paramter"'
 
 
+@pytest.mark.skipif(
+    sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
+)
 def test_save_load(fitted_model_instance):
     temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
     fitted_model_instance.save(temp.name)
@@ -193,9 +221,6 @@ def test_fit_no_y(toy_X):
     assert "posterior" in model_builder.idata.groups()
 
 
-@pytest.mark.skipif(
-    sys.platform == "win32", reason="Permissions for temp files not granted on windows CI."
-)
 def test_predict(fitted_model_instance):
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
@@ -220,6 +245,41 @@ def test_sample_posterior_predictive(fitted_model_instance, combined):
     assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating)
 
 
+@pytest.mark.parametrize("group", ["prior_predictive", "posterior_predictive"])
+@pytest.mark.parametrize("extend_idata", [True, False])
+def test_sample_xxx_extend_idata_param(fitted_model_instance, group, extend_idata):
+    output_var = fitted_model_instance.output_var
+    idata_prev = fitted_model_instance.idata[group][output_var]
+
+    # Since coordinates are provided, the dimension must match
+    n_pred = 100  # Must match toy_x
+    x_pred = np.random.uniform(0, 1, n_pred)
+
+    prediction_data = pd.DataFrame({"input": x_pred})
+    if group == "prior_predictive":
+        prediction_method = fitted_model_instance.sample_prior_predictive
+    else:  # group == "posterior_predictive":
+        prediction_method = fitted_model_instance.sample_posterior_predictive
+
+    pred = prediction_method(prediction_data["input"], combined=False, extend_idata=extend_idata)
+
+    pred_unstacked = pred[output_var].values
+    idata_now = fitted_model_instance.idata[group][output_var].values
+
+    if extend_idata:
+        # After sampling, data in the model should be the same as the predictions
+        np.testing.assert_array_equal(idata_now, pred_unstacked)
+        # Data in the model should NOT be the same as before
+        if idata_now.shape == idata_prev.values.shape:
+            assert np.sum(np.abs(idata_now - idata_prev.values) < 1e-5) <= 2
+    else:
+        # After sampling, data in the model should be the same as it was before
+        np.testing.assert_array_equal(idata_now, idata_prev.values)
+        # Data in the model should NOT be the same as the predictions
+        if idata_now.shape == pred_unstacked.shape:
+            assert np.sum(np.abs(idata_now - pred_unstacked) < 1e-5) <= 2
+
+
 def test_model_config_formatting():
     model_config = {
         "a": {
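
The last two commit-message items come together in the fixtures above: because `deepcopy` reportedly does not work for the model objects, each test receives a freshly built, unfitted instance with only the fitted InferenceData copied across. A self-contained toy version of that fixture pattern (a stand-in class instead of ModelBuilder; all names here are hypothetical):

```python
import copy

import pytest


class SlowModel:
    """Stand-in for a model whose fit() is expensive and which can't be deepcopied."""

    def __init__(self):
        self.idata = None

    def fit(self):
        self.idata = {"posterior": [1.0, 2.0]}  # pretend this took minutes


@pytest.fixture(scope="module")
def fitted_base():
    model = SlowModel()
    model.fit()  # runs once for the whole module
    return model


@pytest.fixture
def fitted(fitted_base):
    # cheap rebuild per test; only the fit results are copied over, so a test
    # that mutates `fitted` cannot interfere with other tests
    model = SlowModel()
    model.idata = copy.deepcopy(fitted_base.idata)
    return model


def test_isolated_mutation(fitted, fitted_base):
    fitted.idata["posterior"].append(3.0)
    assert fitted_base.idata["posterior"] == [1.0, 2.0]
```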
