Latent Component Gaussian Process (LCGP): replicated 1D illustration#

The experiment compares LCGP behavior under three replication designs and two training modes:

| Axis | Options |
| --- | --- |
| Replication design | uniform, skewed, hotspot |
| Training mode | replicated-data reduction (`rep`) and full-data training (`full`) |
| Figure type | output predictions (`y`) and latent GP diagnostics (`g`) |

The notebook is deterministic: each (case, submethod) run receives its own fixed random seed.

Execution requirements#

This page expects the following packages to be available in the JupyterBook build environment already, or to be installed at runtime:

  lcgp
  pandas
  matplotlib
  tensorflow-probability[tf]

Imports and global configuration#

Matplotlib is building the font cache; this may take a moment.

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 12
      8 import pandas as pd
      9 import matplotlib.pyplot as plt
     10 from IPython.display import Markdown, display
     11 
---> 12 from call_model import LCGPRun
     13 from lcgp import evaluation
     14 
     15 plt.rcParams.update({

File ~/checkouts/readthedocs.org/user_builds/lcgp/checkouts/lcgp-r/docs/call_model.py:1
----> 1 from lcgp import LCGP
      2 import numpy as np
      5 class SuperRun:

ModuleNotFoundError: No module named 'lcgp'

# Experiment grid: this JupyterBook page executes every combination of the
# options below.
CASES = (1, 2, 3)              # replication-design identifiers
SUBMETHODS = ("rep", "full")   # training modes
PLOT_MODES = ("y", "g")        # figure types

# Human-readable description of each replication design, keyed by case id.
CASE_LABELS = dict(
    zip(
        CASES,
        (
            "Uniform replication",
            "Skewed replication in [0.20, 0.45]",
            "Hotspot replication at selected x locations",
        ),
    )
)

# Seed used by the data generators when no generator is supplied.
BASE_SEED = 42

# Output directory for this page; created eagerly so later cells can write
# into it without checking for its existence.
RESULTS_ROOT = Path("results_figure_jupyterbook")
RESULTS_ROOT.mkdir(parents=True, exist_ok=True)

True function#

The input is one-dimensional, \(x \in [0,1]\), while the response has three output dimensions:

\[ y(x) = \begin{bmatrix} f_1(x) \\ f_2(x) \\ f_3(x) \end{bmatrix}. \]

The three outputs share the same scalar input but have different shapes. This gives a compact multi-output regression problem where replication can affect both mean estimation and uncertainty quantification.

Replicated data generators#

Show code cell source

Hide code cell source

def make_rep_data(
    n_unique: int = 12,
    rep_choices: tuple[int, ...] = (1, 2, 3, 4),
    noise_std: tuple[float, float, float] = (0.05, 0.08, 0.10),
    rng: np.random.Generator | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Case 1: replicated data with similar replication everywhere on [0, 1].

    Each of the ``n_unique`` evenly spaced inputs receives a replicate count
    sampled with replacement from ``rep_choices``. Every replicate is the
    noiseless ``f_true`` value at that input plus zero-mean Gaussian noise
    whose per-output standard deviation is given by ``noise_std``.

    Args:
        n_unique: Number of distinct input locations on the grid.
        rep_choices: Candidate replicate counts for a single location.
        noise_std: Noise standard deviation for each of the three outputs.
        rng: Random generator to draw from; when omitted, a fresh generator
            seeded with ``BASE_SEED`` is used.

    Returns:
        ``(xtrain, ytrain, xtest, ytrue)`` where ``xtrain`` stacks one
        single-element row per observation, ``ytrain`` is transposed so that
        observations run along the last axis, ``xtest`` is a 400-point grid
        on [0, 1] stored as a column, and ``ytrue`` is ``f_true`` evaluated
        on that grid (layout as returned by ``f_true`` -- presumably
        outputs-by-points; confirm against its definition).
    """
    if rng is None:
        rng = np.random.default_rng(BASE_SEED)

    grid = np.linspace(0.0, 1.0, n_unique, dtype=np.float64)
    # All replicate counts are drawn in one call, before any noise is drawn.
    rep_counts = rng.choice(rep_choices, size=n_unique, replace=True)

    inputs, outputs = [], []
    for location, count in zip(grid, rep_counts):
        noiseless = f_true(np.array([location]))[:, 0]
        for _ in range(int(count)):
            # A single vectorised draw covers the three outputs.
            noise = rng.normal(0, noise_std, size=3).astype(np.float64)
            inputs.append([location])
            outputs.append(noiseless + noise)

    test_grid = np.linspace(0.0, 1.0, 400, dtype=np.float64)
    xtrain = np.asarray(inputs, dtype=np.float64)
    ytrain = np.asarray(outputs, dtype=np.float64).T
    xtest = test_grid[:, None]
    ytrue = f_true(xtest[:, 0])
    return xtrain, ytrain, xtest, ytrue


def make_rep_data_skewed(
    n_unique: int = 40,
    heavy_region: tuple[float, float] = (0.20, 0.45),
    light_rep_choices: tuple[int, ...] = (1, 2),
    heavy_rep_choices: tuple[int, ...] = (8, 12, 16, 20),
    noise_std: tuple[float, float, float] = (0.05, 0.08, 0.10),
    rng: np.random.Generator | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Case 2: replicated data with heavy replication in one contiguous band.

    Inputs lie on an evenly spaced grid of ``n_unique`` points in [0, 1].
    A location inside the closed interval ``heavy_region`` draws its
    replicate count from ``heavy_rep_choices``; every other location draws
    from ``light_rep_choices``. Each replicate is the noiseless ``f_true``
    value plus zero-mean Gaussian noise, drawn one output at a time with the
    matching entry of ``noise_std``.

    Args:
        n_unique: Number of distinct input locations on the grid.
        heavy_region: ``(lower, upper)`` bounds, both inclusive, of the
            heavily replicated band.
        light_rep_choices: Candidate replicate counts outside the band.
        heavy_rep_choices: Candidate replicate counts inside the band.
        noise_std: Noise standard deviation for each output.
        rng: Random generator to draw from; when omitted, a fresh generator
            seeded with ``BASE_SEED`` is used.

    Returns:
        ``(xtrain, ytrain, xtest, ytrue)`` where ``xtrain`` stacks one
        single-element row per observation, ``ytrain`` is transposed so that
        observations run along the last axis, ``xtest`` is a 400-point grid
        on [0, 1] stored as a column, and ``ytrue`` is ``f_true`` evaluated
        on that grid (layout as returned by ``f_true`` -- presumably
        outputs-by-points; confirm against its definition).
    """
    if rng is None:
        rng = np.random.default_rng(BASE_SEED)

    def draw_noise() -> np.ndarray:
        # One scalar draw per output, taken in output order.
        samples = [rng.normal(0, sigma) for sigma in noise_std]
        return np.asarray(samples, dtype=np.float64)

    inputs, outputs = [], []
    for location in np.linspace(0.0, 1.0, n_unique, dtype=np.float64):
        if heavy_region[0] <= location <= heavy_region[1]:
            pool = heavy_rep_choices
        else:
            pool = light_rep_choices
        n_reps = int(rng.choice(pool))

        noiseless = f_true(np.array([location]))[:, 0]
        for _ in range(n_reps):
            inputs.append([location])
            outputs.append(noiseless + draw_noise())

    test_grid = np.linspace(0.0, 1.0, 400, dtype=np.float64)
    xtrain = np.asarray(inputs, dtype=np.float64)
    ytrain = np.asarray(outputs, dtype=np.float64).T
    xtest = test_grid[:, None]
    ytrue = f_true(xtest[:, 0])
    return xtrain, ytrain, xtest, ytrue


def make_rep_data_hotspots(
    n_unique: int = 50,
    hotspots: tuple[tuple[float, int, int], ...] = ((0.15, 10, 15), (0.50, 18, 25), (0.80, 12, 20)),
    base_rep_choices: tuple[int, ...] = (1,),
    noise_std: tuple[float, float, float] = (0.05, 0.08, 0.10),
    rng: np.random.Generator | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Case 3: replicated data with a few very heavily replicated locations.

    Inputs lie on an evenly spaced grid of ``n_unique`` points in [0, 1].
    Each ``(x0, lo, hi)`` entry of ``hotspots`` is attached to the grid point
    nearest to ``x0``; that point receives a replicate count drawn uniformly
    from the integers ``lo`` to ``hi`` inclusive. All remaining points draw
    their count from ``base_rep_choices``. Each replicate is the noiseless
    ``f_true`` value plus zero-mean Gaussian noise, drawn one output at a
    time with the matching entry of ``noise_std``.

    Args:
        n_unique: Number of distinct input locations on the grid.
        hotspots: ``(x0, lo, hi)`` triples giving a target location and the
            inclusive range of its replicate count. If two targets snap to
            the same grid point, the later entry wins.
        base_rep_choices: Candidate replicate counts for non-hotspot points.
        noise_std: Noise standard deviation for each output.
        rng: Random generator to draw from; when omitted, a fresh generator
            seeded with ``BASE_SEED`` is used.

    Returns:
        ``(xtrain, ytrain, xtest, ytrue)`` where ``xtrain`` stacks one
        single-element row per observation, ``ytrain`` is transposed so that
        observations run along the last axis, ``xtest`` is a 400-point grid
        on [0, 1] stored as a column, and ``ytrue`` is ``f_true`` evaluated
        on that grid (layout as returned by ``f_true`` -- presumably
        outputs-by-points; confirm against its definition).
    """
    if rng is None:
        rng = np.random.default_rng(BASE_SEED)

    grid = np.linspace(0.0, 1.0, n_unique, dtype=np.float64)

    # Snap each requested hotspot to the index of its nearest grid point.
    rep_range_at: dict[int, tuple[int, int]] = {}
    for target, lower, upper in hotspots:
        nearest = int(np.argmin(np.abs(grid - target)))
        rep_range_at[nearest] = (lower, upper)

    def draw_noise() -> np.ndarray:
        # One scalar draw per output, taken in output order.
        samples = [rng.normal(0, sigma) for sigma in noise_std]
        return np.asarray(samples, dtype=np.float64)

    inputs, outputs = [], []
    for index, location in enumerate(grid):
        bounds = rep_range_at.get(index)
        if bounds is None:
            n_reps = int(rng.choice(base_rep_choices))
        else:
            lower, upper = bounds
            # ``integers`` excludes its upper end, hence the +1.
            n_reps = int(rng.integers(lower, upper + 1))

        noiseless = f_true(np.array([location]))[:, 0]
        for _ in range(n_reps):
            inputs.append([location])
            outputs.append(noiseless + draw_noise())

    test_grid = np.linspace(0.0, 1.0, 400, dtype=np.float64)
    xtrain = np.asarray(inputs, dtype=np.float64)
    ytrain = np.asarray(outputs, dtype=np.float64).T
    xtest = test_grid[:, None]
    ytrue = f_true(xtest[:, 0])
    return xtrain, ytrain, xtest, ytrue