DGP Families¶
synth-bench ships eight data-generating process families. This notebook shows each family's API, the effect of the complexity parameter on dataset difficulty, and how to read the Bayes error and effective-rank metadata.
In [1]:
Copied!
import matplotlib.pyplot as plt
import pandas as pd
from synthbench import (
AdditiveDGP,
BenchPipeline,
FriedmanDGP,
GeometricDGP,
LinearDGP,
PolynomialDGP,
SparseDGP,
TreeDGP,
)
plt.rcParams["figure.dpi"] = 72
import matplotlib.pyplot as plt
import pandas as pd
from synthbench import (
AdditiveDGP,
BenchPipeline,
FriedmanDGP,
GeometricDGP,
LinearDGP,
PolynomialDGP,
SparseDGP,
TreeDGP,
)
plt.rcParams["figure.dpi"] = 72
LinearDGP — regression and metadata¶
In [2]:
Copied!
# Generate one medium-complexity LinearDGP regression dataset and inspect
# the result: array shapes, Bayes error (None for regression -> "N/A"),
# effective rank, and the first three signal feature importances.
dgp = LinearDGP(task_type="regression", complexity="medium")
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=42)
print(f"X shape: {result.X.shape}, y shape: {result.y.shape}")

meta = result.metadata
bayes_err = meta["bayes_error"]
eff_rank = meta["effective_rank"]
if bayes_err is not None:
    print(f"Bayes error: {bayes_err:.4f}")
else:
    print("Bayes error: N/A (regression)")
print(f"Effective rank: {eff_rank:.2f}")

importances = meta["signal_feature_importances"]
top3 = list(importances.values())[:3]
print(f"Top feature importances: {top3}")
# Draw a LinearDGP regression dataset through BenchPipeline and report the
# dataset shapes plus three metadata fields: bayes_error (None for this
# regression draw -- hence the printed "N/A" in the output below),
# effective_rank, and the first three signal feature importances.
dgp = LinearDGP(task_type="regression", complexity="medium")
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=42)
print(f"X shape: {result.X.shape}, y shape: {result.y.shape}")
be = result.metadata["bayes_error"]
er = result.metadata["effective_rank"]
print(
f"Bayes error: {be:.4f}"
if be is not None
else "Bayes error: N/A (regression)"
)
print(f"Effective rank: {er:.2f}")
top3 = list(result.metadata["signal_feature_importances"].values())[:3]
print(f"Top feature importances: {top3}")
X shape: (300, 10), y shape: (300,) Bayes error: N/A (regression) Effective rank: 9.96 Top feature importances: [0.06884210339150995, 0.5961952215457994, 0.005002074169428147]
Complexity sweep — LinearDGP¶
In [3]:
Copied!
# Sweep the complexity knob for LinearDGP and plot y against the first
# feature in one panel per level; each title carries the Bayes error
# ("N/A" for regression draws, which do not compute it).
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
# strict=True: there are exactly 3 axes and 3 complexity levels, so this is
# behavior-identical today, but it fails loudly (instead of silently
# truncating) if either sequence ever changes length.
for ax, complexity in zip(axes, ["low", "medium", "high"], strict=True):
    dgp = LinearDGP(task_type="regression", complexity=complexity)
    result = BenchPipeline(dgp).run(n_samples=300, n_features=5, random_state=0)
    ax.scatter(result.X[:, 0], result.y, alpha=0.4, s=10)
    be = result.metadata["bayes_error"]
    be_str = f"{be:.3f}" if be is not None else "N/A"
    ax.set_title(f"LinearDGP - {complexity}, Bayes err={be_str}")
    ax.set_xlabel("X[:,0]")
    ax.set_ylabel("y")
plt.tight_layout()
plt.show()
# Sweep the complexity knob (low/medium/high) for LinearDGP, plotting y
# against X[:, 0] in one panel per level; each title shows the Bayes error
# ("N/A" when the metadata value is None, as for these regression draws).
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, complexity in zip(axes, ["low", "medium", "high"], strict=False):
dgp = LinearDGP(task_type="regression", complexity=complexity)
result = BenchPipeline(dgp).run(n_samples=300, n_features=5, random_state=0)
ax.scatter(result.X[:, 0], result.y, alpha=0.4, s=10)
be = result.metadata["bayes_error"]
be_str = f"{be:.3f}" if be is not None else "N/A"
ax.set_title(f"LinearDGP - {complexity}, Bayes err={be_str}")
ax.set_xlabel("X[:,0]")
ax.set_ylabel("y")
plt.tight_layout()
plt.show()
Other DGP families¶
In [4]:
Copied!
# Compare the remaining DGP families under identical sampling settings
# (n_samples=300, n_features=10, random_state=0) and tabulate the headline
# metadata for each.
families = [
    ("PolynomialDGP", PolynomialDGP(task_type="regression", complexity="medium")),
    ("TreeDGP", TreeDGP(task_type="classification", complexity="medium")),
    ("FriedmanDGP", FriedmanDGP(task_type="regression")),
    ("AdditiveDGP", AdditiveDGP(task_type="regression", complexity="medium")),
    ("SparseDGP", SparseDGP(task_type="regression", complexity="medium")),
    ("GeometricDGP", GeometricDGP(task_type="classification", complexity="medium")),
]


def summarize(name, dgp):
    """Run one benchmark draw and return its headline metadata as a record."""
    res = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
    bayes = res.metadata["bayes_error"]
    return {
        "DGP": name,
        "bayes_error": round(bayes, 4) if bayes is not None else None,
        "effective_rank": round(res.metadata["effective_rank"], 2),
    }


# `rows` is also consumed by the summary-plot cell later in the notebook.
rows = [summarize(name, dgp) for name, dgp in families]
pd.DataFrame(rows).set_index("DGP")
# Build one benchmark draw per remaining DGP family (same n_samples,
# n_features, random_state for comparability) and collect headline metadata.
families = [
("PolynomialDGP", PolynomialDGP(task_type="regression", complexity="medium")),
("TreeDGP", TreeDGP(task_type="classification", complexity="medium")),
("FriedmanDGP", FriedmanDGP(task_type="regression")),
("AdditiveDGP", AdditiveDGP(task_type="regression", complexity="medium")),
("SparseDGP", SparseDGP(task_type="regression", complexity="medium")),
("GeometricDGP", GeometricDGP(task_type="classification", complexity="medium")),
]
rows = []
for name, dgp in families:
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
# None for the regression families -- rendered as NaN in the table below.
be = result.metadata["bayes_error"]
rows.append(
{
"DGP": name,
"bayes_error": round(be, 4) if be is not None else None,
"effective_rank": round(result.metadata["effective_rank"], 2),
}
)
# NOTE: `rows` is reused by the summary-plot cell at the end of the notebook.
pd.DataFrame(rows).set_index("DGP")
Out[4]:
| bayes_error | effective_rank | |
|---|---|---|
| DGP | ||
| PolynomialDGP | NaN | 9.97 |
| TreeDGP | 0.4467 | 9.97 |
| FriedmanDGP | NaN | 8.66 |
| AdditiveDGP | NaN | 9.97 |
| SparseDGP | NaN | 9.97 |
| GeometricDGP | 0.3367 | 9.83 |
FriedmanDGP — classification with class_weight¶
In [5]:
Copied!
# Vary class_weight on a FriedmanDGP classification task and tabulate how
# the Bayes error and the realized positive-class fraction respond.
cw_rows = []
for weight in (0.3, 0.5, 0.7):
    run = BenchPipeline(
        FriedmanDGP(task_type="classification", class_weight=weight)
    ).run(n_samples=300, n_features=10, random_state=0)
    cw_rows.append(
        {
            "class_weight": weight,
            "bayes_error": round(run.metadata["bayes_error"], 4),
            "positive_fraction": round(float(run.y.mean()), 3),
        }
    )
pd.DataFrame(cw_rows)
# Sweep FriedmanDGP's class_weight over three values and record the Bayes
# error plus the realized positive-class fraction (mean of the 0/1 labels)
# for each; results are shown in the Out[5] table.
cw_rows = []
for cw in [0.3, 0.5, 0.7]:
dgp = FriedmanDGP(task_type="classification", class_weight=cw)
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
cw_rows.append(
{
"class_weight": cw,
"bayes_error": round(result.metadata["bayes_error"], 4),
"positive_fraction": round(float(result.y.mean()), 3),
}
)
pd.DataFrame(cw_rows)
Out[5]:
| class_weight | bayes_error | positive_fraction | |
|---|---|---|---|
| 0 | 0.3 | 0.4367 | 0.340 |
| 1 | 0.5 | 0.4933 | 0.497 |
| 2 | 0.7 | 0.4133 | 0.677 |
RandomNeuralDGP (requires synthbench[neural])¶
In [6]:
Copied!
# RandomNeuralDGP sits behind the optional synthbench[neural] extra; run the
# demo when it imports, otherwise print a notice instead of failing. The
# whole demo stays inside the try so an ImportError raised anywhere in it
# (e.g. a lazy torch import) is handled the same way.
try:
    from synthbench.dgps.neural import RandomNeuralDGP

    neural_dgp = RandomNeuralDGP(task_type="regression", n_hidden_layers=1)
    run = BenchPipeline(neural_dgp).run(n_samples=200, n_features=5, random_state=0)
    bayes = run.metadata["bayes_error"]
    rank = run.metadata["effective_rank"]
    if bayes is None:
        label = "N/A (regression)"
    else:
        label = f"{bayes:.4f}"
    print(f"RandomNeuralDGP — bayes_error: {label}, effective_rank: {rank:.2f}")
except ImportError:
    print("RandomNeuralDGP requires synthbench[neural] (torch). Skipping.")
# RandomNeuralDGP is an optional extra; the import raises ImportError when
# synthbench[neural] (torch) is not installed, in which case we just print
# a notice instead of failing the notebook run.
try:
from synthbench.dgps.neural import RandomNeuralDGP
dgp = RandomNeuralDGP(task_type="regression", n_hidden_layers=1)
result = BenchPipeline(dgp).run(n_samples=200, n_features=5, random_state=0)
# None for regression draws -> printed as "N/A (regression)" below.
be = result.metadata["bayes_error"]
er = result.metadata["effective_rank"]
be_str = f"{be:.4f}" if be is not None else "N/A (regression)"
print(f"RandomNeuralDGP — bayes_error: {be_str}, effective_rank: {er:.2f}")
except ImportError:
print("RandomNeuralDGP requires synthbench[neural] (torch). Skipping.")
RandomNeuralDGP — bayes_error: N/A (regression), effective_rank: 4.99
Summary: Bayes error by DGP family¶
In [7]:
Copied!
# Re-use the `rows` records (a list of dicts, not a DataFrame) built in the
# "Other DGP families" cell above -- they are still in scope here.
df = pd.DataFrame(rows)
# Drop rows where bayes_error is None (regression DGPs do not compute it;
# pandas stores those None entries as NaN, so dropna removes them)
df_plot = df.dropna(subset=["bayes_error"]).sort_values("bayes_error")
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(df_plot["DGP"], df_plot["bayes_error"], color="steelblue")
ax.set_xlabel("Bayes error (lower = easier)")
ax.set_title("Bayes error by DGP family (classification DGPs only, n_features=10)")
plt.tight_layout()
plt.show()
# Re-use the `rows` records (a list of dicts, not a DataFrame) built in the
# "Other DGP families" cell above -- they are still in scope here.
df = pd.DataFrame(rows)
# Drop rows where bayes_error is None (regression DGPs do not compute it;
# pandas stores those None entries as NaN, so dropna removes them)
df_plot = df.dropna(subset=["bayes_error"]).sort_values("bayes_error")
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(df_plot["DGP"], df_plot["bayes_error"], color="steelblue")
ax.set_xlabel("Bayes error (lower = easier)")
ax.set_title("Bayes error by DGP family (classification DGPs only, n_features=10)")
plt.tight_layout()
plt.show()