DGP Families¶
synth-bench ships eight data-generating process families. This notebook shows each family's API, the effect of the complexity parameter on dataset difficulty, and how to read the Bayes error and effective-rank metadata.
In [1]:
Copied!
import matplotlib.pyplot as plt
import pandas as pd
from synthbench import (
AdditiveDGP,
BenchPipeline,
FriedmanDGP,
GeometricDGP,
LinearDGP,
PolynomialDGP,
SparseDGP,
TreeDGP,
)
plt.rcParams["figure.dpi"] = 72
import matplotlib.pyplot as plt
import pandas as pd
from synthbench import (
AdditiveDGP,
BenchPipeline,
FriedmanDGP,
GeometricDGP,
LinearDGP,
PolynomialDGP,
SparseDGP,
TreeDGP,
)
plt.rcParams["figure.dpi"] = 72
LinearDGP — regression and metadata¶
In [2]:
Copied!
# Generate one medium-complexity LinearDGP regression dataset and inspect
# the result: array shapes, Bayes error (None for regression -> "N/A"),
# effective rank, and the first three signal feature importances.
dgp = LinearDGP(task_type="regression", complexity="medium")
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=42)
print(f"X shape: {result.X.shape}, y shape: {result.y.shape}")

meta = result.metadata
bayes_err = meta["bayes_error"]
eff_rank = meta["effective_rank"]
if bayes_err is not None:
    print(f"Bayes error: {bayes_err:.4f}")
else:
    print("Bayes error: N/A (regression)")
print(f"Effective rank: {eff_rank:.2f}")

importances = meta["signal_feature_importances"]
top3 = list(importances.values())[:3]
print(f"Top feature importances: {top3}")
# Draw a LinearDGP regression dataset through BenchPipeline and report the
# dataset shapes plus three metadata fields: bayes_error (None for this
# regression draw -- hence the printed "N/A" in the output below),
# effective_rank, and the first three signal feature importances.
dgp = LinearDGP(task_type="regression", complexity="medium")
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=42)
print(f"X shape: {result.X.shape}, y shape: {result.y.shape}")
be = result.metadata["bayes_error"]
er = result.metadata["effective_rank"]
print(
f"Bayes error: {be:.4f}"
if be is not None
else "Bayes error: N/A (regression)"
)
print(f"Effective rank: {er:.2f}")
top3 = list(result.metadata["signal_feature_importances"].values())[:3]
print(f"Top feature importances: {top3}")
X shape: (300, 10), y shape: (300,) Bayes error: N/A (regression) Effective rank: 9.96 Top feature importances: [0.06884210339150995, 0.5961952215457994, 0.005002074169428147]
Complexity sweep — LinearDGP¶
In [3]:
Copied!
# Sweep the complexity knob for LinearDGP and plot y against the first
# feature in one panel per level; each title carries the Bayes error
# ("N/A" for regression draws, which do not compute it).
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
# strict=True: there are exactly 3 axes and 3 complexity levels, so this is
# behavior-identical today, but it fails loudly (instead of silently
# truncating) if either sequence ever changes length.
for ax, complexity in zip(axes, ["low", "medium", "high"], strict=True):
    dgp = LinearDGP(task_type="regression", complexity=complexity)
    result = BenchPipeline(dgp).run(n_samples=300, n_features=5, random_state=0)
    ax.scatter(result.X[:, 0], result.y, alpha=0.4, s=10)
    be = result.metadata["bayes_error"]
    be_str = f"{be:.3f}" if be is not None else "N/A"
    ax.set_title(f"LinearDGP - {complexity}, Bayes err={be_str}")
    ax.set_xlabel("X[:,0]")
    ax.set_ylabel("y")
plt.tight_layout()
plt.show()
# Sweep the complexity knob (low/medium/high) for LinearDGP, plotting y
# against X[:, 0] in one panel per level; each title shows the Bayes error
# ("N/A" when the metadata value is None, as for these regression draws).
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, complexity in zip(axes, ["low", "medium", "high"], strict=False):
dgp = LinearDGP(task_type="regression", complexity=complexity)
result = BenchPipeline(dgp).run(n_samples=300, n_features=5, random_state=0)
ax.scatter(result.X[:, 0], result.y, alpha=0.4, s=10)
be = result.metadata["bayes_error"]
be_str = f"{be:.3f}" if be is not None else "N/A"
ax.set_title(f"LinearDGP - {complexity}, Bayes err={be_str}")
ax.set_xlabel("X[:,0]")
ax.set_ylabel("y")
plt.tight_layout()
plt.show()
Other DGP families¶
In [4]:
Copied!
# Compare the remaining DGP families under identical sampling settings
# (n_samples=300, n_features=10, random_state=0) and tabulate the headline
# metadata for each.
families = [
    ("PolynomialDGP", PolynomialDGP(task_type="regression", complexity="medium")),
    ("TreeDGP", TreeDGP(task_type="classification", complexity="medium")),
    ("FriedmanDGP", FriedmanDGP(task_type="regression")),
    ("AdditiveDGP", AdditiveDGP(task_type="regression", complexity="medium")),
    ("SparseDGP", SparseDGP(task_type="regression", complexity="medium")),
    ("GeometricDGP", GeometricDGP(task_type="classification", complexity="medium")),
]


def summarize(name, dgp):
    """Run one benchmark draw and return its headline metadata as a record."""
    res = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
    bayes = res.metadata["bayes_error"]
    return {
        "DGP": name,
        "bayes_error": round(bayes, 4) if bayes is not None else None,
        "effective_rank": round(res.metadata["effective_rank"], 2),
    }


# `rows` is also consumed by the summary-plot cell later in the notebook.
rows = [summarize(name, dgp) for name, dgp in families]
pd.DataFrame(rows).set_index("DGP")
# Build one benchmark draw per remaining DGP family (same n_samples,
# n_features, random_state for comparability) and collect headline metadata.
families = [
("PolynomialDGP", PolynomialDGP(task_type="regression", complexity="medium")),
("TreeDGP", TreeDGP(task_type="classification", complexity="medium")),
("FriedmanDGP", FriedmanDGP(task_type="regression")),
("AdditiveDGP", AdditiveDGP(task_type="regression", complexity="medium")),
("SparseDGP", SparseDGP(task_type="regression", complexity="medium")),
("GeometricDGP", GeometricDGP(task_type="classification", complexity="medium")),
]
rows = []
for name, dgp in families:
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
# None for the regression families -- rendered as NaN in the table below.
be = result.metadata["bayes_error"]
rows.append(
{
"DGP": name,
"bayes_error": round(be, 4) if be is not None else None,
"effective_rank": round(result.metadata["effective_rank"], 2),
}
)
# NOTE: `rows` is reused by the summary-plot cell at the end of the notebook.
pd.DataFrame(rows).set_index("DGP")
Out[4]:
| bayes_error | effective_rank | |
|---|---|---|
| DGP | ||
| PolynomialDGP | NaN | 9.97 |
| TreeDGP | 0.4467 | 9.97 |
| FriedmanDGP | NaN | 8.66 |
| AdditiveDGP | NaN | 9.97 |
| SparseDGP | NaN | 9.97 |
| GeometricDGP | 0.3367 | 9.83 |
FriedmanDGP — classification with class_weight¶
In [5]:
Copied!
# Vary class_weight on a FriedmanDGP classification task and tabulate how
# the Bayes error and the realized positive-class fraction respond.
cw_rows = []
for weight in (0.3, 0.5, 0.7):
    run = BenchPipeline(
        FriedmanDGP(task_type="classification", class_weight=weight)
    ).run(n_samples=300, n_features=10, random_state=0)
    cw_rows.append(
        {
            "class_weight": weight,
            "bayes_error": round(run.metadata["bayes_error"], 4),
            "positive_fraction": round(float(run.y.mean()), 3),
        }
    )
pd.DataFrame(cw_rows)
# Sweep FriedmanDGP's class_weight over three values and record the Bayes
# error plus the realized positive-class fraction (mean of the 0/1 labels)
# for each; results are shown in the Out[5] table.
cw_rows = []
for cw in [0.3, 0.5, 0.7]:
dgp = FriedmanDGP(task_type="classification", class_weight=cw)
result = BenchPipeline(dgp).run(n_samples=300, n_features=10, random_state=0)
cw_rows.append(
{
"class_weight": cw,
"bayes_error": round(result.metadata["bayes_error"], 4),
"positive_fraction": round(float(result.y.mean()), 3),
}
)
pd.DataFrame(cw_rows)
Out[5]:
| class_weight | bayes_error | positive_fraction | |
|---|---|---|---|
| 0 | 0.3 | 0.4367 | 0.340 |
| 1 | 0.5 | 0.4933 | 0.497 |
| 2 | 0.7 | 0.4133 | 0.677 |
RandomNeuralDGP (requires synthbench[neural])¶
In [6]:
Copied!
# RandomNeuralDGP sits behind the optional synthbench[neural] extra; run the
# demo when it imports, otherwise print a notice instead of failing. The
# whole demo stays inside the try so an ImportError raised anywhere in it
# (e.g. a lazy torch import) is handled the same way.
try:
    from synthbench.dgps.neural import RandomNeuralDGP

    neural_dgp = RandomNeuralDGP(task_type="regression", n_hidden_layers=1)
    run = BenchPipeline(neural_dgp).run(n_samples=200, n_features=5, random_state=0)
    bayes = run.metadata["bayes_error"]
    rank = run.metadata["effective_rank"]
    if bayes is None:
        label = "N/A (regression)"
    else:
        label = f"{bayes:.4f}"
    print(f"RandomNeuralDGP — bayes_error: {label}, effective_rank: {rank:.2f}")
except ImportError:
    print("RandomNeuralDGP requires synthbench[neural] (torch). Skipping.")
# RandomNeuralDGP is an optional extra; the import raises ImportError when
# synthbench[neural] (torch) is not installed, in which case we just print
# a notice instead of failing the notebook run.
try:
from synthbench.dgps.neural import RandomNeuralDGP
dgp = RandomNeuralDGP(task_type="regression", n_hidden_layers=1)
result = BenchPipeline(dgp).run(n_samples=200, n_features=5, random_state=0)
# None for regression draws -> printed as "N/A (regression)" below.
be = result.metadata["bayes_error"]
er = result.metadata["effective_rank"]
be_str = f"{be:.4f}" if be is not None else "N/A (regression)"
print(f"RandomNeuralDGP — bayes_error: {be_str}, effective_rank: {er:.2f}")
except ImportError:
print("RandomNeuralDGP requires synthbench[neural] (torch). Skipping.")
RandomNeuralDGP — bayes_error: N/A (regression), effective_rank: 4.99
Summary: Bayes error by DGP family¶
In [7]:
Copied!
# Re-use the `rows` records (a list of dicts, not a DataFrame) built in the
# "Other DGP families" cell above -- they are still in scope here.
df = pd.DataFrame(rows)
# Drop rows where bayes_error is None (regression DGPs do not compute it;
# pandas stores those None entries as NaN, so dropna removes them)
df_plot = df.dropna(subset=["bayes_error"]).sort_values("bayes_error")
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(df_plot["DGP"], df_plot["bayes_error"], color="steelblue")
ax.set_xlabel("Bayes error (lower = easier)")
ax.set_title("Bayes error by DGP family (classification DGPs only, n_features=10)")
plt.tight_layout()
plt.show()
# Re-use the `rows` records (a list of dicts, not a DataFrame) built in the
# "Other DGP families" cell above -- they are still in scope here.
df = pd.DataFrame(rows)
# Drop rows where bayes_error is None (regression DGPs do not compute it;
# pandas stores those None entries as NaN, so dropna removes them)
df_plot = df.dropna(subset=["bayes_error"]).sort_values("bayes_error")
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(df_plot["DGP"], df_plot["bayes_error"], color="steelblue")
ax.set_xlabel("Bayes error (lower = easier)")
ax.set_title("Bayes error by DGP family (classification DGPs only, n_features=10)")
plt.tight_layout()
plt.show()