End-to-End Workflow¶
A complete research workflow using synth-bench: generate a synthetic classification dataset, corrupt it, sweep over severity levels, serialize the best result, reload it from disk, and benchmark two sklearn classifiers.
In [1]:
Copied!
import pathlib
import tempfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from synthbench import (
BenchPipeline,
BenchResult,
FriedmanDGP,
LabelNoiseCorruptor,
MissingDataCorruptor,
severity_sweep,
)
plt.rcParams["figure.dpi"] = 72
import pathlib
import tempfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from synthbench import (
BenchPipeline,
BenchResult,
FriedmanDGP,
LabelNoiseCorruptor,
MissingDataCorruptor,
severity_sweep,
)
plt.rcParams["figure.dpi"] = 72
Step 1: Generate a clean classification dataset¶
In [2]:
Copied!
# Draw a clean classification dataset from the Friedman DGP and report
# basic diagnostics: shape, class balance, Bayes error, effective rank.
generator = FriedmanDGP(task_type="classification", complexity="medium")
clean_result = BenchPipeline(generator).run(n_samples=400, n_features=10, random_state=42)
print(f"Clean dataset: X={clean_result.X.shape}, class balance={clean_result.y.mean():.2f}")
bayes_err = clean_result.metadata["bayes_error"]
if bayes_err is not None:
    print(f"Bayes error (lower bound): {bayes_err:.4f}")
else:
    print("Bayes error: N/A")
print(f"Effective rank: {clean_result.metadata['effective_rank']:.2f}")
# Draw a clean classification dataset from the Friedman DGP and report
# basic diagnostics: shape, class balance, Bayes error, effective rank.
generator = FriedmanDGP(task_type="classification", complexity="medium")
clean_result = BenchPipeline(generator).run(n_samples=400, n_features=10, random_state=42)
print(f"Clean dataset: X={clean_result.X.shape}, class balance={clean_result.y.mean():.2f}")
bayes_err = clean_result.metadata["bayes_error"]
if bayes_err is not None:
    print(f"Bayes error (lower bound): {bayes_err:.4f}")
else:
    print("Bayes error: N/A")
print(f"Effective rank: {clean_result.metadata['effective_rank']:.2f}")
Clean dataset: X=(400, 10), class balance=0.47 Bayes error (lower bound): 0.4600 Effective rank: 8.62
Step 2: Apply a corruption pipeline¶
In [3]:
Copied!
# Build a pipeline around the same DGP, adding one feature corruptor and
# one label corruptor.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
# Feature corruptor: MCAR (missing-completely-at-random) masking at
# medium severity; label corruptor: label noise at rate 0.05.
pipeline = BenchPipeline(
dgp,
corruptors=[MissingDataCorruptor(severity="medium", mechanism="mcar")],
label_corruptors=[LabelNoiseCorruptor(noise_rate=0.05)],
)
corrupted = pipeline.run(n_samples=400, n_features=10, random_state=42)
# Fraction of feature entries the corruptor turned into NaN.
print(f"Missing fraction: {np.isnan(corrupted.X).mean():.3f}")
# Bayes error may be None here — the output below shows it is not
# computed for corrupted pipelines.
be = corrupted.metadata["bayes_error"]
print(
f"Bayes error after corruption: {be:.4f}"
if be is not None
else "Bayes error after corruption: N/A (not computed for corrupted pipelines)"
)
# Build a pipeline around the same DGP, adding one feature corruptor and
# one label corruptor.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
# Feature corruptor: MCAR (missing-completely-at-random) masking at
# medium severity; label corruptor: label noise at rate 0.05.
pipeline = BenchPipeline(
dgp,
corruptors=[MissingDataCorruptor(severity="medium", mechanism="mcar")],
label_corruptors=[LabelNoiseCorruptor(noise_rate=0.05)],
)
corrupted = pipeline.run(n_samples=400, n_features=10, random_state=42)
# Fraction of feature entries the corruptor turned into NaN.
print(f"Missing fraction: {np.isnan(corrupted.X).mean():.3f}")
# Bayes error may be None here — the output below shows it is not
# computed for corrupted pipelines.
be = corrupted.metadata["bayes_error"]
print(
f"Bayes error after corruption: {be:.4f}"
if be is not None
else "Bayes error after corruption: N/A (not computed for corrupted pipelines)"
)
Missing fraction: 0.150 Bayes error after corruption: N/A (not computed for corrupted pipelines)
Step 3: Sweep over severity levels¶
In [4]:
Copied!
# Run the MissingDataCorruptor at three severity levels against the same
# DGP; severity_sweep returns one BenchResult per severity, in order.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
sweep_results = severity_sweep(
dgp,
MissingDataCorruptor,
severities=["low", "medium", "high"],
n_samples=300,
n_features=10,
random_state=42,
)
# Report the realized missing fraction and Bayes error for each severity.
# NOTE(review): the severity labels are repeated here by hand — they must
# stay in sync with the `severities` list above.
for sev, r in zip(["low", "medium", "high"], sweep_results, strict=False):
be = r.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Severity={sev}: missing={np.isnan(r.X).mean():.3f}, bayes_error={be_str}")
# Run the MissingDataCorruptor at three severity levels against the same
# DGP; severity_sweep returns one BenchResult per severity, in order.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
sweep_results = severity_sweep(
dgp,
MissingDataCorruptor,
severities=["low", "medium", "high"],
n_samples=300,
n_features=10,
random_state=42,
)
# Report the realized missing fraction and Bayes error for each severity.
# NOTE(review): the severity labels are repeated here by hand — they must
# stay in sync with the `severities` list above.
for sev, r in zip(["low", "medium", "high"], sweep_results, strict=False):
be = r.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Severity={sev}: missing={np.isnan(r.X).mean():.3f}, bayes_error={be_str}")
Severity=low: missing=0.050, bayes_error=N/A Severity=medium: missing=0.150, bayes_error=N/A Severity=high: missing=0.300, bayes_error=N/A
Step 4: Serialize a result to Parquet and reload¶
In [5]:
Copied!
# Use a temporary directory so this notebook is self-contained
with tempfile.TemporaryDirectory() as tmp:
parquet_path = pathlib.Path(tmp) / "medium_severity.parquet"
# Save the medium-severity result (index 1 of the Step 3 sweep)
sweep_results[1].to_parquet(parquet_path)
print(f"Saved to: {parquet_path.name} ({parquet_path.stat().st_size:,} bytes)")
# Reload from disk
loaded = BenchResult.from_parquet(parquet_path)
print(f"Reloaded: X={loaded.X.shape}")
# Metadata should survive serialization alongside the arrays.
be = loaded.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Metadata preserved: bayes_error={be_str}")
# Round-trip check: equal_nan=True so the corruptor's NaNs compare equal.
assert np.allclose(sweep_results[1].X, loaded.X, equal_nan=True), (
"Round-trip mismatch!"
)
print("Round-trip check: PASS")
# Use a temporary directory so this notebook is self-contained
with tempfile.TemporaryDirectory() as tmp:
parquet_path = pathlib.Path(tmp) / "medium_severity.parquet"
# Save the medium-severity result (index 1 of the Step 3 sweep)
sweep_results[1].to_parquet(parquet_path)
print(f"Saved to: {parquet_path.name} ({parquet_path.stat().st_size:,} bytes)")
# Reload from disk
loaded = BenchResult.from_parquet(parquet_path)
print(f"Reloaded: X={loaded.X.shape}")
# Metadata should survive serialization alongside the arrays.
be = loaded.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Metadata preserved: bayes_error={be_str}")
# Round-trip check: equal_nan=True so the corruptor's NaNs compare equal.
assert np.allclose(sweep_results[1].X, loaded.X, equal_nan=True), (
"Round-trip mismatch!"
)
print("Round-trip check: PASS")
Saved to: medium_severity.parquet (29,968 bytes) Reloaded: X=(300, 10) Metadata preserved: bayes_error=N/A Round-trip check: PASS
Step 5: CSV round-trip¶
In [6]:
Copied!
# Same round-trip as Step 4, but through CSV instead of Parquet.
with tempfile.TemporaryDirectory() as tmpdir:
    csv_path = pathlib.Path(tmpdir) / "medium_severity.csv"
    sweep_results[1].to_csv(csv_path)
    loaded_csv = BenchResult.from_csv(csv_path)
    bayes_err = loaded_csv.metadata["bayes_error"]
    if bayes_err is not None:
        bayes_str = f"{bayes_err:.4f}"
    else:
        bayes_str = "N/A"
    print(f"CSV round-trip: X={loaded_csv.X.shape}, bayes_error={bayes_str}")
# Same round-trip as Step 4, but through CSV instead of Parquet.
with tempfile.TemporaryDirectory() as tmpdir:
    csv_path = pathlib.Path(tmpdir) / "medium_severity.csv"
    sweep_results[1].to_csv(csv_path)
    loaded_csv = BenchResult.from_csv(csv_path)
    bayes_err = loaded_csv.metadata["bayes_error"]
    if bayes_err is not None:
        bayes_str = f"{bayes_err:.4f}"
    else:
        bayes_str = "N/A"
    print(f"CSV round-trip: X={loaded_csv.X.shape}, bayes_error={bayes_str}")
CSV round-trip: X=(300, 10), bayes_error=N/A
Step 6: sklearn benchmark across severity levels¶
cross_val_score measures classifier accuracy at each corruption severity.
The Bayes error (when available) gives a theoretical lower bound on achievable error.
In [7]:
Copied!
# Benchmark two sklearn classifiers at each corruption severity.
# Build a list of record dicts first, then construct the DataFrame once.
records = []
for severity, result in zip(["low", "medium", "high"], sweep_results, strict=False):
    # Mean-impute the NaNs introduced by the missing-data corruptor so the
    # classifiers can consume the features.
    X_imputed = SimpleImputer(strategy="mean").fit_transform(result.X)
    labels = result.y
    for model_name, model in [
        ("LogisticRegression", LogisticRegression(max_iter=200)),
        ("RandomForest", RandomForestClassifier(n_estimators=50, random_state=0)),
    ]:
        cv_scores = cross_val_score(model, X_imputed, labels, cv=3, scoring="accuracy")
        bayes_err = result.metadata["bayes_error"]
        records.append(
            {
                "severity": severity,
                "classifier": model_name,
                "accuracy_mean": round(cv_scores.mean(), 3),
                "bayes_error": round(bayes_err, 4) if bayes_err is not None else None,
            }
        )
df = pd.DataFrame(records)
df
# Benchmark two sklearn classifiers at each corruption severity.
# Build a list of record dicts first, then construct the DataFrame once.
records = []
for severity, result in zip(["low", "medium", "high"], sweep_results, strict=False):
    # Mean-impute the NaNs introduced by the missing-data corruptor so the
    # classifiers can consume the features.
    X_imputed = SimpleImputer(strategy="mean").fit_transform(result.X)
    labels = result.y
    for model_name, model in [
        ("LogisticRegression", LogisticRegression(max_iter=200)),
        ("RandomForest", RandomForestClassifier(n_estimators=50, random_state=0)),
    ]:
        cv_scores = cross_val_score(model, X_imputed, labels, cv=3, scoring="accuracy")
        bayes_err = result.metadata["bayes_error"]
        records.append(
            {
                "severity": severity,
                "classifier": model_name,
                "accuracy_mean": round(cv_scores.mean(), 3),
                "bayes_error": round(bayes_err, 4) if bayes_err is not None else None,
            }
        )
df = pd.DataFrame(records)
df
Out[7]:
| severity | classifier | accuracy_mean | bayes_error | |
|---|---|---|---|---|
| 0 | low | LogisticRegression | 0.603 | None |
| 1 | low | RandomForest | 0.587 | None |
| 2 | medium | LogisticRegression | 0.597 | None |
| 3 | medium | RandomForest | 0.550 | None |
| 4 | high | LogisticRegression | 0.630 | None |
| 5 | high | RandomForest | 0.590 | None |
Summary: interpreting the results¶
The Bayes error from result.metadata["bayes_error"] is a theoretical lower bound on
classifier error for this dataset — no model can achieve error below this value.
As corruption severity increases, the effective difficulty of the task rises and
classifier accuracy tends to fall — though with only 300 samples and 3-fold CV, as in
the table above, cross-validation noise can mask or even reverse that trend.