End-to-End Workflow¶
A complete research workflow using synth-bench: generate a synthetic classification dataset, corrupt it, sweep over severity levels, serialize the best result, reload it from disk, and benchmark two sklearn classifiers.
In [1]:
Copied!
import pathlib
import tempfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from synthbench import (
BenchPipeline,
BenchResult,
FriedmanDGP,
LabelNoiseCorruptor,
MissingDataCorruptor,
severity_sweep,
)
plt.rcParams["figure.dpi"] = 72
import pathlib
import tempfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from synthbench import (
BenchPipeline,
BenchResult,
FriedmanDGP,
LabelNoiseCorruptor,
MissingDataCorruptor,
severity_sweep,
)
plt.rcParams["figure.dpi"] = 72
Step 1: Generate a clean classification dataset¶
In [2]:
Copied!
# Draw a clean classification dataset from the Friedman DGP and report
# basic diagnostics: shape, class balance, Bayes error, effective rank.
generator = FriedmanDGP(task_type="classification", complexity="medium")
clean_result = BenchPipeline(generator).run(n_samples=400, n_features=10, random_state=42)
print(f"Clean dataset: X={clean_result.X.shape}, class balance={clean_result.y.mean():.2f}")
bayes_err = clean_result.metadata["bayes_error"]
if bayes_err is not None:
    print(f"Bayes error (lower bound): {bayes_err:.4f}")
else:
    print("Bayes error: N/A")
print(f"Effective rank: {clean_result.metadata['effective_rank']:.2f}")
# Draw a clean classification dataset from the Friedman DGP and report
# basic diagnostics: shape, class balance, Bayes error, effective rank.
generator = FriedmanDGP(task_type="classification", complexity="medium")
clean_result = BenchPipeline(generator).run(n_samples=400, n_features=10, random_state=42)
print(f"Clean dataset: X={clean_result.X.shape}, class balance={clean_result.y.mean():.2f}")
bayes_err = clean_result.metadata["bayes_error"]
if bayes_err is not None:
    print(f"Bayes error (lower bound): {bayes_err:.4f}")
else:
    print("Bayes error: N/A")
print(f"Effective rank: {clean_result.metadata['effective_rank']:.2f}")
Clean dataset: X=(400, 10), class balance=0.47 Bayes error (lower bound): 0.4600 Effective rank: 8.62
Step 2: Apply a corruption pipeline¶
In [3]:
Copied!
# Build a pipeline around the same DGP, adding one feature corruptor and
# one label corruptor.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
# Feature corruptor: MCAR (missing-completely-at-random) masking at
# medium severity; label corruptor: label noise at rate 0.05.
pipeline = BenchPipeline(
dgp,
corruptors=[MissingDataCorruptor(severity="medium", mechanism="mcar")],
label_corruptors=[LabelNoiseCorruptor(noise_rate=0.05)],
)
corrupted = pipeline.run(n_samples=400, n_features=10, random_state=42)
# Fraction of feature entries the corruptor turned into NaN.
print(f"Missing fraction: {np.isnan(corrupted.X).mean():.3f}")
# Bayes error may be None here — the output below shows it is not
# computed for corrupted pipelines.
be = corrupted.metadata["bayes_error"]
print(
f"Bayes error after corruption: {be:.4f}"
if be is not None
else "Bayes error after corruption: N/A (not computed for corrupted pipelines)"
)
# Build a pipeline around the same DGP, adding one feature corruptor and
# one label corruptor.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
# Feature corruptor: MCAR (missing-completely-at-random) masking at
# medium severity; label corruptor: label noise at rate 0.05.
pipeline = BenchPipeline(
dgp,
corruptors=[MissingDataCorruptor(severity="medium", mechanism="mcar")],
label_corruptors=[LabelNoiseCorruptor(noise_rate=0.05)],
)
corrupted = pipeline.run(n_samples=400, n_features=10, random_state=42)
# Fraction of feature entries the corruptor turned into NaN.
print(f"Missing fraction: {np.isnan(corrupted.X).mean():.3f}")
# Bayes error may be None here — the output below shows it is not
# computed for corrupted pipelines.
be = corrupted.metadata["bayes_error"]
print(
f"Bayes error after corruption: {be:.4f}"
if be is not None
else "Bayes error after corruption: N/A (not computed for corrupted pipelines)"
)
Missing fraction: 0.150 Bayes error after corruption: N/A (not computed for corrupted pipelines)
Step 3: Sweep over severity levels¶
In [4]:
Copied!
# Run the MissingDataCorruptor at three severity levels against the same
# DGP; severity_sweep returns one BenchResult per severity, in order.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
sweep_results = severity_sweep(
dgp,
MissingDataCorruptor,
severities=["low", "medium", "high"],
n_samples=300,
n_features=10,
random_state=42,
)
# Report the realized missing fraction and Bayes error for each severity.
# NOTE(review): the severity labels are repeated here by hand — they must
# stay in sync with the `severities` list above.
for sev, r in zip(["low", "medium", "high"], sweep_results, strict=False):
be = r.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Severity={sev}: missing={np.isnan(r.X).mean():.3f}, bayes_error={be_str}")
# Run the MissingDataCorruptor at three severity levels against the same
# DGP; severity_sweep returns one BenchResult per severity, in order.
dgp = FriedmanDGP(task_type="classification", complexity="medium")
sweep_results = severity_sweep(
dgp,
MissingDataCorruptor,
severities=["low", "medium", "high"],
n_samples=300,
n_features=10,
random_state=42,
)
# Report the realized missing fraction and Bayes error for each severity.
# NOTE(review): the severity labels are repeated here by hand — they must
# stay in sync with the `severities` list above.
for sev, r in zip(["low", "medium", "high"], sweep_results, strict=False):
be = r.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Severity={sev}: missing={np.isnan(r.X).mean():.3f}, bayes_error={be_str}")
Severity=low: missing=0.050, bayes_error=N/A Severity=medium: missing=0.150, bayes_error=N/A Severity=high: missing=0.300, bayes_error=N/A
Step 4: Serialize a result to Parquet and reload¶
In [5]:
Copied!
# Use a temporary directory so this notebook is self-contained
with tempfile.TemporaryDirectory() as tmp:
parquet_path = pathlib.Path(tmp) / "medium_severity.parquet"
# Save the medium-severity result (index 1 of the Step 3 sweep)
sweep_results[1].to_parquet(parquet_path)
print(f"Saved to: {parquet_path.name} ({parquet_path.stat().st_size:,} bytes)")
# Reload from disk
loaded = BenchResult.from_parquet(parquet_path)
print(f"Reloaded: X={loaded.X.shape}")
# Metadata should survive serialization alongside the arrays.
be = loaded.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Metadata preserved: bayes_error={be_str}")
# Round-trip check: equal_nan=True so the corruptor's NaNs compare equal.
assert np.allclose(sweep_results[1].X, loaded.X, equal_nan=True), (
"Round-trip mismatch!"
)
print("Round-trip check: PASS")
# Use a temporary directory so this notebook is self-contained
with tempfile.TemporaryDirectory() as tmp:
parquet_path = pathlib.Path(tmp) / "medium_severity.parquet"
# Save the medium-severity result (index 1 of the Step 3 sweep)
sweep_results[1].to_parquet(parquet_path)
print(f"Saved to: {parquet_path.name} ({parquet_path.stat().st_size:,} bytes)")
# Reload from disk
loaded = BenchResult.from_parquet(parquet_path)
print(f"Reloaded: X={loaded.X.shape}")
# Metadata should survive serialization alongside the arrays.
be = loaded.metadata["bayes_error"]
be_str = f"{be:.4f}" if be is not None else "N/A"
print(f"Metadata preserved: bayes_error={be_str}")
# Round-trip check: equal_nan=True so the corruptor's NaNs compare equal.
assert np.allclose(sweep_results[1].X, loaded.X, equal_nan=True), (
"Round-trip mismatch!"
)
print("Round-trip check: PASS")
Saved to: medium_severity.parquet (29,968 bytes) Reloaded: X=(300, 10) Metadata preserved: bayes_error=N/A Round-trip check: PASS
Step 5: CSV round-trip¶
In [6]:
Copied!
# Same round-trip as Step 4, but through CSV instead of Parquet.
with tempfile.TemporaryDirectory() as tmpdir:
    csv_path = pathlib.Path(tmpdir) / "medium_severity.csv"
    sweep_results[1].to_csv(csv_path)
    loaded_csv = BenchResult.from_csv(csv_path)
    bayes_err = loaded_csv.metadata["bayes_error"]
    if bayes_err is not None:
        bayes_str = f"{bayes_err:.4f}"
    else:
        bayes_str = "N/A"
    print(f"CSV round-trip: X={loaded_csv.X.shape}, bayes_error={bayes_str}")
# Same round-trip as Step 4, but through CSV instead of Parquet.
with tempfile.TemporaryDirectory() as tmpdir:
    csv_path = pathlib.Path(tmpdir) / "medium_severity.csv"
    sweep_results[1].to_csv(csv_path)
    loaded_csv = BenchResult.from_csv(csv_path)
    bayes_err = loaded_csv.metadata["bayes_error"]
    if bayes_err is not None:
        bayes_str = f"{bayes_err:.4f}"
    else:
        bayes_str = "N/A"
    print(f"CSV round-trip: X={loaded_csv.X.shape}, bayes_error={bayes_str}")
CSV round-trip: X=(300, 10), bayes_error=N/A
Step 6: sklearn benchmark across severity levels¶
cross_val_score measures classifier accuracy at each corruption severity.
The Bayes error (when available) gives a theoretical lower bound on achievable error.
In [7]:
Copied!
# Benchmark two sklearn classifiers at each corruption severity.
# Build a list of record dicts first, then construct the DataFrame once.
records = []
for severity, result in zip(["low", "medium", "high"], sweep_results, strict=False):
    # Mean-impute the NaNs introduced by the missing-data corruptor so the
    # classifiers can consume the features.
    X_imputed = SimpleImputer(strategy="mean").fit_transform(result.X)
    labels = result.y
    for model_name, model in [
        ("LogisticRegression", LogisticRegression(max_iter=200)),
        ("RandomForest", RandomForestClassifier(n_estimators=50, random_state=0)),
    ]:
        cv_scores = cross_val_score(model, X_imputed, labels, cv=3, scoring="accuracy")
        bayes_err = result.metadata["bayes_error"]
        records.append(
            {
                "severity": severity,
                "classifier": model_name,
                "accuracy_mean": round(cv_scores.mean(), 3),
                "bayes_error": round(bayes_err, 4) if bayes_err is not None else None,
            }
        )
df = pd.DataFrame(records)
df
# Benchmark two sklearn classifiers at each corruption severity.
# Build a list of record dicts first, then construct the DataFrame once.
records = []
for severity, result in zip(["low", "medium", "high"], sweep_results, strict=False):
    # Mean-impute the NaNs introduced by the missing-data corruptor so the
    # classifiers can consume the features.
    X_imputed = SimpleImputer(strategy="mean").fit_transform(result.X)
    labels = result.y
    for model_name, model in [
        ("LogisticRegression", LogisticRegression(max_iter=200)),
        ("RandomForest", RandomForestClassifier(n_estimators=50, random_state=0)),
    ]:
        cv_scores = cross_val_score(model, X_imputed, labels, cv=3, scoring="accuracy")
        bayes_err = result.metadata["bayes_error"]
        records.append(
            {
                "severity": severity,
                "classifier": model_name,
                "accuracy_mean": round(cv_scores.mean(), 3),
                "bayes_error": round(bayes_err, 4) if bayes_err is not None else None,
            }
        )
df = pd.DataFrame(records)
df
Out[7]:
| severity | classifier | accuracy_mean | bayes_error | |
|---|---|---|---|---|
| 0 | low | LogisticRegression | 0.603 | None |
| 1 | low | RandomForest | 0.587 | None |
| 2 | medium | LogisticRegression | 0.597 | None |
| 3 | medium | RandomForest | 0.550 | None |
| 4 | high | LogisticRegression | 0.630 | None |
| 5 | high | RandomForest | 0.590 | None |
Summary: interpreting the results¶
The Bayes error from result.metadata["bayes_error"] is a theoretical lower bound on
classifier error for this dataset — no model can achieve error below this value.
As corruption severity increases, the effective difficulty of the task rises and
classifier accuracy tends to fall — though with only 300 samples and 3-fold CV, as in
the table above, cross-validation noise can mask or even reverse that trend.