Tutorials · PyCaret

This page has one tutorial per task type. Each one is self-contained — you can paste the code into a fresh notebook and follow along. They all use sample datasets shipped with pycaret.datasets, so there's nothing to download.

Classification — predicting customer purchase

from pycaret.classification import ClassificationExperiment
from pycaret.datasets import get_data

# Load the bundled "juice" sample dataset.
data = get_data("juice", verbose=False)

# Configure the experiment first, then fit it on the data.
experiment = ClassificationExperiment(target="Purchase", session_id=42, normalize=True)
exp = experiment.fit(data)

# Compare 12 models. The returned CompareResult carries the leaderboard.
comparison = exp.compare_models(n_select=3)
leaderboard = comparison.leaderboard
print(leaderboard.head())

# Tune the best model from the comparison, optimizing for AUC.
tuned = exp.tune_model(comparison.best, n_iter=20, optimize="AUC")
print("best params:", tuned.best_params)

# Inspect the diagnostics of the tuned pipeline.
predictions = exp.predict_model(tuned.pipeline)
print(predictions.metrics)

# Persist for production: finalize the tuned pipeline, then save it.
final = exp.finalize_model(tuned.pipeline)
exp.save_model(final.pipeline, "juice-classifier")

Regression — Boston housing prices

from pycaret.regression import RegressionExperiment
from pycaret.datasets import get_data
from pycaret.plots.regression import residuals, prediction_error

# Load the bundled "boston" sample dataset; "medv" is the target column.
data = get_data("boston", verbose=False)

exp = RegressionExperiment(
    target="medv",
    session_id=42,
    normalize=True,
    transformation=True,
).fit(data)

# Rank the candidates by RMSE, then tune the best one against the same metric.
best = exp.compare_models(sort="RMSE").best
tuned = exp.tune_model(best, n_iter=20, optimize="RMSE")

# Residual diagnostics — Plotly figure ready for fig.show() / API serving.
residuals(tuned.pipeline, exp.X_test, exp.y_test).show()
prediction_error(tuned.pipeline, exp.X_test, exp.y_test).show()

Clustering — segmenting jewellery customers

from pycaret.clustering import ClusteringExperiment
from pycaret.datasets import get_data
from pycaret.plots.clustering import elbow_curve, silhouette_curve, embedding_2d

# Load the bundled "jewellery" sample dataset.
data = get_data("jewellery", verbose=False)

exp = ClusteringExperiment(session_id=42, normalize=True).fit(data)

# Transformed feature matrix that every plot helper below consumes.
# NOTE(review): _fit_state is a private attribute — prefer a public accessor
# if the experiment exposes one.
X_transformed = exp._fit_state["X_transformed"]

# Pick k via the elbow + silhouette charts. The baseline k-means model is
# created once and shared by both charts instead of being fitted twice.
baseline = exp.create_model("kmeans")
elbow_curve(baseline.pipeline, X_transformed).show()
silhouette_curve(baseline.pipeline, X_transformed).show()

# Final model with the chosen k.
res = exp.create_model("kmeans", num_clusters=4)
labelled = exp.assign_model(res.pipeline)  # original df + Cluster column

embedding_2d(res.pipeline, X_transformed).show()

Anomaly detection — identifying outliers

from pycaret.anomaly import AnomalyExperiment
from pycaret.datasets import get_data
from pycaret.plots.anomaly import score_distribution, anomaly_map

# Load the bundled "anomaly" sample dataset.
data = get_data("anomaly", verbose=False)

# Configure the experiment first, then fit it on the data.
experiment = AnomalyExperiment(session_id=42)
exp = experiment.fit(data)

# "iforest" selects IsolationForest.
res = exp.create_model("iforest")
detector = res.pipeline

# Anomaly + Anomaly_Score columns attached.
labelled = exp.assign_model(detector)
print(labelled.head())

# Both charts take the fitted detector and the transformed feature matrix.
# NOTE(review): _fit_state is a private attribute — prefer a public accessor
# if the experiment exposes one.
features = exp._fit_state["X_transformed"]
score_distribution(detector, features).show()
anomaly_map(detector, features).show()

Time-series — airline passengers

from pycaret.time_series import TimeSeriesExperiment
from pycaret.datasets import get_data
from pycaret.plots.time_series import (
    forecast, decomposition, residual_diagnostics
)

# Load the bundled "airline" sample dataset.
data = get_data("airline", verbose=False)

exp = TimeSeriesExperiment(fh=12, session_id=42).fit(data)

# Compare a fixed shortlist of forecasters, ranked by MASE.
candidates = ["arima", "ets", "theta", "naive"]
top = exp.compare_models(include=candidates, sort="MASE")
print(top.leaderboard)

# Tune the best candidate and keep its pipeline.
tuned = exp.tune_model(top.best, n_iter=10, optimize="MASE")
best = tuned.pipeline

# Predict with intervals requested; the "lower"/"upper" entries are used below.
forecast_result = exp.predict_model(best, return_pred_int=True)
preds = forecast_result.predictions
y_pred = preds["y_pred"]

# Forecast + diagnostics.
forecast(
    y_true=exp.y_test,
    y_pred=y_pred,
    lower=preds["lower"],
    upper=preds["upper"],
    history=exp.y_train,
).show()
decomposition(exp.y_train, period=12).show()
residual_diagnostics(exp.y_test, y_pred).show()

What to do next

Read Modules for an overview of all five task modules and their differences.
Skim Functions / Initialize to understand the Experiment(...).fit() setup step in depth.
Browse API reference for every public symbol.