Quickstart#
Get up and running in a few minutes using the main dataset entry point:
pdmlabs.utils.dataset.Dataset.
This page shows:
Data preparation with one Dataset handler
How the same runner pattern works across flavors
Flavor-specific examples using only APIs that exist in this repository
Step 1: Install#
pip install .
Optional (for experiment tracking UI):
pip install mlflow
Step 2: Data Preparation with utils.dataset.Dataset#
import pandas as pd
from pdmlabs.utils.dataset import Dataset
# Minimal multi-source toy data: two assets with n hourly samples each.
n = 240
# Lowercase "h" is the hourly alias; the uppercase "H" spelling is deprecated
# in recent pandas releases. Both assets share the same timeline, so build it once.
hourly_index = pd.date_range("2024-01-01", periods=n, freq="h").tolist()
df = pd.DataFrame(
    {
        "timestamp": hourly_index + hourly_index,
        "source": ["asset_1"] * n + ["asset_2"] * n,
        "sensor_1": [0.2 + i * 0.001 for i in range(n)] + [0.3 + i * 0.0012 for i in range(n)],
        "sensor_2": [1.0] * n + [0.9] * n,
        # event columns used to form episodes
        # maintenance: flagged on the last sample of each asset
        "maintenance": [0] * (n - 1) + [1] + [0] * (n - 1) + [1],
        # failure: flagged at position n - 5 of asset_1 and n - 10 of asset_2
        "failure": [0] * (n - 5) + [1, 0, 0, 0, 0] + [0] * (n - 10) + [1] + [0] * 9,
    }
)
# Column names in df that carry time, asset identity, and the two event flags.
column_roles = {
    "datetime_column": "timestamp",
    "source_column": "source",
    "maintenance_column": "maintenance",
    "failure_column": "failure",
}
# Values passed as the train / val / test source arguments.
split_fractions = {
    "train_sources": 0.6,
    "val_sources": 0.2,
    "test_sources": 0.2,
}
dataset_handler = Dataset(data=df, **column_roles, **split_fractions)
From the same dataset_handler, you can create all flavor-specific dataset dictionaries:
# Each getter's result is unpacked as a pair: the first element (the
# flavor-specific dataset dictionary) is kept, the second is discarded.
ds_semi, _ = dataset_handler.get_semi_dataset()  # used by the semi-supervised example
ds_unsup, _ = dataset_handler.get_unsupervised_dataset()  # used by the unsupervised example
ds_cls, _ = dataset_handler.get_Classification_dataset()  # used by the classification example
ds_rul, _ = dataset_handler.get_rul_dataset()  # used by the RUL regression example
ds_sa, _ = dataset_handler.get_SA_dataset()  # used by the survival-analysis example
Step 3: One Common run_experiment Pattern#
Important behavior in this framework:
event_preferences are taken from the dataset flavor dictionary (created by Dataset.get_*_dataset())
In run_experiment, pass the method class (for example IsolationForest), not an instantiated object
from pdmlabs.RunExperiment import run_experiment
from pdmlabs.thresholding.constant import ConstantThresholder
def run_one_flavor(
    dataset,
    method_class,
    method_name,
    method_param_space,
    experiment_cls,
    experiment_name,
    *,
    fit_size=100,
    profile_size=2,
    thresholder_cls=ConstantThresholder,
    optimization_param="AD1_AUC",
    maximize=True,
    max_runs=4,
    max_jobs=1,
    initial_random=1,
    mlflow_port=None,
):
    """Run a single method against a single experiment type via ``run_experiment``.

    Each single-valued argument is wrapped in a one-element list where
    ``run_experiment`` takes a list (methods, parameter spaces, names,
    experiments).

    Args:
        dataset: Flavor-specific dataset dictionary from ``Dataset.get_*_dataset()``.
        method_class: The method *class* (for example ``IsolationForest``),
            not an instance.
        method_name: Name reported for the method.
        method_param_space: Parameter space for ``method_class``.
        experiment_cls: Experiment class to run.
        experiment_name: Name reported for the experiment.
        fit_size: Forwarded as ``fit_size``.
        profile_size: Forwarded as ``profile_size``.
        thresholder_cls: Forwarded as ``thresholder``.
        optimization_param: Forwarded as ``optimization_param``.
        maximize: Forwarded as ``maximize``.
        max_runs: Forwarded as ``MAX_RUNS`` (default 4, previously hard-coded).
        max_jobs: Forwarded as ``MAX_JOBS`` (default 1, previously hard-coded).
        initial_random: Forwarded as ``INITIAL_RANDOM`` (default 1, previously
            hard-coded).
        mlflow_port: Forwarded as ``mlflow_port`` (default ``None``, previously
            hard-coded).

    Returns:
        Whatever ``run_experiment`` returns for this one-method,
        one-experiment setup.
    """
    return run_experiment(
        dataset=dataset,
        # Pass class, not object. The pipeline handles event_preferences from dataset.
        methods=[method_class],
        param_space_dict_per_method=[method_param_space],
        method_names=[method_name],
        experiments=[experiment_cls],
        experiment_names=[experiment_name],
        MAX_RUNS=max_runs,
        MAX_JOBS=max_jobs,
        INITIAL_RANDOM=initial_random,
        fit_size=fit_size,
        profile_size=profile_size,
        thresholder=thresholder_cls,
        mlflow_port=mlflow_port,
        optimization_param=optimization_param,
        maximize=maximize,
    )
Step 4: Run Each Flavor#
Semi-supervised anomaly detection (auto-profile)#
from pdmlabs.experiment.batch.auto_profile_semi_supervised_experiment import AutoProfileSemiSupervisedPdMExperiment
from pdmlabs.method.isolation_forest import IsolationForest
from pdmlabs.utils import automatic_parameter_generation
# Parameter space for the "IF" technique, generated by the online-technique helper.
semi_param_space = automatic_parameter_generation.online_technique("IF", 100)
# Semi-supervised flavor: IsolationForest run under the auto-profile experiment.
result_auto_profile = run_one_flavor(
    dataset=ds_semi,
    method_class=IsolationForest,
    method_name="IF",
    method_param_space=semi_param_space,
    experiment_cls=AutoProfileSemiSupervisedPdMExperiment,
    experiment_name="AutoProfile Semi",
    fit_size=100,
    profile_size=2,
)
Unsupervised anomaly detection#
from pdmlabs.experiment.batch.unsupervised_experiment import UnsupervisedPdMExperiment
from pdmlabs.method.isolation_forest_uns import IsolationForestUnsupervised
# Parameter space for the "IF" technique, generated by the unsupervised-technique helper.
unsupervised_param_space = automatic_parameter_generation.unsupervised_technique("IF", 100)
# Unsupervised flavor: IsolationForestUnsupervised run under the unsupervised experiment.
result_unsupervised = run_one_flavor(
    dataset=ds_unsup,
    method_class=IsolationForestUnsupervised,
    method_name="IF",
    method_param_space=unsupervised_param_space,
    experiment_cls=UnsupervisedPdMExperiment,
    experiment_name="Unsupervised",
    fit_size=100,
    profile_size=2,
)
Supervised classification#
from pdmlabs.experiment.batch.supervised_experiment import SupervisedPdMExperiment
from pdmlabs.method.xgboost import XGBoost
# Parameter space for the "XGBOOST" technique, generated by the supervised-technique helper.
classification_param_space = automatic_parameter_generation.supervised_technique("XGBOOST", 100)
# Classification flavor: XGBoost run under the supervised experiment.
result_classification = run_one_flavor(
    dataset=ds_cls,
    method_class=XGBoost,
    method_name="XGBOOST",
    method_param_space=classification_param_space,
    experiment_cls=SupervisedPdMExperiment,
    experiment_name="Classification",
    fit_size=100,
    profile_size=2,
)
Supervised RUL regression#
from pdmlabs.experiment.batch.RUL_experiment import SupervisedRULPdMExperiment
from pdmlabs.method.sklearn_regression_wraper import RandomForestRUL
# Hand-written parameter space passed to the runner for RandomForestRUL.
rul_param_space = {
    "n_estimators": [50, 100],
    "max_depth": [5, None],
    "random_state": [42],
}
# RUL flavor: RandomForestRUL run under the supervised RUL experiment.
result_rul = run_one_flavor(
    dataset=ds_rul,
    method_class=RandomForestRUL,
    method_name="RandomForestRUL",
    method_param_space=rul_param_space,
    experiment_cls=SupervisedRULPdMExperiment,
    experiment_name="RUL",
    fit_size=100,
    profile_size=2,
)
Supervised survival-analysis flavor#
from pdmlabs.experiment.batch.SA_experiment import Supervised_SA_PdMExperiment
from pdmlabs.thresholding.SurvSuperVisedTH import SurvToRUL
# Parameter space for the "XGBOOST" technique, generated by the supervised-technique helper.
sa_param_space = automatic_parameter_generation.supervised_technique("XGBOOST", 100)
# Survival-analysis flavor: XGBoost with the SurvToRUL thresholder instead of the default.
result_sa = run_one_flavor(
    dataset=ds_sa,
    method_class=XGBoost,
    method_name="XGBOOST",
    method_param_space=sa_param_space,
    experiment_cls=Supervised_SA_PdMExperiment,
    experiment_name="Survival Analysis",
    fit_size=100,
    profile_size=2,
    thresholder_cls=SurvToRUL,
)
Inspect Results#
# Print every flavor's result next to its label, in the order the flavors were run.
for label, flavor_result in (
    ("AutoProfile:", result_auto_profile),
    ("Unsupervised:", result_unsupervised),
    ("Classification:", result_classification),
    ("RUL:", result_rul),
    ("SA:", result_sa),
):
    print(label, flavor_result)
Each call returns a list with one item for this setup (one method x one experiment),
typically containing keys such as best_params, best_objective, and th.
Step 5: Load and Use Stored MLflow Models#
PdMLabs automatically logs the best pipeline (including preprocessors, methods, postprocessors, and thresholders) as an MLflow pyfunc artifact for each experiment.
You can easily load this pipeline later to generate predictions on test or streaming data:
import mlflow
# 1. Find your experiment and run ID
experiment_name = "AutoProfile Semi IF"  # Name format is typically "{experiment_name} {method_name}"
experiment = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on the next line.
if experiment is None:
    raise RuntimeError(
        f"MLflow experiment {experiment_name!r} was not found; "
        "check the experiment name and the tracking location."
    )
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time DESC"],
    max_results=1,
)
# An experiment without runs yields an empty result; iloc[0] would raise IndexError.
if runs.empty:
    raise RuntimeError(f"MLflow experiment {experiment_name!r} has no runs to load.")
run_id = runs.iloc[0].run_id
# 2. Load the complete pipeline
loaded_pipeline = mlflow.pyfunc.load_model(f"runs:/{run_id}/best_pdm_pipeline")
# 3. Make predictions
# The pipeline handles data transformations and thresholds transparently
target_data = ds_semi['target_data'][0]
target_source = ds_semi['target_sources'][0]
# Pre-process raw test data (e.g., dropping timestamps) before prediction if necessary
if 'timestamp' in target_data.columns:
    target_data = target_data.drop(columns=['timestamp'])
predictions = loaded_pipeline.predict(
    {
        'target_data': target_data,
        'source': target_source,
        'event_data': ds_semi['event_data'],  # optional
    }
)
print("Anomaly Scores:", predictions['scores'])
# .get() is used because the thresholds key may be absent from the prediction output.
print("Dynamic Thresholds:", predictions.get('dynamic_thresholds'))
Troubleshooting#
"ModuleNotFoundError: No module named 'pdmlabs'"
pip install .
No experiments are running
Check that your event columns create valid episodes and that each train/val/test split contains at least one failure episode.
MLflow issues
Keep mlflow_port=None for first runs. Enable later only if needed.