ML for Developers

This commit is contained in:
GokuMohandas
2023-07-26 04:53:11 -07:00
commit 776a75b010
54 changed files with 55464 additions and 0 deletions

244
madewithml/config.py Normal file
View File

@@ -0,0 +1,244 @@
# config.py
import logging
import sys
from pathlib import Path
import mlflow
import pretty_errors # NOQA: F401 (imported but unused)
# Directories
ROOT_DIR = Path(__file__).parent.parent.absolute()
LOGS_DIR = Path(ROOT_DIR, "logs")
LOGS_DIR.mkdir(parents=True, exist_ok=True)
# Config MLflow
MODEL_REGISTRY = Path("/tmp/mlflow")
Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True)
MLFLOW_TRACKING_URI = "file://" + str(MODEL_REGISTRY.absolute())
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Logger
logging_config = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"minimal": {"format": "%(message)s"},
"detailed": {"format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"},
},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"stream": sys.stdout,
"formatter": "minimal",
"level": logging.DEBUG,
},
"info": {
"class": "logging.handlers.RotatingFileHandler",
"filename": Path(LOGS_DIR, "info.log"),
"maxBytes": 10485760, # 1 MB
"backupCount": 10,
"formatter": "detailed",
"level": logging.INFO,
},
"error": {
"class": "logging.handlers.RotatingFileHandler",
"filename": Path(LOGS_DIR, "error.log"),
"maxBytes": 10485760, # 1 MB
"backupCount": 10,
"formatter": "detailed",
"level": logging.ERROR,
},
},
"root": {
"handlers": ["console", "info", "error"],
"level": logging.INFO,
"propagate": True,
},
}
# Logger
logging.config.dictConfig(logging_config)
logger = logging.getLogger()
# Constraints
STOPWORDS = [
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
"you're",
"you've",
"you'll",
"you'd",
"your",
"yours",
"yourself",
"yourselves",
"he",
"him",
"his",
"himself",
"she",
"she's",
"her",
"hers",
"herself",
"it",
"it's",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
"that",
"that'll",
"these",
"those",
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
"but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
"at",
"by",
"for",
"with",
"about",
"against",
"between",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
"up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
"again",
"further",
"then",
"once",
"here",
"there",
"when",
"where",
"why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
"can",
"will",
"just",
"don",
"don't",
"should",
"should've",
"now",
"d",
"ll",
"m",
"o",
"re",
"ve",
"y",
"ain",
"aren",
"aren't",
"couldn",
"couldn't",
"didn",
"didn't",
"doesn",
"doesn't",
"hadn",
"hadn't",
"hasn",
"hasn't",
"haven",
"haven't",
"isn",
"isn't",
"ma",
"mightn",
"mightn't",
"mustn",
"mustn't",
"needn",
"needn't",
"shan",
"shan't",
"shouldn",
"shouldn't",
"wasn",
"wasn't",
"weren",
"weren't",
"won",
"won't",
"wouldn",
"wouldn't",
]

147
madewithml/data.py Normal file
View File

@@ -0,0 +1,147 @@
import re
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import ray
from ray.data import Dataset
from ray.data.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from madewithml.config import STOPWORDS
def load_data(dataset_loc: str, num_samples: int = None) -> Dataset:
"""Load data from source into a Ray Dataset.
Args:
dataset_loc (str): Location of the dataset.
num_samples (int, optional): The number of samples to load. Defaults to None.
Returns:
Dataset: Our dataset represented by a Ray Dataset.
"""
ds = ray.data.read_csv(dataset_loc)
ds = ds.random_shuffle(seed=1234)
ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
return ds
def stratify_split(
ds: Dataset,
stratify: str,
test_size: float,
shuffle: bool = True,
seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
"""Split a dataset into train and test splits with equal
amounts of data points from each class in the column we
want to stratify on.
Args:
ds (Dataset): Input dataset to split.
stratify (str): Name of column to split on.
test_size (float): Proportion of dataset to split for test set.
shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
seed (int, optional): seed for shuffling. Defaults to 1234.
Returns:
Tuple[Dataset, Dataset]: the stratified train and test datasets.
"""
def _add_split(df: pd.DataFrame) -> pd.DataFrame: # pragma: no cover, used in parent function
"""Naively split a dataframe into train and test splits.
Add a column specifying whether it's the train or test split."""
train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
train["_split"] = "train"
test["_split"] = "test"
return pd.concat([train, test])
def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame: # pragma: no cover, used in parent function
"""Filter by data points that match the split column's value
and return the dataframe with the _split column dropped."""
return df[df["_split"] == split].drop("_split", axis=1)
# Train, test split with stratify
grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas") # group by each unique value in the column we want to stratify on
train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas") # combine
test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas") # combine
# Shuffle each split (required)
train_ds = train_ds.random_shuffle(seed=seed)
test_ds = test_ds.random_shuffle(seed=seed)
return train_ds, test_ds
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
"""Clean raw text string.
Args:
text (str): Raw text to clean.
stopwords (List, optional): list of words to filter out. Defaults to STOPWORDS.
Returns:
str: cleaned text.
"""
# Lower
text = text.lower()
# Remove stopwords
pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
text = pattern.sub(" ", text)
# Spacing and filters
text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text) # add spacing
text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
text = re.sub(" +", " ", text) # remove multiple spaces
text = text.strip() # strip white space at the ends
text = re.sub(r"http\S+", "", text) # remove links
return text
def tokenize(batch: Dict) -> Dict:
"""Tokenize the text input in our batch using a tokenizer.
Args:
batch (Dict): batch of data with the text inputs to tokenize.
Returns:
Dict: batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
"""
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
"""Preprocess the data in our dataframe.
Args:
df (pd.DataFrame): Raw dataframe to preprocess.
class_to_index (Dict): Mapping of class names to indices.
Returns:
Dict: preprocessed data (ids, masks, targets).
"""
df["text"] = df.title + " " + df.description # feature engineering
df["text"] = df.text.apply(clean_text) # clean text
df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore") # clean dataframe
df = df[["text", "tag"]] # rearrange columns
df["tag"] = df["tag"].map(class_to_index) # label encoding
outputs = tokenize(df)
return outputs
class CustomPreprocessor(Preprocessor):
"""Custom preprocessor class."""
def _fit(self, ds):
tags = ds.unique(column="tag")
self.class_to_index = {tag: i for i, tag in enumerate(tags)}
self.index_to_class = {v: k for k, v in self.class_to_index.items()}
def _transform_pandas(self, batch): # could also do _transform_numpy
return preprocess(batch, class_to_index=self.class_to_index)

154
madewithml/evaluate.py Normal file
View File

@@ -0,0 +1,154 @@
import datetime
import json
from collections import OrderedDict
from typing import Dict
import numpy as np
import ray
import ray.train.torch # NOQA: F401 (imported but unused)
import typer
from ray.data import Dataset
from ray.train.torch.torch_predictor import TorchPredictor
from sklearn.metrics import precision_recall_fscore_support
from snorkel.slicing import PandasSFApplier, slicing_function
from typing_extensions import Annotated
from madewithml import predict, utils
from madewithml.config import logger
# Initialize Typer CLI app
app = typer.Typer()
def get_overall_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict: # pragma: no cover, eval workload
"""Get overall performance metrics.
Args:
y_true (np.ndarray): ground truth labels.
y_pred (np.ndarray): predicted labels.
Returns:
Dict: overall metrics.
"""
metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
overall_metrics = {
"precision": metrics[0],
"recall": metrics[1],
"f1": metrics[2],
"num_samples": np.float64(len(y_true)),
}
return overall_metrics
def get_per_class_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_to_index: Dict) -> Dict: # pragma: no cover, eval workload
"""Get per class performance metrics.
Args:
y_true (np.ndarray): ground truth labels.
y_pred (np.ndarray): predicted labels.
class_to_index (Dict): dictionary mapping class to index.
Returns:
Dict: per class metrics.
"""
per_class_metrics = {}
metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
for i, _class in enumerate(class_to_index):
per_class_metrics[_class] = {
"precision": metrics[0][i],
"recall": metrics[1][i],
"f1": metrics[2][i],
"num_samples": np.float64(metrics[3][i]),
}
sorted_per_class_metrics = OrderedDict(sorted(per_class_metrics.items(), key=lambda tag: tag[1]["f1"], reverse=True))
return sorted_per_class_metrics
@slicing_function()
def nlp_llm(x): # pragma: no cover, eval workload
"""NLP projects that use LLMs."""
nlp_project = "natural-language-processing" in x.tag
llm_terms = ["transformer", "llm", "bert"]
llm_project = any(s.lower() in x.text.lower() for s in llm_terms)
return nlp_project and llm_project
@slicing_function()
def short_text(x): # pragma: no cover, eval workload
"""Projects with short titles and descriptions."""
return len(x.text.split()) < 8 # less than 8 words
def get_slice_metrics(y_true: np.ndarray, y_pred: np.ndarray, ds: Dataset) -> Dict: # pragma: no cover, eval workload
"""Get performance metrics for slices.
Args:
y_true (np.ndarray): ground truth labels.
y_pred (np.ndarray): predicted labels.
ds (Dataset): Ray dataset with labels.
Returns:
Dict: performance metrics for slices.
"""
slice_metrics = {}
df = ds.to_pandas()
df["text"] = df["title"] + " " + df["description"]
slices = PandasSFApplier([nlp_llm, short_text]).apply(df)
for slice_name in slices.dtype.names:
mask = slices[slice_name].astype(bool)
if sum(mask):
metrics = precision_recall_fscore_support(y_true[mask], y_pred[mask], average="micro")
slice_metrics[slice_name] = {}
slice_metrics[slice_name]["precision"] = metrics[0]
slice_metrics[slice_name]["recall"] = metrics[1]
slice_metrics[slice_name]["f1"] = metrics[2]
slice_metrics[slice_name]["num_samples"] = len(y_true[mask])
return slice_metrics
@app.command()
def evaluate(
run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
dataset_loc: Annotated[str, typer.Option(help="dataset (with labels) to evaluate on")] = None,
results_fp: Annotated[str, typer.Option(help="location to save evaluation results to")] = None,
) -> Dict: # pragma: no cover, eval workload
"""Evaluate on the holdout dataset.
Args:
run_id (str): id of the specific run to load from. Defaults to None.
dataset_loc (str): dataset (with labels) to evaluate on.
results_fp (str, optional): location to save evaluation results to. Defaults to None.
Returns:
Dict: model's performance metrics on the dataset.
"""
# Load
ds = ray.data.read_csv(dataset_loc)
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
# y_true
preprocessor = predictor.get_preprocessor()
preprocessed_ds = preprocessor.transform(ds)
values = preprocessed_ds.select_columns(cols=["targets"]).take_all()
y_true = np.stack([item["targets"] for item in values])
# y_pred
z = predictor.predict(data=ds.to_pandas())["predictions"]
y_pred = np.stack(z).argmax(1)
# Metrics
metrics = {
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
"run_id": run_id,
"overall": get_overall_metrics(y_true=y_true, y_pred=y_pred),
"per_class": get_per_class_metrics(y_true=y_true, y_pred=y_pred, class_to_index=preprocessor.class_to_index),
"slices": get_slice_metrics(y_true=y_true, y_pred=y_pred, ds=ds),
}
logger.info(json.dumps(metrics, indent=2))
if results_fp: # pragma: no cover, saving results
utils.save_dict(d=metrics, path=results_fp)
return metrics
if __name__ == "__main__": # pragma: no cover, checked during evaluation workload
app()

19
madewithml/models.py Normal file
View File

@@ -0,0 +1,19 @@
import torch
import torch.nn as nn
class FinetunedLLM(nn.Module): # pragma: no cover, torch model
"""Model architecture for a Large Language Model (LLM) that we will fine-tune."""
def __init__(self, llm, dropout_p, embedding_dim, num_classes):
super(FinetunedLLM, self).__init__()
self.llm = llm
self.dropout = torch.nn.Dropout(dropout_p)
self.fc1 = torch.nn.Linear(embedding_dim, num_classes)
def forward(self, batch):
ids, masks = batch["ids"], batch["masks"]
seq, pool = self.llm(input_ids=ids, attention_mask=masks)
z = self.dropout(pool)
z = self.fc1(z)
return z

139
madewithml/predict.py Normal file
View File

@@ -0,0 +1,139 @@
import json
from typing import Any, Dict, Iterable, List
from urllib.parse import urlparse
import pandas as pd
import ray
import torch
import typer
from numpyencoder import NumpyEncoder
from ray.air import Result
from ray.train.torch import TorchPredictor
from ray.train.torch.torch_checkpoint import TorchCheckpoint
from typing_extensions import Annotated
from madewithml.config import logger, mlflow
# Initialize Typer CLI app
app = typer.Typer()
def decode(indices: Iterable[Any], index_to_class: Dict) -> List:
"""Decode indices to labels.
Args:
indices (Iterable[Any]): Iterable (list, array, etc.) with indices.
index_to_class (Dict): mapping between indices and labels.
Returns:
List: list of labels.
"""
return [index_to_class[index] for index in indices]
def format_prob(prob: Iterable, index_to_class: Dict) -> Dict:
"""Format probabilities to a dictionary mapping class label to probability.
Args:
prob (Iterable): probabilities.
index_to_class (Dict): mapping between indices and labels.
Returns:
Dict: Dictionary mapping class label to probability.
"""
d = {}
for i, item in enumerate(prob):
d[index_to_class[i]] = item
return d
def predict_with_proba(
df: pd.DataFrame,
predictor: ray.train.torch.torch_predictor.TorchPredictor,
) -> List: # pragma: no cover, tested with inference workload
"""Predict tags (with probabilities) for input data from a dataframe.
Args:
df (pd.DataFrame): dataframe with input features.
predictor (ray.train.torch.torch_predictor.TorchPredictor): loaded predictor from a checkpoint.
Returns:
List: list of predicted labels.
"""
preprocessor = predictor.get_preprocessor()
z = predictor.predict(data=df)["predictions"]
import numpy as np
y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
results = []
for i, prob in enumerate(y_prob):
tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
return results
@app.command()
def get_best_run_id(experiment_name: str = "", metric: str = "", mode: str = "") -> str: # pragma: no cover, mlflow logic
"""Get the best run_id from an MLflow experiment.
Args:
experiment_name (str): name of the experiment.
metric (str): metric to filter by.
mode (str): direction of metric (ASC/DESC).
Returns:
str: best run id from experiment.
"""
sorted_runs = mlflow.search_runs(
experiment_names=[experiment_name],
order_by=[f"metrics.{metric} {mode}"],
)
run_id = sorted_runs.iloc[0].run_id
print(run_id)
return run_id
def get_best_checkpoint(run_id: str) -> TorchCheckpoint: # pragma: no cover, mlflow logic
"""Get the best checkpoint from a specific run.
Args:
run_id (str): ID of the run to get the best checkpoint from.
Returns:
TorchCheckpoint: Best checkpoint from the run.
"""
artifact_dir = urlparse(mlflow.get_run(run_id).info.artifact_uri).path # get path from mlflow
results = Result.from_path(artifact_dir)
return results.best_checkpoints[0][0]
@app.command()
def predict(
run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
title: Annotated[str, typer.Option(help="project title")] = None,
description: Annotated[str, typer.Option(help="project description")] = None,
) -> Dict: # pragma: no cover, tested with inference workload
"""Predict the tag for a project given it's title and description.
Args:
run_id (str): id of the specific run to load from. Defaults to None.
title (str, optional): project title. Defaults to "".
description (str, optional): project description. Defaults to "".
Returns:
Dict: prediction results for the input data.
"""
# Load components
best_checkpoint = get_best_checkpoint(run_id=run_id)
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
preprocessor = predictor.get_preprocessor()
# Predict
sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
results = predict_with_proba(df=sample_df, predictor=predictor, index_to_class=preprocessor.index_to_class)
logger.info(json.dumps(results, cls=NumpyEncoder, indent=2))
return results
if __name__ == "__main__": # pragma: no cover, application
app()

79
madewithml/serve.py Normal file
View File

@@ -0,0 +1,79 @@
import argparse
from http import HTTPStatus
from typing import Dict
import pandas as pd
import ray
from fastapi import FastAPI
from ray import serve
from ray.train.torch import TorchPredictor
from starlette.requests import Request
from madewithml import evaluate, predict
from madewithml.config import MLFLOW_TRACKING_URI, mlflow
# Define application
app = FastAPI(
title="Made With ML",
description="Classify machine learning projects.",
version="0.1",
)
@serve.deployment(route_prefix="/", num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
@serve.ingress(app)
class ModelDeployment:
def __init__(self, run_id: str, threshold: int = 0.9):
"""Initialize the model."""
self.run_id = run_id
self.threshold = threshold
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # so workers have access to model registry
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
self.predictor = TorchPredictor.from_checkpoint(best_checkpoint)
self.preprocessor = self.predictor.get_preprocessor()
@app.get("/")
def _index(self) -> Dict:
"""Health check."""
response = {
"message": HTTPStatus.OK.phrase,
"status-code": HTTPStatus.OK,
"data": {},
}
return response
@app.get("/run_id/")
def _run_id(self) -> Dict:
"""Get the run ID."""
return {"run_id": self.run_id}
@app.post("/evaluate/")
async def _evaluate(self, request: Request) -> Dict:
data = await request.json()
results = evaluate.evaluate(run_id=self.run_id, dataset_loc=data.get("dataset"))
return {"results": results}
@app.post("/predict/")
async def _predict(self, request: Request) -> Dict:
# Get prediction
data = await request.json()
df = pd.DataFrame([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
results = predict.predict_with_proba(df=df, predictor=self.predictor)
# Apply custom logic
for i, result in enumerate(results):
pred = result["prediction"]
prob = result["probabilities"]
if prob[pred] < self.threshold:
results[i]["prediction"] = "other"
return {"results": results}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--run_id", help="run ID to use for serving.")
parser.add_argument("--threshold", type=float, default=0.9, help="threshold for `other` class.")
args = parser.parse_args()
ray.init()
serve.run(ModelDeployment.bind(run_id=args.run_id, threshold=args.threshold))

256
madewithml/train.py Normal file
View File

@@ -0,0 +1,256 @@
import datetime
import json
from typing import Tuple
import numpy as np
import ray
import ray.train as train
import torch
import torch.nn as nn
import torch.nn.functional as F
import typer
from ray.air import session
from ray.air.config import (
CheckpointConfig,
DatasetConfig,
RunConfig,
ScalingConfig,
)
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.data import Dataset
from ray.train.torch import TorchCheckpoint, TorchTrainer
from transformers import BertModel
from typing_extensions import Annotated
from madewithml import data, models, utils
from madewithml.config import MLFLOW_TRACKING_URI, logger
# Initialize Typer CLI app
app = typer.Typer()
def train_step(
ds: Dataset,
batch_size: int,
model: nn.Module,
num_classes: int,
loss_fn: torch.nn.modules.loss._WeightedLoss,
optimizer: torch.optim.Optimizer,
) -> float: # pragma: no cover, tested via train workload
"""Train step.
Args:
ds (Dataset): dataset to iterate batches from.
batch_size (int): size of each batch.
model (nn.Module): model to train.
num_classes (int): number of classes.
loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
optimizer (torch.optimizer.Optimizer): optimizer to use for updating the model's weights.
Returns:
float: cumulative loss for the dataset.
"""
model.train()
loss = 0.0
ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
for i, batch in enumerate(ds_generator):
optimizer.zero_grad() # reset gradients
z = model(batch) # forward pass
targets = F.one_hot(batch["targets"], num_classes=num_classes).float() # one-hot (for loss_fn)
J = loss_fn(z, targets) # define loss
J.backward() # backward pass
optimizer.step() # update weights
loss += (J.detach().item() - loss) / (i + 1) # cumulative loss
return loss
def eval_step(
ds: Dataset, batch_size: int, model: nn.Module, num_classes: int, loss_fn: torch.nn.modules.loss._WeightedLoss
) -> Tuple[float, np.array, np.array]: # pragma: no cover, tested via train workload
"""Eval step.
Args:
ds (Dataset): dataset to iterate batches from.
batch_size (int): size of each batch.
model (nn.Module): model to train.
num_classes (int): number of classes.
loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
Returns:
Tuple[float, np.array, np.array]: cumulative loss, ground truths and predictions.
"""
model.eval()
loss = 0.0
y_trues, y_preds = [], []
ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
with torch.inference_mode():
for i, batch in enumerate(ds_generator):
z = model(batch)
targets = F.one_hot(batch["targets"], num_classes=num_classes).float() # one-hot (for loss_fn)
J = loss_fn(z, targets).item()
loss += (J - loss) / (i + 1)
y_trues.extend(batch["targets"].cpu().numpy())
y_preds.extend(torch.argmax(z, dim=1).cpu().numpy())
return loss, np.vstack(y_trues), np.vstack(y_preds)
def train_loop_per_worker(config: dict) -> None: # pragma: no cover, tested via train workload
"""Training loop that each worker will execute.
Args:
config (dict): arguments to use for training.
"""
# Hyperparameters
dropout_p = config["dropout_p"]
lr = config["lr"]
lr_factor = config["lr_factor"]
lr_patience = config["lr_patience"]
batch_size = config["batch_size"]
num_epochs = config["num_epochs"]
num_classes = config["num_classes"]
# Get datasets
utils.set_seeds()
train_ds = session.get_dataset_shard("train")
val_ds = session.get_dataset_shard("val")
# Model
llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
model = models.FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
model = train.torch.prepare_model(model)
# Training components
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)
# Training
batch_size_per_worker = batch_size // session.get_world_size()
for epoch in range(num_epochs):
# Step
train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, num_classes, loss_fn)
scheduler.step(val_loss)
# Checkpoint
metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
checkpoint = TorchCheckpoint.from_model(model=model)
session.report(metrics, checkpoint=checkpoint)
@app.command()
def train_model(
experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
train_loop_config: Annotated[str, typer.Option(help="arguments to use for training.")] = None,
num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
) -> ray.air.result.Result:
"""Main train function to train our model as a distributed workload.
Args:
experiment_name (str): name of the experiment for this training workload.
dataset_loc (str): location of the dataset.
train_loop_config (str): arguments to use for training.
num_workers (int, optional): number of workers to use for training. Defaults to 1.
cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
num_samples (int, optional): number of samples to use from dataset.
If this is passed in, it will override the config. Defaults to None.
num_epochs (int, optional): number of epochs to train for.
If this is passed in, it will override the config. Defaults to None.
batch_size (int, optional): number of samples per batch.
If this is passed in, it will override the config. Defaults to None.
results_fp (str, optional): filepath to save results to. Defaults to None.
Returns:
ray.air.result.Result: training results.
"""
# Set up
train_loop_config = json.loads(train_loop_config)
train_loop_config["num_samples"] = num_samples
train_loop_config["num_epochs"] = num_epochs
train_loop_config["batch_size"] = batch_size
# Scaling config
scaling_config = ScalingConfig(
num_workers=num_workers,
use_gpu=bool(gpu_per_worker),
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
_max_cpu_fraction_per_node=0.8,
)
# Checkpoint config
checkpoint_config = CheckpointConfig(
num_to_keep=1,
checkpoint_score_attribute="val_loss",
checkpoint_score_order="min",
)
# MLflow callback
mlflow_callback = MLflowLoggerCallback(
tracking_uri=MLFLOW_TRACKING_URI,
experiment_name=experiment_name,
save_artifact=True,
)
# Run config
run_config = RunConfig(
callbacks=[mlflow_callback],
checkpoint_config=checkpoint_config,
)
# Dataset
ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"])
train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
tags = train_ds.unique(column="tag")
train_loop_config["num_classes"] = len(tags)
# Dataset config
dataset_config = {
"train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
"val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
}
# Preprocess
preprocessor = data.CustomPreprocessor()
train_ds = preprocessor.fit_transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()
# Trainer
trainer = TorchTrainer(
train_loop_per_worker=train_loop_per_worker,
train_loop_config=train_loop_config,
scaling_config=scaling_config,
run_config=run_config,
datasets={"train": train_ds, "val": val_ds},
dataset_config=dataset_config,
preprocessor=preprocessor,
)
# Train
results = trainer.fit()
d = {
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
"run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=results.metrics["trial_id"]),
"params": results.config["train_loop_config"],
"metrics": utils.dict_to_list(results.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
}
logger.info(json.dumps(d, indent=2))
if results_fp: # pragma: no cover, saving results
utils.save_dict(d, results_fp)
return results
if __name__ == "__main__": # pragma: no cover, application
if ray.is_initialized():
ray.shutdown()
ray.init()
app()

182
madewithml/tune.py Normal file
View File

@@ -0,0 +1,182 @@
import datetime
import json
import ray
import typer
from ray import tune
from ray.air.config import (
CheckpointConfig,
DatasetConfig,
RunConfig,
ScalingConfig,
)
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.train.torch import TorchTrainer
from ray.tune import Tuner
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.hyperopt import HyperOptSearch
from typing_extensions import Annotated
from madewithml import data, train, utils
from madewithml.config import MLFLOW_TRACKING_URI, logger
# Initialize Typer CLI app
app = typer.Typer()
@app.command()
def tune_models(
experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
initial_params: Annotated[str, typer.Option(help="initial config for the tuning workload.")] = None,
num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
num_runs: Annotated[int, typer.Option(help="number of runs in this tuning experiment.")] = 1,
num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
) -> ray.tune.result_grid.ResultGrid:
"""Hyperparameter tuning experiment.
Args:
experiment_name (str): name of the experiment for this training workload.
dataset_loc (str): location of the dataset.
initial_params (str): initial config for the tuning workload.
num_workers (int, optional): number of workers to use for training. Defaults to 1.
cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
num_runs (int, optional): number of runs in this tuning experiment. Defaults to 1.
num_samples (int, optional): number of samples to use from dataset.
If this is passed in, it will override the config. Defaults to None.
num_epochs (int, optional): number of epochs to train for.
If this is passed in, it will override the config. Defaults to None.
batch_size (int, optional): number of samples per batch.
If this is passed in, it will override the config. Defaults to None.
results_fp (str, optional): filepath to save the tuning results. Defaults to None.
Returns:
ray.tune.result_grid.ResultGrid: results of the tuning experiment.
"""
# Set up
utils.set_seeds()
train_loop_config = {}
train_loop_config["num_samples"] = num_samples
train_loop_config["num_epochs"] = num_epochs
train_loop_config["batch_size"] = batch_size
# Scaling config
scaling_config = ScalingConfig(
num_workers=num_workers,
use_gpu=bool(gpu_per_worker),
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
_max_cpu_fraction_per_node=0.8,
)
# Dataset
ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config.get("num_samples", None))
train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
tags = train_ds.unique(column="tag")
train_loop_config["num_classes"] = len(tags)
# Dataset config
dataset_config = {
"train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
"val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
}
# Preprocess
preprocessor = data.CustomPreprocessor()
train_ds = preprocessor.fit_transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()
# Trainer
trainer = TorchTrainer(
train_loop_per_worker=train.train_loop_per_worker,
train_loop_config=train_loop_config,
scaling_config=scaling_config,
datasets={"train": train_ds, "val": val_ds},
dataset_config=dataset_config,
preprocessor=preprocessor,
)
# Checkpoint configuration
checkpoint_config = CheckpointConfig(
num_to_keep=1,
checkpoint_score_attribute="val_loss",
checkpoint_score_order="min",
)
# Run configuration
mlflow_callback = MLflowLoggerCallback(
tracking_uri=MLFLOW_TRACKING_URI,
experiment_name=experiment_name,
save_artifact=True,
)
run_config = RunConfig(
callbacks=[mlflow_callback],
checkpoint_config=checkpoint_config,
)
# Hyperparameters to start with
initial_params = json.loads(initial_params)
search_alg = HyperOptSearch(points_to_evaluate=initial_params)
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2) # trade off b/w optimization and search space
# Parameter space
param_space = {
"train_loop_config": {
"dropout_p": tune.uniform(0.3, 0.9),
"lr": tune.loguniform(1e-5, 5e-4),
"lr_factor": tune.uniform(0.1, 0.9),
"lr_patience": tune.uniform(1, 10),
}
}
# Scheduler
scheduler = AsyncHyperBandScheduler(
max_t=train_loop_config["num_epochs"], # max epoch (<time_attr>) per trial
grace_period=1, # min epoch (<time_attr>) per trial
)
# Tune config
tune_config = tune.TuneConfig(
metric="val_loss",
mode="min",
search_alg=search_alg,
scheduler=scheduler,
num_samples=num_runs,
)
# Tuner
tuner = Tuner(
trainable=trainer,
run_config=run_config,
param_space=param_space,
tune_config=tune_config,
)
# Tune
results = tuner.fit()
best_trial = results.get_best_result(metric="val_loss", mode="min")
d = {
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
"run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=best_trial.metrics["trial_id"]),
"params": best_trial.config["train_loop_config"],
"metrics": utils.dict_to_list(best_trial.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
}
logger.info(json.dumps(d, indent=2))
if results_fp: # pragma: no cover, saving results
utils.save_dict(d, results_fp)
return results
if __name__ == "__main__": # pragma: no cover, application
if ray.is_initialized():
ray.shutdown()
ray.init()
app()

123
madewithml/utils.py Normal file
View File

@@ -0,0 +1,123 @@
import json
import os
import random
from typing import Any, Dict, List
import numpy as np
import torch
from ray.data import DatasetContext
from ray.train.torch import get_device
from madewithml.config import mlflow
DatasetContext.get_current().execution_options.preserve_order = True
def set_seeds(seed: int = 42):
"""Set seeds for reproducibility."""
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
eval("setattr(torch.backends.cudnn, 'deterministic', True)")
eval("setattr(torch.backends.cudnn, 'benchmark', False)")
os.environ["PYTHONHASHSEED"] = str(seed)
def load_dict(path: str) -> Dict:
"""Load a dictionary from a JSON's filepath.
Args:
path (str): location of file.
Returns:
Dict: loaded JSON data.
"""
with open(path) as fp:
d = json.load(fp)
return d
def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
"""Save a dictionary to a specific location.
Args:
d (Dict): data to save.
path (str): location of where to save the data.
cls (optional): encoder to use on dict data. Defaults to None.
sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
"""
directory = os.path.dirname(path)
if directory and not os.path.exists(directory): # pragma: no cover
os.makedirs(directory)
with open(path, "w") as fp:
json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
fp.write("\n")
def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
"""Pad an 2D array with zeros until all rows in the
2D array are of the same length as a the longest
row in the 2D array.
Args:
arr (np.array): input array
Returns:
np.array: zero padded array
"""
max_len = max(len(row) for row in arr)
padded_arr = np.zeros((arr.shape[0], max_len), dtype=dtype)
for i, row in enumerate(arr):
padded_arr[i][: len(row)] = row
return padded_arr
def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]: # pragma: no cover, air internal
"""Convert a batch of numpy arrays to tensors (with appropriate padding).
Args:
batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.
Returns:
Dict[str, torch.Tensor]: output batch as a dictionary of tensors.
"""
batch["ids"] = pad_array(batch["ids"])
batch["masks"] = pad_array(batch["masks"])
dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
tensor_batch = {}
for key, array in batch.items():
tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())
return tensor_batch
def get_run_id(experiment_name: str, trial_id: str) -> str: # pragma: no cover, mlflow functionality
"""Get the MLflow run ID for a specific Ray trial ID.
Args:
experiment_name (str): name of the experiment.
trial_id (str): id of the trial.
Returns:
str: run id of the trial.
"""
trial_name = f"TorchTrainer_{trial_id}"
run = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f"tags.trial_name = '{trial_name}'").iloc[0]
return run.run_id
def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
"""Convert a dictionary to a list of dictionaries.
Args:
data (Dict): input dictionary.
keys (List[str]): keys to include in the output list of dictionaries.
Returns:
List[Dict[str, Any]]: output list of dictionaries.
"""
list_of_dicts = []
for i in range(len(data[keys[0]])):
new_dict = {key: data[key][i] for key in keys}
list_of_dicts.append(new_dict)
return list_of_dicts