ML for Developers

2023-07-26 04:53:11 -07:00
commit 776a75b010
54 changed files with 55464 additions and 0 deletions
--- a/madewithml/config.py
+++ b/madewithml/config.py
@@ -0,0 +1,244 @@
+# config.py
+import logging
+import sys
+from pathlib import Path
+
+import mlflow
+import pretty_errors  # NOQA: F401 (imported but unused)
+
+# Directories
+ROOT_DIR = Path(__file__).parent.parent.absolute()
+LOGS_DIR = Path(ROOT_DIR, "logs")
+LOGS_DIR.mkdir(parents=True, exist_ok=True)
+
+# Config MLflow
+MODEL_REGISTRY = Path("/tmp/mlflow")
+Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True)
+MLFLOW_TRACKING_URI = "file://" + str(MODEL_REGISTRY.absolute())
+mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
+
+# Logger
+logging_config = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "minimal": {"format": "%(message)s"},
+        "detailed": {"format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"},
+    },
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "stream": sys.stdout,
+            "formatter": "minimal",
+            "level": logging.DEBUG,
+        },
+        "info": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "filename": Path(LOGS_DIR, "info.log"),
+            "maxBytes": 10485760,  # 1 MB
+            "backupCount": 10,
+            "formatter": "detailed",
+            "level": logging.INFO,
+        },
+        "error": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "filename": Path(LOGS_DIR, "error.log"),
+            "maxBytes": 10485760,  # 1 MB
+            "backupCount": 10,
+            "formatter": "detailed",
+            "level": logging.ERROR,
+        },
+    },
+    "root": {
+        "handlers": ["console", "info", "error"],
+        "level": logging.INFO,
+        "propagate": True,
+    },
+}
+
+# Logger
+logging.config.dictConfig(logging_config)
+logger = logging.getLogger()
+
+# Constraints
+STOPWORDS = [
+    "i",
+    "me",
+    "my",
+    "myself",
+    "we",
+    "our",
+    "ours",
+    "ourselves",
+    "you",
+    "you're",
+    "you've",
+    "you'll",
+    "you'd",
+    "your",
+    "yours",
+    "yourself",
+    "yourselves",
+    "he",
+    "him",
+    "his",
+    "himself",
+    "she",
+    "she's",
+    "her",
+    "hers",
+    "herself",
+    "it",
+    "it's",
+    "its",
+    "itself",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "themselves",
+    "what",
+    "which",
+    "who",
+    "whom",
+    "this",
+    "that",
+    "that'll",
+    "these",
+    "those",
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "being",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
+    "a",
+    "an",
+    "the",
+    "and",
+    "but",
+    "if",
+    "or",
+    "because",
+    "as",
+    "until",
+    "while",
+    "of",
+    "at",
+    "by",
+    "for",
+    "with",
+    "about",
+    "against",
+    "between",
+    "into",
+    "through",
+    "during",
+    "before",
+    "after",
+    "above",
+    "below",
+    "to",
+    "from",
+    "up",
+    "down",
+    "in",
+    "out",
+    "on",
+    "off",
+    "over",
+    "under",
+    "again",
+    "further",
+    "then",
+    "once",
+    "here",
+    "there",
+    "when",
+    "where",
+    "why",
+    "how",
+    "all",
+    "any",
+    "both",
+    "each",
+    "few",
+    "more",
+    "most",
+    "other",
+    "some",
+    "such",
+    "no",
+    "nor",
+    "not",
+    "only",
+    "own",
+    "same",
+    "so",
+    "than",
+    "too",
+    "very",
+    "s",
+    "t",
+    "can",
+    "will",
+    "just",
+    "don",
+    "don't",
+    "should",
+    "should've",
+    "now",
+    "d",
+    "ll",
+    "m",
+    "o",
+    "re",
+    "ve",
+    "y",
+    "ain",
+    "aren",
+    "aren't",
+    "couldn",
+    "couldn't",
+    "didn",
+    "didn't",
+    "doesn",
+    "doesn't",
+    "hadn",
+    "hadn't",
+    "hasn",
+    "hasn't",
+    "haven",
+    "haven't",
+    "isn",
+    "isn't",
+    "ma",
+    "mightn",
+    "mightn't",
+    "mustn",
+    "mustn't",
+    "needn",
+    "needn't",
+    "shan",
+    "shan't",
+    "shouldn",
+    "shouldn't",
+    "wasn",
+    "wasn't",
+    "weren",
+    "weren't",
+    "won",
+    "won't",
+    "wouldn",
+    "wouldn't",
+]
--- a/madewithml/data.py
+++ b/madewithml/data.py
@@ -0,0 +1,147 @@
+import re
+from typing import Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+import ray
+from ray.data import Dataset
+from ray.data.preprocessor import Preprocessor
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizer
+
+from madewithml.config import STOPWORDS
+
+
+def load_data(dataset_loc: str, num_samples: int = None) -> Dataset:
+    """Load data from source into a Ray Dataset.
+
+    Args:
+        dataset_loc (str): Location of the dataset.
+        num_samples (int, optional): The number of samples to load. Defaults to None.
+
+    Returns:
+        Dataset: Our dataset represented by a Ray Dataset.
+    """
+    ds = ray.data.read_csv(dataset_loc)
+    ds = ds.random_shuffle(seed=1234)
+    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
+    return ds
+
+
+def stratify_split(
+    ds: Dataset,
+    stratify: str,
+    test_size: float,
+    shuffle: bool = True,
+    seed: int = 1234,
+) -> Tuple[Dataset, Dataset]:
+    """Split a dataset into train and test splits with equal
+    amounts of data points from each class in the column we
+    want to stratify on.
+
+    Args:
+        ds (Dataset): Input dataset to split.
+        stratify (str): Name of column to split on.
+        test_size (float): Proportion of dataset to split for test set.
+        shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
+        seed (int, optional): seed for shuffling. Defaults to 1234.
+
+    Returns:
+        Tuple[Dataset, Dataset]: the stratified train and test datasets.
+    """
+
+    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
+        """Naively split a dataframe into train and test splits.
+        Add a column specifying whether it's the train or test split."""
+        train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
+        train["_split"] = "train"
+        test["_split"] = "test"
+        return pd.concat([train, test])
+
+    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
+        """Filter by data points that match the split column's value
+        and return the dataframe with the _split column dropped."""
+        return df[df["_split"] == split].drop("_split", axis=1)
+
+    # Train, test split with stratify
+    grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")  # group by each unique value in the column we want to stratify on
+    train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")  # combine
+    test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")  # combine
+
+    # Shuffle each split (required)
+    train_ds = train_ds.random_shuffle(seed=seed)
+    test_ds = test_ds.random_shuffle(seed=seed)
+
+    return train_ds, test_ds
+
+
+def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
+    """Clean raw text string.
+
+    Args:
+        text (str): Raw text to clean.
+        stopwords (List, optional): list of words to filter out. Defaults to STOPWORDS.
+
+    Returns:
+        str: cleaned text.
+    """
+    # Lower
+    text = text.lower()
+
+    # Remove stopwords
+    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
+    text = pattern.sub(" ", text)
+
+    # Spacing and filters
+    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing
+    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
+    text = re.sub(" +", " ", text)  # remove multiple spaces
+    text = text.strip()  # strip white space at the ends
+    text = re.sub(r"http\S+", "", text)  # remove links
+
+    return text
+
+
+def tokenize(batch: Dict) -> Dict:
+    """Tokenize the text input in our batch using a tokenizer.
+
+    Args:
+        batch (Dict): batch of data with the text inputs to tokenize.
+
+    Returns:
+        Dict: batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
+    """
+    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
+    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
+    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
+
+
+def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
+    """Preprocess the data in our dataframe.
+
+    Args:
+        df (pd.DataFrame): Raw dataframe to preprocess.
+        class_to_index (Dict): Mapping of class names to indices.
+
+    Returns:
+        Dict: preprocessed data (ids, masks, targets).
+    """
+    df["text"] = df.title + " " + df.description  # feature engineering
+    df["text"] = df.text.apply(clean_text)  # clean text
+    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
+    df = df[["text", "tag"]]  # rearrange columns
+    df["tag"] = df["tag"].map(class_to_index)  # label encoding
+    outputs = tokenize(df)
+    return outputs
+
+
+class CustomPreprocessor(Preprocessor):
+    """Custom preprocessor class."""
+
+    def _fit(self, ds):
+        tags = ds.unique(column="tag")
+        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
+        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
+
+    def _transform_pandas(self, batch):  # could also do _transform_numpy
+        return preprocess(batch, class_to_index=self.class_to_index)
--- a/madewithml/evaluate.py
+++ b/madewithml/evaluate.py
@@ -0,0 +1,154 @@
+import datetime
+import json
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+import ray
+import ray.train.torch  # NOQA: F401 (imported but unused)
+import typer
+from ray.data import Dataset
+from ray.train.torch.torch_predictor import TorchPredictor
+from sklearn.metrics import precision_recall_fscore_support
+from snorkel.slicing import PandasSFApplier, slicing_function
+from typing_extensions import Annotated
+
+from madewithml import predict, utils
+from madewithml.config import logger
+
+# Initialize Typer CLI app
+app = typer.Typer()
+
+
+def get_overall_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict:  # pragma: no cover, eval workload
+    """Get overall performance metrics.
+
+    Args:
+        y_true (np.ndarray): ground truth labels.
+        y_pred (np.ndarray): predicted labels.
+
+    Returns:
+        Dict: overall metrics.
+    """
+    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
+    overall_metrics = {
+        "precision": metrics[0],
+        "recall": metrics[1],
+        "f1": metrics[2],
+        "num_samples": np.float64(len(y_true)),
+    }
+    return overall_metrics
+
+
+def get_per_class_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_to_index: Dict) -> Dict:  # pragma: no cover, eval workload
+    """Get per class performance metrics.
+
+    Args:
+        y_true (np.ndarray): ground truth labels.
+        y_pred (np.ndarray): predicted labels.
+        class_to_index (Dict): dictionary mapping class to index.
+
+    Returns:
+        Dict: per class metrics.
+    """
+    per_class_metrics = {}
+    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
+    for i, _class in enumerate(class_to_index):
+        per_class_metrics[_class] = {
+            "precision": metrics[0][i],
+            "recall": metrics[1][i],
+            "f1": metrics[2][i],
+            "num_samples": np.float64(metrics[3][i]),
+        }
+    sorted_per_class_metrics = OrderedDict(sorted(per_class_metrics.items(), key=lambda tag: tag[1]["f1"], reverse=True))
+    return sorted_per_class_metrics
+
+
+@slicing_function()
+def nlp_llm(x):  # pragma: no cover, eval workload
+    """NLP projects that use LLMs."""
+    nlp_project = "natural-language-processing" in x.tag
+    llm_terms = ["transformer", "llm", "bert"]
+    llm_project = any(s.lower() in x.text.lower() for s in llm_terms)
+    return nlp_project and llm_project
+
+
+@slicing_function()
+def short_text(x):  # pragma: no cover, eval workload
+    """Projects with short titles and descriptions."""
+    return len(x.text.split()) < 8  # less than 8 words
+
+
+def get_slice_metrics(y_true: np.ndarray, y_pred: np.ndarray, ds: Dataset) -> Dict:  # pragma: no cover, eval workload
+    """Get performance metrics for slices.
+
+    Args:
+        y_true (np.ndarray): ground truth labels.
+        y_pred (np.ndarray): predicted labels.
+        ds (Dataset): Ray dataset with labels.
+    Returns:
+        Dict: performance metrics for slices.
+    """
+    slice_metrics = {}
+    df = ds.to_pandas()
+    df["text"] = df["title"] + " " + df["description"]
+    slices = PandasSFApplier([nlp_llm, short_text]).apply(df)
+    for slice_name in slices.dtype.names:
+        mask = slices[slice_name].astype(bool)
+        if sum(mask):
+            metrics = precision_recall_fscore_support(y_true[mask], y_pred[mask], average="micro")
+            slice_metrics[slice_name] = {}
+            slice_metrics[slice_name]["precision"] = metrics[0]
+            slice_metrics[slice_name]["recall"] = metrics[1]
+            slice_metrics[slice_name]["f1"] = metrics[2]
+            slice_metrics[slice_name]["num_samples"] = len(y_true[mask])
+    return slice_metrics
+
+
+@app.command()
+def evaluate(
+    run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
+    dataset_loc: Annotated[str, typer.Option(help="dataset (with labels) to evaluate on")] = None,
+    results_fp: Annotated[str, typer.Option(help="location to save evaluation results to")] = None,
+) -> Dict:  # pragma: no cover, eval workload
+    """Evaluate on the holdout dataset.
+
+    Args:
+        run_id (str): id of the specific run to load from. Defaults to None.
+        dataset_loc (str): dataset (with labels) to evaluate on.
+        results_fp (str, optional): location to save evaluation results to. Defaults to None.
+
+    Returns:
+        Dict: model's performance metrics on the dataset.
+    """
+    # Load
+    ds = ray.data.read_csv(dataset_loc)
+    best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
+    predictor = TorchPredictor.from_checkpoint(best_checkpoint)
+
+    # y_true
+    preprocessor = predictor.get_preprocessor()
+    preprocessed_ds = preprocessor.transform(ds)
+    values = preprocessed_ds.select_columns(cols=["targets"]).take_all()
+    y_true = np.stack([item["targets"] for item in values])
+
+    # y_pred
+    z = predictor.predict(data=ds.to_pandas())["predictions"]
+    y_pred = np.stack(z).argmax(1)
+
+    # Metrics
+    metrics = {
+        "timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
+        "run_id": run_id,
+        "overall": get_overall_metrics(y_true=y_true, y_pred=y_pred),
+        "per_class": get_per_class_metrics(y_true=y_true, y_pred=y_pred, class_to_index=preprocessor.class_to_index),
+        "slices": get_slice_metrics(y_true=y_true, y_pred=y_pred, ds=ds),
+    }
+    logger.info(json.dumps(metrics, indent=2))
+    if results_fp:  # pragma: no cover, saving results
+        utils.save_dict(d=metrics, path=results_fp)
+    return metrics
+
+
+if __name__ == "__main__":  # pragma: no cover, checked during evaluation workload
+    app()
--- a/madewithml/models.py
+++ b/madewithml/models.py
@@ -0,0 +1,19 @@
+import torch
+import torch.nn as nn
+
+
+class FinetunedLLM(nn.Module):  # pragma: no cover, torch model
+    """Model architecture for a Large Language Model (LLM) that we will fine-tune."""
+
+    def __init__(self, llm, dropout_p, embedding_dim, num_classes):
+        super(FinetunedLLM, self).__init__()
+        self.llm = llm
+        self.dropout = torch.nn.Dropout(dropout_p)
+        self.fc1 = torch.nn.Linear(embedding_dim, num_classes)
+
+    def forward(self, batch):
+        ids, masks = batch["ids"], batch["masks"]
+        seq, pool = self.llm(input_ids=ids, attention_mask=masks)
+        z = self.dropout(pool)
+        z = self.fc1(z)
+        return z
--- a/madewithml/predict.py
+++ b/madewithml/predict.py
@@ -0,0 +1,139 @@
+import json
+from typing import Any, Dict, Iterable, List
+from urllib.parse import urlparse
+
+import pandas as pd
+import ray
+import torch
+import typer
+from numpyencoder import NumpyEncoder
+from ray.air import Result
+from ray.train.torch import TorchPredictor
+from ray.train.torch.torch_checkpoint import TorchCheckpoint
+from typing_extensions import Annotated
+
+from madewithml.config import logger, mlflow
+
+# Initialize Typer CLI app
+app = typer.Typer()
+
+
+def decode(indices: Iterable[Any], index_to_class: Dict) -> List:
+    """Decode indices to labels.
+
+    Args:
+        indices (Iterable[Any]): Iterable (list, array, etc.) with indices.
+        index_to_class (Dict): mapping between indices and labels.
+
+    Returns:
+        List: list of labels.
+    """
+    return [index_to_class[index] for index in indices]
+
+
+def format_prob(prob: Iterable, index_to_class: Dict) -> Dict:
+    """Format probabilities to a dictionary mapping class label to probability.
+
+    Args:
+        prob (Iterable): probabilities.
+        index_to_class (Dict): mapping between indices and labels.
+
+    Returns:
+        Dict: Dictionary mapping class label to probability.
+    """
+    d = {}
+    for i, item in enumerate(prob):
+        d[index_to_class[i]] = item
+    return d
+
+
+def predict_with_proba(
+    df: pd.DataFrame,
+    predictor: ray.train.torch.torch_predictor.TorchPredictor,
+) -> List:  # pragma: no cover, tested with inference workload
+    """Predict tags (with probabilities) for input data from a dataframe.
+
+    Args:
+        df (pd.DataFrame): dataframe with input features.
+        predictor (ray.train.torch.torch_predictor.TorchPredictor): loaded predictor from a checkpoint.
+
+    Returns:
+        List: list of predicted labels.
+    """
+    preprocessor = predictor.get_preprocessor()
+    z = predictor.predict(data=df)["predictions"]
+    import numpy as np
+
+    y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
+    results = []
+    for i, prob in enumerate(y_prob):
+        tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
+        results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
+    return results
+
+
+@app.command()
+def get_best_run_id(experiment_name: str = "", metric: str = "", mode: str = "") -> str:  # pragma: no cover, mlflow logic
+    """Get the best run_id from an MLflow experiment.
+
+    Args:
+        experiment_name (str): name of the experiment.
+        metric (str): metric to filter by.
+        mode (str): direction of metric (ASC/DESC).
+
+    Returns:
+        str: best run id from experiment.
+    """
+    sorted_runs = mlflow.search_runs(
+        experiment_names=[experiment_name],
+        order_by=[f"metrics.{metric} {mode}"],
+    )
+    run_id = sorted_runs.iloc[0].run_id
+    print(run_id)
+    return run_id
+
+
+def get_best_checkpoint(run_id: str) -> TorchCheckpoint:  # pragma: no cover, mlflow logic
+    """Get the best checkpoint from a specific run.
+
+    Args:
+        run_id (str): ID of the run to get the best checkpoint from.
+
+    Returns:
+        TorchCheckpoint: Best checkpoint from the run.
+    """
+    artifact_dir = urlparse(mlflow.get_run(run_id).info.artifact_uri).path  # get path from mlflow
+    results = Result.from_path(artifact_dir)
+    return results.best_checkpoints[0][0]
+
+
+@app.command()
+def predict(
+    run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
+    title: Annotated[str, typer.Option(help="project title")] = None,
+    description: Annotated[str, typer.Option(help="project description")] = None,
+) -> Dict:  # pragma: no cover, tested with inference workload
+    """Predict the tag for a project given it's title and description.
+
+    Args:
+        run_id (str): id of the specific run to load from. Defaults to None.
+        title (str, optional): project title. Defaults to "".
+        description (str, optional): project description. Defaults to "".
+
+    Returns:
+        Dict: prediction results for the input data.
+    """
+    # Load components
+    best_checkpoint = get_best_checkpoint(run_id=run_id)
+    predictor = TorchPredictor.from_checkpoint(best_checkpoint)
+    preprocessor = predictor.get_preprocessor()
+
+    # Predict
+    sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
+    results = predict_with_proba(df=sample_df, predictor=predictor, index_to_class=preprocessor.index_to_class)
+    logger.info(json.dumps(results, cls=NumpyEncoder, indent=2))
+    return results
+
+
+if __name__ == "__main__":  # pragma: no cover, application
+    app()
--- a/madewithml/serve.py
+++ b/madewithml/serve.py
@@ -0,0 +1,79 @@
+import argparse
+from http import HTTPStatus
+from typing import Dict
+
+import pandas as pd
+import ray
+from fastapi import FastAPI
+from ray import serve
+from ray.train.torch import TorchPredictor
+from starlette.requests import Request
+
+from madewithml import evaluate, predict
+from madewithml.config import MLFLOW_TRACKING_URI, mlflow
+
+# Define application
+app = FastAPI(
+    title="Made With ML",
+    description="Classify machine learning projects.",
+    version="0.1",
+)
+
+
+@serve.deployment(route_prefix="/", num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
+@serve.ingress(app)
+class ModelDeployment:
+    def __init__(self, run_id: str, threshold: int = 0.9):
+        """Initialize the model."""
+        self.run_id = run_id
+        self.threshold = threshold
+        mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)  # so workers have access to model registry
+        best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
+        self.predictor = TorchPredictor.from_checkpoint(best_checkpoint)
+        self.preprocessor = self.predictor.get_preprocessor()
+
+    @app.get("/")
+    def _index(self) -> Dict:
+        """Health check."""
+        response = {
+            "message": HTTPStatus.OK.phrase,
+            "status-code": HTTPStatus.OK,
+            "data": {},
+        }
+        return response
+
+    @app.get("/run_id/")
+    def _run_id(self) -> Dict:
+        """Get the run ID."""
+        return {"run_id": self.run_id}
+
+    @app.post("/evaluate/")
+    async def _evaluate(self, request: Request) -> Dict:
+        data = await request.json()
+        results = evaluate.evaluate(run_id=self.run_id, dataset_loc=data.get("dataset"))
+        return {"results": results}
+
+    @app.post("/predict/")
+    async def _predict(self, request: Request) -> Dict:
+        # Get prediction
+        data = await request.json()
+        df = pd.DataFrame([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
+        results = predict.predict_with_proba(df=df, predictor=self.predictor)
+
+        # Apply custom logic
+        for i, result in enumerate(results):
+            pred = result["prediction"]
+            prob = result["probabilities"]
+            if prob[pred] < self.threshold:
+                results[i]["prediction"] = "other"
+
+        return {"results": results}
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--run_id", help="run ID to use for serving.")
+    parser.add_argument("--threshold", type=float, default=0.9, help="threshold for `other` class.")
+    args = parser.parse_args()
+    ray.init()
+    serve.run(ModelDeployment.bind(run_id=args.run_id, threshold=args.threshold))
--- a/madewithml/train.py
+++ b/madewithml/train.py
@@ -0,0 +1,256 @@
+import datetime
+import json
+from typing import Tuple
+
+import numpy as np
+import ray
+import ray.train as train
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import typer
+from ray.air import session
+from ray.air.config import (
+    CheckpointConfig,
+    DatasetConfig,
+    RunConfig,
+    ScalingConfig,
+)
+from ray.air.integrations.mlflow import MLflowLoggerCallback
+from ray.data import Dataset
+from ray.train.torch import TorchCheckpoint, TorchTrainer
+from transformers import BertModel
+from typing_extensions import Annotated
+
+from madewithml import data, models, utils
+from madewithml.config import MLFLOW_TRACKING_URI, logger
+
+# Initialize Typer CLI app
+app = typer.Typer()
+
+
+def train_step(
+    ds: Dataset,
+    batch_size: int,
+    model: nn.Module,
+    num_classes: int,
+    loss_fn: torch.nn.modules.loss._WeightedLoss,
+    optimizer: torch.optim.Optimizer,
+) -> float:  # pragma: no cover, tested via train workload
+    """Train step.
+
+    Args:
+        ds (Dataset): dataset to iterate batches from.
+        batch_size (int): size of each batch.
+        model (nn.Module): model to train.
+        num_classes (int): number of classes.
+        loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
+        optimizer (torch.optimizer.Optimizer): optimizer to use for updating the model's weights.
+
+    Returns:
+        float: cumulative loss for the dataset.
+    """
+    model.train()
+    loss = 0.0
+    ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
+    for i, batch in enumerate(ds_generator):
+        optimizer.zero_grad()  # reset gradients
+        z = model(batch)  # forward pass
+        targets = F.one_hot(batch["targets"], num_classes=num_classes).float()  # one-hot (for loss_fn)
+        J = loss_fn(z, targets)  # define loss
+        J.backward()  # backward pass
+        optimizer.step()  # update weights
+        loss += (J.detach().item() - loss) / (i + 1)  # cumulative loss
+    return loss
+
+
+def eval_step(
+    ds: Dataset, batch_size: int, model: nn.Module, num_classes: int, loss_fn: torch.nn.modules.loss._WeightedLoss
+) -> Tuple[float, np.array, np.array]:  # pragma: no cover, tested via train workload
+    """Eval step.
+
+    Args:
+        ds (Dataset): dataset to iterate batches from.
+        batch_size (int): size of each batch.
+        model (nn.Module): model to train.
+        num_classes (int): number of classes.
+        loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
+
+    Returns:
+        Tuple[float, np.array, np.array]: cumulative loss, ground truths and predictions.
+    """
+    model.eval()
+    loss = 0.0
+    y_trues, y_preds = [], []
+    ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
+    with torch.inference_mode():
+        for i, batch in enumerate(ds_generator):
+            z = model(batch)
+            targets = F.one_hot(batch["targets"], num_classes=num_classes).float()  # one-hot (for loss_fn)
+            J = loss_fn(z, targets).item()
+            loss += (J - loss) / (i + 1)
+            y_trues.extend(batch["targets"].cpu().numpy())
+            y_preds.extend(torch.argmax(z, dim=1).cpu().numpy())
+    return loss, np.vstack(y_trues), np.vstack(y_preds)
+
+
+def train_loop_per_worker(config: dict) -> None:  # pragma: no cover, tested via train workload
+    """Training loop that each worker will execute.
+
+    Args:
+        config (dict): arguments to use for training.
+    """
+    # Hyperparameters
+    dropout_p = config["dropout_p"]
+    lr = config["lr"]
+    lr_factor = config["lr_factor"]
+    lr_patience = config["lr_patience"]
+    batch_size = config["batch_size"]
+    num_epochs = config["num_epochs"]
+    num_classes = config["num_classes"]
+
+    # Get datasets
+    utils.set_seeds()
+    train_ds = session.get_dataset_shard("train")
+    val_ds = session.get_dataset_shard("val")
+
+    # Model
+    llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
+    model = models.FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
+    model = train.torch.prepare_model(model)
+
+    # Training components
+    loss_fn = nn.BCEWithLogitsLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)
+
+    # Training
+    batch_size_per_worker = batch_size // session.get_world_size()
+    for epoch in range(num_epochs):
+        # Step
+        train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
+        val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, num_classes, loss_fn)
+        scheduler.step(val_loss)
+
+        # Checkpoint
+        metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
+        checkpoint = TorchCheckpoint.from_model(model=model)
+        session.report(metrics, checkpoint=checkpoint)
+
+
+@app.command()
+def train_model(
+    experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
+    dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
+    train_loop_config: Annotated[str, typer.Option(help="arguments to use for training.")] = None,
+    num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
+    cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
+    gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
+    num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
+    num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
+    batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
+    results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
+) -> ray.air.result.Result:
+    """Main train function to train our model as a distributed workload.
+
+    Args:
+        experiment_name (str): name of the experiment for this training workload.
+        dataset_loc (str): location of the dataset.
+        train_loop_config (str): arguments to use for training.
+        num_workers (int, optional): number of workers to use for training. Defaults to 1.
+        cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
+        gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
+        num_samples (int, optional): number of samples to use from dataset.
+            If this is passed in, it will override the config. Defaults to None.
+        num_epochs (int, optional): number of epochs to train for.
+            If this is passed in, it will override the config. Defaults to None.
+        batch_size (int, optional): number of samples per batch.
+            If this is passed in, it will override the config. Defaults to None.
+        results_fp (str, optional): filepath to save results to. Defaults to None.
+
+    Returns:
+        ray.air.result.Result: training results.
+    """
+    # Set up
+    train_loop_config = json.loads(train_loop_config)
+    train_loop_config["num_samples"] = num_samples
+    train_loop_config["num_epochs"] = num_epochs
+    train_loop_config["batch_size"] = batch_size
+
+    # Scaling config
+    scaling_config = ScalingConfig(
+        num_workers=num_workers,
+        use_gpu=bool(gpu_per_worker),
+        resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
+        _max_cpu_fraction_per_node=0.8,
+    )
+
+    # Checkpoint config
+    checkpoint_config = CheckpointConfig(
+        num_to_keep=1,
+        checkpoint_score_attribute="val_loss",
+        checkpoint_score_order="min",
+    )
+
+    # MLflow callback
+    mlflow_callback = MLflowLoggerCallback(
+        tracking_uri=MLFLOW_TRACKING_URI,
+        experiment_name=experiment_name,
+        save_artifact=True,
+    )
+
+    # Run config
+    run_config = RunConfig(
+        callbacks=[mlflow_callback],
+        checkpoint_config=checkpoint_config,
+    )
+
+    # Dataset
+    ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"])
+    train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
+    tags = train_ds.unique(column="tag")
+    train_loop_config["num_classes"] = len(tags)
+
+    # Dataset config
+    dataset_config = {
+        "train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
+        "val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
+    }
+
+    # Preprocess
+    preprocessor = data.CustomPreprocessor()
+    train_ds = preprocessor.fit_transform(train_ds)
+    val_ds = preprocessor.transform(val_ds)
+    train_ds = train_ds.materialize()
+    val_ds = val_ds.materialize()
+
+    # Trainer
+    trainer = TorchTrainer(
+        train_loop_per_worker=train_loop_per_worker,
+        train_loop_config=train_loop_config,
+        scaling_config=scaling_config,
+        run_config=run_config,
+        datasets={"train": train_ds, "val": val_ds},
+        dataset_config=dataset_config,
+        preprocessor=preprocessor,
+    )
+
+    # Train
+    results = trainer.fit()
+    d = {
+        "timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
+        "run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=results.metrics["trial_id"]),
+        "params": results.config["train_loop_config"],
+        "metrics": utils.dict_to_list(results.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
+    }
+    logger.info(json.dumps(d, indent=2))
+    if results_fp:  # pragma: no cover, saving results
+        utils.save_dict(d, results_fp)
+    return results
+
+
+if __name__ == "__main__":  # pragma: no cover, application
+    if ray.is_initialized():
+        ray.shutdown()
+    ray.init()
+    app()
--- a/madewithml/tune.py
+++ b/madewithml/tune.py
@@ -0,0 +1,182 @@
+import datetime
+import json
+
+import ray
+import typer
+from ray import tune
+from ray.air.config import (
+    CheckpointConfig,
+    DatasetConfig,
+    RunConfig,
+    ScalingConfig,
+)
+from ray.air.integrations.mlflow import MLflowLoggerCallback
+from ray.train.torch import TorchTrainer
+from ray.tune import Tuner
+from ray.tune.schedulers import AsyncHyperBandScheduler
+from ray.tune.search import ConcurrencyLimiter
+from ray.tune.search.hyperopt import HyperOptSearch
+from typing_extensions import Annotated
+
+from madewithml import data, train, utils
+from madewithml.config import MLFLOW_TRACKING_URI, logger
+
+# Initialize Typer CLI app
+app = typer.Typer()
+
+
+@app.command()
+def tune_models(
+    experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
+    dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
+    initial_params: Annotated[str, typer.Option(help="initial config for the tuning workload.")] = None,
+    num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
+    cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
+    gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
+    num_runs: Annotated[int, typer.Option(help="number of runs in this tuning experiment.")] = 1,
+    num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
+    num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
+    batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
+    results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
+) -> ray.tune.result_grid.ResultGrid:
+    """Hyperparameter tuning experiment.
+
+    Args:
+        experiment_name (str): name of the experiment for this training workload.
+        dataset_loc (str): location of the dataset.
+        initial_params (str): initial config for the tuning workload.
+        num_workers (int, optional): number of workers to use for training. Defaults to 1.
+        cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
+        gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
+        num_runs (int, optional): number of runs in this tuning experiment. Defaults to 1.
+        num_samples (int, optional): number of samples to use from dataset.
+            If this is passed in, it will override the config. Defaults to None.
+        num_epochs (int, optional): number of epochs to train for.
+            If this is passed in, it will override the config. Defaults to None.
+        batch_size (int, optional): number of samples per batch.
+            If this is passed in, it will override the config. Defaults to None.
+        results_fp (str, optional): filepath to save the tuning results. Defaults to None.
+
+    Returns:
+        ray.tune.result_grid.ResultGrid: results of the tuning experiment.
+    """
+    # Set up
+    utils.set_seeds()
+    train_loop_config = {}
+    train_loop_config["num_samples"] = num_samples
+    train_loop_config["num_epochs"] = num_epochs
+    train_loop_config["batch_size"] = batch_size
+
+    # Scaling config
+    scaling_config = ScalingConfig(
+        num_workers=num_workers,
+        use_gpu=bool(gpu_per_worker),
+        resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
+        _max_cpu_fraction_per_node=0.8,
+    )
+
+    # Dataset
+    ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config.get("num_samples", None))
+    train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
+    tags = train_ds.unique(column="tag")
+    train_loop_config["num_classes"] = len(tags)
+
+    # Dataset config
+    dataset_config = {
+        "train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
+        "val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
+    }
+
+    # Preprocess
+    preprocessor = data.CustomPreprocessor()
+    train_ds = preprocessor.fit_transform(train_ds)
+    val_ds = preprocessor.transform(val_ds)
+    train_ds = train_ds.materialize()
+    val_ds = val_ds.materialize()
+
+    # Trainer
+    trainer = TorchTrainer(
+        train_loop_per_worker=train.train_loop_per_worker,
+        train_loop_config=train_loop_config,
+        scaling_config=scaling_config,
+        datasets={"train": train_ds, "val": val_ds},
+        dataset_config=dataset_config,
+        preprocessor=preprocessor,
+    )
+
+    # Checkpoint configuration
+    checkpoint_config = CheckpointConfig(
+        num_to_keep=1,
+        checkpoint_score_attribute="val_loss",
+        checkpoint_score_order="min",
+    )
+
+    # Run configuration
+    mlflow_callback = MLflowLoggerCallback(
+        tracking_uri=MLFLOW_TRACKING_URI,
+        experiment_name=experiment_name,
+        save_artifact=True,
+    )
+    run_config = RunConfig(
+        callbacks=[mlflow_callback],
+        checkpoint_config=checkpoint_config,
+    )
+
+    # Hyperparameters to start with
+    initial_params = json.loads(initial_params)
+    search_alg = HyperOptSearch(points_to_evaluate=initial_params)
+    search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)  # trade off b/w optimization and search space
+
+    # Parameter space
+    param_space = {
+        "train_loop_config": {
+            "dropout_p": tune.uniform(0.3, 0.9),
+            "lr": tune.loguniform(1e-5, 5e-4),
+            "lr_factor": tune.uniform(0.1, 0.9),
+            "lr_patience": tune.uniform(1, 10),
+        }
+    }
+
+    # Scheduler
+    scheduler = AsyncHyperBandScheduler(
+        max_t=train_loop_config["num_epochs"],  # max epoch (<time_attr>) per trial
+        grace_period=1,  # min epoch (<time_attr>) per trial
+    )
+
+    # Tune config
+    tune_config = tune.TuneConfig(
+        metric="val_loss",
+        mode="min",
+        search_alg=search_alg,
+        scheduler=scheduler,
+        num_samples=num_runs,
+    )
+
+    # Tuner
+    tuner = Tuner(
+        trainable=trainer,
+        run_config=run_config,
+        param_space=param_space,
+        tune_config=tune_config,
+    )
+
+    # Tune
+    results = tuner.fit()
+    best_trial = results.get_best_result(metric="val_loss", mode="min")
+    d = {
+        "timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
+        "run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=best_trial.metrics["trial_id"]),
+        "params": best_trial.config["train_loop_config"],
+        "metrics": utils.dict_to_list(best_trial.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
+    }
+    logger.info(json.dumps(d, indent=2))
+    if results_fp:  # pragma: no cover, saving results
+        utils.save_dict(d, results_fp)
+    return results
+
+
+if __name__ == "__main__":  # pragma: no cover, application
+    if ray.is_initialized():
+        ray.shutdown()
+    ray.init()
+    app()
--- a/madewithml/utils.py
+++ b/madewithml/utils.py
@@ -0,0 +1,123 @@
+import json
+import os
+import random
+from typing import Any, Dict, List
+
+import numpy as np
+import torch
+from ray.data import DatasetContext
+from ray.train.torch import get_device
+
+from madewithml.config import mlflow
+
+DatasetContext.get_current().execution_options.preserve_order = True
+
+
+def set_seeds(seed: int = 42):
+    """Set seeds for reproducibility."""
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    eval("setattr(torch.backends.cudnn, 'deterministic', True)")
+    eval("setattr(torch.backends.cudnn, 'benchmark', False)")
+    os.environ["PYTHONHASHSEED"] = str(seed)
+
+
+def load_dict(path: str) -> Dict:
+    """Load a dictionary from a JSON's filepath.
+
+    Args:
+        path (str): location of file.
+
+    Returns:
+        Dict: loaded JSON data.
+    """
+    with open(path) as fp:
+        d = json.load(fp)
+    return d
+
+
+def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
+    """Save a dictionary to a specific location.
+
+    Args:
+        d (Dict): data to save.
+        path (str): location of where to save the data.
+        cls (optional): encoder to use on dict data. Defaults to None.
+        sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
+    """
+    directory = os.path.dirname(path)
+    if directory and not os.path.exists(directory):  # pragma: no cover
+        os.makedirs(directory)
+    with open(path, "w") as fp:
+        json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
+        fp.write("\n")
+
+
+def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
+    """Pad an 2D array with zeros until all rows in the
+    2D array are of the same length as a the longest
+    row in the 2D array.
+
+    Args:
+        arr (np.array): input array
+
+    Returns:
+        np.array: zero padded array
+    """
+    max_len = max(len(row) for row in arr)
+    padded_arr = np.zeros((arr.shape[0], max_len), dtype=dtype)
+    for i, row in enumerate(arr):
+        padded_arr[i][: len(row)] = row
+    return padded_arr
+
+
+def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]:  # pragma: no cover, air internal
+    """Convert a batch of numpy arrays to tensors (with appropriate padding).
+
+    Args:
+        batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.
+
+    Returns:
+        Dict[str, torch.Tensor]: output batch as a dictionary of tensors.
+    """
+    batch["ids"] = pad_array(batch["ids"])
+    batch["masks"] = pad_array(batch["masks"])
+    dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
+    tensor_batch = {}
+    for key, array in batch.items():
+        tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())
+    return tensor_batch
+
+
+def get_run_id(experiment_name: str, trial_id: str) -> str:  # pragma: no cover, mlflow functionality
+    """Get the MLflow run ID for a specific Ray trial ID.
+
+    Args:
+        experiment_name (str): name of the experiment.
+        trial_id (str): id of the trial.
+
+    Returns:
+        str: run id of the trial.
+    """
+    trial_name = f"TorchTrainer_{trial_id}"
+    run = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f"tags.trial_name = '{trial_name}'").iloc[0]
+    return run.run_id
+
+
+def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
+    """Convert a dictionary to a list of dictionaries.
+
+    Args:
+        data (Dict): input dictionary.
+        keys (List[str]): keys to include in the output list of dictionaries.
+
+    Returns:
+        List[Dict[str, Any]]: output list of dictionaries.
+    """
+    list_of_dicts = []
+    for i in range(len(data[keys[0]])):
+        new_dict = {key: data[key][i] for key in keys}
+        list_of_dicts.append(new_dict)
+    return list_of_dicts