ML for Developers
This commit is contained in:
244
madewithml/config.py
Normal file
244
madewithml/config.py
Normal file
@@ -0,0 +1,244 @@
|
||||
# config.py
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import mlflow
|
||||
import pretty_errors # NOQA: F401 (imported but unused)
|
||||
|
||||
# Directories
|
||||
ROOT_DIR = Path(__file__).parent.parent.absolute()
|
||||
LOGS_DIR = Path(ROOT_DIR, "logs")
|
||||
LOGS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Config MLflow
|
||||
MODEL_REGISTRY = Path("/tmp/mlflow")
|
||||
Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True)
|
||||
MLFLOW_TRACKING_URI = "file://" + str(MODEL_REGISTRY.absolute())
|
||||
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
|
||||
|
||||
# Logger
|
||||
logging_config = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
"formatters": {
|
||||
"minimal": {"format": "%(message)s"},
|
||||
"detailed": {"format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"},
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
"class": "logging.StreamHandler",
|
||||
"stream": sys.stdout,
|
||||
"formatter": "minimal",
|
||||
"level": logging.DEBUG,
|
||||
},
|
||||
"info": {
|
||||
"class": "logging.handlers.RotatingFileHandler",
|
||||
"filename": Path(LOGS_DIR, "info.log"),
|
||||
"maxBytes": 10485760, # 1 MB
|
||||
"backupCount": 10,
|
||||
"formatter": "detailed",
|
||||
"level": logging.INFO,
|
||||
},
|
||||
"error": {
|
||||
"class": "logging.handlers.RotatingFileHandler",
|
||||
"filename": Path(LOGS_DIR, "error.log"),
|
||||
"maxBytes": 10485760, # 1 MB
|
||||
"backupCount": 10,
|
||||
"formatter": "detailed",
|
||||
"level": logging.ERROR,
|
||||
},
|
||||
},
|
||||
"root": {
|
||||
"handlers": ["console", "info", "error"],
|
||||
"level": logging.INFO,
|
||||
"propagate": True,
|
||||
},
|
||||
}
|
||||
|
||||
# Logger
|
||||
logging.config.dictConfig(logging_config)
|
||||
logger = logging.getLogger()
|
||||
|
||||
# Constraints
|
||||
STOPWORDS = [
|
||||
"i",
|
||||
"me",
|
||||
"my",
|
||||
"myself",
|
||||
"we",
|
||||
"our",
|
||||
"ours",
|
||||
"ourselves",
|
||||
"you",
|
||||
"you're",
|
||||
"you've",
|
||||
"you'll",
|
||||
"you'd",
|
||||
"your",
|
||||
"yours",
|
||||
"yourself",
|
||||
"yourselves",
|
||||
"he",
|
||||
"him",
|
||||
"his",
|
||||
"himself",
|
||||
"she",
|
||||
"she's",
|
||||
"her",
|
||||
"hers",
|
||||
"herself",
|
||||
"it",
|
||||
"it's",
|
||||
"its",
|
||||
"itself",
|
||||
"they",
|
||||
"them",
|
||||
"their",
|
||||
"theirs",
|
||||
"themselves",
|
||||
"what",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"this",
|
||||
"that",
|
||||
"that'll",
|
||||
"these",
|
||||
"those",
|
||||
"am",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"having",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"doing",
|
||||
"a",
|
||||
"an",
|
||||
"the",
|
||||
"and",
|
||||
"but",
|
||||
"if",
|
||||
"or",
|
||||
"because",
|
||||
"as",
|
||||
"until",
|
||||
"while",
|
||||
"of",
|
||||
"at",
|
||||
"by",
|
||||
"for",
|
||||
"with",
|
||||
"about",
|
||||
"against",
|
||||
"between",
|
||||
"into",
|
||||
"through",
|
||||
"during",
|
||||
"before",
|
||||
"after",
|
||||
"above",
|
||||
"below",
|
||||
"to",
|
||||
"from",
|
||||
"up",
|
||||
"down",
|
||||
"in",
|
||||
"out",
|
||||
"on",
|
||||
"off",
|
||||
"over",
|
||||
"under",
|
||||
"again",
|
||||
"further",
|
||||
"then",
|
||||
"once",
|
||||
"here",
|
||||
"there",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"all",
|
||||
"any",
|
||||
"both",
|
||||
"each",
|
||||
"few",
|
||||
"more",
|
||||
"most",
|
||||
"other",
|
||||
"some",
|
||||
"such",
|
||||
"no",
|
||||
"nor",
|
||||
"not",
|
||||
"only",
|
||||
"own",
|
||||
"same",
|
||||
"so",
|
||||
"than",
|
||||
"too",
|
||||
"very",
|
||||
"s",
|
||||
"t",
|
||||
"can",
|
||||
"will",
|
||||
"just",
|
||||
"don",
|
||||
"don't",
|
||||
"should",
|
||||
"should've",
|
||||
"now",
|
||||
"d",
|
||||
"ll",
|
||||
"m",
|
||||
"o",
|
||||
"re",
|
||||
"ve",
|
||||
"y",
|
||||
"ain",
|
||||
"aren",
|
||||
"aren't",
|
||||
"couldn",
|
||||
"couldn't",
|
||||
"didn",
|
||||
"didn't",
|
||||
"doesn",
|
||||
"doesn't",
|
||||
"hadn",
|
||||
"hadn't",
|
||||
"hasn",
|
||||
"hasn't",
|
||||
"haven",
|
||||
"haven't",
|
||||
"isn",
|
||||
"isn't",
|
||||
"ma",
|
||||
"mightn",
|
||||
"mightn't",
|
||||
"mustn",
|
||||
"mustn't",
|
||||
"needn",
|
||||
"needn't",
|
||||
"shan",
|
||||
"shan't",
|
||||
"shouldn",
|
||||
"shouldn't",
|
||||
"wasn",
|
||||
"wasn't",
|
||||
"weren",
|
||||
"weren't",
|
||||
"won",
|
||||
"won't",
|
||||
"wouldn",
|
||||
"wouldn't",
|
||||
]
|
||||
147
madewithml/data.py
Normal file
147
madewithml/data.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import ray
|
||||
from ray.data import Dataset
|
||||
from ray.data.preprocessor import Preprocessor
|
||||
from sklearn.model_selection import train_test_split
|
||||
from transformers import BertTokenizer
|
||||
|
||||
from madewithml.config import STOPWORDS
|
||||
|
||||
|
||||
def load_data(dataset_loc: str, num_samples: int = None) -> Dataset:
|
||||
"""Load data from source into a Ray Dataset.
|
||||
|
||||
Args:
|
||||
dataset_loc (str): Location of the dataset.
|
||||
num_samples (int, optional): The number of samples to load. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Dataset: Our dataset represented by a Ray Dataset.
|
||||
"""
|
||||
ds = ray.data.read_csv(dataset_loc)
|
||||
ds = ds.random_shuffle(seed=1234)
|
||||
ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
|
||||
return ds
|
||||
|
||||
|
||||
def stratify_split(
|
||||
ds: Dataset,
|
||||
stratify: str,
|
||||
test_size: float,
|
||||
shuffle: bool = True,
|
||||
seed: int = 1234,
|
||||
) -> Tuple[Dataset, Dataset]:
|
||||
"""Split a dataset into train and test splits with equal
|
||||
amounts of data points from each class in the column we
|
||||
want to stratify on.
|
||||
|
||||
Args:
|
||||
ds (Dataset): Input dataset to split.
|
||||
stratify (str): Name of column to split on.
|
||||
test_size (float): Proportion of dataset to split for test set.
|
||||
shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
|
||||
seed (int, optional): seed for shuffling. Defaults to 1234.
|
||||
|
||||
Returns:
|
||||
Tuple[Dataset, Dataset]: the stratified train and test datasets.
|
||||
"""
|
||||
|
||||
def _add_split(df: pd.DataFrame) -> pd.DataFrame: # pragma: no cover, used in parent function
|
||||
"""Naively split a dataframe into train and test splits.
|
||||
Add a column specifying whether it's the train or test split."""
|
||||
train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
|
||||
train["_split"] = "train"
|
||||
test["_split"] = "test"
|
||||
return pd.concat([train, test])
|
||||
|
||||
def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame: # pragma: no cover, used in parent function
|
||||
"""Filter by data points that match the split column's value
|
||||
and return the dataframe with the _split column dropped."""
|
||||
return df[df["_split"] == split].drop("_split", axis=1)
|
||||
|
||||
# Train, test split with stratify
|
||||
grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas") # group by each unique value in the column we want to stratify on
|
||||
train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas") # combine
|
||||
test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas") # combine
|
||||
|
||||
# Shuffle each split (required)
|
||||
train_ds = train_ds.random_shuffle(seed=seed)
|
||||
test_ds = test_ds.random_shuffle(seed=seed)
|
||||
|
||||
return train_ds, test_ds
|
||||
|
||||
|
||||
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
|
||||
"""Clean raw text string.
|
||||
|
||||
Args:
|
||||
text (str): Raw text to clean.
|
||||
stopwords (List, optional): list of words to filter out. Defaults to STOPWORDS.
|
||||
|
||||
Returns:
|
||||
str: cleaned text.
|
||||
"""
|
||||
# Lower
|
||||
text = text.lower()
|
||||
|
||||
# Remove stopwords
|
||||
pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
|
||||
text = pattern.sub(" ", text)
|
||||
|
||||
# Spacing and filters
|
||||
text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text) # add spacing
|
||||
text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
|
||||
text = re.sub(" +", " ", text) # remove multiple spaces
|
||||
text = text.strip() # strip white space at the ends
|
||||
text = re.sub(r"http\S+", "", text) # remove links
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def tokenize(batch: Dict) -> Dict:
|
||||
"""Tokenize the text input in our batch using a tokenizer.
|
||||
|
||||
Args:
|
||||
batch (Dict): batch of data with the text inputs to tokenize.
|
||||
|
||||
Returns:
|
||||
Dict: batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
|
||||
"""
|
||||
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
|
||||
encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
|
||||
return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))
|
||||
|
||||
|
||||
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
|
||||
"""Preprocess the data in our dataframe.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): Raw dataframe to preprocess.
|
||||
class_to_index (Dict): Mapping of class names to indices.
|
||||
|
||||
Returns:
|
||||
Dict: preprocessed data (ids, masks, targets).
|
||||
"""
|
||||
df["text"] = df.title + " " + df.description # feature engineering
|
||||
df["text"] = df.text.apply(clean_text) # clean text
|
||||
df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore") # clean dataframe
|
||||
df = df[["text", "tag"]] # rearrange columns
|
||||
df["tag"] = df["tag"].map(class_to_index) # label encoding
|
||||
outputs = tokenize(df)
|
||||
return outputs
|
||||
|
||||
|
||||
class CustomPreprocessor(Preprocessor):
|
||||
"""Custom preprocessor class."""
|
||||
|
||||
def _fit(self, ds):
|
||||
tags = ds.unique(column="tag")
|
||||
self.class_to_index = {tag: i for i, tag in enumerate(tags)}
|
||||
self.index_to_class = {v: k for k, v in self.class_to_index.items()}
|
||||
|
||||
def _transform_pandas(self, batch): # could also do _transform_numpy
|
||||
return preprocess(batch, class_to_index=self.class_to_index)
|
||||
154
madewithml/evaluate.py
Normal file
154
madewithml/evaluate.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import datetime
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import ray
|
||||
import ray.train.torch # NOQA: F401 (imported but unused)
|
||||
import typer
|
||||
from ray.data import Dataset
|
||||
from ray.train.torch.torch_predictor import TorchPredictor
|
||||
from sklearn.metrics import precision_recall_fscore_support
|
||||
from snorkel.slicing import PandasSFApplier, slicing_function
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from madewithml import predict, utils
|
||||
from madewithml.config import logger
|
||||
|
||||
# Initialize Typer CLI app
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def get_overall_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict: # pragma: no cover, eval workload
|
||||
"""Get overall performance metrics.
|
||||
|
||||
Args:
|
||||
y_true (np.ndarray): ground truth labels.
|
||||
y_pred (np.ndarray): predicted labels.
|
||||
|
||||
Returns:
|
||||
Dict: overall metrics.
|
||||
"""
|
||||
metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
|
||||
overall_metrics = {
|
||||
"precision": metrics[0],
|
||||
"recall": metrics[1],
|
||||
"f1": metrics[2],
|
||||
"num_samples": np.float64(len(y_true)),
|
||||
}
|
||||
return overall_metrics
|
||||
|
||||
|
||||
def get_per_class_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_to_index: Dict) -> Dict: # pragma: no cover, eval workload
|
||||
"""Get per class performance metrics.
|
||||
|
||||
Args:
|
||||
y_true (np.ndarray): ground truth labels.
|
||||
y_pred (np.ndarray): predicted labels.
|
||||
class_to_index (Dict): dictionary mapping class to index.
|
||||
|
||||
Returns:
|
||||
Dict: per class metrics.
|
||||
"""
|
||||
per_class_metrics = {}
|
||||
metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
|
||||
for i, _class in enumerate(class_to_index):
|
||||
per_class_metrics[_class] = {
|
||||
"precision": metrics[0][i],
|
||||
"recall": metrics[1][i],
|
||||
"f1": metrics[2][i],
|
||||
"num_samples": np.float64(metrics[3][i]),
|
||||
}
|
||||
sorted_per_class_metrics = OrderedDict(sorted(per_class_metrics.items(), key=lambda tag: tag[1]["f1"], reverse=True))
|
||||
return sorted_per_class_metrics
|
||||
|
||||
|
||||
@slicing_function()
|
||||
def nlp_llm(x): # pragma: no cover, eval workload
|
||||
"""NLP projects that use LLMs."""
|
||||
nlp_project = "natural-language-processing" in x.tag
|
||||
llm_terms = ["transformer", "llm", "bert"]
|
||||
llm_project = any(s.lower() in x.text.lower() for s in llm_terms)
|
||||
return nlp_project and llm_project
|
||||
|
||||
|
||||
@slicing_function()
|
||||
def short_text(x): # pragma: no cover, eval workload
|
||||
"""Projects with short titles and descriptions."""
|
||||
return len(x.text.split()) < 8 # less than 8 words
|
||||
|
||||
|
||||
def get_slice_metrics(y_true: np.ndarray, y_pred: np.ndarray, ds: Dataset) -> Dict: # pragma: no cover, eval workload
|
||||
"""Get performance metrics for slices.
|
||||
|
||||
Args:
|
||||
y_true (np.ndarray): ground truth labels.
|
||||
y_pred (np.ndarray): predicted labels.
|
||||
ds (Dataset): Ray dataset with labels.
|
||||
Returns:
|
||||
Dict: performance metrics for slices.
|
||||
"""
|
||||
slice_metrics = {}
|
||||
df = ds.to_pandas()
|
||||
df["text"] = df["title"] + " " + df["description"]
|
||||
slices = PandasSFApplier([nlp_llm, short_text]).apply(df)
|
||||
for slice_name in slices.dtype.names:
|
||||
mask = slices[slice_name].astype(bool)
|
||||
if sum(mask):
|
||||
metrics = precision_recall_fscore_support(y_true[mask], y_pred[mask], average="micro")
|
||||
slice_metrics[slice_name] = {}
|
||||
slice_metrics[slice_name]["precision"] = metrics[0]
|
||||
slice_metrics[slice_name]["recall"] = metrics[1]
|
||||
slice_metrics[slice_name]["f1"] = metrics[2]
|
||||
slice_metrics[slice_name]["num_samples"] = len(y_true[mask])
|
||||
return slice_metrics
|
||||
|
||||
|
||||
@app.command()
|
||||
def evaluate(
|
||||
run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
|
||||
dataset_loc: Annotated[str, typer.Option(help="dataset (with labels) to evaluate on")] = None,
|
||||
results_fp: Annotated[str, typer.Option(help="location to save evaluation results to")] = None,
|
||||
) -> Dict: # pragma: no cover, eval workload
|
||||
"""Evaluate on the holdout dataset.
|
||||
|
||||
Args:
|
||||
run_id (str): id of the specific run to load from. Defaults to None.
|
||||
dataset_loc (str): dataset (with labels) to evaluate on.
|
||||
results_fp (str, optional): location to save evaluation results to. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Dict: model's performance metrics on the dataset.
|
||||
"""
|
||||
# Load
|
||||
ds = ray.data.read_csv(dataset_loc)
|
||||
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
|
||||
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
|
||||
|
||||
# y_true
|
||||
preprocessor = predictor.get_preprocessor()
|
||||
preprocessed_ds = preprocessor.transform(ds)
|
||||
values = preprocessed_ds.select_columns(cols=["targets"]).take_all()
|
||||
y_true = np.stack([item["targets"] for item in values])
|
||||
|
||||
# y_pred
|
||||
z = predictor.predict(data=ds.to_pandas())["predictions"]
|
||||
y_pred = np.stack(z).argmax(1)
|
||||
|
||||
# Metrics
|
||||
metrics = {
|
||||
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
|
||||
"run_id": run_id,
|
||||
"overall": get_overall_metrics(y_true=y_true, y_pred=y_pred),
|
||||
"per_class": get_per_class_metrics(y_true=y_true, y_pred=y_pred, class_to_index=preprocessor.class_to_index),
|
||||
"slices": get_slice_metrics(y_true=y_true, y_pred=y_pred, ds=ds),
|
||||
}
|
||||
logger.info(json.dumps(metrics, indent=2))
|
||||
if results_fp: # pragma: no cover, saving results
|
||||
utils.save_dict(d=metrics, path=results_fp)
|
||||
return metrics
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover, checked during evaluation workload
|
||||
app()
|
||||
19
madewithml/models.py
Normal file
19
madewithml/models.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class FinetunedLLM(nn.Module): # pragma: no cover, torch model
|
||||
"""Model architecture for a Large Language Model (LLM) that we will fine-tune."""
|
||||
|
||||
def __init__(self, llm, dropout_p, embedding_dim, num_classes):
|
||||
super(FinetunedLLM, self).__init__()
|
||||
self.llm = llm
|
||||
self.dropout = torch.nn.Dropout(dropout_p)
|
||||
self.fc1 = torch.nn.Linear(embedding_dim, num_classes)
|
||||
|
||||
def forward(self, batch):
|
||||
ids, masks = batch["ids"], batch["masks"]
|
||||
seq, pool = self.llm(input_ids=ids, attention_mask=masks)
|
||||
z = self.dropout(pool)
|
||||
z = self.fc1(z)
|
||||
return z
|
||||
139
madewithml/predict.py
Normal file
139
madewithml/predict.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import json
|
||||
from typing import Any, Dict, Iterable, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import pandas as pd
|
||||
import ray
|
||||
import torch
|
||||
import typer
|
||||
from numpyencoder import NumpyEncoder
|
||||
from ray.air import Result
|
||||
from ray.train.torch import TorchPredictor
|
||||
from ray.train.torch.torch_checkpoint import TorchCheckpoint
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from madewithml.config import logger, mlflow
|
||||
|
||||
# Initialize Typer CLI app
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def decode(indices: Iterable[Any], index_to_class: Dict) -> List:
|
||||
"""Decode indices to labels.
|
||||
|
||||
Args:
|
||||
indices (Iterable[Any]): Iterable (list, array, etc.) with indices.
|
||||
index_to_class (Dict): mapping between indices and labels.
|
||||
|
||||
Returns:
|
||||
List: list of labels.
|
||||
"""
|
||||
return [index_to_class[index] for index in indices]
|
||||
|
||||
|
||||
def format_prob(prob: Iterable, index_to_class: Dict) -> Dict:
|
||||
"""Format probabilities to a dictionary mapping class label to probability.
|
||||
|
||||
Args:
|
||||
prob (Iterable): probabilities.
|
||||
index_to_class (Dict): mapping between indices and labels.
|
||||
|
||||
Returns:
|
||||
Dict: Dictionary mapping class label to probability.
|
||||
"""
|
||||
d = {}
|
||||
for i, item in enumerate(prob):
|
||||
d[index_to_class[i]] = item
|
||||
return d
|
||||
|
||||
|
||||
def predict_with_proba(
|
||||
df: pd.DataFrame,
|
||||
predictor: ray.train.torch.torch_predictor.TorchPredictor,
|
||||
) -> List: # pragma: no cover, tested with inference workload
|
||||
"""Predict tags (with probabilities) for input data from a dataframe.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): dataframe with input features.
|
||||
predictor (ray.train.torch.torch_predictor.TorchPredictor): loaded predictor from a checkpoint.
|
||||
|
||||
Returns:
|
||||
List: list of predicted labels.
|
||||
"""
|
||||
preprocessor = predictor.get_preprocessor()
|
||||
z = predictor.predict(data=df)["predictions"]
|
||||
import numpy as np
|
||||
|
||||
y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
|
||||
results = []
|
||||
for i, prob in enumerate(y_prob):
|
||||
tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
|
||||
results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
|
||||
return results
|
||||
|
||||
|
||||
@app.command()
|
||||
def get_best_run_id(experiment_name: str = "", metric: str = "", mode: str = "") -> str: # pragma: no cover, mlflow logic
|
||||
"""Get the best run_id from an MLflow experiment.
|
||||
|
||||
Args:
|
||||
experiment_name (str): name of the experiment.
|
||||
metric (str): metric to filter by.
|
||||
mode (str): direction of metric (ASC/DESC).
|
||||
|
||||
Returns:
|
||||
str: best run id from experiment.
|
||||
"""
|
||||
sorted_runs = mlflow.search_runs(
|
||||
experiment_names=[experiment_name],
|
||||
order_by=[f"metrics.{metric} {mode}"],
|
||||
)
|
||||
run_id = sorted_runs.iloc[0].run_id
|
||||
print(run_id)
|
||||
return run_id
|
||||
|
||||
|
||||
def get_best_checkpoint(run_id: str) -> TorchCheckpoint: # pragma: no cover, mlflow logic
|
||||
"""Get the best checkpoint from a specific run.
|
||||
|
||||
Args:
|
||||
run_id (str): ID of the run to get the best checkpoint from.
|
||||
|
||||
Returns:
|
||||
TorchCheckpoint: Best checkpoint from the run.
|
||||
"""
|
||||
artifact_dir = urlparse(mlflow.get_run(run_id).info.artifact_uri).path # get path from mlflow
|
||||
results = Result.from_path(artifact_dir)
|
||||
return results.best_checkpoints[0][0]
|
||||
|
||||
|
||||
@app.command()
|
||||
def predict(
|
||||
run_id: Annotated[str, typer.Option(help="id of the specific run to load from")] = None,
|
||||
title: Annotated[str, typer.Option(help="project title")] = None,
|
||||
description: Annotated[str, typer.Option(help="project description")] = None,
|
||||
) -> Dict: # pragma: no cover, tested with inference workload
|
||||
"""Predict the tag for a project given it's title and description.
|
||||
|
||||
Args:
|
||||
run_id (str): id of the specific run to load from. Defaults to None.
|
||||
title (str, optional): project title. Defaults to "".
|
||||
description (str, optional): project description. Defaults to "".
|
||||
|
||||
Returns:
|
||||
Dict: prediction results for the input data.
|
||||
"""
|
||||
# Load components
|
||||
best_checkpoint = get_best_checkpoint(run_id=run_id)
|
||||
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
|
||||
preprocessor = predictor.get_preprocessor()
|
||||
|
||||
# Predict
|
||||
sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
|
||||
results = predict_with_proba(df=sample_df, predictor=predictor, index_to_class=preprocessor.index_to_class)
|
||||
logger.info(json.dumps(results, cls=NumpyEncoder, indent=2))
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover, application
|
||||
app()
|
||||
79
madewithml/serve.py
Normal file
79
madewithml/serve.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import argparse
|
||||
from http import HTTPStatus
|
||||
from typing import Dict
|
||||
|
||||
import pandas as pd
|
||||
import ray
|
||||
from fastapi import FastAPI
|
||||
from ray import serve
|
||||
from ray.train.torch import TorchPredictor
|
||||
from starlette.requests import Request
|
||||
|
||||
from madewithml import evaluate, predict
|
||||
from madewithml.config import MLFLOW_TRACKING_URI, mlflow
|
||||
|
||||
# Define application
|
||||
app = FastAPI(
|
||||
title="Made With ML",
|
||||
description="Classify machine learning projects.",
|
||||
version="0.1",
|
||||
)
|
||||
|
||||
|
||||
@serve.deployment(route_prefix="/", num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
|
||||
@serve.ingress(app)
|
||||
class ModelDeployment:
|
||||
def __init__(self, run_id: str, threshold: int = 0.9):
|
||||
"""Initialize the model."""
|
||||
self.run_id = run_id
|
||||
self.threshold = threshold
|
||||
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # so workers have access to model registry
|
||||
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
|
||||
self.predictor = TorchPredictor.from_checkpoint(best_checkpoint)
|
||||
self.preprocessor = self.predictor.get_preprocessor()
|
||||
|
||||
@app.get("/")
|
||||
def _index(self) -> Dict:
|
||||
"""Health check."""
|
||||
response = {
|
||||
"message": HTTPStatus.OK.phrase,
|
||||
"status-code": HTTPStatus.OK,
|
||||
"data": {},
|
||||
}
|
||||
return response
|
||||
|
||||
@app.get("/run_id/")
|
||||
def _run_id(self) -> Dict:
|
||||
"""Get the run ID."""
|
||||
return {"run_id": self.run_id}
|
||||
|
||||
@app.post("/evaluate/")
|
||||
async def _evaluate(self, request: Request) -> Dict:
|
||||
data = await request.json()
|
||||
results = evaluate.evaluate(run_id=self.run_id, dataset_loc=data.get("dataset"))
|
||||
return {"results": results}
|
||||
|
||||
@app.post("/predict/")
|
||||
async def _predict(self, request: Request) -> Dict:
|
||||
# Get prediction
|
||||
data = await request.json()
|
||||
df = pd.DataFrame([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
|
||||
results = predict.predict_with_proba(df=df, predictor=self.predictor)
|
||||
|
||||
# Apply custom logic
|
||||
for i, result in enumerate(results):
|
||||
pred = result["prediction"]
|
||||
prob = result["probabilities"]
|
||||
if prob[pred] < self.threshold:
|
||||
results[i]["prediction"] = "other"
|
||||
|
||||
return {"results": results}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run_id", help="run ID to use for serving.")
|
||||
parser.add_argument("--threshold", type=float, default=0.9, help="threshold for `other` class.")
|
||||
args = parser.parse_args()
|
||||
ray.init()
|
||||
serve.run(ModelDeployment.bind(run_id=args.run_id, threshold=args.threshold))
|
||||
256
madewithml/train.py
Normal file
256
madewithml/train.py
Normal file
@@ -0,0 +1,256 @@
|
||||
import datetime
|
||||
import json
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
import ray
|
||||
import ray.train as train
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import typer
|
||||
from ray.air import session
|
||||
from ray.air.config import (
|
||||
CheckpointConfig,
|
||||
DatasetConfig,
|
||||
RunConfig,
|
||||
ScalingConfig,
|
||||
)
|
||||
from ray.air.integrations.mlflow import MLflowLoggerCallback
|
||||
from ray.data import Dataset
|
||||
from ray.train.torch import TorchCheckpoint, TorchTrainer
|
||||
from transformers import BertModel
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from madewithml import data, models, utils
|
||||
from madewithml.config import MLFLOW_TRACKING_URI, logger
|
||||
|
||||
# Initialize Typer CLI app
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def train_step(
|
||||
ds: Dataset,
|
||||
batch_size: int,
|
||||
model: nn.Module,
|
||||
num_classes: int,
|
||||
loss_fn: torch.nn.modules.loss._WeightedLoss,
|
||||
optimizer: torch.optim.Optimizer,
|
||||
) -> float: # pragma: no cover, tested via train workload
|
||||
"""Train step.
|
||||
|
||||
Args:
|
||||
ds (Dataset): dataset to iterate batches from.
|
||||
batch_size (int): size of each batch.
|
||||
model (nn.Module): model to train.
|
||||
num_classes (int): number of classes.
|
||||
loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
|
||||
optimizer (torch.optimizer.Optimizer): optimizer to use for updating the model's weights.
|
||||
|
||||
Returns:
|
||||
float: cumulative loss for the dataset.
|
||||
"""
|
||||
model.train()
|
||||
loss = 0.0
|
||||
ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
|
||||
for i, batch in enumerate(ds_generator):
|
||||
optimizer.zero_grad() # reset gradients
|
||||
z = model(batch) # forward pass
|
||||
targets = F.one_hot(batch["targets"], num_classes=num_classes).float() # one-hot (for loss_fn)
|
||||
J = loss_fn(z, targets) # define loss
|
||||
J.backward() # backward pass
|
||||
optimizer.step() # update weights
|
||||
loss += (J.detach().item() - loss) / (i + 1) # cumulative loss
|
||||
return loss
|
||||
|
||||
|
||||
def eval_step(
|
||||
ds: Dataset, batch_size: int, model: nn.Module, num_classes: int, loss_fn: torch.nn.modules.loss._WeightedLoss
|
||||
) -> Tuple[float, np.array, np.array]: # pragma: no cover, tested via train workload
|
||||
"""Eval step.
|
||||
|
||||
Args:
|
||||
ds (Dataset): dataset to iterate batches from.
|
||||
batch_size (int): size of each batch.
|
||||
model (nn.Module): model to train.
|
||||
num_classes (int): number of classes.
|
||||
loss_fn (torch.nn.loss._WeightedLoss): loss function to use between labels and predictions.
|
||||
|
||||
Returns:
|
||||
Tuple[float, np.array, np.array]: cumulative loss, ground truths and predictions.
|
||||
"""
|
||||
model.eval()
|
||||
loss = 0.0
|
||||
y_trues, y_preds = [], []
|
||||
ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=utils.collate_fn)
|
||||
with torch.inference_mode():
|
||||
for i, batch in enumerate(ds_generator):
|
||||
z = model(batch)
|
||||
targets = F.one_hot(batch["targets"], num_classes=num_classes).float() # one-hot (for loss_fn)
|
||||
J = loss_fn(z, targets).item()
|
||||
loss += (J - loss) / (i + 1)
|
||||
y_trues.extend(batch["targets"].cpu().numpy())
|
||||
y_preds.extend(torch.argmax(z, dim=1).cpu().numpy())
|
||||
return loss, np.vstack(y_trues), np.vstack(y_preds)
|
||||
|
||||
|
||||
def train_loop_per_worker(config: dict) -> None: # pragma: no cover, tested via train workload
|
||||
"""Training loop that each worker will execute.
|
||||
|
||||
Args:
|
||||
config (dict): arguments to use for training.
|
||||
"""
|
||||
# Hyperparameters
|
||||
dropout_p = config["dropout_p"]
|
||||
lr = config["lr"]
|
||||
lr_factor = config["lr_factor"]
|
||||
lr_patience = config["lr_patience"]
|
||||
batch_size = config["batch_size"]
|
||||
num_epochs = config["num_epochs"]
|
||||
num_classes = config["num_classes"]
|
||||
|
||||
# Get datasets
|
||||
utils.set_seeds()
|
||||
train_ds = session.get_dataset_shard("train")
|
||||
val_ds = session.get_dataset_shard("val")
|
||||
|
||||
# Model
|
||||
llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
|
||||
model = models.FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
|
||||
model = train.torch.prepare_model(model)
|
||||
|
||||
# Training components
|
||||
loss_fn = nn.BCEWithLogitsLoss()
|
||||
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
|
||||
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)
|
||||
|
||||
# Training
|
||||
batch_size_per_worker = batch_size // session.get_world_size()
|
||||
for epoch in range(num_epochs):
|
||||
# Step
|
||||
train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
|
||||
val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, num_classes, loss_fn)
|
||||
scheduler.step(val_loss)
|
||||
|
||||
# Checkpoint
|
||||
metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
|
||||
checkpoint = TorchCheckpoint.from_model(model=model)
|
||||
session.report(metrics, checkpoint=checkpoint)
|
||||
|
||||
|
||||
@app.command()
|
||||
def train_model(
|
||||
experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
|
||||
dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
|
||||
train_loop_config: Annotated[str, typer.Option(help="arguments to use for training.")] = None,
|
||||
num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
|
||||
cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
|
||||
gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
|
||||
num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
|
||||
num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
|
||||
batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
|
||||
results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
|
||||
) -> ray.air.result.Result:
|
||||
"""Main train function to train our model as a distributed workload.
|
||||
|
||||
Args:
|
||||
experiment_name (str): name of the experiment for this training workload.
|
||||
dataset_loc (str): location of the dataset.
|
||||
train_loop_config (str): arguments to use for training.
|
||||
num_workers (int, optional): number of workers to use for training. Defaults to 1.
|
||||
cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
|
||||
gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
|
||||
num_samples (int, optional): number of samples to use from dataset.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
num_epochs (int, optional): number of epochs to train for.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
batch_size (int, optional): number of samples per batch.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
results_fp (str, optional): filepath to save results to. Defaults to None.
|
||||
|
||||
Returns:
|
||||
ray.air.result.Result: training results.
|
||||
"""
|
||||
# Set up
|
||||
train_loop_config = json.loads(train_loop_config)
|
||||
train_loop_config["num_samples"] = num_samples
|
||||
train_loop_config["num_epochs"] = num_epochs
|
||||
train_loop_config["batch_size"] = batch_size
|
||||
|
||||
# Scaling config
|
||||
scaling_config = ScalingConfig(
|
||||
num_workers=num_workers,
|
||||
use_gpu=bool(gpu_per_worker),
|
||||
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
|
||||
_max_cpu_fraction_per_node=0.8,
|
||||
)
|
||||
|
||||
# Checkpoint config
|
||||
checkpoint_config = CheckpointConfig(
|
||||
num_to_keep=1,
|
||||
checkpoint_score_attribute="val_loss",
|
||||
checkpoint_score_order="min",
|
||||
)
|
||||
|
||||
# MLflow callback
|
||||
mlflow_callback = MLflowLoggerCallback(
|
||||
tracking_uri=MLFLOW_TRACKING_URI,
|
||||
experiment_name=experiment_name,
|
||||
save_artifact=True,
|
||||
)
|
||||
|
||||
# Run config
|
||||
run_config = RunConfig(
|
||||
callbacks=[mlflow_callback],
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
# Dataset
|
||||
ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"])
|
||||
train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
|
||||
tags = train_ds.unique(column="tag")
|
||||
train_loop_config["num_classes"] = len(tags)
|
||||
|
||||
# Dataset config
|
||||
dataset_config = {
|
||||
"train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
|
||||
"val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
|
||||
}
|
||||
|
||||
# Preprocess
|
||||
preprocessor = data.CustomPreprocessor()
|
||||
train_ds = preprocessor.fit_transform(train_ds)
|
||||
val_ds = preprocessor.transform(val_ds)
|
||||
train_ds = train_ds.materialize()
|
||||
val_ds = val_ds.materialize()
|
||||
|
||||
# Trainer
|
||||
trainer = TorchTrainer(
|
||||
train_loop_per_worker=train_loop_per_worker,
|
||||
train_loop_config=train_loop_config,
|
||||
scaling_config=scaling_config,
|
||||
run_config=run_config,
|
||||
datasets={"train": train_ds, "val": val_ds},
|
||||
dataset_config=dataset_config,
|
||||
preprocessor=preprocessor,
|
||||
)
|
||||
|
||||
# Train
|
||||
results = trainer.fit()
|
||||
d = {
|
||||
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
|
||||
"run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=results.metrics["trial_id"]),
|
||||
"params": results.config["train_loop_config"],
|
||||
"metrics": utils.dict_to_list(results.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
|
||||
}
|
||||
logger.info(json.dumps(d, indent=2))
|
||||
if results_fp: # pragma: no cover, saving results
|
||||
utils.save_dict(d, results_fp)
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover, application
|
||||
if ray.is_initialized():
|
||||
ray.shutdown()
|
||||
ray.init()
|
||||
app()
|
||||
182
madewithml/tune.py
Normal file
182
madewithml/tune.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import datetime
|
||||
import json
|
||||
|
||||
import ray
|
||||
import typer
|
||||
from ray import tune
|
||||
from ray.air.config import (
|
||||
CheckpointConfig,
|
||||
DatasetConfig,
|
||||
RunConfig,
|
||||
ScalingConfig,
|
||||
)
|
||||
from ray.air.integrations.mlflow import MLflowLoggerCallback
|
||||
from ray.train.torch import TorchTrainer
|
||||
from ray.tune import Tuner
|
||||
from ray.tune.schedulers import AsyncHyperBandScheduler
|
||||
from ray.tune.search import ConcurrencyLimiter
|
||||
from ray.tune.search.hyperopt import HyperOptSearch
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from madewithml import data, train, utils
|
||||
from madewithml.config import MLFLOW_TRACKING_URI, logger
|
||||
|
||||
# Initialize Typer CLI app
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
@app.command()
|
||||
def tune_models(
|
||||
experiment_name: Annotated[str, typer.Option(help="name of the experiment for this training workload.")] = None,
|
||||
dataset_loc: Annotated[str, typer.Option(help="location of the dataset.")] = None,
|
||||
initial_params: Annotated[str, typer.Option(help="initial config for the tuning workload.")] = None,
|
||||
num_workers: Annotated[int, typer.Option(help="number of workers to use for training.")] = 1,
|
||||
cpu_per_worker: Annotated[int, typer.Option(help="number of CPUs to use per worker.")] = 1,
|
||||
gpu_per_worker: Annotated[int, typer.Option(help="number of GPUs to use per worker.")] = 0,
|
||||
num_runs: Annotated[int, typer.Option(help="number of runs in this tuning experiment.")] = 1,
|
||||
num_samples: Annotated[int, typer.Option(help="number of samples to use from dataset.")] = None,
|
||||
num_epochs: Annotated[int, typer.Option(help="number of epochs to train for.")] = 1,
|
||||
batch_size: Annotated[int, typer.Option(help="number of samples per batch.")] = 256,
|
||||
results_fp: Annotated[str, typer.Option(help="filepath to save results to.")] = None,
|
||||
) -> ray.tune.result_grid.ResultGrid:
|
||||
"""Hyperparameter tuning experiment.
|
||||
|
||||
Args:
|
||||
experiment_name (str): name of the experiment for this training workload.
|
||||
dataset_loc (str): location of the dataset.
|
||||
initial_params (str): initial config for the tuning workload.
|
||||
num_workers (int, optional): number of workers to use for training. Defaults to 1.
|
||||
cpu_per_worker (int, optional): number of CPUs to use per worker. Defaults to 1.
|
||||
gpu_per_worker (int, optional): number of GPUs to use per worker. Defaults to 0.
|
||||
num_runs (int, optional): number of runs in this tuning experiment. Defaults to 1.
|
||||
num_samples (int, optional): number of samples to use from dataset.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
num_epochs (int, optional): number of epochs to train for.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
batch_size (int, optional): number of samples per batch.
|
||||
If this is passed in, it will override the config. Defaults to None.
|
||||
results_fp (str, optional): filepath to save the tuning results. Defaults to None.
|
||||
|
||||
Returns:
|
||||
ray.tune.result_grid.ResultGrid: results of the tuning experiment.
|
||||
"""
|
||||
# Set up
|
||||
utils.set_seeds()
|
||||
train_loop_config = {}
|
||||
train_loop_config["num_samples"] = num_samples
|
||||
train_loop_config["num_epochs"] = num_epochs
|
||||
train_loop_config["batch_size"] = batch_size
|
||||
|
||||
# Scaling config
|
||||
scaling_config = ScalingConfig(
|
||||
num_workers=num_workers,
|
||||
use_gpu=bool(gpu_per_worker),
|
||||
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
|
||||
_max_cpu_fraction_per_node=0.8,
|
||||
)
|
||||
|
||||
# Dataset
|
||||
ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config.get("num_samples", None))
|
||||
train_ds, val_ds = data.stratify_split(ds, stratify="tag", test_size=0.2)
|
||||
tags = train_ds.unique(column="tag")
|
||||
train_loop_config["num_classes"] = len(tags)
|
||||
|
||||
# Dataset config
|
||||
dataset_config = {
|
||||
"train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
|
||||
"val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
|
||||
}
|
||||
|
||||
# Preprocess
|
||||
preprocessor = data.CustomPreprocessor()
|
||||
train_ds = preprocessor.fit_transform(train_ds)
|
||||
val_ds = preprocessor.transform(val_ds)
|
||||
train_ds = train_ds.materialize()
|
||||
val_ds = val_ds.materialize()
|
||||
|
||||
# Trainer
|
||||
trainer = TorchTrainer(
|
||||
train_loop_per_worker=train.train_loop_per_worker,
|
||||
train_loop_config=train_loop_config,
|
||||
scaling_config=scaling_config,
|
||||
datasets={"train": train_ds, "val": val_ds},
|
||||
dataset_config=dataset_config,
|
||||
preprocessor=preprocessor,
|
||||
)
|
||||
|
||||
# Checkpoint configuration
|
||||
checkpoint_config = CheckpointConfig(
|
||||
num_to_keep=1,
|
||||
checkpoint_score_attribute="val_loss",
|
||||
checkpoint_score_order="min",
|
||||
)
|
||||
|
||||
# Run configuration
|
||||
mlflow_callback = MLflowLoggerCallback(
|
||||
tracking_uri=MLFLOW_TRACKING_URI,
|
||||
experiment_name=experiment_name,
|
||||
save_artifact=True,
|
||||
)
|
||||
run_config = RunConfig(
|
||||
callbacks=[mlflow_callback],
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
# Hyperparameters to start with
|
||||
initial_params = json.loads(initial_params)
|
||||
search_alg = HyperOptSearch(points_to_evaluate=initial_params)
|
||||
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2) # trade off b/w optimization and search space
|
||||
|
||||
# Parameter space
|
||||
param_space = {
|
||||
"train_loop_config": {
|
||||
"dropout_p": tune.uniform(0.3, 0.9),
|
||||
"lr": tune.loguniform(1e-5, 5e-4),
|
||||
"lr_factor": tune.uniform(0.1, 0.9),
|
||||
"lr_patience": tune.uniform(1, 10),
|
||||
}
|
||||
}
|
||||
|
||||
# Scheduler
|
||||
scheduler = AsyncHyperBandScheduler(
|
||||
max_t=train_loop_config["num_epochs"], # max epoch (<time_attr>) per trial
|
||||
grace_period=1, # min epoch (<time_attr>) per trial
|
||||
)
|
||||
|
||||
# Tune config
|
||||
tune_config = tune.TuneConfig(
|
||||
metric="val_loss",
|
||||
mode="min",
|
||||
search_alg=search_alg,
|
||||
scheduler=scheduler,
|
||||
num_samples=num_runs,
|
||||
)
|
||||
|
||||
# Tuner
|
||||
tuner = Tuner(
|
||||
trainable=trainer,
|
||||
run_config=run_config,
|
||||
param_space=param_space,
|
||||
tune_config=tune_config,
|
||||
)
|
||||
|
||||
# Tune
|
||||
results = tuner.fit()
|
||||
best_trial = results.get_best_result(metric="val_loss", mode="min")
|
||||
d = {
|
||||
"timestamp": datetime.datetime.now().strftime("%B %d, %Y %I:%M:%S %p"),
|
||||
"run_id": utils.get_run_id(experiment_name=experiment_name, trial_id=best_trial.metrics["trial_id"]),
|
||||
"params": best_trial.config["train_loop_config"],
|
||||
"metrics": utils.dict_to_list(best_trial.metrics_dataframe.to_dict(), keys=["epoch", "train_loss", "val_loss"]),
|
||||
}
|
||||
logger.info(json.dumps(d, indent=2))
|
||||
if results_fp: # pragma: no cover, saving results
|
||||
utils.save_dict(d, results_fp)
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover, application
|
||||
if ray.is_initialized():
|
||||
ray.shutdown()
|
||||
ray.init()
|
||||
app()
|
||||
123
madewithml/utils.py
Normal file
123
madewithml/utils.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from ray.data import DatasetContext
|
||||
from ray.train.torch import get_device
|
||||
|
||||
from madewithml.config import mlflow
|
||||
|
||||
DatasetContext.get_current().execution_options.preserve_order = True
|
||||
|
||||
|
||||
def set_seeds(seed: int = 42):
|
||||
"""Set seeds for reproducibility."""
|
||||
np.random.seed(seed)
|
||||
random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed(seed)
|
||||
eval("setattr(torch.backends.cudnn, 'deterministic', True)")
|
||||
eval("setattr(torch.backends.cudnn, 'benchmark', False)")
|
||||
os.environ["PYTHONHASHSEED"] = str(seed)
|
||||
|
||||
|
||||
def load_dict(path: str) -> Dict:
|
||||
"""Load a dictionary from a JSON's filepath.
|
||||
|
||||
Args:
|
||||
path (str): location of file.
|
||||
|
||||
Returns:
|
||||
Dict: loaded JSON data.
|
||||
"""
|
||||
with open(path) as fp:
|
||||
d = json.load(fp)
|
||||
return d
|
||||
|
||||
|
||||
def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
|
||||
"""Save a dictionary to a specific location.
|
||||
|
||||
Args:
|
||||
d (Dict): data to save.
|
||||
path (str): location of where to save the data.
|
||||
cls (optional): encoder to use on dict data. Defaults to None.
|
||||
sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
|
||||
"""
|
||||
directory = os.path.dirname(path)
|
||||
if directory and not os.path.exists(directory): # pragma: no cover
|
||||
os.makedirs(directory)
|
||||
with open(path, "w") as fp:
|
||||
json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
|
||||
fp.write("\n")
|
||||
|
||||
|
||||
def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
|
||||
"""Pad an 2D array with zeros until all rows in the
|
||||
2D array are of the same length as a the longest
|
||||
row in the 2D array.
|
||||
|
||||
Args:
|
||||
arr (np.array): input array
|
||||
|
||||
Returns:
|
||||
np.array: zero padded array
|
||||
"""
|
||||
max_len = max(len(row) for row in arr)
|
||||
padded_arr = np.zeros((arr.shape[0], max_len), dtype=dtype)
|
||||
for i, row in enumerate(arr):
|
||||
padded_arr[i][: len(row)] = row
|
||||
return padded_arr
|
||||
|
||||
|
||||
def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]: # pragma: no cover, air internal
|
||||
"""Convert a batch of numpy arrays to tensors (with appropriate padding).
|
||||
|
||||
Args:
|
||||
batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.
|
||||
|
||||
Returns:
|
||||
Dict[str, torch.Tensor]: output batch as a dictionary of tensors.
|
||||
"""
|
||||
batch["ids"] = pad_array(batch["ids"])
|
||||
batch["masks"] = pad_array(batch["masks"])
|
||||
dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
|
||||
tensor_batch = {}
|
||||
for key, array in batch.items():
|
||||
tensor_batch[key] = torch.as_tensor(array, dtype=dtypes[key], device=get_device())
|
||||
return tensor_batch
|
||||
|
||||
|
||||
def get_run_id(experiment_name: str, trial_id: str) -> str: # pragma: no cover, mlflow functionality
|
||||
"""Get the MLflow run ID for a specific Ray trial ID.
|
||||
|
||||
Args:
|
||||
experiment_name (str): name of the experiment.
|
||||
trial_id (str): id of the trial.
|
||||
|
||||
Returns:
|
||||
str: run id of the trial.
|
||||
"""
|
||||
trial_name = f"TorchTrainer_{trial_id}"
|
||||
run = mlflow.search_runs(experiment_names=[experiment_name], filter_string=f"tags.trial_name = '{trial_name}'").iloc[0]
|
||||
return run.run_id
|
||||
|
||||
|
||||
def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
|
||||
"""Convert a dictionary to a list of dictionaries.
|
||||
|
||||
Args:
|
||||
data (Dict): input dictionary.
|
||||
keys (List[str]): keys to include in the output list of dictionaries.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: output list of dictionaries.
|
||||
"""
|
||||
list_of_dicts = []
|
||||
for i in range(len(data[keys[0]])):
|
||||
new_dict = {key: data[key][i] for key in keys}
|
||||
list_of_dicts.append(new_dict)
|
||||
return list_of_dicts
|
||||
Reference in New Issue
Block a user