updated to Ray 2.7

GokuMohandas 2023-09-18 22:03:20 -07:00
parent 71b3d50a05
commit b98bd5b1ae
15 changed files with 3484 additions and 2086 deletions

View File

@ -108,6 +108,7 @@ touch .env
```bash
# Inside .env
GITHUB_USERNAME="CHANGE_THIS_TO_YOUR_USERNAME" # ← CHANGE THIS
```
```bash
source .env
```
@ -120,8 +121,6 @@ Now we're ready to clone the repository that has all of our code:
```bash
git clone https://github.com/GokuMohandas/Made-With-ML.git .
git remote set-url origin https://github.com/$GITHUB_USERNAME/Made-With-ML.git # <-- CHANGE THIS to your username
git checkout -b dev
```
### Virtual environment
@ -289,7 +288,6 @@ python madewithml/evaluate.py \
### Inference
```bash
# Get run ID
export EXPERIMENT_NAME="llm"
export RUN_ID=$(python madewithml/predict.py get-best-run-id --experiment-name $EXPERIMENT_NAME --metric val_loss --mode ASC)
python madewithml/predict.py predict \
@ -485,17 +483,23 @@ We're not going to manually deploy our application every time we make a change.
<img src="https://madewithml.com/static/images/mlops/cicd/cicd.png">
</div>
1. We'll start by adding the necessary credentials to the [`/settings/secrets/actions`](https://github.com/GokuMohandas/Made-With-ML/settings/secrets/actions) page of our GitHub repository.
1. Create a new GitHub branch to save our changes to and execute CI/CD workloads:
```bash
git remote set-url origin https://github.com/$GITHUB_USERNAME/Made-With-ML.git # <-- CHANGE THIS to your username
git checkout -b dev
```
2. We'll start by adding the necessary credentials to the [`/settings/secrets/actions`](https://github.com/GokuMohandas/Made-With-ML/settings/secrets/actions) page of our GitHub repository.
```bash
export ANYSCALE_HOST=https://console.anyscale.com
export ANYSCALE_CLI_TOKEN=$YOUR_CLI_TOKEN # retrieved from https://console.anyscale.com/o/madewithml/credentials
```
2. Now we can make changes to our code (not on the `main` branch) and push them to GitHub. But before we can push to our repository, we'll first need to authenticate with our credentials:
3. Now we can make changes to our code (not on the `main` branch) and push them to GitHub. But before we can push to our repository, we'll first need to authenticate with our credentials:
```bash
git config --global user.name "Your Name" # <-- CHANGE THIS to your name
git config --global user.name $GITHUB_USERNAME # <-- CHANGE THIS to your username
git config --global user.email you@example.com # <-- CHANGE THIS to your email
git add .
git commit -m "" # <-- CHANGE THIS to your message
@ -504,13 +508,13 @@ git push origin dev
Now you will be prompted to enter your username and password (personal access token). Follow these steps to get a personal access token: [New GitHub personal access token](https://github.com/settings/tokens/new) → Add a name → Toggle `repo` and `workflow` → Click `Generate token` (scroll down) → Copy the token and paste it when prompted for your password.
3. Now we can open a PR from this branch to our `main` branch, which will trigger the [workloads workflow](/.github/workflows/workloads.yaml). If the workflow (Anyscale Jobs) succeeds, it will post comments with the training and evaluation results directly on the PR.
4. Now we can open a PR from this branch to our `main` branch, which will trigger the [workloads workflow](/.github/workflows/workloads.yaml). If the workflow (Anyscale Jobs) succeeds, it will post comments with the training and evaluation results directly on the PR.
<div align="center">
<img src="https://madewithml.com/static/images/mlops/cicd/comments.png">
</div>
4. If we like the results, we can merge the PR into the `main` branch. This will trigger the [serve workflow](/.github/workflows/serve.yaml), which will roll out our new service to production!
5. If we like the results, we can merge the PR into the `main` branch. This will trigger the [serve workflow](/.github/workflows/serve.yaml), which will roll out our new service to production!
### Continual learning

View File

@ -5,7 +5,6 @@ import sys
from pathlib import Path
import mlflow
import pretty_errors # NOQA: F401 (imported but unused)
# Directories
ROOT_DIR = Path(__file__).parent.parent.absolute()

View File

@ -5,7 +5,6 @@ import numpy as np
import pandas as pd
import ray
from ray.data import Dataset
from ray.data.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
@ -135,13 +134,18 @@ def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
return outputs
class CustomPreprocessor(Preprocessor):
class CustomPreprocessor:
"""Custom preprocessor class."""
def _fit(self, ds):
def __init__(self, class_to_index={}):
self.class_to_index = class_to_index or {} # mutable defaults
self.index_to_class = {v: k for k, v in self.class_to_index.items()}
def fit(self, ds):
tags = ds.unique(column="tag")
self.class_to_index = {tag: i for i, tag in enumerate(tags)}
self.index_to_class = {v: k for k, v in self.class_to_index.items()}
return self
def _transform_pandas(self, batch): # could also do _transform_numpy
return preprocess(batch, class_to_index=self.class_to_index)
def transform(self, ds):
return ds.map_batches(preprocess, fn_kwargs={"class_to_index": self.class_to_index}, batch_format="pandas")
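For reference, a minimal sketch of how the rewritten preprocessor is used now that it no longer subclasses Ray's `Preprocessor` (the toy rows below are hypothetical):
```python
import ray

from madewithml.data import CustomPreprocessor

# Hypothetical toy rows with the "tag" column the preprocessor expects
ds = ray.data.from_items([
    {"title": "Attention is all you need", "description": "transformers", "tag": "natural-language-processing"},
    {"title": "ResNet", "description": "residual networks", "tag": "computer-vision"},
])

preprocessor = CustomPreprocessor()
preprocessor = preprocessor.fit(ds)           # builds class_to_index from the unique tags
preprocessed_ds = preprocessor.transform(ds)  # tokenizes via map_batches (pandas batches)
```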

View File

@ -8,13 +8,13 @@ import ray
import ray.train.torch # NOQA: F401 (imported but unused)
import typer
from ray.data import Dataset
from ray.train.torch.torch_predictor import TorchPredictor
from sklearn.metrics import precision_recall_fscore_support
from snorkel.slicing import PandasSFApplier, slicing_function
from typing_extensions import Annotated
from madewithml import predict, utils
from madewithml.config import logger
from madewithml.predict import TorchPredictor
# Initialize Typer CLI app
app = typer.Typer()
@ -133,8 +133,8 @@ def evaluate(
y_true = np.stack([item["targets"] for item in values])
# y_pred
z = predictor.predict(data=ds.to_pandas())["predictions"]
y_pred = np.stack(z).argmax(1)
predictions = preprocessed_ds.map_batches(predictor).take_all()
y_pred = np.array([d["output"] for d in predictions])
# Metrics
metrics = {
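The old `TorchPredictor.predict(data=...)` call is replaced by streaming predictions through `Dataset.map_batches` and collecting them with `take_all`. A minimal sketch of that pattern with a stand-in callable (all names here are illustrative):
```python
import numpy as np
import ray

# Stand-in for the predictor's __call__: any callable mapped over batches works
def stub_predictor(batch):
    return {"output": np.zeros(len(batch["ids"]), dtype=np.int64)}

ds = ray.data.from_items([{"ids": i} for i in range(8)])
predictions = ds.map_batches(stub_predictor).take_all()
y_pred = np.array([d["output"] for d in predictions])
```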

View File

@ -1,13 +1,20 @@
import json
import os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel
class FinetunedLLM(nn.Module): # pragma: no cover, torch model
"""Model architecture for a Large Language Model (LLM) that we will fine-tune."""
class FinetunedLLM(nn.Module):
def __init__(self, llm, dropout_p, embedding_dim, num_classes):
super(FinetunedLLM, self).__init__()
self.llm = llm
self.dropout_p = dropout_p
self.embedding_dim = embedding_dim
self.num_classes = num_classes
self.dropout = torch.nn.Dropout(dropout_p)
self.fc1 = torch.nn.Linear(embedding_dim, num_classes)
@ -17,3 +24,36 @@ class FinetunedLLM(nn.Module): # pragma: no cover, torch model
z = self.dropout(pool)
z = self.fc1(z)
return z
@torch.inference_mode()
def predict(self, batch):
self.eval()
z = self(batch)
y_pred = torch.argmax(z, dim=1).cpu().numpy()
return y_pred
@torch.inference_mode()
def predict_proba(self, batch):
self.eval()
z = self(batch)
y_probs = F.softmax(z, dim=1).cpu().numpy()
return y_probs
def save(self, dp):
with open(Path(dp, "args.json"), "w") as fp:
contents = {
"dropout_p": self.dropout_p,
"embedding_dim": self.embedding_dim,
"num_classes": self.num_classes,
}
json.dump(contents, fp, indent=4, sort_keys=False)
torch.save(self.state_dict(), os.path.join(dp, "model.pt"))
@classmethod
def load(cls, args_fp, state_dict_fp):
with open(args_fp, "r") as fp:
kwargs = json.load(fp=fp)
llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
model = cls(llm=llm, **kwargs)
model.load_state_dict(torch.load(state_dict_fp, map_location=torch.device("cpu")))
return model
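Since checkpoints are now plain directories, `save` and `load` round-trip the model manually. A quick sketch of that round trip (the `dropout_p` and `num_classes` values are placeholders):
```python
import tempfile
from pathlib import Path

from transformers import BertModel

from madewithml.models import FinetunedLLM

llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
model = FinetunedLLM(llm=llm, dropout_p=0.5, embedding_dim=llm.config.hidden_size, num_classes=4)

with tempfile.TemporaryDirectory() as dp:
    model.save(dp=dp)  # writes args.json and model.pt
    restored = FinetunedLLM.load(Path(dp, "args.json"), Path(dp, "model.pt"))
```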

View File

@ -1,19 +1,20 @@
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List
from urllib.parse import urlparse
import numpy as np
import pandas as pd
import ray
import torch
import typer
from numpyencoder import NumpyEncoder
from ray.air import Result
from ray.train.torch import TorchPredictor
from ray.train.torch.torch_checkpoint import TorchCheckpoint
from typing_extensions import Annotated
from madewithml.config import logger, mlflow
from madewithml.data import CustomPreprocessor
from madewithml.models import FinetunedLLM
from madewithml.utils import collate_fn
# Initialize Typer CLI app
app = typer.Typer()
@ -48,25 +49,51 @@ def format_prob(prob: Iterable, index_to_class: Dict) -> Dict:
return d
def predict_with_proba(
df: pd.DataFrame,
predictor: ray.train.torch.torch_predictor.TorchPredictor,
class TorchPredictor:
def __init__(self, preprocessor, model):
self.preprocessor = preprocessor
self.model = model
self.model.eval()
def __call__(self, batch):
results = self.model.predict(collate_fn(batch))
return {"output": results}
def predict_proba(self, batch):
results = self.model.predict_proba(collate_fn(batch))
return {"output": results}
def get_preprocessor(self):
return self.preprocessor
@classmethod
def from_checkpoint(cls, checkpoint):
metadata = checkpoint.get_metadata()
preprocessor = CustomPreprocessor(class_to_index=metadata["class_to_index"])
model = FinetunedLLM.load(Path(checkpoint.path, "args.json"), Path(checkpoint.path, "model.pt"))
return cls(preprocessor=preprocessor, model=model)
def predict_proba(
ds: ray.data.dataset.Dataset,
predictor: TorchPredictor,
) -> List: # pragma: no cover, tested with inference workload
"""Predict tags (with probabilities) for input data from a dataframe.
Args:
df (pd.DataFrame): dataframe with input features.
predictor (ray.train.torch.torch_predictor.TorchPredictor): loaded predictor from a checkpoint.
predictor (TorchPredictor): loaded predictor from a checkpoint.
Returns:
List: list of predicted labels.
"""
preprocessor = predictor.get_preprocessor()
z = predictor.predict(data=df)["predictions"]
y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
preprocessed_ds = preprocessor.transform(ds)
outputs = preprocessed_ds.map_batches(predictor.predict_proba)
y_prob = np.array([d["output"] for d in outputs.take_all()])
results = []
for i, prob in enumerate(y_prob):
tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
tag = preprocessor.index_to_class[prob.argmax()]
results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
return results
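A sketch of calling the new Dataset-based `predict_proba` end to end (the `run_id` here is assumed to come from an earlier training run):
```python
import ray

from madewithml import predict
from madewithml.predict import TorchPredictor

# Hypothetical: restore the predictor from the best checkpoint of a known run
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)  # run_id assumed known
predictor = TorchPredictor.from_checkpoint(best_checkpoint)

sample_ds = ray.data.from_items([{"title": "Transformers", "description": "Fine-tuning LLMs.", "tag": "other"}])
results = predict.predict_proba(ds=sample_ds, predictor=predictor)
print(results[0]["prediction"], results[0]["probabilities"])
```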
@ -125,11 +152,10 @@ def predict(
# Load components
best_checkpoint = get_best_checkpoint(run_id=run_id)
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
# preprocessor = predictor.get_preprocessor()
# Predict
sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
results = predict_with_proba(df=sample_df, predictor=predictor)
sample_ds = ray.data.from_items([{"title": title, "description": description, "tag": "other"}])
results = predict_proba(ds=sample_ds, predictor=predictor)
logger.info(json.dumps(results, cls=NumpyEncoder, indent=2))
return results

View File

@ -3,11 +3,9 @@ import os
from http import HTTPStatus
from typing import Dict
import pandas as pd
import ray
from fastapi import FastAPI
from ray import serve
from ray.train.torch import TorchPredictor
from starlette.requests import Request
from madewithml import evaluate, predict
@ -21,7 +19,7 @@ app = FastAPI(
)
@serve.deployment(route_prefix="/", num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
@serve.deployment(num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
@serve.ingress(app)
class ModelDeployment:
def __init__(self, run_id: str, threshold: float = 0.9):
@ -30,8 +28,7 @@ class ModelDeployment:
self.threshold = threshold
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # so workers have access to model registry
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
self.predictor = TorchPredictor.from_checkpoint(best_checkpoint)
self.preprocessor = self.predictor.get_preprocessor()
self.predictor = predict.TorchPredictor.from_checkpoint(best_checkpoint)
@app.get("/")
def _index(self) -> Dict:
@ -55,11 +52,10 @@ class ModelDeployment:
return {"results": results}
@app.post("/predict/")
async def _predict(self, request: Request) -> Dict:
# Get prediction
async def _predict(self, request: Request):
data = await request.json()
df = pd.DataFrame([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
results = predict.predict_with_proba(df=df, predictor=self.predictor)
sample_ds = ray.data.from_items([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
results = predict.predict_proba(ds=sample_ds, predictor=self.predictor)
# Apply custom logic
for i, result in enumerate(results):
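Once the deployment is running (e.g. via `serve run`), the endpoint can be exercised with a plain HTTP client; a hypothetical local call, assuming Serve's default port:
```python
import requests

# Hypothetical request payload against the /predict/ route defined above
data = {"title": "Transfer learning with transformers", "description": "Fine-tuning LLMs for classification."}
response = requests.post("http://127.0.0.1:8000/predict/", json=data)
print(response.json())
```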

View File

@ -1,6 +1,7 @@
import datetime
import json
import os
import tempfile
from typing import Tuple
import numpy as np
@ -10,21 +11,23 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import typer
from ray.air import session
from ray.air.config import (
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.data import Dataset
from ray.train import (
Checkpoint,
CheckpointConfig,
DatasetConfig,
DataConfig,
RunConfig,
ScalingConfig,
)
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.data import Dataset
from ray.train.torch import TorchCheckpoint, TorchTrainer
from ray.train.torch import TorchTrainer
from torch.nn.parallel.distributed import DistributedDataParallel
from transformers import BertModel
from typing_extensions import Annotated
from madewithml import data, models, utils
from madewithml import data, utils
from madewithml.config import EFS_DIR, MLFLOW_TRACKING_URI, logger
from madewithml.models import FinetunedLLM
# Initialize Typer CLI app
app = typer.Typer()
@ -106,18 +109,18 @@ def train_loop_per_worker(config: dict) -> None: # pragma: no cover, tested via
lr = config["lr"]
lr_factor = config["lr_factor"]
lr_patience = config["lr_patience"]
batch_size = config["batch_size"]
num_epochs = config["num_epochs"]
batch_size = config["batch_size"]
num_classes = config["num_classes"]
# Get datasets
utils.set_seeds()
train_ds = session.get_dataset_shard("train")
val_ds = session.get_dataset_shard("val")
train_ds = train.get_dataset_shard("train")
val_ds = train.get_dataset_shard("val")
# Model
llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
model = models.FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
model = FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
model = train.torch.prepare_model(model)
# Training components
@ -126,7 +129,8 @@ def train_loop_per_worker(config: dict) -> None: # pragma: no cover, tested via
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)
# Training
batch_size_per_worker = batch_size // session.get_world_size()
num_workers = train.get_context().get_world_size()
batch_size_per_worker = batch_size // num_workers
for epoch in range(num_epochs):
# Step
train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
@ -134,9 +138,14 @@ def train_loop_per_worker(config: dict) -> None: # pragma: no cover, tested via
scheduler.step(val_loss)
# Checkpoint
metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
checkpoint = TorchCheckpoint.from_model(model=model)
session.report(metrics, checkpoint=checkpoint)
with tempfile.TemporaryDirectory() as dp:
if isinstance(model, DistributedDataParallel): # cpu
model.module.save(dp=dp)
else:
model.save(dp=dp)
metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
checkpoint = Checkpoint.from_directory(dp)
train.report(metrics, checkpoint=checkpoint)
@app.command()
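On the consumption side, the checkpoints reported with `train.report` come back on the `Result` object returned by the trainer; a sketch, assuming `trainer` is the `TorchTrainer` configured in `train_model` below:
```python
from madewithml.predict import TorchPredictor

results = trainer.fit()                           # hypothetical: trainer configured as below
best_checkpoint = results.best_checkpoints[0][0]  # list of (Checkpoint, metrics) tuples
predictor = TorchPredictor.from_checkpoint(best_checkpoint)
```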
@ -183,7 +192,6 @@ def train_model(
num_workers=num_workers,
use_gpu=bool(gpu_per_worker),
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
_max_cpu_fraction_per_node=0.8,
)
# Checkpoint config
@ -201,7 +209,7 @@ def train_model(
)
# Run config
run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR)
run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR, local_dir=EFS_DIR)
# Dataset
ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"])
@ -210,14 +218,13 @@ def train_model(
train_loop_config["num_classes"] = len(tags)
# Dataset config
dataset_config = {
"train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
"val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
}
options = ray.data.ExecutionOptions(preserve_order=True)
dataset_config = DataConfig(datasets_to_split=["train"], execution_options=options)
# Preprocess
preprocessor = data.CustomPreprocessor()
train_ds = preprocessor.fit_transform(train_ds)
preprocessor = preprocessor.fit(train_ds)
train_ds = preprocessor.transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()
@ -230,7 +237,7 @@ def train_model(
run_config=run_config,
datasets={"train": train_ds, "val": val_ds},
dataset_config=dataset_config,
preprocessor=preprocessor,
metadata={"class_to_index": preprocessor.class_to_index},
)
# Train

View File

@ -73,7 +73,6 @@ def tune_models(
num_workers=num_workers,
use_gpu=bool(gpu_per_worker),
resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
_max_cpu_fraction_per_node=0.8,
)
# Dataset
@ -90,7 +89,8 @@ def tune_models(
# Preprocess
preprocessor = data.CustomPreprocessor()
train_ds = preprocessor.fit_transform(train_ds)
preprocessor = preprocessor.fit(train_ds)
train_ds = preprocessor.transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()
@ -102,7 +102,7 @@ def tune_models(
scaling_config=scaling_config,
datasets={"train": train_ds, "val": val_ds},
dataset_config=dataset_config,
preprocessor=preprocessor,
metadata={"class_to_index": preprocessor.class_to_index},
)
# Checkpoint configuration
@ -118,7 +118,7 @@ def tune_models(
experiment_name=experiment_name,
save_artifact=True,
)
run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR)
run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR, local_dir=EFS_DIR)
# Hyperparameters to start with
initial_params = json.loads(initial_params)

File diff suppressed because one or more lines are too long

View File

@ -7,7 +7,6 @@ nltk==3.8.1
numpy==1.24.3
numpyencoder==0.3.0
pandas==2.0.1
pretty-errors==1.2.25
python-dotenv==1.0.0
ray[air]==2.6.0
ray[air]==2.7.0
scikit-learn==1.2.2

View File

@ -54,5 +54,7 @@ def test_preprocess(df, class_to_index):
def test_fit_transform(dataset_loc, preprocessor):
ds = data.load_data(dataset_loc=dataset_loc)
preprocessor.fit_transform(ds)
preprocessor = preprocessor.fit(ds)
preprocessed_ds = preprocessor.transform(ds)
assert len(preprocessor.class_to_index) == 4
assert ds.count() == preprocessed_ds.count()

View File

@ -4,6 +4,7 @@ from pathlib import Path
import numpy as np
import pytest
import torch
from ray.train.torch import get_device
from madewithml import utils
@ -42,9 +43,9 @@ def test_collate_fn():
}
processed_batch = utils.collate_fn(batch)
expected_batch = {
"ids": torch.tensor([[1, 2, 0], [1, 2, 3]], dtype=torch.int32),
"masks": torch.tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.int32),
"targets": torch.tensor([3, 1], dtype=torch.int64),
"ids": torch.as_tensor([[1, 2, 0], [1, 2, 3]], dtype=torch.int32, device=get_device()),
"masks": torch.as_tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.int32, device=get_device()),
"targets": torch.as_tensor([3, 1], dtype=torch.int64, device=get_device()),
}
for k in batch:
assert torch.allclose(processed_batch[k], expected_batch[k])

View File

@ -1,7 +1,7 @@
import pytest
from ray.train.torch.torch_predictor import TorchPredictor
from madewithml import predict
from madewithml.predict import TorchPredictor
def pytest_addoption(parser):

View File

@ -1,12 +1,9 @@
import numpy as np
import pandas as pd
import ray
from madewithml import predict
def get_label(text, predictor):
df = pd.DataFrame({"title": [text], "description": "", "tag": "other"})
z = predictor.predict(data=df)["predictions"]
preprocessor = predictor.get_preprocessor()
label = predict.decode(np.stack(z).argmax(1), preprocessor.index_to_class)[0]
return label
sample_ds = ray.data.from_items([{"title": text, "description": "", "tag": "other"}])
results = predict.predict_proba(ds=sample_ds, predictor=predictor)
return results[0]["prediction"]