updated to Ray 2.7

GokuMohandas 2023-09-18 22:03:20 -07:00
parent 71b3d50a05
commit b98bd5b1ae
15 changed files with 3484 additions and 2086 deletions

View File

@@ -108,6 +108,7 @@ touch .env
 ```bash
 # Inside .env
 GITHUB_USERNAME="CHANGE_THIS_TO_YOUR_USERNAME"  # ← CHANGE THIS
+```
 ```bash
 source .env
 ```
@@ -120,8 +121,6 @@ Now we're ready to clone the repository that has all of our code:
 ```bash
 git clone https://github.com/GokuMohandas/Made-With-ML.git .
-git remote set-url origin https://github.com/$GITHUB_USERNAME/Made-With-ML.git  # <-- CHANGE THIS to your username
-git checkout -b dev
 ```
 ### Virtual environment
@@ -289,7 +288,6 @@ python madewithml/evaluate.py \
 ### Inference
 ```bash
-# Get run ID
 export EXPERIMENT_NAME="llm"
 export RUN_ID=$(python madewithml/predict.py get-best-run-id --experiment-name $EXPERIMENT_NAME --metric val_loss --mode ASC)
 python madewithml/predict.py predict \
@@ -485,17 +483,23 @@ We're not going to manually deploy our application every time we make a change.
 <div align="center">
   <img src="https://madewithml.com/static/images/mlops/cicd/cicd.png">
 </div>
-1. We'll start by adding the necessary credentials to the [`/settings/secrets/actions`](https://github.com/GokuMohandas/Made-With-ML/settings/secrets/actions) page of our GitHub repository.
+1. Create a new GitHub branch to save our changes to and execute CI/CD workloads:
+```bash
+git remote set-url origin https://github.com/$GITHUB_USERNAME/Made-With-ML.git  # <-- CHANGE THIS to your username
+git checkout -b dev
+```
+2. We'll start by adding the necessary credentials to the [`/settings/secrets/actions`](https://github.com/GokuMohandas/Made-With-ML/settings/secrets/actions) page of our GitHub repository.
 ```bash
 export ANYSCALE_HOST=https://console.anyscale.com
 export ANYSCALE_CLI_TOKEN=$YOUR_CLI_TOKEN  # retrieved from https://console.anyscale.com/o/madewithml/credentials
 ```
-2. Now we can make changes to our code (not on `main` branch) and push them to GitHub. But in order to push our code to GitHub, we'll need to first authenticate with our credentials before pushing to our repository:
+3. Now we can make changes to our code (not on the `main` branch) and push them to GitHub. But in order to push our code to GitHub, we'll need to first authenticate with our credentials before pushing to our repository:
 ```bash
-git config --global user.name "Your Name"  # <-- CHANGE THIS to your name
+git config --global user.name $GITHUB_USERNAME  # <-- CHANGE THIS to your username
 git config --global user.email you@example.com  # <-- CHANGE THIS to your email
 git add .
 git commit -m ""  # <-- CHANGE THIS to your message
@@ -504,13 +508,13 @@ git push origin dev
 Now you will be prompted to enter your username and password (personal access token). Follow these steps to get a personal access token: [New GitHub personal access token](https://github.com/settings/tokens/new) → Add a name → Toggle `repo` and `workflow` → Click `Generate token` (scroll down) → Copy the token and paste it when prompted for your password.
-3. Now we can start a PR from this branch to our `main` branch and this will trigger the [workloads workflow](/.github/workflows/workloads.yaml). If the workflow (Anyscale Jobs) succeeds, this will produce comments with the training and evaluation results directly on the PR.
+4. Now we can start a PR from this branch to our `main` branch and this will trigger the [workloads workflow](/.github/workflows/workloads.yaml). If the workflow (Anyscale Jobs) succeeds, this will produce comments with the training and evaluation results directly on the PR.
 <div align="center">
   <img src="https://madewithml.com/static/images/mlops/cicd/comments.png">
 </div>
-4. If we like the results, we can merge the PR into the `main` branch. This will trigger the [serve workflow](/.github/workflows/serve.yaml) which will rollout our new service to production!
+5. If we like the results, we can merge the PR into the `main` branch. This will trigger the [serve workflow](/.github/workflows/serve.yaml) which will roll out our new service to production!
 ### Continual learning

View File

@@ -5,7 +5,6 @@ import sys
 from pathlib import Path

 import mlflow
-import pretty_errors  # NOQA: F401 (imported but unused)

 # Directories
 ROOT_DIR = Path(__file__).parent.parent.absolute()

View File

@@ -5,7 +5,6 @@ import numpy as np
 import pandas as pd
 import ray
 from ray.data import Dataset
-from ray.data.preprocessor import Preprocessor
 from sklearn.model_selection import train_test_split
 from transformers import BertTokenizer
@@ -135,13 +134,18 @@ def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
     return outputs


-class CustomPreprocessor(Preprocessor):
+class CustomPreprocessor:
     """Custom preprocessor class."""

-    def _fit(self, ds):
+    def __init__(self, class_to_index={}):
+        self.class_to_index = class_to_index or {}  # mutable defaults
+        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
+
+    def fit(self, ds):
         tags = ds.unique(column="tag")
         self.class_to_index = {tag: i for i, tag in enumerate(tags)}
         self.index_to_class = {v: k for k, v in self.class_to_index.items()}
+        return self

-    def _transform_pandas(self, batch):  # could also do _transform_numpy
-        return preprocess(batch, class_to_index=self.class_to_index)
+    def transform(self, ds):
+        return ds.map_batches(preprocess, fn_kwargs={"class_to_index": self.class_to_index}, batch_format="pandas")
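For context, the custom class replaces Ray's removed `Preprocessor` base class with a plain fit/transform object. A minimal usage sketch (the toy rows and tag values are illustrative, not from the project's dataset):

```python
import ray
from madewithml.data import CustomPreprocessor

ds = ray.data.from_items([{"title": "t", "description": "d", "tag": "mlops"}])  # toy rows
preprocessor = CustomPreprocessor().fit(ds)   # fit() now returns self, enabling chaining
preprocessed_ds = preprocessor.transform(ds)  # lazily maps preprocess() over pandas batches
print(preprocessor.class_to_index)            # e.g. {"mlops": 0}
```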

View File

@@ -8,13 +8,13 @@ import ray
 import ray.train.torch  # NOQA: F401 (imported but unused)
 import typer
 from ray.data import Dataset
-from ray.train.torch.torch_predictor import TorchPredictor
 from sklearn.metrics import precision_recall_fscore_support
 from snorkel.slicing import PandasSFApplier, slicing_function
 from typing_extensions import Annotated

 from madewithml import predict, utils
 from madewithml.config import logger
+from madewithml.predict import TorchPredictor

 # Initialize Typer CLI app
 app = typer.Typer()
@@ -133,8 +133,8 @@ def evaluate(
     y_true = np.stack([item["targets"] for item in values])

     # y_pred
-    z = predictor.predict(data=ds.to_pandas())["predictions"]
-    y_pred = np.stack(z).argmax(1)
+    predictions = preprocessed_ds.map_batches(predictor).take_all()
+    y_pred = np.array([d["output"] for d in predictions])

     # Metrics
     metrics = {
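The old `TorchPredictor.predict(data=df)` call is replaced by streaming batch inference: `map_batches` invokes the callable predictor once per batch and `take_all()` collects row dicts. A self-contained sketch of the pattern with a toy callable (not the project's model):

```python
import numpy as np
import ray

class ToyPredictor:
    def __call__(self, batch):  # invoked once per batch of rows
        return {"output": np.asarray(batch["x"]) * 2}

ds = ray.data.from_items([{"x": i} for i in range(4)])
predictions = ds.map_batches(ToyPredictor()).take_all()
y_pred = np.array([d["output"] for d in predictions])  # array([0, 2, 4, 6])
```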

View File

@@ -1,13 +1,20 @@
+import json
+import os
+from pathlib import Path
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BertModel


-class FinetunedLLM(nn.Module):  # pragma: no cover, torch model
+class FinetunedLLM(nn.Module):
+    """Model architecture for a Large Language Model (LLM) that we will fine-tune."""
+
     def __init__(self, llm, dropout_p, embedding_dim, num_classes):
         super(FinetunedLLM, self).__init__()
         self.llm = llm
+        self.dropout_p = dropout_p
+        self.embedding_dim = embedding_dim
+        self.num_classes = num_classes
         self.dropout = torch.nn.Dropout(dropout_p)
         self.fc1 = torch.nn.Linear(embedding_dim, num_classes)
@@ -17,3 +24,36 @@ class FinetunedLLM(nn.Module):  # pragma: no cover, torch model
         z = self.dropout(pool)
         z = self.fc1(z)
         return z
+
+    @torch.inference_mode()
+    def predict(self, batch):
+        self.eval()
+        z = self(batch)
+        y_pred = torch.argmax(z, dim=1).cpu().numpy()
+        return y_pred
+
+    @torch.inference_mode()
+    def predict_proba(self, batch):
+        self.eval()
+        z = self(batch)
+        y_probs = F.softmax(z, dim=1).cpu().numpy()
+        return y_probs
+
+    def save(self, dp):
+        with open(Path(dp, "args.json"), "w") as fp:
+            contents = {
+                "dropout_p": self.dropout_p,
+                "embedding_dim": self.embedding_dim,
+                "num_classes": self.num_classes,
+            }
+            json.dump(contents, fp, indent=4, sort_keys=False)
+        torch.save(self.state_dict(), os.path.join(dp, "model.pt"))

+    @classmethod
+    def load(cls, args_fp, state_dict_fp):
+        with open(args_fp, "r") as fp:
+            kwargs = json.load(fp=fp)
+        llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
+        model = cls(llm=llm, **kwargs)
+        model.load_state_dict(torch.load(state_dict_fp, map_location=torch.device("cpu")))
+        return model
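These helpers checkpoint the model as a plain directory (`args.json` + `model.pt`), which is what the new `Checkpoint.from_directory` flow in `train.py` relies on. A round-trip sketch (hyperparameter values are illustrative; `load` re-downloads the SciBERT base weights):

```python
import tempfile
from transformers import BertModel
from madewithml.models import FinetunedLLM

llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
model = FinetunedLLM(llm=llm, dropout_p=0.5, embedding_dim=llm.config.hidden_size, num_classes=4)

with tempfile.TemporaryDirectory() as dp:
    model.save(dp=dp)  # writes args.json and model.pt
    reloaded = FinetunedLLM.load(f"{dp}/args.json", f"{dp}/model.pt")
```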

View File

@@ -1,19 +1,20 @@
 import json
+from pathlib import Path
 from typing import Any, Dict, Iterable, List
 from urllib.parse import urlparse

 import numpy as np
-import pandas as pd
 import ray
-import torch
 import typer
 from numpyencoder import NumpyEncoder
 from ray.air import Result
-from ray.train.torch import TorchPredictor
 from ray.train.torch.torch_checkpoint import TorchCheckpoint
 from typing_extensions import Annotated

 from madewithml.config import logger, mlflow
+from madewithml.data import CustomPreprocessor
+from madewithml.models import FinetunedLLM
+from madewithml.utils import collate_fn

 # Initialize Typer CLI app
 app = typer.Typer()
@@ -48,25 +49,51 @@ def format_prob(prob: Iterable, index_to_class: Dict) -> Dict:
     return d


-def predict_with_proba(
-    df: pd.DataFrame,
-    predictor: ray.train.torch.torch_predictor.TorchPredictor,
+class TorchPredictor:
+    def __init__(self, preprocessor, model):
+        self.preprocessor = preprocessor
+        self.model = model
+        self.model.eval()
+
+    def __call__(self, batch):
+        results = self.model.predict(collate_fn(batch))
+        return {"output": results}
+
+    def predict_proba(self, batch):
+        results = self.model.predict_proba(collate_fn(batch))
+        return {"output": results}
+
+    def get_preprocessor(self):
+        return self.preprocessor
+
+    @classmethod
+    def from_checkpoint(cls, checkpoint):
+        metadata = checkpoint.get_metadata()
+        preprocessor = CustomPreprocessor(class_to_index=metadata["class_to_index"])
+        model = FinetunedLLM.load(Path(checkpoint.path, "args.json"), Path(checkpoint.path, "model.pt"))
+        return cls(preprocessor=preprocessor, model=model)
+
+
+def predict_proba(
+    ds: ray.data.dataset.Dataset,
+    predictor: TorchPredictor,
 ) -> List:  # pragma: no cover, tested with inference workload
     """Predict tags (with probabilities) for input data from a dataframe.

     Args:
         df (pd.DataFrame): dataframe with input features.
-        predictor (ray.train.torch.torch_predictor.TorchPredictor): loaded predictor from a checkpoint.
+        predictor (TorchPredictor): loaded predictor from a checkpoint.

     Returns:
         List: list of predicted labels.
     """
     preprocessor = predictor.get_preprocessor()
-    z = predictor.predict(data=df)["predictions"]
-    y_prob = torch.tensor(np.stack(z)).softmax(dim=1).numpy()
+    preprocessed_ds = preprocessor.transform(ds)
+    outputs = preprocessed_ds.map_batches(predictor.predict_proba)
+    y_prob = np.array([d["output"] for d in outputs.take_all()])
     results = []
     for i, prob in enumerate(y_prob):
-        tag = decode([z[i].argmax()], preprocessor.index_to_class)[0]
+        tag = preprocessor.index_to_class[prob.argmax()]
         results.append({"prediction": tag, "probabilities": format_prob(prob, preprocessor.index_to_class)})
     return results
@@ -125,11 +152,10 @@ def predict(
     # Load components
     best_checkpoint = get_best_checkpoint(run_id=run_id)
     predictor = TorchPredictor.from_checkpoint(best_checkpoint)
-    # preprocessor = predictor.get_preprocessor()

     # Predict
-    sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}])
-    results = predict_with_proba(df=sample_df, predictor=predictor)
+    sample_ds = ray.data.from_items([{"title": title, "description": description, "tag": "other"}])
+    results = predict_proba(ds=sample_ds, predictor=predictor)
     logger.info(json.dumps(results, cls=NumpyEncoder, indent=2))
     return results
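Putting it together, inference now flows entirely through Ray Datasets and the project's own `TorchPredictor`. A sketch of the path (assumes a completed training run; the run ID is left for you to fill in, and the sample title is illustrative):

```python
import ray
from madewithml import predict

run_id = ...  # e.g. from `python madewithml/predict.py get-best-run-id ...`
best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
predictor = predict.TorchPredictor.from_checkpoint(best_checkpoint)
sample_ds = ray.data.from_items([{"title": "Transfer learning with transformers", "description": "", "tag": "other"}])
results = predict.predict_proba(ds=sample_ds, predictor=predictor)  # [{"prediction": ..., "probabilities": {...}}]
```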

View File

@@ -3,11 +3,9 @@ import os
 from http import HTTPStatus
 from typing import Dict

-import pandas as pd
 import ray
 from fastapi import FastAPI
 from ray import serve
-from ray.train.torch import TorchPredictor
 from starlette.requests import Request

 from madewithml import evaluate, predict
@@ -21,7 +19,7 @@ app = FastAPI(
 )

-@serve.deployment(route_prefix="/", num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
+@serve.deployment(num_replicas="1", ray_actor_options={"num_cpus": 8, "num_gpus": 0})
 @serve.ingress(app)
 class ModelDeployment:
     def __init__(self, run_id: str, threshold: int = 0.9):
@@ -30,8 +28,7 @@ class ModelDeployment:
         self.threshold = threshold
         mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)  # so workers have access to model registry
         best_checkpoint = predict.get_best_checkpoint(run_id=run_id)
-        self.predictor = TorchPredictor.from_checkpoint(best_checkpoint)
-        self.preprocessor = self.predictor.get_preprocessor()
+        self.predictor = predict.TorchPredictor.from_checkpoint(best_checkpoint)

     @app.get("/")
     def _index(self) -> Dict:
@@ -55,11 +52,10 @@ class ModelDeployment:
         return {"results": results}

     @app.post("/predict/")
-    async def _predict(self, request: Request) -> Dict:
-        # Get prediction
+    async def _predict(self, request: Request):
         data = await request.json()
-        df = pd.DataFrame([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
-        results = predict.predict_with_proba(df=df, predictor=self.predictor)
+        sample_ds = ray.data.from_items([{"title": data.get("title", ""), "description": data.get("description", ""), "tag": ""}])
+        results = predict.predict_proba(ds=sample_ds, predictor=self.predictor)

         # Apply custom logic
         for i, result in enumerate(results):
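Client code is unaffected by the migration; the endpoint still accepts the same JSON payload. A sketch of a request against a locally running deployment (the default Serve address and port are assumed):

```python
import json
import requests

# Assumes the ModelDeployment is live via `serve run` on the default address.
payload = {"title": "Transfer learning with transformers", "description": "Using transformers for transfer learning."}
response = requests.post("http://127.0.0.1:8000/predict/", data=json.dumps(payload))
print(response.json())  # {"results": [...]}
```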

View File

@@ -1,6 +1,7 @@
 import datetime
 import json
 import os
+import tempfile
 from typing import Tuple

 import numpy as np
@@ -10,21 +11,23 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import typer
-from ray.air import session
-from ray.air.config import (
+from ray.air.integrations.mlflow import MLflowLoggerCallback
+from ray.data import Dataset
+from ray.train import (
+    Checkpoint,
     CheckpointConfig,
-    DatasetConfig,
+    DataConfig,
     RunConfig,
     ScalingConfig,
 )
-from ray.air.integrations.mlflow import MLflowLoggerCallback
-from ray.data import Dataset
-from ray.train.torch import TorchCheckpoint, TorchTrainer
+from ray.train.torch import TorchTrainer
+from torch.nn.parallel.distributed import DistributedDataParallel
 from transformers import BertModel
 from typing_extensions import Annotated

-from madewithml import data, models, utils
+from madewithml import data, utils
 from madewithml.config import EFS_DIR, MLFLOW_TRACKING_URI, logger
+from madewithml.models import FinetunedLLM

 # Initialize Typer CLI app
 app = typer.Typer()
@@ -106,18 +109,18 @@ def train_loop_per_worker(config: dict) -> None:  # pragma: no cover, tested via
     lr = config["lr"]
     lr_factor = config["lr_factor"]
     lr_patience = config["lr_patience"]
-    batch_size = config["batch_size"]
     num_epochs = config["num_epochs"]
+    batch_size = config["batch_size"]
     num_classes = config["num_classes"]

     # Get datasets
     utils.set_seeds()
-    train_ds = session.get_dataset_shard("train")
-    val_ds = session.get_dataset_shard("val")
+    train_ds = train.get_dataset_shard("train")
+    val_ds = train.get_dataset_shard("val")

     # Model
     llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
-    model = models.FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
+    model = FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
     model = train.torch.prepare_model(model)
@@ -126,7 +129,8 @@ def train_loop_per_worker(config: dict) -> None:  # pragma: no cover, tested via
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)

     # Training
-    batch_size_per_worker = batch_size // session.get_world_size()
+    num_workers = train.get_context().get_world_size()
+    batch_size_per_worker = batch_size // num_workers
     for epoch in range(num_epochs):
         # Step
         train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
@@ -134,9 +138,14 @@ def train_loop_per_worker(config: dict) -> None:  # pragma: no cover, tested via
         scheduler.step(val_loss)

         # Checkpoint
+        with tempfile.TemporaryDirectory() as dp:
+            if isinstance(model, DistributedDataParallel):  # cpu
+                model.module.save(dp=dp)
+            else:
+                model.save(dp=dp)
-        metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
-        checkpoint = TorchCheckpoint.from_model(model=model)
-        session.report(metrics, checkpoint=checkpoint)
+            metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
+            checkpoint = Checkpoint.from_directory(dp)
+            train.report(metrics, checkpoint=checkpoint)


 @app.command()
@@ -183,7 +192,6 @@ def train_model(
         num_workers=num_workers,
         use_gpu=bool(gpu_per_worker),
         resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
-        _max_cpu_fraction_per_node=0.8,
     )

     # Checkpoint config
@@ -201,7 +209,7 @@ def train_model(
     )

     # Run config
-    run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR)
+    run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR, local_dir=EFS_DIR)

     # Dataset
     ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"])
@@ -210,14 +218,13 @@ def train_model(
     train_loop_config["num_classes"] = len(tags)

     # Dataset config
-    dataset_config = {
-        "train": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
-        "val": DatasetConfig(fit=False, transform=False, randomize_block_order=False),
-    }
+    options = ray.data.ExecutionOptions(preserve_order=True)
+    dataset_config = DataConfig(datasets_to_split=["train"], execution_options=options)

     # Preprocess
     preprocessor = data.CustomPreprocessor()
-    train_ds = preprocessor.fit_transform(train_ds)
+    preprocessor = preprocessor.fit(train_ds)
+    train_ds = preprocessor.transform(train_ds)
     val_ds = preprocessor.transform(val_ds)
     train_ds = train_ds.materialize()
     val_ds = val_ds.materialize()
@@ -230,7 +237,7 @@ def train_model(
         run_config=run_config,
         datasets={"train": train_ds, "val": val_ds},
         dataset_config=dataset_config,
-        preprocessor=preprocessor,
+        metadata={"class_to_index": preprocessor.class_to_index},
     )

     # Train
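The core migration pattern here: `ray.air.session` becomes `ray.train`, and `TorchCheckpoint.from_model` gives way to saving artifacts into a temporary directory and wrapping it with `Checkpoint.from_directory`. A minimal, runnable skeleton of the Ray 2.7 reporting loop (toy metrics, no real model):

```python
import tempfile
from ray import train
from ray.train import Checkpoint, ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    for epoch in range(config["num_epochs"]):
        with tempfile.TemporaryDirectory() as dp:
            # ... write model/optimizer state into dp here ...
            train.report({"epoch": epoch}, checkpoint=Checkpoint.from_directory(dp))

trainer = TorchTrainer(
    train_loop_per_worker,
    train_loop_config={"num_epochs": 2},
    scaling_config=ScalingConfig(num_workers=1),
)
result = trainer.fit()
```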

View File

@@ -73,7 +73,6 @@ def tune_models(
         num_workers=num_workers,
         use_gpu=bool(gpu_per_worker),
         resources_per_worker={"CPU": cpu_per_worker, "GPU": gpu_per_worker},
-        _max_cpu_fraction_per_node=0.8,
     )

     # Dataset
@@ -90,7 +89,8 @@ def tune_models(

     # Preprocess
     preprocessor = data.CustomPreprocessor()
-    train_ds = preprocessor.fit_transform(train_ds)
+    preprocessor = preprocessor.fit(train_ds)
+    train_ds = preprocessor.transform(train_ds)
     val_ds = preprocessor.transform(val_ds)
     train_ds = train_ds.materialize()
     val_ds = val_ds.materialize()
@@ -102,7 +102,7 @@ def tune_models(
         scaling_config=scaling_config,
         datasets={"train": train_ds, "val": val_ds},
         dataset_config=dataset_config,
-        preprocessor=preprocessor,
+        metadata={"class_to_index": preprocessor.class_to_index},
     )

     # Checkpoint configuration
@@ -118,7 +118,7 @@ def tune_models(
         experiment_name=experiment_name,
         save_artifact=True,
     )
-    run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR)
+    run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR, local_dir=EFS_DIR)

     # Hyperparameters to start with
     initial_params = json.loads(initial_params)
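Since the trainers no longer carry a `preprocessor`, the fitted `class_to_index` mapping rides along as checkpoint `metadata` and is read back by `TorchPredictor.from_checkpoint` via `checkpoint.get_metadata()`. A sketch of that round trip (hypothetical continuation of the `TorchTrainer` skeleton above):

```python
# Continuing the TorchTrainer sketch above (hypothetical):
result = trainer.fit()
class_to_index = result.checkpoint.get_metadata()["class_to_index"]  # same dict passed via metadata=
```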

File diff suppressed because one or more lines are too long

View File

@@ -7,7 +7,6 @@ nltk==3.8.1
 numpy==1.24.3
 numpyencoder==0.3.0
 pandas==2.0.1
-pretty-errors==1.2.25
 python-dotenv==1.0.0
 ray[air]==2.6.0
 scikit-learn==1.2.2

View File

@@ -54,5 +54,7 @@ def test_preprocess(df, class_to_index):

 def test_fit_transform(dataset_loc, preprocessor):
     ds = data.load_data(dataset_loc=dataset_loc)
-    preprocessor.fit_transform(ds)
+    preprocessor = preprocessor.fit(ds)
+    preprocessed_ds = preprocessor.transform(ds)
     assert len(preprocessor.class_to_index) == 4
+    assert ds.count() == preprocessed_ds.count()

View File

@@ -4,6 +4,7 @@ from pathlib import Path

 import numpy as np
 import pytest
 import torch
+from ray.train.torch import get_device

 from madewithml import utils
@@ -42,9 +43,9 @@ def test_collate_fn():
     }
     processed_batch = utils.collate_fn(batch)
     expected_batch = {
-        "ids": torch.tensor([[1, 2, 0], [1, 2, 3]], dtype=torch.int32),
-        "masks": torch.tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.int32),
-        "targets": torch.tensor([3, 1], dtype=torch.int64),
+        "ids": torch.as_tensor([[1, 2, 0], [1, 2, 3]], dtype=torch.int32, device=get_device()),
+        "masks": torch.as_tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.int32, device=get_device()),
+        "targets": torch.as_tensor([3, 1], dtype=torch.int64, device=get_device()),
     }
     for k in batch:
         assert torch.allclose(processed_batch[k], expected_batch[k])

View File

@@ -1,7 +1,7 @@
 import pytest
-from ray.train.torch.torch_predictor import TorchPredictor

 from madewithml import predict
+from madewithml.predict import TorchPredictor


 def pytest_addoption(parser):

View File

@@ -1,12 +1,9 @@
-import numpy as np
-import pandas as pd
+import ray

 from madewithml import predict


 def get_label(text, predictor):
-    df = pd.DataFrame({"title": [text], "description": "", "tag": "other"})
-    z = predictor.predict(data=df)["predictions"]
-    preprocessor = predictor.get_preprocessor()
-    label = predict.decode(np.stack(z).argmax(1), preprocessor.index_to_class)[0]
-    return label
+    sample_ds = ray.data.from_items([{"title": text, "description": "", "tag": "other"}])
+    results = predict.predict_proba(ds=sample_ds, predictor=predictor)
+    return results[0]["prediction"]