ML for Developers
This commit is contained in:
22
deploy/cluster_compute.yaml
Normal file
22
deploy/cluster_compute.yaml
Normal file
@@ -0,0 +1,22 @@
# Anyscale cluster compute configuration for Made With ML.
# Defines the cloud/region plus the head node and autoscaling GPU workers.
cloud: madewithml-us-east-2
region: us-east-2  # fixed: was "us-east2", which is not a valid AWS region name
head_node_type:
  name: head_node_type
  instance_type: m5.2xlarge  # 8 CPU, 0 GPU, 32 GB RAM
worker_node_types:
  - name: gpu_worker
    instance_type: g4dn.xlarge  # 4 CPU, 1 GPU, 16 GB RAM
    min_workers: 0  # scale to zero when idle
    max_workers: 1
    use_spot: false  # canonical lowercase YAML boolean (was "False")
aws:
  BlockDeviceMappings:
    - DeviceName: "/dev/sda1"
      Ebs:
        VolumeSize: 500
        DeleteOnTermination: true
  TagSpecifications:
    - ResourceType: instance
      Tags:
        - Key: as-feature-multi-zone
          Value: "true"  # quoted: consumer expects the string "true", not a boolean
12
deploy/cluster_env.yaml
Normal file
12
deploy/cluster_env.yaml
Normal file
@@ -0,0 +1,12 @@
# Anyscale cluster environment: base Ray image plus system and Python
# dependencies installed on every node at build time.
base_image: anyscale/ray:2.6.0-py310-cu118
env_vars: {}  # no build-time environment variables
debian_packages:
  - curl

python:
  pip_packages: []  # project requirements are installed in post_build_cmds instead
  conda_packages: []

post_build_cmds:
  - python3 -m pip install --upgrade pip setuptools wheel
  - python3 -m pip install -r https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/requirements.txt
||||
54
deploy/jobs/workloads.sh
Normal file
54
deploy/jobs/workloads.sh
Normal file
@@ -0,0 +1,54 @@
#!/bin/bash
# End-to-end ML workloads: test data and code, train, evaluate, test the
# model, then persist results and the model registry to S3.
# Requires: GITHUB_USERNAME in the environment (set by the Anyscale job config).
export PYTHONPATH=$PYTHONPATH:$PWD
export RAY_AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE=1
mkdir -p results  # -p: don't fail if the directory already exists (e.g. on re-run)

# Test data
export RESULTS_FILE=results/test_data_results.txt
export DATASET_LOC="https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
pytest --dataset-loc=$DATASET_LOC tests/data --verbose --disable-warnings > $RESULTS_FILE
cat $RESULTS_FILE

# Test code
export RESULTS_FILE=results/test_code_results.txt
python -m pytest tests/code --verbose --disable-warnings > $RESULTS_FILE
cat $RESULTS_FILE

# Train
export EXPERIMENT_NAME="llm"
export RESULTS_FILE=results/training_results.json
export DATASET_LOC="https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
export TRAIN_LOOP_CONFIG='{"dropout_p": 0.5, "lr": 1e-4, "lr_factor": 0.8, "lr_patience": 3}'
python madewithml/train.py \
    --experiment-name "$EXPERIMENT_NAME" \
    --dataset-loc "$DATASET_LOC" \
    --train-loop-config "$TRAIN_LOOP_CONFIG" \
    --num-workers 1 \
    --cpu-per-worker 10 \
    --gpu-per-worker 1 \
    --num-epochs 10 \
    --batch-size 256 \
    --results-fp $RESULTS_FILE

# Get and save run ID
export RUN_ID=$(python -c "import os; from madewithml import utils; d = utils.load_dict(os.getenv('RESULTS_FILE')); print(d['run_id'])")
export RUN_ID_FILE=results/run_id.txt
echo $RUN_ID > $RUN_ID_FILE  # used for serving later

# Evaluate
export RESULTS_FILE=results/evaluation_results.json
export HOLDOUT_LOC="https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/holdout.csv"
python madewithml/evaluate.py \
    --run-id $RUN_ID \
    --dataset-loc $HOLDOUT_LOC \
    --results-fp $RESULTS_FILE

# Test model
export RESULTS_FILE=results/test_model_results.txt  # export added for consistency with the other RESULTS_FILE assignments
pytest --run-id=$RUN_ID tests/model --verbose --disable-warnings > $RESULTS_FILE
cat $RESULTS_FILE

# Save to S3
export MODEL_REGISTRY=$(python -c "from madewithml import config; print(config.MODEL_REGISTRY)")
aws s3 cp $MODEL_REGISTRY s3://madewithml/$GITHUB_USERNAME/mlflow/ --recursive
aws s3 cp results/ s3://madewithml/$GITHUB_USERNAME/results/ --recursive
||||
11
deploy/jobs/workloads.yaml
Normal file
11
deploy/jobs/workloads.yaml
Normal file
@@ -0,0 +1,11 @@
# Anyscale Job configuration: runs deploy/jobs/workloads.sh on the
# madewithml cluster environment and compute config.
name: workloads
project_id: prj_v9izs5t1d6b512ism8c5rkq4wm
cluster_env: madewithml-cluster-env
compute_config: madewithml-cluster-compute
runtime_env:
  working_dir: .
  upload_path: s3://madewithml/GokuMohandas/jobs  # <--- CHANGE USERNAME (case-sensitive)
  env_vars:
    GITHUB_USERNAME: GokuMohandas  # <--- CHANGE USERNAME (case-sensitive)
entrypoint: bash deploy/jobs/workloads.sh
max_retries: 0  # fail immediately; do not re-run the workloads on error
17
deploy/services/serve_model.py
Normal file
17
deploy/services/serve_model.py
Normal file
@@ -0,0 +1,17 @@
import os
import subprocess
import sys

sys.path.append(".")

from madewithml.config import MODEL_REGISTRY  # NOQA: E402
from madewithml.serve import ModelDeployment  # NOQA: E402

# Copy the user's model registry and results from S3 into the local workspace.
github_username = os.environ.get("GITHUB_USERNAME")
if not github_username:
    # Fail fast with a clear message instead of silently building
    # "s3://madewithml/None/..." paths and failing inside the aws CLI.
    raise RuntimeError("GITHUB_USERNAME environment variable must be set")
subprocess.check_output(["aws", "s3", "cp", f"s3://madewithml/{github_username}/mlflow/", str(MODEL_REGISTRY), "--recursive"])
subprocess.check_output(["aws", "s3", "cp", f"s3://madewithml/{github_username}/results/", "./", "--recursive"])

# Entrypoint: bind the Ray Serve deployment to the run ID persisted by the
# training job (first line of run_id.txt, copied down from S3 above).
with open("run_id.txt") as f:  # context manager: original left the file handle open
    run_id = f.readline().strip()
entrypoint = ModelDeployment.bind(run_id=run_id, threshold=0.9)
12
deploy/services/serve_model.yaml
Normal file
12
deploy/services/serve_model.yaml
Normal file
@@ -0,0 +1,12 @@
# Anyscale Service configuration: serves the trained model via Ray Serve
# using the entrypoint bound in deploy/services/serve_model.py.
name: madewithml
project_id: prj_v9izs5t1d6b512ism8c5rkq4wm
cluster_env: madewithml-cluster-env
compute_config: madewithml-cluster-compute
ray_serve_config:
  import_path: deploy.services.serve_model:entrypoint
  runtime_env:
    working_dir: .
    upload_path: s3://madewithml/GokuMohandas/services  # <--- CHANGE USERNAME (case-sensitive)
    env_vars:
      GITHUB_USERNAME: GokuMohandas  # <--- CHANGE USERNAME (case-sensitive)
rollout_strategy: ROLLOUT  # ROLLOUT or IN_PLACE
Reference in New Issue
Block a user