ML for Developers
This commit is contained in:
13
tests/code/conftest.py
Normal file
13
tests/code/conftest.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import pytest
|
||||
|
||||
from madewithml.data import CustomPreprocessor
|
||||
|
||||
|
||||
@pytest.fixture
def dataset_loc():
    """Remote CSV location of the project dataset, shared by the code tests."""
    url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
    return url
|
||||
|
||||
|
||||
@pytest.fixture
def preprocessor():
    """A fresh CustomPreprocessor instance for each test that requests it."""
    fresh_preprocessor = CustomPreprocessor()
    return fresh_preprocessor
|
||||
58
tests/code/test_data.py
Normal file
58
tests/code/test_data.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
from madewithml import data
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def df():
    """Minimal one-row DataFrame with the raw project columns (title/description/tag).

    Note: the rows variable is deliberately NOT named `data`, which would shadow
    the `data` module imported at the top of this file.
    """
    rows = [{"title": "a0", "description": "b0", "tag": "c0"}]
    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def class_to_index():
    """Two-class label-to-index mapping used by the preprocessing tests."""
    return {"c0": 0, "c1": 1}
|
||||
|
||||
|
||||
def test_load_data(dataset_loc):
    """load_data should cap the dataset at the requested number of samples."""
    expected_count = 10
    ds = data.load_data(dataset_loc=dataset_loc, num_samples=expected_count)
    assert ds.count() == expected_count
|
||||
|
||||
|
||||
def test_stratify_split():
    """A 50/50 stratified split of a balanced dataset should mirror per-class counts."""
    n_per_class = 10
    labels = n_per_class * ["c1"] + n_per_class * ["c2"]
    ds = ray.data.from_items([{"target": label} for label in labels])
    train_ds, test_ds = data.stratify_split(ds, stratify="target", test_size=0.5)
    counts = [split.to_pandas().target.value_counts().to_dict() for split in (train_ds, test_ds)]
    # Both halves should contain the same number of each class.
    assert counts[0] == counts[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, stop_words, expected",
    [
        ("hi", [], "hi"),  # no stopwords: text unchanged
        ("hi you", ["you"], "hi"),  # exact stopword removed
        ("hi yous", ["you"], "hi yous"),  # partial match is kept
    ],
)
def test_clean_text(text, stop_words, expected):
    """clean_text should drop exact stopword tokens only."""
    assert data.clean_text(text=text, stopwords=stop_words) == expected
|
||||
|
||||
|
||||
def test_preprocess(df, class_to_index):
    """preprocess should turn the raw frame into ids/masks/targets outputs."""
    assert "text" not in df.columns  # raw frame has no combined text column yet
    outputs = data.preprocess(df, class_to_index=class_to_index)
    expected_keys = {"ids", "masks", "targets"}
    assert set(outputs) == expected_keys
|
||||
|
||||
|
||||
def test_fit_transform(dataset_loc, preprocessor):
    """Fitting on the full dataset should discover all four classes."""
    num_classes = 4
    ds = data.load_data(dataset_loc=dataset_loc)
    preprocessor.fit_transform(ds)
    assert len(preprocessor.class_to_index) == num_classes
|
||||
11
tests/code/test_predict.py
Normal file
11
tests/code/test_predict.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from madewithml import predict
|
||||
|
||||
|
||||
def test_decode():
    """decode should map each index through index_to_class, preserving order."""
    index_to_class = {0: "x", 1: "y"}
    decoded = predict.decode(indices=[0, 1, 1], index_to_class=index_to_class)
    assert decoded == ["x", "y", "y"]
|
||||
|
||||
|
||||
def test_format_prob():
    """format_prob should pair each probability with its class label."""
    index_to_class = {0: "x", 1: "y"}
    formatted = predict.format_prob(prob=[0.1, 0.9], index_to_class=index_to_class)
    assert formatted == {"x": 0.1, "y": 0.9}
|
||||
27
tests/code/test_train.py
Normal file
27
tests/code/test_train.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import utils
|
||||
|
||||
from madewithml import train
|
||||
|
||||
|
||||
@pytest.mark.training
def test_train_model(dataset_loc):
    """Smoke-train for two epochs on a small sample and check the loss decreased."""
    experiment_name = utils.generate_experiment_name(prefix="test_train")
    loop_config = {"dropout_p": 0.5, "lr": 1e-4, "lr_factor": 0.8, "lr_patience": 3}
    result = train.train_model(
        experiment_name=experiment_name,
        dataset_loc=dataset_loc,
        train_loop_config=json.dumps(loop_config),
        num_workers=6,
        cpu_per_worker=1,
        gpu_per_worker=0,
        num_epochs=2,
        num_samples=512,
        batch_size=256,
        results_fp=None,
    )
    # Clean up the tracking experiment before asserting, so a failed assert
    # does not leave the experiment behind.
    utils.delete_experiment(experiment_name=experiment_name)
    losses = result.metrics_dataframe.to_dict()["train_loss"]
    assert losses[0] > losses[1]  # loss decreased between the two epochs
|
||||
37
tests/code/test_tune.py
Normal file
37
tests/code/test_tune.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import utils
|
||||
|
||||
from madewithml import tune
|
||||
|
||||
|
||||
@pytest.mark.training
def test_tune_models(dataset_loc):
    """Run a tiny tuning sweep and check one trial per requested run is recorded."""
    num_runs = 2
    experiment_name = utils.generate_experiment_name(prefix="test_tune")
    seed_params = [
        {
            "train_loop_config": {
                "dropout_p": 0.5,
                "lr": 1e-4,
                "lr_factor": 0.8,
                "lr_patience": 3,
            }
        }
    ]
    results = tune.tune_models(
        experiment_name=experiment_name,
        dataset_loc=dataset_loc,
        initial_params=json.dumps(seed_params),
        num_workers=6,
        cpu_per_worker=1,
        gpu_per_worker=0,
        num_runs=num_runs,
        num_epochs=1,
        num_samples=512,
        batch_size=256,
        results_fp=None,
    )
    # Clean up the tracking experiment before asserting, so a failed assert
    # does not leave the experiment behind.
    utils.delete_experiment(experiment_name=experiment_name)
    assert len(results.get_dataframe()) == num_runs
|
||||
61
tests/code/test_utils.py
Normal file
61
tests/code/test_utils.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from madewithml import utils
|
||||
|
||||
|
||||
def test_set_seed():
    """Re-seeding with set_seeds should make the NumPy RNG stream reproducible."""
    utils.set_seeds()
    first_draws = [np.random.randn(2, 3) for _ in range(2)]
    utils.set_seeds()
    second_draws = [np.random.randn(2, 3) for _ in range(2)]
    for before, after in zip(first_draws, second_draws):
        assert np.array_equal(before, after)
|
||||
|
||||
|
||||
def test_save_and_load_dict():
    """A dict saved with save_dict should round-trip through load_dict."""
    original = {"hello": "world"}
    with tempfile.TemporaryDirectory() as tmp_dir:
        json_fp = Path(tmp_dir, "d.json")
        utils.save_dict(d=original, path=json_fp)
        loaded = utils.load_dict(path=json_fp)
        assert loaded["hello"] == "world"
|
||||
|
||||
|
||||
def test_pad_array():
    """pad_array should right-pad ragged rows with zeros to a uniform length."""
    ragged = np.array([[1, 2], [1, 2, 3]], dtype="object")
    expected = np.array([[1, 2, 0], [1, 2, 3]])
    assert np.array_equal(utils.pad_array(ragged), expected)
|
||||
|
||||
|
||||
def test_collate_fn():
    """collate_fn should pad ragged fields and convert every field to a tensor."""
    batch = {
        "ids": np.array([[1, 2], [1, 2, 3]], dtype="object"),
        "masks": np.array([[1, 1], [1, 1, 1]], dtype="object"),
        "targets": np.array([3, 1]),
    }
    processed = utils.collate_fn(batch)
    expected = {
        "ids": torch.tensor([[1, 2, 0], [1, 2, 3]], dtype=torch.int32),
        "masks": torch.tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.int32),
        "targets": torch.tensor([3, 1], dtype=torch.int64),
    }
    for key, expected_tensor in expected.items():
        assert torch.allclose(processed[key], expected_tensor)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "d, keys, expected",
    [
        # All keys kept: full transpose of the dict of lists.
        ({"a": [1, 2], "b": [1, 2]}, ["a", "b"], [{"a": 1, "b": 1}, {"a": 2, "b": 2}]),
        # Subset of keys: only the requested keys appear in each row dict.
        ({"a": [1, 2], "b": [1, 2]}, ["a"], [{"a": 1}, {"a": 2}]),
    ],
)
def test_dict_to_list(d, keys, expected):
    """dict_to_list should transpose a dict of lists into a list of dicts.

    The expected-value parameter was renamed from `list`, which shadowed the builtin.
    """
    assert utils.dict_to_list(d, keys=keys) == expected
|
||||
13
tests/code/utils.py
Normal file
13
tests/code/utils.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import uuid
|
||||
|
||||
from madewithml.config import mlflow
|
||||
|
||||
|
||||
def generate_experiment_name(prefix: str = "test") -> str:
    """Return a unique experiment name of the form '<prefix>-<8 hex chars>'."""
    unique_suffix = uuid.uuid4().hex[:8]
    return f"{prefix}-{unique_suffix}"
|
||||
|
||||
|
||||
def delete_experiment(experiment_name: str) -> None:
    """Delete the MLflow experiment with the given name.

    Args:
        experiment_name: name of the experiment to delete.

    Raises:
        ValueError: if no experiment with ``experiment_name`` exists.
    """
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    # get_experiment_by_name returns None when the experiment does not exist;
    # fail with a clear message instead of an AttributeError on None.
    if experiment is None:
        raise ValueError(f"Experiment '{experiment_name}' not found.")
    client.delete_experiment(experiment_id=experiment.experiment_id)
|
||||
Reference in New Issue
Block a user