ML for Developers

This commit is contained in:
GokuMohandas
2023-07-26 04:53:11 -07:00
commit 776a75b010
54 changed files with 55464 additions and 0 deletions

17
tests/data/conftest.py Normal file
View File

@@ -0,0 +1,17 @@
import great_expectations as ge
import pandas as pd
import pytest
def pytest_addoption(parser):
    """Add option to specify dataset location when executing tests from CLI.

    Registers ``--dataset-loc`` as a stored option whose value defaults to
    ``None`` when it is not supplied on the command line.

    Ex: pytest --dataset-loc=$DATASET_LOC tests/data --verbose --disable-warnings

    Args:
        parser: Option parser passed in by pytest; only its ``addoption``
            method is used here.
    """
    parser.addoption("--dataset-loc", action="store", default=None, help="Dataset location.")
@pytest.fixture(scope="module")
def df(request):
    """Load the dataset under test as a Great Expectations pandas dataset.

    The fixture is module-scoped, so the CSV is read once per test module.

    Args:
        request: pytest fixture request; used to read the ``--dataset-loc``
            command-line option.

    Returns:
        A ``ge.dataset.PandasDataset`` wrapping the DataFrame read from the
        CSV at the given location.
    """
    dataset_loc = request.config.getoption("--dataset-loc")
    if dataset_loc is None:
        # Without a location pd.read_csv would raise an opaque error about an
        # invalid path/buffer; stop with an actionable message instead.
        pytest.fail(
            "Dataset location is required: run pytest with --dataset-loc=<path or URL of the CSV>.",
            pytrace=False,
        )
    # Use a distinct local name so the fixture function `df` is not shadowed.
    dataset = ge.dataset.PandasDataset(pd.read_csv(dataset_loc))
    return dataset

View File

@@ -0,0 +1,15 @@
def test_dataset(df):
    """Test dataset quality and integrity.

    Registers a series of expectations on the dataset, then validates the
    accumulated expectation suite and asserts that it succeeded as a whole.

    Args:
        df: Dataset fixture exposing the Great Expectations ``expect_*``,
            ``get_expectation_suite`` and ``validate`` methods.
    """
    column_list = ["id", "created_on", "title", "description", "tag"]
    df.expect_table_columns_to_match_ordered_list(column_list=column_list)  # schema adherence
    tags = ["computer-vision", "natural-language-processing", "mlops", "other"]
    df.expect_column_values_to_be_in_set(column="tag", value_set=tags)  # expected labels
    df.expect_compound_columns_to_be_unique(column_list=["title", "description"])  # data leaks
    df.expect_column_values_to_not_be_null(column="tag")  # missing values
    df.expect_column_values_to_be_unique(column="id")  # unique values
    df.expect_column_values_to_be_of_type(column="title", type_="str")  # type adherence

    # Expectation suite: keep failed expectations so that validation reports them.
    expectation_suite = df.get_expectation_suite(discard_failed_expectations=False)
    results = df.validate(expectation_suite=expectation_suite, only_return_failures=True).to_json_dict()

    # Only failures were requested above, so surface them when the suite fails.
    assert results["success"], f"Failed expectations: {results.get('results')}"