import re
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import ray
from ray.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from madewithml.config import STOPWORDS


def load_data(dataset_loc: str, num_samples: Optional[int] = None) -> Dataset:
    """Load data from source into a Ray Dataset.

    Args:
        dataset_loc (str): Location of the dataset.
        num_samples (Optional[int], optional): The number of samples to load. Defaults to None (load all).

    Returns:
        Dataset: Our dataset represented by a Ray Dataset.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)
    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
    return ds


def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test splits with equal proportions of data points
    from each class in the column we want to stratify on.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of the column to stratify on.
        test_size (float): Proportion of the dataset to allocate to the test split.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
        seed (int, optional): Seed for shuffling. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: The stratified train and test datasets.
    """

    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Naively split a dataframe into train and test splits.
        Add a column specifying whether it's the train or test split."""
        train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
        train["_split"] = "train"
        test["_split"] = "test"
        return pd.concat([train, test])

    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Filter by data points that match the split column's value
        and return the dataframe with the _split column dropped."""
        return df[df["_split"] == split].drop("_split", axis=1)

    # Train, test split with stratify
    grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")  # group by each unique value in the column we want to stratify on
    train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")  # combine
    test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")  # combine

    # Shuffle each split (required)
    train_ds = train_ds.random_shuffle(seed=seed)
    test_ds = test_ds.random_shuffle(seed=seed)

    return train_ds, test_ds


def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
    """Clean raw text string.

    Args:
        text (str): Raw text to clean.
        stopwords (List, optional): List of words to filter out. Defaults to STOPWORDS.

    Returns:
        str: Cleaned text.
    """
    # Lower
    text = text.lower()

    # Remove links (done before punctuation is stripped so full URLs can still be matched)
    text = re.sub(r"http\S+", "", text)

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)

    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing around punctuation
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non-alphanumeric chars
    text = re.sub(" +", " ", text)  # collapse multiple spaces
    text = text.strip()  # strip whitespace at the ends

    return text
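
# Illustrative sketch of clean_text's behavior, assuming STOPWORDS contains
# common English stopwords such as "with" and "for" (the exact output depends
# on the contents of madewithml.config.STOPWORDS):
#
#   clean_text("Transfer learning with transformers for NLP!")
#   # -> "transfer learning transformers nlp"
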
def tokenize(batch: Dict) -> Dict:
    """Tokenize the text input in our batch using a tokenizer.

    Args:
        batch (Dict): Batch of data with the text inputs to tokenize.

    Returns:
        Dict: Batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
    """
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))


def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
    """Preprocess the data in our dataframe.

    Args:
        df (pd.DataFrame): Raw dataframe to preprocess.
        class_to_index (Dict): Mapping of class names to indices.

    Returns:
        Dict: Preprocessed data (ids, masks, targets).
    """
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs


class CustomPreprocessor:
    """Custom preprocessor class."""

    def __init__(self, class_to_index: Optional[Dict] = None):
        self.class_to_index = class_to_index or {}  # default to None to avoid a mutable default argument
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}

    def fit(self, ds: Dataset):
        """Learn the class-to-index mapping from the unique tags in the dataset."""
        tags = ds.unique(column="tag")
        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        return self

    def transform(self, ds: Dataset) -> Dataset:
        """Apply preprocessing (with the fitted mapping) to every batch in the dataset."""
        return ds.map_batches(preprocess, fn_kwargs={"class_to_index": self.class_to_index}, batch_format="pandas")
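
# Example end-to-end usage (illustrative sketch, not part of the module; the
# path "datasets/dataset.csv" is hypothetical and assumed to contain "title",
# "description", and "tag" columns, matching what preprocess() expects):
#
#   ds = load_data("datasets/dataset.csv", num_samples=1000)
#   train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=0.2)
#   preprocessor = CustomPreprocessor().fit(train_ds)
#   train_ds = preprocessor.transform(train_ds)
#   val_ds = preprocessor.transform(val_ds)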