import re
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import ray
from ray.data import Dataset
from ray.data.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from madewithml.config import STOPWORDS


def load_data(dataset_loc: str, num_samples: Optional[int] = None) -> Dataset:
    """Load data from source into a Ray Dataset.

    Args:
        dataset_loc (str): Location of the dataset.
        num_samples (int, optional): The number of samples to load. Defaults to None.

    Returns:
        Dataset: Our dataset represented by a Ray Dataset.
    """
    ds = ray.data.read_csv(dataset_loc)
    ds = ds.random_shuffle(seed=1234)
    ds = ray.data.from_items(ds.take(num_samples)) if num_samples else ds
    return ds

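# Illustrative usage of load_data (added for clarity, not part of the original module).
# The CSV location and sample count below are assumptions; anything ray.data.read_csv
# accepts (local path, S3 URI, HTTPS URL) will work:
#
#   ds = load_data(dataset_loc="datasets/dataset.csv", num_samples=1000)
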
def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test splits, preserving the
    proportion of data points from each class in the column we
    want to stratify on.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of column to split on.
        test_size (float): Proportion of dataset to split for test set.
        shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
        seed (int, optional): seed for shuffling. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: the stratified train and test datasets.
    """

    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Naively split a dataframe into train and test splits.
        Add a column specifying whether it's the train or test split."""
        train, test = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
        train["_split"] = "train"
        test["_split"] = "test"
        return pd.concat([train, test])

    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Filter by data points that match the split column's value
        and return the dataframe with the _split column dropped."""
        return df[df["_split"] == split].drop("_split", axis=1)

    # Train, test split with stratify
    grouped = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")  # group by each unique value in the column we want to stratify on
    train_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")  # combine
    test_ds = grouped.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")  # combine

    # Shuffle each split (required)
    train_ds = train_ds.random_shuffle(seed=seed)
    test_ds = test_ds.random_shuffle(seed=seed)

    return train_ds, test_ds

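# Illustrative usage of stratify_split (added for clarity, not part of the original module).
# The 80/20 split and the "tag" column below are assumptions:
#
#   train_ds, test_ds = stratify_split(ds, stratify="tag", test_size=0.2)
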
def clean_text(text: str, stopwords: List = STOPWORDS) -> str:
    """Clean raw text string.

    Args:
        text (str): Raw text to clean.
        stopwords (List, optional): list of words to filter out. Defaults to STOPWORDS.

    Returns:
        str: cleaned text.
    """
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)

    # Spacing and filters
    text = re.sub(r"http\S+", "", text)  # remove links (before punctuation is stripped, so URLs are removed whole)
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing around punctuation
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non-alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip whitespace at the ends

    return text

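# Illustrative behavior of clean_text (added for clarity, not part of the original module),
# assuming STOPWORDS contains common English stopwords such as "with" and "for":
#
#   clean_text("Transfer learning with transformers for text classification.")
#   # -> "transfer learning transformers text classification"
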
def tokenize(batch: Dict) -> Dict:
    """Tokenize the text input in our batch using a tokenizer.

    Args:
        batch (Dict): batch of data with the text inputs to tokenize.

    Returns:
        Dict: batch of data with the results of tokenization (`input_ids` and `attention_mask`) on the text inputs.
    """
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))

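# Illustrative usage of tokenize (added for clarity, not part of the original module).
# The toy batch below is an assumption; in practice the batch comes from preprocess():
#
#   batch = pd.DataFrame({"text": ["transfer learning with transformers"], "tag": [2]})
#   tokenize(batch).keys()  # -> dict_keys(['ids', 'masks', 'targets'])
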
def preprocess(df: pd.DataFrame, class_to_index: Dict) -> Dict:
    """Preprocess the data in our dataframe.

    Args:
        df (pd.DataFrame): Raw dataframe to preprocess.
        class_to_index (Dict): Mapping of class names to indices.

    Returns:
        Dict: preprocessed data (ids, masks, targets).
    """
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs

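# Illustrative usage of preprocess (added for clarity, not part of the original module).
# The toy dataframe and class_to_index mapping are assumptions matching this project's schema:
#
#   df = pd.DataFrame({"title": ["BERT"], "description": ["Transformers for NLP."], "tag": ["nlp"]})
#   preprocess(df, class_to_index={"nlp": 0})  # -> {"ids": ..., "masks": ..., "targets": ...}
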
class CustomPreprocessor(Preprocessor):
    """Custom preprocessor class."""

    def _fit(self, ds):
        tags = ds.unique(column="tag")
        self.class_to_index = {tag: i for i, tag in enumerate(tags)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        return self  # return the fitted preprocessor so Preprocessor.fit() hands it back

    def _transform_pandas(self, batch):  # could also do _transform_numpy
        return preprocess(batch, class_to_index=self.class_to_index)
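
# End-to-end sketch (added for illustration, not part of the original module). It shows how
# these pieces are typically composed; the dataset location is an assumption and should point
# to a CSV with id/created_on/title/description/tag columns.
if __name__ == "__main__":  # pragma: no cover
    ray.init(ignore_reinit_error=True)
    ds = load_data(dataset_loc="datasets/dataset.csv", num_samples=1000)  # hypothetical path
    train_ds, test_ds = stratify_split(ds, stratify="tag", test_size=0.2)
    preprocessor = CustomPreprocessor()
    preprocessor = preprocessor.fit(train_ds)
    train_ds = preprocessor.transform(train_ds)
    test_ds = preprocessor.transform(test_ds)
    print(train_ds.take(1))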