# example-code-2e/08-def-type-hints/charindex.py

"""
``name_index`` builds an inverted index mapping words to sets of Unicode
characters which contain that word in their names. For example::

    >>> index = name_index(32, 65)
    >>> sorted(index['SIGN'])
    ['#', '$', '%', '+', '<', '=', '>']
    >>> sorted(index['DIGIT'])
    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    >>> index['DIGIT'] & index['EIGHT']
    {'8'}
"""

# tag::CHARINDEX[]
import sys
import re
import unicodedata
from typing import Dict, Set, Iterator

# Matches one maximal run of word characters (``\w``); used by ``tokenize``.
RE_WORD = re.compile(r'\w+')
# One past the largest Unicode code point, so it can serve as the exclusive
# upper bound of a ``range`` covering every code point.
STOP_CODE = sys.maxunicode + 1

def tokenize(text: str) -> Iterator[str]:  # <1>
    """Yield every word matched by ``RE_WORD`` in ``text``, in uppercase.

    Words are produced lazily, in the order they appear in ``text``.
    """
    yield from (found.group().upper() for found in RE_WORD.finditer(text))

def name_index(start: int = 32, end: int = STOP_CODE) -> Dict[str, Set[str]]:
    """Build an inverted index from words to the characters named with them.

    Every code point in ``range(start, end)`` is looked up with
    ``unicodedata.name``; each word that ``tokenize`` extracts from a
    character's name becomes a key whose value is the set of characters
    having that word in their names.
    """
    index: Dict[str, Set[str]] = {}  # <2>
    for code in range(start, end):
        char = chr(code)
        if not (name := unicodedata.name(char, '')):  # <3>
            # The lookup fell back to the '' default: no name to index.
            continue
        for word in tokenize(name):
            chars_with_word = index.setdefault(word, set())
            chars_with_word.add(char)
    return index
# end::CHARINDEX[]