example-code-2e/08-def-type-hints/charindex.py
2021-01-31 22:48:38 +01:00

36 lines
1.1 KiB
Python

"""
``name_index`` builds an inverted index mapping words to sets of Unicode
characters which contain that word in their names. For example::

    >>> index = name_index(32, 65)
    >>> sorted(index['SIGN'])
    ['#', '$', '%', '+', '<', '=', '>']
    >>> sorted(index['DIGIT'])
    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    >>> index['DIGIT'] & index['EIGHT']
    {'8'}

"""
# tag::CHARINDEX[]
import sys
import re
import unicodedata
from typing import Dict, Set, Iterator
# Matches one run of consecutive word characters (``\w+``); used to split
# character names into individual words.
RE_WORD = re.compile(r'\w+')
# One past the highest valid code point, so it can serve as an exclusive
# upper bound for ``range``.
STOP_CODE = sys.maxunicode + 1
def tokenize(text: str) -> Iterator[str]:  # <1>
    """Lazily yield every word in ``text``, converted to uppercase.

    Words are the successive matches of the module-level ``RE_WORD``
    pattern, produced in the order they occur in ``text``.
    """
    yield from (found.group().upper() for found in RE_WORD.finditer(text))
def name_index(start: int = 32, end: int = STOP_CODE) -> Dict[str, Set[str]]:
    """Build an inverted index from name words to characters.

    Each code point in ``range(start, end)`` is converted to a character
    and looked up with ``unicodedata.name``. Characters for which the
    lookup returns the empty-string default are skipped. Every word
    produced by ``tokenize`` for a character's name becomes a key in the
    result, mapped to the set of all characters whose names contain
    that word.
    """
    index: Dict[str, Set[str]] = {}  # <2>
    for code in range(start, end):
        char = chr(code)
        if not (name := unicodedata.name(char, '')):  # <3>
            continue  # no name available for this code point
        for word in tokenize(name):
            bucket = index.setdefault(word, set())
            bucket.add(char)
    return index
# end::CHARINDEX[]