"""
|
|
``name_index`` builds an inverted index mapping words to sets of Unicode
|
|
characters which contain that word in their names. For example::
|
|
|
|
>>> index = name_index(32, 65)
|
|
>>> sorted(index['SIGN'])
|
|
['#', '$', '%', '+', '<', '=', '>']
|
|
>>> sorted(index['DIGIT'])
|
|
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
|
|
>>> index['DIGIT'] & index['EIGHT']
|
|
{'8'}
|
|
"""
|
|
|
|
# tag::CHARINDEX[]
import sys
import re
import unicodedata
from collections.abc import Iterator

RE_WORD = re.compile(r'\w+')
STOP_CODE = sys.maxunicode + 1  # one past the highest Unicode code point


def tokenize(text: str) -> Iterator[str]:  # <1>
    """return iterable of uppercased words"""
    for match in RE_WORD.finditer(text):
        yield match.group().upper()


def name_index(start: int = 32, end: int = STOP_CODE) -> dict[str, set[str]]:
    index: dict[str, set[str]] = {}  # <2>
    for char in (chr(i) for i in range(start, end)):
        if name := unicodedata.name(char, ''):  # <3>
            for word in tokenize(name):
                # create the entry on first sight of a word, then add the char
                index.setdefault(word, set()).add(char)
    return index
# end::CHARINDEX[]
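
# A minimal usage sketch, not part of the original example: running the
# module directly checks the doctests in the module docstring and prints a
# sample query. The range and the words 'LATIN' and 'SMALL' are illustrative
# assumptions, chosen so the demo is fast and its output predictable.
if __name__ == '__main__':
    import doctest

    doctest.testmod()  # verify the examples in the module docstring
    demo = name_index(32, 128)  # ASCII range only, to keep the demo quick
    # intersect two word sets to find characters whose names contain both
    print(sorted(demo['LATIN'] & demo['SMALL']))  # the letters a-z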