example-code-2e/21-async/mojifinder/charindex.py

89 lines
2.4 KiB
Python
Raw Normal View History

2021-02-24 22:55:55 +01:00
#!/usr/bin/env python
"""
Class ``InvertedIndex`` builds an inverted index mapping each word to
the set of Unicode characters which contain that word in their names.
2021-03-22 16:24:21 +01:00
Optional arguments to the constructor are ``first`` and ``last+1``
character codes to index, to make testing easier. In the examples
below, only the ASCII range was indexed.
2021-02-24 22:55:55 +01:00
2021-03-22 16:24:21 +01:00
The `entries` attribute is a `defaultdict` with uppercased single
words as keys::
2021-02-24 22:55:55 +01:00
>>> idx = InvertedIndex(32, 128)
2021-03-22 16:24:21 +01:00
>>> idx.entries['DOLLAR']
{'$'}
2021-02-24 22:55:55 +01:00
>>> sorted(idx.entries['SIGN'])
['#', '$', '%', '+', '<', '=', '>']
2021-03-22 16:24:21 +01:00
>>> idx.entries['A'] & idx.entries['SMALL']
{'a'}
>>> idx.entries['BRILLIG']
set()
The `.search()` method takes a string, uppercases it, splits it into
words, and returns the intersection of the entries for each word::
>>> idx.search('capital a')
{'A'}
2021-02-24 22:55:55 +01:00
"""
import sys
import unicodedata
from collections import defaultdict
from collections.abc import Iterator
# One past the highest valid code point; used as the exclusive upper bound
# (default `stop`) when InvertedIndex indexes the whole Unicode range.
STOP_CODE: int = sys.maxunicode + 1
# Alias for `str`, used in annotations where a value is a single character.
Char = str
# Type of the inverted index: uppercased word -> set of characters.
Index = defaultdict[str, set[Char]]
def tokenize(text: str) -> Iterator[str]:
    """Yield the words of `text` in uppercase.

    Hyphens are treated as word separators, so ``'HYPHEN-MINUS'``
    produces ``'HYPHEN'`` and ``'MINUS'``. Words are otherwise split on
    runs of whitespace; an empty or blank `text` yields nothing.
    """
    normalized = text.upper().replace('-', ' ')
    yield from normalized.split()
class InvertedIndex:
    """Inverted index from uppercased words to Unicode characters.

    Each key of `entries` is a single word taken from a character name
    (as produced by `tokenize`); the value is the set of characters
    whose names contain that word.
    """

    # word -> characters whose Unicode name contains that word
    entries: Index

    def __init__(self, start: int = 32, stop: int = STOP_CODE):
        """Index the characters with code points in ``range(start, stop)``.

        Code points without a Unicode name are skipped.
        """
        entries: Index = defaultdict(set)
        for char in map(chr, range(start, stop)):
            # The '' default makes unnamed code points falsy instead of
            # raising ValueError.
            name = unicodedata.name(char, '')
            if name:
                for word in tokenize(name):
                    entries[word].add(char)
        self.entries = entries

    def search(self, query: str) -> set[Char]:
        """Return the characters whose names contain every word in `query`.

        The query is tokenized like the character names (uppercased,
        hyphens treated as spaces). An empty query, or a query containing
        a word that is not in the index, returns an empty set. The result
        is always a new set, never one of the sets stored in `entries`.
        """
        words = list(tokenize(query))
        if not words:
            return set()
        # Look words up with .get() rather than indexing: `entries` is a
        # defaultdict, so `entries[word]` would insert an empty set for
        # every unknown query word and the index would grow with each
        # search.
        matches = [self.entries.get(word) for word in words]
        if not all(matches):
            # A missing (None) or empty entry makes the intersection empty.
            return set()
        first, *rest = matches
        return first.intersection(*rest)
2021-02-24 22:55:55 +01:00
2021-03-22 16:24:21 +01:00
def format_results(chars: set[Char]) -> Iterator[str]:
    """Yield one display line per character in `chars`, in sorted order.

    Each line has three tab-separated fields: the code point as
    ``U+XXXX`` (uppercase hex, at least four digits), the character
    itself, and its Unicode name. A character without a Unicode name
    makes `unicodedata.name` raise ValueError.
    """
    ordered = sorted(chars)
    for char in ordered:
        name, code = unicodedata.name(char), ord(char)
        yield f'U+{code:04X}\t{char}\t{name}'
def main(words: list[str]) -> None:
    """Search the full Unicode index for `words` and print the matches.

    The words are joined into a single query, so a character is listed
    only if its name contains all of them. One line per match is
    printed, followed by a separator rule and the number of matches.

    If `words` is empty, a usage message is printed and the process
    exits with status 2.
    """
    if not words:
        print('Please give one or more words to search.')
        sys.exit(2)  # command line usage error
    index = InvertedIndex()
    chars = index.search(' '.join(words))
    for line in format_results(chars):
        print(line)
    # Horizontal rule before the count. Multiplying an empty string here
    # would print nothing but a stray leading space.
    print('─' * 66, f'{len(chars)} found')
# Script entry point: every command-line argument is passed to main() as a
# search word.
if __name__ == '__main__':
    main(sys.argv[1:])