update from O'Reilly repo

This commit is contained in:
Luciano Ramalho
2021-03-22 12:24:21 -03:00
parent e1cd63aa04
commit 2f8bf06270
28 changed files with 470 additions and 125 deletions

View File

@@ -4,28 +4,28 @@
Class ``InvertedIndex`` builds an inverted index mapping each word to
the set of Unicode characters which contain that word in their names.
Optional arguments to the constructor are ``first`` and ``last+1`` character
codes to index, to make testing easier.
Optional arguments to the constructor are ``first`` and ``last+1``
character codes to index, to make testing easier. In the examples
below, only the ASCII range was indexed.
In the example below, only the ASCII range was indexed::
The `entries` attribute is a `defaultdict` with uppercased single
words as keys::
>>> idx = InvertedIndex(32, 128)
>>> idx.entries['DOLLAR']
{'$'}
>>> sorted(idx.entries['SIGN'])
['#', '$', '%', '+', '<', '=', '>']
>>> sorted(idx.entries['DIGIT'])
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
>>> idx.entries['DIGIT'] & idx.entries['EIGHT']
{'8'}
>>> idx.search('digit')
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
>>> idx.search('eight digit')
['8']
>>> idx.search('a letter')
['A', 'a']
>>> idx.search('a letter capital')
['A']
>>> idx.search('borogove')
[]
>>> idx.entries['A'] & idx.entries['SMALL']
{'a'}
>>> idx.entries['BRILLIG']
set()
The `.search()` method takes a string, uppercases it, splits it into
words, and returns the intersection of the entries for each word::
>>> idx.search('capital a')
{'A'}
"""
@@ -58,17 +58,16 @@ class InvertedIndex:
entries[word].add(char)
self.entries = entries
def search(self, query: str) -> list[Char]:
def search(self, query: str) -> set[Char]:
if words := list(tokenize(query)):
first = self.entries[words[0]]
result = first.intersection(*(self.entries[w] for w in words[1:]))
return sorted(result)
found = self.entries[words[0]]
return found.intersection(*(self.entries[w] for w in words[1:]))
else:
return []
return set()
def format_results(chars: list[Char]) -> Iterator[str]:
for char in chars:
def format_results(chars: set[Char]) -> Iterator[str]:
for char in sorted(chars):
name = unicodedata.name(char)
code = ord(char)
yield f'U+{code:04X}\t{char}\t{name}'
@@ -77,7 +76,7 @@ def format_results(chars: list[Char]) -> Iterator[str]:
def main(words: list[str]) -> None:
if not words:
print('Please give one or more words to search.')
sys.exit()
sys.exit(2) # command line usage error
index = InvertedIndex()
chars = index.search(' '.join(words))
for line in format_results(chars):