renamed 22-async/

2021-03-11 16:11:39 -03:00
parent 7c88948b22
commit e1cd63aa04
15 changed files with 8 additions and 8 deletions
--- a/22-async/mojifinder/charindex.py
+++ b/22-async/mojifinder/charindex.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+"""
+Class ``InvertedIndex`` builds an inverted index mapping each word to
+the set of Unicode characters which contain that word in their names.
+
+Optional arguments to the constructor are ``first`` and ``last+1`` character
+codes to index, to make testing easier.
+
+In the example below, only the ASCII range was indexed::
+
+    >>> idx = InvertedIndex(32, 128)
+    >>> sorted(idx.entries['SIGN'])
+    ['#', '$', '%', '+', '<', '=', '>']
+    >>> sorted(idx.entries['DIGIT'])
+    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+    >>> idx.entries['DIGIT'] & idx.entries['EIGHT']
+    {'8'}
+    >>> idx.search('digit')
+    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+    >>> idx.search('eight digit')
+    ['8']
+    >>> idx.search('a letter')
+    ['A', 'a']
+    >>> idx.search('a letter capital')
+    ['A']
+    >>> idx.search('borogove')
+    []
+
+"""
+
+import sys
+import unicodedata
+from collections import defaultdict
+from collections.abc import Iterator
+
+STOP_CODE: int = sys.maxunicode + 1
+
+Char = str
+Index = defaultdict[str, set[Char]]
+
+
+def tokenize(text: str) -> Iterator[str]:
+    """return iterator of uppercased words"""
+    for word in text.upper().replace('-', ' ').split():
+        yield word
+
+
+class InvertedIndex:
+    entries: Index
+
+    def __init__(self, start: int = 32, stop: int = STOP_CODE):
+        entries: Index = defaultdict(set)
+        for char in (chr(i) for i in range(start, stop)):
+            name = unicodedata.name(char, '')
+            if name:
+                for word in tokenize(name):
+                    entries[word].add(char)
+        self.entries = entries
+
+    def search(self, query: str) -> list[Char]:
+        if words := list(tokenize(query)):
+            first = self.entries[words[0]]
+            result = first.intersection(*(self.entries[w] for w in words[1:]))
+            return sorted(result)
+        else:
+            return []
+
+
+def format_results(chars: list[Char]) -> Iterator[str]:
+    for char in chars:
+        name = unicodedata.name(char)
+        code = ord(char)
+        yield f'U+{code:04X}\t{char}\t{name}'
+
+
+def main(words: list[str]) -> None:
+    if not words:
+        print('Please give one or more words to search.')
+        sys.exit()
+    index = InvertedIndex()
+    chars = index.search(' '.join(words))
+    for line in format_results(chars):
+        print(line)
+    print('─' * 66, f'{len(chars)} found')
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])