renamed 22-async/
This commit is contained in:
89
22-async/mojifinder/charindex.py
Executable file
89
22-async/mojifinder/charindex.py
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Class ``InvertedIndex`` builds an inverted index mapping each word to
|
||||
the set of Unicode characters which contain that word in their names.
|
||||
|
||||
Optional arguments to the constructor are ``first`` and ``last+1`` character
|
||||
codes to index, to make testing easier.
|
||||
|
||||
In the example below, only the ASCII range was indexed::
|
||||
|
||||
>>> idx = InvertedIndex(32, 128)
|
||||
>>> sorted(idx.entries['SIGN'])
|
||||
['#', '$', '%', '+', '<', '=', '>']
|
||||
>>> sorted(idx.entries['DIGIT'])
|
||||
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
|
||||
>>> idx.entries['DIGIT'] & idx.entries['EIGHT']
|
||||
{'8'}
|
||||
>>> idx.search('digit')
|
||||
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
|
||||
>>> idx.search('eight digit')
|
||||
['8']
|
||||
>>> idx.search('a letter')
|
||||
['A', 'a']
|
||||
>>> idx.search('a letter capital')
|
||||
['A']
|
||||
>>> idx.search('borogove')
|
||||
[]
|
||||
|
||||
"""
|
||||
|
||||
import sys
|
||||
import unicodedata
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterator
|
||||
|
||||
STOP_CODE: int = sys.maxunicode + 1
|
||||
|
||||
Char = str
|
||||
Index = defaultdict[str, set[Char]]
|
||||
|
||||
|
||||
def tokenize(text: str) -> Iterator[str]:
|
||||
"""return iterator of uppercased words"""
|
||||
for word in text.upper().replace('-', ' ').split():
|
||||
yield word
|
||||
|
||||
|
||||
class InvertedIndex:
|
||||
entries: Index
|
||||
|
||||
def __init__(self, start: int = 32, stop: int = STOP_CODE):
|
||||
entries: Index = defaultdict(set)
|
||||
for char in (chr(i) for i in range(start, stop)):
|
||||
name = unicodedata.name(char, '')
|
||||
if name:
|
||||
for word in tokenize(name):
|
||||
entries[word].add(char)
|
||||
self.entries = entries
|
||||
|
||||
def search(self, query: str) -> list[Char]:
|
||||
if words := list(tokenize(query)):
|
||||
first = self.entries[words[0]]
|
||||
result = first.intersection(*(self.entries[w] for w in words[1:]))
|
||||
return sorted(result)
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
def format_results(chars: list[Char]) -> Iterator[str]:
|
||||
for char in chars:
|
||||
name = unicodedata.name(char)
|
||||
code = ord(char)
|
||||
yield f'U+{code:04X}\t{char}\t{name}'
|
||||
|
||||
|
||||
def main(words: list[str]) -> None:
|
||||
if not words:
|
||||
print('Please give one or more words to search.')
|
||||
sys.exit()
|
||||
index = InvertedIndex()
|
||||
chars = index.search(' '.join(words))
|
||||
for line in format_results(chars):
|
||||
print(line)
|
||||
print('─' * 66, f'{len(chars)} found')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
||||
Reference in New Issue
Block a user