#!/usr/bin/env python3 """ Unicode character finder utility: find characters based on words in their official names. This can be used from the command line, just pass words as arguments. Here is the ``main`` function which makes it happen:: >>> main('rook') # doctest: +NORMALIZE_WHITESPACE U+2656 ♖ WHITE CHESS ROOK U+265C ♜ BLACK CHESS ROOK (2 matches for 'rook') >>> main('rook', 'black') # doctest: +NORMALIZE_WHITESPACE U+265C ♜ BLACK CHESS ROOK (1 match for 'rook black') >>> main('white bishop') # doctest: +NORMALIZE_WHITESPACE U+2657 ♗ WHITE CHESS BISHOP (1 match for 'white bishop') >>> main("jabberwocky's vest") (No match for "jabberwocky's vest") For exploring words that occur in the character names, there is the ``word_report`` function:: >>> index = UnicodeNameIndex(sample_chars) >>> index.word_report() 3 SIGN 2 A 2 EURO 2 LATIN 2 LETTER 1 CAPITAL 1 CURRENCY 1 DOLLAR 1 SMALL >>> index = UnicodeNameIndex() >>> index.word_report(10) 75821 CJK 75761 IDEOGRAPH 74656 UNIFIED 13196 SYLLABLE 11735 HANGUL 7616 LETTER 2232 WITH 2180 SIGN 2122 SMALL 1709 CAPITAL Note: characters with names starting with 'CJK UNIFIED IDEOGRAPH' are indexed with those three words only, excluding the hexadecimal codepoint at the end of the name. """ import sys import re import unicodedata import pickle import warnings RE_WORD = re.compile('\w+') INDEX_NAME = 'charfinder_index.pickle' MINIMUM_SAVE_LEN = 10000 CJK_PREFIX = 'CJK UNIFIED IDEOGRAPH' sample_chars = [ '$', # DOLLAR SIGN 'A', # LATIN CAPITAL LETTER A 'a', # LATIN SMALL LETTER A '\u20a0', # EURO-CURRENCY SIGN '\u20ac', # EURO SIGN ] def tokenize(text): """return iterable of uppercased words""" for match in RE_WORD.finditer(text): yield match.group().upper() class UnicodeNameIndex: def __init__(self, chars=None): self.load(chars) def load(self, chars=None): self.index = None if chars is None: try: with open(INDEX_NAME, 'rb') as fp: self.index = pickle.load(fp) except OSError: pass if self.index is None: self.build_index(chars) if len(self.index) > MINIMUM_SAVE_LEN: try: self.save() except OSError as exc: warnings.warn('Could not save {!r}: {}' .format(INDEX_NAME, exc)) def save(self): with open(INDEX_NAME, 'wb') as fp: pickle.dump(self.index, fp) def build_index(self, chars=None): if chars is None: chars = (chr(i) for i in range(32, sys.maxunicode)) index = {} for char in chars: try: name = unicodedata.name(char) except ValueError: continue if name.startswith(CJK_PREFIX): name = CJK_PREFIX code = ord(char) for word in tokenize(name): index.setdefault(word, set()).add(code) self.index = index def __len__(self): return len(self.index) def word_rank(self, top=None): res = [(len(self.index[key]), key) for key in self.index] res.sort(key=lambda item: (-item[0], item[1])) if top is not None: res = res[:top] return res def word_report(self, top=None): for postings, key in self.word_rank(top): print('{:5} {}'.format(postings, key)) def find_codes(self, query): result_sets = [] for word in tokenize(query): if word in self.index: result_sets.append(self.index[word]) else: # shorcut: no such word result_sets = [] break if result_sets: result = result_sets[0] result.intersection_update(*result_sets[1:]) else: result = set() if len(result) > 0: for code in sorted(result): yield code def describe(self, code): code_str = 'U+{:04X}'.format(code) char = chr(code) name = unicodedata.name(char) return '{:7}\t{}\t{}'.format(code_str, char, name) def find_descriptions(self, query): for code in self.find_codes(query): yield self.describe(code) def main(*args): index = UnicodeNameIndex() query = ' '.join(args) counter = 0 for line in index.find_descriptions(query): print(line) counter += 1 if counter == 0: msg = 'No match' elif counter == 1: msg = '1 match' else: msg = '{} matches'.format(counter) print('({} for {!r})'.format(msg, query)) if __name__ == '__main__': if len(sys.argv) > 1: main(*sys.argv[1:]) else: print('Usage: {} word1 [word2]...'.format(sys.argv[0]))