#!/usr/bin/env python3

"""
Unicode character finder utility:
find characters based on words in their official names.

This can be used from the command line, just pass words as arguments.

Here is the ``main`` function which makes it happen::

    >>> main('rook')  # doctest: +NORMALIZE_WHITESPACE
    U+2656  ♖  WHITE CHESS ROOK
    U+265C  ♜  BLACK CHESS ROOK
    (2 matches for 'rook')
    >>> main('rook', 'black')  # doctest: +NORMALIZE_WHITESPACE
    U+265C  ♜  BLACK CHESS ROOK
    (1 match for 'rook black')
    >>> main('white bishop')  # doctest: +NORMALIZE_WHITESPACE
    U+2657  ♗  WHITE CHESS BISHOP
    (1 match for 'white bishop')
    >>> main("jabberwocky's vest")
    (No match for "jabberwocky's vest")


For exploring words that occur in the character names, there is the
``word_report`` function::

    >>> index = UnicodeNameIndex(sample_chars)
    >>> index.word_report()
        3 SIGN
        2 A
        2 EURO
        2 LATIN
        2 LETTER
        1 CAPITAL
        1 CURRENCY
        1 DOLLAR
        1 SMALL
    >>> index = UnicodeNameIndex()
    >>> index.word_report(10)
    75821 CJK
    75761 IDEOGRAPH
    74656 UNIFIED
    13196 SYLLABLE
    11735 HANGUL
     7616 LETTER
     2232 WITH
     2180 SIGN
     2122 SMALL
     1709 CAPITAL

Note: characters with names starting with 'CJK UNIFIED IDEOGRAPH'
are indexed with those three words only, excluding the hexadecimal
codepoint at the end of the name.

Note: the counts shown for the full index depend on the version of the
Unicode database bundled with the running Python interpreter.
"""

import sys
import re
import unicodedata
import pickle
import warnings
import itertools
from collections import namedtuple

RE_WORD = re.compile(r'\w+')
RE_UNICODE_NAME = re.compile('^[A-Z0-9 -]+$')
RE_CODEPOINT = re.compile(r'U\+([0-9A-F]{4,6})')

# The index cache is read from and written to the current working directory.
INDEX_NAME = 'charfinder_index.pickle'
# Indexes with this many words or fewer are never written to the cache file.
MINIMUM_SAVE_LEN = 10000
CJK_UNI_PREFIX = 'CJK UNIFIED IDEOGRAPH'
CJK_CMP_PREFIX = 'CJK COMPATIBILITY IDEOGRAPH'

sample_chars = [
    '$',  # DOLLAR SIGN
    'A',  # LATIN CAPITAL LETTER A
    'a',  # LATIN SMALL LETTER A
    '\u20a0',  # EURO-CURRENCY SIGN
    '\u20ac',  # EURO SIGN
]


def tokenize(text):
    """return iterable of uppercased words"""
    for match in RE_WORD.finditer(text):
        yield match.group().upper()


def query_type(text):
    """Classify a query string.

    Returns 'CODEPOINT' when the text contains ``U+`` (case-insensitive),
    'NAME' when the uppercased text consists only of characters matched by
    ``RE_UNICODE_NAME`` (A-Z, 0-9, space, hyphen), and 'CHARACTERS'
    otherwise (including for the empty string).
    """
    text_upper = text.upper()
    if 'U+' in text_upper:
        return 'CODEPOINT'
    elif RE_UNICODE_NAME.match(text_upper):
        return 'NAME'
    else:
        return 'CHARACTERS'


CharDescription = namedtuple('CharDescription', 'code_str char name')

QueryResult = namedtuple('QueryResult', 'len items')


class UnicodeNameIndex:
    """Inverted index mapping each word of a Unicode name to its characters.

    ``self.index`` is a dict: uppercased word -> set of characters whose
    official name contains that word.
    """

    def __init__(self, chars=None):
        self.load(chars)

    @staticmethod
    def _unpickle_index(data):
        """Decode pickled index bytes; return the dict, or None if unusable.

        A truncated, corrupt or version-incompatible cache must not crash
        the program: the caller simply rebuilds the index when None is
        returned.
        """
        # SECURITY: unpickling can execute arbitrary code. The cache file
        # lives in the current working directory, so only run this tool in
        # directories whose contents you trust.
        try:
            index = pickle.loads(data)
        except (EOFError, pickle.UnpicklingError, AttributeError,
                ImportError, IndexError, ValueError):
            return None
        return index if isinstance(index, dict) else None

    def load(self, chars=None):
        """Populate ``self.index``.

        When ``chars`` is None, try the cache file ``INDEX_NAME`` first and
        fall back to indexing the whole Unicode database. When ``chars`` is
        given, index exactly those characters. A freshly built index with
        more than ``MINIMUM_SAVE_LEN`` words is written to the cache file;
        failure to write only emits a warning.
        """
        self.index = None
        if chars is None:
            try:
                with open(INDEX_NAME, 'rb') as fp:
                    data = fp.read()
            except OSError:
                pass  # no readable cache: build the index below
            else:
                self.index = self._unpickle_index(data)
        if self.index is not None:
            return  # loaded from the cache; rewriting it would be wasted I/O
        self.build_index(chars)
        if len(self.index) > MINIMUM_SAVE_LEN:
            try:
                self.save()
            except OSError as exc:
                warnings.warn('Could not save {!r}: {}'
                              .format(INDEX_NAME, exc))

    def save(self):
        """Pickle ``self.index`` to ``INDEX_NAME``; may raise OSError."""
        with open(INDEX_NAME, 'wb') as fp:
            pickle.dump(self.index, fp)

    def build_index(self, chars=None):
        """Build ``self.index`` from ``chars`` (default: all code points
        from U+0020 up to and including ``sys.maxunicode``).

        Characters without an official name are skipped.
        """
        if chars is None:
            # + 1 so that the last code point is part of the range
            chars = (chr(i) for i in range(32, sys.maxunicode + 1))
        index = {}
        for char in chars:
            # Most code points are unnamed; asking for a default is much
            # cheaper than raising and catching ValueError for each one.
            name = unicodedata.name(char, None)
            if name is None:
                continue
            # Drop the trailing hex codepoint from CJK ideograph names so
            # it does not flood the index with one-off words.
            if name.startswith(CJK_UNI_PREFIX):
                name = CJK_UNI_PREFIX
            elif name.startswith(CJK_CMP_PREFIX):
                name = CJK_CMP_PREFIX

            for word in tokenize(name):
                index.setdefault(word, set()).add(char)

        self.index = index

    def __len__(self):
        """Number of distinct words in the index."""
        return len(self.index)

    def word_rank(self, top=None):
        """Return ``(count, word)`` pairs, most frequent first.

        Ties are broken alphabetically by word. ``top`` limits the number
        of pairs returned; None means all of them.
        """
        res = [(len(chars), word) for word, chars in self.index.items()]
        res.sort(key=lambda item: (-item[0], item[1]))
        if top is not None:
            res = res[:top]
        return res

    def word_report(self, top=None):
        """Print the ``word_rank`` listing, one ``count word`` per line."""
        for postings, key in self.word_rank(top):
            print('{:5} {}'.format(postings, key))

    def find_chars(self, query, start=0, stop=None):
        """Find the characters whose names contain every word of ``query``.

        Returns a ``QueryResult`` whose ``len`` is the total number of
        matches and whose ``items`` iterates over the matches, sorted by
        code point and sliced to ``[start:stop]``.
        """
        stop = sys.maxsize if stop is None else stop
        result_sets = []
        for word in tokenize(query):
            if word in self.index:
                result_sets.append(self.index[word])
            else:  # shortcut: no such word
                result_sets = []
                break
        if result_sets:
            result = result_sets[0].intersection(*result_sets[1:])
            result = sorted(result)  # must sort for consistency
            result_iter = itertools.islice(result, start, stop)
            return QueryResult(len(result),
                               (char for char in result_iter))
        return QueryResult(0, ())

    def find_codes(self, query, start=0, stop=None):
        """Like ``find_chars`` but yield integer code points."""
        return (ord(char) for char
                in self.find_chars(query, start, stop).items)

    def describe(self, char):
        """Return a ``CharDescription`` for ``char``.

        Raises ValueError (from ``unicodedata.name``) if the character
        has no name.
        """
        code_str = 'U+{:04X}'.format(ord(char))
        name = unicodedata.name(char)
        return CharDescription(code_str, char, name)

    def find_descriptions(self, query, start=0, stop=None):
        """Yield a ``CharDescription`` for each match of ``query``."""
        for char in self.find_chars(query, start, stop).items:
            yield self.describe(char)

    def get_descriptions(self, chars):
        """Yield a ``CharDescription`` for each character in ``chars``."""
        for char in chars:
            yield self.describe(char)

    def describe_str(self, char):
        """Return ``char`` described as a tab-separated display line."""
        return '{:7}\t{}\t{}'.format(*self.describe(char))

    def find_description_strs(self, query, start=0, stop=None):
        """Yield a display line for each match of ``query``."""
        for char in self.find_chars(query, start, stop).items:
            yield self.describe_str(char)

    @staticmethod  # not an instance method due to concurrency
    def status(query, counter):
        """Return the summary text for ``counter`` matches of ``query``."""
        if counter == 0:
            msg = 'No match'
        elif counter == 1:
            msg = '1 match'
        else:
            msg = '{} matches'.format(counter)
        return '{} for {!r}'.format(msg, query)


def main(*args):
    """Print every character matching the words in ``args``, then a
    summary line with the number of matches."""
    index = UnicodeNameIndex()
    query = ' '.join(args)
    n = 0  # stays 0 when the loop body never runs (no match)
    for n, line in enumerate(index.find_description_strs(query), 1):
        print(line)
    print('({})'.format(index.status(query, n)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        main(*sys.argv[1:])
    else:
        print('Usage: {} word1 [word2]...'.format(sys.argv[0]))