example-code-2e/04-text-byte/categories.py

46 lines
1.1 KiB
Python
Raw Normal View History

2021-02-15 00:28:07 +01:00
import sys
import collections
2021-05-21 23:56:12 +02:00
from unicodedata import category
2021-02-15 00:28:07 +01:00
def category_stats(codes=None):
    """Tally Unicode general categories over a set of code points.

    Args:
        codes: optional iterable of integer code points to examine.
            When omitted, every code point from 0 through
            ``sys.maxunicode`` is examined.

    Returns:
        A ``(counts, firsts)`` tuple:

        * ``counts`` -- a ``collections.Counter`` mapping each two-letter
          category code (as returned by ``unicodedata.category``) to the
          number of examined code points in that category.
        * ``firsts`` -- a dict mapping each category code to the first
          character encountered in that category, in iteration order.
    """
    if codes is None:
        codes = range(sys.maxunicode + 1)
    counts = collections.Counter()
    firsts = {}
    for code in codes:
        char = chr(code)
        cat = category(char)
        # Only the first character seen for a category is recorded;
        # setdefault leaves an existing entry untouched.
        firsts.setdefault(cat, char)
        counts[cat] += 1
    return counts, firsts
def category_scan(desired, codes=None):
    """Yield, in order, each character whose general category is `desired`.

    Args:
        desired: a Unicode general category code compared for exact
            equality with the result of ``unicodedata.category``
            (for example ``'Lu'`` or ``'Nd'``).
        codes: optional iterable of integer code points to examine.
            When omitted, every code point from 0 through
            ``sys.maxunicode`` is examined.

    Yields:
        One-character strings, lazily, in the order the code points are
        examined.
    """
    if codes is None:
        codes = range(sys.maxunicode + 1)
    for code in codes:
        char = chr(code)
        if category(char) == desired:
            yield char
2021-02-15 00:28:07 +01:00
def main(args):
    """Command-line entry point.

    With exactly one argument after the script name (``len(args) == 2``),
    that argument is treated as a Unicode general category code: a sample
    of its characters is printed on one line (the loop stops once more
    than 200 have been shown), followed by the number of characters shown.

    With any other number of arguments, one line is printed per category,
    most common first: rank, character count, category code, and the first
    character found in that category.

    Args:
        args: argument list laid out like ``sys.argv`` (script name first).
    """
    if len(args) == 2:
        count = 0
        for char in category_scan(args[1]):
            # A lone surrogate cannot be encoded by strict text encodings
            # such as UTF-8, so print() would raise UnicodeEncodeError;
            # show its code point instead of the raw character.
            if category(char) == 'Cs':
                char = f'U+{ord(char):04X}'
            print(char, end=' ')
            count += 1
            if count > 200:
                break
        print()
        print(count, 'characters shown')
    else:
        counts, firsts = category_stats()
        for rank, (cat, total) in enumerate(counts.most_common(), 1):
            first = firsts[cat]
            if cat == 'Cs':
                # Same reason as above: surrogates are shown by code point.
                first = f'(surrogate U+{ord(first):04X})'
            print(f'{rank:2} {total:6} {cat} {first}')
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main(sys.argv)