ch01-12: clean up by @eumiro
This commit is contained in:
45
04-text-byte/categories.py
Normal file
45
04-text-byte/categories.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import sys
|
||||
import collections
|
||||
from unicodedata import name, category
|
||||
|
||||
|
||||
def category_stats():
    """Tally all Unicode code points by general category.

    Walks every code point from 0 through ``sys.maxunicode`` and returns a
    pair ``(counts, firsts)``:

    * ``counts`` -- a ``collections.Counter`` mapping each category code
      to the number of code points in it;
    * ``firsts`` -- a dict mapping each category code to the first
      character (lowest code point) found in it.
    """
    tally = collections.Counter()
    earliest = {}
    for char in map(chr, range(sys.maxunicode + 1)):
        cat = category(char)
        # setdefault stores only the first character seen per category
        earliest.setdefault(cat, char)
        tally[cat] += 1
    return tally, earliest
|
||||
|
||||
|
||||
def category_scan(desired):
    """Lazily yield each character whose Unicode category is `desired`.

    Characters are produced in ascending code point order, covering code
    points 0 through ``sys.maxunicode``.  `desired` is compared for
    equality with the category code returned by ``unicodedata.category``.
    """
    yield from (
        char
        for char in map(chr, range(sys.maxunicode + 1))
        if category(char) == desired
    )
|
||||
|
||||
|
||||
def main(args):
    """Command-line entry point.

    `args` is the full argument list, program name included.

    * With exactly one argument after the program name
      (``len(args) == 2``), print the characters of that Unicode category
      separated by spaces.  The scan stops once more than 200 characters
      have been printed (i.e. after the 201st), then the number shown is
      reported.
    * Otherwise, print one row per category -- code point count, category
      code, first character -- ordered from most to least populous.

    Surrogate code points (category ``Cs``) are rendered as ``U+XXXX``
    text in both modes: a lone surrogate cannot be encoded as UTF-8, so
    printing it raw fails with UnicodeEncodeError on a strict UTF-8 stdout.
    """
    if len(args) == 2:
        desired = args[1]
        count = 0
        for char in category_scan(desired):
            if desired == 'Cs':
                # every char yielded for 'Cs' is a lone surrogate; show
                # its code point instead of the unencodable character
                char = f'U+{ord(char):04X}'
            print(char, end=' ')
            count += 1
            if count > 200:
                break
        print()
        print(count, 'characters shown')
    else:
        counts, firsts = category_stats()
        for cat, total in counts.most_common():
            first = firsts[cat]
            if cat == 'Cs':
                first = f'(surrogate U+{ord(first):04X})'
            print(f'{total:6} {cat} {first}')


if __name__ == '__main__':
    main(sys.argv)
|
||||
@@ -1,5 +1,6 @@
|
||||
|
||||
"""
|
||||
Radical folding and text sanitizing.
|
||||
Radical folding and diacritic mark removal.
|
||||
|
||||
Handling a string with `cp1252` symbols:
|
||||
|
||||
@@ -45,30 +46,33 @@ def shave_marks_latin(txt):
|
||||
"""Remove all diacritic marks from Latin base characters"""
|
||||
norm_txt = unicodedata.normalize('NFD', txt) # <1>
|
||||
latin_base = False
|
||||
keepers = []
|
||||
preserve = []
|
||||
for c in norm_txt:
|
||||
if unicodedata.combining(c) and latin_base: # <2>
|
||||
continue # ignore diacritic on Latin base char
|
||||
keepers.append(c) # <3>
|
||||
preserve.append(c) # <3>
|
||||
# if it isn't combining char, it's a new base char
|
||||
if not unicodedata.combining(c): # <4>
|
||||
latin_base = c in string.ascii_letters
|
||||
shaved = ''.join(keepers)
|
||||
shaved = ''.join(preserve)
|
||||
return unicodedata.normalize('NFC', shaved) # <5>
|
||||
# end::SHAVE_MARKS_LATIN[]
|
||||
|
||||
# tag::ASCIIZE[]
|
||||
# Typographic symbols that are each replaced by a single ASCII character.
# The two strings are paired position by position.
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",  # <1>
                           """'f"^<''""---~>""")

# Symbols that are each replaced by a multi-character ASCII string.
multi_map = str.maketrans({  # <2>
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
})

# Merge the single-character replacements into the same translation table.
multi_map.update(single_map)  # <3>
|
||||
Reference in New Issue
Block a user