ch01-12: clean up by @eumiro

This commit is contained in:
Luciano Ramalho
2021-02-14 20:28:07 -03:00
parent 584a7f21ca
commit 03ace4f4ae
33 changed files with 1383 additions and 86 deletions

View File

@@ -0,0 +1,45 @@
import sys
import collections
from unicodedata import name, category
def category_stats():
    """Tally every Unicode code point by its general category.

    Returns a ``(counts, firsts)`` pair: ``counts`` is a ``Counter`` mapping
    each category code (as reported by ``unicodedata.category``) to how many
    code points carry it, and ``firsts`` maps each category code to the
    lowest character found in that category.
    """
    tally = collections.Counter()
    earliest = {}
    for ch in map(chr, range(sys.maxunicode + 1)):
        kind = category(ch)
        # Code points are visited in ascending order, so the first character
        # stored for a category is never replaced by a later one.
        earliest.setdefault(kind, ch)
        tally[kind] += 1
    return tally, earliest
def category_scan(desired):
    """Lazily yield each character whose Unicode category equals `desired`.

    Characters are produced in ascending code point order, from U+0000 up
    to ``sys.maxunicode``.
    """
    every_char = map(chr, range(sys.maxunicode + 1))
    yield from (ch for ch in every_char if category(ch) == desired)
def main(args):
    """Command-line entry point.

    `args` is the full argument vector, program name included.

    With exactly one extra argument (``len(args) == 2``), print at most 200
    characters of that Unicode category on one line, followed by the number
    of characters shown.  Otherwise print one line per category -- size,
    category code and first character -- from the most populous category
    down.
    """
    limit = 200  # maximum number of characters to display
    if len(args) == 2:
        count = 0
        for char in category_scan(args[1]):
            print(char, end=' ')
            count += 1
            # `>=` (not `>`) so that exactly `limit` characters are shown.
            if count >= limit:
                break
        print()
        print(count, 'characters shown')
    else:
        counts, firsts = category_stats()
        for cat, total in counts.most_common():
            first = firsts[cat]
            if cat == 'Cs':
                # Show the code point instead of the character itself;
                # presumably because a lone surrogate cannot be encoded
                # for output.
                first = f'(surrogate U+{ord(first):04X})'
            print(f'{total:6} {cat} {first}')
# Script entry point: pass the raw command line (program name included) to main().
if __name__ == '__main__':
    main(sys.argv)

View File

@@ -1,5 +1,6 @@
"""
Radical folding and text sanitizing.
Radical folding and diacritic mark removal.
Handling a string with `cp1252` symbols:
@@ -45,30 +46,33 @@ def shave_marks_latin(txt):
"""Remove all diacritic marks from Latin base characters"""
norm_txt = unicodedata.normalize('NFD', txt) # <1>
latin_base = False
keepers = []
preserve = []
for c in norm_txt:
if unicodedata.combining(c) and latin_base: # <2>
continue # ignore diacritic on Latin base char
keepers.append(c) # <3>
preserve.append(c) # <3>
# if it isn't combining char, it's a new base char
if not unicodedata.combining(c): # <4>
latin_base = c in string.ascii_letters
shaved = ''.join(keepers)
shaved = ''.join(preserve)
return unicodedata.normalize('NFC', shaved) # <5>
# end::SHAVE_MARKS_LATIN[]
# tag::ASCIIZE[]
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""", # <1>
"""'f"*^<''""---~>""")
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""", # <1>
"""'f"^<''""---~>""")
multi_map = str.maketrans({ # <2>
'': '<euro>',
'': 'EUR',
'': '...',
'Æ': 'AE',
'æ': 'ae',
'Œ': 'OE',
'': '(TM)',
'œ': 'oe',
'': '(TM)',
'': '<per mille>',
'': '**',
'': '**',
'': '***',
})
multi_map.update(single_map) # <3>