ch01-12: clean up by @eumiro

2021-02-14 20:28:07 -03:00
parent 584a7f21ca
commit 03ace4f4ae
33 changed files with 1383 additions and 86 deletions
--- a/04-text-byte/categories.py
+++ b/04-text-byte/categories.py
@@ -0,0 +1,45 @@
+import sys
+import collections
+from unicodedata import name, category
+
+
+def category_stats():
+    """Tally the general category of every Unicode code point.
+
+    Walks code points 0 through ``sys.maxunicode`` once and returns a pair
+    ``(counts, firsts)``: ``counts`` is a ``collections.Counter`` mapping
+    each category code to the number of code points that have it, and
+    ``firsts`` is a dict mapping each category code to the first (lowest)
+    character seen with that category.
+    """
+    firsts = {}
+    counts = collections.Counter()
+    for char in map(chr, range(sys.maxunicode + 1)):
+        cat = category(char)
+        # setdefault stores only the first character met for a category
+        firsts.setdefault(cat, char)
+        counts[cat] += 1
+    return counts, firsts
+
+
+def category_scan(desired):
+    """Lazily yield every character whose general category is `desired`.
+
+    Characters are produced in ascending code point order, covering code
+    points 0 through ``sys.maxunicode``.  `desired` is compared for
+    equality with the code returned by ``unicodedata.category``.
+    """
+    every_char = map(chr, range(sys.maxunicode + 1))
+    yield from (char for char in every_char if category(char) == desired)
+
+
+def main(args, limit=200):
+    """Command-line driver: list one category or summarize all of them.
+
+    `args` is the complete argument vector, script name included (as in
+    ``sys.argv``).
+
+    * With exactly one extra argument, that argument is taken as a
+      category code: at most `limit` matching characters are printed,
+      separated by spaces, followed by a line telling how many were shown.
+    * With any other number of arguments, one line per category is
+      printed -- count, category code, first character -- ordered from
+      the most to the least populated category.
+    """
+    from itertools import islice
+
+    if len(args) == 2:
+        desired = args[1]
+        # Lone surrogates cannot be encoded by a UTF-8 stdout, so printing
+        # them raw raises UnicodeEncodeError; show their code point instead.
+        surrogates = desired == 'Cs'
+        shown = 0
+        # islice stops after exactly `limit` characters without pulling an
+        # extra one from the scan.
+        for char in islice(category_scan(desired), limit):
+            print(f'U+{ord(char):04X}' if surrogates else char, end=' ')
+            shown += 1
+        print()
+        print(shown, 'characters shown')
+    else:
+        counts, firsts = category_stats()
+        for cat, total in counts.most_common():
+            first = firsts[cat]
+            if cat == 'Cs':
+                # same concern as above: never send a raw surrogate to stdout
+                first = f'(surrogate U+{ord(first):04X})'
+            print(f'{total:6} {cat} {first}')
+
+
+# Script entry point: hand the raw argument vector (script name included) to main().
+if __name__ == '__main__':
+    main(sys.argv)
--- a/04-text-byte/simplify.py
+++ b/04-text-byte/simplify.py
@@ -1,5 +1,6 @@
+
 """
-Radical folding and text sanitizing.
+Radical folding and diacritic mark removal.

 Handling a string with `cp1252` symbols:

@@ -45,30 +46,33 @@ def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    norm_txt = unicodedata.normalize('NFD', txt)  # <1>
    latin_base = False
-    keepers = []
+    preserve = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:   # <2>
            continue  # ignore diacritic on Latin base char
-        keepers.append(c)                             # <3>
+        preserve.append(c)                            # <3>
        # if it isn't combining char, it's a new base char
        if not unicodedata.combining(c):              # <4>
            latin_base = c in string.ascii_letters
-    shaved = ''.join(keepers)
+    shaved = ''.join(preserve)
    return unicodedata.normalize('NFC', shaved)   # <5>
 # end::SHAVE_MARKS_LATIN[]

 # tag::ASCIIZE[]
-single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",  # <1>
-                           """'f"*^<''""---~>""")
+single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",  # <1>
+                           """'f"^<''""---~>""")

 multi_map = str.maketrans({  # <2>
-    '€': '<euro>',
+    '€': 'EUR',
    '…': '...',
+    'Æ': 'AE',
+    'æ': 'ae',
    'Œ': 'OE',
-    '™': '(TM)',
    'œ': 'oe',
+    '™': '(TM)',
    '‰': '<per mille>',
-    '‡': '**',
+    '†': '**',
+    '‡': '***',
 })

 multi_map.update(single_map)  # <3>