updated from Atlas
This commit is contained in:
21
04-text-byte/default_encodings.py
Normal file
21
04-text-byte/default_encodings.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Show the default encodings in effect for this Python process.

Each expression below is kept as text so it can be printed verbatim
next to its evaluated value; results vary by platform and console.
"""

import locale
import sys

# eval() is safe here: the evaluated text is this fixed literal,
# never user-supplied input.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Kept open on purpose: my_file.encoding is one of the probed values.
my_file = open('dummy', 'w')

for expr in expressions.split():
    result = eval(expr)
    print(f'{expr.rjust(30)} -> {result!r}')
|
||||
39
04-text-byte/normeq.py
Normal file
39
04-text-byte/normeq.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""
|
||||
Utility functions for normalized Unicode string comparison.
|
||||
|
||||
Using Normal Form C, case sensitive:
|
||||
|
||||
>>> s1 = 'café'
|
||||
>>> s2 = 'cafe\u0301'
|
||||
>>> s1 == s2
|
||||
False
|
||||
>>> nfc_equal(s1, s2)
|
||||
True
|
||||
>>> nfc_equal('A', 'a')
|
||||
False
|
||||
|
||||
Using Normal Form C with case folding:
|
||||
|
||||
>>> s3 = 'Straße'
|
||||
>>> s4 = 'strasse'
|
||||
>>> s3 == s4
|
||||
False
|
||||
>>> nfc_equal(s3, s4)
|
||||
False
|
||||
>>> fold_equal(s3, s4)
|
||||
True
|
||||
>>> fold_equal(s1, s2)
|
||||
True
|
||||
>>> fold_equal('A', 'a')
|
||||
True
|
||||
|
||||
"""
|
||||
|
||||
from unicodedata import normalize
|
||||
|
||||
def nfc_equal(str1, str2):
    """Case-sensitive equality after Unicode Normal Form C normalization."""
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
|
||||
|
||||
def fold_equal(str1, str2):
    """Case-insensitive equality: NFC-normalize, then casefold, then compare."""
    def canon(s):
        # casefold() is stronger than lower() (handles e.g. 'ß' -> 'ss')
        return normalize('NFC', s).casefold()
    return canon(str1) == canon(str2)
|
||||
18
04-text-byte/numerics_demo.py
Normal file
18
04-text-byte/numerics_demo.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# For every code point, show how the regex engine and the str
# predicates disagree about what counts as a "digit" or "number".
for char in sample:
    row = [
        'U+%04x' % ord(char),                        # <1> code point
        char.center(6),                              # <2> the character
        're_dig' if re_digit.match(char) else '-',   # <3> matches re \d?
        'isdig' if char.isdigit() else '-',          # <4> str.isdigit()
        'isnum' if char.isnumeric() else '-',        # <5> str.isnumeric()
        format(unicodedata.numeric(char), '5.2f'),   # <6> numeric value
        unicodedata.name(char),                      # <7> Unicode name
    ]
    print(*row, sep='\t')
# END NUMERICS_DEMO
|
||||
21
04-text-byte/ramanujan.py
Normal file
21
04-text-byte/ramanujan.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# BEGIN RE_DEMO
import re

# Equivalent patterns compiled twice: once for str, once for bytes. <1> <2>
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Tamil digits for 1729 (Hardy-Ramanujan number), plus ASCII digits. <3> <4>
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf_8')  # <5>

print('Text', repr(text_str), sep='\n ')
# str patterns match Unicode digits/words; bytes patterns only ASCII. <6>-<9>
for label, pat_str, pat_bytes in [
        ('Numbers', re_numbers_str, re_numbers_bytes),
        ('Words', re_words_str, re_words_bytes)]:
    print(label)
    print(' str :', pat_str.findall(text_str))
    print(' bytes:', pat_bytes.findall(text_bytes))
# END RE_DEMO
|
||||
87
04-text-byte/sanitize.py
Normal file
87
04-text-byte/sanitize.py
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
"""
|
||||
Radical folding and text sanitizing.
|
||||
|
||||
Handling a string with `cp1252` symbols:
|
||||
|
||||
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
|
||||
>>> shave_marks(order)
|
||||
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
||||
>>> shave_marks_latin(order)
|
||||
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
||||
>>> dewinize(order)
|
||||
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
|
||||
>>> asciize(order)
|
||||
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
|
||||
|
||||
Handling a string with Greek and Latin accented characters:
|
||||
|
||||
>>> greek = 'Ζέφυρος, Zéfiro'
|
||||
>>> shave_marks(greek)
|
||||
'Ζεφυρος, Zefiro'
|
||||
>>> shave_marks_latin(greek)
|
||||
'Ζέφυρος, Zefiro'
|
||||
>>> dewinize(greek)
|
||||
'Ζέφυρος, Zéfiro'
|
||||
>>> asciize(greek)
|
||||
'Ζέφυρος, Zefiro'
|
||||
|
||||
"""
|
||||
|
||||
# BEGIN SHAVE_MARKS
|
||||
import unicodedata
|
||||
import string
|
||||
|
||||
|
||||
def shave_marks(txt):
    """Remove all diacritic marks"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1> split base + marks
    base_chars = [c for c in decomposed
                  if not unicodedata.combining(c)]  # <2> drop combining marks
    return unicodedata.normalize('NFC', ''.join(base_chars))  # <3> recompose
# END SHAVE_MARKS
|
||||
|
||||
# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    kept = []
    preceded_by_latin = False
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and preceded_by_latin:  # <2>
            continue  # skip diacritic sitting on a Latin base char
        kept.append(ch)  # <3>
        if not is_mark:  # <4> a non-combining char starts a new base
            preceded_by_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# END SHAVE_MARKS_LATIN
|
||||
|
||||
# BEGIN ASCIIZE
# One-to-one replacements: cp1252 "smart" punctuation -> plain ASCII. <1>
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")

# One-to-many replacements: one symbol expands to several chars. <2>
multi_map = str.maketrans({
    '‡': '**',
    '…': '...',
    '™': '(TM)',
    '€': '<euro>',
    '‰': '<per mille>',
    'Œ': 'OE',
    'œ': 'oe',
})

# Merge both tables so translate() does everything in a single pass. <3>
multi_map.update(single_map)


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)  # <4>
|
||||
|
||||
|
||||
def asciize(txt):
    """Best-effort transliteration of cp1252 symbols and Latin diacritics."""
    cleaned = dewinize(txt)                # <5> swap Win1252 symbols first
    cleaned = shave_marks_latin(cleaned)   #     then drop Latin diacritics
    cleaned = cleaned.replace('ß', 'ss')   # <6> eszett carries no mark to shave
    return unicodedata.normalize('NFKC', cleaned)  # <7> compatibility compose
# END ASCIIZE
|
||||
Reference in New Issue
Block a user