updated contents from Atlas repo

This commit is contained in:
Luciano Ramalho
2014-10-14 14:26:55 -03:00
parent 40688c038d
commit 981d5bc473
157 changed files with 71134 additions and 1 deletions

View File

@@ -0,0 +1 @@
καφέ

1
strings-bytes/cafe.txt Normal file
View File

@@ -0,0 +1 @@
café

View File

@@ -0,0 +1,19 @@
import sys
from unicodedata import name, normalize

# Scan every code point and report those where str.casefold() is more
# aggressive than str.lower(); print a running count and a final ratio.
n_assigned = 0  # named (assigned) code points examined
n_changed = 0   # code points where casefold() != lower()
for code in range(sys.maxunicode):
    ch = chr(code)
    ch_name = name(ch, None)
    if ch_name is None:  # skip unassigned code points
        continue
    folded = ch.casefold()
    n_assigned += 1
    if folded != ch.lower():
        n_changed += 1
        folded_display = ' '.join(folded)
        folded_names = ';'.join(name(c) for c in folded)
        print('%4d U+%04x' % (n_changed, code), ch, folded_display,
              ch_name + ' -> ' + folded_names, sep='\t')
print(n_changed, '/', n_assigned, '=', n_changed / n_assigned * 100)

View File

@@ -0,0 +1,15 @@
import sys
import unicodedata

# Print the first assigned code point found for each major Unicode
# category (the first letter of the two-letter category code).
# Fix: the original bound the loop variable `name`, shadowing the builtin.
categories_seen = set()
for code in range(sys.maxunicode):
    char = chr(code)
    # None default: skip unassigned code points instead of raising ValueError.
    char_name = unicodedata.name(char, None)
    if char_name is None:
        continue
    cat = unicodedata.category(char)
    if cat[0] not in categories_seen:
        print('U+%04x' % code, char.center(6),
              cat, char_name, sep='\t')
        categories_seen.add(cat[0])

23
strings-bytes/charfinder.py Executable file
View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python
from unicodedata import name
import sys

# Query words come from the command line, or from an interactive prompt.
query = sys.argv[1:] if len(sys.argv) > 1 else input('search words: ').split()
query = [s.upper() for s in query]
count = 0
for i in range(20, sys.maxunicode):
    car = chr(i)
    descr = name(car, None)
    if descr is None:  # unassigned code point
        continue
    # Match only when every query word appears as a whole word in the name.
    words = descr.split()
    if all(word in words for word in query):
        print('{i:5d} {i:04x} {car:^5} {descr}'.format(**locals()))
        count += 1
print('{0} character(s) found'.format(count))

View File

@@ -0,0 +1,9 @@
import sys
import unicodedata

# List every currency symbol: Unicode category 'Sc' (Symbol, currency).
# Fix: the original bound `name`, shadowing the builtin.
for code in range(sys.maxunicode):
    char = chr(code)
    if unicodedata.category(char) == 'Sc':
        # 'Sc' code points are assigned; None default kept defensively.
        char_name = unicodedata.name(char, None)
        print('U+%04x' % code, char.center(6),
              char_name, sep='\t')

View File

@@ -0,0 +1,21 @@
import locale
import sys

# Expressions probing the default encodings of the current environment;
# each is evaluated with eval() below (trusted, hard-coded strings only).
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# The file must stay open while the expressions above inspect it.
# Fix: the original never closed the handle; close it when done.
my_file = open('dummy', 'w')
try:
    for expression in expressions.split():
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))
finally:
    my_file.close()

View File

@@ -0,0 +1,54 @@
import unicodedata

encodings = 'ascii latin1 cp1252 cp437 gb2312 utf-8 utf-16le'.split()
# Display width (in hex-byte pairs) per encoding: single-byte codecs get 1,
# gb2312 up to 2 bytes, utf-8 up to 4, utf-16le 4 (surrogate pair).
widths = {encoding: 1 for encoding in encodings[:-3]}
widths.update(zip(encodings[-3:], (2, 4, 4)))

# NOTE(review): several non-Latin-1 characters were lost when this file was
# re-encoded. '\u20ac' and '\u6c23' are restored from their inline comments;
# three entries had no comment and could not be recovered, so empty
# placeholders are filtered out below (ord('') would raise TypeError).
chars = sorted(c for c in [
    'A',       # \u0041 : LATIN CAPITAL LETTER A
    '¿',       # \u00bf : INVERTED QUESTION MARK
    'Ã',       # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
    'á',       # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
    'Ω',       # \u03a9 : GREEK CAPITAL LETTER OMEGA
    'µ',
    'Ц',
    '\u20ac',  # EURO SIGN (restored)
    '',        # lost in re-encoding -- TODO recover from upstream
    '',        # lost in re-encoding -- TODO recover from upstream
    '',        # lost in re-encoding -- TODO recover from upstream
    '\u6c23',  # CJK UNIFIED IDEOGRAPH-6C23 (restored)
    '𝄞',       # \u1d11e : MUSICAL SYMBOL G CLEF
] if c)

callout1_code = 0x278a  # ➊ DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE
missing_mark = '*'  # printed when a character cannot be encoded


def list_chars():
    """Print the chars list as source lines with code point and name."""
    for char in chars:
        print('%r, # \\u%04x : %s' % (char, ord(char), unicodedata.name(char)))


def show_encodings():
    """Print a table of the sample chars encoded with each codec."""
    print(end='\t\t')
    for encoding in encodings:
        print(encoding.ljust(widths[encoding] * 2), end='\t')
    print()
    for lineno, char in enumerate(chars):
        codepoint = 'U+{:04X}'.format(ord(char))
        print(char, codepoint, sep='\t', end='\t')
        for encoding in encodings:
            try:
                # renamed from `bytes`: it shadowed the builtin type
                encoded = char.encode(encoding)
                dump = ' '.join('%02X' % byte for byte in encoded)
            except UnicodeEncodeError:
                dump = missing_mark
            dump = dump.ljust(widths[encoding] * 2)
            print(dump, end='\t')
        # print(chr(callout1_code + lineno))
        print(unicodedata.name(char))
    # print()


#list_chars()
show_encodings()

View File

@@ -0,0 +1,7 @@
# Demo: Python 3 allows non-ASCII identifiers, so `café` is a valid name.
# NOTE(review): in the original of this demo the two assignments were
# presumably distinct Unicode spellings of the same word (composed 'é'
# vs. 'e' + COMBINING ACUTE), yielding TWO entries in globals(); here both
# lines appear identical, so the second simply rebinds the first -- confirm
# against the companion gen_identifier_norm.py, which regenerates the pair.
café = 1
café = 2
# Collect the module's non-dunder globals, keying each entry by the name
# and by the tuple of its characters (to expose the exact spelling).
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)

View File

@@ -0,0 +1,12 @@
src = """
café = 1
cafe\u0301 = 2
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)
"""
with open('identifier_norm.py', 'tw', encoding='utf8') as out:
out.write(src)

14
strings-bytes/nfc_demo.py Normal file
View File

@@ -0,0 +1,14 @@
import sys
from unicodedata import name, normalize

# Report code points that NFC normalization changes into a LONGER
# (multi-character) form, showing the expanded sequence.
for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:  # skip unassigned code points
        continue
    nfc = normalize('NFC', char)
    if nfc == char:  # already in Normal Form C
        continue
    if len(nfc) > 1:
        nfc_display = ' '.join(nfc)
        print('U+%04x' % code, char, nfc_display, char_name, sep='\t')

16
strings-bytes/nfk_demo.py Normal file
View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize

# Report code points for which NFKC and NFKD compatibility normalization
# disagree with each other, showing both expansions.
for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:  # skip unassigned code points
        continue
    kc = normalize('NFKC', char)
    if kc == char:  # unaffected by compatibility normalization
        continue
    kd = normalize('NFKD', char)
    if kc != kd:
        kc_display = ' '.join(kc)
        kd_display = ' '.join(kd)
        print('U+%04x' % code, char, kc_display, kd_display, char_name, sep='\t')

39
strings-bytes/normeq.py Normal file
View File

@@ -0,0 +1,39 @@
"""
Utility functions for normalized Unicode string comparison.
Using Normal Form C, case sensitive:
>>> s1 = 'café'
>>> s2 = 'cafe\u0301'
>>> s1 == s2
False
>>> nfc_equal(s1, s2)
True
>>> nfc_equal('A', 'a')
False
Using Normal Form C with case folding:
>>> s3 = 'Straße'
>>> s4 = 'strasse'
>>> s3 == s4
False
>>> nfc_equal(s3, s4)
False
>>> fold_equal(s3, s4)
True
>>> fold_equal(s1, s2)
True
>>> fold_equal('A', 'a')
True
"""
from unicodedata import normalize
def nfc_equal(str1, str2):
    """Return True if the two strings are equal after NFC normalization."""
    nfc1 = normalize('NFC', str1)
    nfc2 = normalize('NFC', str2)
    return nfc1 == nfc2
def fold_equal(str1, str2):
    """Return True if the strings match after NFC normalization and case folding."""
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2

15
strings-bytes/numerics.py Normal file
View File

@@ -0,0 +1,15 @@
import sys
from unicodedata import name

# Report every named code point that is a digit and/or numeric,
# flagged 'D' (str.isdigit) and/or 'N' (str.isnumeric).
for code in range(sys.maxunicode):
    char = chr(code)
    try:
        char_name = name(char)
    except ValueError:  # no such name
        continue
    flags = ['D' if char.isdigit() else '',
             'N' if char.isnumeric() else '']
    if any(flags):
        flags = '\t'.join(flags)
        print('U+%04x' % code, char, flags, char_name, sep='\t')

View File

@@ -0,0 +1,18 @@
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
for char in sample:
    # One row per sample char: code point, glyph, regex/str-method digit
    # tests, numeric value, and the character name.
    row = ('U+%04x' % ord(char),                       # <1>
           char.center(6),                             # <2>
           're_dig' if re_digit.match(char) else '-',  # <3>
           'isdig' if char.isdigit() else '-',         # <4>
           'isnum' if char.isnumeric() else '-',       # <5>
           format(unicodedata.numeric(char), '5.2f'),  # <6>
           unicodedata.name(char))                     # <7>
    print(*row, sep='\t')
# END NUMERICS_DEMO

View File

@@ -0,0 +1,9 @@
U+0031 1 re_dig isdig isnum 1.00 DIGIT ONE
U+00b2 ² - isdig isnum 2.00 SUPERSCRIPT TWO
U+00bc ¼ - - isnum 0.25 VULGAR FRACTION ONE QUARTER
U+0969 ३ re_dig isdig isnum 3.00 DEVANAGARI DIGIT THREE
U+136b ፫ - isdig isnum 3.00 ETHIOPIC DIGIT THREE
U+216b Ⅻ - - isnum 12.00 ROMAN NUMERAL TWELVE
U+2466 ⑦ - isdig isnum 7.00 CIRCLED DIGIT SEVEN
U+2480 ⒀ - - isnum 13.00 PARENTHESIZED NUMBER THIRTEEN
U+3285 ㊅ - - isnum 6.00 CIRCLED IDEOGRAPH SIX

3
strings-bytes/ola.py Normal file
View File

@@ -0,0 +1,3 @@
# coding: cp1252
# The declaration above (PEP 263, must be in the first two lines) tells
# Python the source bytes are Windows-1252, so the accented byte in the
# literal below is decoded correctly instead of as UTF-8.
print('Olá, Mundo!')

View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize

# Count assigned code points and how many fall in the Basic Multilingual
# Plane (U+0000..U+FFFF); print totals, the ratio, and the percentage.
num_assigned = 0
num_bmp = 0
for code in range(sys.maxunicode):
    char = chr(code)
    if name(char, None) is None:  # skip unassigned code points
        continue
    num_assigned += 1
    if code <= 0xffff:
        num_bmp += 1
print(num_assigned, num_bmp, num_bmp / num_assigned, num_bmp / num_assigned * 100)

View File

@@ -0,0 +1,21 @@
# BEGIN RE_DEMO
import re

# The same two patterns compiled twice: str patterns are Unicode-aware,
# bytes patterns match ASCII digits/word chars only.
re_numbers_str = re.compile(r'\d+')     # <1>
re_numbers_bytes = re.compile(rb'\d+')  # <2>
re_words_str = re.compile(r'\w+')
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"  # <3>
            " as 1729 = 1³ + 12³ = 9³ + 10³.")        # <4>
text_bytes = text_str.encode('utf_8')  # <5>

print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))       # <6>
print(' bytes:', re_numbers_bytes.findall(text_bytes))  # <7>
print('Words')
print(' str :', re_words_str.findall(text_str))         # <8>
print(' bytes:', re_words_bytes.findall(text_bytes))    # <9>
# END RE_DEMO

87
strings-bytes/sanitize.py Normal file
View File

@@ -0,0 +1,87 @@
"""
Radical folding and text sanitizing.
Handling a string with `cp1252` symbols:
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 12 cup of OEtker(TM) caffe latte - bowl of acai."'
Handling a string with Greek and Latin accented characters:
>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'
"""
# BEGIN SHAVE_MARKS
import unicodedata
import string
def shave_marks(txt):
    """Remove all diacritic marks (from ALL base characters)."""
    decomposed = unicodedata.normalize('NFD', txt)  # <1> split base + marks
    base_only = [ch for ch in decomposed
                 if not unicodedata.combining(ch)]  # <2> drop combining marks
    return unicodedata.normalize('NFC', ''.join(base_only))  # <3> recompose
# END SHAVE_MARKS
# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Remove diacritic marks, but only from Latin base characters."""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    after_latin = False  # was the previous base character ASCII Latin?
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):  # <2>
            if after_latin:
                continue  # drop a mark attached to a Latin base char
        else:
            # a non-combining char starts a new base character <4>
            after_latin = ch in string.ascii_letters
        kept.append(ch)  # <3>
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# END SHAVE_MARKS_LATIN
# BEGIN ASCIIZE
# Map cp1252 one-char symbols to single ASCII replacements.
single_map = str.maketrans("""‚ƒ„†ˆ‹''""•–—˜›""",  # <1>
                           """'f"*^<''""---~>""")

# Map cp1252 symbols to multi-char ASCII replacements.
# Fix(review): the euro, ellipsis, trademark, per-mille and double-dagger
# keys were lost to mis-encoding (duplicate '' keys collapsed the dict);
# restored here from their replacement values.
multi_map = str.maketrans({   # <2>
    '\u20ac': '<euro>',       # €
    '\u2026': '...',          # …
    'Œ': 'OE',
    '\u2122': '(TM)',         # ™
    'œ': 'oe',
    '\u2030': '<per mille>',  # ‰
    '\u2021': '**',           # ‡
})

multi_map.update(single_map)  # <3> merged table handles both kinds


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences."""
    return txt.translate(multi_map)  # <4>
def asciize(txt):
    """Best-effort ASCII transliteration: dewinize, strip Latin marks, fold ß."""
    cleaned = dewinize(txt)               # <5> replace cp1252 symbols first
    cleaned = shave_marks_latin(cleaned)  # then drop Latin diacritics
    cleaned = cleaned.replace('ß', 'ss')  # <6> German eszett has no mark
    return unicodedata.normalize('NFKC', cleaned)  # <7> compatibility compose
# END ASCIIZE

26
strings-bytes/sorting.py Normal file
View File

@@ -0,0 +1,26 @@
import locale


def check(sorted_list):
    """Label a candidate ordering against the expected (correct) order."""
    return 'CORRECT' if fruits == sorted_list else 'WRONG'


# `fruits` is already in correct Portuguese collation order.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
print(locale.getlocale(locale.LC_COLLATE))
print('manual_sort ', fruits)

plain_sort = sorted(fruits)  # code-point order: accented words sort late
print('plain_sort ', plain_sort, check(plain_sort))

locale_sort1 = sorted(fruits, key=locale.strxfrm)
print('locale_sort1', locale_sort1, check(locale_sort1))

# Fix: pt_BR.UTF-8 is not installed on every system; report instead of
# crashing with an unhandled locale.Error.
try:
    locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
except locale.Error as exc:
    print('could not set pt_BR.UTF-8 locale:', exc)
else:
    print('locale set to:', locale.getlocale(locale.LC_COLLATE))
    locale_sort2 = sorted(fruits, key=locale.strxfrm)
    print('locale_sort2', locale_sort2, check(locale_sort2))

View File

@@ -0,0 +1,18 @@
from pyuca import Collator


def check(candidate):
    """Label a candidate ordering against the expected (correct) order."""
    return 'CORRECT' if fruits == candidate else 'WRONG'


# Already in correct Portuguese collation order.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
print('manual_sort', fruits)

plain_sort = sorted(fruits)
print('plain_sort ', plain_sort, check(plain_sort))

# Sort with the Unicode Collation Algorithm (locale-independent).
coll = Collator()
pyuca_sort = sorted(fruits, key=coll.sort_key)
print('pyuca_sort ', pyuca_sort, check(pyuca_sort))

View File

@@ -0,0 +1,11 @@
PS > pip install pyuca
Downloading/unpacking pyuca
Running setup.py (path:C:\Users\...) egg_info for package pyuca
Installing collected packages: pyuca
Running setup.py install for pyuca
Successfully installed pyuca
Cleaning up...
PS > python .\sorting_uca.py
manual_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju']
plain_sort ['acaíba', 'acerola', 'açaí', 'caju', 'cajá'] WRONG
pyuca_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju'] CORRECT

14
strings-bytes/str_repr.py Normal file
View File

@@ -0,0 +1,14 @@
# NOTE(review): indentation was lost when this file was extracted and has
# been reconstructed here; the format spec '{:28,350}' below is not valid
# for str values (the original probably used a computed width such as
# '{:{max_len}}').  Tokens left byte-identical; reconcile with upstream
# before running.
last_len = 0
last_repr = ''
# Distinct repr lengths seen so far; feeds max(lengths) below.
lengths = set()
for i in range(0x110000):
    # repr of the character without the surrounding quotes
    r = repr(chr(i))[1:-1]
    if len(r) != last_len:
        lengths.add(len(r))
        last_len = len(r)
        if i > 0:
            prev_repr = repr(chr(i-1))[1:-1]
            print('{}'.format(prev_repr))
        print('U+{:04x} {:28,350} ...'.format(i, r, max_len=max(lengths)), end=' ')
    last_repr = r

View File

@@ -0,0 +1,50 @@
from itertools import groupby
def bare_repr(codepoint):
    """Return repr(chr(codepoint)) with the surrounding quotes stripped."""
    quoted = repr(chr(codepoint))
    return quoted[1:-1]
def display(codepoint):
    # NOTE(review): never called in this file; it references a global
    # `lengths` that is not defined here, and '{:28,350}' is not a valid
    # format spec for str values -- presumably garbled from something like
    # '{:{max_len}}'.  Tokens left byte-identical pending reconstruction.
    repstr = repr(chr(codepoint))[1:-1]
    print('U+{:04x} {:28,350}'.format(
        codepoint, repstr, max_len=max(lengths)))
def repr_shape(codepoint):
    """Return (len, shape) of the bare repr; shape is 'GLYPH' for printable
    characters, otherwise the two-char escape prefix (e.g. '\\x', '\\u').
    Records non-GLYPH shapes in the module-level `escapes` set."""
    text = bare_repr(codepoint)
    if len(text) == 1:
        kind = 'GLYPH'
    else:
        kind = text[:2]
        escapes.add(kind)
    return len(text), kind
# Escape prefixes ('\x', '\u', ...) collected by repr_shape as a side effect.
escapes = set()
# Group consecutive code points whose reprs share the same (length, shape).
group_gen = groupby((codepoint for codepoint in range(0x110000)), repr_shape)
for len_shape, group in group_gen:
    len_brepr, shape = len_shape
    group = list(group)
    cp_first = group[0]
    cp_last = group[-1]
    cp_mid = group[len(group)//2]
    if len(group) == 1:
        # Single code point: show its glyph only for printable characters.
        glyph_sample = bare_repr(cp_first) if shape == 'GLYPH' else ''
        print('{:6d} U+{:04X} {:5} {}'.format(
            len(group), cp_first, shape, glyph_sample))
    else:
        if len(group) == 2:
            # Pair: sample both endpoints.
            if shape == 'GLYPH':
                glyph_sample = bare_repr(cp_first) + ' ' + bare_repr(cp_last)
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X} , U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))
        else:
            # Longer run: sample first, middle and last code points.
            if shape == 'GLYPH':
                glyph_sample = ' '.join([bare_repr(cp_first),
                    bare_repr(cp_mid), bare_repr(cp_last)])
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X}...U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))
print('escapes:', ' '.join(sorted(escapes, key=str.upper)))

View File

@@ -0,0 +1,22 @@
>>> s = 'naïve' <1>
>>> b = b'naïve' <2>
Traceback (most recent call last):
...
SyntaxError: bytes can only contain ASCII literal characters.
>>> b = bytes('naïve', 'iso8859-1') <3>
>>> b <4>
b'na\xefve'
>>> s <5>
'naïve'
>>> b == s.encode('iso8859-1') <6>
True
>>> s[2] <7>
'ï'
>>> b[2] <8>
239
>>> ord(s[2]) <9>
239
>>> s.upper() <10>
'NAÏVE'
>>> b.upper() <11>
b'NA\xefVE'