updated from Atlas

2015-04-01 22:48:56 -03:00
parent aab93699a4
commit 573e1a94c4
109 changed files with 5 additions and 6 deletions
--- a/attic/strings-bytes/cafe-gr.txt
+++ b/attic/strings-bytes/cafe-gr.txt
@@ -0,0 +1 @@
+καφέ
--- a/attic/strings-bytes/cafe.txt
+++ b/attic/strings-bytes/cafe.txt
@@ -0,0 +1 @@
+café
--- a/attic/strings-bytes/casefold_demo.py
+++ b/attic/strings-bytes/casefold_demo.py
@@ -0,0 +1,25 @@
+"""List characters whose str.casefold() differs from str.lower().
+
+Prints one tab-separated row per such character, then the count of
+differing characters, the count of named characters, and the percentage.
+"""
+import sys
+from unicodedata import name, normalize
+
+changed = 0   # named characters where casefold() != lower()
+assigned = 0  # characters that have a Unicode name
+# sys.maxunicode is itself a valid code point, so the range must include it.
+for i in range(sys.maxunicode + 1):
+    char = chr(i)
+    char_name = name(char, None)
+    if char_name is None:
+        continue  # name() returned the default: this code point has no name
+    cf = char.casefold()
+    assigned += 1
+    if cf != char.lower():
+        cf_display = ' '.join(cf)  # the folded form is shown one character at a time
+        cf_names = ';'.join(name(c) for c in cf)
+        changed += 1
+        print('%4d U+%04x' % (changed, i), char, cf_display, char_name + ' -> ' + cf_names, sep='\t')
+
+print(changed, '/', assigned, '=', changed/assigned*100)
--- a/attic/strings-bytes/category_demo.py
+++ b/attic/strings-bytes/category_demo.py
@@ -0,0 +1,17 @@
+"""Show the first named character found for each leading category letter."""
+import sys
+import unicodedata
+
+categories = set()
+
+for code in range(sys.maxunicode):
+    char = chr(code)
+    name = unicodedata.name(char, None)
+    if name is not None:
+        cat = unicodedata.category(char)
+        major = cat[0]  # only the first letter of the category code is tracked
+        if major in categories:
+            continue
+        print('U+%04x' % code, char.center(6),
+              cat, name, sep='\t')
+        categories.add(major)
--- a/attic/strings-bytes/charfinder.py
+++ b/attic/strings-bytes/charfinder.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+"""Find Unicode characters whose names contain all the given words.
+
+Words come from the command line or, if none are given, from a prompt.
+"""
+from unicodedata import name
+import sys
+
+if len(sys.argv) > 1:
+    query = sys.argv[1:]
+else:
+    query = input('search words: ').split()
+
+if not query:
+    # all() over an empty query is True for every character, which would
+    # dump every named code point; stop with a message instead.
+    sys.exit('Please provide one or more words to search for.')
+
+query = [s.upper() for s in query]
+
+count = 0
+# Start at U+0020; the upper bound includes sys.maxunicode itself.
+for i in range(0x20, sys.maxunicode + 1):
+    car = chr(i)
+    descr = name(car, None)
+    if descr is None:
+        continue
+    words = set(descr.split())  # set membership test for each query word
+    if all(word in words for word in query):
+        print('{i:5d} {i:04x} {car:^5} {descr}'.format(i=i, car=car, descr=descr))
+        count += 1
+
+print('{0} character(s) found'.format(count))
--- a/attic/strings-bytes/currency_demo.py
+++ b/attic/strings-bytes/currency_demo.py
@@ -0,0 +1,11 @@
+"""Print every character in Unicode category 'Sc' with its code point and name."""
+import sys
+import unicodedata
+
+for code in range(sys.maxunicode):
+    symbol = chr(code)
+    if unicodedata.category(symbol) != 'Sc':
+        continue
+    name = unicodedata.name(symbol, None)
+    print('U+%04x' % code, symbol.center(6),
+          name, sep='\t')
--- a/attic/strings-bytes/encodings_demo.py
+++ b/attic/strings-bytes/encodings_demo.py
@@ -0,0 +1,63 @@
+"""Show how a sample of characters is encoded under several codecs."""
+import unicodedata
+
+encodings = 'ascii latin1 cp1252 cp437 gb2312 utf-8 utf-16le'.split()
+
+# Per-encoding width factor used for column padding (columns are padded to
+# width * 2): 1 for all but the last three encodings, which get 2, 4 and 4.
+widths = {encoding:1 for encoding in encodings[:-3]}
+widths.update(zip(encodings[-3:], (2, 4, 4)))
+
+chars = sorted([
+    'A',  # \u0041 : LATIN CAPITAL LETTER A
+    '¿',  # \u00bf : INVERTED QUESTION MARK
+    'Ã',  # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
+    'á',  # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
+    'Ω',  # \u03a9 : GREEK CAPITAL LETTER OMEGA
+    'µ',
+    'Ц',
+    '€',  # \u20ac : EURO SIGN
+    '“',
+    '┌',
+    '气',
+    '氣', # \u6c23 : CJK UNIFIED IDEOGRAPH-6C23
+    '𝄞',  # \U0001d11e : MUSICAL SYMBOL G CLEF
+])
+
+callout1_code = 0x278a  # ➊   DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE
+
+missing_mark = '*'
+
+def list_chars():
+    """Print each sample character as a source line with its escape and name."""
+    for char in chars:
+        code = ord(char)
+        # \uXXXX takes exactly four hex digits; code points beyond U+FFFF
+        # need the eight-digit \UXXXXXXXX form to be a valid escape.
+        escape = '\\u%04x' if code <= 0xffff else '\\U%08x'
+        print('%r,  # %s : %s' % (char, escape % code, unicodedata.name(char)))
+
+def show_encodings():
+    """Print a table: one row per character, one hex-dump column per encoding."""
+    print(end='\t\t')
+    for encoding in encodings:
+        print(encoding.ljust(widths[encoding] * 2), end='\t')
+    print()
+
+    for lineno, char in enumerate(chars):
+        codepoint = 'U+{:04X}'.format(ord(char))
+        print(char, codepoint, sep='\t', end='\t')
+        for encoding in encodings:
+            try:
+                encoded = char.encode(encoding)  # named to avoid shadowing builtin bytes
+                dump = ' '.join('%02X' % byte for byte in encoded)
+            except UnicodeEncodeError:
+                dump = missing_mark  # this encoding cannot represent the character
+            dump = dump.ljust(widths[encoding] * 2)
+            print(dump, end='\t')
+        # print(chr(callout1_code + lineno))
+        print(unicodedata.name(char))
+        # print()
+
+#list_chars()
+show_encodings()
--- a/attic/strings-bytes/identifier_norm.py
+++ b/attic/strings-bytes/identifier_norm.py
@@ -0,0 +1,7 @@
+
+café = 1
+café = 2
+names = {(name, tuple(name)):value  # key: (identifier, tuple of its characters)
+         for name, value in globals().items()
+         if not name.startswith('__')}  # leave out names starting with '__'
+print(names)
--- a/attic/strings-bytes/identifier_norm_writer.py
+++ b/attic/strings-bytes/identifier_norm_writer.py
@@ -0,0 +1,16 @@
+"""Write identifier_norm.py with the same identifier in two Unicode forms."""
+
+# Both spellings use escapes so an editor or tool that normalizes this file
+# cannot silently collapse them: \u00e9 is the precomposed e-acute, while
+# e + \u0301 is the plain letter followed by COMBINING ACUTE ACCENT.
+src = """
+caf\u00e9 = 1
+cafe\u0301 = 2
+names = {(name, tuple(name)):value
+         for name, value in globals().items()
+         if not name.startswith('__')}
+print(names)
+"""
+
+with open('identifier_norm.py', 'w', encoding='utf8') as out:
+    out.write(src)
--- a/attic/strings-bytes/nfc_demo.py
+++ b/attic/strings-bytes/nfc_demo.py
@@ -0,0 +1,12 @@
+"""List named characters whose NFC form is a different, multi-character string."""
+import sys
+from unicodedata import name, normalize
+
+for code in range(sys.maxunicode):
+    char = chr(code)
+    char_name = name(char, None)
+    if char_name is not None:
+        nfc = normalize('NFC', char)
+        # report only characters that NFC turns into more than one character
+        if nfc != char and len(nfc) > 1:
+            print('U+%04x' % code, char, ' '.join(nfc), char_name, sep='\t')
--- a/attic/strings-bytes/nfk_demo.py
+++ b/attic/strings-bytes/nfk_demo.py
@@ -0,0 +1,16 @@
+"""List named characters whose NFKC form differs from the character and from its NFKD form."""
+import sys
+from unicodedata import name, normalize
+
+for code in range(sys.maxunicode):
+    char = chr(code)
+    char_name = name(char, None)
+    if char_name is None:
+        continue
+    composed = normalize('NFKC', char)
+    if composed == char:
+        continue
+    decomposed = normalize('NFKD', char)
+    if composed == decomposed:
+        continue
+    print('U+%04x' % code, char, ' '.join(composed), ' '.join(decomposed), char_name, sep='\t')
--- a/attic/strings-bytes/numerics.py
+++ b/attic/strings-bytes/numerics.py
@@ -0,0 +1,15 @@
+"""Flag named characters for which isdigit() (D) or isnumeric() (N) is true."""
+import sys
+from unicodedata import name
+
+for code in range(sys.maxunicode):
+    char = chr(code)
+    try:
+        char_name = name(char)
+    except ValueError:  # character has no name
+        continue
+    digit_flag = 'D' if char.isdigit() else ''
+    numeric_flag = 'N' if char.isnumeric() else ''
+    if digit_flag or numeric_flag:
+        # an unset flag still occupies its own (empty) tab-separated column
+        print('U+%04x' % code, char, digit_flag + '\t' + numeric_flag, char_name, sep='\t')
--- a/attic/strings-bytes/numerics_demo.txt
+++ b/attic/strings-bytes/numerics_demo.txt
@@ -0,0 +1,9 @@
+U+0031	1	re_dig	isdig	isnum	 1.00	DIGIT ONE
+U+00b2	²	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
+U+00bc	¼	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
+U+0969	३	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
+U+136b	፫	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
+U+216b	Ⅻ	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
+U+2466	⑦	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
+U+2480	⒀	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
+U+3285	㊅	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX
--- a/attic/strings-bytes/ola.py
+++ b/attic/strings-bytes/ola.py
@@ -0,0 +1,3 @@
+# coding: cp1252
+
+print('Olá, Mundo!')  # non-ASCII literal; the coding line at the top of this file declares cp1252
--- a/attic/strings-bytes/plane_count.py
+++ b/attic/strings-bytes/plane_count.py
@@ -0,0 +1,14 @@
+"""Count named characters overall and among code points up to U+FFFF."""
+import sys
+from unicodedata import name, normalize
+
+total_count = 0
+bmp_count = 0
+
+for code in range(sys.maxunicode):
+    if name(chr(code), None) is not None:
+        total_count += 1
+        bmp_count += code <= 0xffff  # the bool adds as 0 or 1
+
+ratio = bmp_count/total_count
+print(total_count, bmp_count, ratio, ratio*100)
--- a/attic/strings-bytes/sorting.py
+++ b/attic/strings-bytes/sorting.py
@@ -0,0 +1,31 @@
+"""Compare plain and locale-aware sorting against a hand-sorted fruit list."""
+import locale
+
+def check(sorted_list):
+    """Return 'CORRECT' if sorted_list equals the global fruits list, else 'WRONG'."""
+    if fruits == sorted_list:
+        return 'CORRECT'
+    return 'WRONG'
+
+def show(label, result):
+    """Print a labelled sort result followed by its verdict from check()."""
+    print(label, result, check(result))
+
+fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
+
+print(locale.getlocale(locale.LC_COLLATE))
+print('manual_sort ', fruits)
+
+plain_sort = sorted(fruits)
+show('plain_sort  ', plain_sort)
+
+# strxfrm key before this script changes the collation locale
+locale_sort1 = sorted(fruits, key=locale.strxfrm)
+show('locale_sort1', locale_sort1)
+
+locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
+print('locale set to:', locale.getlocale(locale.LC_COLLATE))
+
+# same key function, after LC_COLLATE was set to pt_BR.UTF-8
+locale_sort2 = sorted(fruits, key=locale.strxfrm)
+show('locale_sort2', locale_sort2)
--- a/attic/strings-bytes/sorting_uca.py
+++ b/attic/strings-bytes/sorting_uca.py
@@ -0,0 +1,21 @@
+"""Compare plain sorting and pyuca collation against a hand-sorted fruit list."""
+from pyuca import Collator
+
+def check(sorted_list):
+    """Return 'CORRECT' if sorted_list equals the global fruits list, else 'WRONG'."""
+    verdict = 'WRONG'
+    if fruits == sorted_list:
+        verdict = 'CORRECT'
+    return verdict
+
+fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
+print('manual_sort', fruits)
+
+plain_sort = list(fruits)
+plain_sort.sort()
+print('plain_sort ', plain_sort, check(plain_sort))
+
+coll = Collator()
+pyuca_sort = list(fruits)
+pyuca_sort.sort(key=coll.sort_key)
+print('pyuca_sort ', pyuca_sort, check(pyuca_sort))
--- a/attic/strings-bytes/sorting_uca.txt
+++ b/attic/strings-bytes/sorting_uca.txt
@@ -0,0 +1,11 @@
+PS > pip install pyuca
+Downloading/unpacking pyuca
+  Running setup.py (path:C:\Users\...) egg_info for package pyuca
+Installing collected packages: pyuca
+  Running setup.py install for pyuca
+Successfully installed pyuca
+Cleaning up...
+PS > python .\sorting_uca.py
+manual_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju']
+plain_sort  ['acaíba', 'acerola', 'açaí', 'caju', 'cajá'] WRONG
+pyuca_sort  ['açaí', 'acaíba', 'acerola', 'cajá', 'caju'] CORRECT
--- a/attic/strings-bytes/str_repr.py
+++ b/attic/strings-bytes/str_repr.py
@@ -0,0 +1,23 @@
+"""Show the code-point runs over which repr() of a character keeps the same length.
+
+Each output line has the first code point of a run with its repr, then
+'...', then the repr of the last code point of that run.
+"""
+
+LAST_CODEPOINT = 0x10ffff
+
+last_len = 0
+lengths = set()  # every repr length seen so far; the largest sets the column width
+for i in range(LAST_CODEPOINT + 1):
+    r = repr(chr(i))[1:-1]  # strip the surrounding quotes
+    if len(r) != last_len:
+        lengths.add(len(r))
+        last_len = len(r)
+        if i > 0:
+            # close the previous run with the repr of its last code point
+            print(repr(chr(i-1))[1:-1])
+        print('U+{:04x} {:{max_len}} ...'.format(i, r, max_len=max(lengths)), end=' ')
+
+# The loop only closes a run when the next one starts, so the final run
+# must be closed here; this also ends the output with a newline.
+print(repr(chr(LAST_CODEPOINT))[1:-1])
--- a/attic/strings-bytes/str_repr2.py
+++ b/attic/strings-bytes/str_repr2.py
@@ -0,0 +1,63 @@
+"""Summarize runs of consecutive code points whose repr() has the same shape."""
+from itertools import groupby
+
+def bare_repr(codepoint):
+    """Return repr() of the character at codepoint without the enclosing quotes."""
+    return repr(chr(codepoint))[1:-1]
+
+def display(codepoint, max_len=10):
+    """Print codepoint as U+XXXX followed by its bare repr, padded to max_len.
+
+    max_len defaults to 10, the length of the longest repr escape
+    (a backslash, 'U' and eight hex digits).
+    """
+    print('U+{:04x} {:{max_len}}'.format(
+            codepoint, bare_repr(codepoint), max_len=max_len))
+
+def repr_shape(codepoint):
+    """Return (length of the bare repr, shape) for codepoint.
+
+    shape is 'GLYPH' for a one-character repr; otherwise it is the first
+    two characters of the repr, which are also added to the global escapes set.
+    """
+    brepr = bare_repr(codepoint)
+    if len(brepr) == 1:
+        shape = 'GLYPH'
+    else:
+        shape = brepr[:2]
+        escapes.add(shape)
+    return len(brepr), shape
+
+escapes = set()
+
+# range() is already iterable; no wrapping generator expression is needed
+group_gen = groupby(range(0x110000), repr_shape)
+
+for len_shape, group in group_gen:
+    len_brepr, shape = len_shape
+    group = list(group)
+    cp_first = group[0]
+    cp_last = group[-1]
+    cp_mid = group[len(group)//2]
+    # choose the sample code points: first, first+last, or first+mid+last
+    if len(group) == 1:
+        sample_cps = [cp_first]
+    elif len(group) == 2:
+        sample_cps = [cp_first, cp_last]
+    else:
+        sample_cps = [cp_first, cp_mid, cp_last]
+    # samples are shown only for runs whose repr is a single character
+    if shape == 'GLYPH':
+        glyph_sample = ' '.join(bare_repr(cp) for cp in sample_cps)
+    else:
+        glyph_sample = ''
+    if len(group) == 1:
+        print('{:6d} U+{:04X}          {:5} {}'.format(
+            len(group), cp_first, shape, glyph_sample))
+    elif len(group) == 2:
+        print('{:6d} U+{:04X} , U+{:04X} {:5} {}'.format(
+            len(group), cp_first, cp_last, shape, glyph_sample))
+    else:
+        print('{:6d} U+{:04X}...U+{:04X} {:5} {}'.format(
+            len(group), cp_first, cp_last, shape, glyph_sample))
+print('escapes:', ' '.join(sorted(escapes, key=str.upper)))
--- a/attic/strings-bytes/strings-bytes-test.txt
+++ b/attic/strings-bytes/strings-bytes-test.txt
@@ -0,0 +1,22 @@
+>>> s = 'naïve'  <1>
+>>> b = b'naïve'  <2>
+Traceback (most recent call last):
+  ...
+SyntaxError: bytes can only contain ASCII literal characters.
+>>> b = bytes('naïve', 'iso8859-1')  <3>
+>>> b         <4>
+b'na\xefve'
+>>> s         <5>
+'naïve'
+>>> b == s.encode('iso8859-1')  <6>
+True
+>>> s[2]      <7>
+'ï'
+>>> b[2]      <8>
+239
+>>> ord(s[2]) <9>
+239
+>>> s.upper() <10>
+'NAÏVE'
+>>> b.upper() <11>
+b'NA\xefVE'