updated from Atlas

This commit is contained in:
Luciano Ramalho
2015-04-01 22:48:56 -03:00
parent aab93699a4
commit 573e1a94c4
109 changed files with 5 additions and 6 deletions

View File

@@ -0,0 +1 @@
καφέ

View File

@@ -0,0 +1 @@
café

View File

@@ -0,0 +1,19 @@
import sys
from unicodedata import name, normalize
# Report every named character whose str.casefold() result differs from its
# str.lower() result, then print how many named characters are affected.
changed = 0   # characters where casefold() != lower()
assigned = 0  # characters that have a name in the Unicode database
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    char_name = name(char, None)
    if char_name is None:
        continue  # no name in the database: skip
    cf = char.casefold()
    assigned += 1
    if cf != char.lower():
        # casefold() may expand to several characters; show each one spaced out
        cf_display = ' '.join(cf)
        cf_names = ';'.join(name(c) for c in cf)
        changed += 1
        print('%4d U+%04x' % (changed, i), char, cf_display,
              char_name + ' -> ' + cf_names, sep='\t')
print(changed, '/', assigned, '=', changed/assigned*100)

View File

@@ -0,0 +1,15 @@
import sys
import unicodedata
# Print the first named character found for each major Unicode category
# (the first letter of the two-letter category code).
categories = set()  # major-category letters already reported
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    name = unicodedata.name(char, None)
    if name is None:
        continue  # no name in the database: skip
    cat = unicodedata.category(char)
    if cat[0] not in categories:
        print('U+%04x' % i, char.center(6),
              cat, name, sep='\t')
        categories.add(cat[0])

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python
from unicodedata import name
import sys
# Search the Unicode database for characters whose name contains every query
# word. Words come from the command line, or from a prompt when none are given.
if len(sys.argv) > 1:
    query = sys.argv[1:]
else:
    query = input('search words: ').split()
query = [s.upper() for s in query]  # character names are compared in upper case
count = 0
# Start at the space character (32); the previous literal 20 was presumably
# meant as 0x20. The end is sys.maxunicode + 1 because sys.maxunicode is
# itself a valid code point.
for i in range(32, sys.maxunicode + 1):
    car = chr(i)
    descr = name(car, None)
    if descr is None:
        continue  # no name in the database: skip
    words = descr.split()
    if all(word in words for word in query):
        # pass the fields explicitly instead of exposing every module global
        print('{i:5d} {i:04x} {car:^5} {descr}'.format(
            i=i, car=car, descr=descr))
        count += 1
print('{0} character(s) found'.format(count))

View File

@@ -0,0 +1,9 @@
import sys
import unicodedata
# List every character whose Unicode category is 'Sc'.
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    if unicodedata.category(char) == 'Sc':
        name = unicodedata.name(char, None)
        print('U+%04x' % i, char.center(6),
              name, sep='\t')

View File

@@ -0,0 +1,54 @@
import unicodedata
# Encodings compared by show_encodings(), in column order.
encodings = 'ascii latin1 cp1252 cp437 gb2312 utf-8 utf-16le'.split()

# Width factor per encoding: 1 for all but the last three encodings, then
# 2, 4 and 4. show_encodings() pads each column to twice this value.
widths = {encoding: 1 for encoding in encodings[:-3]}
widths.update(zip(encodings[-3:], (2, 4, 4)))

# Sample characters, one table row each. Every entry must be exactly one
# character, because ord() and unicodedata.name() reject the empty string.
# Five entries had been reduced to '' in this copy of the file:
#   - EURO SIGN and CJK 6C23 are restored from their code-point comments;
#   - NOTE(review): the three entries between them had no comment; they are
#     presumably U+201C, U+250C and U+6C14 -- confirm against the original.
chars = sorted([
    'A',  # \u0041 : LATIN CAPITAL LETTER A
    '¿',  # \u00bf : INVERTED QUESTION MARK
    'Ã',  # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
    'á',  # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
    'Ω',  # \u03a9 : GREEK CAPITAL LETTER OMEGA
    'µ',
    'Ц',
    '\u20ac',  # \u20ac : EURO SIGN
    '\u201c',  # \u201c : LEFT DOUBLE QUOTATION MARK (assumed, see note above)
    '\u250c',  # \u250c : BOX DRAWINGS LIGHT DOWN AND RIGHT (assumed)
    '\u6c14',  # \u6c14 : CJK UNIFIED IDEOGRAPH-6C14 (assumed)
    '\u6c23',  # \u6c23 : CJK UNIFIED IDEOGRAPH-6C23
    '𝄞',  # \u1d11e : MUSICAL SYMBOL G CLEF
])

callout1_code = 0x278a  # ➊ DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE

# Shown in a table cell when the character cannot be encoded.
missing_mark = '*'
def list_chars():
    """Print each entry of ``chars`` as a source line.

    Each line holds the character's repr, its code point in hex and its
    Unicode name, in the same layout as the entries of the ``chars`` list.
    """
    template = '%r, # \\u%04x : %s'
    for char in chars:
        codepoint = ord(char)
        print(template % (char, codepoint, unicodedata.name(char)))
def show_encodings():
    """Print a tab-separated table of ``chars`` encoded with each of ``encodings``.

    The first row names the encodings. Every following row shows one
    character, its code point, its bytes in hex under each encoding (or
    ``missing_mark`` when the character cannot be encoded) and its name.
    """
    # Header row: two leading tabs skip the character and code point columns.
    print(end='\t\t')
    for codec in encodings:
        print(codec.ljust(widths[codec] * 2), end='\t')
    print()

    # lineno is only needed by the disabled callout print further down.
    for lineno, char in enumerate(chars):
        print(char, 'U+{:04X}'.format(ord(char)), sep='\t', end='\t')
        for codec in encodings:
            column = widths[codec] * 2
            try:
                encoded = char.encode(codec)
            except UnicodeEncodeError:
                cell = missing_mark
            else:
                cell = ' '.join('%02X' % octet for octet in encoded)
            print(cell.ljust(column), end='\t')
        # print(chr(callout1_code + lineno))
        print(unicodedata.name(char))
        # print()


#list_chars()
show_encodings()

View File

@@ -0,0 +1,7 @@
# Two assignments to look-alike identifiers.
# NOTE(review): presumably one spelling is precomposed and the other uses a
# combining accent, as in the 'cafe\u0301' of the writer script -- confirm.
café = 1
café = 2

# Map (name, tuple of the name's characters) to its value for every
# module global whose name does not start with a double underscore.
names = dict(
    ((name, tuple(name)), value)
    for name, value in globals().items()
    if name[:2] != '__'
)
print(names)

View File

@@ -0,0 +1,12 @@
# Text of the generated script. The \u0301 escape is resolved when this
# literal is evaluated, so the written file contains the combining character
# itself rather than the escape sequence.
src = """
café = 1
cafe\u0301 = 2
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)
"""

# Write the text unchanged to identifier_norm.py, encoded as UTF-8.
with open('identifier_norm.py', 'tw', encoding='utf8') as out:
    print(src, end='', file=out)

View File

@@ -0,0 +1,14 @@
import sys
from unicodedata import name, normalize
# List named characters whose NFC form differs from the character itself
# and is longer than one character.
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    char_name = name(char, None)
    if char_name is None:
        continue  # no name in the database: skip
    nfc = normalize('NFC', char)
    if nfc == char:
        continue  # already in NFC
    if len(nfc) > 1:
        nfc_display = ' '.join(nfc)
        print('U+%04x' % i, char, nfc_display, char_name, sep='\t')

View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize
# List named characters that NFKC changes and whose NFKC and NFKD forms
# differ from each other.
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    char_name = name(char, None)
    if char_name is None:
        continue  # no name in the database: skip
    kc = normalize('NFKC', char)
    if kc == char:
        continue  # unchanged by NFKC
    kd = normalize('NFKD', char)
    if kc != kd:
        kc_display = ' '.join(kc)
        kd_display = ' '.join(kd)
        print('U+%04x' % i, char, kc_display, kd_display, char_name, sep='\t')

View File

@@ -0,0 +1,15 @@
import sys
from unicodedata import name
# List named characters for which str.isdigit() or str.isnumeric() is true,
# with a 'D' and/or 'N' flag column for each test.
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    # A default avoids raising and catching ValueError for every unnamed
    # code point.
    char_name = name(char, None)
    if char_name is None:
        continue  # no such name
    flags = [
        'D' if char.isdigit() else '',
        'N' if char.isnumeric() else '',
    ]
    if any(flags):
        # empty flags still take a column, so the layout stays aligned
        flag_columns = '\t'.join(flags)
        print('U+%04x' % i, char, flag_columns, char_name, sep='\t')

View File

@@ -0,0 +1,9 @@
U+0031 1 re_dig isdig isnum 1.00 DIGIT ONE
U+00b2 ² - isdig isnum 2.00 SUPERSCRIPT TWO
U+00bc ¼ - - isnum 0.25 VULGAR FRACTION ONE QUARTER
U+0969 ३ re_dig isdig isnum 3.00 DEVANAGARI DIGIT THREE
U+136b ፫ - isdig isnum 3.00 ETHIOPIC DIGIT THREE
U+216b Ⅻ - - isnum 12.00 ROMAN NUMERAL TWELVE
U+2466 ⑦ - isdig isnum 7.00 CIRCLED DIGIT SEVEN
U+2480 ⒀ - - isnum 13.00 PARENTHESIZED NUMBER THIRTEEN
U+3285 ㊅ - - isnum 6.00 CIRCLED IDEOGRAPH SIX

View File

@@ -0,0 +1,3 @@
# coding: cp1252
# The declaration above tells the Python parser to decode this source file
# as cp1252, which is how the non-ASCII letter in the literal below is read.
# It only takes effect on the first or second line of the script.
print('Olá, Mundo!')

View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize
# Count the named characters, count how many of them have a code point of
# at most U+FFFF, and print both counts with their ratio and percentage.
total_count = 0  # all named characters
bmp_count = 0    # named characters with code point <= 0xffff
# sys.maxunicode is itself a valid code point, so the range must end one past it.
for i in range(sys.maxunicode + 1):
    char = chr(i)
    char_name = name(char, None)
    if char_name is None:
        continue  # no name in the database: skip
    total_count += 1
    if i <= 0xffff:
        bmp_count += 1
print(total_count, bmp_count, bmp_count/total_count, bmp_count/total_count*100)

View File

@@ -0,0 +1,26 @@
import locale
def check(sorted_list):
    """Return 'CORRECT' if sorted_list equals the module-level fruits list, else 'WRONG'."""
    if fruits == sorted_list:
        return 'CORRECT'
    return 'WRONG'
# Hand-ordered reference list; check() compares every result against it.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

print(locale.getlocale(locale.LC_COLLATE))
print('manual_sort ', fruits)

# Plain sort with no key function.
plain_sort = list(fruits)
plain_sort.sort()
print('plain_sort ', plain_sort, check(plain_sort))

# Sort keyed by locale.strxfrm before the collation locale is changed.
locale_sort1 = list(fruits)
locale_sort1.sort(key=locale.strxfrm)
print('locale_sort1', locale_sort1, check(locale_sort1))

# Switch the collation locale, then repeat the strxfrm-keyed sort.
locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print('locale set to:', locale.getlocale(locale.LC_COLLATE))
locale_sort2 = list(fruits)
locale_sort2.sort(key=locale.strxfrm)
print('locale_sort2', locale_sort2, check(locale_sort2))

View File

@@ -0,0 +1,18 @@
from pyuca import Collator
def check(sorted_list):
    """Return 'CORRECT' if sorted_list equals the module-level fruits list, else 'WRONG'."""
    if fruits == sorted_list:
        return 'CORRECT'
    return 'WRONG'
# Hand-ordered reference list; check() compares every result against it.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
print('manual_sort', fruits)

# Plain sort with no key function.
plain_sort = list(fruits)
plain_sort.sort()
print('plain_sort ', plain_sort, check(plain_sort))

# Sort keyed by the pyuca Collator's sort_key.
coll = Collator()
pyuca_sort = list(fruits)
pyuca_sort.sort(key=coll.sort_key)
print('pyuca_sort ', pyuca_sort, check(pyuca_sort))

View File

@@ -0,0 +1,11 @@
PS > pip install pyuca
Downloading/unpacking pyuca
Running setup.py (path:C:\Users\...) egg_info for package pyuca
Installing collected packages: pyuca
Running setup.py install for pyuca
Successfully installed pyuca
Cleaning up...
PS > python .\sorting_uca.py
manual_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju']
plain_sort ['acaíba', 'acerola', 'açaí', 'caju', 'cajá'] WRONG
pyuca_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju'] CORRECT

View File

@@ -0,0 +1,14 @@
# Walk every code point and report where the length of repr(chr(i)), taken
# without its first and last character, changes from the previous code point.
# NOTE(review): this listing had lost its indentation; the nesting below is
# the most plausible reading -- confirm against the original file.
last_len = 0
last_repr = ''  # NOTE(review): assigned in the loop but never read here
lengths = set()  # every distinct bare-repr length seen so far
for i in range(0x110000):
    r = repr(chr(i))[1:-1]  # drop the quote characters that repr() adds
    if len(r) != last_len:
        lengths.add(len(r))
        last_len = len(r)
        if i > 0:
            # finish the previous output line with the repr of the code
            # point just before this one
            prev_repr = repr(chr(i-1))[1:-1]
            print('{}'.format(prev_repr))
        # start a new output line; end=' ' leaves it open so the print above
        # can complete it when the length next changes
        print('U+{:04x} {:{max_len}} ...'.format(i, r, max_len=max(lengths)), end=' ')
    last_repr = r

View File

@@ -0,0 +1,50 @@
from itertools import groupby
def bare_repr(codepoint):
    """Return repr() of the character at codepoint without its first and last character (the quotes)."""
    quoted = repr(chr(codepoint))
    return quoted[1:-1]
def display(codepoint, max_len=10):
    """Print one line with the code point in hex and its quote-less repr.

    codepoint -- integer code point to show.
    max_len -- minimum width of the repr column. The default of 10 fits the
        longest form, a backslash-U escape with eight hex digits.

    This function used to read the width from a module-level ``lengths``
    set that this file never defines, so every call raised NameError.
    """
    repstr = repr(chr(codepoint))[1:-1]
    print('U+{:04x} {:{max_len}}'.format(
        codepoint, repstr, max_len=max_len))
def repr_shape(codepoint):
    """Classify a code point by the form of its quote-less repr.

    Returns a (length, shape) pair. shape is 'GLYPH' when the repr is a
    single character; otherwise it is the first two characters of the repr,
    which are also added to the module-level ``escapes`` set.
    """
    text = bare_repr(codepoint)
    width = len(text)
    if width != 1:
        prefix = text[:2]
        escapes.add(prefix)
        return width, prefix
    return width, 'GLYPH'
# Escape prefixes collected by repr_shape() while the groups are consumed.
escapes = set()

# Group consecutive code points that share the same (repr length, shape) key
# and print one line per group: its size, its bounds, its shape and, for
# glyph groups, up to three sample characters.
for (_brepr_len, shape), run in groupby(range(0x110000), repr_shape):
    members = list(run)
    size = len(members)
    first, last = members[0], members[-1]
    if size == 1:
        layout = '{:6d} U+{:04X} {:5} {}'
        columns = (size, first, shape)
        picks = (first,)
    elif size == 2:
        layout = '{:6d} U+{:04X} , U+{:04X} {:5} {}'
        columns = (size, first, last, shape)
        picks = (first, last)
    else:
        layout = '{:6d} U+{:04X}...U+{:04X} {:5} {}'
        columns = (size, first, last, shape)
        picks = (first, members[size // 2], last)
    # only glyph groups get sample characters; escape groups get an empty column
    if shape == 'GLYPH':
        sample = ' '.join(bare_repr(cp) for cp in picks)
    else:
        sample = ''
    print(layout.format(*(columns + (sample,))))
print('escapes:', ' '.join(sorted(escapes, key=str.upper)))

View File

@@ -0,0 +1,22 @@
>>> s = 'naïve' <1>
>>> b = b'naïve' <2>
Traceback (most recent call last):
...
SyntaxError: bytes can only contain ASCII literal characters.
>>> b = bytes('naïve', 'iso8859-1') <3>
>>> b <4>
b'na\xefve'
>>> s <5>
'naïve'
>>> b == s.encode('iso8859-1') <6>
True
>>> s[2] <7>
'ï'
>>> b[2] <8>
239
>>> ord(s[2]) <9>
239
>>> s.upper() <10>
'NAÏVE'
>>> b.upper() <11>
b'NA\xefVE'