updated chapter 4 and appendix-A files

This commit is contained in:
Luciano Ramalho
2020-01-22 22:52:23 -03:00
parent a1d6c125bf
commit 49f52e29c7
25 changed files with 5219 additions and 10 deletions

28
04-text-byte/charfinder/cf.py Executable file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import sys
import unicodedata
FIRST, LAST = ord(' '), sys.maxunicode  # <1>


def find(*query_words, first=FIRST, last=LAST):  # <2>
    """Print every named character whose name contains all the query words.

    Code points from ``first`` through ``last`` (inclusive) are scanned.
    A character matches when each query word, upper-cased, is equal to one
    of the whitespace-separated words of its Unicode name. Every match is
    printed as a tab-separated code point, character and name; a final
    ``(N found)`` line reports the number of matches.
    """
    wanted = {word.upper() for word in query_words}  # <3>
    matches = 0
    for codepoint in range(first, last + 1):
        glyph = chr(codepoint)  # <4>
        label = unicodedata.name(glyph, None)  # <5>
        if not label:
            # Code points without a name can never match.
            continue
        if wanted <= set(label.split()):  # <6>
            print(f'U+{codepoint:04X}\t{glyph}\t{label}')  # <7>
            matches += 1
    print(f'({matches} found)')
def main(words):
    """Search for characters matching ``words``, or ask for words if none.

    ``words`` is the sequence of query words (the command-line arguments
    when run as a script). An empty sequence only prints a usage hint.
    """
    if not words:
        print('Please provide words to find.')
        return
    find(*words)


if __name__ == '__main__':
    main(sys.argv[1:])

View File

@@ -0,0 +1,36 @@
Doctests for ``cf.py``
======================
How to run the tests
----------------------
Run the ``doctest`` module from the command line::
$ python3 -m doctest cf_tests.rst
Tests
-----
Import functions for testing::
>>> from cf import find, main
Test ``find`` with single result::
>>> find("sign", "registered") # doctest:+NORMALIZE_WHITESPACE
U+00AE ® REGISTERED SIGN
(1 found)
Test ``find`` with two results::
>>> find("chess", "queen", last=0xFFFF) # doctest:+NORMALIZE_WHITESPACE
U+2655 ♕ WHITE CHESS QUEEN
U+265B ♛ BLACK CHESS QUEEN
(2 found)
Test ``main`` with no words::
>>> main([])
Please provide words to find.

View File

@@ -0,0 +1,2 @@
#!/bin/bash
# Run the doctests in cf_tests.rst.
# All command-line arguments (e.g. -v, or -o FAIL_FAST) are forwarded to
# doctest. "$@" expands to nothing when no arguments are given and keeps
# each argument intact, unlike the unquoted $1 it replaces, which was
# subject to word splitting and globbing and dropped everything after the
# first argument.
python3 -m doctest cf_tests.rst "$@"

View File

@@ -1,4 +1,4 @@
# BEGIN NUMERICS_DEMO
# tag::NUMERICS_DEMO[]
import unicodedata
import re
@@ -15,4 +15,4 @@ for char in sample:
format(unicodedata.numeric(char), '5.2f'), # <6>
unicodedata.name(char), # <7>
sep='\t')
# END NUMERICS_DEMO
# end::NUMERICS_DEMO[]

View File

@@ -1,4 +1,4 @@
# BEGIN RE_DEMO
# tag::RE_DEMO[]
import re
re_numbers_str = re.compile(r'\d+') # <1>
@@ -18,4 +18,4 @@ print(' bytes:', re_numbers_bytes.findall(text_bytes)) # <7>
print('Words')
print(' str :', re_words_str.findall(text_str)) # <8>
print(' bytes:', re_words_bytes.findall(text_bytes)) # <9>
# END RE_DEMO
# end::RE_DEMO[]

View File

@@ -28,7 +28,7 @@ Handling a string with Greek and Latin accented characters:
"""
# BEGIN SHAVE_MARKS
# tag::SHAVE_MARKS[]
import unicodedata
import string
@@ -39,9 +39,9 @@ def shave_marks(txt):
shaved = ''.join(c for c in norm_txt
if not unicodedata.combining(c)) # <2>
return unicodedata.normalize('NFC', shaved) # <3>
# END SHAVE_MARKS
# end::SHAVE_MARKS[]
# BEGIN SHAVE_MARKS_LATIN
# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt):
"""Remove all diacritic marks from Latin base characters"""
norm_txt = unicodedata.normalize('NFD', txt) # <1>
@@ -56,9 +56,9 @@ def shave_marks_latin(txt):
latin_base = c in string.ascii_letters
shaved = ''.join(keepers)
return unicodedata.normalize('NFC', shaved) # <5>
# END SHAVE_MARKS_LATIN
# end::SHAVE_MARKS_LATIN[]
# BEGIN ASCIIZE
# tag::ASCIIZE[]
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""", # <1>
"""'f"*^<''""---~>""")
@@ -84,4 +84,4 @@ def asciize(txt):
no_marks = shave_marks_latin(dewinize(txt)) # <5>
no_marks = no_marks.replace('ß', 'ss') # <6>
return unicodedata.normalize('NFKC', no_marks) # <7>
# END ASCIIZE
# end::ASCIIZE[]

12
04-text-byte/skin.py Executable file
View File

@@ -0,0 +1,12 @@
from unicodedata import name
# Show the thumbs-up emoji alone and combined with each of the five
# Fitzpatrick skin tone modifiers, naming the characters of each example.
SKIN1 = 0x1F3FB  # EMOJI MODIFIER FITZPATRICK TYPE-1-2  # <1>
SKINS = [chr(SKIN1 + offset) for offset in range(5)]  # <2>
THUMB = '\U0001F44d'  # THUMBS UP SIGN 👍

examples = [THUMB]  # <3>
examples += [THUMB + tone for tone in SKINS]  # <4>

for sample in examples:
    print(sample, end='\t')  # <5>
    print(' + '.join(map(name, sample)))  # <6>

View File

@@ -0,0 +1,6 @@
# REGIONAL INDICATOR SYMBOLS
# Each printed string is a pair of regional indicator symbols; the trailing
# comments give the two-letter code the pair spells.
RIS_A = '\U0001F1E6'  # LETTER A
RIS_U = '\U0001F1FA'  # LETTER U

for first, second in (
    (RIS_A, RIS_U),  # AU: Australia
    (RIS_U, RIS_A),  # UA: Ukraine
    (RIS_A, RIS_A),  # AA: no such country
):
    print(first + second)

View File

@@ -0,0 +1,28 @@
from unicodedata import name
# Emoji sequences built with ZERO WIDTH JOINER, one per line:
# space-separated hex code points | description | emoji version.
# (The variable keeps its original ``zwg_sample`` spelling so existing
# references to it keep working.)
zwg_sample = """
1F468 200D 1F9B0 |man: red hair |E11.0
1F9D1 200D 1F91D 200D 1F9D1 |people holding hands |E12.0
1F3CA 1F3FF 200D 2640 FE0F |woman swimming: dark skin tone |E4.0
1F469 1F3FE 200D 2708 FE0F |woman pilot: medium-dark skin tone |E4.0
1F468 200D 1F469 200D 1F467 |family: man, woman, girl |E2.0
1F3F3 FE0F 200D 26A7 FE0F |transgender flag |E13.0
1F469 200D 2764 FE0F 200D 1F48B 200D 1F469 |kiss: woman, woman |E2.0
"""

# Invisible characters are shown by a short label instead of on a line of
# their own. U+200D is abbreviated ZWJ (the previous 'ZWG' label was a typo).
markers = {
    '\u200D': 'ZWJ',  # ZERO WIDTH JOINER
    '\uFE0F': 'V16',  # VARIATION SELECTOR-16
}

for line in zwg_sample.strip().split('\n'):
    code, descr, version = (s.strip() for s in line.split('|'))
    chars = [chr(int(c, 16)) for c in code.split()]
    # Header line: the rendered sequence, its emoji version and description.
    print(''.join(chars), version, descr, sep='\t', end='')
    for char in chars:
        if char in markers:
            # Appended to the line of the preceding visible character.
            print(' + ' + markers[char], end='')
        else:
            ucode = f'U+{ord(char):04X}'
            # A Unicode database older than the emoji has no name for it;
            # fall back to a placeholder instead of raising ValueError and
            # aborting the whole listing.
            char_name = name(char, '(name not available)')
            print(f'\n\t{char}\t{ucode}\t{char_name}', end='')
    print()