updated chapter 4 and appendix-A files

2020-01-22 22:52:23 -03:00
parent a1d6c125bf
commit 49f52e29c7
25 changed files with 5219 additions and 10 deletions
--- a/04-text-byte/charfinder/cf.py
+++ b/04-text-byte/charfinder/cf.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import sys
+import unicodedata
+
+# Default inclusive bounds of the code point range scanned by find().
+FIRST = ord(' ')                                    # <1>
+LAST = sys.maxunicode
+
+
+def find(*query_words, first=FIRST, last=LAST):     # <2>
+    """Print the named characters whose name contains every query word.
+
+    Scans the code points from ``first`` to ``last`` (both inclusive).
+    Query words are upper-cased and matched against the whitespace-separated
+    words of each character name. Every match is printed as code point,
+    character and name separated by tabs, followed by a final count line.
+    """
+    wanted = {word.upper() for word in query_words}  # <3>
+    matches = 0
+    for codepoint in range(first, last + 1):
+        symbol = chr(codepoint)                      # <4>
+        label = unicodedata.name(symbol, None)       # <5>
+        if not label:
+            # Code point has no name in the Unicode database.
+            continue
+        if not wanted.issubset(label.split()):       # <6>
+            continue
+        print(f'U+{codepoint:04X}\t{symbol}\t{label}')  # <7>
+        matches += 1
+    print(f'({matches} found)')
+
+
+def main(words):
+    """Search for the given words, or print a hint when none are given."""
+    if not words:
+        print('Please provide words to find.')
+        return
+    find(*words)
+
+
+if __name__ == '__main__':
+    # Everything after the script name is treated as a query word.
+    cli_words = sys.argv[1:]
+    main(cli_words)
--- a/04-text-byte/charfinder/cf_tests.rst
+++ b/04-text-byte/charfinder/cf_tests.rst
@@ -0,0 +1,36 @@
+Doctests for ``cf.py``
+======================
+
+How to run the tests
+----------------------
+
+Run the ``doctest`` module from the command line::
+
+    $ python3 -m doctest cf_tests.rst
+
+
+Tests
+-----
+
+Import functions for testing::
+
+    >>> from cf import find, main
+
+Test ``find`` with a single result::
+
+    >>> find("sign", "registered")  # doctest:+NORMALIZE_WHITESPACE
+    U+00AE  ®   REGISTERED SIGN
+    (1 found)
+
+
+Test ``find`` with two results::
+
+    >>> find("chess", "queen", last=0xFFFF)  # doctest:+NORMALIZE_WHITESPACE
+    U+2655	♕	WHITE CHESS QUEEN
+    U+265B	♛	BLACK CHESS QUEEN
+    (2 found)
+
+Test ``main`` with no words::
+
+    >>> main([])
+    Please provide words to find.
--- a/04-text-byte/charfinder/test.sh
+++ b/04-text-byte/charfinder/test.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Run the doctests in cf_tests.rst.
+# All command-line arguments (e.g. -v) are forwarded to the doctest module;
+# "$@" keeps each argument intact and expands to nothing when none are given.
+python3 -m doctest cf_tests.rst "$@"
--- a/04-text-byte/numerics_demo.py
+++ b/04-text-byte/numerics_demo.py
@@ -1,4 +1,4 @@
-# BEGIN NUMERICS_DEMO
+# tag::NUMERICS_DEMO[]
 import unicodedata
 import re

@@ -15,4 +15,4 @@ for char in sample:
          format(unicodedata.numeric(char), '5.2f'),  # <6>
          unicodedata.name(char),                     # <7>
          sep='\t')
-# END NUMERICS_DEMO
+# end::NUMERICS_DEMO[]
--- a/04-text-byte/ramanujan.py
+++ b/04-text-byte/ramanujan.py
@@ -1,4 +1,4 @@
-# BEGIN RE_DEMO
+# tag::RE_DEMO[]
 import re

 re_numbers_str = re.compile(r'\d+')     # <1>
@@ -18,4 +18,4 @@ print('  bytes:', re_numbers_bytes.findall(text_bytes))  # <7>
 print('Words')
 print('  str  :', re_words_str.findall(text_str))        # <8>
 print('  bytes:', re_words_bytes.findall(text_bytes))    # <9>
-# END RE_DEMO
+# end::RE_DEMO[]
--- a/04-text-byte/sanitize.py
+++ b/04-text-byte/sanitize.py
@@ -28,7 +28,7 @@ Handling a string with Greek and Latin accented characters:

 """

-# BEGIN SHAVE_MARKS
+# tag::SHAVE_MARKS[]
 import unicodedata
 import string

@@ -39,9 +39,9 @@ def shave_marks(txt):
    shaved = ''.join(c for c in norm_txt
                     if not unicodedata.combining(c))  # <2>
    return unicodedata.normalize('NFC', shaved)  # <3>
-# END SHAVE_MARKS
+# end::SHAVE_MARKS[]

-# BEGIN SHAVE_MARKS_LATIN
+# tag::SHAVE_MARKS_LATIN[]
 def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    norm_txt = unicodedata.normalize('NFD', txt)  # <1>
@@ -56,9 +56,9 @@ def shave_marks_latin(txt):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)   # <5>
-# END SHAVE_MARKS_LATIN
+# end::SHAVE_MARKS_LATIN[]

-# BEGIN ASCIIZE
+# tag::ASCIIZE[]
 single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",  # <1>
                           """'f"*^<''""---~>""")

@@ -84,4 +84,4 @@ def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))     # <5>
    no_marks = no_marks.replace('ß', 'ss')          # <6>
    return unicodedata.normalize('NFKC', no_marks)  # <7>
-# END ASCIIZE
+# end::ASCIIZE[]
--- a/04-text-byte/skin.py
+++ b/04-text-byte/skin.py
@@ -0,0 +1,12 @@
+from unicodedata import name
+
+SKIN1 = 0x1F3FB  # EMOJI MODIFIER FITZPATRICK TYPE-1-2  # <1>
+SKINS = list(map(chr, range(SKIN1, SKIN1 + 5)))         # <2>
+THUMB = '\U0001F44d'  # THUMBS UP SIGN 👍
+
+# The bare emoji first, then the emoji followed by each skin modifier.
+examples = [THUMB]                                      # <3>
+examples += [THUMB + tone for tone in SKINS]            # <4>
+
+for sequence in examples:
+    print(sequence, end='\t')                           # <5>
+    names = map(name, sequence)
+    print(' + '.join(names))                            # <6>
--- a/04-text-byte/two_flags.py
+++ b/04-text-byte/two_flags.py
@@ -0,0 +1,6 @@
+# REGIONAL INDICATOR SYMBOLS
+RIS_A = chr(0x1F1E6)  # LETTER A
+RIS_U = chr(0x1F1FA)  # LETTER U
+
+# Each pair of symbols is printed on its own line.
+flag_pairs = (
+    RIS_A + RIS_U,  # AU: Australia
+    RIS_U + RIS_A,  # UA: Ukraine
+    RIS_A + RIS_A,  # AA: no such country
+)
+for pair in flag_pairs:
+    print(pair)
--- a/04-text-byte/zwj_sample.py
+++ b/04-text-byte/zwj_sample.py
@@ -0,0 +1,28 @@
+from unicodedata import name
+
+# Sample emoji sequences, one per line:
+# hex code points | description | emoji version
+zwj_sample = """
+1F468 200D 1F9B0            |man: red hair                      |E11.0
+1F9D1 200D 1F91D 200D 1F9D1 |people holding hands               |E12.0
+1F3CA 1F3FF 200D 2640 FE0F  |woman swimming: dark skin tone     |E4.0
+1F469 1F3FE 200D 2708 FE0F  |woman pilot: medium-dark skin tone |E4.0
+1F468 200D 1F469 200D 1F467 |family: man, woman, girl           |E2.0
+1F3F3 FE0F 200D 26A7 FE0F   |transgender flag                   |E13.0
+1F469 200D 2764 FE0F 200D 1F48B 200D 1F469 |kiss: woman, woman  |E2.0
+"""
+
+# Characters reported by abbreviation instead of by glyph and name.
+markers = {
+    '\u200D': 'ZWJ',  # ZERO WIDTH JOINER
+    '\uFE0F': 'V16',  # VARIATION SELECTOR-16
+}
+
+for line in zwj_sample.strip().split('\n'):
+    code, descr, version = (s.strip() for s in line.split('|'))
+    chars = [chr(int(c, 16)) for c in code.split()]
+    # Header row: the whole sequence, then its version and description.
+    print(''.join(chars), version, descr, sep='\t', end='')
+    for char in chars:
+        if char in markers:
+            # Stay on the current row: append the marker abbreviation.
+            print(' + ' + markers[char], end='')
+        else:
+            # Start a new indented row for each ordinary character.
+            ucode = f'U+{ord(char):04X}'
+            print(f'\n\t{char}\t{ucode}\t{name(char)}', end='')
+    print()