updated chapter 4 and appendix-A files
This commit is contained in:
28
04-text-byte/charfinder/cf.py
Executable file
28
04-text-byte/charfinder/cf.py
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import unicodedata
|
||||
|
||||
# Default inclusive search bounds: from the space character up to the
# highest code point this Python build supports.
FIRST, LAST = ord(' '), sys.maxunicode  # <1>


def find(*query_words, first=FIRST, last=LAST):  # <2>
    """Print every character whose Unicode name contains all query words.

    Each code point from ``first`` to ``last`` (both inclusive) is looked
    up in ``unicodedata``.  A character matches when every query word,
    upper-cased, is one of the whitespace-separated words of its name.
    Code points without a name are skipped.

    One tab-separated line (``U+XXXX``, the character, its name) is
    printed per match, followed by a ``(N found)`` summary line.

    Args:
        *query_words: Words to look for; letter case is ignored.  With no
            words at all, every named character in the range matches.
        first: Lowest code point to examine (inclusive).
        last: Highest code point to examine (inclusive).

    Returns:
        None.  Results are written to standard output.
    """
    query = {w.upper() for w in query_words}  # <3>
    count = 0
    for code in range(first, last + 1):
        char = chr(code)  # <4>
        # The None default avoids a ValueError for unnamed code points.
        name = unicodedata.name(char, None)  # <5>
        if name and query.issubset(name.split()):  # <6>
            print(f'U+{code:04X}\t{char}\t{name}')  # <7>
            count += 1
    print(f'({count} found)')
|
||||
|
||||
|
||||
def main(words):
    """Search for characters by name, or ask for words when none are given.

    Args:
        words: Sequence of query words, typically the command-line
            arguments.  When it is empty a short hint is printed instead
            of running a search.
    """
    if words:
        find(*words)
    else:
        print('Please provide words to find.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run as a script: every command-line argument is a query word.
    main(sys.argv[1:])
|
||||
36
04-text-byte/charfinder/cf_tests.rst
Normal file
36
04-text-byte/charfinder/cf_tests.rst
Normal file
@@ -0,0 +1,36 @@
|
||||
Doctests for ``cf.py``
======================

How to run the tests
----------------------

Run the ``doctest`` module from the command line::

    $ python3 -m doctest cf_tests.rst


Tests
-----

Import functions for testing::

    >>> from cf import find, main

Test ``find`` with a single result::

    >>> find("sign", "registered")  # doctest:+NORMALIZE_WHITESPACE
    U+00AE ® REGISTERED SIGN
    (1 found)


Test ``find`` with two results::

    >>> find("chess", "queen", last=0xFFFF)  # doctest:+NORMALIZE_WHITESPACE
    U+2655 ♕ WHITE CHESS QUEEN
    U+265B ♛ BLACK CHESS QUEEN
    (2 found)

Test ``main`` with no words::

    >>> main([])
    Please provide words to find.
|
||||
2
04-text-byte/charfinder/test.sh
Executable file
2
04-text-byte/charfinder/test.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
#!/bin/bash
# Run the doctests for cf.py. Any arguments given to this script
# (for example -v for verbose output) are forwarded to doctest.
# "$@" keeps each argument intact and expands to nothing when there are none.
python3 -m doctest cf_tests.rst "$@"
|
||||
@@ -1,4 +1,4 @@
|
||||
# BEGIN NUMERICS_DEMO
|
||||
# tag::NUMERICS_DEMO[]
|
||||
import unicodedata
|
||||
import re
|
||||
|
||||
@@ -15,4 +15,4 @@ for char in sample:
|
||||
format(unicodedata.numeric(char), '5.2f'), # <6>
|
||||
unicodedata.name(char), # <7>
|
||||
sep='\t')
|
||||
# END NUMERICS_DEMO
|
||||
# end::NUMERICS_DEMO[]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# BEGIN RE_DEMO
|
||||
# tag::RE_DEMO[]
|
||||
import re
|
||||
|
||||
re_numbers_str = re.compile(r'\d+') # <1>
|
||||
@@ -18,4 +18,4 @@ print(' bytes:', re_numbers_bytes.findall(text_bytes)) # <7>
|
||||
print('Words')
|
||||
print(' str :', re_words_str.findall(text_str)) # <8>
|
||||
print(' bytes:', re_words_bytes.findall(text_bytes)) # <9>
|
||||
# END RE_DEMO
|
||||
# end::RE_DEMO[]
|
||||
|
||||
@@ -28,7 +28,7 @@ Handling a string with Greek and Latin accented characters:
|
||||
|
||||
"""
|
||||
|
||||
# BEGIN SHAVE_MARKS
|
||||
# tag::SHAVE_MARKS[]
|
||||
import unicodedata
|
||||
import string
|
||||
|
||||
@@ -39,9 +39,9 @@ def shave_marks(txt):
|
||||
shaved = ''.join(c for c in norm_txt
|
||||
if not unicodedata.combining(c)) # <2>
|
||||
return unicodedata.normalize('NFC', shaved) # <3>
|
||||
# END SHAVE_MARKS
|
||||
# end::SHAVE_MARKS[]
|
||||
|
||||
# BEGIN SHAVE_MARKS_LATIN
|
||||
# tag::SHAVE_MARKS_LATIN[]
|
||||
def shave_marks_latin(txt):
|
||||
"""Remove all diacritic marks from Latin base characters"""
|
||||
norm_txt = unicodedata.normalize('NFD', txt) # <1>
|
||||
@@ -56,9 +56,9 @@ def shave_marks_latin(txt):
|
||||
latin_base = c in string.ascii_letters
|
||||
shaved = ''.join(keepers)
|
||||
return unicodedata.normalize('NFC', shaved) # <5>
|
||||
# END SHAVE_MARKS_LATIN
|
||||
# end::SHAVE_MARKS_LATIN[]
|
||||
|
||||
# BEGIN ASCIIZE
|
||||
# tag::ASCIIZE[]
|
||||
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""", # <1>
|
||||
"""'f"*^<''""---~>""")
|
||||
|
||||
@@ -84,4 +84,4 @@ def asciize(txt):
|
||||
no_marks = shave_marks_latin(dewinize(txt)) # <5>
|
||||
no_marks = no_marks.replace('ß', 'ss') # <6>
|
||||
return unicodedata.normalize('NFKC', no_marks) # <7>
|
||||
# END ASCIIZE
|
||||
# end::ASCIIZE[]
|
||||
|
||||
12
04-text-byte/skin.py
Executable file
12
04-text-byte/skin.py
Executable file
@@ -0,0 +1,12 @@
|
||||
from unicodedata import name
|
||||
|
||||
# First of the five consecutive skin-tone modifier code points.
SKIN1 = 0x1F3FB  # EMOJI MODIFIER FITZPATRICK TYPE-1-2  # <1>
# The five modifier characters, SKIN1 through SKIN1 + 4.
SKINS = [chr(i) for i in range(SKIN1, SKIN1 + 5)]  # <2>
THUMB = '\U0001F44d'  # THUMBS UP SIGN 👍

# Start with the unmodified emoji, then add one variant per modifier.
examples = [THUMB]  # <3>
examples.extend(THUMB + skin for skin in SKINS)  # <4>

# Show each example followed by the names of the characters it is made of.
for example in examples:
    print(example, end='\t')  # <5>
    print(' + '.join(name(char) for char in example))  # <6>
|
||||
6
04-text-byte/two_flags.py
Normal file
6
04-text-byte/two_flags.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# REGIONAL INDICATOR SYMBOLS
# Each print below outputs a pair of these symbols; the trailing comment
# gives the two-letter code that the pair spells.
RIS_A = '\U0001F1E6'  # LETTER A
RIS_U = '\U0001F1FA'  # LETTER U
print(RIS_A + RIS_U)  # AU: Australia
print(RIS_U + RIS_A)  # UA: Ukraine
print(RIS_A + RIS_A)  # AA: no such country
|
||||
28
04-text-byte/zwj_sample.py
Normal file
28
04-text-byte/zwj_sample.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from unicodedata import name
|
||||
|
||||
# Sample sequences, one per line, as three '|'-separated fields:
# space-separated hexadecimal code points | description | version tag.
# NOTE: the variable keeps its original spelling ``zwg_sample`` so that
# existing references to this module-level name keep working.
zwg_sample = """
1F468 200D 1F9B0                           |man: red hair                      |E11.0
1F9D1 200D 1F91D 200D 1F9D1                |people holding hands               |E12.0
1F3CA 1F3FF 200D 2640 FE0F                 |woman swimming: dark skin tone     |E4.0
1F469 1F3FE 200D 2708 FE0F                 |woman pilot: medium-dark skin tone |E4.0
1F468 200D 1F469 200D 1F467                |family: man, woman, girl           |E2.0
1F3F3 FE0F 200D 26A7 FE0F                  |transgender flag                   |E13.0
1F469 200D 2764 FE0F 200D 1F48B 200D 1F469 |kiss: woman, woman                 |E2.0
"""

# Characters shown as a short label appended to the previous line instead
# of on a line of their own.
markers = {'\u200D': 'ZWJ',  # ZERO WIDTH JOINER
           '\uFE0F': 'V16',  # VARIATION SELECTOR-16
           }

for line in zwg_sample.strip().split('\n'):
    code, descr, version = (s.strip() for s in line.split('|'))
    chars = [chr(int(c, 16)) for c in code.split()]
    # Header: the assembled sequence, then its version tag and description.
    print(''.join(chars), version, descr, sep='\t', end='')
    for char in chars:
        if char in markers:
            print(' + ' + markers[char], end='')
        else:
            ucode = f'U+{ord(char):04X}'
            # The default keeps the listing going when the running Python's
            # Unicode database has no name for this code point.
            char_name = name(char, '(unnamed)')
            print(f'\n\t{char}\t{ucode}\t{char_name}', end='')
    print()
|
||||
Reference in New Issue
Block a user