"""
|
2021-02-15 00:28:07 +01:00
|
|
|
|
Radical folding and diacritic mark removal.
|
2014-10-14 19:26:55 +02:00
|
|
|
|
|
|
|
|
|
Handling a string with `cp1252` symbols:
|
|
|
|
|
|
|
|
|
|
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
|
|
|
|
|
>>> shave_marks(order)
|
|
|
|
|
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
|
|
|
|
>>> shave_marks_latin(order)
|
|
|
|
|
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
|
|
|
|
>>> dewinize(order)
|
|
|
|
|
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
|
|
|
|
|
>>> asciize(order)
|
|
|
|
|
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
|
|
|
|
|
|
|
|
|
|
Handling a string with Greek and Latin accented characters:
|
|
|
|
|
|
|
|
|
|
>>> greek = 'Ζέφυρος, Zéfiro'
|
|
|
|
|
>>> shave_marks(greek)
|
|
|
|
|
'Ζεφυρος, Zefiro'
|
|
|
|
|
>>> shave_marks_latin(greek)
|
|
|
|
|
'Ζέφυρος, Zefiro'
|
|
|
|
|
>>> dewinize(greek)
|
|
|
|
|
'Ζέφυρος, Zéfiro'
|
|
|
|
|
>>> asciize(greek)
|
|
|
|
|
'Ζέφυρος, Zefiro'
|
|
|
|
|
|
|
|
|
|
"""

# tag::SHAVE_MARKS[]
import unicodedata
import string


def shave_marks(txt):
    """Remove all diacritic marks"""
    # Decompose so every accent becomes its own combining codepoint.  # <1>
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):  # <2>
            continue  # drop the combining mark, keep everything else
        kept.append(ch)
    # Recompose the survivors into canonical (NFC) form.  # <3>
    return unicodedata.normalize('NFC', ''.join(kept))
# end::SHAVE_MARKS[]


# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    kept = []
    base_is_latin = False
    for ch in decomposed:
        is_mark = unicodedata.combining(ch) != 0
        if is_mark and base_is_latin:  # <2>
            continue  # discard diacritic attached to a Latin base char
        kept.append(ch)  # <3>
        # A non-combining character starts a new base character.
        if not is_mark:  # <4>
            base_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# end::SHAVE_MARKS_LATIN[]


# tag::ASCIIZE[]
# cp1252 symbols whose ASCII stand-in is a single character.  # <1>
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",
                           """'f"^<''""---~>""")

# cp1252 symbols whose ASCII stand-in is more than one character.  # <2>
_multi_char_replacements = {
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
}
multi_map = str.maketrans(_multi_char_replacements)

multi_map.update(single_map)  # <3>
def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    # multi_map already merges the single- and multi-char tables.  # <4>
    cleaned = txt.translate(multi_map)
    return cleaned
def asciize(txt):
    """Dewinize, strip Latin diacritics, fold Eszett, and normalize to NFKC."""
    dewinized = dewinize(txt)  # <5>
    no_marks = shave_marks_latin(dewinized)
    # 'ß' has no combining decomposition, so substitute it by hand.  # <6>
    no_marks = no_marks.replace('ß', 'ss')
    # NFKC replaces characters with their compatibility equivalents.  # <7>
    return unicodedata.normalize('NFKC', no_marks)
# end::ASCIIZE[]