"""
Radical folding and diacritic mark removal.

Handling a string with `cp1252` symbols:

    >>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
    >>> shave_marks(order)
    '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
    >>> shave_marks_latin(order)
    '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
    >>> dewinize(order)
    '"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
    >>> asciize(order)
    '"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

Handling a string with Greek and Latin accented characters:

    >>> greek = 'Ζέφυρος, Zéfiro'
    >>> shave_marks(greek)
    'Ζεφυρος, Zefiro'
    >>> shave_marks_latin(greek)
    'Ζέφυρος, Zefiro'
    >>> dewinize(greek)
    'Ζέφυρος, Zéfiro'
    >>> asciize(greek)
    'Ζέφυρος, Zefiro'

"""

# tag::SHAVE_MARKS[]
import unicodedata
import string


def shave_marks(txt: str) -> str:
    """Remove all diacritic marks.

    The text is decomposed (NFD) so that each accented character becomes a
    base character followed by combining marks, every combining mark is
    dropped, and the remainder is recomposed (NFC).

    Args:
        txt: The text to process.

    Returns:
        A new string with every combining character removed.
    """
    norm_txt = unicodedata.normalize('NFD', txt)  # <1>
    shaved = ''.join(c for c in norm_txt
                     if not unicodedata.combining(c))  # <2>
    return unicodedata.normalize('NFC', shaved)  # <3>
# end::SHAVE_MARKS[]


# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt: str) -> str:
    """Remove all diacritic marks from Latin base characters.

    Combining marks that follow an ASCII letter are dropped; marks that
    follow any other base character (e.g. a Greek letter) are preserved.

    Args:
        txt: The text to process.

    Returns:
        A new NFC-normalized string in which only ASCII-letter bases have
        lost their diacritics.
    """
    norm_txt = unicodedata.normalize('NFD', txt)  # <1>
    latin_base = False  # no base character has been seen yet
    preserve = []
    for c in norm_txt:
        is_mark = bool(unicodedata.combining(c))
        if is_mark and latin_base:  # <2>
            continue  # ignore diacritic on Latin base char
        preserve.append(c)  # <3>
        # if it isn't a combining char, it's a new base char
        if not is_mark:  # <4>
            latin_base = c in string.ascii_letters
    shaved = ''.join(preserve)
    return unicodedata.normalize('NFC', shaved)  # <5>
# end::SHAVE_MARKS_LATIN[]


# tag::ASCIIZE[]
# One-to-one replacements: the n-th character of the first string is mapped
# to the n-th character of the second string.
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",  # <1>
                           """'f"^<''""---~>""")

# One-to-many replacements: a single character becomes a string.
multi_map = str.maketrans({  # <2>
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    # NOTE(review): this entry maps to the empty string, so the per-mille
    # sign is deleted outright -- confirm that this is intended and not a
    # replacement text that was lost.
    '‰': '',
    '†': '**',
    '‡': '***',
})

# Merge both tables so that a single translate() call applies them all.
multi_map.update(single_map)  # <3>


def dewinize(txt: str) -> str:
    """Replace Win1252 symbols with ASCII chars or sequences.

    Only the characters listed in ``multi_map`` are affected; accented
    letters and other non-ASCII text pass through unchanged.

    Args:
        txt: The text to process.

    Returns:
        A new string with the mapped symbols replaced.
    """
    return txt.translate(multi_map)  # <4>


def asciize(txt: str) -> str:
    """Fold text towards ASCII using the helpers in this module.

    Applies ``dewinize``, then ``shave_marks_latin``, then replaces 'ß'
    with 'ss', and finally applies NFKC normalization.

    The result may still contain non-ASCII characters: non-Latin letters
    keep their diacritics, and NFKC turns '½' into '1⁄2' written with the
    fraction slash character (see the module doctests).

    Args:
        txt: The text to process.

    Returns:
        The folded, NFKC-normalized string.
    """
    no_marks = shave_marks_latin(dewinize(txt))  # <5>
    no_marks = no_marks.replace('ß', 'ss')  # <6>
    return unicodedata.normalize('NFKC', no_marks)  # <7>
# end::ASCIIZE[]