import pathlib
import re
import unicodedata

# Non-breaking space (U+00A0). Spelled as an escape so it cannot be confused
# with, or silently converted to, an ordinary space by an editor or tool.
_NBSP = '\u00a0'

# Opening/closing typographic quotes per supported language.
_QUOTES = {
    'fr': ('«', '»'),
    'en': ('“', '”'),
}


def format_list(xs):
    """Render *xs* as a bracketed, comma-separated list of double-quoted items.

    Each item is interpolated with ``str()`` semantics and wrapped in ``"``;
    no escaping is applied to the items themselves.
    """
    quoted = ', '.join(f'"{x}"' for x in xs)
    return '[ ' + quoted + ' ]'


def path_part(name):
    """Turn *name* into a lowercase, accent-free, dash-separated slug.

    Every non-alphanumeric character acts as a separator; runs of separators
    collapse into a single ``-`` and leading/trailing ones are dropped.
    """
    simplified = ''.join(alnum_or_space(c) for c in unaccent(name.lower()))
    return '-'.join(simplified.split())


def unaccent(s):
    """Return *s* in NFD form with all non-spacing marks (category Mn) removed."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def alnum_or_space(c):
    """Return *c* unchanged if it is alphanumeric, otherwise a single space."""
    return c if c.isalnum() else ' '


def extension(path):
    """Return the final suffix of *path* (including the dot), or ``''``."""
    return pathlib.Path(path).suffix


def cleanup_text(s, lang):
    """Apply typographic clean-up to *s* for the language *lang*.

    For every language: straight apostrophes become ``’``, ``...`` and
    ``. . .`` become ``…``, and straight double quotes are converted by
    :func:`cleanup_quotes`.

    For ``lang == 'fr'``: curly quotes become guillemets, and a non-breaking
    space is placed before ``:``, ``?``, ``!``, ``»`` and after ``«``.
    For ``lang == 'en'``: guillemets become curly quotes.
    Any other *lang* gets only the language-independent substitutions.

    NOTE: the French spacing rules are purely textual, so a ``:`` inside a
    URL or a time of day is spaced like any other.
    """
    s = s.replace("'", '’')
    s = s.replace('...', '…')
    s = s.replace('. . .', '…')
    s = cleanup_quotes(s, lang)

    if lang == 'fr':
        s = s.replace('“', '«').replace('”', '»')

        # Turn an ordinary space next to the punctuation into a non-breaking one.
        s = re.sub(r' ([:?!»])', rf'{_NBSP}\1', s)
        s = s.replace('« ', '«' + _NBSP)

        # Insert the non-breaking space where none is present yet. The first
        # group must be a real character, so punctuation at the very start of
        # the string is left alone.
        s = re.sub(rf'([^{_NBSP}])([:?!»])', rf'\1{_NBSP}\2', s)
        s = re.sub(rf'«([^{_NBSP}])', rf'«{_NBSP}\1', s)
    elif lang == 'en':
        s = s.replace('«', '“').replace('»', '”')

    return s


def cleanup_quotes(s, lang):
    """Replace straight double quotes in *s* by paired typographic quotes.

    Quotes are assumed to alternate: the first ``"`` opens, the next closes,
    and so on. ``'fr'`` uses ``«``/``»`` and ``'en'`` uses ``“``/``”``. For any
    other *lang* the straight quote is kept as is rather than dropped.
    """
    opening, closing = _QUOTES.get(lang, ('"', '"'))
    parts = []
    quoted = False
    for c in s:
        if c == '"':
            parts.append(closing if quoted else opening)
            quoted = not quoted
        else:
            parts.append(c)
    return ''.join(parts)