Python源码示例:unicodedata.normalize()
示例1
def _make_content_disposition(disposition, file_name):
    """Create a ``Content-Disposition`` header value for a file download.

    Implements the recommendations of :rfc:`6266#appendix-D`: a plain ASCII
    ``filename`` parameter is always emitted and, when the name cannot be
    represented losslessly in ASCII, an additional ``filename*`` parameter
    carries the percent-encoded UTF-8 name.
    See this and related answers: https://stackoverflow.com/a/8996249/2173868.

    Args:
        disposition: Disposition type, e.g. ``"attachment"`` or ``"inline"``.
        file_name: File name to advertise; may contain non-ASCII characters.

    Returns:
        str: The header value.
    """
    # NFKC applies the compatibility decomposition followed by the canonical
    # composition, so compatibility variants (e.g. fullwidth letters) fold to
    # their plain form before everything that is still non-ASCII is dropped.
    # See: https://en.wikipedia.org/wiki/Unicode_equivalence.
    ascii_name = (
        unicodedata.normalize('NFKC', file_name)
        .encode('ascii', errors='ignore')
        .decode()
    )
    # The fallback is an HTTP quoted-string: backslashes and double quotes
    # must be escaped, otherwise a quote in the name ends the value early.
    escaped_name = ascii_name.replace('\\', '\\\\').replace('"', '\\"')
    header = '{}; filename="{}"'.format(disposition, escaped_name)
    if ascii_name != file_name:
        # safe='' so that "/" is percent-encoded too; it is not an attr-char
        # and therefore may not appear literally in an extended value.
        quoted_name = urllib.parse.quote(file_name, safe='')
        header += "; filename*=UTF-8''{}".format(quoted_name)
    return header
示例2
def _tokenize(self, text):
    """Split ``text`` into word-piece tokens.

    When the tokenizer is uncased, the text is first NFD-decomposed, stripped
    of nonspacing marks (category ``Mn``) and lower-cased. Punctuation and CJK
    characters are then isolated with surrounding spaces, whitespace
    characters become a single space, and NUL, U+FFFD and control characters
    are dropped. Each whitespace-separated word is passed to
    ``self._word_piece_tokenize`` and the results are concatenated.
    """
    if not self._cased:
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn'
        ).lower()

    pieces = []
    for ch in text:
        if self._is_punctuation(ch) or self._is_cjk_character(ch):
            pieces.append(' ' + ch + ' ')
        elif self._is_space(ch):
            pieces.append(' ')
        elif not (ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch)):
            pieces.append(ch)

    tokens = []
    for word in ''.join(pieces).split():
        tokens.extend(self._word_piece_tokenize(word))
    return tokens
示例3
def secure_filename(filename):
    """Return an ASCII-only version of ``filename`` built from safe characters.

    Text input is NFKD-normalised and reduced to ASCII, path separators are
    turned into spaces, whitespace runs become underscores, characters
    matched by ``_filename_ascii_strip_re`` are removed and leading/trailing
    dots and underscores are stripped. The result may be an empty string.
    """
    if isinstance(filename, str):
        from unicodedata import normalize
        ascii_bytes = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = ascii_bytes.decode('ascii')

    for sep in (os.path.sep, os.path.altsep):
        if not sep:
            continue
        filename = filename.replace(sep, ' ')

    underscored = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', underscored)).strip('._')

    # On nt a couple of special files are present in each folder. The result
    # must not be one of those names, so an underscore is prepended if it is.
    is_device_name = (
        os.name == 'nt'
        and filename
        and filename.split('.')[0].upper() in _windows_device_files
    )
    if is_device_name:
        filename = '_' + filename
    return filename
示例4
def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Bytestrings are decoded with ``encoding``; the Unicode result is
    normalised to form ``norm`` when one is given.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.
    """
    text = s if isinstance(s, unicode) else unicode(s, encoding)
    if not norm:
        return text
    from unicodedata import normalize
    return normalize(norm, text)
示例5
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        return text
    # Apply the explicit replacement table first, then let NFKD split off
    # what remains so the ASCII encode can discard the non-ASCII parts.
    replaced = ''.join(ASCII_REPLACEMENTS.get(char, char) for char in text)
    decomposed = unicodedata.normalize('NFKD', replaced)
    return unicode(decomposed.encode('ascii', 'ignore'))
示例6
def unicodify(s, encoding='utf-8', norm=None):
    """Return ``s`` as a Unicode string, optionally normalised.

    .. versionadded:: 1.31

    Anything that is not already Unicode is decoded using ``encoding``.
    If ``norm`` is given, the result is normalised to that form.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.
    """
    if not isinstance(s, unicode):
        s = unicode(s, encoding)
    if not norm:
        return s
    from unicodedata import normalize
    return normalize(norm, s)
示例7
def fold_to_ascii(self, text):
    """Fold ``text`` down to ASCII-only characters.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if isascii(text):
        return text
    mapped = []
    for ch in text:
        mapped.append(ASCII_REPLACEMENTS.get(ch, ch))
    ascii_bytes = unicodedata.normalize('NFKD', ''.join(mapped)).encode(
        'ascii', 'ignore')
    return unicode(ascii_bytes)
示例8
def strdisplaywidth(self, s):
    """Return the number of display columns occupied by ``s``.

    The string is NFC-normalised first so that a base character followed by
    a combining mark is measured as its single precomposed character where
    one exists. Each character then counts as two columns if its East Asian
    Width property is Wide (``W``) or Fullwidth (``F``) and as one column
    otherwise.

    :param s: text to measure
    :returns: total display width in columns
    """
    def get_char_display_width(unicode_str):
        # Wide and Fullwidth characters take two columns. Halfwidth, Narrow
        # and Neutral take one; Ambiguous ("A") is treated as narrow as well.
        if unicodedata.east_asian_width(unicode_str) in ('W', 'F'):
            return 2
        return 1

    normalized = unicodedata.normalize('NFC', s)
    return sum(get_char_display_width(c) for c in normalized)
示例9
def normalizestr(string):
    """Convert special characters (copyright, trademark signs, ...) to ASCII.

    Every ``(mark, replacement)`` pair yielded by ``unicode_marks`` is
    substituted in the input, the text is NFKC-normalised and stripped, and
    the result is passed through ``unidecode``. A message is printed when
    the result differs from the input.
    """
    original = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)
    composed = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(composed)
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
示例10
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode
    string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    if not isinstance(text, unicode):
        # Byte strings are assumed to be utf8; decoding errors are strict.
        text = text.decode('utf8')
    kept = []
    for ch in unicodedata.normalize("NFD", text):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return unicodedata.normalize("NFC", u('').join(kept))
示例11
def fold_to_ascii(self, text):
    """Replace non-ASCII characters with their closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    if not isascii(text):
        substituted = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
        normalized = unicodedata.normalize('NFKD', substituted)
        return unicode(normalized.encode('ascii', 'ignore'))
    return text
示例12
def _byteify(data, ignore_dicts = False):
    """Convert unicode strings inside ``data`` to ASCII byte strings.

    Unicode strings are NFKD-normalised and encoded to ASCII with
    unencodable characters dropped. Lists are converted element by element.
    A dict has its keys and values converted only when ``ignore_dicts`` is
    false. Recursive calls always pass ``ignore_dicts=True``, so nested
    dicts are returned untouched. Any other value is returned as is.
    """
    if isinstance(data, unicode):
        decomposed = unicodedata.normalize('NFKD', data)
        return decomposed.encode('ascii', 'ignore')
    if isinstance(data, list):
        return [_byteify(element, ignore_dicts=True) for element in data]
    if ignore_dicts or not isinstance(data, dict):
        # Not a convertible container: hand it back in its original form.
        return data
    converted = {}
    for key, value in data.iteritems():
        converted[_byteify(key, ignore_dicts=True)] = _byteify(
            value, ignore_dicts=True)
    return converted
示例13
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit = -1):
    """Yield one dict per row returned by ``self.list_epics()``.

    Each yielded dict carries a ``"query_date"`` entry (the time this call
    started) plus every key of the row converted with ``str``. Unicode
    values are NFKD-normalised and encoded to ASCII, dropping characters
    that cannot be encoded. Iteration stops once ``records_limit`` rows have
    been produced; a negative limit means no limit.
    """
    query_date = datetime.datetime.now()
    rows = self.list_epics()
    if len(rows) == 0:
        logging.info("Not epics.")
        return

    for nb, row in enumerate(rows):
        if 0 <= records_limit <= nb:
            logging.info("Reached records_limit (%i), stopping." % records_limit)
            return
        encoded_row = {"query_date": query_date}
        for key in row:
            val = row[key]
            if isinstance(val, unicode):
                val = unicodedata.normalize('NFKD', val).encode('ascii','ignore')
            encoded_row[str(key)] = val
        yield encoded_row
def setup(self):
    """Read the normalisation and encoding options from ``self.config``.

    Sets ``self.normalize`` (upper-cased), ``self.errors`` (lower-cased) and
    ``self.convert_encoding``. When an encoding conversion is configured, the
    encoding is resolved to its canonical codec name and adjusted so that no
    BOM is generated.
    """
    self.normalize = self.config['normalize'].upper()
    self.convert_encoding = self.config['convert_encoding'].lower()
    self.errors = self.config['errors'].lower()

    if not self.convert_encoding:
        return

    # NOTE(review): the codec is looked up from self.default_encoding rather
    # than from the configured convert_encoding value - confirm intended.
    requested = filters.PYTHON_ENCODING_NAMES.get(
        self.default_encoding, self.default_encoding
    ).lower()
    self.convert_encoding = codecs.lookup(requested).name

    # Don't generate content with BOMs
    is_wide_utf = self.convert_encoding.startswith(('utf-32', 'utf-16'))
    if is_wide_utf and not self.convert_encoding.endswith(('le', 'be')):
        self.convert_encoding += '-le'

    if self.convert_encoding == 'utf-8-sig':
        self.convert_encoding = 'utf-8'
示例15
def normalize_string(text):
    ''' For theme media, do not modify unless modified in TV Tunes.

        Removes ``: < > * ? |``, turns both slash kinds into ``-``, strips
        surrounding whitespace and trailing dots (Windows can not have
        directories with dots at the end), then decodes the text as UTF-8
        and reduces it to ASCII via NFKD normalisation.
    '''
    substitutions = (
        (":", ""),
        ("/", "-"),
        ("\\", "-"),
        ("<", ""),
        (">", ""),
        ("*", ""),
        ("?", ""),
        ('|', ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    text = text.strip().rstrip('.')
    decoded = unicode(text, 'utf-8')
    return unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')
示例16
def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    # Normalisation matters for any program that has to treat Unicode text
    # consistently, especially for user input whose encoding is not under
    # our control. NFD decomposes each character into its base character
    # followed by separate combining characters.
    decomposed = unicodedata.normalize("NFD", text)
    # category() returns the Unicode general category of a character.
    # "Mn" (Mark, Nonspacing) marks characters that modify a base character:
    # https://www.fileformat.info/info/unicode/category/Mn/list.htm
    return "".join(
        char for char in decomposed if unicodedata.category(char) != "Mn"
    )
示例17
def normalize(self, form):
    """
    Return the Unicode normal form for the strings in the Series/Index.

    For more information on the forms, see the
    :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form

    Returns
    -------
    normalized : Series/Index of objects
    """
    import unicodedata

    def to_form(value):
        return unicodedata.normalize(form, compat.u_safe(value))

    return self._wrap_result(_na_map(to_form, self._parent))
示例18
def clean_id(name, preserve_case=False):
    """
    Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP
    function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert
    slashes to colons, call make_dokuwiki_pagename)
    """
    stem, suffix = os.path.splitext(name)
    # Remove accents: decompose, then keep only the ASCII code points.
    try:
        ascii_stem = ''.join(
            ch for ch in unicodedata.normalize("NFKD", stem) if ord(ch) < 0x7f
        )
    except TypeError:
        ascii_stem = stem  # name was plaintext to begin with
    # Every run of other characters becomes one underscore.
    cleaned = re.sub(r'[^\w/:-]+', '_', ascii_stem) + suffix
    if not preserve_case:
        cleaned = cleaned.lower()
    # Underscores already in the name count as word characters, so the
    # substitution above can leave runs of them; collapse those here.
    return re.sub(r'_{2,}', '_', cleaned)
示例19
def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    def is_not_nonspacing_mark(char):
        # "Mn" is the Unicode category of nonspacing (combining) marks.
        return unicodedata.category(char) != "Mn"

    # NFD splits accented characters into base character + combining marks.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(filter(is_not_nonspacing_mark, decomposed))
示例20
def ustr(s):
    """Decode the UTF-8 byte string `s` and return it in NFD form."""
    decoded = s.decode('utf-8')
    return normalize('NFD', decoded)
示例21
def text(self):
    """Unicode-decoded content of response body.

    If no encoding can be determined from HTTP headers or the content
    itself, the encoded response body will be returned instead.

    :returns: Body of HTTP response, NFC-normalised when it was decoded
    :rtype: unicode or str
    """
    if not self.encoding:
        return self.content
    decoded = unicode(self.content, self.encoding)
    return unicodedata.normalize('NFC', decoded)
示例22
def decode(self, text, encoding=None, normalization=None):
    """Return ``text`` as normalised unicode.

    If ``encoding`` and/or ``normalization`` is ``None``, the
    ``input_encoding`` and ``normalization`` parameters passed to
    :class:`Workflow` are used.

    :param text: string
    :type text: encoded or Unicode string. If ``text`` is already a
        Unicode string, it will only be normalised.
    :param encoding: The text encoding to use to decode ``text`` to
        Unicode.
    :type encoding: ``unicode`` or ``None``
    :param normalization: The normalisation form to apply to ``text``.
    :type normalization: ``unicode`` or ``None``
    :returns: decoded and normalised ``unicode``

    :class:`Workflow` uses "NFC" normalisation by default. This is the
    standard for Python and will work well with data from the web (via
    :mod:`~workflow.web` or :mod:`json`).

    macOS, on the other hand, uses "NFD" normalisation (nearly), so data
    coming from the system (e.g. via :mod:`subprocess` or
    :func:`os.listdir`/:mod:`os.path`) may not match. You should either
    normalise this data, too, or change the default normalisation used by
    :class:`Workflow`.
    """
    if not encoding:
        encoding = self._input_encoding
    if not normalization:
        normalization = self._normalizsation
    decoded = text if isinstance(text, unicode) else unicode(text, encoding)
    return unicodedata.normalize(normalization, decoded)
示例23
def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    # After NFD decomposition accents are separate characters of category
    # "Mn" (nonspacing mark); everything else is kept in order.
    base_chars = [
        symbol
        for symbol in unicodedata.normalize("NFD", text)
        if unicodedata.category(symbol) != "Mn"
    ]
    return "".join(base_chars)
示例24
def normalizer(query):
    """Return ``query`` as lower-case ASCII with single-space separators.

    The text is NFKC-normalised, characters that are still non-ASCII are
    dropped, and every whitespace run (including leading/trailing) is
    collapsed.
    """
    composed = unicodedata.normalize('NFKC', query)
    ascii_only = composed.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(ascii_only.lower().split())
示例25
def text_normalize(text):
    """Strip accents, lower-case and restrict ``text`` to ``hp.vocab``.

    Characters outside ``hp.vocab`` are replaced by a space and runs of
    spaces are collapsed to one.
    """
    # Strip accents: drop the nonspacing marks exposed by NFD decomposition.
    decomposed = unicodedata.normalize('NFD', text)
    unaccented = ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )
    lowered = unaccented.lower()
    in_vocab = re.sub("[^{}]".format(hp.vocab), " ", lowered)
    return re.sub("[ ]+", " ", in_vocab)
示例26
def strip_accents(s):
    """Return the UTF-8 byte string ``s`` with nonspacing marks removed.

    The input is decoded as UTF-8, NFKD-decomposed, stripped of characters
    in category ``Mn`` and encoded back to UTF-8.
    """
    decoded = unicode(s, "utf-8")
    kept = [c for c in ud.normalize('NFKD', decoded) if ud.category(c) != 'Mn']
    return ''.join(kept).encode("utf-8")
示例27
def strip_accents(text):
    """Strips accents from a piece of text."""
    # NFD separates accents into combining characters of category "Mn".
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
示例28
def removeDisallowedFilenameChars(filename):
    """Return ``filename`` reduced to ASCII letters, digits and ``-_.() ``.

    The name is NFKD-normalised and encoded to ASCII (dropping what cannot
    be encoded) before every character outside the allowed set is removed.
    """
    allowed = frozenset("-_.() " + string.ascii_letters + string.digits)
    ascii_bytes = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore')
    return ''.join(ch for ch in map(chr, ascii_bytes) if ch in allowed)
示例29
def normalize_string(txt: Union[str, bytes]) -> str:
    """Return ``txt`` as an NFKD-normalised string.

    Bytes are decoded as UTF-8 first. Any other type raises
    ``ValidationError``.
    """
    if isinstance(txt, bytes):
        return unicodedata.normalize("NFKD", txt.decode("utf8"))
    if isinstance(txt, str):
        return unicodedata.normalize("NFKD", txt)
    raise ValidationError("String value expected")
示例30
def specialchar(s):
    """Return ``s`` NFD-decomposed with all nonspacing marks removed."""
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            kept.append(ch)
    return ''.join(kept)