# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's `Universal
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained
by Kurt McKee. It does not rewrite the body of an XML or HTML document
to reflect a new encoding; that's the job of `TreeBuilder`.

"""
|
||
|
||
# Use of this source code is governed by the MIT license.
|
||
__license__ = "MIT"
|
||
|
||
import codecs
import importlib
import re
import warnings
from collections import defaultdict
from html.entities import codepoint2name, html5
from logging import Logger, getLogger
from types import ModuleType
from typing import (
    Dict,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)

from typing_extensions import Literal

from bs4._typing import (
    _Encoding,
    _Encodings,
)
|
||
|
||
# Import a library to autodetect character encodings. We'll support
|
||
# any of a number of libraries that all support the same API:
|
||
#
|
||
# * cchardet
|
||
# * chardet
|
||
# * charset-normalizer
|
||
chardet_module: Optional[ModuleType] = None
|
||
try:
|
||
# PyPI package: cchardet
|
||
import cchardet
|
||
|
||
chardet_module = cchardet
|
||
except ImportError:
|
||
try:
|
||
# Debian package: python-chardet
|
||
# PyPI package: chardet
|
||
import chardet
|
||
|
||
chardet_module = chardet
|
||
except ImportError:
|
||
try:
|
||
# PyPI package: charset-normalizer
|
||
import charset_normalizer
|
||
|
||
chardet_module = charset_normalizer
|
||
except ImportError:
|
||
# No chardet available.
|
||
pass
|
||
|
||
|
||
def _chardet_dammit(s: bytes) -> Optional[str]:
|
||
"""Try as hard as possible to detect the encoding of a bytestring."""
|
||
if chardet_module is None or isinstance(s, str):
|
||
return None
|
||
module = chardet_module
|
||
return module.detect(s)["encoding"]
|
||
|
||
|
||
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"  #: :meta private:
html_meta: str = (
    "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"  #: :meta private:
)

# TODO-TYPING: The Pattern type here could use more refinement, but it's tricky.
# One table per input type: compiled bytes patterns for bytestring markup,
# compiled str patterns for Unicode markup.
encoding_res: Dict[Type, Dict[str, Pattern]] = {
    bytes: {
        "html": re.compile(html_meta.encode("ascii"), re.I),
        "xml": re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        "html": re.compile(html_meta, re.I),
        "xml": re.compile(xml_encoding, re.I),
    },
}
|
||
|
||
|
||
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: A map of named HTML entities to the corresponding Unicode string.
    #:
    #: :meta hide-value:
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: A map of Unicode strings to the corresponding named HTML entities;
    #: the inverse of HTML_ENTITY_TO_CHARACTER.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: A regular expression that matches any character (or, in rare
    #: cases, pair of characters) that can be replaced with a named
    #: HTML entity.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: A very similar regular expression to
    #: CHARACTER_TO_HTML_ENTITY_RE, but which also matches unescaped
    #: ampersands. This is used by the 'html' formatter to provide
    #: backwards-compatibility, even though the HTML5 spec allows most
    #: ampersands to go unescaped.
    #:
    #: :meta hide-value:
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
        also matches unescaped ampersands. This is used by the 'html'
        formatter to provide backwards-compatibility, even though the HTML5
        spec allows most ampersands to go unescaped.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like
                # '&fjlig;', though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )

    #: A map of Unicode strings to the corresponding named XML entities.
    #:
    #: :meta hide-value:
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    #: Matches any named or numeric HTML entity.
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: A regular expression matching an angle bracket or an ampersand that
    #: is not part of an XML or HTML entity.
    #:
    #: :meta hide-value:
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: A regular expression matching an angle bracket or an ampersand.
    #:
    #: :meta hide-value:
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            # Defensive branch: every string the substitution regexes
            # can match should have an entry in
            # CHARACTER_TO_HTML_ENTITY (even "&" itself, which comes
            # in via codepoint2name), so this is not expected to run.
            return "&%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to escape the ampersand that
        introduces what looks like an HTML entity, e.g. turning
        "&lt;" into "&amp;lt;"."""
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of anything that looks like an HTML
        entity but isn't a recognized entity name; leave recognized
        entities untouched."""
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            # This is a real entity name; pass it through unchanged.
            return "&%s;" % possible_entity
        # Not a recognized entity; escape the ampersand so the text
        # reads literally.
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
            quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
            with named entities.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
            become &lt;, the greater-than sign will become &gt;, and any
            ampersands that are not part of an entity definition will
            become &amp;.

        :param make_quoted_attribute: If True, then the string will be
            quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # Convert any appropriate characters to HTML entities.
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # First, escape any HTML entities found in the markup.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Next, convert any appropriate characters to unescaped HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
            HTML entities.
        """
        # First, escape the ampersand for anything that looks like an
        # entity but isn't in the list of recognized entities. All other
        # ampersands can be left alone.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then, convert a range of Unicode characters to unescaped
        # HTML entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s
|
||
|
||
|
||
# Build the entity lookup tables and regular expressions as soon as the
# class is defined, so the class-level constants are ready for use.
EntitySubstitution._populate_class_variables()
|
||
|
||
|
||
class EncodingDetector:
    """This class is capable of guessing a number of possible encodings
    for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the ``known_definite_encodings`` argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the ``user_encodings`` argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of ``markup``, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: These encodings will be tried after the
        ``known_definite_encodings`` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Any encodings here will be tried
        immediately after the encodings in
        ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: These encodings will not be tried,
        even if they otherwise would be.

    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            # Deprecated alias: warn, then fold these encodings in
            # right after the known definite ones.
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        # Exclusions are matched case-insensitively, so normalize now.
        self.exclude_encodings = {candidate.lower() for candidate in (exclude_encodings or [])}
        self.chardet_encoding = None
        self.is_html = False if is_html is None else is_html
        self.declared_encoding: Optional[str] = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This
            will be modified as a side effect.
        """
        if encoding is None:
            return False
        normalized = encoding.lower()
        if normalized in self.exclude_encodings or normalized in tried:
            return False
        tried.add(normalized)
        return True

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
            that *might* work to convert a bytestring into Unicode.
        """
        tried: Set[_Encoding] = set()

        # First, try the known definite encodings
        for candidate in self.known_definite_encodings:
            if self._usable(candidate, tried):
                yield candidate

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        bom_encoding = self.sniffed_encoding
        if bom_encoding is not None and self._usable(bom_encoding, tried):
            yield bom_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for candidate in self.user_encodings:
            if self._usable(candidate, tried):
                yield candidate

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached for subsequent iterations.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        if self.declared_encoding is not None and self._usable(
            self.declared_encoding, tried
        ):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. Also cached.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        if self.chardet_encoding is not None and self._usable(
            self.chardet_encoding, tried
        ):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for fallback in ("utf-8", "windows-1252"):
            if self._usable(fallback, tried):
                yield fallback

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
            byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, None
        encoding: Optional[_Encoding] = None
        prefix = data[:4]
        # The UTF-16 checks require at least four bytes so that a
        # UTF-32 BOM is not mistaken for a UTF-16 one.
        if len(data) >= 4 and prefix[:2] == b"\xfe\xff" and prefix[2:4] != b"\x00\x00":
            encoding, data = "utf-16be", data[2:]
        elif len(data) >= 4 and prefix[:2] == b"\xff\xfe" and prefix[2:4] != b"\x00\x00":
            encoding, data = "utf-16le", data[2:]
        elif data.startswith(b"\xef\xbb\xbf"):
            encoding, data = "utf-8", data[3:]
        elif prefix == b"\x00\x00\xfe\xff":
            encoding, data = "utf-32be", data[4:]
        elif prefix == b"\xff\xfe\x00\x00":
            encoding, data = "utf-32le", data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Given a document, tries to find an encoding declared within the
        text of the document itself.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed
            to declared near the beginning of the document, most of
            the time it's only necessary to search a few kilobytes of
            data. Set this to True to force this method to search the
            entire document.
        :return: The declared encoding, if one is found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the pattern table that matches the markup's type.
        res = encoding_res[bytes] if isinstance(markup, bytes) else encoding_res[str]

        match = res["xml"].search(markup, endpos=xml_endpos)
        if match is None and is_html:
            match = res["html"].search(markup, endpos=html_endpos)
        if match is None:
            return None
        declared = match.groups()[0]
        if not declared:
            return None
        if isinstance(declared, bytes):
            declared = declared.decode("ascii", "replace")
        return declared.lower()
|
||
|
||
|
||
class UnicodeDammit:
|
||
"""A class for detecting the encoding of a bytestring containing an
|
||
HTML or XML document, and decoding it to Unicode. If the source
|
||
encoding is windows-1252, `UnicodeDammit` can also replace
|
||
Microsoft smart quotes with their HTML or XML equivalents.
|
||
|
||
:param markup: HTML or XML markup in an unknown encoding.
|
||
|
||
:param known_definite_encodings: When determining the encoding
|
||
of ``markup``, these encodings will be tried first, in
|
||
order. In HTML terms, this corresponds to the "known
|
||
definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.
|
||
|
||
:param user_encodings: These encodings will be tried after the
|
||
``known_definite_encodings`` have been tried and failed, and
|
||
after an attempt to sniff the encoding by looking at a
|
||
byte order mark has failed. In HTML terms, this
|
||
corresponds to the step "user has explicitly instructed
|
||
the user agent to override the document's character
|
||
encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.
|
||
|
||
:param override_encodings: A **deprecated** alias for
|
||
``known_definite_encodings``. Any encodings here will be tried
|
||
immediately after the encodings in
|
||
``known_definite_encodings``.
|
||
|
||
:param smart_quotes_to: By default, Microsoft smart quotes will,
|
||
like all other characters, be converted to Unicode
|
||
characters. Setting this to ``ascii`` will convert them to ASCII
|
||
quotes instead. Setting it to ``xml`` will convert them to XML
|
||
entity references, and setting it to ``html`` will convert them
|
||
to HTML entity references.
|
||
|
||
:param is_html: If True, ``markup`` is treated as an HTML
|
||
document. Otherwise it's treated as an XML document.
|
||
|
||
:param exclude_encodings: These encodings will not be considered,
|
||
even if the sniffing code thinks they might make sense.
|
||
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
markup: bytes,
|
||
known_definite_encodings: Optional[_Encodings] = [],
|
||
smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
|
||
is_html: bool = False,
|
||
exclude_encodings: Optional[_Encodings] = [],
|
||
user_encodings: Optional[_Encodings] = None,
|
||
override_encodings: Optional[_Encodings] = None,
|
||
):
|
||
self.smart_quotes_to = smart_quotes_to
|
||
self.tried_encodings = []
|
||
self.contains_replacement_characters = False
|
||
self.is_html = is_html
|
||
self.log = getLogger(__name__)
|
||
self.detector = EncodingDetector(
|
||
markup,
|
||
known_definite_encodings,
|
||
is_html,
|
||
exclude_encodings,
|
||
user_encodings,
|
||
override_encodings,
|
||
)
|
||
|
||
# Short-circuit if the data is in Unicode to begin with.
|
||
if isinstance(markup, str) or markup == b"":
|
||
self.markup = markup
|
||
self.unicode_markup = str(markup)
|
||
self.original_encoding = None
|
||
return
|
||
|
||
# The encoding detector may have stripped a byte-order mark.
|
||
# Use the stripped markup from this point on.
|
||
self.markup = self.detector.markup
|
||
|
||
u = None
|
||
for encoding in self.detector.encodings:
|
||
markup = self.detector.markup
|
||
u = self._convert_from(encoding)
|
||
if u is not None:
|
||
break
|
||
|
||
if not u:
|
||
# None of the encodings worked. As an absolute last resort,
|
||
# try them again with character replacement.
|
||
|
||
for encoding in self.detector.encodings:
|
||
if encoding != "ascii":
|
||
u = self._convert_from(encoding, "replace")
|
||
if u is not None:
|
||
self.log.warning(
|
||
"Some characters could not be decoded, and were "
|
||
"replaced with REPLACEMENT CHARACTER."
|
||
)
|
||
|
||
self.contains_replacement_characters = True
|
||
break
|
||
|
||
# If none of that worked, we could at this point force it to
|
||
# ASCII, but that would destroy so much data that I think
|
||
# giving up is better.
|
||
#
|
||
# Note that this is extremely unlikely, probably impossible,
|
||
# because the "replace" strategy is so powerful. Even running
|
||
# the Python binary through Unicode, Dammit gives you Unicode,
|
||
# albeit Unicode riddled with REPLACEMENT CHARACTER.
|
||
if u is None:
|
||
self.original_encoding = None
|
||
self.unicode_markup = None
|
||
else:
|
||
self.unicode_markup = u
|
||
|
||
    #: The original markup, before it was converted to Unicode.
    #: This is not necessarily the same as what was passed in to the
    #: constructor, since any byte-order mark will be stripped.
    markup: bytes

    #: The Unicode version of the markup, following conversion. This
    #: is set to None if there was simply no way to convert the
    #: bytestring to Unicode (as with binary data).
    unicode_markup: Optional[str]

    #: This is True if `UnicodeDammit.unicode_markup` contains
    #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
    #: in `UnicodeDammit.markup`. These mark character sequences that
    #: could not be represented in Unicode.
    contains_replacement_characters: bool

    #: Unicode, Dammit's best guess as to the original character
    #: encoding of `UnicodeDammit.markup`.
    original_encoding: Optional[_Encoding]

    #: The strategy used to handle Microsoft smart quotes
    #: (None, "ascii", "xml", or "html"; see the constructor docs).
    smart_quotes_to: Optional[str]

    #: The (encoding, error handling strategy) 2-tuples that were used to
    #: try and convert the markup to Unicode.
    tried_encodings: List[Tuple[_Encoding, str]]

    #: Logger used to report lossy decoding fallbacks.
    log: Logger  #: :meta private:
|
||
|
||
def _sub_ms_char(self, match: re.Match) -> bytes:
|
||
"""Changes a MS smart quote character to an XML or HTML
|
||
entity, or an ASCII character.
|
||
|
||
TODO: Since this is only used to convert smart quotes, it
|
||
could be simplified, and MS_CHARS_TO_ASCII made much less
|
||
parochial.
|
||
"""
|
||
orig: bytes = match.group(1)
|
||
sub: bytes
|
||
if self.smart_quotes_to == "ascii":
|
||
if orig in self.MS_CHARS_TO_ASCII:
|
||
sub = self.MS_CHARS_TO_ASCII[orig].encode()
|
||
else:
|
||
# Shouldn't happen; substitute the character
|
||
# with itself.
|
||
sub = orig
|
||
else:
|
||
if orig in self.MS_CHARS:
|
||
substitutions = self.MS_CHARS[orig]
|
||
if type(substitutions) is tuple:
|
||
if self.smart_quotes_to == "xml":
|
||
sub = b"&#x" + substitutions[1].encode() + b";"
|
||
else:
|
||
sub = b"&" + substitutions[0].encode() + b";"
|
||
else:
|
||
substitutions = cast(str, substitutions)
|
||
sub = substitutions.encode()
|
||
else:
|
||
# Shouldn't happen; substitute the character
|
||
# for itself.
|
||
sub = orig
|
||
return sub
|
||
|
||
#: This dictionary maps commonly seen values for "charset" in HTML
#: meta tags to the corresponding Python codec names. It only covers
#: values that aren't in Python's aliases and can't be determined
#: by the heuristics in `find_codec`.
#:
#: :meta hide-value:
CHARSET_ALIASES: Dict[str, _Encoding] = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

#: A list of encodings that tend to contain Microsoft smart quotes.
#:
#: :meta hide-value:
ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
|
||
|
||
def _convert_from(
|
||
self, proposed: _Encoding, errors: str = "strict"
|
||
) -> Optional[str]:
|
||
"""Attempt to convert the markup to the proposed encoding.
|
||
|
||
:param proposed: The name of a character encoding.
|
||
:param errors: An error handling strategy, used when calling `str`.
|
||
:return: The converted markup, or `None` if the proposed
|
||
encoding/error handling strategy didn't work.
|
||
"""
|
||
lookup_result = self.find_codec(proposed)
|
||
if lookup_result is None or (lookup_result, errors) in self.tried_encodings:
|
||
return None
|
||
proposed = lookup_result
|
||
self.tried_encodings.append((proposed, errors))
|
||
markup = self.markup
|
||
# Convert smart quotes to HTML if coming from an encoding
|
||
# that might have them.
|
||
if (
|
||
self.smart_quotes_to is not None
|
||
and proposed in self.ENCODINGS_WITH_SMART_QUOTES
|
||
):
|
||
smart_quotes_re = b"([\x80-\x9f])"
|
||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||
|
||
try:
|
||
# print("Trying to convert document to %s (errors=%s)" % (
|
||
# proposed, errors))
|
||
u = self._to_unicode(markup, proposed, errors)
|
||
self.unicode_markup = u
|
||
self.original_encoding = proposed
|
||
except Exception:
|
||
# print("That didn't work!")
|
||
# print(e)
|
||
return None
|
||
# print("Correct encoding: %s" % proposed)
|
||
return self.unicode_markup
|
||
|
||
def _to_unicode(
|
||
self, data: bytes, encoding: _Encoding, errors: str = "strict"
|
||
) -> str:
|
||
"""Given a bytestring and its encoding, decodes the string into Unicode.
|
||
|
||
:param encoding: The name of an encoding.
|
||
:param errors: An error handling strategy, used when calling `str`.
|
||
"""
|
||
return str(data, encoding, errors)
|
||
|
||
@property
def declared_html_encoding(self) -> "Optional[_Encoding]":
    """The encoding, if any, declared *inside* the document --
    only meaningful when the markup is an HTML document.

    Always `None` for non-HTML markup.
    """
    if self.is_html:
        return self.detector.declared_encoding
    return None
|
||
|
||
def find_codec(self, charset: "_Encoding") -> Optional[str]:
    """Look up the Python codec corresponding to a given character set.

    :param charset: The name of a character set.
    :return: The name of a Python codec.
    """
    # Try the alias table first, then progressively normalized
    # spellings of the name; fall back to the name itself.
    resolved = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not resolved and charset:
        resolved = self._codec(charset.replace("-", ""))
    if not resolved and charset:
        resolved = self._codec(charset.replace("-", "_"))
    if not resolved:
        resolved = charset
    if resolved:
        return resolved.lower()
    return None
|
||
|
||
def _codec(self, charset: _Encoding) -> Optional[str]:
|
||
if not charset:
|
||
return charset
|
||
codec = None
|
||
try:
|
||
codecs.lookup(charset)
|
||
codec = charset
|
||
except (LookupError, ValueError):
|
||
pass
|
||
return codec
|
||
|
||
#: A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#: Each value is either an ``(entity name, hex code point)`` pair, or a
#: literal replacement string for bytes with no good entity equivalent.
#:
#: :meta hide-value:
MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
    b"\x80": ("euro", "20AC"),
    b"\x81": " ",
    b"\x82": ("sbquo", "201A"),
    b"\x83": ("fnof", "192"),
    b"\x84": ("bdquo", "201E"),
    b"\x85": ("hellip", "2026"),
    b"\x86": ("dagger", "2020"),
    b"\x87": ("Dagger", "2021"),
    b"\x88": ("circ", "2C6"),
    b"\x89": ("permil", "2030"),
    b"\x8a": ("Scaron", "160"),
    b"\x8b": ("lsaquo", "2039"),
    b"\x8c": ("OElig", "152"),
    b"\x8d": "?",
    b"\x8e": ("#x17D", "17D"),
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": ("lsquo", "2018"),
    b"\x92": ("rsquo", "2019"),
    b"\x93": ("ldquo", "201C"),
    b"\x94": ("rdquo", "201D"),
    b"\x95": ("bull", "2022"),
    b"\x96": ("ndash", "2013"),
    b"\x97": ("mdash", "2014"),
    b"\x98": ("tilde", "2DC"),
    b"\x99": ("trade", "2122"),
    b"\x9a": ("scaron", "161"),
    b"\x9b": ("rsaquo", "203A"),
    b"\x9c": ("oelig", "153"),
    b"\x9d": "?",
    b"\x9e": ("#x17E", "17E"),
    b"\x9f": ("Yuml", ""),
}
|
||
|
||
#: A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
#: horrors like stripping diacritical marks to turn á into a, but also
#: contains non-horrors like turning “ into ".
#:
#: Seriously, don't use this for anything other than removing smart
#: quotes.
#:
#: :meta private:
MS_CHARS_TO_ASCII: Dict[bytes, str] = {
    b"\x80": "EUR",
    b"\x81": " ",
    b"\x82": ",",
    b"\x83": "f",
    b"\x84": ",,",
    b"\x85": "...",
    b"\x86": "+",
    b"\x87": "++",
    b"\x88": "^",
    b"\x89": "%",
    b"\x8a": "S",
    b"\x8b": "<",
    b"\x8c": "OE",
    b"\x8d": "?",
    b"\x8e": "Z",
    b"\x8f": "?",
    b"\x90": "?",
    b"\x91": "'",
    b"\x92": "'",
    b"\x93": '"',
    b"\x94": '"',
    b"\x95": "*",
    b"\x96": "-",
    b"\x97": "--",
    b"\x98": "~",
    b"\x99": "(TM)",
    b"\x9a": "s",
    b"\x9b": ">",
    b"\x9c": "oe",
    b"\x9d": "?",
    b"\x9e": "z",
    b"\x9f": "Y",
    b"\xa0": " ",
    b"\xa1": "!",
    b"\xa2": "c",
    b"\xa3": "GBP",
    b"\xa4": "$",  # This approximation is especially parochial--this is the
    # generic currency symbol.
    b"\xa5": "YEN",
    b"\xa6": "|",
    b"\xa7": "S",
    b"\xa8": "..",
    b"\xa9": "",
    b"\xaa": "(th)",
    b"\xab": "<<",
    b"\xac": "!",
    b"\xad": " ",
    b"\xae": "(R)",
    b"\xaf": "-",
    b"\xb0": "o",
    b"\xb1": "+-",
    b"\xb2": "2",
    b"\xb3": "3",
    b"\xb4": "'",
    b"\xb5": "u",
    b"\xb6": "P",
    b"\xb7": "*",
    b"\xb8": ",",
    b"\xb9": "1",
    b"\xba": "(th)",
    b"\xbb": ">>",
    b"\xbc": "1/4",
    b"\xbd": "1/2",
    b"\xbe": "3/4",
    b"\xbf": "?",
    b"\xc0": "A",
    b"\xc1": "A",
    b"\xc2": "A",
    b"\xc3": "A",
    b"\xc4": "A",
    b"\xc5": "A",
    b"\xc6": "AE",
    b"\xc7": "C",
    b"\xc8": "E",
    b"\xc9": "E",
    b"\xca": "E",
    b"\xcb": "E",
    b"\xcc": "I",
    b"\xcd": "I",
    b"\xce": "I",
    b"\xcf": "I",
    b"\xd0": "D",
    b"\xd1": "N",
    b"\xd2": "O",
    b"\xd3": "O",
    b"\xd4": "O",
    b"\xd5": "O",
    b"\xd6": "O",
    b"\xd7": "*",
    b"\xd8": "O",
    b"\xd9": "U",
    b"\xda": "U",
    b"\xdb": "U",
    b"\xdc": "U",
    b"\xdd": "Y",
    b"\xde": "b",
    b"\xdf": "B",
    b"\xe0": "a",
    b"\xe1": "a",
    b"\xe2": "a",
    b"\xe3": "a",
    b"\xe4": "a",
    b"\xe5": "a",
    b"\xe6": "ae",
    b"\xe7": "c",
    b"\xe8": "e",
    b"\xe9": "e",
    b"\xea": "e",
    b"\xeb": "e",
    b"\xec": "i",
    b"\xed": "i",
    b"\xee": "i",
    b"\xef": "i",
    b"\xf0": "o",
    b"\xf1": "n",
    b"\xf2": "o",
    b"\xf3": "o",
    b"\xf4": "o",
    b"\xf5": "o",
    b"\xf6": "o",
    b"\xf7": "/",
    b"\xf8": "o",
    b"\xf9": "u",
    b"\xfa": "u",
    b"\xfb": "u",
    b"\xfc": "u",
    b"\xfd": "y",
    b"\xfe": "b",
    b"\xff": "y",
}
|
||
|
||
#: A map used when removing rogue Windows-1252/ISO-8859-1
#: characters in otherwise UTF-8 documents. Each entry maps a
#: Windows-1252 byte value to the UTF-8 encoding of the same character.
#:
#: Note that \\x81, \\x8d, \\x8f, \\x90, and \\x9d are undefined in
#: Windows-1252.
#:
#: :meta hide-value:
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # no-break space
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # soft hyphen
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    # Fixed: was b"\xa1", which is not valid UTF-8 for á.
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
}
|
||
|
||
#: Byte ranges that mark the first byte of a UTF-8 multibyte
#: sequence, paired with the total length of that sequence.
#:
#: :meta private:
MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
    (0xC2, 0xDF, 2),  # 2-byte characters start with a byte C2-DF
    (0xE0, 0xEF, 3),  # 3-byte characters start with E0-EF
    (0xF0, 0xF4, 4),  # 4-byte characters start with F0-F4
]

#: Lowest byte value that can start a UTF-8 multibyte sequence.
#:
#: :meta private:
FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

#: Highest byte value that can start a UTF-8 multibyte sequence.
#:
#: :meta private:
LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||
|
||
@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: "_Encoding" = "utf8",
    embedded_encoding: "_Encoding" = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring similar to ``in_bytes``, in which
        ``embedded_encoding`` characters have been converted to
        their ``main_encoding`` equivalents.
    """
    if embedded_encoding.replace("_", "-").lower() not in (
        "windows-1252",
        "windows_1252",
    ):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    if main_encoding.lower() not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    pieces = []
    segment_start = 0
    i = 0
    total = len(in_bytes)
    while i < total:
        current = in_bytes[i]
        if cls.FIRST_MULTIBYTE_MARKER <= current <= cls.LAST_MULTIBYTE_MARKER:
            # Start of a UTF-8 multibyte character: jump past the
            # whole sequence without inspecting its continuation bytes.
            for low, high, width in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if low <= current <= high:
                    i += width
                    break
        elif current >= 0x80 and current in cls.WINDOWS_1252_TO_UTF8:
            # A stray Windows-1252 byte: flush the clean run so far,
            # then append its UTF-8 spelling as its own chunk.
            pieces.append(in_bytes[segment_start:i])
            pieces.append(cls.WINDOWS_1252_TO_UTF8[current])
            i += 1
            segment_start = i
        else:
            # Plain byte; move along.
            i += 1
    if segment_start == 0:
        # No substitutions were made; the input is returned untouched.
        return in_bytes
    pieces.append(in_bytes[segment_start:])
    return b"".join(pieces)
|