Update README and add ti_status_checker_api.py

2025-06-27 11:08:02 +02:00
parent ae0a20e93b
commit 6deaff4dbc
1173 changed files with 209002 additions and 3 deletions


@@ -0,0 +1,848 @@
from __future__ import annotations
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
from collections import defaultdict
import re
from types import ModuleType
from typing import (
Any,
cast,
Dict,
Iterable,
List,
Optional,
Pattern,
Set,
Tuple,
Type,
TYPE_CHECKING,
)
import warnings
import sys
from bs4.element import (
AttributeDict,
AttributeValueList,
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
RubyParenthesisString,
RubyTextString,
Stylesheet,
Script,
TemplateString,
nonwhitespace_re,
)
# Exceptions were moved to their own module in 4.13. Import here for
# backwards compatibility.
from bs4.exceptions import ParserRejectedMarkup
from bs4._typing import (
_AttributeValues,
_RawAttributeValue,
)
from bs4._warnings import XMLParsedAsHTMLWarning
if TYPE_CHECKING:
from bs4 import BeautifulSoup
from bs4.element import (
NavigableString,
Tag,
)
from bs4._typing import (
_AttributeValue,
_Encoding,
_Encodings,
_RawOrProcessedAttributeValues,
_RawMarkup,
)
# Some useful features for a TreeBuilder to have.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"
__all__ = [
"TreeBuilderRegistry",
"TreeBuilder",
"HTMLTreeBuilder",
"DetectsXMLParsedAsHTML",
"ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
]
class TreeBuilderRegistry(object):
"""A way of looking up TreeBuilder subclasses by their name or by desired
features.
"""
builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
builders: List[Type[TreeBuilder]]
def __init__(self) -> None:
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class: type[TreeBuilder]) -> None:
"""Register a treebuilder based on its advertised features.
:param treebuilder_class: A subclass of `TreeBuilder`. Its
`TreeBuilder.features` attribute should list its features.
"""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
"""Look up a TreeBuilder subclass with the desired features.
:param features: A list of features to look for. If none are
provided, the most recently registered TreeBuilder subclass
will be used.
:return: A TreeBuilder subclass, or None if there's no
registered subclass with all the requested features.
"""
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
feature_list = list(features)
feature_list.reverse()
candidates = None
candidate_set = None
while len(feature_list) > 0:
feature = feature_list.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None or candidates is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
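# A minimal usage sketch for the registry (results depend on which
# parser libraries are installed in the local environment):
#
#   >>> from bs4.builder import builder_registry
#   >>> builder_registry.lookup("html", "fast")   # lxml's builder, if installed
#   >>> builder_registry.lookup("html5")          # html5lib's builder, if installed
#   >>> builder_registry.lookup("no-such-feature") is None
#   True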
#: The `BeautifulSoup` constructor will take a list of features
#: and use it to look up `TreeBuilder` classes in this registry.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a textual document into a Beautiful Soup object tree.
This is an abstract superclass which smooths out the behavior of
different parser libraries into a single, unified interface.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this to a dictionary will
customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is ``multi_valued_attributes``.
:param preserve_whitespace_tags: A set of tags to treat
the way <pre> tags are treated in HTML. Tags in this set
are immune from pretty-printing; their contents will always be
output as-is.
:param string_containers: A dictionary mapping tag names to
the classes that should be instantiated to contain the textual
contents of those tags. The default is to use NavigableString
for every tag, no matter what the name. You can override the
default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
:param store_line_numbers: If the parser keeps track of the line
numbers and positions of the original markup, that information
will, by default, be stored in each corresponding
:py:class:`bs4.element.Tag` object. You can turn this off by
passing store_line_numbers=False; then Tag.sourcepos and
Tag.sourceline will always be None. If the parser you're using
doesn't keep track of this information, then store_line_numbers
is irrelevant.
:param attribute_dict_class: The value of a multi-valued attribute
(such as HTML's 'class') will be stored in an instance of this
class. The default is Beautiful Soup's built-in
`AttributeValueList`, which is a normal Python list, and you
will probably never need to change it.
"""
USE_DEFAULT: Any = object() #: :meta private:
def __init__(
self,
multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
preserve_whitespace_tags: Set[str] = USE_DEFAULT,
store_line_numbers: bool = USE_DEFAULT,
string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
empty_element_tags: Set[str] = USE_DEFAULT,
attribute_dict_class: Type[AttributeDict] = AttributeDict,
attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
):
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
if empty_element_tags is self.USE_DEFAULT:
self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
else:
self.empty_element_tags = empty_element_tags
# TODO: store_line_numbers is probably irrelevant now that
# the behavior of sourceline and sourcepos has been made consistent
# everywhere.
if store_line_numbers is self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
self.store_line_numbers = store_line_numbers
if string_containers is self.USE_DEFAULT:
string_containers = self.DEFAULT_STRING_CONTAINERS
self.string_containers = string_containers
self.attribute_dict_class = attribute_dict_class
self.attribute_value_list_class = attribute_value_list_class
NAME: str = "[Unknown tree builder]"
ALTERNATE_NAMES: Iterable[str] = []
features: Iterable[str] = []
is_xml: bool = False
picklable: bool = False
soup: Optional[BeautifulSoup] #: :meta private:
#: A tag will be considered an empty-element
#: tag when and only when it has no contents.
empty_element_tags: Optional[Set[str]] = None #: :meta private:
cdata_list_attributes: Dict[str, Set[str]] #: :meta private:
preserve_whitespace_tags: Set[str] #: :meta private:
string_containers: Dict[str, Type[NavigableString]] #: :meta private:
tracks_line_numbers: bool #: :meta private:
#: A value for these tag/attribute combinations is a space- or
#: comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)
#: Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()
#: The textual contents of tags with these names should be
#: instantiated with some class other than `bs4.element.NavigableString`.
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {}
#: By default, tags are treated as empty-element tags if they have
#: no contents--that is, using XML rules. HTMLTreeBuilder
#: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
#: HTML 4 and HTML5 standards.
DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None
#: Most parsers don't keep track of line numbers.
TRACKS_LINE_NUMBERS: bool = False
def initialize_soup(self, soup: BeautifulSoup) -> None:
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
:param soup: A BeautifulSoup object.
"""
self.soup = soup
def reset(self) -> None:
"""Do any work necessary to reset the underlying parser
for a new document.
By default, this does nothing.
"""
pass
def can_be_empty_element(self, tag_name: str) -> bool:
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLTreeBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p/>" or "<p>".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no children.
"<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
be left alone.
:param tag_name: The name of a markup tag.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
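# An illustration of the distinction described above, using the stock
# html.parser builder (which inherits HTMLTreeBuilder's tag list):
#
#   >>> from bs4 import BeautifulSoup
#   >>> str(BeautifulSoup("<p></p><br>", "html.parser"))
#   '<p></p><br/>'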
def feed(self, markup: _RawMarkup) -> None:
"""Run incoming markup through some parsing process."""
raise NotImplementedError()
def prepare_markup(
self,
markup: _RawMarkup,
user_specified_encoding: Optional[_Encoding] = None,
document_declared_encoding: Optional[_Encoding] = None,
exclude_encodings: Optional[_Encodings] = None,
) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
:param markup: The markup that's about to be parsed.
:param user_specified_encoding: The user asked to try this encoding
to convert the markup into a Unicode string.
:param document_declared_encoding: The markup itself claims to be
in this encoding. NOTE: This argument is not used by the
calling code and can probably be removed.
:param exclude_encodings: The user asked *not* to try any of
these encodings.
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy that the parser can try
to convert the document to Unicode and parse it. Each
strategy will be tried in turn.
By default, the only strategy is to parse the markup
as-is. See `LXMLTreeBuilderForXML` and
`HTMLParserTreeBuilder` for implementations that take into
account the quirks of particular parsers.
:meta private:
"""
yield markup, None, None, False
def test_fragment_to_document(self, fragment: str) -> str:
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of unit tests.
:param fragment: A fragment of HTML.
:return: A full HTML document.
:meta private:
"""
return fragment
def set_up_substitutions(self, tag: Tag) -> bool:
"""Set up any substitutions that will need to be performed on
a `Tag` when it's output as a string.
By default, this does nothing. See `HTMLTreeBuilder` for a
case where this is used.
:return: Whether or not a substitution was performed.
:meta private:
"""
return False
def _replace_cdata_list_attribute_values(
self, tag_name: str, attrs: _RawOrProcessedAttributeValues
) -> _AttributeValues:
"""When an attribute value is associated with a tag that can
have multiple values for that attribute, convert the string
value to a list of strings.
Basically, replaces class="foo bar" with class=["foo", "bar"]
NOTE: This method modifies its input in place.
:param tag_name: The name of a tag.
:param attrs: A dictionary containing the tag's attributes.
Any appropriate attribute values will be modified in place.
:return: The modified dictionary that was originally passed in.
"""
# First, cast the attrs dict to _AttributeValues. This might
# not be accurate yet, but it will be by the time this method
# returns.
modified_attrs = cast(_AttributeValues, attrs)
if not modified_attrs or not self.cdata_list_attributes:
# Nothing to do.
return modified_attrs
# There is at least a possibility that we need to modify one of
# the attribute values.
universal: Set[str] = self.cdata_list_attributes.get("*", set())
tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
for attr in list(modified_attrs.keys()):
modified_value: _AttributeValue
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
original_value: _AttributeValue = modified_attrs[attr]
if isinstance(original_value, _RawAttributeValue):
# This is a _RawAttributeValue (a string) that
# needs to be split and converted to a
# AttributeValueList so it can be an
# _AttributeValue.
modified_value = self.attribute_value_list_class(
nonwhitespace_re.findall(original_value)
)
else:
# html5lib calls setAttributes twice for the
# same tag when rearranging the parse tree. On
# the second call the attribute value here is
# already a list. This can also happen when a
# Tag object is cloned. If this happens, leave
# the value alone rather than trying to split
# it again.
modified_value = original_value
modified_attrs[attr] = modified_value
return modified_attrs
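# The effect of this conversion, as seen through the public API (a
# sketch; 'class' is multi-valued by default, 'id' is not):
#
#   >>> from bs4 import BeautifulSoup
#   >>> BeautifulSoup('<p class="foo bar" id="a b">', "html.parser").p.attrs
#   {'class': ['foo', 'bar'], 'id': 'a b'}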
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events.
This is not currently used for anything, and it will be removed
soon. It was a good idea, but it wasn't properly integrated into the
rest of Beautiful Soup, so there have been long stretches where it
hasn't worked properly.
"""
def __init__(self, *args: Any, **kwargs: Any) -> None:
warnings.warn(
"The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
DeprecationWarning,
stacklevel=2,
)
super(SAXTreeBuilder, self).__init__(*args, **kwargs)
def feed(self, markup: _RawMarkup) -> None:
raise NotImplementedError()
def close(self) -> None:
pass
def startElement(self, name: str, attrs: Dict[str, str]) -> None:
attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
# print("Start %s, %r" % (name, attrs))
assert self.soup is not None
self.soup.handle_starttag(name, None, None, attrs)
def endElement(self, name: str) -> None:
# print("End %s" % name)
assert self.soup is not None
self.soup.handle_endtag(name)
def startElementNS(
self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
) -> None:
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
# handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix: str) -> None:
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content: str) -> None:
assert self.soup is not None
self.soup.handle_data(content)
def startDocument(self) -> None:
pass
def endDocument(self) -> None:
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML, such as which tags are treated
specially by the HTML standard.
"""
#: Some HTML tags are defined as having no contents. Beautiful Soup
#: treats these specially.
DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set(
[
# These are from HTML5.
"area",
"base",
"br",
"col",
"embed",
"hr",
"img",
"input",
"keygen",
"link",
"menuitem",
"meta",
"param",
"source",
"track",
"wbr",
# These are from earlier versions of HTML and are removed in HTML5.
"basefont",
"bgsound",
"command",
"frame",
"image",
"isindex",
"nextid",
"spacer",
]
)
#: The HTML standard defines these tags as block-level elements. Beautiful
#: Soup does not treat these elements differently from other elements,
#: but it may do so eventually, and this information is available if
#: you need to use it.
DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
[
"address",
"article",
"aside",
"blockquote",
"canvas",
"dd",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hr",
"li",
"main",
"nav",
"noscript",
"ol",
"output",
"p",
"pre",
"section",
"table",
"tfoot",
"ul",
"video",
]
)
#: These HTML tags need special treatment so they can be
#: represented by a string class other than `bs4.element.NavigableString`.
#:
#: For some of these tags, it's because the HTML standard defines
#: an unusual content model for them. I made this list by going
#: through the HTML spec
#: (https://html.spec.whatwg.org/#metadata-content) and looking for
#: "metadata content" elements that can contain strings.
#:
#: The Ruby tags (<rt> and <rp>) are here despite being normal
#: "phrasing content" tags, because the content they contain is
#: qualitatively different from other text in the document, and it
#: can be useful to be able to distinguish it.
#:
#: TODO: Arguably <noscript> could go here but it seems
#: qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {
"rt": RubyTextString,
"rp": RubyParenthesisString,
"style": Stylesheet,
"script": Script,
"template": TemplateString,
}
#: The HTML standard defines these attributes as containing a
#: space-separated list of values, not a single value. That is,
#: class="foo bar" means that the 'class' attribute has two values,
#: 'foo' and 'bar', not the single value 'foo bar'. When we
#: encounter one of these attributes, we will parse its value into
#: a list of values if possible. Upon output, the list will be
#: converted back into a string.
DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
"*": {"class", "accesskey", "dropzone"},
"a": {"rel", "rev"},
"link": {"rel", "rev"},
"td": {"headers"},
"th": {"headers"},
"form": {"accept-charset"},
"object": {"archive"},
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area": {"rel"},
"icon": {"sizes"},
"iframe": {"sandbox"},
"output": {"for"},
}
#: By default, whitespace inside these HTML tags will be
#: preserved rather than being collapsed.
DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set(["pre", "textarea"])
def set_up_substitutions(self, tag: Tag) -> bool:
"""Replace the declared encoding in a <meta> tag with a placeholder,
to be substituted when the tag is output to a string.
An HTML document may come in to Beautiful Soup as one
encoding, but exit in a different encoding, and the <meta> tag
needs to be changed to reflect this.
:return: Whether or not a substitution was performed.
:meta private:
"""
# We are only interested in <meta> tags
if tag.name != "meta":
return False
# TODO: This cast will fail in the (very unlikely) scenario
# that the programmer who instantiates the TreeBuilder
# specifies meta['content'] or meta['charset'] as
# cdata_list_attributes.
content: Optional[str] = cast(Optional[str], tag.get("content"))
charset: Optional[str] = cast(Optional[str], tag.get("charset"))
# But we can accommodate meta['http-equiv'] being made a
# cdata_list_attribute (again, very unlikely) without much
# trouble.
http_equiv: List[str] = tag.get_attribute_list("http-equiv")
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
substituted = False
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
tag["charset"] = CharsetMetaAttributeValue(charset)
substituted = True
elif content is not None and any(
x.lower() == "content-type" for x in http_equiv
):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag["content"] = ContentMetaAttributeValue(content)
substituted = True
return substituted
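# A sketch of the substitution in action: the declared encoding tracks
# whatever encoding the document is serialized to later.
#
#   >>> from bs4 import BeautifulSoup
#   >>> soup = BeautifulSoup('<meta charset="utf-8">', "html.parser")
#   >>> soup.meta.encode("latin-1")
#   b'<meta charset="latin-1"/>'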
class DetectsXMLParsedAsHTML(object):
"""A mixin class for any class (a TreeBuilder, or some class used by a
TreeBuilder) that's in a position to detect whether an XML
document is being incorrectly parsed as HTML, and issue an
appropriate warning.
This requires being able to observe an incoming processing
instruction that might be an XML declaration, and also able to
observe tags as they're opened. If you can't do that for a given
`TreeBuilder`, there's a less reliable implementation based on
examining the raw markup.
"""
#: Regular expression for seeing if string markup has an <html> tag.
LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)
#: Regular expression for seeing if byte markup has an <html> tag.
LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)
#: The start of an XML document string.
XML_PREFIX: str = "<?xml"
#: The start of an XML document bytestring.
XML_PREFIX_B: bytes = b"<?xml"
# This is typed as str, not `ProcessingInstruction`, because this
# check may be run before any Beautiful Soup objects are created.
_first_processing_instruction: Optional[str] #: :meta private:
_root_tag_name: Optional[str] #: :meta private:
@classmethod
def warn_if_markup_looks_like_xml(
cls, markup: Optional[_RawMarkup], stacklevel: int = 3
) -> bool:
"""Perform a check on some markup to see if it looks like XML
that's not XHTML. If so, issue a warning.
This is much less reliable than doing the check while parsing,
but some of the tree builders can't do that.
:param stacklevel: The stacklevel of the code calling this
function.
:return: True if the markup looks like non-XHTML XML, False
otherwise.
"""
if markup is None:
return False
markup = markup[:500]
if isinstance(markup, bytes):
markup_b: bytes = markup
looks_like_xml = markup_b.startswith(
cls.XML_PREFIX_B
) and not cls.LOOKS_LIKE_HTML_B.search(markup)
else:
markup_s: str = markup
looks_like_xml = markup_s.startswith(
cls.XML_PREFIX
) and not cls.LOOKS_LIKE_HTML.search(markup)
if looks_like_xml:
cls._warn(stacklevel=stacklevel + 2)
return True
return False
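# For example, feeding an XML document to an HTML parser should trigger
# the warning (a sketch using the html.parser builder):
#
#   >>> import warnings
#   >>> from bs4 import BeautifulSoup
#   >>> with warnings.catch_warnings(record=True) as w:
#   ...     warnings.simplefilter("always")
#   ...     _ = BeautifulSoup('<?xml version="1.0"?><doc/>', "html.parser")
#   >>> any(issubclass(x.category, XMLParsedAsHTMLWarning) for x in w)
#   True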
@classmethod
def _warn(cls, stacklevel: int = 5) -> None:
"""Issue a warning about XML being parsed as HTML."""
warnings.warn(
XMLParsedAsHTMLWarning.MESSAGE,
XMLParsedAsHTMLWarning,
stacklevel=stacklevel,
)
def _initialize_xml_detector(self) -> None:
"""Call this method before parsing a document."""
self._first_processing_instruction = None
self._root_tag_name = None
def _document_might_be_xml(self, processing_instruction: str) -> None:
"""Call this method when encountering an XML declaration, or a
"processing instruction" that might be an XML declaration.
This helps Beautiful Soup detect potential issues later, if
the XML document turns out to be a non-XHTML document that's
being parsed as XML.
"""
if (
self._first_processing_instruction is not None
or self._root_tag_name is not None
):
# The document has already started. Don't bother checking
# anymore.
return
self._first_processing_instruction = processing_instruction
# We won't know until we encounter the first tag whether or
# not this is actually a problem.
def _root_tag_encountered(self, name: str) -> None:
"""Call this when you encounter the document's root tag.
This is where we actually check whether an XML document is
being incorrectly parsed as HTML, and issue the warning.
"""
if self._root_tag_name is not None:
# This method was incorrectly called multiple times. Do
# nothing.
return
self._root_tag_name = name
if (
name != "html"
and self._first_processing_instruction is not None
and self._first_processing_instruction.lower().startswith("xml ")
):
# We encountered an XML declaration and then a tag other
# than 'html'. This is a reliable indicator that a
# non-XHTML document is being parsed as XML.
self._warn(stacklevel=10)
def register_treebuilders_from(module: ModuleType) -> None:
"""Copy TreeBuilders from the given module into this module."""
this_module = sys.modules[__name__]
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser # noqa: E402
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
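# Given the registration order above, a feature lookup resolves to lxml
# first when it's installed, then html5lib, then html.parser. A sketch
# (the chosen builder depends on the local environment):
#
#   >>> from bs4 import BeautifulSoup
#   >>> BeautifulSoup("<p>hi</p>", "html").builder.NAME
#   'lxml'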


@@ -0,0 +1,594 @@
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
"HTML5TreeBuilder",
]
from typing import (
Any,
cast,
Dict,
Iterable,
Optional,
Sequence,
TYPE_CHECKING,
Tuple,
Union,
)
from typing_extensions import TypeAlias
from bs4._typing import (
_AttributeValue,
_AttributeValues,
_Encoding,
_Encodings,
_NamespaceURL,
_RawMarkup,
)
import warnings
from bs4.builder import (
DetectsXMLParsedAsHTML,
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import (
NamespacedAttribute,
PageElement,
nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
namespaces,
)
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
if TYPE_CHECKING:
from bs4 import BeautifulSoup
from html5lib.treebuilders import base as treebuilder_base
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
build a tree.
Note that `HTML5TreeBuilder` does not support some common HTML
`TreeBuilder` features. Some of these features could theoretically
be implemented, but at the very least it's quite difficult,
because html5lib moves the parse tree around as it's being built.
Specifically:
* This `TreeBuilder` doesn't use different subclasses of
`NavigableString` (e.g. `Script`) based on the name of the tag
in which the string was found.
* You can't use a `SoupStrainer` to parse only part of a document.
"""
NAME: str = "html5lib"
features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]
#: html5lib can tell us which line number and position in the
#: original file is the source of an element.
TRACKS_LINE_NUMBERS: bool = True
underlying_builder: "TreeBuilderForHtml5lib" #: :meta private:
user_specified_encoding: Optional[_Encoding]
def prepare_markup(
self,
markup: _RawMarkup,
user_specified_encoding: Optional[_Encoding] = None,
document_declared_encoding: Optional[_Encoding] = None,
exclude_encodings: Optional[_Encodings] = None,
) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
# document_declared_encoding and exclude_encodings aren't used
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
for variable, name in (
(document_declared_encoding, "document_declared_encoding"),
(exclude_encodings, "exclude_encodings"),
):
if variable:
warnings.warn(
f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
stacklevel=3,
)
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
def feed(self, markup: _RawMarkup) -> None:
"""Run some incoming markup through some parsing process,
populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.
"""
if self.soup is not None and self.soup.parse_only is not None:
warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4,
)
# self.underlying_builder is probably None now, but it'll be set
# when html5lib calls self.create_treebuilder().
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
assert self.underlying_builder is not None
self.underlying_builder.parser = parser
extra_kwargs = dict()
if not isinstance(markup, str):
# kwargs, specifically override_encoding, will eventually
# be passed in to html5lib's
# HTMLBinaryInputStream.__init__.
extra_kwargs["override_encoding"] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
# The encoding is an html5lib Encoding object. We want to
# use a string for compatibility with other tree builders.
original_encoding = original_encoding.name
doc.original_encoding = original_encoding
self.underlying_builder.parser = None
def create_treebuilder(
self, namespaceHTMLElements: bool
) -> "TreeBuilderForHtml5lib":
"""Called by html5lib to instantiate the kind of class it
calls a 'TreeBuilder'.
:param namespaceHTMLElements: Whether or not to namespace HTML elements.
:meta private:
"""
self.underlying_builder = TreeBuilderForHtml5lib(
namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
)
return self.underlying_builder
def test_fragment_to_document(self, fragment: str) -> str:
"""See `TreeBuilder`."""
return "<html><head></head><body>%s</body></html>" % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
soup: "BeautifulSoup" #: :meta private:
parser: Optional[html5lib.HTMLParser] #: :meta private:
def __init__(
self,
namespaceHTMLElements: bool,
soup: Optional["BeautifulSoup"] = None,
store_line_numbers: bool = True,
**kwargs: Any,
):
if soup:
self.soup = soup
else:
warnings.warn(
"The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
DeprecationWarning,
stacklevel=2,
)
from bs4 import BeautifulSoup
# TODO: Why is the parser 'html.parser' here? Using
# html5lib doesn't cause an infinite loop and is more
# accurate. Best to get rid of this entire section, I think.
self.soup = BeautifulSoup(
"", "html.parser", store_line_numbers=store_line_numbers, **kwargs
)
# TODO: What are **kwargs exactly? Should they be passed in
# here in addition to/instead of being passed to the BeautifulSoup
# constructor?
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to a real html5lib HTMLParser object,
# which we can use to track the current line number.
self.parser = None
self.store_line_numbers = store_line_numbers
def documentClass(self) -> "Element":
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token: Dict[str, Any]) -> None:
name: str = cast(str, token["name"])
publicId: Optional[str] = cast(Optional[str], token["publicId"])
systemId: Optional[str] = cast(Optional[str], token["systemId"])
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name: str, namespace: str) -> "Element":
sourceline: Optional[int] = None
sourcepos: Optional[int] = None
if self.parser is not None and self.store_line_numbers:
# This represents the point immediately after the end of the
# tag. We don't know when the tag started, but we do know
# where it ended -- the character just before this one.
sourceline, sourcepos = self.parser.tokenizer.stream.position()
assert sourcepos is not None
sourcepos = sourcepos - 1
tag = self.soup.new_tag(
name, namespace, sourceline=sourceline, sourcepos=sourcepos
)
return Element(tag, self.soup, namespace)
def commentClass(self, data: str) -> "TextNode":
return TextNode(Comment(data), self.soup)
def fragmentClass(self) -> "Element":
"""This is only used by html5lib HTMLParser.parseFragment(),
which is never used by Beautiful Soup, only by the html5lib
unit tests. Since we don't currently hook into those tests,
the implementation is left blank.
"""
raise NotImplementedError()
def getFragment(self) -> "Element":
"""This is only used by the html5lib unit tests. Since we
don't currently hook into those tests, the implementation is
left blank.
"""
raise NotImplementedError()
def appendChild(self, node: "Element") -> None:
# TODO: This code is not covered by the BS4 tests, and
# apparently not triggered by the html5lib test suite either.
# But it doesn't seem test-specific and there are calls to it
# (or a method with the same name) all over html5lib, so I'm
# leaving the implementation in place rather than replacing it
# with NotImplementedError()
self.soup.append(node.element)
def getDocument(self) -> "BeautifulSoup":
return self.soup
def testSerializer(self, element: "Element") -> str:
"""This is only used by the html5lib unit tests. Since we
don't currently hook into those tests, the implementation is
left blank.
"""
raise NotImplementedError()
class AttrList(object):
"""Represents a Tag's attributes in a way compatible with html5lib."""
element: Tag
attrs: _AttributeValues
def __init__(self, element: Tag):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
return list(self.attrs.items()).__iter__()
def __setitem__(self, name: str, value: _AttributeValue) -> None:
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = self.element.cdata_list_attributes or {}
if name in list_attr.get("*", []) or (
self.element.name in list_attr
and name in list_attr.get(self.element.name, [])
):
# A node that is being cloned may have already undergone
# this procedure. Check for this and skip it.
if not isinstance(value, list):
assert isinstance(value, str)
value = self.element.attribute_value_list_class(
nonwhitespace_re.findall(value)
)
self.element[name] = value
def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
return list(self.attrs.items())
def keys(self) -> Iterable[str]:
return list(self.attrs.keys())
def __len__(self) -> int:
return len(self.attrs)
def __getitem__(self, name: str) -> _AttributeValue:
return self.attrs[name]
def __contains__(self, name: str) -> bool:
return name in list(self.attrs.keys())
class BeautifulSoupNode(treebuilder_base.Node):
element: PageElement
soup: "BeautifulSoup"
namespace: Optional[_NamespaceURL]
@property
def nodeType(self) -> int:
"""Return the html5lib constant corresponding to the type of
the underlying DOM object.
NOTE: This property is only accessed by the html5lib test
suite, not by Beautiful Soup proper.
"""
raise NotImplementedError()
# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
def cloneNode(self) -> treebuilder_base.Node:
raise NotImplementedError()
class Element(BeautifulSoupNode):
element: Tag
namespace: Optional[_NamespaceURL]
def __init__(
self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
):
treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node: "BeautifulSoupNode") -> None:
string_child: Optional[NavigableString] = None
child: PageElement
if type(node.element) is NavigableString:
string_child = child = node.element
else:
child = node.element
node.parent = self
if (
child is not None
and child.parent is not None
and not isinstance(child, str)
):
node.element.extract()
if (
string_child is not None
and self.element.contents
and type(self.element.contents[-1]) is NavigableString
):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
# Tell Beautiful Soup to act as if it parsed this element
# immediately after the parent's last descendant. (Or
# immediately after the parent, if it has no children.)
if self.element.contents:
most_recent_element = self.element._last_descendant(False)
elif self.element.next_element is not None:
# Something from further ahead in the parse tree is
# being inserted into this earlier element. This is
# very annoying because it means an expensive search
# for the last element in the tree.
most_recent_element = self.soup._last_descendant()
else:
most_recent_element = self.element
self.soup.object_was_parsed(
child, parent=self.element, most_recent_element=most_recent_element
)
def getAttributes(self) -> AttrList:
if isinstance(self.element, Comment):
return {}
return AttrList(self.element)
# An HTML5lib attribute name may either be a single string,
# or a tuple (namespace, name).
_Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
# Now we can define the type this method accepts as a dictionary
# mapping those attribute names to single string values.
_Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]
def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
if attributes is not None and len(attributes) > 0:
# Replace any namespaced attributes with
# NamespacedAttribute objects.
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
# We can now cast attributes to the type of Dict
# used by Beautiful Soup.
normalized_attributes = cast(_AttributeValues, attributes)
# Values for tags like 'class' came in as single strings;
# replace them with lists of strings as appropriate.
self.soup.builder._replace_cdata_list_attribute_values(
self.name, normalized_attributes
)
# Then set the attributes on the Tag associated with this
# BeautifulSoupNode.
for name, value_or_values in list(normalized_attributes.items()):
self.element[name] = value_or_values
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(
self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
) -> None:
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(
self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
) -> None:
index = self.element.index(refNode.element)
if (
type(node.element) is NavigableString
and self.element.contents
and type(self.element.contents[index - 1]) is NavigableString
):
# (See comments in appendChild)
old_node = self.element.contents[index - 1]
assert type(old_node) is NavigableString
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node: "Element") -> None:
node.element.extract()
def reparentChildren(self, new_parent: "Element") -> None:
"""Move all of this tag's children into another tag."""
# print("MOVE", self.element.contents)
# print("FROM", self.element)
# print("TO", new_parent.element)
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
# are removed.
final_next_element = element.next_sibling
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
if len(new_parent_element.contents) > 0:
# The new parent already contains children. We will be
# appending this tag's children to the end.
# We can make this assertion since we know new_parent has
# children.
assert new_parents_last_descendant is not None
new_parents_last_child = new_parent_element.contents[-1]
new_parents_last_descendant_next_element = (
new_parents_last_descendant.next_element
)
else:
# The new parent contains no children.
new_parents_last_child = None
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
# parent's last descendant. It has no .next_sibling and
# its .next_element is whatever the previous last
# descendant had.
last_childs_last_descendant = to_append[-1]._last_descendant(
is_initialized=False, accept_self=True
)
# Since we passed accept_self=True into _last_descendant,
# there's no possibility that the result is None.
assert last_childs_last_descendant is not None
last_childs_last_descendant.next_element = (
new_parents_last_descendant_next_element
)
if new_parents_last_descendant_next_element is not None:
# TODO-COVERAGE: This code has no test coverage and
# I'm not sure how to get html5lib to go through this
# path, but it's just the other side of the previous
# line.
new_parents_last_descendant_next_element.previous_element = (
last_childs_last_descendant
)
last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
new_parent_element.contents.append(child)
# Now that this element has no children, change its .next_element.
element.contents = []
element.next_element = final_next_element
# print("DONE WITH MOVE")
# print("FROM", self.element)
# print("TO", new_parent_element)
# TODO-TYPING: typeshed stubs are incorrect about this;
# hasContent returns a boolean, not None.
def hasContent(self) -> bool:
return len(self.element.contents) > 0
# TODO-TYPING: typeshed stubs are incorrect about this;
# cloneNode returns a new Node, not None.
def cloneNode(self) -> treebuilder_base.Node:
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key, value in self.attributes:
node.attributes[key] = value
return node
def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(BeautifulSoupNode):
element: NavigableString
def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup


@@ -0,0 +1,474 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
from __future__ import annotations
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
"HTMLParserTreeBuilder",
]
from html.parser import HTMLParser
from typing import (
Any,
Callable,
cast,
Dict,
Iterable,
List,
Optional,
TYPE_CHECKING,
Tuple,
Type,
Union,
)
from bs4.element import (
AttributeDict,
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
DetectsXMLParsedAsHTML,
HTML,
HTMLTreeBuilder,
STRICT,
)
from bs4.exceptions import ParserRejectedMarkup
if TYPE_CHECKING:
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from bs4._typing import (
_Encoding,
_Encodings,
_RawMarkup,
)
HTMLPARSER = "html.parser"
_DuplicateAttributeHandler = Callable[[Dict[str, str], str, str], None]
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
"""A subclass of the Python standard library's HTMLParser class, which
listens for HTMLParser events and translates them into calls
to Beautiful Soup's tree construction API.
:param on_duplicate_attribute: A strategy for what to do if a
tag includes the same attribute more than once. Accepted
values are: REPLACE (replace earlier values with later
ones, the default), IGNORE (keep the earliest value
encountered), or a callable. A callable must take three
arguments: the dictionary of attributes already processed,
the name of the duplicate attribute, and the most recent value
encountered.
"""
#: Constant to handle duplicate attributes by replacing earlier values
#: with later ones.
REPLACE: str = "replace"
#: Constant to handle duplicate attributes by ignoring later values
#: and keeping the earlier ones.
IGNORE: str = "ignore"
def __init__(
self,
soup: BeautifulSoup,
*args: Any,
on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] = REPLACE,
**kwargs: Any,
):
self.soup = soup
self.on_duplicate_attribute = on_duplicate_attribute
self.attribute_dict_class = soup.builder.attribute_dict_class
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
# without an explicit closing tag. If we encounter a closing tag
# of this type, we'll associate it with one of those entries.
#
# This isn't a stack because we don't care about the
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
self._initialize_xml_detector()
on_duplicate_attribute: Union[str, _DuplicateAttributeHandler]
already_closed_empty_element: List[str]
soup: BeautifulSoup
def error(self, message: str) -> None:
# NOTE: This method is required so long as Python 3.9 is
# supported. The corresponding code is removed from HTMLParser
# in 3.5, but not removed from ParserBase until 3.10.
# https://github.com/python/cpython/issues/76025
#
# The original implementation turned the error into a warning,
# but in every case I discovered, this made HTMLParser
# immediately crash with an error message that was less
# helpful than the warning. The new implementation makes it
# more clear that html.parser just can't parse this
# markup. The 3.10 implementation does the same, though it
# raises AssertionError rather than calling a method. (We
# catch this error and wrap it in a ParserRejectedMarkup.)
raise ParserRejectedMarkup(message)
def handle_startendtag(
self, name: str, attrs: List[Tuple[str, Optional[str]]]
) -> None:
"""Handle an incoming empty-element tag.
html.parser only calls this method when the markup looks like
<tag/>.
"""
# `handle_empty_element` tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag, and we want to call
# handle_endtag ourselves.
self.handle_starttag(name, attrs, handle_empty_element=False)
self.handle_endtag(name)
def handle_starttag(
self,
name: str,
attrs: List[Tuple[str, Optional[str]]],
handle_empty_element: bool = True,
) -> None:
"""Handle an opening tag, e.g. '<tag>'
:param handle_empty_element: True if this tag is known to be
an empty-element tag (i.e. there is not expected to be any
closing tag).
"""
# TODO: handle namespaces here?
attr_dict: AttributeDict = self.attribute_dict_class()
for key, value in attrs:
# Change None attribute values to the empty string
# for consistency with the other tree builders.
if value is None:
value = ""
if key in attr_dict:
# A single attribute shows up multiple times in this
# tag. How to handle it depends on the
# on_duplicate_attribute setting.
on_dupe = self.on_duplicate_attribute
if on_dupe == self.IGNORE:
pass
elif on_dupe in (None, self.REPLACE):
attr_dict[key] = value
else:
on_dupe = cast(_DuplicateAttributeHandler, on_dupe)
on_dupe(attr_dict, key, value)
else:
attr_dict[key] = value
# print("START", name)
sourceline: Optional[int]
sourcepos: Optional[int]
if self.soup.builder.store_line_numbers:
sourceline, sourcepos = self.getpos()
else:
sourceline = sourcepos = None
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos
)
if tag and tag.is_empty_element and handle_empty_element:
# Unlike other parsers, html.parser doesn't send separate end tag
# events for empty-element tags. (It's handled in
# handle_startendtag, but only if the original markup looked like
# <tag/>.)
#
# So we need to call handle_endtag() ourselves. Since we
# know the start event is identical to the end event, we
# don't want handle_endtag() to cross off any previous end
# events for tags of this name.
self.handle_endtag(name, check_already_closed=False)
# But we might encounter an explicit closing tag for this tag
# later on. If so, we want to ignore it.
self.already_closed_empty_element.append(name)
if self._root_tag_name is None:
self._root_tag_encountered(name)
def handle_endtag(self, name: str, check_already_closed: bool = True) -> None:
"""Handle a closing tag, e.g. '</tag>'
:param name: A tag name.
:param check_already_closed: True if this tag is expected to
be the closing portion of an empty-element tag,
e.g. '<tag></tag>'.
"""
# print("END", name)
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
# We've already called handle_endtag() for it, so just
# check it off the list.
# print("ALREADY CLOSED", name)
self.already_closed_empty_element.remove(name)
else:
self.soup.handle_endtag(name)
def handle_data(self, data: str) -> None:
"""Handle some textual data that shows up between tags."""
self.soup.handle_data(data)
def handle_charref(self, name: str) -> None:
"""Handle a numeric character reference by converting it to the
corresponding Unicode character and treating it as textual
data.
:param name: Character number, possibly in hexadecimal.
"""
# TODO: This was originally a workaround for a bug in
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
# been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith("x"):
real_name = int(name.lstrip("x"), 16)
elif name.startswith("X"):
real_name = int(name.lstrip("X"), 16)
else:
real_name = int(name)
data = None
if real_name < 256:
# HTML numeric entities are supposed to reference Unicode
# code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, "windows-1252"):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError):
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
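# For example, &#147; is not a sensible Unicode code point for a
# quotation mark (U+0093 is a control character), but it is LEFT DOUBLE
# QUOTATION MARK in windows-1252, so the fallback above recovers it:
#
#   >>> from bs4 import BeautifulSoup
#   >>> str(BeautifulSoup("&#147;hi&#148;", "html.parser"))
#   '“hi”'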
def handle_entityref(self, name: str) -> None:
"""Handle a named entity reference by converting it to the
corresponding Unicode character(s) and treating it as textual
data.
:param name: Name of the entity reference.
"""
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
# If this were XML, it would be ambiguous whether "&foo"
was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data: str) -> None:
"""Handle an HTML comment.
:param data: The text of the comment.
"""
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data: str) -> None:
"""Handle a DOCTYPE declaration.
:param data: The text of the declaration.
"""
self.soup.endData()
data = data[len("DOCTYPE ") :]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data: str) -> None:
"""Handle a declaration of unknown type -- probably a CDATA block.
:param data: The text of the declaration.
"""
cls: Type[NavigableString]
if data.upper().startswith("CDATA["):
cls = CData
data = data[len("CDATA[") :]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data: str) -> None:
"""Handle a processing instruction.
:param data: The text of the instruction.
"""
self.soup.endData()
self.soup.handle_data(data)
self._document_might_be_xml(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
"""A Beautiful soup `bs4.builder.TreeBuilder` that uses the
:py:class:`html.parser.HTMLParser` parser, found in the Python
standard library.
"""
is_xml: bool = False
picklable: bool = True
NAME: str = HTMLPARSER
features: Iterable[str] = [NAME, HTML, STRICT]
parser_args: Tuple[Iterable[Any], Dict[str, Any]]
#: The html.parser knows which line number and position in the
#: original file is the source of an element.
TRACKS_LINE_NUMBERS: bool = True
def __init__(
self,
parser_args: Optional[Iterable[Any]] = None,
parser_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any,
):
"""Constructor.
:param parser_args: Positional arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param parser_kwargs: Keyword arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param kwargs: Keyword arguments for the superclass constructor.
"""
# Some keyword arguments will be pulled out of kwargs and placed
# into parser_kwargs.
extra_parser_kwargs = dict()
for arg in ("on_duplicate_attribute",):
if arg in kwargs:
value = kwargs.pop(arg)
extra_parser_kwargs[arg] = value
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
parser_kwargs["convert_charrefs"] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(
self,
markup: _RawMarkup,
user_specified_encoding: Optional[_Encoding] = None,
document_declared_encoding: Optional[_Encoding] = None,
exclude_encodings: Optional[_Encodings] = None,
) -> Iterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]:
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for parsing the document.
This TreeBuilder uses Unicode, Dammit to convert the markup
into Unicode, so the ``markup`` element of the tuple will
always be a string.
"""
if isinstance(markup, str):
# Parse Unicode as-is.
yield (markup, None, None, False)
return
# Ask UnicodeDammit to sniff the most likely encoding.
known_definite_encodings: List[_Encoding] = []
if user_specified_encoding:
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the
# HTML5 spec. (See the EncodingDetector class for
# details.)
known_definite_encodings.append(user_specified_encoding)
user_encodings: List[_Encoding] = []
if document_declared_encoding:
# This was found in the document; treat it as a slightly
# lower-priority user encoding.
user_encodings.append(document_declared_encoding)
dammit = UnicodeDammit(
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=True,
exclude_encodings=exclude_encodings,
)
if dammit.unicode_markup is None:
# In every case I've seen, Unicode, Dammit is able to
# convert the markup into Unicode, even if it needs to use
# REPLACEMENT CHARACTER. But there is a code path that
# could result in unicode_markup being None, and
# HTMLParser can only parse Unicode, so here we handle
# that code path.
raise ParserRejectedMarkup(
"Could not convert input to Unicode, and html.parser will not accept bytestrings."
)
else:
yield (
dammit.unicode_markup,
dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters,
)
def feed(self, markup: _RawMarkup) -> None:
args, kwargs = self.parser_args
# HTMLParser.feed will only handle str, but
# BeautifulSoup.markup is allowed to be _RawMarkup, because
# it's set by the yield value of
# TreeBuilder.prepare_markup. Fortunately,
# HTMLParserTreeBuilder.prepare_markup always yields a str
# (UnicodeDammit.unicode_markup).
assert isinstance(markup, str)
# We know BeautifulSoup calls TreeBuilder.initialize_soup
# before calling feed(), so we can assume self.soup
# is set.
assert self.soup is not None
parser = BeautifulSoupHTMLParser(self.soup, *args, **kwargs)
try:
parser.feed(markup)
parser.close()
except AssertionError as e:
# html.parser raises AssertionError in rare cases to
# indicate a fatal problem with the markup, especially
# when there's an error in the doctype declaration.
raise ParserRejectedMarkup(e)
parser.already_closed_empty_element = []

View File

@ -0,0 +1,491 @@
# encoding: utf-8
from __future__ import annotations
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [
"LXMLTreeBuilderForXML",
"LXMLTreeBuilder",
]
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Set,
Tuple,
Type,
TYPE_CHECKING,
Union,
)
from typing_extensions import TypeAlias
from io import BytesIO
from io import StringIO
from lxml import etree
from bs4.element import (
AttributeDict,
XMLAttributeDict,
Comment,
Doctype,
NamespacedAttribute,
ProcessingInstruction,
XMLProcessingInstruction,
)
from bs4.builder import (
DetectsXMLParsedAsHTML,
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML,
)
from bs4.dammit import EncodingDetector
from bs4.exceptions import ParserRejectedMarkup
if TYPE_CHECKING:
from bs4._typing import (
_Encoding,
_Encodings,
_NamespacePrefix,
_NamespaceURL,
_NamespaceMapping,
_InvertedNamespaceMapping,
_RawMarkup,
)
from bs4 import BeautifulSoup
LXML: str = "lxml"
def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
"Invert a dictionary."
return {v: k for k, v in d.items()}
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
_ParserOrParserClass: TypeAlias = Union[
_LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
class LXMLTreeBuilderForXML(TreeBuilder):
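"""A TreeBuilder that uses lxml's XML parser (`etree.XMLParser`),
selected with ``BeautifulSoup(markup, "xml")`` or
``BeautifulSoup(markup, "lxml-xml")``.
"""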
DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser
is_xml: bool = True
processing_instruction_class: Type[ProcessingInstruction]
NAME: str = "lxml-xml"
ALTERNATE_NAMES: Iterable[str] = ["xml"]
# Well, it's permissive by XML parser standards.
features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE: int = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")
DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)
nsmaps: List[Optional[_InvertedNamespaceMapping]]
empty_element_tags: Set[str]
parser: Any
_default_parser: Optional[etree.XMLParser]
# NOTE: If we parsed Element objects and looked at .sourceline,
# we'd be able to see the line numbers from the original document.
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
# See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup: BeautifulSoup) -> None:
"""Let the BeautifulSoup object know about the standard namespace
mapping.
:param soup: A `BeautifulSoup`.
"""
# Beyond this point, self.soup is set, so we can assume (and
# assert) it's not None whenever necessary.
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
def _register_namespaces(self, mapping: Dict[str, str]) -> None:
"""Let the BeautifulSoup object know about namespaces encountered
while parsing the document.
This might be useful later on when creating CSS selectors.
This will track (almost) all namespaces, even ones that were
only in scope for part of the document. If two namespaces have
the same prefix, only the first one encountered will be
tracked. Un-prefixed namespaces are not tracked.
:param mapping: A dictionary mapping namespace prefixes to URIs.
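An illustrative sketch (assuming lxml is installed, since the
``"xml"`` feature selects this builder); the tracked prefixes end
up in the private ``_namespaces`` dict on the soup object:
>>> from bs4 import BeautifulSoup
>>> doc = '<root xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:x>1</dc:x></root>'
>>> BeautifulSoup(doc, "xml")._namespaces["dc"]
'http://purl.org/dc/elements/1.1/'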
"""
assert self.soup is not None
for key, value in list(mapping.items()):
# This is 'if key' and not 'if key is not None' because we
# don't track un-prefixed namespaces. Soupsieve will
# treat an un-prefixed namespace as the default, which
# causes confusion in some cases.
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
"""Find the default parser for the given encoding.
:return: Either a parser object or a class, which
will be instantiated with default arguments.
"""
if self._default_parser is not None:
return self._default_parser
return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)
def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
"""Instantiate an appropriate parser for the given encoding.
:param encoding: A string.
:return: A parser object such as an `etree.XMLParser`.
"""
# Use the default parser.
parser = self.default_parser(encoding)
if callable(parser):
# Instantiate the parser with default arguments
parser = parser(target=self, recover=True, encoding=encoding)
return parser
def __init__(
self,
parser: Optional[etree.XMLParser] = None,
empty_element_tags: Optional[Set[str]] = None,
**kwargs: Any,
):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
self._default_parser = parser
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
if self.is_xml:
self.processing_instruction_class = XMLProcessingInstruction
else:
self.processing_instruction_class = ProcessingInstruction
if "attribute_dict_class" not in kwargs:
kwargs["attribute_dict_class"] = XMLAttributeDict
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
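# e.g. "{http://www.w3.org/1999/xhtml}body" -> ("http://www.w3.org/1999/xhtml", "body"),
# and, for a tag with no namespace, "body" -> (None, "body").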
if tag[0] == "{":
namespace, name = tag[1:].split("}", 1)
return (namespace, name)
else:
return (None, tag)
def prepare_markup(
self,
markup: _RawMarkup,
user_specified_encoding: Optional[_Encoding] = None,
document_declared_encoding: Optional[_Encoding] = None,
exclude_encodings: Optional[_Encodings] = None,
) -> Iterable[
Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
]:
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
lxml really wants to get a bytestring and convert it to
Unicode itself. So instead of using UnicodeDammit to convert
the bytestring to Unicode using different encodings, this
implementation uses EncodingDetector to iterate over the
encodings, and tell lxml to try to parse the document as each
one in turn.
:param markup: Some markup -- hopefully a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
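An illustrative sketch for Unicode input (two strategies: the
string itself, then its UTF-8 encoding):
>>> builder = LXMLTreeBuilderForXML()
>>> [(type(m).__name__, enc) for m, enc, _, _ in builder.prepare_markup("<r/>")]
[('str', None), ('bytes', 'utf8')]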
"""
if not self.is_xml:
# We're in HTML mode, so if we're given XML, that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
# TODO: This is a workaround for
# https://bugs.launchpad.net/lxml/+bug/1948551.
# We can remove it once the upstream issue is fixed.
if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
markup = markup[1:]
yield markup, None, document_declared_encoding, False
if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)
# Since the document was Unicode in the first place, there
# is no need to try any more strategies; we know this will
# work.
return
known_definite_encodings: List[_Encoding] = []
if user_specified_encoding:
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the
# HTML5 spec. (See the EncodingDetector class for
# details.)
known_definite_encodings.append(user_specified_encoding)
user_encodings: List[_Encoding] = []
if document_declared_encoding:
# This was found in the document; treat it as a slightly
# lower-priority user encoding.
user_encodings.append(document_declared_encoding)
detector = EncodingDetector(
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=not self.is_xml,
exclude_encodings=exclude_encodings,
)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup: _RawMarkup) -> None:
io: Union[BytesIO, StringIO]
if isinstance(markup, bytes):
io = BytesIO(markup)
elif isinstance(markup, str):
io = StringIO(markup)
# initialize_soup is called before feed, so we know this
# is not None.
assert self.soup is not None
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = io.read(self.CHUNK_SIZE)
try:
self.parser = self.parser_for(self.soup.original_encoding)
self.parser.feed(data)
while len(data) != 0:
# Now call feed() on the rest of the data, chunk by chunk.
data = io.read(self.CHUNK_SIZE)
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(e)
def close(self) -> None:
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(
self,
tag: str | bytes,
attrs: Dict[str | bytes, str | bytes],
nsmap: _NamespaceMapping = {},
) -> None:
# This is called by lxml code as a result of calling
# BeautifulSoup.feed(), and we know self.soup is set by the time feed()
# is called.
assert self.soup is not None
assert isinstance(tag, str)
# We need to recreate the attribute dict for three
# reasons. First, for type checking, so we can assert there
# are no bytestrings in the keys or values. Second, because we
# need a mutable dict--lxml might send us an immutable
# dictproxy. Third, so we can handle namespaced attribute
# names by converting the keys to NamespacedAttributes.
new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
self.attribute_dict_class()
)
for k, v in attrs.items():
assert isinstance(k, str)
assert isinstance(v, str)
new_attrs[k] = v
nsprefix: Optional[_NamespacePrefix] = None
namespace: Optional[_NamespaceURL] = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
# First, let the BeautifulSoup object know about it.
self._register_namespaces(nsmap)
# Then, add it to our running list of inverted namespace
# mappings.
self.nsmaps.append(_invert(nsmap))
# The currently active namespace prefixes have
# changed. Calculate the new mapping so it can be stored
# with all Tag objects created while these prefixes are in
# scope.
current_mapping = dict(self.active_namespace_prefixes[-1])
current_mapping.update(nsmap)
# We should not track un-prefixed namespaces as we can only hold one
# and it will be recognized as the default namespace by soupsieve,
# which may be confusing in some situations.
if "" in current_mapping:
del current_mapping[""]
self.active_namespace_prefixes.append(current_mapping)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/"
)
new_attrs[attribute] = namespace
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn them into NamespacedAttribute objects.
final_attrs: AttributeDict = self.attribute_dict_class()
for attr, value in list(new_attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
final_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
final_attrs[attr] = value
namespace, tag = self._getNsTag(tag)
nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(
tag,
namespace,
nsprefix,
final_attrs,
namespaces=self.active_namespace_prefixes[-1],
)
def _prefix_for_namespace(
self, namespace: Optional[_NamespaceURL]
) -> Optional[_NamespacePrefix]:
"""Find the currently active prefix for the given namespace."""
if namespace is None:
return None
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name: str | bytes) -> None:
assert self.soup is not None
assert isinstance(name, str)
self.soup.endData()
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
out_of_scope_nsmap = self.nsmaps.pop()
if out_of_scope_nsmap is not None:
# This tag introduced a namespace mapping which is no
# longer in scope. Recalculate the currently active
# namespace prefixes.
self.active_namespace_prefixes.pop()
def pi(self, target: str, data: str) -> None:
assert self.soup is not None
self.soup.endData()
data = target + " " + data
self.soup.handle_data(data)
self.soup.endData(self.processing_instruction_class)
def data(self, data: str | bytes) -> None:
assert self.soup is not None
assert isinstance(data, str)
self.soup.handle_data(data)
def doctype(self, name: str, pubid: str, system: str) -> None:
assert self.soup is not None
self.soup.endData()
doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
self.soup.handle_data(doctype_string)
self.soup.endData(containerClass=Doctype)
def comment(self, text: str | bytes) -> None:
"Handle comments as Comment objects."
assert self.soup is not None
assert isinstance(text, str)
self.soup.endData()
self.soup.handle_data(text)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment: str) -> str:
"""See `TreeBuilder`."""
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
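"""A TreeBuilder that uses lxml's HTML parser, selected with
``BeautifulSoup(markup, "lxml")``. It reuses the XML builder's
parsing machinery but runs lxml in HTML mode.
"""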
NAME: str = LXML
ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]
features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
is_xml: bool = False
def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
return etree.HTMLParser
def feed(self, markup: _RawMarkup) -> None:
# We know self.soup is set by the time feed() is called.
assert self.soup is not None
encoding = self.soup.original_encoding
try:
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment: str) -> str:
"""See `TreeBuilder`."""
return "<html><body>%s</body></html>" % fragment