2895 lines
107 KiB
Python
2895 lines
107 KiB
Python
from __future__ import annotations
|
|
|
|
# Use of this source code is governed by the MIT license.
|
|
__license__ = "MIT"
|
|
|
|
import re
|
|
import warnings
|
|
|
|
from bs4.css import CSS
|
|
from bs4._deprecation import (
|
|
_deprecated,
|
|
_deprecated_alias,
|
|
_deprecated_function_alias,
|
|
)
|
|
from bs4.formatter import (
|
|
Formatter,
|
|
HTMLFormatter,
|
|
XMLFormatter,
|
|
)
|
|
from bs4._warnings import AttributeResemblesVariableWarning
|
|
|
|
from typing import (
|
|
Any,
|
|
Callable,
|
|
Dict,
|
|
Generic,
|
|
Iterable,
|
|
Iterator,
|
|
List,
|
|
Mapping,
|
|
Optional,
|
|
Pattern,
|
|
Set,
|
|
TYPE_CHECKING,
|
|
Tuple,
|
|
Type,
|
|
TypeVar,
|
|
Union,
|
|
cast,
|
|
)
|
|
from typing_extensions import (
|
|
Self,
|
|
TypeAlias,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from bs4 import BeautifulSoup
|
|
from bs4.builder import TreeBuilder
|
|
from bs4.filter import ElementFilter
|
|
from bs4.formatter import (
|
|
_EntitySubstitutionFunction,
|
|
_FormatterOrName,
|
|
)
|
|
from bs4._typing import (
|
|
_AtMostOneElement,
|
|
_AttributeValue,
|
|
_AttributeValues,
|
|
_Encoding,
|
|
_InsertableElement,
|
|
_OneElement,
|
|
_QueryResults,
|
|
_RawOrProcessedAttributeValues,
|
|
_StrainableElement,
|
|
_StrainableAttribute,
|
|
_StrainableAttributes,
|
|
_StrainableString,
|
|
)
|
|
|
|
_OneOrMoreStringTypes: TypeAlias = Union[
|
|
Type["NavigableString"], Iterable[Type["NavigableString"]]
|
|
]
|
|
|
|
_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
|
|
|
|
# Deprecated module-level attributes.
|
|
# See https://peps.python.org/pep-0562/
|
|
# Maps each deprecated module attribute to the warning template shown
# when it is accessed.
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Serve deprecated module-level attributes on demand (PEP 562).

    :param name: The attribute that was not found on the module by
        normal lookup.

    :return: The module global called ``_deprecated_<name>``, after a
        `DeprecationWarning` has been issued.

    :raise AttributeError: If ``name`` is not a known deprecated name.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that touched the
    # attribute rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
|
|
|
|
|
|
#: Documents output by Beautiful Soup will be encoded with
|
|
#: this encoding unless you specify otherwise.
|
|
DEFAULT_OUTPUT_ENCODING: str = "utf-8"
|
|
|
|
#: A regular expression that matches runs of non-whitespace characters; use it with ``findall`` to split a string on whitespace.
|
|
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")
|
|
|
|
#: These encodings are recognized by Python (so `Tag.encode`
|
|
#: could theoretically support them) but XML and HTML don't recognize
|
|
#: them (so they should not show up in an XML or HTML document as that
|
|
#: document's encoding).
|
|
#:
|
|
#: If an XML document is encoded in one of these encodings, no encoding
|
|
#: will be mentioned in the XML declaration. If an HTML document is
|
|
#: encoded in one of these encodings, and the HTML document has a
|
|
#: <meta> tag that mentions an encoding, the encoding will be given as
|
|
#: the empty string.
|
|
#:
|
|
#: Source:
|
|
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
|
|
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    # Hyphenated and legacy spellings of the escape codecs.
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
|
|
|
|
|
|
class NamespacedAttribute(str):
    """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
    which remembers the namespace prefix ('xml') and the name ('lang')
    that were used to create it.

    The string value is ``prefix:name`` when both parts are present,
    and whichever part exists otherwise.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute string and record its components.

        :param prefix: The namespace prefix, if any.
        :param name: The local name. An empty name is stored as None.
        :param namespace: The namespace the prefix stands for, if known.
        """
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None
            # With no prefix either, fall back to the empty string:
            # str.__new__(cls, None) would yield the literal text "None".
            text = prefix or ""
        elif not prefix:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
|
|
|
|
|
|
class AttributeValueWithCharsetSubstitution(str):
    """Abstract base for an attribute value that names a character
    encoding inside an HTML ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup swap in a different encoding name for that
    part of the document when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this value rewritten to mention ``eventual_encoding``.

        Each subclass implements the rewrite for its own portion of an
        HTML document; the base class always raises.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()
|
|
|
|
|
|
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        # The original value isn't needed for substitution, but it
        # might be useful for the user to know.
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the value the ``charset`` attribute should take when
        the document is encoded as ``eventual_encoding``.

        :return: The encoding name, or the empty string for an encoding
            listed in `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
|
|
|
|
|
|
class AttributeValueList(List[str]):
    """The list type that holds the values of a multi-valued attribute
    (such as HTML's 'class').

    It behaves exactly like a regular list. It exists so that you can
    subclass it and pass the subclass to the TreeBuilder constructor as
    attribute_value_list_class, to have your subclass instantiated
    instead.
    """
|
|
|
|
|
|
class AttributeDict(Dict[Any, Any]):
    """Base class for the dictionary that stores a tag's attributes.

    It can be used directly, but it is a plain dict that adds no
    special logic of its own.
    """
|
|
|
|
|
|
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, converting common non-string
        values to strings first.

        XML attributes may have "any literal string as a value", so
        None becomes the empty string and int/float values are passed
        through ``str()``. Booleans and all other objects are stored
        unchanged.
        """
        if value is None:
            value = ""

        # XML defines no rules for boolean attributes, so a bool is left
        # alone (it is converted to a string on output) rather than
        # guessing what its value should be. bool is a subclass of int,
        # hence the explicit exclusion.
        #
        # Converting _every_ value to a plain string would be dangerous,
        # since a value may be a more sophisticated string-like object
        # (e.g. CharsetMetaAttributeValue); plain numbers are safe.
        is_plain_number = isinstance(value, (int, float)) and not isinstance(
            value, bool
        )
        if is_plain_number:
            value = str(value)

        super().__setitem__(key, value)
|
|
|
|
|
|
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        * ``False`` or ``None`` removes the attribute (if present)
          instead of storing anything.
        * ``True`` stores the attribute's own unqualified name as its
          value.
        * ``int`` and ``float`` values are stored as strings.
        * Anything else is stored unchanged.
        """
        # Identity checks are deliberate. A membership test such as
        # ``value in (False, None)`` compares with ==, and 0 == False,
        # so a numeric zero would be mistaken for "omit this attribute".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return

        if isinstance(value, bool):
            # Only True can reach this branch.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear whether "canonical name" means
            # fully-qualified name, unqualified name, or (probably
            # not) name with namespace prefix. For now this uses the
            # unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # Only numbers are stringified: other values may be
            # string-like objects with behavior of their own.
            value = str(value)

        super().__setitem__(key, value)
|
|
|
|
|
|
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        # The match result is not used; the call is kept as in the
        # original implementation.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` value to use when the document is
        encoded as ``eventual_encoding``.

        For an encoding in `PYTHON_SPECIFIC_ENCODINGS` the whole
        ``charset=...`` clause is removed; otherwise only the encoding
        name after ``charset=`` is replaced.
        """
        pattern = self.CHARSET_RE
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return pattern.sub("", self.original_value)

        # Group 1 is everything up to and including "charset=".
        return pattern.sub(
            lambda found: found.group(1) + eventual_encoding, self.original_value
        )
|
|
|
|
|
|
class PageElement(object):
|
|
"""An abstract class representing a single element in the parse tree.
|
|
|
|
`NavigableString`, `Tag`, etc. are all subclasses of
|
|
`PageElement`. For this reason you'll see a lot of methods that
|
|
return `PageElement`, but you'll never see an actual `PageElement`
|
|
object. For the most part you can think of `PageElement` as
|
|
meaning "a `Tag` or a `NavigableString`."
|
|
"""
|
|
|
|
#: In general, we can't tell just by looking at an element whether
|
|
#: it's contained in an XML document or an HTML document. But for
|
|
#: `Tag` objects (q.v.) we can store this information at parse time.
|
|
#: :meta private:
|
|
known_xml: Optional[bool] = None
|
|
|
|
#: Whether or not this element has been decomposed from the tree
|
|
#: it was created in.
|
|
_decomposed: bool
|
|
|
|
parent: Optional[Tag]
|
|
next_element: _AtMostOneElement
|
|
previous_element: _AtMostOneElement
|
|
next_sibling: _AtMostOneElement
|
|
previous_sibling: _AtMostOneElement
|
|
|
|
#: Whether or not this element is hidden from generated output.
|
|
#: Only the `BeautifulSoup` object itself is hidden.
|
|
hidden: bool = False
|
|
|
|
def setup(
|
|
self,
|
|
parent: Optional[Tag] = None,
|
|
previous_element: _AtMostOneElement = None,
|
|
next_element: _AtMostOneElement = None,
|
|
previous_sibling: _AtMostOneElement = None,
|
|
next_sibling: _AtMostOneElement = None,
|
|
) -> None:
|
|
"""Sets up the initial relations between this element and
|
|
other elements.
|
|
|
|
:param parent: The parent of this element.
|
|
|
|
:param previous_element: The element parsed immediately before
|
|
this one.
|
|
|
|
:param next_element: The element parsed immediately after
|
|
this one.
|
|
|
|
:param previous_sibling: The most recently encountered element
|
|
on the same level of the parse tree as this one.
|
|
|
|
:param previous_sibling: The next element to be encountered
|
|
on the same level of the parse tree as this one.
|
|
"""
|
|
self.parent = parent
|
|
|
|
self.previous_element = previous_element
|
|
if self.previous_element is not None:
|
|
self.previous_element.next_element = self
|
|
|
|
self.next_element = next_element
|
|
if self.next_element is not None:
|
|
self.next_element.previous_element = self
|
|
|
|
self.next_sibling = next_sibling
|
|
if self.next_sibling is not None:
|
|
self.next_sibling.previous_sibling = self
|
|
|
|
if (
|
|
previous_sibling is None
|
|
and self.parent is not None
|
|
and self.parent.contents
|
|
):
|
|
previous_sibling = self.parent.contents[-1]
|
|
|
|
self.previous_sibling = previous_sibling
|
|
if self.previous_sibling is not None:
|
|
self.previous_sibling.next_sibling = self
|
|
|
|
def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters. None means "return ``s`` untouched".
    :return: The result of the formatter's ``substitute`` method.
    """
    if formatter is None:
        return s
    # Anything that isn't already a Formatter is resolved by name.
    resolved = (
        formatter
        if isinstance(formatter, Formatter)
        else self.formatter_for_name(formatter)
    )
    return resolved.substitute(s)
|
|
|
|
def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for an
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up an `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    # The flavour of this element's tree decides which formatter class
    # and which registry apply.
    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter

    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)
    return formatter_class.REGISTRY[formatter_name]
|
|
|
|
@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document
        # was parsed.
        return known

    # Otherwise, this element was likely created by direct invocation
    # of the constructor from within the user's Python code; defer to
    # whatever contains it.
    container = self.parent
    if container is not None:
        return container._is_xml

    # This is the top-level object. It should have .known_xml set
    # from tree creation. If not, take a guess--BS is usually used
    # on HTML markup.
    return getattr(self, "is_xml", False)
|
|
|
|
nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
|
|
previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
|
|
|
|
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; this base version always raises.

    :raise NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
|
|
def __copy__(self) -> Self:
    """Return a copy of this element by delegating to ``__deepcopy__``.

    A copy of a PageElement can only be a deep copy, because
    only one PageElement can occupy a given place in a parse tree.
    """
    fresh_memo: Dict[Any, Any] = {}
    return self.__deepcopy__(fresh_memo)
|
|
|
|
#: Placeholder (an empty tuple) used as the default for ``types`` arguments.
#: :meta private:
default: Iterable[type[NavigableString]] = ()


def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    This base version always raises; it is implemented differently
    in `Tag` and `NavigableString`.

    :raise NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
|
|
@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping them
    first.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    # True asks _all_strings to strip each string.
    yield from self._all_strings(True)
|
|
|
|
def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
|
|
|
|
getText = get_text
|
|
text = property(get_text)
|
|
|
|
def replace_with(self, *args: PageElement) -> Self:
    """Replace this `PageElement` with one or more other `PageElement`,
    objects, keeping the rest of the tree the same.

    :param args: The replacement elements, inserted in order at the
        position this element occupied.

    :return: This `PageElement`, no longer part of the tree.

    :raise ValueError: If this element has no parent, or if one of
        the replacements is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is container:
            raise ValueError("Cannot replace a Tag with its parent.")

    position = container.index(self)
    # The known index is passed so extract() need not search for it.
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        container.insert(position + offset, replacement)
    return self
|
|
|
|
replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
|
|
|
|
def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    :param wrap_inside: The tag that will take this element's place
        and receive this element as appended content.

    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    # replace_with() hands back the displaced element, which is then
    # appended to the wrapper.
    displaced = self.replace_with(wrap_inside)
    wrap_inside.append(displaced)
    return wrap_inside
|
|
|
|
def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    container = self.parent
    if container is not None:
        if _self_index is None:
            _self_index = container.index(self)
        del container.contents[_self_index]

    # Find the two elements that would be next to each other if this
    # element (and any children) hadn't been parsed, and connect them.
    #
    # _last_descendant() is called with its default accept_self=True,
    # so the worst case is that it returns self. The cast removes
    # several mypy complaints about a possible None.
    tail = cast(PageElement, self._last_descendant())
    following = tail.next_element
    preceding = self.previous_element

    if preceding is not None and preceding is not following:
        preceding.next_element = following
    if following is not None and following is not preceding:
        following.previous_element = preceding
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Stitch the sibling chain back together around the gap.
    older, younger = self.previous_sibling, self.next_sibling
    if older is not None and older is not younger:
        older.next_sibling = younger
    if younger is not None and younger is not older:
        younger.previous_sibling = older
    self.previous_sibling = self.next_sibling = None
    return self
|
|
|
|
def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    node: _AtMostOneElement = self
    while node is not None:
        # Grab the successor first: clearing __dict__ discards the link.
        upcoming = node.next_element
        node.__dict__.clear()
        if isinstance(node, Tag):
            node.contents = []
        node._decomposed = True
        node = upcoming
|
|
|
|
def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    Note for type checking: this returns _AtMostOneElement rather
    than PageElement because None is returned when `accept_self` is
    False and the `PageElement` has no children--either it's a
    NavigableString or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    # next_sibling is only consulted once setup() has run.
    sibling = self.next_sibling if is_initialized else None
    if sibling is not None:
        # Whatever was parsed just before the next sibling is the last
        # thing beneath this element.
        candidate = sibling.previous_element
    else:
        # No sibling to consult: follow last children downward.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate
|
|
|
|
_lastRecursiveChild = _deprecated_alias(
|
|
"_lastRecursiveChild", "_last_descendant", "4.0.0"
|
|
)
|
|
|
|
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raise ValueError: If this element has no parent, or if it is
        itself one of ``args``.
    """
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for newcomer in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(newcomer, PageElement):
            newcomer.extract()
        # This element's position is looked up again on every pass.
        position = container.index(self)
        inserted.extend(container.insert(position, newcomer))
    return inserted
|
|
|
|
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one, in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raise ValueError: If this element has no parent, or if it is
        itself one of ``args``.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    if any(x is self for x in args):
        raise ValueError("Can't insert an element after itself.")

    # Number of elements already placed after this one.
    offset = 0
    results: List[PageElement] = []
    for successor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        inserted = list(parent.insert(index + 1 + offset, successor))
        results.extend(inserted)
        # parent.insert() returns a list that may hold several
        # elements; advance by that count so the next successor lands
        # after all of them, not in the middle.
        offset += len(inserted)

    return results
|
|
|
|
def find_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # _find_one runs the matching find-all method for a single result.
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
|
|
|
|
def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_elements
    # One is added to the stack level to account for this frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
|
|
|
|
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
|
|
|
|
def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # _find_one runs the matching find-all method for a single result.
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
findNextSibling = _deprecated_function_alias(
|
|
"findNextSibling", "find_next_sibling", "4.0.0"
|
|
)
|
|
|
|
def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find all siblings of this `PageElement` that match the given criteria
    and appear later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    # One is added to the stack level to account for this frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
|
|
|
|
findNextSiblings = _deprecated_function_alias(
|
|
"findNextSiblings", "find_next_siblings", "4.0.0"
|
|
)
|
|
fetchNextSiblings = _deprecated_function_alias(
|
|
"fetchNextSiblings", "find_next_siblings", "3.0.0"
|
|
)
|
|
|
|
def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look backwards in the document from this `PageElement` and find the
    first `PageElement` that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # _find_one runs the matching find-all method for a single result.
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, string, **kwargs)
|
|
|
|
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
|
|
|
|
def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look backwards in the document from this `PageElement` and find all
    `PageElement` that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    # One is added to the stack level to account for this frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
|
|
|
|
findAllPrevious = _deprecated_function_alias(
|
|
"findAllPrevious", "find_all_previous", "4.0.0"
|
|
)
|
|
fetchAllPrevious = _deprecated_function_alias(
|
|
"fetchAllPrevious", "find_all_previous", "3.0.0"
|
|
)
|
|
|
|
def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the nearest sibling of this `PageElement` that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first match reported by `find_previous_siblings`, or
        None if there is no match.
    """
    # Delegate to the plural method through _find_one, which asks it
    # for a single result.
    plural_finder = self.find_previous_siblings
    return self._find_one(plural_finder, name, attrs, string, **kwargs)
|
|
|
|
# Deprecated camelCase spelling of find_previous_sibling, kept for
# backward compatibility.
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
|
|
|
|
def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every sibling of this `PageElement` that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # The candidates are whatever `previous_siblings` yields; the stack
    # level is bumped by one to account for this wrapper's own frame.
    earlier_siblings = self.previous_siblings
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_siblings,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
|
|
|
|
# Deprecated spellings of find_previous_siblings, kept for backward
# compatibility.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
|
|
|
|
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this `PageElement` that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result reported by `find_parents`, or None if
        nothing matched.
    """
    # NOTE: We can't use _find_one because find_parents takes a
    # different set of arguments (it has no `string` filter).
    #
    # A limit of 1 is requested since only the first match is used.
    # _stacklevel=3 makes find_parents hand _stacklevel 4 to _find_all,
    # accounting for the extra frame this wrapper adds.
    results = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return results[0] if results else None
|
|
|
|
# Deprecated camelCase spelling of find_parent, kept for backward
# compatibility.
findParent = _deprecated_function_alias(
    "findParent", "find_parent", "4.0.0"
)
|
|
|
|
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every parent of this `PageElement` that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # This method has no `string` filter, so None is passed in that
    # slot. The candidates are whatever `parents` yields.
    return self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
|
|
|
|
# Deprecated spellings of find_parents, kept for backward compatibility.
findParents = _deprecated_function_alias(
    "findParents", "find_parents", "4.0.0"
)
fetchParents = _deprecated_function_alias(
    "fetchParents", "find_parents", "3.0.0"
)
|
|
|
|
@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    This is another name for the ``next_element`` attribute.
    """
    following = self.next_element
    return following
|
|
|
|
@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    This is another name for the ``previous_element`` attribute.
    """
    preceding = self.previous_element
    return preceding
|
|
|
|
# These methods do the real heavy lifting.
|
|
|
|
def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a find_all-style method with a limit of one and unwrap the
    result.

    :param method: The plural find method to call. It is invoked as
        ``method(name, attrs, string, 1, _stacklevel=4, **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first element of the result, or None if the result
        is empty.
    """
    matches: _QueryResults = method(
        name, attrs, string, 1, _stacklevel=4, **kwargs
    )
    if not matches:
        return None
    return matches[0]
|
|
|
|
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterate over a generator of candidates and return the ones that
    match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The candidate elements, in search order.
    :param _stacklevel: The ``stacklevel`` handed to `warnings.warn`
        for the warnings issued here.
    :kwargs: Additional filters on attribute values.
    """
    # Honor the deprecated 'text' spelling of the 'string' argument,
    # but only when 'string' itself was not provided.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' looks like a misspelling of 'class_'; warn about it.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % {"original": "_class", "autocorrect": "class_"},
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    result: Iterable[_OneElement]
    only_name_given = string is None and not (limit or attrs or kwargs)
    if only_name_given:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (
                candidate for candidate in generator if isinstance(candidate, Tag)
            )
            return ResultSet(matcher, result)

        if isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(":") == 1:
                # This is a name with a prefix. If this is a namespace-aware
                # document, we need to match the local name against tag.name.
                # If not, we need to match the fully-qualified name against
                # tag.name.
                prefix, local_name = name.split(":", 1)
            else:
                prefix, local_name = None, name
            result = [
                candidate
                for candidate in generator
                if isinstance(candidate, Tag)
                and (
                    candidate.name == name
                    or (
                        candidate.name == local_name
                        and (prefix is None or candidate.prefix == prefix)
                    )
                )
            ]
            return ResultSet(matcher, result)

    # General case: let the matcher do the filtering.
    return matcher.find_all(generator, limit)
|
|
|
|
# These generators can be used to navigate starting from both
|
|
# NavigableStrings and Tags.
|
|
@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements.
    """
    current = self.next_element
    while current is not None:
        # The following element is looked up before yielding, so the
        # walk continues from the link as it was at that moment.
        upcoming = current.next_element
        yield current
        current = upcoming
|
|
|
|
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it."""
    following = self.next_elements
    return self._self_and(following)
|
|
|
|
@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements.
    """
    current = self.next_sibling
    while current is not None:
        # The following sibling is looked up before yielding, so the
        # walk continues from the link as it was at that moment.
        upcoming = current.next_sibling
        yield current
        current = upcoming
|
|
|
|
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    later."""
    later_siblings = self.next_siblings
    return self._self_and(later_siblings)
|
|
|
|
@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, nearest first.
    """
    current = self.previous_element
    while current is not None:
        # The preceding element is looked up before yielding, so the
        # walk continues from the link as it was at that moment.
        earlier = current.previous_element
        yield current
        current = earlier
|
|
|
|
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed
    earlier."""
    earlier_elements = self.previous_elements
    return self._self_and(earlier_elements)
|
|
|
|
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, nearest first.
    """
    current = self.previous_sibling
    while current is not None:
        # The preceding sibling is looked up before yielding, so the
        # walk continues from the link as it was at that moment.
        earlier = current.previous_sibling
        yield current
        current = earlier
|
|
|
|
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier."""
    earlier_siblings = self.previous_siblings
    return self._self_and(earlier_siblings)
|
|
|
|
@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # The next ancestor is looked up before yielding, so the walk
        # continues from the link as it was at that moment.
        next_ancestor = ancestor.parent
        yield ancestor
        ancestor = next_ancestor
|
|
|
|
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)
|
|
|
|
def _self_and(
    self, other_generator: Iterator[PageElement]
) -> Iterator[PageElement]:
    """Modify a generator by yielding this element, then everything
    yielded by the other generator.

    This element is left out when its ``hidden`` attribute is true.

    :param other_generator: The elements to yield after this one.
    """
    if not self.hidden:
        yield self
    yield from other_generator
|
|
|
|
@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag or False
|
|
|
|
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of `next_elements`.

    :meta private:
    """
    return self.next_elements
|
|
|
|
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of `next_siblings`.

    :meta private:
    """
    return self.next_siblings
|
|
|
|
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of `previous_elements`.

    :meta private:
    """
    return self.previous_elements
|
|
|
|
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of `previous_siblings`.

    :meta private:
    """
    return self.previous_siblings
|
|
|
|
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of `parents`.

    :meta private:
    """
    return self.parents
|
|
|
|
|
|
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text of the string. A non-``str`` value is
            decoded using DEFAULT_OUTPUT_ENCODING.
        """
        if isinstance(value, str):
            u = str.__new__(cls, value)
        else:
            u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        u.hidden = False
        u.setup()
        return u

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        return type(self)(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor argument used to recreate this string:
        its text as a plain ``str``."""
        return (str(self),)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8.
    def __getitem__(self, key: Union[int, slice]) -> str:
        """Return the character or substring selected by ``key``.

        :param key: An integer index or a slice.
        :raise TypeError: If ``key`` is a string, which suggests that a
            `NavigableString` is being treated like a `Tag`.
        """
        if isinstance(key, str):
            raise TypeError(
                "string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(
                    key.__class__.__name__
                )
            )
        return super().__getitem__(key)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted string, wrapped in `PREFIX` and `SUFFIX`.
        """
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A single NavigableString subclass, or a collection
            of them. If this NavigableString isn't exactly one of those
            classes, the sequence will be empty. By default, the classes
            considered are the ones in `Tag.MAIN_CONTENT_STRING_TYPES`.
            Passing None disables the type check.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # We check specific types instead of using isinstance(self,
        # types) because all of these classes subclass
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        my_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # Looking for a single type.
                if my_type is not types:
                    return
            elif my_type not in types:
                # Looking for one of a list of types.
                return

        final_value = self.strip() if strip else self
        # An empty string (possibly empty only after stripping) is
        # never yielded.
        if len(final_value) > 0:
            yield final_value

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
|
|
|
|
|
|
class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        # The formatter's output is deliberately discarded; the text
        # itself is emitted unchanged between PREFIX and SUFFIX.
        if formatter is not None:
            self.format_string(self, formatter)
        wrapped = self.PREFIX + self + self.SUFFIX
        return wrapped
|
|
|
|
|
|
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output is wrapped in ``<![CDATA[`` and ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"
|
|
|
|
|
|
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Output is wrapped in ``<?`` and ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"
|
|
|
|
|
|
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
|
|
|
|
|
|
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output is wrapped in ``<!--`` and ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"
|
|
|
|
|
|
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
|
|
|
|
|
|
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new `Doctype` whose text is built by
            `_string_for_name_and_ids`.
        """
        return Doctype(cls._string_for_name_and_ids(name, pub_id, system_id))

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The name of the document's root element. A falsy
            value is treated as the empty string.
        :param pub_id: The public identifier, or None.
        :param system_id: The system identifier, or None.
        :return: ``name``, followed by a ``PUBLIC`` clause when
            ``pub_id`` is given (with the system ID appended if present),
            or by a ``SYSTEM`` clause when only ``system_id`` is given.
        """
        value = name or ""
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id
        return value
|
|
|
|
|
|
class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
|
|
|
|
|
|
class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    The separate class lets executable code be told apart from
    textual content.
    """
|
|
|
|
|
|
class TemplateString(NavigableString):
    """A `NavigableString` for a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
|
|
|
|
|
|
class RubyTextString(NavigableString):
    """A `NavigableString` holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    The separate class lets such strings be told apart from the
    strings they're annotating.
    """
|
|
|
|
|
|
class RubyParenthesisString(NavigableString):
    """A `NavigableString` holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """
|
|
|
|
|
|
class Tag(PageElement):
|
|
"""An HTML or XML tag that is part of a parse tree, along with its
|
|
attributes, contents, and relationships to other parts of the tree.
|
|
|
|
When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
|
|
create a `Tag` object representing the ``<b>`` tag. You can
|
|
instantiate `Tag` objects directly, but it's not necessary unless
|
|
you're adding entirely new markup to a parsed document. Most of
|
|
the constructor arguments are intended for use by the `TreeBuilder`
|
|
that's parsing a document.
|
|
|
|
:param parser: A `BeautifulSoup` object representing the parse tree this
|
|
`Tag` will be part of.
|
|
:param builder: The `TreeBuilder` being used to build the tree.
|
|
:param name: The name of the tag.
|
|
:param namespace: The URI of this tag's XML namespace, if any.
|
|
:param prefix: The prefix for this tag's XML namespace, if any.
|
|
:param attrs: A dictionary of attribute values.
|
|
:param parent: The `Tag` to use as the parent of this `Tag`. May be
|
|
the `BeautifulSoup` object itself.
|
|
:param previous: The `PageElement` that was parsed immediately before
|
|
parsing this tag.
|
|
:param is_xml: If True, this is an XML tag. Otherwise, this is an
|
|
HTML tag.
|
|
:param sourceline: The line number where this tag was found in its
|
|
source document.
|
|
:param sourcepos: The character position within ``sourceline`` where this
|
|
tag was found.
|
|
:param can_be_empty_element: If True, this tag should be
|
|
represented as <tag/>. If False, this tag should be represented
|
|
as <tag></tag>.
|
|
:param cdata_list_attributes: A dictionary of attributes whose values should
|
|
be parsed as lists of strings if they ever show up on this tag.
|
|
:param preserve_whitespace_tags: Names of tags whose contents
|
|
should have their whitespace preserved if they are encountered inside
|
|
this tag.
|
|
:param interesting_string_types: When iterating over this tag's
|
|
string contents in methods like `Tag.strings` or
|
|
`PageElement.get_text`, these are the types of strings that are
|
|
interesting enough to be considered. By default,
|
|
`NavigableString` (normal strings) and `CData` (CDATA
|
|
sections) are the only interesting string subtypes.
|
|
:param namespaces: A dictionary mapping currently active
|
|
namespace prefixes to URIs, as of the point in the parsing process when
|
|
this tag was encountered. This can be used later to
|
|
construct CSS selectors.
|
|
|
|
"""
|
|
|
|
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize the tag. The arguments are described in the class
    docstring.

    :raise ValueError: If no ``name`` is provided.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The source position is stored exactly as given. (This used to be
    # an if/else on builder.store_line_numbers whose two branches made
    # the same assignments.)
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Pick the container classes for attributes: defaults when there is
    # no builder, otherwise whatever the builder specifies.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs
        )
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for k, v in attrs.items():
            if isinstance(v, list):
                v = v.__class__(v)
            self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (self.attribute_value_list_class was already taken from the
        # builder above, so it is not assigned again here.)
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
|
|
|
|
# Type declarations for the instance attributes assigned in __init__.

# Identity of the tag.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues

# Where the tag was found in its source document, if known.
sourceline: Optional[int]
sourcepos: Optional[int]

# Tree state and string handling.
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Values taken from the TreeBuilder when one is provided, otherwise from
# the constructor arguments.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Deprecated camelCase spelling of parser_class.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
|
|
|
|
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :param memo: Passed through to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `copy_self`) and the copy has no contents.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Clone this tag's descendants recursively, but without
    # making any recursive function calls. The last entry of
    # `open_tags` is the clone currently receiving children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # Stop appending incoming Tags to the Tag that was
            # just closed.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_tags[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # Add the Tag itself to the stack so that its
            # children will be .appended to it.
            open_tags.append(cast(Tag, copied))
    return clone
|
|
|
|
def copy_self(self) -> Self:
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process, but you can
    call it on its own to create a copy of a Tag without copying its
    contents.

    :return: A new object of the same class as this one, built with no
        parser and no builder.
    """
    tag_class = type(self)
    clone = tag_class(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two attributes over explicitly after construction.
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
|
|
|
|
@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the `TreeBuilder` used to create the
    tag. If the builder has a designated list of empty-element
    tags, then only a tag whose name shows up in that list is
    considered an empty-element tag. This is usually the case
    for HTML documents.

    If the builder has no designated list of empty-element, then
    any tag with no contents is an empty-element tag. This is usually
    the case for XML documents.
    """
    if len(self.contents) != 0:
        return False
    # Only an explicit True counts; None and False both mean "no".
    return self.can_be_empty_element is True
|
|
|
|
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    """Deprecated spelling of `is_empty_element`.

    :meta private:
    """
    return self.is_empty_element
|
|
|
|
@property
def string(self) -> Optional[str]:
    """Convenience property for the single string inside this `Tag`.

    :return: If this `Tag` has exactly one child and it is a
       `NavigableString`, that string. If the only child is itself a
       `Tag`, that child's `Tag.string` (recursively). If this `Tag`
       has no children, or more than one child, ``None``.

    If this property unexpectedly returns ``None``, the usual cause
    is that the `Tag` has more than one thing inside it.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Delegate to the nested tag, which applies the same rules.
        return only_child.string
    return None
|
|
|
|
@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    A value that is already a `NavigableString` (or a subclass) is
    re-created with its own class, so the subclass is preserved; any
    other string becomes a plain `NavigableString`.
    """
    self.clear()
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))
|
|
|
|
#: String classes that `_all_strings` yields when a tag's
#: ``interesting_string_types`` is ``None``.
#:
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}
|
|
|
|
def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the strings of selected classes found beneath this tag.

    :param strip: If True, each string is stripped before being
       yielded, and strings that are empty after stripping are
       skipped.

    :param types: Either a single NavigableString subclass or a
       collection of them. A string is yielded only if its exact
       class (not a subclass of it) matches. By default the classes
       come from ``self.interesting_string_types``; if that is
       ``None``, only `NavigableString` and `CData` objects are
       considered, which means no comments, processing instructions,
       etc.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue
        node_class = type(node)
        if isinstance(types, type):
            wanted = node_class is types
        else:
            wanted = types is None or node_class in types
        if not wanted:
            # We're not interested in strings of this type.
            continue
        if not strip:
            yield node
            continue
        text = node.strip()
        if text:
            yield text


strings = property(_all_strings)
|
|
|
|
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements, in insertion order.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        newly_inserted = self._insert(position, new_child)
        inserted.extend(newly_inserted)
        # One argument may expand into several elements (a
        # BeautifulSoup object contributes all of its children) or
        # into none, so advance by what was actually inserted rather
        # than by one; otherwise the next argument would land in the
        # middle of the elements just added.
        position += len(newly_inserted)
    return inserted
|
|
|
|
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single new child and rewire the surrounding tree links.

    This is the per-element worker behind `Tag.insert`. A plain string
    is wrapped in a `NavigableString`; a `BeautifulSoup` object is
    replaced by its children; an element that already has a parent is
    extracted from it first. The method then updates the parent,
    sibling (``previous_sibling``/``next_sibling``) and document-order
    (``previous_element``/``next_element``) pointers of the new child
    and its neighbours before adding it to ``self.contents``.

    :param position: The index the new child should occupy. Values
       past the end are clamped to ``len(self.contents)``.
    :param new_child: The element or string to insert.
    :return: The elements actually inserted. This holds more than one
       element only when ``new_child`` was a `BeautifulSoup` object.
    :raises ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level, presumably to avoid a
    # circular import between bs4 and this module.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    # Like list.insert, a position past the end means "append".
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from whatever parent it currently has.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: nothing precedes it among its siblings, and the
        # element just before it in document order is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # In document order the new child follows the deepest last
        # descendant of its previous sibling.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # The new child becomes the last child of this tag.
        new_child.next_sibling = None

        # Walk up the ancestors until one of them has a next sibling;
        # that sibling is what follows the new subtree in document order.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # The new child is placed in front of an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]
|
|
|
|
def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    The element is removed from its parent and each of its children
    is inserted into the parent at the position the element occupied,
    keeping the children's original order.

    :return: This object, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Inserting every child at the same slot, last child first, leaves
    # them in their original order. Iterate over a copy because
    # inserting a child elsewhere may remove it from self.contents.
    for child in self.contents[::-1]:
        container.insert(slot, child)
    return self


replace_with_children = unwrap
|
|
|
|
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    """Deprecated spelling of `Tag.unwrap`.

    :meta private:
    """
    return self.unwrap()
|
|
|
|
def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` to the end of this `Tag`'s contents.

    :param tag: A PageElement.

    :return: The newly appended PageElement (the first one, if the
       argument expanded into several elements).
    """
    end = len(self.contents)
    appended = self.insert(end, tag)
    return appended[0]
|
|
|
|
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`. A single
        non-Tag element or string is accepted with a warning.

    :return: The list of PageElements that were appended.
    :raises TypeError: If ``tags`` is neither a `Tag`, a single
        element or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Snapshot the contents: appending a child here removes it
        # from the other tag's contents.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch an unsupported argument would surface
        # later as an UnboundLocalError on tag_list.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        results.append(self.append(tag))

    return results
|
|
|
|
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    By default each child is removed with `PageElement.extract`.

    :param decompose: If this is True, `PageElement.decompose` (a
        more destructive method) is called on each child instead.
    """
    # Work on a snapshot, since removing a child changes self.contents.
    snapshot = list(self.contents)
    if decompose:
        for child in snapshot:
            child.decompose()
    else:
        for child in snapshot:
            child.extract()
|
|
|
|
def smooth(self) -> None:
    """Consolidate runs of adjacent strings among this `Tag`'s children.

    Child tags are smoothed recursively. Two neighbouring children
    are merged into one `NavigableString` when both are
    `NavigableString` objects and neither is a `PreformattedString`.

    If you perform a lot of operations that modify the tree,
    calling this method afterwards can make pretty-printed output
    look more natural.
    """
    # Record the index of the first member of each adjacent pair that
    # should be merged. This avoids copying self.contents, since in
    # most cases very few strings will be affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The final child has no following sibling to merge with.
            continue
        following = self.contents[position + 1]
        mergeable = all(
            isinstance(node, NavigableString)
            and not isinstance(node, PreformattedString)
            for node in (current, following)
        )
        if mergeable:
            merge_points.append(position)

    # Process the recorded positions back to front, so that removing
    # items from .contents does not shift the positions still to be
    # handled.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))
|
|
|
|
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Comparing with ``is`` rather than ``==`` avoids picking the wrong
    child when a `Tag` contains two children that compare equal.

    :param element: Look for this `PageElement` in this object's contents.
    :return: The index of ``element`` within ``self.contents``.
    :raises ValueError: If ``element`` is not a direct child.
    """
    found = next(
        (i for i, child in enumerate(self.contents) if child is element), None
    )
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
|
|
|
|
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Return the value of the ``key`` attribute of this tag, or
    ``default`` if the tag does not have that attribute.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The stored attribute value, or ``default``.
    """
    return self.attrs.get(key, default)
|
|
|
|
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like `Tag.get`, but always return a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value. A value that is already a list is returned as-is.
    """
    value = self.get(key, default)
    if value is None:
        return self.attribute_value_list_class()
    if isinstance(value, list):
        return value
    # A single value is wrapped in a one-element list; the cast only
    # informs the type checker and does not change the value.
    return self.attribute_value_list_class([cast(str, value)])
|
|
|
|
def has_attr(self, key: str) -> bool:
    """Does this `Tag` have an attribute with the given name?

    :param key: The attribute name to look up in ``self.attrs``.
    """
    return key in self.attrs
|
|
|
|
def __hash__(self) -> int:
    """Hash a Tag by the hash of its string rendering."""
    return str(self).__hash__()
|
|
|
|
def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] returns the value of the 'key' attribute for the Tag.

    :raises KeyError: If the attribute is not present (assuming
        ``self.attrs`` behaves like a dict).
    """
    return self.attrs[key]
|
|
|
|
def __iter__(self) -> Iterator[PageElement]:
    """Iterating over a Tag iterates over its direct children
    (``self.contents``)."""
    return iter(self.contents)
|
|
|
|
def __len__(self) -> int:
    """The length of a Tag is the number of its direct children
    (the length of ``self.contents``)."""
    return len(self.contents)
|
|
|
|
def __contains__(self, x: Any) -> bool:
    """``x in tag`` tests whether ``x`` is one of the tag's direct
    children, using list membership on ``self.contents``."""
    return x in self.contents
|
|
|
|
def __bool__(self) -> bool:
    """A Tag is always truthy, even if it has no contents.

    Without this method Python would fall back on ``__len__``, and a
    Tag with no children would be falsy.
    """
    return True
|
|
|
|
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag, replacing any existing value."""
    self.attrs[key] = value
|
|
|
|
def __delitem__(self, key: str) -> None:
    """Deleting tag[key] removes the 'key' attribute from the tag.

    Deleting an attribute that is not present is silently ignored.
    """
    self.attrs.pop(key, None)
|
|
|
|
def __call__(
    self,
    name: Optional[_StrainableElement] = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All arguments are forwarded unchanged to `Tag.find_all`.
    """
    return self.find_all(
        name, attrs, recursive, string, limit, _stacklevel, **kwargs
    )
|
|
|
|
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat ``tag.subtag`` as shorthand for ``tag.find("subtag")``.

    A name ending in ``Tag`` (the Beautiful Soup 3 spelling, e.g.
    ``soup.aTag``) is looked up without that suffix and triggers a
    `DeprecationWarning`.

    :raises AttributeError: For names starting with a double
        underscore, and for ``contents``.
    """
    result: _AtMostOneElement
    is_bs3_alias = len(subtag) > 3 and subtag.endswith("Tag")
    if is_bs3_alias:
        # BS3: soup.aTag -> soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        result = self.find(tag_name)
    elif subtag.startswith("__") or subtag == "contents":
        # Dunder names are never tag lookups, and contents is
        # special-cased to avoid recursion.
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    else:
        result = self.find(subtag)
    return cast(Optional[Tag], result)
|
|
|
|
def __eq__(self, other: Any) -> bool:
    """Return True iff ``other`` is a Tag with the same name, the same
    attributes, and the same contents (compared recursively, child by
    child) as this Tag."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False
    has_tag_shape = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not has_tag_shape:
        return False
    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False
    # Both tags have the same number of children at this point, so
    # they can be compared pairwise.
    return not any(
        mine != theirs for mine, theirs in zip(self.contents, other.contents)
    )
|
|
|
|
def __ne__(self, other: Any) -> bool:
    """Returns true iff this Tag is not equal to `other`,
    as defined in __eq__ (name, attributes and contents)."""
    return not self == other
|
|
|
|
def __repr__(self) -> str:
    """Renders this `Tag` as a string, by calling `Tag.decode` with
    its default arguments."""
    return self.decode()

# str(tag) and the legacy __unicode__ hook produce the same rendering
# as repr(tag).
__str__ = __unicode__ = __repr__
|
|
|
|
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string with `Tag.decode`
    and that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    :return: The encoded rendering.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)
|
|
|
|
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The rendering is built iteratively from the events produced by
    `Tag._event_stream`, not by recursive calls.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. ``None`` disables pretty-printing.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is treated as "pretty-print starting at level 0".
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is printed one level shallower than the
            # tag's contents.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings lose their own surrounding whitespace when
                # pretty-printing; strings that end up empty are not
                # indented at all.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything inside a newly opened tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
|
|
|
|
class _TreeTraversalEvent(object):
    """An internal class representing an event in the process
    of traversing a parse tree.

    Instances carry no data. The instances below serve as distinct
    marker objects for the kinds of event yielded by _event_stream.

    :meta private:
    """

# Stand-ins for the different events yielded by _event_stream:
# a tag with contents was opened, a tag was closed, an empty-element
# tag was encountered, or a string was encountered.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
|
|
|
|
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to ``self.self_and_descendants``.
    :return: ``(event, element)`` pairs, where ``event`` is one of
        the ``*_ELEMENT_EVENT`` markers defined on `Tag`. An
        empty-element tag produces a single EMPTY event; any other
        tag produces a START event and, later, a matching END event.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This is an identity test on purpose: ``!=`` would invoke
        # Tag.__ne__, a recursive structural comparison that is both
        # needlessly expensive and not what is meant here.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # Close whatever is still open once the iterator is exhausted.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
|
|
|
def _indent_string(
|
|
self,
|
|
s: str,
|
|
indent_level: int,
|
|
formatter: Formatter,
|
|
indent_before: bool,
|
|
indent_after: bool,
|
|
) -> str:
|
|
"""Add indentation whitespace before and/or after a string.
|
|
|
|
:param s: The string to amend with whitespace.
|
|
:param indent_level: The indentation level; affects how much
|
|
whitespace goes before the string.
|
|
:param indent_before: Whether or not to add whitespace
|
|
before the string.
|
|
:param indent_after: Whether or not to add whitespace
|
|
(a newline) after the string.
|
|
"""
|
|
space_before = ""
|
|
if indent_before and indent_level:
|
|
space_before = formatter.indent * indent_level
|
|
|
|
space_after = ""
|
|
if indent_after:
|
|
space_after = "\n"
|
|
|
|
return space_before + s + space_after
|
|
|
|
def _format_tag(
|
|
self, eventual_encoding: str, formatter: Formatter, opening: bool
|
|
) -> str:
|
|
if self.hidden:
|
|
# A hidden tag is invisible, although its contents
|
|
# are visible.
|
|
return ""
|
|
|
|
# A tag starts with the < character (see below).
|
|
|
|
# Then the / character, if this is a closing tag.
|
|
closing_slash = ""
|
|
if not opening:
|
|
closing_slash = "/"
|
|
|
|
# Then an optional namespace prefix.
|
|
prefix = ""
|
|
if self.prefix:
|
|
prefix = self.prefix + ":"
|
|
|
|
# Then a list of attribute values, if this is an opening tag.
|
|
attribute_string = ""
|
|
if opening:
|
|
attributes = formatter.attributes(self)
|
|
attrs = []
|
|
for key, val in attributes:
|
|
if val is None:
|
|
decoded = key
|
|
else:
|
|
if isinstance(val, list) or isinstance(val, tuple):
|
|
val = " ".join(val)
|
|
elif not isinstance(val, str):
|
|
val = str(val)
|
|
elif (
|
|
isinstance(val, AttributeValueWithCharsetSubstitution)
|
|
and eventual_encoding is not None
|
|
):
|
|
val = val.substitute_encoding(eventual_encoding)
|
|
|
|
text = formatter.attribute_value(val)
|
|
decoded = str(key) + "=" + formatter.quoted_attribute_value(text)
|
|
attrs.append(decoded)
|
|
if attrs:
|
|
attribute_string = " " + " ".join(attrs)
|
|
|
|
# Then an optional closing slash (for a void element in an
|
|
# XML document).
|
|
void_element_closing_slash = ""
|
|
if self.is_empty_element:
|
|
void_element_closing_slash = formatter.void_element_close_prefix or ""
|
|
|
|
# Put it all together.
|
|
return (
|
|
"<"
|
|
+ closing_slash
|
|
+ prefix
|
|
+ self.name
|
|
+ attribute_string
|
|
+ void_element_closing_slash
|
|
+ ">"
|
|
)
|
|
|
|
def _should_pretty_print(self, indent_level: int = 1) -> bool:
|
|
"""Should this tag be pretty-printed?
|
|
|
|
Most of them should, but some (such as <pre> in HTML
|
|
documents) should not.
|
|
"""
|
|
return indent_level is not None and (
|
|
not self.preserve_whitespace_tags
|
|
or self.name not in self.preserve_whitespace_tags
|
|
)
|
|
|
|
def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering starts at indentation level 0.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
|
|
|
|
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render the contents of this tag, without the tag itself, as a
    Unicode string.

    This is `Tag.decode` run over ``self.descendants`` instead of
    over the tag and its descendants.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The formatter decides what a
       'level' means in terms of spaces or other characters
       output.) Used internally in recursive calls while
       pretty-printing.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. decode_contents() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )
|
|
|
|
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render the contents of this PageElement as a bytestring.

    The contents are rendered with `Tag.decode_contents` and the
    resulting string is then encoded.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
|
|
|
|
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to `Tag.encode_contents`; ``indentLevel`` is only
    honoured when ``prettyPrint`` is true.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)
|
|
|
|
# Soup methods
|
|
|
|
def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    This is `Tag.find_all` with a limit of one result.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    :return: The first match, or ``None`` if nothing matched.
    """
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None
|
|
|
|
# Deprecated alias for find().
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
|
|
|
|
def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look in the children of this `PageElement` and find all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.descendants if recursive else self.children
    # One more stack frame now sits between the caller and any warning.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
|
|
|
|
# Deprecated aliases for find_all().
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
|
|
|
|
# Generator methods
|
|
@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of `self.contents`, in list order.
    """
    direct = self.contents
    return (child for child in direct)
|
|
|
|
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` together with everything yielded by
    `descendants`.

    NOTE(review): this used to be described as a breadth-first
    sequence; the order is whatever `descendants` produces, which is
    presumably document (depth-first) order -- confirm.
    """
    below = self.descendants
    return self._self_and(below)
|
|
|
|
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag` by following the
    `next_element` chain, starting at the first child.

    Nothing is yielded when `contents` is empty.

    NOTE(review): this used to be described as a breadth-first
    sequence; walking `next_element` presumably yields document
    (depth-first) order instead -- confirm.
    """
    if len(self.contents) == 0:
        return
    # With accept_self=True, _last_descendant() can't come back as
    # None; in the worst case it is this element itself.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element past the subtree marks where the walk must stop.
    sentinel = final.next_element
    node: _AtMostOneElement = self.contents[0]
    while not (node is None or node is sentinel):
        # Grab the follower before yielding, so the walk continues from
        # it even if the consumer alters the yielded node's links.
        upcoming = node.next_element
        yield node
        node = upcoming
|
|
|
|
# CSS selector code
|
|
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element via
    `self.css.select_one()` and return its result.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
|
|
|
|
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element via
    `self.css.select()` and return its result.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
|
|
|
|
@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is created on every
    access.
    """
    selector_api = CSS(self)
    return selector_api
|
|
|
|
# Old names for backwards compatibility
|
|
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; hands back whatever `children` returns.

    :meta private:
    """
    direct_children = self.children
    return direct_children
|
|
|
|
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; hands back whatever `descendants` returns.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below
|
|
|
|
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; returns the result of `has_attr(key)`.

    This was kind of misleading because has_key() (which looks at
    attributes) was different from the `in` operator (which looks at
    contents).

    has_key() is gone in Python 3, anyway.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present
|
|
|
|
|
|
# Type variable for the element type held by a ResultSet; bounded by PageElement.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)
|
|
|
|
|
|
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered by matching an
    :py:class:`ElementFilter` against a parse tree -- in other words,
    a list of search results.
    """

    # The `ElementFilter` handed to the constructor (may be None).
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from `result` and remember `source`.

        :param source: Stored unchanged on the `source` attribute.
        :param result: The elements this list starts out with.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python only invokes this hook once ordinary attribute lookup
        has failed, so regular list attributes are unaffected.

        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
|
|
|
|
|
|
# Now that all the classes used by SoupStrainer have been defined,
|
|
# import SoupStrainer itself into this module to preserve the
|
|
# backwards compatibility of anyone who imports
|
|
# bs4.element.SoupStrainer.
|
|
from bs4.filter import SoupStrainer # noqa: E402
|