# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import logging
import pickle
import pytest
from typing import Iterable
from bs4 import (
BeautifulSoup,
GuessedAtParserWarning,
dammit,
)
from bs4.builder import (
TreeBuilder,
)
from bs4.element import (
AttributeValueList,
XMLAttributeDict,
Comment,
PYTHON_SPECIFIC_ENCODINGS,
Tag,
NavigableString,
)
from bs4.filter import SoupStrainer
from bs4.exceptions import (
ParserRejectedMarkup,
)
from bs4._warnings import (
MarkupResemblesLocatorWarning,
)
from . import (
default_builder,
LXML_PRESENT,
SoupTest,
)
import warnings
from typing import Type
class TestConstructor(SoupTest):
    """Tests of the arguments accepted by the BeautifulSoup constructor."""

    # NOTE(review): this class was damaged in the copy under review. The
    # body of test_short_unicode_input ended in an unterminated string
    # literal and ran straight into the tail of a different test (the
    # string_containers one below). Both tests were rebuilt from the
    # fragments that survived; any tests that originally sat between
    # them are NOT restored here. Confirm against the upstream suite.

    def test_short_unicode_input(self):
        # A very short document containing non-ASCII characters.
        # NOTE(review): only `data = "` survived of the original body;
        # the markup and the assertion are a reconstruction -- confirm.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_custom_string_containers(self):
        # The string_containers argument maps a tag name to the class
        # used for strings found inside that tag.
        # NOTE(review): the method name and the two helper classes are
        # reconstructed; the assertions below only require that PString
        # and BString exist and are distinguishable from each other.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        # NOTE(review): the tags in this markup were stripped from the
        # copy under review. They are restored from what the assertions
        # need: a <div>, a <p>, a <b>, and a tag nested inside the <b>.
        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                "b": BString,
                "p": PString,
            },
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <b> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack

    @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
    def test_invalid_markup_type(self, bad_markup):
        # Markup that is not a string, a bytestring or a filehandle is
        # rejected with a TypeError that echoes the offending value.
        expected = f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
        with pytest.raises(TypeError) as exc_info:
            BeautifulSoup(bad_markup, "html.parser")
        assert expected in str(exc_info.value)
class TestOutput(SoupTest):
    """Tests of how a whole BeautifulSoup document is serialized."""

    @pytest.mark.parametrize(
        "eventual_encoding,actual_encoding",
        [
            ("utf-8", "utf-8"),
            ("utf-16", "utf-16"),
        ],
    )
    def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
        # Most of the time, calling decode() on an XML document will
        # give you a document declaration that mentions the encoding
        # you intend to use when encoding the document as a
        # bytestring.
        #
        # NOTE(review): the expected value had been reduced to a bare
        # '\n' (an f-string with no placeholder, leaving actual_encoding
        # unused). The declaration is restored here; the document itself
        # is still empty, so nothing follows the newline. Confirm the
        # exact declaration text against the library.
        soup = self.soup("")
        soup.is_xml = True
        assert (
            f'<?xml version="1.0" encoding="{actual_encoding}"?>\n'
            == soup.decode(eventual_encoding=eventual_encoding)
        )

    @pytest.mark.parametrize(
        "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
    )
    def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
        self, eventual_encoding
    ):
        # But if you pass a Python internal encoding into decode(), or
        # omit the eventual_encoding altogether, the document
        # declaration won't mention any particular encoding.
        #
        # NOTE(review): expected value restored from a bare '\n' --
        # confirm the exact declaration text against the library.
        soup = BeautifulSoup("", "html.parser")
        soup.is_xml = True
        assert '<?xml version="1.0"?>\n' == soup.decode(
            eventual_encoding=eventual_encoding
        )

    def test(self):
        # BeautifulSoup subclasses Tag and extends the decode() method.
        # Make sure the other Tag methods which call decode() call
        # it correctly.
        #
        # NOTE(review): the markup and expected values had lost their
        # tags (prettify() was expected to return "\n\n"). A single
        # empty element is restored; the tag name is a placeholder --
        # confirm against the upstream suite.
        soup = self.soup("<a></a>")
        assert b"<a></a>" == soup.encode(encoding="utf-8")
        assert b"<a></a>" == soup.encode_contents(encoding="utf-8")
        assert "<a></a>" == soup.decode_contents()
        assert "<a>\n</a>\n" == soup.prettify()
class TestWarnings(SoupTest):
    """Tests of the warnings issued while constructing a BeautifulSoup object."""

    # Several tests in this class build BeautifulSoup objects directly
    # instead of going through self.soup(). SoupTest.soup is defined in
    # a different file, and that would trip the check in
    # _assert_warning that the warning is attributed to the same file
    # as the test.

    def _assert_warning(
        self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
    ) -> warnings.WarningMessage:
        """Return the first recorded warning whose message is a `cls`.

        :param warnings: Recorded warnings to search, in order.
        :param cls: The warning class to look for.
        :return: The first matching record.
        :raises Exception: If no record carries a `cls` message.
        """
        found = next(
            (record for record in warnings if isinstance(record.message, cls)),
            None,
        )
        if found is None:
            raise Exception("%s warning not found in %r" % (cls, warnings))
        # The warning must be attributed to this file.
        assert found.filename == __file__
        return found

    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
        """Assert that `w` contains a GuessedAtParserWarning with the usual text.

        Only the first 60 characters of the message template are compared.
        """
        record = self._assert_warning(w, GuessedAtParserWarning)
        text = str(record.message)
        expected_prefix = GuessedAtParserWarning.MESSAGE[:60]
        assert text.startswith(expected_prefix)

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as caught:
            BeautifulSoup("")
        self._assert_no_parser_specified(caught)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as caught:
            BeautifulSoup("", "html")
        self._assert_no_parser_specified(caught)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as caught:
            self.soup("")
        assert caught == []

    def test_warning_if_strainer_filters_everything(self):
        impossible = SoupStrainer(name="a", string="b")
        with warnings.catch_warnings(record=True) as caught:
            self.soup("", parse_only=impossible)
        record = self._assert_warning(caught, UserWarning)
        text = str(record.message)
        assert text.startswith("The given value for parse_only will exclude everything:")

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The old keyword still works, but triggers a DeprecationWarning
        # that names both the old and the new spelling.
        with warnings.catch_warnings(record=True) as caught:
            soup = BeautifulSoup(
                "",
                "html.parser",
                parseOnlyThese=SoupStrainer("b"),
            )
        record = self._assert_warning(caught, DeprecationWarning)
        text = str(record.message)
        for fragment in ("parseOnlyThese", "parse_only"):
            assert fragment in text
        assert soup.encode() == b""

    def test_fromEncoding_renamed_to_from_encoding(self):
        # Same as above, for the fromEncoding -> from_encoding rename.
        utf8 = b"\xc3\xa9"
        with warnings.catch_warnings(record=True) as caught:
            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
        record = self._assert_warning(caught, DeprecationWarning)
        text = str(record.message)
        for fragment in ("fromEncoding", "from_encoding"):
            assert fragment in text
        assert soup.original_encoding == "utf8"

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("", no_such_argument=True)

    @pytest.mark.parametrize(
        "markup",
        [
            "markup.html",
            "markup.htm",
            "markup.HTML",
            "markup.txt",
            "markup.xhtml",
            "markup.xml",
            "/home/user/file.txt",
            # NOTE(review): the two adjacent literals below are joined
            # into ONE test case by implicit string concatenation.
            # Possibly a missing comma -- confirm before splitting.
            r"c:\user\file.html" r"\\server\share\path\file.XhTml",
        ],
    )
    def test_resembles_filename_warning(self, markup):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as caught:
            BeautifulSoup(markup, "html.parser")
        record = self._assert_warning(caught, MarkupResemblesLocatorWarning)
        assert "looks more like a filename" in str(record.message)

    @pytest.mark.parametrize(
        "markup",
        [
            "filename",
            "markuphtml",
            "markup.com",
            "",
            # Excluded due to an irrelevant file extension.
            "markup.js",
            "markup.jpg",
            "markup.markup",
            # Excluded due to the lack of any file extension.
            "/home/user/file",
            # NOTE(review): as in the test above, these two adjacent
            # literals form a single concatenated test case.
            r"c:\user\file.html" r"\\server\share\path\file",
            # Excluded because of two consecutive slashes _and_ the
            # colon.
            "log message containing a url http://www.url.com/ right there.html",
            # Excluded for containing various characters or combinations
            # not usually found in filenames.
            "two  consecutive  spaces.html",
            "two//consecutive//slashes.html",
            "looks/like/a/filename/but/oops/theres/a#comment.html",
            "two\nlines.html",
            "contains?.html",
            "contains*.html",
            "contains#.html",
            "contains&.html",
            "contains;.html",
            "contains>.html",
            "contains<.html",
            "contains$.html",
            "contains|.html",
            "contains:.html",
            ":-at-the-front.html",
        ],
    )
    def test_resembles_filename_no_warning(self, markup):
        # The 'looks more like a filename' warning is not issued if
        # the markup looks like a bare string, a domain name, or a
        # file that's not an HTML file.
        with warnings.catch_warnings(record=True) as caught:
            self.soup(markup)
        assert caught == []

    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as caught:
            BeautifulSoup(url, "html.parser")
        record = self._assert_warning(caught, MarkupResemblesLocatorWarning)
        text = str(record.message)
        assert "looks more like a URL" in text
        # The warning text does not repeat the URL itself.
        assert url not in text.encode("utf8")

    def test_url_warning_with_unicode_url(self):
        url = "http://www.crummyunicode.com/"
        with warnings.catch_warnings(record=True) as caught:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            BeautifulSoup(url, "html.parser")
        record = self._assert_warning(caught, MarkupResemblesLocatorWarning)
        text = str(record.message)
        assert "looks more like a URL" in text
        assert url not in text

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as caught:
            self.soup(b"http://www.crummybytes.com/ is great")
        assert not any(
            "looks more like a URL" in str(record.message) for record in caught
        )

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as caught:
            self.soup("http://www.crummyunicode.com/ is great")
        assert not any(
            "looks more like a URL" in str(record.message) for record in caught
        )
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via parse_only."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags, and whatever they contain, survive a
        # SoupStrainer("b"); the text outside them is dropped.
        #
        # NOTE(review): the markup and the expected bytes had lost their
        # tags, so a strainer for "b" had nothing to match. <b> tags are
        # restored around the text the expected value keeps. The original
        # markup may have contained further tags -- confirm against the
        # upstream suite.
        markup = "No<b>Yes</b>No<b>Yes Yes</b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        assert soup.encode() == b"<b>Yes</b><b>Yes Yes</b>"
class TestNewTag(SoupTest):
    """Test the BeautifulSoup.new_tag() method."""

    def test_new_tag(self):
        soup = self.soup("")

        # Keyword arguments and the attrs dict are merged into the
        # new tag's attributes.
        new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
        assert isinstance(new_tag, Tag)
        assert "foo" == new_tag.name
        assert new_tag.string == "txt"
        assert dict(bar="baz", name="a name") == new_tag.attrs
        assert None is new_tag.parent

        # string can be null
        new_tag = soup.new_tag("foo")
        assert None is new_tag.string
        new_tag = soup.new_tag("foo", string=None)
        assert None is new_tag.string

        # Or the empty string
        new_tag = soup.new_tag("foo", string="")
        assert "" == new_tag.string

    @pytest.mark.skipif(
        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
    )
    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")

        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        #
        # NOTE(review): the expected byte literals were broken across
        # lines with their markup missing; they are restored from the
        # comment above -- confirm the exact serialization.
        assert b"<br/>" == xml_br.encode()
        assert b"<p/>" == xml_p.encode()

    def test_tag_inherits_self_closing_rules_from_builder(self):
        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        #
        # NOTE(review): expected values restored from the comment
        # above (<br> is an empty-element tag in HTML, <p> is not) --
        # confirm the exact serialization.
        assert b"<br/>" == html_br.encode()
        assert b"<p></p>" == html_p.encode()
class TestNewString(SoupTest):
    """Test the BeautifulSoup.new_string() method."""

    def test_new_string_creates_navigablestring(self):
        # With no class given, new_string() hands back a NavigableString
        # that compares equal to the text passed in.
        created = self.soup("").new_string("foo")
        assert isinstance(created, NavigableString)
        assert created == "foo"

    def test_new_string_can_create_navigablestring_subclass(self):
        # A second argument selects the class of the new string.
        created = self.soup("").new_string("foo", Comment)
        assert isinstance(created, Comment)
        assert created == "foo"
class TestPickle(SoupTest):
    # Test our ability to pickle the BeautifulSoup object itself.

    def test_normal_pickle(self):
        # NOTE(review): the markup had lost its tags, leaving no <a>
        # element for the assertion below to find. The <a> wrapper is
        # restored from that assertion.
        soup = self.soup("<a>some markup</a>")
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.a.string

    def test_pickle_with_no_builder(self):
        # We had a bug that prevented pickling from working if
        # the builder wasn't set.
        soup = self.soup("some markup")
        soup.builder = None
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.string
class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setup_method(self):
        # NOTE(review): the fixture markup had lost its tags, although
        # the tests below read soup.foo.string. A <foo> wrapper is
        # restored from those assertions; the original document may have
        # contained more markup -- confirm against the upstream suite.
        self.unicode_data = '<foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert (
            self.utf8_data
            == b'<foo>Sacr\xc3\xa9 bleu!</foo>'
        )

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = dammit._chardet_dammit
        logging.disable(logging.WARNING)
        try:

            def noop(markup):
                # Stand-in for the detector: never guesses an encoding.
                return None

            # Disable chardet, which will realize that the ASCII is ASCII.
            dammit._chardet_dammit = noop
            ascii_markup = b"a"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            assert isinstance(unicode_output, str)
            assert unicode_output == self.document_for(ascii_markup.decode())
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            # Always restore logging and the real detector, even if an
            # assertion above failed.
            logging.disable(logging.NOTSET)
            dammit._chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!"
        assert soup_from_unicode.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode.
        soup_from_utf8 = self.soup(self.utf8_data)
        assert soup_from_utf8.decode() == self.unicode_data
        assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!"

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode("utf-8") == self.utf8_data