# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import logging
import pickle
import pytest
from typing import Iterable
from bs4 import (
    BeautifulSoup,
    GuessedAtParserWarning,
    dammit,
)
from bs4.builder import (
    TreeBuilder,
)
from bs4.element import (
    AttributeValueList,
    XMLAttributeDict,
    Comment,
    PYTHON_SPECIFIC_ENCODINGS,
    Tag,
    NavigableString,
)
from bs4.filter import SoupStrainer
from bs4.exceptions import (
    ParserRejectedMarkup,
)
from bs4._warnings import (
    MarkupResemblesLocatorWarning,
)

from . import (
    default_builder,
    LXML_PRESENT,
    SoupTest,
)
import warnings
from typing import Type


class TestConstructor(SoupTest):
    def test_short_unicode_input(self):
        # Non-ASCII text survives a round trip through the parser.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_embedded_null(self):
        # An embedded NUL byte doesn't truncate or corrupt the string.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        assert "foo\0bar" == soup.h1.string

    def test_exclude_encodings(self):
        # Ruling out utf-8 forces the detector to fall back to
        # windows-1252 for this Swedish text.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        assert "windows-1252" == soup.original_encoding

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
                self.attribute_dict_class = XMLAttributeDict
                self.attribute_value_list_class = AttributeValueList

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield (
                    "prepared markup",
                    "original encoding",
                    "declared encoding",
                    "contains replacement characters",
                )

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup("", builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        assert dict(var="value") == soup.builder.called_with
        assert "prepared markup" == soup.builder.fed

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "",
                builder=builder,
                ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith(
            "Keyword arguments to the BeautifulSoup constructor will be ignored."
        )
        assert builder == soup.builder
        assert kwargs == builder.called_with

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False

        with pytest.raises(ParserRejectedMarkup) as exc_info:
            BeautifulSoup("", builder=Mock)
        assert (
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help."
            in str(exc_info.value)
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        assert " an id " == a["id"]
        assert ["a", "class"] == a["class"]

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        assert " a class " == soup.a["class"]

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({"*": "id"}, {"a": "id"}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(
                    markup, builder=None, multi_valued_attributes=switcheroo
                )
            a = soup.a
            assert ["an", "id"] == a["id"]
            assert " a class " == a["class"]

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            },
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.descendants
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i> text</b></p></div>",
            string_containers={
                "b": BString,
                "p": PString,
            },
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <b> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack

    @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
    def test_invalid_markup_type(self, bad_markup):
        with pytest.raises(TypeError) as exc_info:
            BeautifulSoup(bad_markup, "html.parser")
        assert (
            f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
            in str(exc_info.value)
        )


class TestOutput(SoupTest):
    @pytest.mark.parametrize(
        "eventual_encoding,actual_encoding",
        [
            ("utf-8", "utf-8"),
            ("utf-16", "utf-16"),
        ],
    )
    def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
        # Most of the time, calling decode() on an XML document will
        # give you a document declaration that mentions the encoding
        # you intend to use when encoding the document as a
        # bytestring.
        soup = self.soup("<tag></tag>")
        soup.is_xml = True
        assert (
            f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
            == soup.decode(eventual_encoding=eventual_encoding)
        )

    @pytest.mark.parametrize(
        "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
    )
    def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
        self, eventual_encoding
    ):
        # But if you pass a Python internal encoding into decode(), or
        # omit the eventual_encoding altogether, the document
        # declaration won't mention any particular encoding.
        soup = BeautifulSoup("<tag></tag>", "html.parser")
        soup.is_xml = True
        assert '<?xml version="1.0"?>\n<tag></tag>' == soup.decode(
            eventual_encoding=eventual_encoding
        )

    def test(self):
        # BeautifulSoup subclasses Tag and extends the decode() method.
        # Make sure the other Tag methods which call decode() call
        # it correctly.
        soup = self.soup("<tag></tag>")
        assert b"<tag></tag>" == soup.encode(encoding="utf-8")
        assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
        assert "<tag></tag>" == soup.decode_contents()
        assert "<tag>\n</tag>\n" == soup.prettify()


class TestWarnings(SoupTest):
    # Note that some of the tests in this class create BeautifulSoup
    # objects directly rather than using self.soup(). That's
    # because SoupTest.soup is defined in a different file,
    # which will throw off the assertion in _assert_warning
    # that the code that triggered the warning is in the same
    # file as the test.

    def _assert_warning(
        self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
    ) -> warnings.WarningMessage:
        for w in warnings:
            if isinstance(w.message, cls):
                assert w.filename == __file__
                return w
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>")
        assert [] == w

    def test_warning_if_strainer_filters_everything(self):
        strainer = SoupStrainer(name="a", string="b")
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>", parse_only=strainer)
        warning = self._assert_warning(w, UserWarning)
        msg = str(warning.message)
        assert msg.startswith("The given value for parse_only will exclude everything:")

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "<a><b></b></a>",
                "html.parser",
                parseOnlyThese=SoupStrainer("b"),
            )
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("<a>", no_such_argument=True)

    @pytest.mark.parametrize(
        "markup",
        [
            "markup.html",
            "markup.htm",
            "markup.HTML",
            "markup.txt",
            "markup.xhtml",
            "markup.xml",
            "/home/user/file.txt",
            r"c:\user\file.html"
            r"\\server\share\path\file.XhTml",
        ],
    )
    def test_resembles_filename_warning(self, markup):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup(markup, "html.parser")
        warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
        assert "looks more like a filename" in str(warning.message)

    @pytest.mark.parametrize(
        "markup",
        [
            "filename",
            "markuphtml",
            "markup.com",
            "",
            # Excluded due to an irrelevant file extension.
            "markup.js",
            "markup.jpg",
            "markup.markup",
            # Excluded due to the lack of any file extension.
            "/home/user/file",
            r"c:\user\file.html"
            r"\\server\share\path\file",
            # Excluded because of two consecutive slashes _and_ the
            # colon.
            "log message containing a url http://www.url.com/ right there.html",
            # Excluded for containing various characters or combinations
            # not usually found in filenames.
            "two  consecutive spaces.html",
            "two//consecutive//slashes.html",
            "looks/like/a/filename/but/oops/theres/a#comment.html",
            "two\nlines.html",
            "contains?.html",
            "contains*.html",
            "contains#.html",
            "contains&.html",
            "contains;.html",
            "contains>.html",
            "contains<.html",
            "contains$.html",
            "contains|.html",
            "contains:.html",
            ":-at-the-front.html",
        ],
    )
    def test_resembles_filename_no_warning(self, markup):
        # The 'looks more like a filename' warning is not issued if
        # the markup looks like a bare string, a domain name, or a
        # file that's not an HTML file.
        with warnings.catch_warnings(record=True) as w:
            self.soup(markup)
        assert [] == w

    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message).encode("utf8")

    def test_url_warning_with_unicode_url(self):
        url = "http://www.crummyunicode.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)


class TestSelectiveParsing(SoupTest):
    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and their contents) survive parsing when a
        # SoupStrainer for "b" is passed as parse_only.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b></a>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"


class TestNewTag(SoupTest):
    """Test the BeautifulSoup.new_tag() method."""

    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
        assert isinstance(new_tag, Tag)
        assert "foo" == new_tag.name
        assert new_tag.string == "txt"
        assert dict(bar="baz", name="a name") == new_tag.attrs
        assert None is new_tag.parent

        # string can be null
        new_tag = soup.new_tag("foo")
        assert None is new_tag.string
        new_tag = soup.new_tag("foo", string=None)
        assert None is new_tag.string

        # Or the empty string
        new_tag = soup.new_tag("foo", string="")
        assert "" == new_tag.string

    @pytest.mark.skipif(
        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
    )
    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")

        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        assert b"<br/>" == xml_br.encode()
        assert b"<p/>" == xml_p.encode()

    def test_tag_inherits_self_closing_rules_from_builder(self):
        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder users HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        assert b"<br/>" == html_br.encode()
        assert b"<p></p>" == html_p.encode()


class TestNewString(SoupTest):
    """Test the BeautifulSoup.new_string() method."""

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        assert "foo" == s
        assert isinstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        assert "foo" == s
        assert isinstance(s, Comment)


class TestPickle(SoupTest):
    # Test our ability to pickle the BeautifulSoup object itself.

    def test_normal_pickle(self):
        soup = self.soup("<a>some markup</a>")
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.a.string

    def test_pickle_with_no_builder(self):
        # We had a bug that prevented pickling from working if
        # the builder wasn't set.
        soup = self.soup("some markup")
        soup.builder = None
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.string


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setup_method(self):
        self.unicode_data = '<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert (
            self.utf8_data
            == b'<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        )

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = dammit._chardet_dammit
        logging.disable(logging.WARNING)
        try:

            def noop(str):
                return None

            # Disable chardet, which will realize that the ASCII is ASCII.
            dammit._chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            assert isinstance(unicode_output, str)
            assert unicode_output == self.document_for(ascii.decode())
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            logging.disable(logging.NOTSET)
            dammit._chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!"
        assert soup_from_unicode.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        assert soup_from_utf8.decode() == self.unicode_data
        assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!"

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode("utf-8") == self.utf8_data