# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import logging
import pickle
import pytest
from typing import Iterable
from bs4 import (
    BeautifulSoup,
    GuessedAtParserWarning,
    dammit,
)
from bs4.builder import (
    TreeBuilder,
)
from bs4.element import (
    AttributeValueList,
    XMLAttributeDict,
    Comment,
    PYTHON_SPECIFIC_ENCODINGS,
    Tag,
    NavigableString,
)
from bs4.filter import SoupStrainer
from bs4.exceptions import (
    ParserRejectedMarkup,
)
from bs4._warnings import (
    MarkupResemblesLocatorWarning,
)

from . import (
    default_builder,
    LXML_PRESENT,
    SoupTest,
)
import warnings
from typing import Type


class TestConstructor(SoupTest):
    def test_short_unicode_input(self):
        # Non-ASCII text survives a round trip through the parser.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        assert "éé" == soup.h1.string

    def test_embedded_null(self):
        # An embedded NUL byte doesn't truncate or corrupt the string.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        assert "foo\0bar" == soup.h1.string

    def test_exclude_encodings(self):
        # Ruling out utf-8 forces the detector to fall back to
        # windows-1252 for this Swedish text.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        assert "windows-1252" == soup.original_encoding

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}
                self.attribute_dict_class = XMLAttributeDict
                self.attribute_value_list_class = AttributeValueList

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield (
                    "prepared markup",
                    "original encoding",
                    "declared encoding",
                    "contains replacement characters",
                )

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup("", builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        assert dict(var="value") == soup.builder.called_with
        assert "prepared markup" == soup.builder.fed

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "",
                builder=builder,
                ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith(
            "Keyword arguments to the BeautifulSoup constructor will be ignored."
        )
        assert builder == soup.builder
        assert kwargs == builder.called_with

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False

        with pytest.raises(ParserRejectedMarkup) as exc_info:
            BeautifulSoup("", builder=Mock)
        assert (
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help."
            in str(exc_info.value)
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        assert " an id " == a["id"]
        assert ["a", "class"] == a["class"]

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        assert " a class " == soup.a["class"]

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({"*": "id"}, {"a": "id"}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(
                    markup, builder=None, multi_valued_attributes=switcheroo
                )
            a = soup.a
            assert ["an", "id"] == a["id"]
            assert " a class " == a["class"]

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            },
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus)) for x in soup.descendants
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i> text</b></p></div>",
            string_containers={
                "b": BString,
                "p": PString,
            },
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <b> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        assert [] == soup.string_container_stack

    @pytest.mark.parametrize("bad_markup", [1, False, lambda x: False])
    def test_invalid_markup_type(self, bad_markup):
        with pytest.raises(TypeError) as exc_info:
            BeautifulSoup(bad_markup, "html.parser")
        assert (
            f"Incoming markup is of an invalid type: {bad_markup!r}. Markup must be a string, a bytestring, or an open filehandle."
            in str(exc_info.value)
        )


class TestOutput(SoupTest):
    @pytest.mark.parametrize(
        "eventual_encoding,actual_encoding",
        [
            ("utf-8", "utf-8"),
            ("utf-16", "utf-16"),
        ],
    )
    def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
        # Most of the time, calling decode() on an XML document will
        # give you a document declaration that mentions the encoding
        # you intend to use when encoding the document as a
        # bytestring.
        soup = self.soup("<tag></tag>")
        soup.is_xml = True
        assert (
            f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
            == soup.decode(eventual_encoding=eventual_encoding)
        )

    @pytest.mark.parametrize(
        "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
    )
    def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(
        self, eventual_encoding
    ):
        # But if you pass a Python internal encoding into decode(), or
        # omit the eventual_encoding altogether, the document
        # declaration won't mention any particular encoding.
        soup = BeautifulSoup("<tag></tag>", "html.parser")
        soup.is_xml = True
        assert '<?xml version="1.0"?>\n<tag></tag>' == soup.decode(
            eventual_encoding=eventual_encoding
        )

    def test(self):
        # BeautifulSoup subclasses Tag and extends the decode() method.
        # Make sure the other Tag methods which call decode() call
        # it correctly.
        soup = self.soup("<tag></tag>")
        assert b"<tag></tag>" == soup.encode(encoding="utf-8")
        assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
        assert "<tag></tag>" == soup.decode_contents()
        assert "<tag>\n</tag>\n" == soup.prettify()


class TestWarnings(SoupTest):
    # Note that some of the tests in this class create BeautifulSoup
    # objects directly rather than using self.soup(). That's
    # because SoupTest.soup is defined in a different file,
    # which will throw off the assertion in _assert_warning
    # that the code that triggered the warning is in the same
    # file as the test.

    def _assert_warning(
        self, warnings: Iterable[warnings.WarningMessage], cls: Type[Warning]
    ) -> warnings.WarningMessage:
        for w in warnings:
            if isinstance(w.message, cls):
                assert w.filename == __file__
                return w
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w: Iterable[warnings.WarningMessage]) -> None:
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        assert message.startswith(GuessedAtParserWarning.MESSAGE[:60])

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>")
        assert [] == w

    def test_warning_if_strainer_filters_everything(self):
        strainer = SoupStrainer(name="a", string="b")
        with warnings.catch_warnings(record=True) as w:
            self.soup("<a><b></b></a>", parse_only=strainer)
        warning = self._assert_warning(w, UserWarning)
        msg = str(warning.message)
        assert msg.startswith("The given value for parse_only will exclude everything:")

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                "<a><b></b></a>",
                "html.parser",
                parseOnlyThese=SoupStrainer("b"),
            )
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = BeautifulSoup(utf8, "html.parser", fromEncoding="utf8")
        warning = self._assert_warning(w, DeprecationWarning)
        msg = str(warning.message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding

    def test_unrecognized_keyword_argument(self):
        with pytest.raises(TypeError):
            self.soup("<a>", no_such_argument=True)

    @pytest.mark.parametrize(
        "markup",
        [
            "markup.html",
            "markup.htm",
            "markup.HTML",
            "markup.txt",
            "markup.xhtml",
            "markup.xml",
            "/home/user/file.txt",
            r"c:\user\file.html"
            r"\\server\share\path\file.XhTml",
        ],
    )
    def test_resembles_filename_warning(self, markup):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup(markup, "html.parser")
        warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
        assert "looks more like a filename" in str(warning.message)

    @pytest.mark.parametrize(
        "markup",
        [
            "filename",
            "markuphtml",
            "markup.com",
            "",
            # Excluded due to an irrelevant file extension.
            "markup.js",
            "markup.jpg",
            "markup.markup",
            # Excluded due to the lack of any file extension.
            "/home/user/file",
            r"c:\user\file.html"
            r"\\server\share\path\file",
            # Excluded because of two consecutive slashes _and_ the
            # colon.
            "log message containing a url http://www.url.com/ right there.html",
            # Excluded for containing various characters or combinations
            # not usually found in filenames.
            "two  consecutive spaces.html",
            "two//consecutive//slashes.html",
            "looks/like/a/filename/but/oops/theres/a#comment.html",
            "two\nlines.html",
            "contains?.html",
            "contains*.html",
            "contains#.html",
            "contains&.html",
            "contains;.html",
            "contains>.html",
            "contains<.html",
            "contains$.html",
            "contains|.html",
            "contains:.html",
            ":-at-the-front.html",
        ],
    )
    def test_resembles_filename_no_warning(self, markup):
        # The 'looks more like a filename' warning is not issued if
        # the markup looks like a bare string, a domain name, or a
        # file that's not an HTML file.
        with warnings.catch_warnings(record=True) as w:
            self.soup(markup)
        assert [] == w

    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message).encode("utf8")

    def test_url_warning_with_unicode_url(self):
        url = "http://www.crummyunicode.com/"
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(warning_list, MarkupResemblesLocatorWarning)
        assert "looks more like a URL" in str(warning.message)
        assert url not in str(warning.message)

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup("http://www.crummyunicode.com/ is great")
        assert not any("looks more like a URL" in str(w.message) for w in warning_list)


class TestSelectiveParsing(SoupTest):
    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and their contents) survive parsing when a
        # SoupStrainer for "b" is passed as parse_only.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b></a>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"


class TestNewTag(SoupTest):
    """Test the BeautifulSoup.new_tag() method."""

    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", string="txt", bar="baz", attrs={"name": "a name"})
        assert isinstance(new_tag, Tag)
        assert "foo" == new_tag.name
        assert new_tag.string == "txt"
        assert dict(bar="baz", name="a name") == new_tag.attrs
        assert None is new_tag.parent

        # string can be null
        new_tag = soup.new_tag("foo")
        assert None is new_tag.string
        new_tag = soup.new_tag("foo", string=None)
        assert None is new_tag.string

        # Or the empty string
        new_tag = soup.new_tag("foo", string="")
        assert "" == new_tag.string

    @pytest.mark.skipif(
        not LXML_PRESENT, reason="lxml not installed, cannot parse XML document"
    )
    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")

        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        assert b"<br/>" == xml_br.encode()
        assert b"<p/>" == xml_p.encode()

    def test_tag_inherits_self_closing_rules_from_builder(self):
        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder users HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        assert b"<br/>" == html_br.encode()
        assert b"<p></p>" == html_p.encode()


class TestNewString(SoupTest):
    """Test the BeautifulSoup.new_string() method."""

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        assert "foo" == s
        assert isinstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        assert "foo" == s
        assert isinstance(s, Comment)


class TestPickle(SoupTest):
    # Test our ability to pickle the BeautifulSoup object itself.

    def test_normal_pickle(self):
        soup = self.soup("<a>some markup</a>")
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.a.string

    def test_pickle_with_no_builder(self):
        # We had a bug that prevented pickling from working if
        # the builder wasn't set.
        soup = self.soup("some markup")
        soup.builder = None
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)
        assert "some markup" == unpickled.string


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setup_method(self):
        self.unicode_data = '<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        assert (
            self.utf8_data
            == b'<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        )

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = dammit._chardet_dammit
        logging.disable(logging.WARNING)
        try:

            def noop(str):
                return None

            # Disable chardet, which will realize that the ASCII is ASCII.
            dammit._chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            assert isinstance(unicode_output, str)
            assert unicode_output == self.document_for(ascii.decode())
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            logging.disable(logging.NOTSET)
            dammit._chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        assert soup_from_unicode.foo.string == "Sacr\xe9 bleu!"
        assert soup_from_unicode.original_encoding is None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        assert soup_from_utf8.decode() == self.unicode_data
        assert soup_from_utf8.foo.string == "Sacr\xe9 bleu!"

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode("utf-8") == self.utf8_data