
Compare commits


5 Commits

Author         SHA1        Message                                      Date
Patrick Cloke  6bed82d324  temp                                         2022-05-24 13:23:49 -04:00
Patrick Cloke  ecc942ff26  Rename tree to soup.                         2022-05-24 13:21:59 -04:00
Patrick Cloke  11a9925252  Re-use decode_body.                          2022-05-24 13:19:17 -04:00
Patrick Cloke  6235ed2656  Remove dead code.                            2022-05-24 13:18:58 -04:00
Patrick Cloke  2b46d28c81  Use BeautifulSoup instead of LXML directly.  2022-05-24 13:17:14 -04:00
6 changed files with 125 additions and 367 deletions

View File: mypy.ini

@@ -139,6 +139,9 @@ disallow_untyped_defs = True
[mypy-authlib.*]
ignore_missing_imports = True
[mypy-bcrypt]
ignore_missing_imports = True
[mypy-canonicaljson]
ignore_missing_imports = True

View File: pyproject.toml

@@ -173,6 +173,7 @@ authlib = { version = ">=0.14.0", optional = true }
# Note: systemd-python 231 appears to have been yanked from pypi
systemd-python = { version = ">=231", optional = true }
lxml = { version = ">=4.2.0", optional = true }
beautifulsoup4 = { version = ">=4.10.0", optional = true }
sentry-sdk = { version = ">=0.7.2", optional = true }
opentracing = { version = ">=2.2.0", optional = true }
jaeger-client = { version = ">=4.0.0", optional = true }
@@ -194,7 +195,7 @@ oidc = ["authlib"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python"]
url_preview = ["lxml"]
url_preview = ["lxml", "beautifulsoup4"]
sentry = ["sentry-sdk"]
opentracing = ["jaeger-client", "opentracing"]
jwt = ["pyjwt"]
@@ -263,6 +264,7 @@ types-pyOpenSSL = ">=20.0.7"
types-PyYAML = ">=5.4.10"
types-requests = ">=2.26.0"
types-setuptools = ">=57.4.0"
types-beautifulsoup4 = ">=4.10.5"
# Dependencies which are exclusively required by unit test code. This is
# NOT a list of all modules that are necessary to run the unit tests.
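The url_preview extra now pulls in both lxml and beautifulsoup4: the new preview code asks BeautifulSoup for the "lxml" tree builder, so lxml stays as the underlying parser rather than being used directly. A minimal sketch of that relationship (illustrative only, not part of this diff):

from bs4 import BeautifulSoup

# BeautifulSoup delegates the actual parsing to lxml when the "lxml"
# features string is requested, which is why the extra keeps both packages.
soup = BeautifulSoup(b"<html><head><title>Hi</title></head></html>", "lxml")
print(soup.title.string)  # "Hi"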

View File: synapse/rest/media/v1/oembed.py

@@ -17,12 +17,12 @@ from typing import TYPE_CHECKING, List, Optional
import attr
from synapse.rest.media.v1.preview_html import parse_html_description
from synapse.rest.media.v1.preview_html import decode_body, parse_html_description
from synapse.types import JsonDict
from synapse.util import json_decoder
if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup
from synapse.server import HomeServer
@@ -97,29 +97,25 @@ class OEmbedProvider:
# No match.
return None
def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Args:
tree: The parsed HTML body.
soup: The parsed HTML body.
Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
for tag in tree.xpath(
"//link[@rel='alternate'][@type='application/json+oembed']"
):
if "href" in tag.attrib:
return tag.attrib["href"]
# Some providers (e.g. Flickr) use alternative instead of alternate.
for tag in tree.xpath(
"//link[@rel='alternative'][@type='application/json+oembed']"
for tag in soup.find_all(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
href=True,
):
if "href" in tag.attrib:
return tag.attrib["href"]
return tag["href"]
return None
@@ -174,7 +170,7 @@ class OEmbedProvider:
# Process each type separately.
oembed_type = oembed["type"]
if oembed_type == "rich":
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
@@ -182,7 +178,7 @@ class OEmbedProvider:
elif oembed_type == "video":
open_graph_response["og:type"] = "video.other"
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
open_graph_response["og:video:width"] = oembed["width"]
open_graph_response["og:video:height"] = oembed["height"]
@@ -202,54 +198,40 @@ class OEmbedProvider:
return OEmbedResult(open_graph_response, author_name, cache_age)
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
results = []
for tag in tree.xpath("//*/" + tag_name):
if "src" in tag.attrib:
results.append(tag.attrib["src"])
return results
def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]:
return [tag["src"] for tag in soup.find_all(tag_name, src=True)]
def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
def calc_description_and_urls(
open_graph_response: JsonDict, html_body: str, url: str
) -> None:
"""
Calculate description for an HTML document.
This uses lxml to convert the HTML document into plaintext. If errors
This uses BeautifulSoup to convert the HTML document into plaintext. If errors
occur during processing of the document, an empty response is returned.
Args:
open_graph_response: The current Open Graph summary. This is updated with additional fields.
html_body: The HTML document, as bytes.
Returns:
The summary
url: The URL which is being previewed (not the one which was requested).
"""
soup = decode_body(html_body, url)
# If there's no body, nothing useful is going to be found.
if not html_body:
return
from lxml import etree
# Create an HTML parser. If this fails, log and return no metadata.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)
# The data was successfully parsed, but no tree was found.
if tree is None:
if not soup:
return
# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
image_urls = _fetch_urls(soup, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]
video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]
description = parse_html_description(tree)
description = parse_html_description(soup)
if description:
open_graph_response["og:description"] = description
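The rewritten autodiscover_from_html collapses the two separate XPath queries (rel="alternate" and rel="alternative") into one find_all() call. A small sketch of how that filter behaves against sample markup (the HTML and URLs below are made up for illustration):

from bs4 import BeautifulSoup

html = b'''
<html><head>
<link rel="alternate" type="application/json+oembed" href="https://example.com/oembed?format=json">
<link rel="alternative" type="application/json+oembed" href="https://example.com/oembed-alt">
</head></html>
'''
soup = BeautifulSoup(html, "lxml")

# Same filter as above: either rel spelling, the oEmbed type, and an href present.
for tag in soup.find_all(
    "link",
    rel=("alternate", "alternative"),
    type="application/json+oembed",
    href=True,
):
    print(tag["href"])
# autodiscover_from_html() returns the first such href, or None if there is none.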

View File: synapse/rest/media/v1/preview_html.py

@@ -11,101 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Union
if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup
from bs4.element import PageElement, Tag
logger = logging.getLogger(__name__)
_charset_match = re.compile(
rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
)
_xml_encoding_match = re.compile(
rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
try:
return codecs.lookup(encoding).name
except LookupError:
return None
def _get_html_media_encodings(
body: bytes, content_type: Optional[str]
) -> Iterable[str]:
def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
"""
Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
The precedence used for finding a character encoding is:
1. <meta> tag with a charset declared.
2. The XML document's character encoding attribute.
3. The Content-Type header.
4. Fallback to utf-8.
5. Fallback to windows-1252.
This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
Args:
body: The HTML document, as bytes.
content_type: The Content-Type header.
Returns:
The character encoding of the body, as a string.
"""
# There's no point in returning an encoding more than once.
attempted_encodings: Set[str] = set()
# Limit searches to the first 1kb, since it ought to be at the top.
body_start = body[:1024]
# Check if it has an encoding set in a meta tag.
match = _charset_match.search(body_start)
if match:
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding:
attempted_encodings.add(encoding)
yield encoding
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
# Check if it has an XML document with an encoding.
match = _xml_encoding_match.match(body_start)
if match:
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding
# Check the HTTP Content-Type header for a character set.
if content_type:
content_match = _content_type_match.match(content_type)
if content_match:
encoding = _normalise_encoding(content_match.group(1))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding
# Finally, fallback to UTF-8, then windows-1252.
for fallback in ("utf-8", "cp1252"):
if fallback not in attempted_encodings:
yield fallback
def decode_body(
body: bytes, uri: str, content_type: Optional[str] = None
) -> Optional["etree.Element"]:
"""
This uses lxml to parse the HTML document.
This uses BeautifulSoup to parse the HTML document.
Args:
body: The HTML document, as bytes.
@@ -119,39 +41,28 @@ def decode_body(
if not body:
return None
# The idea here is that multiple encodings are tried until one works.
# Unfortunately the result is never used and then LXML will decode the string
# again with the found encoding.
for encoding in _get_html_media_encodings(body, content_type):
try:
body.decode(encoding)
except Exception:
pass
else:
break
else:
from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup
try:
soup = BeautifulSoup(body, "lxml")
# If an empty document is returned, convert to None.
if not len(soup):
return None
return soup
except ParserRejectedMarkup:
logger.warning("Unable to decode HTML body for %s", uri)
return None
from lxml import etree
# Create an HTML parser.
parser = etree.HTMLParser(recover=True, encoding=encoding)
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body, parser)
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Calculate metadata for an HTML document.
This uses lxml to search the HTML document for Open Graph data (or
synthesizes it from the document).
This uses BeautifulSoup to search the HTML document for Open Graph data.
Args:
tree: The parsed HTML document.
soup: The parsed HTML document.
Returns:
The Open Graph response as a dictionary.
@@ -174,13 +85,12 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag["property"]] = tag["content"]
# TODO: grab article: meta tags too, e.g.:
@@ -193,43 +103,43 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
if "og:title" not in og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
if title and title[0].text is not None:
og["og:title"] = title[0].text.strip()
title = soup.find(("title", "h1", "h2", "h3"))
if title and title.string:
og["og:title"] = title.string.strip()
else:
og["og:title"] = None
if "og:image" not in og:
# TODO: extract a favicon failing all else
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
meta_image = soup.find("meta", itemprop="image", content=True)
if meta_image:
og["og:image"] = meta_image[0]
og["og:image"] = meta_image["content"]
else:
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
def greater_than(tag: "Tag") -> bool:
if "width" not in tag or "height" not in tag:
return False
try:
return int(tag["width"]) > 10 and int(tag["height"]) > 10
except ValueError:
return False
images = [tag for tag in soup.find_all("img", src=True) if greater_than(tag)]
images = sorted(
images,
key=lambda i: (
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
)
if not images:
images = tree.xpath("//img[@src]")
images = soup.find_all("img", src=True)
if images:
og["og:image"] = images[0].attrib["src"]
og["og:image"] = images[0]["src"]
if "og:description" not in og:
meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content"
)
meta_description = soup.find("meta", attrs={"name": "description"}, content=True)
if meta_description:
og["og:description"] = meta_description[0]
og["og:description"] = meta_description["content"]
else:
og["og:description"] = parse_html_description(tree)
og["og:description"] = parse_html_description(soup)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -240,7 +150,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
return og
def parse_html_description(tree: "etree.Element") -> Optional[str]:
def parse_html_description(soup: "BeautifulSoup") -> Optional[str]:
"""
Calculate a text description based on an HTML document.
@@ -251,14 +161,11 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
This is a very very very coarse approximation to a plain text render of the page.
Args:
tree: The parsed HTML document.
soup: The parsed HTML document.
Returns:
The plain text description, or None if one cannot be generated.
"""
# We don't just use XPATH here as that is slow on some machines.
from lxml import etree
TAGS_TO_REMOVE = (
"header",
@@ -268,52 +175,44 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"script",
"noscript",
"style",
etree.Comment,
)
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)
def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
soup: Optional["Tag"], *tags_to_ignore: Iterable[str]
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
"""Iterate over the document returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
"""
if not soup:
return
from bs4.element import NavigableString, Tag
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string
# to be returned.
elements = iter([tree])
elements: Iterator["PageElement"] = iter([soup])
while True:
el = next(elements, None)
if el is None:
return
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:
yield el.text
# We add to the stack all the elements children, interspersed with
# each child's tail text (if it exists). The tail text of a node
# is text that comes *after* the node, so we always include it even
# if we ignore the child node.
elements = itertools.chain(
itertools.chain.from_iterable( # Basically a flatmap
[child, child.tail] if child.tail else [child]
for child in el.iterchildren()
),
elements,
)
# Do not consider sub-classes of NavigableString since those represent
# comments, etc.
if type(el) == NavigableString:
yield str(el)
elif isinstance(el, Tag) and el.name not in tags_to_ignore:
# We add to the stack all the elements children.
elements = itertools.chain(el.contents, elements)
def summarize_paragraphs(
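Taken together, the new helpers form a two-step pipeline: decode_body() turns raw bytes into a BeautifulSoup document (or None), and parse_html_to_open_graph() walks that soup. A short usage sketch, mirroring the test cases further down:

from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph

html = b"""
<html>
<head><title>Foo</title></head>
<body>Some text.</body>
</html>
"""
soup = decode_body(html, "http://example.com/test.html")
if soup is not None:
    og = parse_html_to_open_graph(soup)
    # og should be {"og:title": "Foo", "og:description": "Some text."}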

View File: synapse/rest/media/v1/preview_url_resource.py

@@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource):
# define our OG response for this media
elif _is_html(media_info.media_type):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
# TODO: somehow stop a big HTML document from exploding synapse's RAM
with open(media_info.filename, "rb") as file:
body = file.read()
tree = decode_body(body, media_info.uri, media_info.media_type)
if tree is not None:
soup = decode_body(body, media_info.uri)
if soup is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
oembed_url = self._oembed.autodiscover_from_html(tree)
oembed_url = self._oembed.autodiscover_from_html(soup)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._handle_url(
@@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
og_from_html = parse_html_to_open_graph(tree)
og_from_html = parse_html_to_open_graph(soup)
# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
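The "overlaying" mentioned in this comment is unchanged by the diff: values scraped from the HTML act as the base, and anything the oEmbed response supplied takes precedence. A minimal sketch of that merge (illustrative values, not code from this diff):

# HTML-scraped data is the fallback; oEmbed data overlays it.
og_from_html = {"og:title": "From HTML", "og:description": "Scraped text"}
og_from_oembed = {"og:title": "From oEmbed"}

og = {**og_from_html, **og_from_oembed}
# -> {"og:title": "From oEmbed", "og:description": "Scraped text"}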

View File: tests/rest/media/v1/test_html_preview.py

@@ -13,7 +13,6 @@
# limitations under the License.
from synapse.rest.media.v1.preview_html import (
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
summarize_paragraphs,
@@ -159,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -175,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -194,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(
og,
@@ -216,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -230,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@@ -245,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@@ -260,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_empty(self) -> None:
"""Test a body with no data in it."""
html = b""
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
soup = decode_body(html, "http://example.com/test.html")
self.assertIsNone(soup)
def test_no_tree(self) -> None:
"""A valid body with no tree in it."""
def test_no_soup(self) -> None:
"""A valid body with no soup in it."""
html = b"\x00"
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
soup = decode_body(html, "http://example.com/test.html")
self.assertIsNone(soup)
def test_xml(self) -> None:
"""Test decoding XML and ensure it works properly."""
@@ -288,22 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding(self) -> None:
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Some text.
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self) -> None:
@@ -317,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
def test_windows_1252(self) -> None:
@@ -331,119 +316,6 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="ascii">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
# A less well-formed version.
encodings = _get_html_media_encodings(
b"""
<html>
<head>< meta charset = ascii>
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_meta_charset_underscores(self) -> None:
"""A character encoding contains underscore."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="Shift_JIS">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
def test_xml_encoding(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_meta_xml_encoding(self) -> None:
"""Meta tags take precedence over XML encoding."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
<head><meta charset="UTF-16">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
def test_content_type(self) -> None:
"""A character encoding is found via the Content-Type header."""
# Test a few variations of the header.
headers = (
'text/html; charset="ascii";',
"text/html;charset=ascii;",
'text/html; charset="ascii"',
"text/html; charset=ascii",
'text/html; charset="ascii;',
'text/html; charset=ascii";',
)
for header in headers:
encodings = _get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_fallback(self) -> None:
"""A character encoding cannot be found in the body or header."""
encodings = _get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_duplicates(self) -> None:
"""Ensure each encoding is only attempted once."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="utf8"?>
<html>
<head><meta charset="UTF-8">
</head>
</html>
""",
'text/html; charset="UTF_8"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_unknown_invalid(self) -> None:
"""A character encoding should be ignored if it is unknown or invalid."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="invalid">
</head>
</html>
""",
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
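The removed MediaEncodingTestCase exercised the hand-rolled charset sniffing in _get_html_media_encodings, which by its own docstring roughly followed BeautifulSoup's bs4.dammit.EncodingDetector; with this change that detection happens inside BeautifulSoup itself, so these tests are deleted along with the helper. A small illustration of the bs4 side (not part of this diff):

from bs4.dammit import EncodingDetector

# EncodingDetector picks up byte-order marks and declared charsets, much like
# the removed _charset_match / _xml_encoding_match regexes did by hand.
body = b'<html><head><meta charset="shift_jis"></head><body>Some text.</body></html>'
print(EncodingDetector.find_declared_encoding(body, is_html=True))  # "shift_jis"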