
Compare commits


5 Commits

Author         SHA1        Message                                      Date
Patrick Cloke  6bed82d324  temp                                         2022-05-24 13:23:49 -04:00
Patrick Cloke  ecc942ff26  Rename tree to soup.                         2022-05-24 13:21:59 -04:00
Patrick Cloke  11a9925252  Re-use decode_body.                          2022-05-24 13:19:17 -04:00
Patrick Cloke  6235ed2656  Remove dead code.                            2022-05-24 13:18:58 -04:00
Patrick Cloke  2b46d28c81  Use BeautifulSoup instead of LXML directly.  2022-05-24 13:17:14 -04:00
6 changed files with 125 additions and 367 deletions

View File: mypy.ini

@@ -139,6 +139,9 @@ disallow_untyped_defs = True
[mypy-authlib.*]
ignore_missing_imports = True
[mypy-bcrypt]
ignore_missing_imports = True
[mypy-canonicaljson]
ignore_missing_imports = True

View File: pyproject.toml

@@ -173,6 +173,7 @@ authlib = { version = ">=0.14.0", optional = true }
# Note: systemd-python 231 appears to have been yanked from pypi
systemd-python = { version = ">=231", optional = true }
lxml = { version = ">=4.2.0", optional = true }
beautifulsoup4 = { version = ">=4.10.0", optional = true }
sentry-sdk = { version = ">=0.7.2", optional = true }
opentracing = { version = ">=2.2.0", optional = true }
jaeger-client = { version = ">=4.0.0", optional = true }
@@ -194,7 +195,7 @@ oidc = ["authlib"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python"]
url_preview = ["lxml"]
url_preview = ["lxml", "beautifulsoup4"]
sentry = ["sentry-sdk"]
opentracing = ["jaeger-client", "opentracing"]
jwt = ["pyjwt"]
@@ -263,6 +264,7 @@ types-pyOpenSSL = ">=20.0.7"
types-PyYAML = ">=5.4.10"
types-requests = ">=2.26.0"
types-setuptools = ">=57.4.0"
types-beautifulsoup4 = ">=4.10.5"
# Dependencies which are exclusively required by unit test code. This is
# NOT a list of all modules that are necessary to run the unit tests.
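The url_preview extra now pulls in both lxml and beautifulsoup4: the new preview code asks BeautifulSoup for the "lxml" tree builder, so lxml stays as the underlying parser rather than being used directly. A minimal sketch of that relationship (illustrative only, not part of this diff):

from bs4 import BeautifulSoup

# BeautifulSoup delegates the actual parsing to lxml when the "lxml"
# features string is requested, which is why the extra keeps both packages.
soup = BeautifulSoup(b"<html><head><title>Hi</title></head></html>", "lxml")
print(soup.title.string)  # "Hi"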

View File: synapse/rest/media/v1/oembed.py

@@ -17,12 +17,12 @@ from typing import TYPE_CHECKING, List, Optional
import attr
from synapse.rest.media.v1.preview_html import parse_html_description
from synapse.rest.media.v1.preview_html import decode_body, parse_html_description
from synapse.types import JsonDict
from synapse.util import json_decoder
if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup
from synapse.server import HomeServer
@@ -97,29 +97,25 @@ class OEmbedProvider:
# No match.
return None
def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Args:
tree: The parsed HTML body.
soup: The parsed HTML body.
Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
for tag in tree.xpath(
"//link[@rel='alternate'][@type='application/json+oembed']"
):
if "href" in tag.attrib:
return tag.attrib["href"]
# Some providers (e.g. Flickr) use alternative instead of alternate.
for tag in tree.xpath(
"//link[@rel='alternative'][@type='application/json+oembed']"
for tag in soup.find_all(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
href=True,
):
if "href" in tag.attrib:
return tag.attrib["href"]
return tag["href"]
return None
@@ -174,7 +170,7 @@ class OEmbedProvider:
# Process each type separately.
oembed_type = oembed["type"]
if oembed_type == "rich":
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
@@ -182,7 +178,7 @@ class OEmbedProvider:
elif oembed_type == "video":
open_graph_response["og:type"] = "video.other"
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
open_graph_response["og:video:width"] = oembed["width"]
open_graph_response["og:video:height"] = oembed["height"]
@@ -202,54 +198,40 @@ class OEmbedProvider:
return OEmbedResult(open_graph_response, author_name, cache_age)
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
results = []
for tag in tree.xpath("//*/" + tag_name):
if "src" in tag.attrib:
results.append(tag.attrib["src"])
return results
def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]:
return [tag["src"] for tag in soup.find_all(tag_name, src=True)]
def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
def calc_description_and_urls(
open_graph_response: JsonDict, html_body: str, url: str
) -> None:
"""
Calculate description for an HTML document.
This uses lxml to convert the HTML document into plaintext. If errors
This uses BeautifulSoup to convert the HTML document into plaintext. If errors
occur during processing of the document, an empty response is returned.
Args:
open_graph_response: The current Open Graph summary. This is updated with additional fields.
html_body: The HTML document, as bytes.
Returns:
The summary
url: The URL which is being previewed (not the one which was requested).
"""
soup = decode_body(html_body, url)
# If there's no body, nothing useful is going to be found.
if not html_body:
return
from lxml import etree
# Create an HTML parser. If this fails, log and return no metadata.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)
# The data was successfully parsed, but no tree was found.
if tree is None:
if not soup:
return
# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
image_urls = _fetch_urls(soup, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]
video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]
description = parse_html_description(tree)
description = parse_html_description(soup)
if description:
open_graph_response["og:description"] = description
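The rewritten autodiscover_from_html collapses the two separate XPath queries (rel="alternate" and rel="alternative") into one find_all() call. A small sketch of how that filter behaves against sample markup (the HTML and URLs below are made up for illustration):

from bs4 import BeautifulSoup

html = b'''
<html><head>
<link rel="alternate" type="application/json+oembed" href="https://example.com/oembed?format=json">
<link rel="alternative" type="application/json+oembed" href="https://example.com/oembed-alt">
</head></html>
'''
soup = BeautifulSoup(html, "lxml")

# Same filter as above: either rel spelling, the oEmbed type, and an href present.
for tag in soup.find_all(
    "link",
    rel=("alternate", "alternative"),
    type="application/json+oembed",
    href=True,
):
    print(tag["href"])
# autodiscover_from_html() returns the first such href, or None if there is none.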

View File: synapse/rest/media/v1/preview_html.py

@@ -11,101 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Union
if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup
from bs4.element import PageElement, Tag
logger = logging.getLogger(__name__)
_charset_match = re.compile(
rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
)
_xml_encoding_match = re.compile(
rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
try:
return codecs.lookup(encoding).name
except LookupError:
return None
def _get_html_media_encodings(
body: bytes, content_type: Optional[str]
) -> Iterable[str]:
def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
"""
Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
The precedence used for finding a character encoding is:
1. <meta> tag with a charset declared.
2. The XML document's character encoding attribute.
3. The Content-Type header.
4. Fallback to utf-8.
5. Fallback to windows-1252.
This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
Args:
body: The HTML document, as bytes.
content_type: The Content-Type header.
Returns:
The character encoding of the body, as a string.
"""
# There's no point in returning an encoding more than once.
attempted_encodings: Set[str] = set()
# Limit searches to the first 1kb, since it ought to be at the top.
body_start = body[:1024]
# Check if it has an encoding set in a meta tag.
match = _charset_match.search(body_start)
if match:
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding:
attempted_encodings.add(encoding)
yield encoding
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
# Check if it has an XML document with an encoding.
match = _xml_encoding_match.match(body_start)
if match:
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding
# Check the HTTP Content-Type header for a character set.
if content_type:
content_match = _content_type_match.match(content_type)
if content_match:
encoding = _normalise_encoding(content_match.group(1))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding
# Finally, fallback to UTF-8, then windows-1252.
for fallback in ("utf-8", "cp1252"):
if fallback not in attempted_encodings:
yield fallback
def decode_body(
body: bytes, uri: str, content_type: Optional[str] = None
) -> Optional["etree.Element"]:
"""
This uses lxml to parse the HTML document.
This uses BeautifulSoup to parse the HTML document.
Args:
body: The HTML document, as bytes.
@@ -119,39 +41,28 @@ def decode_body(
if not body:
return None
# The idea here is that multiple encodings are tried until one works.
# Unfortunately the result is never used and then LXML will decode the string
# again with the found encoding.
for encoding in _get_html_media_encodings(body, content_type):
try:
body.decode(encoding)
except Exception:
pass
else:
break
else:
from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup
try:
soup = BeautifulSoup(body, "lxml")
# If an empty document is returned, convert to None.
if not len(soup):
return None
return soup
except ParserRejectedMarkup:
logger.warning("Unable to decode HTML body for %s", uri)
return None
from lxml import etree
# Create an HTML parser.
parser = etree.HTMLParser(recover=True, encoding=encoding)
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body, parser)
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Calculate metadata for an HTML document.
This uses lxml to search the HTML document for Open Graph data (or
synthesizes it from the document).
This uses BeautifulSoup to search the HTML document for Open Graph data.
Args:
tree: The parsed HTML document.
soup: The parsed HTML document.
Returns:
The Open Graph response as a dictionary.
@@ -174,13 +85,12 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag["property"]] = tag["content"]
# TODO: grab article: meta tags too, e.g.:
@@ -193,43 +103,43 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
if "og:title" not in og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
if title and title[0].text is not None:
og["og:title"] = title[0].text.strip()
title = soup.find(("title", "h1", "h2", "h3"))
if title and title.string:
og["og:title"] = title.string.strip()
else:
og["og:title"] = None
if "og:image" not in og:
# TODO: extract a favicon failing all else
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
meta_image = soup.find("meta", itemprop="image", content=True)
if meta_image:
og["og:image"] = meta_image[0]
og["og:image"] = meta_image["content"]
else:
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
def greater_than(tag: "Tag") -> bool:
if "width" not in tag or "height" not in tag:
return False
try:
return int(tag["width"]) > 10 and int(tag["height"]) > 10
except ValueError:
return False
images = [tag for tag in soup.find_all("img", src=True) if greater_than(tag)]
images = sorted(
images,
key=lambda i: (
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
)
if not images:
images = tree.xpath("//img[@src]")
images = soup.find_all("img", src=True)
if images:
og["og:image"] = images[0].attrib["src"]
og["og:image"] = images[0]["src"]
if "og:description" not in og:
meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content"
)
meta_description = soup.find("meta", attrs={"name": "description"}, content=True)
if meta_description:
og["og:description"] = meta_description[0]
og["og:description"] = meta_description["content"]
else:
og["og:description"] = parse_html_description(tree)
og["og:description"] = parse_html_description(soup)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -240,7 +150,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
return og
def parse_html_description(tree: "etree.Element") -> Optional[str]:
def parse_html_description(soup: "BeautifulSoup") -> Optional[str]:
"""
Calculate a text description based on an HTML document.
@@ -251,14 +161,11 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
This is a very very very coarse approximation to a plain text render of the page.
Args:
tree: The parsed HTML document.
soup: The parsed HTML document.
Returns:
The plain text description, or None if one cannot be generated.
"""
# We don't just use XPATH here as that is slow on some machines.
from lxml import etree
TAGS_TO_REMOVE = (
"header",
@@ -268,52 +175,44 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"script",
"noscript",
"style",
etree.Comment,
)
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)
def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
soup: Optional["Tag"], *tags_to_ignore: Iterable[str]
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
"""Iterate over the document returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
"""
if not soup:
return
from bs4.element import NavigableString, Tag
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string
# to be returned.
elements = iter([tree])
elements: Iterator["PageElement"] = iter([soup])
while True:
el = next(elements, None)
if el is None:
return
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:
yield el.text
# We add to the stack all the elements children, interspersed with
# each child's tail text (if it exists). The tail text of a node
# is text that comes *after* the node, so we always include it even
# if we ignore the child node.
elements = itertools.chain(
itertools.chain.from_iterable( # Basically a flatmap
[child, child.tail] if child.tail else [child]
for child in el.iterchildren()
),
elements,
)
# Do not consider sub-classes of NavigableString since those represent
# comments, etc.
if type(el) == NavigableString:
yield str(el)
elif isinstance(el, Tag) and el.name not in tags_to_ignore:
# We add to the stack all the elements children.
elements = itertools.chain(el.contents, elements)
def summarize_paragraphs(
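Taken together, the new helpers form a two-step pipeline: decode_body() turns raw bytes into a BeautifulSoup document (or None), and parse_html_to_open_graph() walks that soup. A short usage sketch, mirroring the test cases further down:

from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph

html = b"""
<html>
<head><title>Foo</title></head>
<body>Some text.</body>
</html>
"""
soup = decode_body(html, "http://example.com/test.html")
if soup is not None:
    og = parse_html_to_open_graph(soup)
    # og should be {"og:title": "Foo", "og:description": "Some text."}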

View File: synapse/rest/media/v1/preview_url_resource.py

@@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource):
# define our OG response for this media
elif _is_html(media_info.media_type):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
# TODO: somehow stop a big HTML document from exploding synapse's RAM
with open(media_info.filename, "rb") as file:
body = file.read()
tree = decode_body(body, media_info.uri, media_info.media_type)
if tree is not None:
soup = decode_body(body, media_info.uri)
if soup is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
oembed_url = self._oembed.autodiscover_from_html(tree)
oembed_url = self._oembed.autodiscover_from_html(soup)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._handle_url(
@@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
og_from_html = parse_html_to_open_graph(tree)
og_from_html = parse_html_to_open_graph(soup)
# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
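The "overlaying" mentioned in this comment is unchanged by the diff: values scraped from the HTML act as the base, and anything the oEmbed response supplied takes precedence. A minimal sketch of that merge (illustrative values, not code from this diff):

# HTML-scraped data is the fallback; oEmbed data overlays it.
og_from_html = {"og:title": "From HTML", "og:description": "Scraped text"}
og_from_oembed = {"og:title": "From oEmbed"}

og = {**og_from_html, **og_from_oembed}
# -> {"og:title": "From oEmbed", "og:description": "Scraped text"}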

View File: tests/rest/media/v1/test_html_preview.py

@@ -13,7 +13,6 @@
# limitations under the License.
from synapse.rest.media.v1.preview_html import (
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
summarize_paragraphs,
@@ -159,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -175,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -194,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(
og,
@@ -216,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@@ -230,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@@ -245,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@@ -260,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_empty(self) -> None:
"""Test a body with no data in it."""
html = b""
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
soup = decode_body(html, "http://example.com/test.html")
self.assertIsNone(soup)
def test_no_tree(self) -> None:
"""A valid body with no tree in it."""
def test_no_soup(self) -> None:
"""A valid body with no soup in it."""
html = b"\x00"
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)
soup = decode_body(html, "http://example.com/test.html")
self.assertIsNone(soup)
def test_xml(self) -> None:
"""Test decoding XML and ensure it works properly."""
@@ -288,22 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding(self) -> None:
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Some text.
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self) -> None:
@@ -317,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
def test_windows_1252(self) -> None:
@@ -331,119 +316,6 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
soup = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(soup)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="ascii">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
# A less well-formed version.
encodings = _get_html_media_encodings(
b"""
<html>
<head>< meta charset = ascii>
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_meta_charset_underscores(self) -> None:
"""A character encoding contains underscore."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="Shift_JIS">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
def test_xml_encoding(self) -> None:
"""A character encoding is found via the meta tag."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_meta_xml_encoding(self) -> None:
"""Meta tags take precedence over XML encoding."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
<head><meta charset="UTF-16">
</head>
</html>
""",
"text/html",
)
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
def test_content_type(self) -> None:
"""A character encoding is found via the Content-Type header."""
# Test a few variations of the header.
headers = (
'text/html; charset="ascii";',
"text/html;charset=ascii;",
'text/html; charset="ascii"',
"text/html; charset=ascii",
'text/html; charset="ascii;',
'text/html; charset=ascii";',
)
for header in headers:
encodings = _get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_fallback(self) -> None:
"""A character encoding cannot be found in the body or header."""
encodings = _get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_duplicates(self) -> None:
"""Ensure each encoding is only attempted once."""
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="utf8"?>
<html>
<head><meta charset="UTF-8">
</head>
</html>
""",
'text/html; charset="UTF_8"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_unknown_invalid(self) -> None:
"""A character encoding should be ignored if it is unknown or invalid."""
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="invalid">
</head>
</html>
""",
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
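The removed MediaEncodingTestCase exercised the hand-rolled charset sniffing in _get_html_media_encodings, which by its own docstring roughly followed BeautifulSoup's bs4.dammit.EncodingDetector; with this change that detection happens inside BeautifulSoup itself, so these tests are deleted along with the helper. A small illustration of the bs4 side (not part of this diff):

from bs4.dammit import EncodingDetector

# EncodingDetector picks up byte-order marks and declared charsets, much like
# the removed _charset_match / _xml_encoding_match regexes did by hand.
body = b'<html><head><meta charset="shift_jis"></head><body>Some text.</body></html>'
print(EncodingDetector.find_declared_encoding(body, is_html=True))  # "shift_jis"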