Compare commits
5 Commits
devon/acl-
...
clokep/bs4
| Author | SHA1 | Date |
|---|---|---|
| | 6bed82d324 | |
| | ecc942ff26 | |
| | 11a9925252 | |
| | 6235ed2656 | |
| | 2b46d28c81 | |
mypy.ini (3 changed lines)
@@ -139,6 +139,9 @@ disallow_untyped_defs = True
 [mypy-authlib.*]
 ignore_missing_imports = True
 
 [mypy-bcrypt]
 ignore_missing_imports = True
 
 [mypy-canonicaljson]
 ignore_missing_imports = True
 
@@ -173,6 +173,7 @@ authlib = { version = ">=0.14.0", optional = true }
 # Note: systemd-python 231 appears to have been yanked from pypi
 systemd-python = { version = ">=231", optional = true }
 lxml = { version = ">=4.2.0", optional = true }
+beautifulsoup4 = { version = ">=4.10.0", optional = true }
 sentry-sdk = { version = ">=0.7.2", optional = true }
 opentracing = { version = ">=2.2.0", optional = true }
 jaeger-client = { version = ">=4.0.0", optional = true }
@@ -194,7 +195,7 @@ oidc = ["authlib"]
 # `systemd.journal.JournalHandler`, as is documented in
 # `contrib/systemd/log_config.yaml`.
 systemd = ["systemd-python"]
-url_preview = ["lxml"]
+url_preview = ["lxml", "beautifulsoup4"]
 sentry = ["sentry-sdk"]
 opentracing = ["jaeger-client", "opentracing"]
 jwt = ["pyjwt"]
@@ -263,6 +264,7 @@ types-pyOpenSSL = ">=20.0.7"
 types-PyYAML = ">=5.4.10"
 types-requests = ">=2.26.0"
 types-setuptools = ">=57.4.0"
+types-beautifulsoup4 = ">=4.10.5"
 
 # Dependencies which are exclusively required by unit test code. This is
 # NOT a list of all modules that are necessary to run the unit tests.
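The packaging hunks above add beautifulsoup4 to the url_preview optional dependencies and types-beautifulsoup4 for the type checkers. As a hedged illustration only (not code from this branch), an optional dependency like this is usually guarded at import time so that a missing extra fails with a clear message; the helper name and error text below are invented:

```python
from typing import Any


def require_beautifulsoup() -> Any:
    """Illustrative only: fail loudly if the optional beautifulsoup4
    dependency (part of the url_preview extra) is not installed."""
    try:
        import bs4  # provided by the beautifulsoup4 package
    except ImportError as e:
        raise RuntimeError(
            "URL preview support needs beautifulsoup4; install the "
            "url_preview extra, e.g. pip install 'matrix-synapse[url_preview]'"
        ) from e
    return bs4
```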
@@ -17,12 +17,12 @@ from typing import TYPE_CHECKING, List, Optional
 
 import attr
 
-from synapse.rest.media.v1.preview_html import parse_html_description
+from synapse.rest.media.v1.preview_html import decode_body, parse_html_description
 from synapse.types import JsonDict
 from synapse.util import json_decoder
 
 if TYPE_CHECKING:
-    from lxml import etree
+    from bs4 import BeautifulSoup
 
     from synapse.server import HomeServer
 
@@ -97,29 +97,25 @@ class OEmbedProvider:
         # No match.
         return None
 
-    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+    def autodiscover_from_html(self, soup: "BeautifulSoup") -> Optional[str]:
         """
         Search an HTML document for oEmbed autodiscovery information.
 
         Args:
-            tree: The parsed HTML body.
+            soup: The parsed HTML body.
 
         Returns:
             The URL to use for oEmbed information, or None if no URL was found.
         """
         # Search for link elements with the proper rel and type attributes.
-        for tag in tree.xpath(
-            "//link[@rel='alternate'][@type='application/json+oembed']"
-        ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
-
         # Some providers (e.g. Flickr) use alternative instead of alternate.
-        for tag in tree.xpath(
-            "//link[@rel='alternative'][@type='application/json+oembed']"
+        for tag in soup.find_all(
+            "link",
+            rel=("alternate", "alternative"),
+            type="application/json+oembed",
+            href=True,
         ):
-            if "href" in tag.attrib:
-                return tag.attrib["href"]
+            return tag["href"]
 
         return None
 
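The rewritten autodiscovery relies on two BeautifulSoup behaviours: an iterable passed as an attribute filter matches any of its values, and href=True keeps only tags that actually carry an href. A small standalone sketch of that query (the markup is invented for illustration):

```python
from bs4 import BeautifulSoup

html = """
<html><head>
  <link rel="alternative" type="application/json+oembed" href="https://example.com/oembed?format=json">
  <link rel="stylesheet" href="/style.css">
</head></html>
"""
soup = BeautifulSoup(html, "lxml")

for tag in soup.find_all(
    "link",
    rel=("alternate", "alternative"),
    type="application/json+oembed",
    href=True,
):
    # Flickr-style rel="alternative" links are caught by the same query.
    print(tag["href"])  # https://example.com/oembed?format=json
```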
@@ -174,7 +170,7 @@ class OEmbedProvider:
         # Process each type separately.
         oembed_type = oembed["type"]
         if oembed_type == "rich":
-            calc_description_and_urls(open_graph_response, oembed["html"])
+            calc_description_and_urls(open_graph_response, oembed["html"], url)
 
         elif oembed_type == "photo":
             # If this is a photo, use the full image, not the thumbnail.
@@ -182,7 +178,7 @@ class OEmbedProvider:
 
         elif oembed_type == "video":
             open_graph_response["og:type"] = "video.other"
-            calc_description_and_urls(open_graph_response, oembed["html"])
+            calc_description_and_urls(open_graph_response, oembed["html"], url)
             open_graph_response["og:video:width"] = oembed["width"]
             open_graph_response["og:video:height"] = oembed["height"]
@@ -202,54 +198,40 @@ class OEmbedProvider:
         return OEmbedResult(open_graph_response, author_name, cache_age)
 
 
-def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
-    results = []
-    for tag in tree.xpath("//*/" + tag_name):
-        if "src" in tag.attrib:
-            results.append(tag.attrib["src"])
-    return results
+def _fetch_urls(soup: "BeautifulSoup", tag_name: str) -> List[str]:
+    return [tag["src"] for tag in soup.find_all(tag_name, src=True)]
 
 
-def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
+def calc_description_and_urls(
+    open_graph_response: JsonDict, html_body: str, url: str
+) -> None:
     """
     Calculate description for an HTML document.
 
-    This uses lxml to convert the HTML document into plaintext. If errors
+    This uses BeautifulSoup to convert the HTML document into plaintext. If errors
     occur during processing of the document, an empty response is returned.
 
     Args:
         open_graph_response: The current Open Graph summary. This is updated with additional fields.
         html_body: The HTML document, as bytes.
-
-    Returns:
-        The summary
+        url: The URL which is being previewed (not the one which was requested).
     """
+    soup = decode_body(html_body, url)
+
-    # If there's no body, nothing useful is going to be found.
-    if not html_body:
-        return
-
-    from lxml import etree
-
-    # Create an HTML parser. If this fails, log and return no metadata.
-    parser = etree.HTMLParser(recover=True, encoding="utf-8")
-
-    # Attempt to parse the body. If this fails, log and return no metadata.
-    tree = etree.fromstring(html_body, parser)
-
-    # The data was successfully parsed, but no tree was found.
-    if tree is None:
+    if not soup:
         return
 
     # Attempt to find interesting URLs (images, videos, embeds).
     if "og:image" not in open_graph_response:
-        image_urls = _fetch_urls(tree, "img")
+        image_urls = _fetch_urls(soup, "img")
        if image_urls:
             open_graph_response["og:image"] = image_urls[0]
 
-    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
+    video_urls = _fetch_urls(soup, "video") + _fetch_urls(soup, "embed")
     if video_urls:
         open_graph_response["og:video"] = video_urls[0]
 
-    description = parse_html_description(tree)
+    description = parse_html_description(soup)
     if description:
         open_graph_response["og:description"] = description
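The one-line `_fetch_urls` works because `find_all(tag_name, src=True)` already filters out tags that have no src attribute, so no per-tag existence check is needed. A minimal sketch, with invented markup:

```python
from bs4 import BeautifulSoup

html = """
<html><body>
  <img src="/a.png"><img alt="no source">
  <video src="/clip.mp4"></video>
</body></html>
"""
soup = BeautifulSoup(html, "lxml")

# src=True keeps only tags that carry a src attribute at all.
image_urls = [tag["src"] for tag in soup.find_all("img", src=True)]
video_urls = [tag["src"] for tag in soup.find_all("video", src=True)]

print(image_urls)  # ['/a.png']
print(video_urls)  # ['/clip.mp4']
```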
@@ -11,101 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import codecs
 import itertools
 import logging
 import re
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Iterator, Optional, Union
 
 if TYPE_CHECKING:
-    from lxml import etree
+    from bs4 import BeautifulSoup
+    from bs4.element import PageElement, Tag
 
 logger = logging.getLogger(__name__)
 
-_charset_match = re.compile(
-    rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
-)
-_xml_encoding_match = re.compile(
-    rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
-)
-_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
-
-
-def _normalise_encoding(encoding: str) -> Optional[str]:
-    """Use the Python codec's name as the normalised entry."""
-    try:
-        return codecs.lookup(encoding).name
-    except LookupError:
-        return None
-
-
-def _get_html_media_encodings(
-    body: bytes, content_type: Optional[str]
-) -> Iterable[str]:
+def decode_body(body: Union[bytes, str], uri: str) -> Optional["BeautifulSoup"]:
-    """
-    Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
-
-    The precedence used for finding a character encoding is:
-
-    1. <meta> tag with a charset declared.
-    2. The XML document's character encoding attribute.
-    3. The Content-Type header.
-    4. Fallback to utf-8.
-    5. Fallback to windows-1252.
-
-    This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
-
-    Args:
-        body: The HTML document, as bytes.
-        content_type: The Content-Type header.
-
-    Returns:
-        The character encoding of the body, as a string.
-    """
-    # There's no point in returning an encoding more than once.
-    attempted_encodings: Set[str] = set()
-
-    # Limit searches to the first 1kb, since it ought to be at the top.
-    body_start = body[:1024]
-
-    # Check if it has an encoding set in a meta tag.
-    match = _charset_match.search(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-
-    # Check if it has an XML document with an encoding.
-    match = _xml_encoding_match.match(body_start)
-    if match:
-        encoding = _normalise_encoding(match.group(1).decode("ascii"))
-        if encoding and encoding not in attempted_encodings:
-            attempted_encodings.add(encoding)
-            yield encoding
-
-    # Check the HTTP Content-Type header for a character set.
-    if content_type:
-        content_match = _content_type_match.match(content_type)
-        if content_match:
-            encoding = _normalise_encoding(content_match.group(1))
-            if encoding and encoding not in attempted_encodings:
-                attempted_encodings.add(encoding)
-                yield encoding
-
-    # Finally, fallback to UTF-8, then windows-1252.
-    for fallback in ("utf-8", "cp1252"):
-        if fallback not in attempted_encodings:
-            yield fallback
-
-
-def decode_body(
-    body: bytes, uri: str, content_type: Optional[str] = None
-) -> Optional["etree.Element"]:
     """
-    This uses lxml to parse the HTML document.
+    This uses BeautifulSoup to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
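The hand-rolled charset sniffing (`_charset_match`, `_xml_encoding_match`, `_get_html_media_encodings`) is removed because BeautifulSoup performs its own encoding detection via bs4.dammit when it is handed raw bytes. A rough illustration of that behaviour; the reported encoding can vary with the installed parsers and whether chardet is available:

```python
from bs4 import BeautifulSoup

# windows-1252 bytes with the charset declared in a meta tag.
body = b'<html><head><meta charset="windows-1252"></head><body>caf\xe9</body></html>'

soup = BeautifulSoup(body, "lxml")

# BeautifulSoup decodes the bytes itself and records what it used.
print(soup.original_encoding)  # typically "windows-1252"
print(soup.body.get_text())    # café
```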
@@ -119,39 +41,28 @@ def decode_body(
     if not body:
         return None
 
-    # The idea here is that multiple encodings are tried until one works.
-    # Unfortunately the result is never used and then LXML will decode the string
-    # again with the found encoding.
-    for encoding in _get_html_media_encodings(body, content_type):
-        try:
-            body.decode(encoding)
-        except Exception:
-            pass
-        else:
-            break
-    else:
+    from bs4 import BeautifulSoup
+    from bs4.builder import ParserRejectedMarkup
+
+    try:
+        soup = BeautifulSoup(body, "lxml")
+        # If an empty document is returned, convert to None.
+        if not len(soup):
+            return None
+        return soup
+    except ParserRejectedMarkup:
         logger.warning("Unable to decode HTML body for %s", uri)
         return None
-
-    from lxml import etree
-
-    # Create an HTML parser.
-    parser = etree.HTMLParser(recover=True, encoding=encoding)
-
-    # Attempt to parse the body. Returns None if the body was successfully
-    # parsed, but no tree was found.
-    return etree.fromstring(body, parser)
 
 
-def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(soup: "BeautifulSoup") -> Dict[str, Optional[str]]:
     """
-    Parse the HTML document into an Open Graph response.
+    Calculate metadata for an HTML document.
 
-    This uses lxml to search the HTML document for Open Graph data (or
-    synthesizes it from the document).
+    This uses BeautifulSoup to search the HTML document for Open Graph data.
 
     Args:
-        tree: The parsed HTML document.
+        soup: The parsed HTML document.
 
     Returns:
         The Open Graph response as a dictionary.
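`decode_body` now hands back the BeautifulSoup object itself, or None for an empty or unparseable body. A simplified stand-in (illustration only, not the branch's exact function) showing the behaviour the tests further down rely on:

```python
from typing import Optional, Union

from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup


def decode_body_sketch(body: Union[bytes, str], uri: str) -> Optional[BeautifulSoup]:
    """Simplified stand-in for the new decode_body, for illustration only."""
    if not body:
        return None
    try:
        soup = BeautifulSoup(body, "lxml")
    except ParserRejectedMarkup:
        return None
    # An empty document (e.g. a body of b"\x00") has no children at all.
    return soup if len(soup) else None


print(decode_body_sketch(b"", "http://example.com"))                       # None
print(decode_body_sketch(b"\x00", "http://example.com"))                   # None
print(decode_body_sketch(b"<p>hi</p>", "http://example.com") is not None)  # True
```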
@@ -174,13 +85,12 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
 
     og: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
-        if "content" in tag.attrib:
-            # if we've got more than 50 tags, someone is taking the piss
-            if len(og) >= 50:
-                logger.warning("Skipping OG for page with too many 'og:' tags")
-                return {}
-            og[tag.attrib["property"]] = tag.attrib["content"]
+    for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True):
+        # if we've got more than 50 tags, someone is taking the piss
+        if len(og) >= 50:
+            logger.warning("Skipping OG for page with too many 'og:' tags")
+            return {}
+        og[tag["property"]] = tag["content"]
 
     # TODO: grab article: meta tags too, e.g.:
 
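Collecting Open Graph tags now uses BeautifulSoup filters instead of an XPath starts-with expression: a compiled regex is matched against the property attribute, and content=True skips tags that have no content attribute. A self-contained sketch with invented markup:

```python
import re

from bs4 import BeautifulSoup

html = """
<html><head>
  <meta property="og:title" content="Foo">
  <meta property="og:description" content="Some text.">
  <meta property="unrelated" content="ignored">
  <meta property="og:broken">
</head></html>
"""
soup = BeautifulSoup(html, "lxml")

og = {
    tag["property"]: tag["content"]
    for tag in soup.find_all("meta", property=re.compile(r"^og:"), content=True)
}
print(og)  # {'og:title': 'Foo', 'og:description': 'Some text.'}
```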
@@ -193,43 +103,43 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
 
     if "og:title" not in og:
         # do some basic spidering of the HTML
-        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
-        if title and title[0].text is not None:
-            og["og:title"] = title[0].text.strip()
+        title = soup.find(("title", "h1", "h2", "h3"))
+        if title and title.string:
+            og["og:title"] = title.string.strip()
         else:
             og["og:title"] = None
 
     if "og:image" not in og:
         # TODO: extract a favicon failing all else
-        meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
-        )
+        meta_image = soup.find("meta", image="image")
         if meta_image:
-            og["og:image"] = meta_image[0]
+            og["og:image"] = meta_image["content"]
         else:
             # TODO: consider inlined CSS styles as well as width & height attribs
-            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
+            def greater_than(tag: "Tag") -> bool:
+                if "width" not in tag or "height" not in tag:
+                    return False
+                try:
+                    return int(tag["width"]) > 10 and int(tag["height"]) > 10
+                except ValueError:
+                    return False
+
+            images = soup.find_all("img", src=True, width=greater_than)
             images = sorted(
                 images,
-                key=lambda i: (
-                    -1 * float(i.attrib["width"]) * float(i.attrib["height"])
-                ),
+                key=lambda i: (-1 * float(i["width"]) * float(i["height"])),
             )
             if not images:
-                images = tree.xpath("//img[@src]")
+                images = soup.find_all("img", src=True)
             if images:
-                og["og:image"] = images[0].attrib["src"]
+                og["og:image"] = images[0]["src"]
 
     if "og:description" not in og:
-        meta_description = tree.xpath(
-            "//*/meta"
-            "[translate(@name, 'DESCRIPTION', 'description')='description']"
-            "/@content"
-        )
+        meta_description = soup.find("meta", description="description")
         if meta_description:
-            og["og:description"] = meta_description[0]
+            og["og:description"] = meta_description["content"]
         else:
-            og["og:description"] = parse_html_description(tree)
+            og["og:description"] = parse_html_description(soup)
     elif og["og:description"]:
         # This must be a non-empty string at this point.
         assert isinstance(og["og:description"], str)
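`find()` accepts a collection of tag names and returns the first match in document order, and `.string` yields the element's text when it has a single text child, which is what the title spidering above leans on. A quick illustration with invented markup:

```python
from bs4 import BeautifulSoup

html = "<html><body><h1> My Heading </h1><p>Body text</p></body></html>"
soup = BeautifulSoup(html, "lxml")

# Any of these tag names counts as a candidate title; the first hit wins.
title = soup.find(["title", "h1", "h2", "h3"])
if title and title.string:
    print(title.string.strip())  # My Heading
```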
@@ -240,7 +150,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     return og
 
 
-def parse_html_description(tree: "etree.Element") -> Optional[str]:
+def parse_html_description(soup: "BeautifulSoup") -> Optional[str]:
     """
     Calculate a text description based on an HTML document.
 
@@ -251,14 +161,11 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
     This is a very very very coarse approximation to a plain text render of the page.
 
     Args:
-        tree: The parsed HTML document.
+        soup: The parsed HTML document.
 
     Returns:
         The plain text description, or None if one cannot be generated.
     """
-    # We don't just use XPATH here as that is slow on some machines.
-
-    from lxml import etree
-
     TAGS_TO_REMOVE = (
         "header",
@@ -268,52 +175,44 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
         "script",
         "noscript",
         "style",
-        etree.Comment,
     )
 
     # Split all the text nodes into paragraphs (by splitting on new
     # lines)
     text_nodes = (
         re.sub(r"\s+", "\n", el).strip()
-        for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
+        for el in _iterate_over_text(soup.find("body"), *TAGS_TO_REMOVE)
     )
     return summarize_paragraphs(text_nodes)
 
 
 def _iterate_over_text(
-    tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
+    soup: Optional["Tag"], *tags_to_ignore: Iterable[str]
 ) -> Generator[str, None, None]:
-    """Iterate over the tree returning text nodes in a depth first fashion,
+    """Iterate over the document returning text nodes in a depth first fashion,
     skipping text nodes inside certain tags.
     """
+    if not soup:
+        return
+
+    from bs4.element import NavigableString, Tag
+
     # This is basically a stack that we extend using itertools.chain.
     # This will either consist of an element to iterate over *or* a string
     # to be returned.
-    elements = iter([tree])
+    elements: Iterator["PageElement"] = iter([soup])
     while True:
         el = next(elements, None)
         if el is None:
             return
 
-        if isinstance(el, str):
-            yield el
-        elif el.tag not in tags_to_ignore:
-            # el.text is the text before the first child, so we can immediately
-            # return it if the text exists.
-            if el.text:
-                yield el.text
-
-            # We add to the stack all the elements children, interspersed with
-            # each child's tail text (if it exists). The tail text of a node
-            # is text that comes *after* the node, so we always include it even
-            # if we ignore the child node.
-            elements = itertools.chain(
-                itertools.chain.from_iterable(  # Basically a flatmap
-                    [child, child.tail] if child.tail else [child]
-                    for child in el.iterchildren()
-                ),
-                elements,
-            )
+        # Do not consider sub-classes of NavigableString since those represent
+        # comments, etc.
+        if type(el) == NavigableString:
+            yield str(el)
+        elif isinstance(el, Tag) and el.name not in tags_to_ignore:
+            # We add to the stack all the elements children.
+            elements = itertools.chain(el.contents, elements)
 
 
 def summarize_paragraphs(
 
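In the BeautifulSoup tree every node is a PageElement: text is a NavigableString (with subclasses such as Comment for non-text nodes) and elements are Tags whose .contents list their children, so the depth-first text walk only needs a small explicit stack. A compact sketch of the same traversal idea, not the branch's exact function:

```python
import itertools
from typing import Iterator

from bs4 import BeautifulSoup
from bs4.element import NavigableString, PageElement, Tag

html = "<body><p>Hello <b>world</b></p><script>skip();</script><p>Bye</p></body>"
soup = BeautifulSoup(html, "lxml")


def iter_text(root: Tag, *tags_to_ignore: str) -> Iterator[str]:
    # Explicit stack of elements still to visit, depth first.
    elements: Iterator[PageElement] = iter([root])
    while (el := next(elements, None)) is not None:
        # Exact NavigableString only: subclasses like Comment are not page text.
        if type(el) is NavigableString:
            yield str(el)
        elif isinstance(el, Tag) and el.name not in tags_to_ignore:
            elements = itertools.chain(el.contents, elements)


print(list(iter_text(soup.find("body"), "script")))
# ['Hello ', 'world', 'Bye']
```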
@@ -298,16 +298,16 @@ class PreviewUrlResource(DirectServeJsonResource):
 
         # define our OG response for this media
         elif _is_html(media_info.media_type):
-            # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+            # TODO: somehow stop a big HTML document from exploding synapse's RAM
 
             with open(media_info.filename, "rb") as file:
                 body = file.read()
 
-            tree = decode_body(body, media_info.uri, media_info.media_type)
-            if tree is not None:
+            soup = decode_body(body, media_info.uri)
+            if soup is not None:
                 # Check if this HTML document points to oEmbed information and
                 # defer to that.
-                oembed_url = self._oembed.autodiscover_from_html(tree)
+                oembed_url = self._oembed.autodiscover_from_html(soup)
                 og_from_oembed: JsonDict = {}
                 if oembed_url:
                     oembed_info = await self._handle_url(
@@ -323,7 +323,7 @@ class PreviewUrlResource(DirectServeJsonResource):
 
                 # Parse Open Graph information from the HTML in case the oEmbed
                 # response failed or is incomplete.
-                og_from_html = parse_html_to_open_graph(tree)
+                og_from_html = parse_html_to_open_graph(soup)
 
                 # Compile the Open Graph response by using the scraped
                 # information from the HTML and overlaying any information
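Taken together, the preview path now threads one BeautifulSoup object through every stage: decode the body once, then feed the same soup to oEmbed autodiscovery, Open Graph parsing, and the plain-text description. A hedged end-to-end sketch using only the functions shown in this diff (it assumes this branch of Synapse is importable):

```python
from synapse.rest.media.v1.preview_html import (
    decode_body,
    parse_html_description,
    parse_html_to_open_graph,
)

body = b"<html><head><title>Foo</title></head><body>Some text.</body></html>"

soup = decode_body(body, "http://example.com/test.html")
if soup is not None:
    og = parse_html_to_open_graph(soup)         # {'og:title': 'Foo', 'og:description': 'Some text.'}
    description = parse_html_description(soup)  # 'Some text.'
```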
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from synapse.rest.media.v1.preview_html import (
-    _get_html_media_encodings,
     decode_body,
     parse_html_to_open_graph,
     summarize_paragraphs,
@@ -159,8 +158,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -175,8 +174,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -194,8 +193,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(
             og,
@@ -216,8 +215,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -230,8 +229,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
@@ -245,8 +244,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
@@ -260,22 +259,22 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self) -> None:
         """Test a body with no data in it."""
         html = b""
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(soup)
 
-    def test_no_tree(self) -> None:
-        """A valid body with no tree in it."""
+    def test_no_soup(self):
+        """A valid body with no soup in it."""
         html = b"\x00"
-        tree = decode_body(html, "http://example.com/test.html")
-        self.assertIsNone(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        self.assertIsNone(soup)
 
     def test_xml(self) -> None:
         """Test decoding XML and ensure it works properly."""
@@ -288,22 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
         <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
         <head><title>Foo</title></head><body>Some text.</body></html>
         """.strip()
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
-        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
-
-    def test_invalid_encoding(self) -> None:
-        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = b"""
-        <html>
-        <head><title>Foo</title></head>
-        <body>
-        Some text.
-        </body>
-        </html>
-        """
-        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self) -> None:
@@ -317,8 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
 
     def test_windows_1252(self) -> None:
@@ -331,119 +316,6 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree)
+        soup = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(soup)
         self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
-
-
-class MediaEncodingTestCase(unittest.TestCase):
-    def test_meta_charset(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="ascii">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-        # A less well-formed version.
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head>< meta charset = ascii>
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_charset_underscores(self) -> None:
-        """A character encoding contains underscore."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="Shift_JIS">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
-
-    def test_xml_encoding(self) -> None:
-        """A character encoding is found via the meta tag."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_meta_xml_encoding(self) -> None:
-        """Meta tags take precedence over XML encoding."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="ascii"?>
-        <html>
-        <head><meta charset="UTF-16">
-        </head>
-        </html>
-        """,
-            "text/html",
-        )
-        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
-
-    def test_content_type(self) -> None:
-        """A character encoding is found via the Content-Type header."""
-        # Test a few variations of the header.
-        headers = (
-            'text/html; charset="ascii";',
-            "text/html;charset=ascii;",
-            'text/html; charset="ascii"',
-            "text/html; charset=ascii",
-            'text/html; charset="ascii;',
-            'text/html; charset=ascii";',
-        )
-        for header in headers:
-            encodings = _get_html_media_encodings(b"", header)
-            self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
-
-    def test_fallback(self) -> None:
-        """A character encoding cannot be found in the body or header."""
-        encodings = _get_html_media_encodings(b"", "text/html")
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_duplicates(self) -> None:
-        """Ensure each encoding is only attempted once."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <?xml version="1.0" encoding="utf8"?>
-        <html>
-        <head><meta charset="UTF-8">
-        </head>
-        </html>
-        """,
-            'text/html; charset="UTF_8"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-    def test_unknown_invalid(self) -> None:
-        """A character encoding should be ignored if it is unknown or invalid."""
-        encodings = _get_html_media_encodings(
-            b"""
-        <html>
-        <head><meta charset="invalid">
-        </head>
-        </html>
-        """,
-            'text/html; charset="invalid"',
-        )
-        self.assertEqual(list(encodings), ["utf-8", "cp1252"])