Compare commits

...

10 Commits

Author SHA1 Message Date
Jason Robinson
835da407c8 Set # syntax=docker/dockerfile:experimental
For buildkit in some environments.

Signed-off-by: Jason Robinson <jasonr@matrix.org>
2022-04-25 11:47:45 +03:00
Richard van der Hoff
46a64f473d changelog 2022-04-22 15:48:39 +01:00
Richard van der Hoff
cec8ef5890 add a TODO comment 2022-04-22 15:48:39 +01:00
Richard van der Hoff
0f48724e00 Only filter extremities we care about
... and then request the extremities we have filtered.
2022-04-22 15:48:38 +01:00
Richard van der Hoff
7780a71abf Apply the visibility check to the extremities we are considering
This means that we will check the insertion events, and we won't check
extremities we have already ruled out for depth.
2022-04-22 15:48:38 +01:00
Richard van der Hoff
55c6252edd Rename more variables
These aren't forward extremities, or indeed extremities of any kind, so their
names were confusing.
2022-04-22 15:48:38 +01:00
Richard van der Hoff
6e2140364b update some comments and logs 2022-04-22 15:48:38 +01:00
Richard van der Hoff
6e5663b291 Move some filtering and sorting logic earlier
We can potentially skip some expensive db work by moving this synchronous code
earlier. The `sorted` might be expensive, but nowhere near as expensive as the
db lookups.
2022-04-22 13:34:27 +01:00
Richard van der Hoff
b81cab0794 skip a dict construction
we may as well just chain together the two inputs
2022-04-22 13:32:00 +01:00
Richard van der Hoff
d8dc12a95c Rename a couple of variables
These are both *lists* of tuples, so let's give them a name that reflects that
2022-04-22 13:29:09 +01:00
4 changed files with 119 additions and 81 deletions

1
changelog.d/12522.misc Normal file
View File

@@ -0,0 +1 @@
Optimise room backfill, to reduce memory usage.

View File

@@ -1,3 +1,4 @@
# syntax=docker/dockerfile:experimental
# Dockerfile to build the matrixdotorg/synapse docker images.
#
# Note that it uses features which are only available in BuildKit - see

View File

@@ -1,4 +1,4 @@
# Copyright 2014-2021 The Matrix.org Foundation C.I.C.
# Copyright 2014-2022 The Matrix.org Foundation C.I.C.
# Copyright 2020 Sorunome
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,9 +15,10 @@
"""Contains handlers for federation events."""
import itertools
import logging
from http import HTTPStatus
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Tuple, Union
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
@@ -178,68 +179,24 @@ class FederationHandler:
logger.debug("Not backfilling as no extremeties found.")
return False
# We only want to paginate if we can actually see the events we'll get,
# as otherwise we'll just spend a lot of resources to get redacted
# events.
#
# We do this by filtering all the backwards extremities and seeing if
# any remain. Given we don't have the extremity events themselves, we
# need to actually check the events that reference them.
#
# *Note*: the spec wants us to keep backfilling until we reach the start
# of the room in case we are allowed to see some of the history. However
# in practice that causes more issues than its worth, as a) its
# relatively rare for there to be any visible history and b) even when
# there is its often sufficiently long ago that clients would stop
# attempting to paginate before backfill reached the visible history.
#
# TODO: If we do do a backfill then we should filter the backwards
# extremities to only include those that point to visible portions of
# history.
#
# TODO: Correctly handle the case where we are allowed to see the
# forward event but not the backward extremity, e.g. in the case of
# initial join of the server where we are allowed to see the join
# event but not anything before it. This would require looking at the
# state *before* the event, ignoring the special casing certain event
# types have.
forward_event_ids = await self.store.get_successor_events(
list(oldest_events_with_depth)
)
extremities_events = await self.store.get_events(
forward_event_ids,
redact_behaviour=EventRedactBehaviour.AS_IS,
get_prev_content=False,
)
# We set `check_history_visibility_only` as we might otherwise get false
# positives from users having been erased.
filtered_extremities = await filter_events_for_server(
self.storage,
self.server_name,
list(extremities_events.values()),
redact=False,
check_history_visibility_only=True,
# we now have a list of potential places to backpaginate from. We prefer to
# start with the most recent (ie, max depth), so let's sort the list.
sorted_extremeties_tuples: List[Tuple[str, int]] = sorted(
itertools.chain(
oldest_events_with_depth.items(),
insertion_events_to_be_backfilled.items(),
),
key=lambda e: -int(e[1]),
)
logger.debug(
"_maybe_backfill_inner: filtered_extremities %s", filtered_extremities
"_maybe_backfill_inner: room_id: %s: current_depth: %s, limit: %s, extrems (%d): %s",
room_id,
current_depth,
limit,
len(sorted_extremeties_tuples),
sorted_extremeties_tuples,
)
if not filtered_extremities and not insertion_events_to_be_backfilled:
return False
extremities = {
**oldest_events_with_depth,
# TODO: insertion_events_to_be_backfilled is currently skipping the filtered_extremities checks
**insertion_events_to_be_backfilled,
}
# Check if we reached a point where we should start backfilling.
sorted_extremeties_tuple = sorted(extremities.items(), key=lambda e: -int(e[1]))
max_depth = sorted_extremeties_tuple[0][1]
# If we're approaching an extremity we trigger a backfill, otherwise we
# no-op.
#
@@ -249,6 +206,11 @@ class FederationHandler:
# chose more than one times the limit in case of failure, but choosing a
# much larger factor will result in triggering a backfill request much
# earlier than necessary.
#
# XXX: shouldn't we do this *after* the filter by depth below? Again, we don't
# care about events that have happened after our current position.
#
max_depth = sorted_extremeties_tuples[0][1]
if current_depth - 2 * limit > max_depth:
logger.debug(
"Not backfilling as we don't need to. %d < %d - 2 * %d",
@@ -265,31 +227,98 @@ class FederationHandler:
# 2. we have likely previously tried and failed to backfill from that
# extremity, so to avoid getting "stuck" requesting the same
# backfill repeatedly we drop those extremities.
filtered_sorted_extremeties_tuple = [
t for t in sorted_extremeties_tuple if int(t[1]) <= current_depth
]
logger.debug(
"room_id: %s, backfill: current_depth: %s, limit: %s, max_depth: %s, extrems (%d): %s filtered_sorted_extremeties_tuple: %s",
room_id,
current_depth,
limit,
max_depth,
len(sorted_extremeties_tuple),
sorted_extremeties_tuple,
filtered_sorted_extremeties_tuple,
)
#
# However, we need to check that the filtered extremities are non-empty.
# If they are empty then either we can a) bail or b) still attempt to
# backfill. We opt to try backfilling anyway just in case we do get
# relevant events.
if filtered_sorted_extremeties_tuple:
sorted_extremeties_tuple = filtered_sorted_extremeties_tuple
#
filtered_sorted_extremeties_tuples = [
t for t in sorted_extremeties_tuples if int(t[1]) <= current_depth
]
if filtered_sorted_extremeties_tuples:
logger.debug(
"_maybe_backfill_inner: extrems before current depth: %s",
filtered_sorted_extremeties_tuples,
)
sorted_extremeties_tuples = filtered_sorted_extremeties_tuples
else:
logger.debug(
"_maybe_backfill_inner: all extrems are *after* current depth. Backfilling anyway."
)
# We don't want to specify too many extremities as it causes the backfill
# request URI to be too long.
extremities = dict(sorted_extremeties_tuple[:5])
# We still need to narrow down the list of extremities we pass to the remote
# server. We limit to 5 of them, to avoid the request URI becoming too long.
#
# However, we only want to paginate from a particular extremity if we can
# actually see the events we'll get, as otherwise we'll just spend a lot of
# resources to get redacted events.
#
# We do this by filtering all the backwards extremities and seeing if
# any remain. Given we don't have the extremity events themselves, we
# need to actually check the events that reference them - their "successor"
# events.
#
# *Note*: the spec wants us to keep backfilling until we reach the start
# of the room in case we are allowed to see some of the history. However
# in practice that causes more issues than its worth, as a) its
# relatively rare for there to be any visible history and b) even when
# there is its often sufficiently long ago that clients would stop
# attempting to paginate before backfill reached the visible history.
#
# Calculating the visibility of each extremity is quite expensive, and there
# can be thousands of them in a big gappy room, so we just check them one
# by one until we've checked them all, or we've got 5 of them.
#
# TODO: Correctly handle the case where we are allowed to see the
# successor event but not the backward extremity, e.g. in the case of
# initial join of the server where we are allowed to see the join
# event but not anything before it. This would require looking at the
# state *before* the event, ignoring the special casing certain event
# types have.
extremities_to_request: Set[str] = set()
for extremity_event_id, _ in sorted_extremeties_tuples:
if len(extremities_to_request) >= 5:
break
successor_event_ids = await self.store.get_successor_events(
[extremity_event_id]
)
successor_events = await self.store.get_events_as_list(
successor_event_ids,
redact_behaviour=EventRedactBehaviour.AS_IS,
get_prev_content=False,
)
# We set `check_history_visibility_only` as we might otherwise get false
# positives from users having been erased.
filtered_extremities = await filter_events_for_server(
self.storage,
self.server_name,
successor_events,
redact=False,
check_history_visibility_only=True,
)
if filtered_extremities:
extremities_to_request.add(extremity_event_id)
else:
logger.debug(
"_maybe_backfill_inner: skipping extremity %s as it would not be visible.",
extremity_event_id,
)
if not extremities_to_request:
logger.debug(
"_maybe_backfill_inner: found no extremities which would be visible"
)
return False
logger.debug(
"_maybe_backfill_inner: extremities_to_request %s", extremities_to_request
)
# Now we need to decide which hosts to hit first.
@@ -309,7 +338,7 @@ class FederationHandler:
for dom in domains:
try:
await self._federation_event_handler.backfill(
dom, room_id, limit=100, extremities=extremities
dom, room_id, limit=100, extremities=extremities_to_request
)
# If this succeeded then we probably already have the
# appropriate stuff.

View File

@@ -419,6 +419,13 @@ async def _event_to_memberships(
return {}
# for each event, get the event_ids of the membership state at those events.
#
# TODO: this means that we request the entire membership list. If there are only
# one or two users on this server, and the room is huge, this is very wasteful
# (it means more db work, and churns the *stateGroupMembersCache*).
# It might be that we could extend StateFilter to specify "give me keys matching
# *:<server_name>", to avoid this.
event_to_state_ids = await storage.state.get_state_ids_for_events(
frozenset(e.event_id for e in events),
state_filter=StateFilter.from_types(types=((EventTypes.Member, None),)),