Disconnect background process work from request trace (#18932)
Before https://github.com/element-hq/synapse/pull/18849, we were using our own custom `LogContextScopeManager` which tied the tracing scope to the `LoggingContext`. Since we created a new `BackgroundProcessLoggingContext` any time we called `run_as_background_process(...)`, the trace for the background work was separate from the trace that kicked off the work, as expected (e.g. a request trace is separate from the background process we kicked off to fetch more messages over federation). Since we've now switched to the `ContextVarsScopeManager` (in https://github.com/element-hq/synapse/pull/18849), the tracing scope crosses `LoggingContext` boundaries (and thread boundaries) without a problem. This means we end up with request traces that include all of the background work we've kicked off, bloating the trace and making it hard to understand what's going on.

This PR separates the traces again, as they were before. Additionally, things are even better now since I added cross-link references between the traces so it's easy to jump between them.

Follow-up to https://github.com/element-hq/synapse/pull/18849

---

In the before, you can see that the trace is blown up by the background process (`bgproc.qwer`). In the after, we now only have a little cross-link marker span (`start_bgproc.qwer`) to jump to the background process trace.

Before | After
--- | ---
<some image> | <some image>

### Testing strategy

1. Run a Jaeger instance (https://www.jaegertracing.io/docs/1.6/getting-started/)

   ```shell
   $ docker run -d --name jaeger \
      -e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
      -p 5775:5775/udp \
      -p 6831:6831/udp \
      -p 6832:6832/udp \
      -p 5778:5778 \
      -p 16686:16686 \
      -p 14268:14268 \
      -p 9411:9411 \
      jaegertracing/all-in-one:1.59.0
   ```
1. Configure Synapse to use tracing in `homeserver.yaml`:

   ```yaml
   ## Tracing ##
   opentracing:
     enabled: true
     jaeger_config:
       sampler:
         type: const
         param: 1
       logging: false
   ```
1. Make sure the optional `opentracing` dependency is installed: `poetry install --extras all`
1. Modify the `VersionsRestServlet` to kick off a dummy background process (easy to test this way):

   ```python
   from synapse.metrics.background_process_metrics import run_as_background_process

   async def _qwer() -> None:
       await self.clock.sleep(1)

   run_as_background_process("qwer", "test_server", _qwer)
   ```
1. Run Synapse: `poetry run synapse_homeserver --config-path homeserver.yaml`
1. Fire off a versions request: `curl http://localhost:8008/_matrix/client/versions`
1. Visit http://localhost:16686/search to view the traces
   - Select the correct service
   - Look for the `VersionsRestServlet` operation
   - Press the 'Find Traces' button
   - Select the relevant trace
   - Notice how the trace isn't bloated
   - Look for the `start_bgproc.qwer` span cross-linking to the background process
   - Jump to the other trace using the cross-link reference -> `bgproc.qwer`
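For orientation before reading the diff: the core mechanism, stripped of the metrics and logging-context plumbing, boils down to roughly the following (a simplified sketch based on the changes below; `run_disconnected` and `do_background_work` are made-up names for illustration, not Synapse APIs):

```python
from typing import Awaitable, Callable

from synapse.logging.opentracing import (
    active_span,
    start_active_span,
    start_active_span_follows_from,
)


async def run_disconnected(
    desc: str, do_background_work: Callable[[], Awaitable[None]]
) -> None:
    """Hypothetical helper mirroring what run_as_background_process now does."""
    request_span = active_span()

    # 1. Give the background work its own disconnected root span, i.e. a new trace.
    with start_active_span(f"bgproc.{desc}", ignore_active_span=True) as bgproc_scope:
        if request_span is not None:
            # 2. Leave an immediately-finished marker span in the request trace that
            #    cross-links (via a FOLLOWS_FROM reference) to the background trace,
            #    so it's easy to jump between the two in Jaeger.
            with start_active_span_follows_from(
                f"start_bgproc.{desc}",
                contexts=[bgproc_scope.span.context],
                child_of=request_span,
                ignore_active_span=True,
            ):
                pass

        # 3. Run the actual work inside the disconnected trace.
        await do_background_work()
```

The real implementation additionally starts a `bgproc_child.*` span that carries a `FOLLOWS_FROM` reference back to the request span, so the cross-link works in both directions.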
changelog.d/18932.misc (new file):
@@ -0,0 +1 @@
Disconnect background process work from request trace.
@@ -119,7 +119,6 @@ class InviteAutoAccepter:
    event.state_key,
    event.room_id,
    "join",
    bg_start_span=False,
)

if is_direct_message:
@@ -576,7 +576,9 @@ def start_active_span_follows_from(
    operation_name: str,
    contexts: Collection,
    child_of: Optional[Union["opentracing.Span", "opentracing.SpanContext"]] = None,
    tags: Optional[Dict[str, str]] = None,
    start_time: Optional[float] = None,
    ignore_active_span: bool = False,
    *,
    inherit_force_tracing: bool = False,
    tracer: Optional["opentracing.Tracer"] = None,
@@ -591,9 +593,16 @@ def start_active_span_follows_from(
        span will be the parent. (If there is no currently active span, the first
        span in `contexts` will be the parent.)

        tags: an optional dictionary of span tags. The caller gives up ownership of that
            dictionary, because the :class:`Tracer` may use it as-is to avoid extra data
            copying.

        start_time: optional override for the start time of the created span. Seconds
            since the epoch.

        ignore_active_span: an explicit flag that ignores the current active
            scope and creates a root span.

        inherit_force_tracing: if set, and any of the previous contexts have had tracing
            forced, the new span will also have tracing forced.
        tracer: override the opentracing tracer. By default the global tracer is used.
@@ -606,7 +615,9 @@ def start_active_span_follows_from(
        operation_name,
        child_of=child_of,
        references=references,
        tags=tags,
        start_time=start_time,
        ignore_active_span=ignore_active_span,
        tracer=tracer,
    )
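For context on what the `references` plumbing above amounts to, this is how a `FOLLOWS_FROM` reference looks with the plain `opentracing` package (an illustrative sketch, not Synapse code; it runs against the no-op default tracer):

```python
import opentracing

tracer = opentracing.global_tracer()

# A span whose context we want to point at from another span.
with tracer.start_active_span("producer") as producer_scope:
    pass

# A span that is not parented (CHILD_OF) under "producer", but merely carries a
# loose FOLLOWS_FROM reference to it.
with tracer.start_active_span(
    "consumer",
    references=[opentracing.follows_from(producer_scope.span.context)],
    ignore_active_span=True,
):
    pass
```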
@@ -20,7 +20,7 @@

import logging
import threading
from contextlib import nullcontext
from contextlib import contextmanager, nullcontext
from functools import wraps
from types import TracebackType
from typing import (
@@ -28,7 +28,9 @@ from typing import (
    Any,
    Awaitable,
    Callable,
    ContextManager,
    Dict,
    Generator,
    Iterable,
    Optional,
    Protocol,
@@ -49,7 +51,12 @@ from synapse.logging.context import (
    LoggingContext,
    PreserveLoggingContext,
)
from synapse.logging.opentracing import SynapseTags, start_active_span
from synapse.logging.opentracing import (
    SynapseTags,
    active_span,
    start_active_span,
    start_active_span_follows_from,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics._types import Collector
@@ -264,15 +271,97 @@ def run_as_background_process(

        with BackgroundProcessLoggingContext(
            name=desc, server_name=server_name, instance_id=count
        ) as context:
        ) as logging_context:
            try:
                if bg_start_span:
                    ctx = start_active_span(
                        f"bgproc.{desc}", tags={SynapseTags.REQUEST_ID: str(context)}
                    )
                    original_active_tracing_span = active_span()

                    # If there is already an active span (e.g. because this background
                    # process was started as part of handling a request for example),
                    # because this is a long-running background task that may serve a
                    # broader purpose than the request that kicked it off, we don't want
                    # it to be a direct child of the currently active trace connected to
                    # the request. We only want a loose reference to jump between the
                    # traces.
                    #
                    # For example, when making a `/messages` request, when approaching a
                    # gap, we may kick off a background process to fetch missing events
                    # from federation. The `/messages` request trace shouldn't include
                    # the entire time taken and details around fetching the missing
                    # events since the request doesn't rely on the result; it was just
                    # part of the heuristic to initiate things.
                    #
                    # We don't care about the value from the context manager as it's not
                    # used (so we just use `Any` for the type). Ideally, we'd be able to
                    # mark this as unused like an `assert_never` of sorts.
                    tracing_scope: ContextManager[Any]
                    if original_active_tracing_span is not None:
                        # With the OpenTracing client that we're using, it's impossible to
                        # create a disconnected root span while also providing `references`
                        # so we first create a bare root span, then create a child span that
                        # includes the references that we want.
                        root_tracing_scope = start_active_span(
                            f"bgproc.{desc}",
                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
                            # Create a root span for the background process (disconnected
                            # from other spans)
                            ignore_active_span=True,
                        )

                        # Also add a span in the original request trace that cross-links
                        # to the background process trace. We immediately finish the span
                        # as this is just a marker to follow where the real work is being
                        # done.
                        #
                        # In OpenTracing, `FOLLOWS_FROM` indicates a parent-child
                        # relationship whereas we just want a cross-link to the
                        # downstream trace. This is a bit hacky, but the closest we
                        # can get in OpenTracing land. If we ever migrate to
                        # OpenTelemetry, we should use a normal `Link` for this.
                        with start_active_span_follows_from(
                            f"start_bgproc.{desc}",
                            child_of=original_active_tracing_span,
                            ignore_active_span=True,
                            # Points to the background process span.
                            contexts=[root_tracing_scope.span.context],
                        ):
                            pass

                        # Then start the tracing scope that we're going to use for
                        # the duration of the background process within the root
                        # span we just created.
                        child_tracing_scope = start_active_span_follows_from(
                            f"bgproc_child.{desc}",
                            child_of=root_tracing_scope.span,
                            ignore_active_span=True,
                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
                            # Create the `FOLLOWS_FROM` reference to the request's
                            # span so there is a loose coupling between the two
                            # traces and it's easy to jump between.
                            contexts=[original_active_tracing_span.context],
                        )

                        # For easy usage down below, we create a context manager that
                        # combines both scopes.
                        @contextmanager
                        def combined_context_manager() -> Generator[None, None, None]:
                            with root_tracing_scope, child_tracing_scope:
                                yield

                        tracing_scope = combined_context_manager()

                    else:
                        # Otherwise, when there is no active span, we will be creating
                        # a disconnected root span already and we don't have to
                        # worry about cross-linking to anything.
                        tracing_scope = start_active_span(
                            f"bgproc.{desc}",
                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
                        )
                else:
                    ctx = nullcontext()  # type: ignore[assignment]
                with ctx:
                    tracing_scope = nullcontext()

                with tracing_scope:
                    return await func(*args, **kwargs)
            except Exception:
                logger.exception(
@@ -250,6 +250,13 @@ class RedisSubscriber(SubscriberProtocol):
            self.server_name,
            self._async_send_command,
            cmd,
            # We originally started tracing background processes to avoid `There was no
            # active span` errors but this change meant we started generating 15x the
            # number of spans compared to before (this is one of the most heavily called
            # instances of `run_as_background_process`).
            #
            # Since we don't log or tag a tracing span in the downstream
            # code, we can safely disable this.
            bg_start_span=False,
        )
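For comparison with the `bg_start_span=False` usage above, opting a hot code path out of span creation looks roughly like this (a hedged sketch; the description, server name, and coroutine are made-up placeholders):

```python
from synapse.metrics.background_process_metrics import run_as_background_process


async def _send_command() -> None:
    ...  # placeholder for the actual work


# For a very frequently invoked background process whose downstream code never
# logs to or tags a tracing span, skip creating the `bgproc.*` span entirely.
run_as_background_process(
    "send-command",    # hypothetical description
    "example.host",    # hypothetical server name
    _send_command,
    bg_start_span=False,
)
```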