Disconnect background process work from request trace (#18932)

Before https://github.com/element-hq/synapse/pull/18849, we were using
our own custom `LogContextScopeManager`, which tied the tracing scope to
the `LoggingContext`. Since we created a new
`BackgroundProcessLoggingContext` any time we called
`run_as_background_process(...)`, the trace for the background work was
separate from the trace that kicked off the work, as desired (e.g. the
request trace is separate from the background process we kicked off to
fetch more messages over federation).

Since we've now switched to the `ContextVarsScopeManager` (in
https://github.com/element-hq/synapse/pull/18849), the tracing scope
crosses `LoggingContext` boundaries (and thread boundaries) without a
problem. This means we end up with request traces that include all of
the background work we've kicked off, bloating the trace and making it
hard to understand what's going on.

This PR separates the traces again, restoring the previous behaviour.
Additionally, things are even better now: I've added cross-link
references between the traces so it's easy to jump between them.

Follow-up to https://github.com/element-hq/synapse/pull/18849
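
For reference, here's roughly what the new flow inside `run_as_background_process` does when there's already an active (request) span. This is a simplified sketch of the change in this PR: the `run_in_own_trace` wrapper is just for illustration (it's not a real helper), and tags, error handling, and the no-active-span / `bg_start_span=False` cases are omitted.

```python
from typing import Any, Awaitable, Callable

from synapse.logging.opentracing import (
    active_span,
    start_active_span,
    start_active_span_follows_from,
)


async def run_in_own_trace(desc: str, func: Callable[[], Awaitable[Any]]) -> Any:
    """Sketch: run `func` in its own trace, cross-linked to the current one."""
    # The request's span that kicked off this background work.
    # (The real code also handles the case where there is no active span.)
    request_span = active_span()

    # 1. Start a *disconnected* root span for the background work, so it gets
    #    its own trace instead of bloating the request trace.
    root_scope = start_active_span(f"bgproc.{desc}", ignore_active_span=True)

    # 2. Drop a tiny marker span into the request trace that cross-links
    #    (via a FOLLOWS_FROM reference) to the background process trace,
    #    then finish it immediately.
    with start_active_span_follows_from(
        f"start_bgproc.{desc}",
        child_of=request_span,
        ignore_active_span=True,
        contexts=[root_scope.span.context],
    ):
        pass

    # 3. Run the actual work under a child span of the new root, which also
    #    carries a FOLLOWS_FROM reference back to the request span so it's
    #    easy to jump between the two traces.
    child_scope = start_active_span_follows_from(
        f"bgproc_child.{desc}",
        child_of=root_scope.span,
        ignore_active_span=True,
        contexts=[request_span.context],
    )
    with root_scope, child_scope:
        return await func()
```

In the request trace you only see the small `start_bgproc.{desc}` marker, while the actual work lives under `bgproc.{desc}` / `bgproc_child.{desc}` in its own trace.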

---

In the before, you can see that the trace is blown up by the background
process (`bgproc.qwer`).

In the after, we only have a little cross-link marker span
(`start_bgproc.qwer`) to jump to the background process trace.

Before | After
---  | ---
<some image> | <some image>



### Testing strategy

1. Run a Jaeger instance
(https://www.jaegertracing.io/docs/1.6/getting-started/)
    ```shell
    $ docker run -d --name jaeger \
      -e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
      -p 5775:5775/udp \
      -p 6831:6831/udp \
      -p 6832:6832/udp \
      -p 5778:5778 \
      -p 16686:16686 \
      -p 14268:14268 \
      -p 9411:9411 \
      jaegertracing/all-in-one:1.59.0
    ```
1. Configure Synapse to use tracing in `homeserver.yaml`:
    ```yaml
    ## Tracing ##
    opentracing:
      enabled: true
      jaeger_config:
        sampler:
          type: const
          param: 1
        logging: false
    ```
1. Make sure the optional `opentracing` dependency is installed: `poetry
install --extras all`
1. Modify the `VersionsRestServlet` to kick off a dummy background process
(it's an easy endpoint to test with):
    ```python
    from synapse.metrics.background_process_metrics import run_as_background_process

    async def _qwer() -> None:
        await self.clock.sleep(1)

    run_as_background_process("qwer", "test_server", _qwer)
    ```
1. Run Synapse: `poetry run synapse_homeserver --config-path
homeserver.yaml`
1. Fire off a `/versions` request: `curl http://localhost:8008/_matrix/client/versions`
1. Visit http://localhost:16686/search to view the traces (a command-line alternative is sketched below)
    - Select the correct service
    - Look for the `VersionsRestServlet` operation
    - Press the 'Find Traces' button
    - Select the relevant trace
    - Notice how the trace isn't bloated
    - Look for the `start_bgproc.qwer` span cross-linking to the background process
    - Jump to the other trace using the cross-link reference -> `bgproc.qwer`
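
If you'd rather sanity-check from the command line, the Jaeger UI also exposes an (internal, unversioned) HTTP API on the same port. Something along these lines should list recent traces for the operation; treat the endpoint and parameters as an assumption about Jaeger's UI API rather than a stable interface:

```shell
# Internal Jaeger UI API (not a stable/documented interface); replace
# <service-name> with whatever service shows up in the Jaeger UI dropdown.
$ curl "http://localhost:16686/api/traces?service=<service-name>&operation=VersionsRestServlet&limit=5"
```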
---

Commit 04721c85e6 (parent d2a966f922) by Eric Eastwood, 2025-09-25 21:45:18 -05:00, committed by GitHub.
5 changed files with 116 additions and 9 deletions.

`changelog.d/18932.misc` (new file)

```diff
@@ -0,0 +1 @@
+Disconnect background process work from request trace.
```


```diff
@@ -119,7 +119,6 @@ class InviteAutoAccepter:
                 event.state_key,
                 event.room_id,
                 "join",
-                bg_start_span=False,
             )
             if is_direct_message:
```


```diff
@@ -576,7 +576,9 @@ def start_active_span_follows_from(
     operation_name: str,
     contexts: Collection,
     child_of: Optional[Union["opentracing.Span", "opentracing.SpanContext"]] = None,
+    tags: Optional[Dict[str, str]] = None,
     start_time: Optional[float] = None,
+    ignore_active_span: bool = False,
     *,
     inherit_force_tracing: bool = False,
     tracer: Optional["opentracing.Tracer"] = None,
@@ -591,9 +593,16 @@
         span will be the parent. (If there is no currently active span, the first
         span in `contexts` will be the parent.)
+        tags: an optional dictionary of span tags. The caller gives up ownership of that
+            dictionary, because the :class:`Tracer` may use it as-is to avoid extra data
+            copying.
+        start_time: optional override for the start time of the created span. Seconds
+            since the epoch.
+        ignore_active_span: an explicit flag that ignores the current active
+            scope and creates a root span.
         inherit_force_tracing: if set, and any of the previous contexts have had tracing
             forced, the new span will also have tracing forced.
         tracer: override the opentracing tracer. By default the global tracer is used.
@@ -606,7 +615,9 @@
         operation_name,
         child_of=child_of,
         references=references,
+        tags=tags,
         start_time=start_time,
+        ignore_active_span=ignore_active_span,
         tracer=tracer,
     )
```


```diff
@@ -20,7 +20,7 @@
 import logging
 import threading
-from contextlib import nullcontext
+from contextlib import contextmanager, nullcontext
 from functools import wraps
 from types import TracebackType
 from typing import (
@@ -28,7 +28,9 @@ from typing import (
     Any,
     Awaitable,
     Callable,
+    ContextManager,
     Dict,
+    Generator,
     Iterable,
     Optional,
     Protocol,
@@ -49,7 +51,12 @@ from synapse.logging.context import (
     LoggingContext,
     PreserveLoggingContext,
 )
-from synapse.logging.opentracing import SynapseTags, start_active_span
+from synapse.logging.opentracing import (
+    SynapseTags,
+    active_span,
+    start_active_span,
+    start_active_span_follows_from,
+)
 from synapse.metrics import SERVER_NAME_LABEL
 from synapse.metrics._types import Collector
@@ -264,15 +271,97 @@ def run_as_background_process(
         with BackgroundProcessLoggingContext(
             name=desc, server_name=server_name, instance_id=count
-        ) as context:
+        ) as logging_context:
             try:
                 if bg_start_span:
-                    ctx = start_active_span(
-                        f"bgproc.{desc}", tags={SynapseTags.REQUEST_ID: str(context)}
-                    )
+                    original_active_tracing_span = active_span()
+                    # If there is already an active span (e.g. because this background
+                    # process was started as part of handling a request), and because
+                    # this is a long-running background task that may serve a broader
+                    # purpose than the request that kicked it off, we don't want it to
+                    # be a direct child of the currently active trace connected to the
+                    # request. We only want a loose reference to jump between the
+                    # traces.
+                    #
+                    # For example, when making a `/messages` request, when approaching a
+                    # gap, we may kick off a background process to fetch missing events
+                    # from federation. The `/messages` request trace shouldn't include
+                    # the entire time taken and details around fetching the missing
+                    # events since the request doesn't rely on the result, it was just
+                    # part of the heuristic to initiate things.
+                    #
+                    # We don't care about the value from the context manager as it's not
+                    # used (so we just use `Any` for the type). Ideally, we'd be able to
+                    # mark this as unused like an `assert_never` of sorts.
+                    tracing_scope: ContextManager[Any]
+                    if original_active_tracing_span is not None:
+                        # With the OpenTracing client that we're using, it's impossible to
+                        # create a disconnected root span while also providing `references`,
+                        # so we first create a bare root span, then create a child span that
+                        # includes the references that we want.
+                        root_tracing_scope = start_active_span(
+                            f"bgproc.{desc}",
+                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
+                            # Create a root span for the background process (disconnected
+                            # from other spans)
+                            ignore_active_span=True,
+                        )
+                        # Also add a span in the original request trace that cross-links
+                        # to the background process trace. We immediately finish the span
+                        # as this is just a marker to follow where the real work is being
+                        # done.
+                        #
+                        # In OpenTracing, `FOLLOWS_FROM` indicates a parent-child
+                        # relationship whereas we just want a cross-link to the
+                        # downstream trace. This is a bit hacky, but the closest we
+                        # can get to in OpenTracing land. If we ever migrate to
+                        # OpenTelemetry, we should use a normal `Link` for this.
+                        with start_active_span_follows_from(
+                            f"start_bgproc.{desc}",
+                            child_of=original_active_tracing_span,
+                            ignore_active_span=True,
+                            # Points to the background process span.
+                            contexts=[root_tracing_scope.span.context],
+                        ):
+                            pass
+                        # Then start the tracing scope that we're going to use for
+                        # the duration of the background process within the root
+                        # span we just created.
+                        child_tracing_scope = start_active_span_follows_from(
+                            f"bgproc_child.{desc}",
+                            child_of=root_tracing_scope.span,
+                            ignore_active_span=True,
+                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
+                            # Create the `FOLLOWS_FROM` reference to the request's
+                            # span so there is a loose coupling between the two
+                            # traces and it's easy to jump between.
+                            contexts=[original_active_tracing_span.context],
+                        )
+
+                        # For easy usage down below, we create a context manager that
+                        # combines both scopes.
+                        @contextmanager
+                        def combined_context_manager() -> Generator[None, None, None]:
+                            with root_tracing_scope, child_tracing_scope:
+                                yield
+
+                        tracing_scope = combined_context_manager()
+                    else:
+                        # Otherwise, when there is no active span, we will be creating
+                        # a disconnected root span already and we don't have to
+                        # worry about cross-linking to anything.
+                        tracing_scope = start_active_span(
+                            f"bgproc.{desc}",
+                            tags={SynapseTags.REQUEST_ID: str(logging_context)},
+                        )
                 else:
-                    ctx = nullcontext()  # type: ignore[assignment]
-                with ctx:
+                    tracing_scope = nullcontext()
+                with tracing_scope:
                     return await func(*args, **kwargs)
             except Exception:
                 logger.exception(
```


```diff
@@ -250,6 +250,13 @@ class RedisSubscriber(SubscriberProtocol):
             self.server_name,
             self._async_send_command,
             cmd,
+            # We originally started tracing background processes to avoid `There was no
+            # active span` errors but this change meant we started generating 15x the
+            # number of spans as before (this is one of the most heavily called
+            # instances of `run_as_background_process`).
+            #
+            # Since we don't log or tag a tracing span in the downstream
+            # code, we can safely disable this.
+            bg_start_span=False,
         )
```