1
0

Add Prometheus HTTP service discovery endpoint for easy discovery of all workers in Docker image (#19336)

Add Prometheus [HTTP service discovery](https://prometheus.io/docs/prometheus/latest/http_sd/)
endpoint for easy discovery of all workers in Docker image.

Follow-up to https://github.com/element-hq/synapse/pull/19324

Spawning from wanting to [run a load
test](https://github.com/element-hq/synapse-rust-apps/pull/397) against
the Complement Docker image of Synapse and see metrics from the
homeserver.


`GET http://<synapse_container>:9469/metrics/service_discovery`
```json5
[
  {
    "targets": [ "<host>", ... ],
    "labels": {
      "<labelname>": "<labelvalue>", ...
    }
  },
  ...
]
```

The metrics from each worker can also be accessed via
`http://<synapse_container>:9469/metrics/worker/<worker_name>` which is
what the service discovery response points to behind the scenes. This
way, you only need to expose a single port (9469) to access all metrics.

<details>
<summary>Real HTTP service discovery response</summary>

```json5
[
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "event_persister",
            "index": "1",
            "__metrics_path__": "/metrics/worker/event_persister1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "event_persister",
            "index": "2",
            "__metrics_path__": "/metrics/worker/event_persister2"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "background_worker",
            "index": "1",
            "__metrics_path__": "/metrics/worker/background_worker1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "event_creator",
            "index": "1",
            "__metrics_path__": "/metrics/worker/event_creator1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "user_dir",
            "index": "1",
            "__metrics_path__": "/metrics/worker/user_dir1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "media_repository",
            "index": "1",
            "__metrics_path__": "/metrics/worker/media_repository1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "federation_inbound",
            "index": "1",
            "__metrics_path__": "/metrics/worker/federation_inbound1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "federation_reader",
            "index": "1",
            "__metrics_path__": "/metrics/worker/federation_reader1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "federation_sender",
            "index": "1",
            "__metrics_path__": "/metrics/worker/federation_sender1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "synchrotron",
            "index": "1",
            "__metrics_path__": "/metrics/worker/synchrotron1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "client_reader",
            "index": "1",
            "__metrics_path__": "/metrics/worker/client_reader1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "appservice",
            "index": "1",
            "__metrics_path__": "/metrics/worker/appservice1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "pusher",
            "index": "1",
            "__metrics_path__": "/metrics/worker/pusher1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "device_lists",
            "index": "1",
            "__metrics_path__": "/metrics/worker/device_lists1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "device_lists",
            "index": "2",
            "__metrics_path__": "/metrics/worker/device_lists2"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "stream_writers",
            "index": "1",
            "__metrics_path__": "/metrics/worker/stream_writers1"
        }
    },
    {
        "targets": [
            "localhost:9469"
        ],
        "labels": {
            "job": "main",
            "index": "1",
            "__metrics_path__": "/metrics/worker/main"
        }
    }
]
```

</details>


And how it ends up as targets in Prometheus
(http://localhost:9090/targets):

(image)


### Testing strategy

1. Make sure your firewall allows the Docker containers to communicate
with the host (`host.docker.internal`) so they can access exposed ports of
other Docker containers. We want to allow Synapse to access the
Prometheus container and Grafana to access the Prometheus container.
- `sudo ufw allow in on docker0 comment "Allow traffic from the default
Docker network to the host machine (host.docker.internal)"`
- `sudo ufw allow in on br-+ comment "(from Matrix Complement testing)
Allow traffic from custom Docker networks to the host machine
(host.docker.internal)"`
- [Complement firewall
docs](ee6acd9154/README.md (potential-conflict-with-firewall-software))
1. Build the Docker image for Synapse: `docker build -t
matrixdotorg/synapse -f docker/Dockerfile . && docker build -t
matrixdotorg/synapse-workers -f docker/Dockerfile-workers .`
([docs](7a24fafbc3/docker/README-testing.md (building-and-running-the-images-manually)))
 1. Start Synapse:
     ```
    docker run -d --name synapse \
        --mount type=volume,src=synapse-data,dst=/data \
        -e SYNAPSE_SERVER_NAME=my.docker.synapse.server \
        -e SYNAPSE_REPORT_STATS=no \
        -e SYNAPSE_ENABLE_METRICS=1 \
        -p 8008:8008 \
        -p 9469:9469 \
        matrixdotorg/synapse-workers:latest
    ```
    - Also try with workers:
       ```
      docker run -d --name synapse \
          --mount type=volume,src=synapse-data,dst=/data \
          -e SYNAPSE_SERVER_NAME=my.docker.synapse.server \
          -e SYNAPSE_REPORT_STATS=no \
          -e SYNAPSE_ENABLE_METRICS=1 \
          -e SYNAPSE_WORKER_TYPES="\
              event_persister:2, \
              background_worker, \
              event_creator, \
              user_dir, \
              media_repository, \
              federation_inbound, \
              federation_reader, \
              federation_sender, \
              synchrotron, \
              client_reader, \
              appservice, \
              pusher, \
              device_lists:2, \
              stream_writers=account_data+presence+receipts+to_device+typing" \
          -p 8008:8008 \
          -p 9469:9469 \
          matrixdotorg/synapse-workers:latest
      ```
1. You should be able to see the Prometheus service discovery endpoint at
http://localhost:9469/metrics/service_discovery
 1. Create a Prometheus config (`prometheus.yml`)
    ```yaml
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
      evaluation_interval: 15s
    
    scrape_configs:
      - job_name: synapse
        scrape_interval: 15s
        metrics_path: /_synapse/metrics
        scheme: http
        # We set `honor_labels` so that each service can set their own `job` label
        #
        # > honor_labels controls how Prometheus handles conflicts between labels that are
        # > already present in scraped data and labels that Prometheus would attach
        # > server-side ("job" and "instance" labels, manually configured target
        # > labels, and labels generated by service discovery implementations).
        # >
        # > *-- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config*
        honor_labels: true
        # Use HTTP service discovery
        #
        # Reference:
        #  - https://prometheus.io/docs/prometheus/latest/http_sd/
        #  - https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config
        http_sd_configs:
          - url: 'http://localhost:9469/metrics/service_discovery'
    ```
1. Start Prometheus (update the volume bind mount to the config you just
saved somewhere):
    ```
    docker run \
        --detach \
        --name=prometheus \
        --add-host host.docker.internal:host-gateway \
        -p 9090:9090 \
        -v ~/Documents/code/random/prometheus-config/prometheus.yml:/etc/prometheus/prometheus.yml \
        prom/prometheus
    ```
1. Make sure you're seeing some data in Prometheus. On
http://localhost:9090/query, search for `synapse_build_info`
 1. Start [Grafana](https://hub.docker.com/r/grafana/grafana)
    ```
    docker run -d --name=grafana --add-host host.docker.internal:host-gateway -p 3000:3000 grafana/grafana
    ```
1. Visit the Grafana dashboard, http://localhost:3000/ (Credentials:
`admin`/`admin`)
1. **Connections** -> **Data Sources** -> **Add data source** ->
**Prometheus**
     - Prometheus server URL: `http://host.docker.internal:9090`
1. Import the Synapse dashboard:
https://github.com/element-hq/synapse/blob/develop/contrib/grafana/synapse.json
This commit is contained in:
Eric Eastwood
2026-01-14 18:02:55 -06:00
committed by GitHub
parent 58f59ffbcb
commit a1e9abc7df
6 changed files with 328 additions and 34 deletions

1
changelog.d/19336.docker Normal file
View File

@@ -0,0 +1 @@
Add [Prometheus HTTP service discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config) endpoint for easy discovery of all workers when using the `docker/Dockerfile-workers` image (see the [*Metrics* section of our Docker testing docs](docker/README-testing.md#metrics)).

View File

@@ -192,8 +192,7 @@ COPY ./docker/conf /conf
# 8448: SS Matrix API port from Synapse
EXPOSE 8008/tcp 8448/tcp
# 19090: Metrics listener port for the main process (metrics must be enabled with
# SYNAPSE_ENABLE_METRICS=1). Metrics for workers are on ports starting from 19091 but
# since these are dynamic we don't expose them by default.
# SYNAPSE_ENABLE_METRICS=1).
EXPOSE 19090/tcp
ENTRYPOINT ["/start.py"]

View File

@@ -71,6 +71,15 @@ FROM $FROM
# Expose nginx listener port
EXPOSE 8080/tcp
# Metrics for workers are on ports starting from 19091 but since these are dynamic
# we don't expose them by default (metrics must be enabled with
# SYNAPSE_ENABLE_METRICS=1)
#
# Instead, we expose a single port used for Prometheus HTTP service discovery
# (`http://<synapse_container>:9469/metrics/service_discovery`) and proxy all of the
# workers' metrics endpoints through that
# (`http://<synapse_container>:9469/metrics/worker/<worker_name>`).
EXPOSE 9469/tcp
# A script to read environment variables and create the necessary
# files to run the desired worker configuration. Will start supervisord.

View File

@@ -137,3 +137,47 @@ TLS certificate and key (respectively), both in PEM (textual) format.
In this case, Nginx will additionally serve using HTTPS on port 8448.
### Metrics
Set `SYNAPSE_ENABLE_METRICS=1` to configure `enable_metrics: true` and set up the
`metrics` listener on the main and worker processes. Defaults to `0` (disabled). The
main process will listen on port `19090` and workers on port `19091 + <worker index>`.
When using `docker/Dockerfile-workers`, to ease the complexity with the metrics setup,
we also have a [Prometheus HTTP service
discovery](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config)
endpoint available at `http://<synapse_container>:9469/metrics/service_discovery`.
The metrics from each worker can also be accessed via
`http://<synapse_container>:9469/metrics/worker/<worker_name>` which is what the service
discovery response points to behind the scenes. This way, you only need to expose a
single port (9469) to access all metrics.
```yaml
global:
scrape_interval: 15s
scrape_timeout: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: synapse
scrape_interval: 15s
metrics_path: /_synapse/metrics
scheme: http
# We set `honor_labels` so that each service can set their own `job`/`instance` label
#
# > honor_labels controls how Prometheus handles conflicts between labels that are
# > already present in scraped data and labels that Prometheus would attach
# > server-side ("job" and "instance" labels, manually configured target
# > labels, and labels generated by service discovery implementations).
# >
# > *-- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config*
honor_labels: true
# Use HTTP service discovery
#
# Reference:
# - https://prometheus.io/docs/prometheus/latest/http_sd/
# - https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config
http_sd_configs:
- url: 'http://localhost:9469/metrics/service_discovery'
```

View File

@@ -48,3 +48,5 @@ server {
proxy_set_header Host $host:$server_port;
}
}
{{ nginx_prometheus_metrics_service_discovery }}

View File

@@ -58,6 +58,7 @@
# in the project's README), this script may be run multiple times, and functionality should
# continue to work if so.
import json
import os
import platform
import re
@@ -75,6 +76,7 @@ from typing import (
SupportsIndex,
)
import attr
import yaml
from jinja2 import Environment, FileSystemLoader
@@ -341,7 +343,7 @@ WORKERS_CONFIG: dict[str, dict[str, Any]] = {
}
# Templates for sections that may be inserted multiple times in config files
NGINX_LOCATION_CONFIG_BLOCK = """
NGINX_LOCATION_REGEX_CONFIG_BLOCK = """
location ~* {endpoint} {{
proxy_pass {upstream};
proxy_set_header X-Forwarded-For $remote_addr;
@@ -350,6 +352,25 @@ NGINX_LOCATION_CONFIG_BLOCK = """
}}
"""
# Having both **regex** (`NGINX_LOCATION_REGEX_CONFIG_BLOCK`) match vs **exact**
# (`NGINX_LOCATION_EXACT_CONFIG_BLOCK`) match is necessary because we can't use a URI
# path in `proxy_pass http://localhost:19090/_synapse/metrics` with the regex version.
#
# Example of what happens if you try to use `proxy_pass http://localhost:19090/_synapse/metrics`
# with `NGINX_LOCATION_REGEX_CONFIG_BLOCK`:
# ```
# nginx | 2025/12/31 22:58:34 [emerg] 21#21: "proxy_pass" cannot have URI part in location given by regular expression, or inside named location, or inside "if" statement, or inside "limit_except" block in /etc/nginx/conf.d/matrix-synapse.conf:732
# ```
NGINX_LOCATION_EXACT_CONFIG_BLOCK = """
location = {endpoint} {{
proxy_pass {upstream};
proxy_set_header X-Forwarded-For $remote_addr;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Host $host;
}}
"""
NGINX_UPSTREAM_CONFIG_BLOCK = """
upstream {upstream_worker_base_name} {{
{body}
@@ -357,6 +378,63 @@ upstream {upstream_worker_base_name} {{
"""
PROMETHEUS_METRICS_SERVICE_DISCOVERY_FILE_PATH = (
"/data/prometheus_service_discovery.json"
)
"""
We serve this file with nginx so people can use it with `http_sd_config` in their
Prometheus config.
"""
NGINX_HOST_PLACEHOLDER = "<HOST_PLACEHOLDER>"
"""Will be replaced with whatever hostname:port is used to access the nginx metrics endpoint."""
NGINX_PROMETHEUS_METRICS_SERVICE_DISCOVERY = """
server {{
listen 9469;
location = /metrics/service_discovery {{
alias {service_discovery_file_path};
default_type application/json;
# Find/replace the host placeholder in the response body with the actual
# host used to access this endpoint.
#
# We want to reflect back whatever host the client used to access this file.
# For example, if they accessed it via `localhost:9469`, then they
# can also reach all of the proxied metrics endpoints at the same address.
# Or if it's Prometheus in another container, it will access this via
# `host.docker.internal:9469`, etc. Or perhaps it's even some randomly assigned
# port mapping.
sub_filter '{host_placeholder}' '$http_host';
# By default, `ngx_http_sub_module` only works on `text/html` responses. We want
# to find/replace in `application/JSON`.
sub_filter_types application/json;
# Replace all occurences
sub_filter_once off;
}}
# Make the service discovery endpoint easy to find; redirect to the correct spot.
location = / {{
return 302 /metrics/service_discovery;
}}
{metrics_proxy_locations}
}}
"""
"""
Setup the nginx config necessary to serve the JSON file for Prometheus HTTP service discovery
(`http_sd_config`). Served at `/metrics/service_discovery`.
Reference:
- https://prometheus.io/docs/prometheus/latest/http_sd/
- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config
We also proxy all of the Synapse metrics endpoints through a central place so that
people only need to expose the single 9469 port and service discovery can take care of
the rest: `/metrics/worker/<worker_name>` -> http://localhost:19090/_synapse/metrics
"""
# Utility functions
# Write an informational message to stdout via `print`.
def log(txt: str) -> None:
print(txt)
@@ -616,9 +694,42 @@ def generate_base_homeserver_config() -> None:
subprocess.run([sys.executable, "/start.py", "migrate_config"], check=True)
# Record describing one requested worker process. Instances are created in
# `parse_worker_types`, which passes all four fields by keyword, and are consumed
# by `generate_worker_files`.
@attr.s(auto_attribs=True)
class Worker:
# Full name of this worker instance (see the examples in the string below).
worker_name: str
"""
ex.
`event_persister:2` -> `event_persister1` and `event_persister2`
`stream_writers=account_data+presence+receipts+to_device+typing"` -> `stream_writers`
"""
# Name shared by every worker of the same requested kind; it is used as the
# Prometheus `job` label in the service discovery output.
worker_base_name: str
"""
ex.
`event_persister:2` -> `event_persister`
`stream_writers=account_data+presence+receipts+to_device+typing"` -> `stream_writers`
"""
# 1-based position of this worker among workers with the same base name; it is
# used as the Prometheus `index` label in the service discovery output.
worker_index: int
"""
The index of the worker starting from 1 for each worker type requested.
ex.
`event_persister:2` -> `1` and `2`
`stream_writers=account_data+presence+receipts+to_device+typing"` -> `1`
"""
# Set of worker types this worker fulfils. NOTE: this set is mutated later;
# `generate_worker_files` adds "events" to it when "event_persister" is present.
worker_types: set[str]
"""
ex.
`event_persister:2` -> `{"event_persister"}`
`stream_writers=account_data+presence+receipts+to_device+typing"` -> `{"account_data", "presence", "receipts","to_device", "typing"}
"""
def parse_worker_types(
requested_worker_types: list[str],
) -> dict[str, set[str]]:
) -> list[Worker]:
"""Read the desired list of requested workers and prepare the data for use in
generating worker config files while also checking for potential gotchas.
@@ -626,10 +737,7 @@ def parse_worker_types(
requested_worker_types: The list formed from the split environment variable
containing the unprocessed requests for workers.
Returns: A dict of worker names to set of worker types. Format:
{'worker_name':
{'worker_type', 'worker_type2'}
}
Returns: A list of requested workers
"""
# A counter of worker_base_name -> int. Used for determining the name for a given
# worker when generating its config file, as each worker's name is just
@@ -640,8 +748,8 @@ def parse_worker_types(
# more than a single worker for cases where multiples would be bad(e.g. presence).
worker_type_shard_counter: dict[str, int] = defaultdict(int)
# The final result of all this processing
dict_to_return: dict[str, set[str]] = {}
# Map from worker name to `Worker`
worker_dict: dict[str, Worker] = {}
# Handle any multipliers requested for given workers.
multiple_processed_worker_types = apply_requested_multiplier_for_worker(
@@ -727,24 +835,29 @@ def parse_worker_types(
if worker_number > 1:
# If this isn't the first worker, check that we don't have a confusing
# mixture of worker types with the same base name.
first_worker_with_base_name = dict_to_return[f"{worker_base_name}1"]
if first_worker_with_base_name != worker_types_set:
first_worker_with_base_name = worker_dict[f"{worker_base_name}1"]
if first_worker_with_base_name.worker_types != worker_types_set:
error(
f"Can not use worker_name: '{worker_name}' for worker_type(s): "
f"{worker_types_set!r}. It is already in use by "
f"worker_type(s): {first_worker_with_base_name!r}"
f"worker_type(s): {first_worker_with_base_name.worker_types!r}"
)
dict_to_return[worker_name] = worker_types_set
worker_dict[worker_name] = Worker(
worker_name=worker_name,
worker_base_name=worker_base_name,
worker_index=worker_number,
worker_types=worker_types_set,
)
return dict_to_return
return list(worker_dict.values())
def generate_worker_files(
environ: Mapping[str, str],
config_path: str,
data_dir: str,
requested_worker_types: dict[str, set[str]],
requested_workers: list[Worker],
) -> None:
"""Read the desired workers(if any) that is passed in and generate shared
homeserver, nginx and supervisord configs.
@@ -754,8 +867,7 @@ def generate_worker_files(
config_path: The location of the generated Synapse main worker config file.
data_dir: The location of the synapse data directory. Where log and
user-facing config files live.
requested_worker_types: A Dict containing requested workers in the format of
{'worker_name1': {'worker_type', ...}}
requested_workers: A list of requested workers
"""
# Note that yaml cares about indentation, so care should be taken to insert lines
# into files at the correct indentation below.
@@ -845,7 +957,9 @@ def generate_worker_files(
healthcheck_urls = ["http://localhost:8080/health"]
# Get the set of all worker types that we have configured
all_worker_types_in_use = set(chain(*requested_worker_types.values()))
all_worker_types_in_use = set(
chain(*[worker.worker_types for worker in requested_workers])
)
# Map locations to upstreams (corresponding to worker types) in Nginx
# but only if we use the appropriate worker type
for worker_type in all_worker_types_in_use:
@@ -854,12 +968,13 @@ def generate_worker_files(
# For each worker type specified by the user, create config values and write its
# yaml config file
for worker_name, worker_types_set in requested_worker_types.items():
worker_name_to_metrics_port_map: dict[str, int] = {}
for worker in requested_workers:
# The collected and processed data will live here.
worker_config: dict[str, Any] = {}
# Merge all worker config templates for this worker into a single config
for worker_type in worker_types_set:
for worker_type in worker.worker_types:
copy_of_template_config = WORKERS_CONFIG[worker_type].copy()
# Merge worker type template configuration data. It's a combination of lists
@@ -869,10 +984,16 @@ def generate_worker_files(
)
# Replace placeholder names in the config template with the actual worker name.
worker_config = insert_worker_name_for_worker_config(worker_config, worker_name)
worker_config = insert_worker_name_for_worker_config(
worker_config, worker.worker_name
)
worker_config.update(
{"name": worker_name, "port": str(worker_port), "config_path": config_path}
{
"name": worker.worker_name,
"port": str(worker_port),
"config_path": config_path,
}
)
# Keep the `shared_config` up to date with the `shared_extra_conf` from each
@@ -895,19 +1016,26 @@ def generate_worker_files(
# the `events` stream. For other workers, the worker name is the same
# name of the stream they write to, but for some reason it is not the
# case for event_persister.
if "event_persister" in worker_types_set:
worker_types_set.add("events")
if "event_persister" in worker.worker_types:
worker.worker_types.add("events")
# Update the shared config with sharding-related options if necessary
add_worker_roles_to_shared_config(
shared_config, worker_types_set, worker_name, worker_port
shared_config, worker.worker_types, worker.worker_name, worker_port
)
# Enable the worker in supervisord
worker_descriptors.append(worker_config)
# Write out the worker's logging config file
log_config_filepath = generate_worker_log_config(environ, worker_name, data_dir)
log_config_filepath = generate_worker_log_config(
environ, worker.worker_name, data_dir
)
worker_name_to_metrics_port_map[worker.worker_name] = worker_metrics_port
if enable_metrics:
# Enable prometheus metrics endpoint on this worker
worker_config["metrics_port"] = worker_metrics_port
if enable_metrics:
# Enable prometheus metrics endpoint on this worker
@@ -916,14 +1044,14 @@ def generate_worker_files(
# Then a worker config file
convert(
"/conf/worker.yaml.j2",
f"/conf/workers/{worker_name}.yaml",
f"/conf/workers/{worker.worker_name}.yaml",
**worker_config,
worker_log_config_filepath=log_config_filepath,
using_unix_sockets=using_unix_sockets,
)
# Save this worker's port number to the correct nginx upstreams
for worker_type in worker_types_set:
for worker_type in worker.worker_types:
nginx_upstreams.setdefault(worker_type, set()).add(worker_port)
worker_port += 1
@@ -932,7 +1060,7 @@ def generate_worker_files(
# Build the nginx location config blocks
nginx_location_config = ""
for endpoint, upstream in nginx_locations.items():
nginx_location_config += NGINX_LOCATION_CONFIG_BLOCK.format(
nginx_location_config += NGINX_LOCATION_REGEX_CONFIG_BLOCK.format(
endpoint=endpoint,
upstream=upstream,
)
@@ -955,6 +1083,111 @@ def generate_worker_files(
body=body,
)
# Provide a Prometheus metrics service discovery endpoint to easily be able to pick
# up all of the workers
nginx_prometheus_metrics_service_discovery = ""
if enable_metrics:
# Write JSON file for Prometheus service discovery pointing to all of the
# workers. We serve this file with nginx so people can use it with
# `http_sd_config` in their Prometheus config.
#
# > It fetches targets from an HTTP endpoint containing a list of zero or more
# > `<static_config>`s. The target must reply with an HTTP 200 response. The HTTP
# > header `Content-Type` must be `application/json`, and the body must be valid
# > JSON.
# >
# > *-- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config*
#
# Another reference: https://prometheus.io/docs/prometheus/latest/http_sd/
prometheus_http_service_discovery_content = [
{
"targets": [NGINX_HOST_PLACEHOLDER],
"labels": {
# The downstream user should also configure `honor_labels: true` in
# their Prometheus config to prevent Prometheus from overwriting the
# `job` labels.
#
# > honor_labels controls how Prometheus handles conflicts between labels that are
# > already present in scraped data and labels that Prometheus would attach
# > server-side ("job" and "instance" labels, manually configured target
# > labels, and labels generated by service discovery implementations).
# >
# > *-- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config*
#
# Reference:
# - https://prometheus.io/docs/concepts/jobs_instances/
# - https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
"job": worker.worker_base_name,
"index": f"{worker.worker_index}",
# This allows us to change the `metrics_path` on a per-target basis.
# We want to grab the metrics from our nginx proxied location (setup
# below).
#
# While there doesn't seem to be official docs on these special
# labels (`__metrics_path__`, `__scheme__`, `__scrape_interval__`,
# `__scrape_timeout__`), this discussion best summarizes how this
# works: https://github.com/prometheus/prometheus/discussions/13217
"__metrics_path__": f"/metrics/worker/{worker.worker_name}",
},
}
for worker in requested_workers
]
# Add the main Synapse process as well
prometheus_http_service_discovery_content.append(
{
"targets": [NGINX_HOST_PLACEHOLDER],
"labels": {
# We use `"synapse"` as the job name for the main process because it
# matches what we expect people to use from a monolith setup with
# their static scrape config. It's the `job` name used in our Grafana
# dashboard for the main process.
"job": "synapse",
"index": "1",
"__metrics_path__": "/metrics/worker/main",
},
}
)
# Check to make sure the file doesn't already exist
if os.path.isfile(PROMETHEUS_METRICS_SERVICE_DISCOVERY_FILE_PATH):
error(
f"Prometheus service discovery file "
f"'{PROMETHEUS_METRICS_SERVICE_DISCOVERY_FILE_PATH}' already exists (unexpected)! "
f"This is a problem because the existing file probably doesn't match the "
"Synapse workers we're setting up now."
)
# Write the file
with open(PROMETHEUS_METRICS_SERVICE_DISCOVERY_FILE_PATH, "w") as outfile:
outfile.write(
json.dumps(prometheus_http_service_discovery_content, indent=4)
)
# Proxy all of the Synapse metrics endpoints through a central place so that
# people only need to expose the single 9469 port and service discovery can take
# care of the rest: `/metrics/worker/<worker_name>` ->
# http://localhost:19090/_synapse/metrics
#
# Build the nginx location config blocks
metrics_proxy_locations = ""
for worker in requested_workers:
metrics_proxy_locations += NGINX_LOCATION_EXACT_CONFIG_BLOCK.format(
endpoint=f"/metrics/worker/{worker.worker_name}",
upstream=f"http://localhost:{worker_name_to_metrics_port_map[worker.worker_name]}/_synapse/metrics",
)
# Add the main Synapse process as well
metrics_proxy_locations += NGINX_LOCATION_EXACT_CONFIG_BLOCK.format(
endpoint="/metrics/worker/main",
upstream="http://localhost:19090/_synapse/metrics",
)
# Add a nginx server/location to serve the JSON file
nginx_prometheus_metrics_service_discovery = NGINX_PROMETHEUS_METRICS_SERVICE_DISCOVERY.format(
service_discovery_file_path=PROMETHEUS_METRICS_SERVICE_DISCOVERY_FILE_PATH,
host_placeholder=NGINX_HOST_PLACEHOLDER,
metrics_proxy_locations=metrics_proxy_locations,
)
# Finally, we'll write out the config files.
# log config for the master process
@@ -972,7 +1205,7 @@ def generate_worker_files(
if reg_path.suffix.lower() in (".yaml", ".yml")
]
workers_in_use = len(requested_worker_types) > 0
workers_in_use = len(requested_workers) > 0
# If there are workers, add the main process to the instance_map too.
if workers_in_use:
@@ -1007,6 +1240,7 @@ def generate_worker_files(
tls_cert_path=os.environ.get("SYNAPSE_TLS_CERT"),
tls_key_path=os.environ.get("SYNAPSE_TLS_KEY"),
using_unix_sockets=using_unix_sockets,
nginx_prometheus_metrics_service_discovery=nginx_prometheus_metrics_service_discovery,
)
# Supervisord config
@@ -1107,15 +1341,20 @@ def main(args: list[str], environ: MutableMapping[str, str]) -> None:
if not worker_types_env:
# No workers, just the main process
worker_types = []
requested_worker_types: dict[str, Any] = {}
requested_workers: list[Worker] = []
else:
# Split type names by comma, ignoring whitespace.
worker_types = split_and_strip_string(worker_types_env, ",")
requested_worker_types = parse_worker_types(worker_types)
requested_workers = parse_worker_types(worker_types)
# Always regenerate all other config files
log("Generating worker config files")
generate_worker_files(environ, config_path, data_dir, requested_worker_types)
generate_worker_files(
environ=environ,
config_path=config_path,
data_dir=data_dir,
requested_workers=requested_workers,
)
# Mark workers as being configured
with open(mark_filepath, "w") as f: