diff --git a/changelog.d/18733.misc b/changelog.d/18733.misc new file mode 100644 index 0000000000..f38647d4c6 --- /dev/null +++ b/changelog.d/18733.misc @@ -0,0 +1 @@ +Update metrics linting to be able to handle custom metrics. diff --git a/mypy.ini b/mypy.ini index cf64248cc5..ae903f858a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,17 @@ [mypy] namespace_packages = True -plugins = pydantic.mypy, mypy_zope:plugin, scripts-dev/mypy_synapse_plugin.py +# Our custom mypy plugin should remain first in this list. +# +# mypy has a limitation where it only chooses the first plugin that returns a non-None +# value for each hook (known-limitation, c.f. +# https://github.com/python/mypy/issues/19524). We workaround this by putting our custom +# plugin first in the plugin order and then manually calling any other conflicting +# plugin hooks in our own plugin followed by our own checks. +# +# If you add a new plugin, make sure to check whether the hooks being used conflict with +# our custom plugin hooks and if so, manually call the other plugin's hooks in our +# custom plugin. (also applies to if the plugin is updated in the future) +plugins = scripts-dev/mypy_synapse_plugin.py, pydantic.mypy, mypy_zope:plugin follow_imports = normal show_error_codes = True show_traceback = True @@ -99,3 +110,6 @@ ignore_missing_imports = True [mypy-multipart.*] ignore_missing_imports = True + +[mypy-mypy_zope.*] +ignore_missing_imports = True diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 2a7f7ace10..610dec415a 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -23,16 +23,21 @@ can crop up, e.g the cache descriptors. """ -from typing import Callable, Optional, Tuple, Type, Union +import enum +from typing import Callable, Mapping, Optional, Tuple, Type, Union +import attr import mypy.types from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, TupleExpr, Var from mypy.plugin import ( + ClassDefContext, + Context, FunctionLike, FunctionSigContext, MethodSigContext, + MypyFile, Plugin, ) from mypy.typeops import bind_self @@ -41,12 +46,15 @@ from mypy.types import ( CallableType, Instance, NoneType, + Options, TupleType, TypeAliasType, TypeVarType, UninhabitedType, UnionType, ) +from mypy_zope import plugin as mypy_zope_plugin +from pydantic.mypy import plugin as mypy_pydantic_plugin PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode( "missing-server-name-label", @@ -54,19 +62,153 @@ PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode( category="per-homeserver-tenant-metrics", ) +PROMETHEUS_METRIC_MISSING_FROM_LIST_TO_CHECK = ErrorCode( + "metric-type-missing-from-list", + "Every Prometheus metric type must be included in the `prometheus_metric_fullname_to_label_arg_map`.", + category="per-homeserver-tenant-metrics", +) + + +class Sentinel(enum.Enum): + # defining a sentinel in this way allows mypy to correctly handle the + # type of a dictionary lookup and subsequent type narrowing. + UNSET_SENTINEL = object() + + +@attr.s(auto_attribs=True) +class ArgLocation: + keyword_name: str + """ + The keyword argument name for this argument + """ + position: int + """ + The 0-based positional index of this argument + """ + + +prometheus_metric_fullname_to_label_arg_map: Mapping[str, Optional[ArgLocation]] = { + # `Collector` subclasses: + "prometheus_client.metrics.MetricWrapperBase": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Counter": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Histogram": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Gauge": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Summary": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Info": ArgLocation("labelnames", 2), + "prometheus_client.metrics.Enum": ArgLocation("labelnames", 2), + "synapse.metrics.LaterGauge": ArgLocation("labelnames", 2), + "synapse.metrics.InFlightGauge": ArgLocation("labels", 2), + "synapse.metrics.GaugeBucketCollector": ArgLocation("labelnames", 2), + "prometheus_client.registry.Collector": None, + "prometheus_client.registry._EmptyCollector": None, + "prometheus_client.registry.CollectorRegistry": None, + "prometheus_client.process_collector.ProcessCollector": None, + "prometheus_client.platform_collector.PlatformCollector": None, + "prometheus_client.gc_collector.GCCollector": None, + "synapse.metrics._gc.GCCounts": None, + "synapse.metrics._gc.PyPyGCStats": None, + "synapse.metrics._reactor_metrics.ReactorLastSeenMetric": None, + "synapse.metrics.CPUMetrics": None, + "synapse.metrics.jemalloc.JemallocCollector": None, + "synapse.util.metrics.DynamicCollectorRegistry": None, + "synapse.metrics.background_process_metrics._Collector": None, + # + # `Metric` subclasses: + "prometheus_client.metrics_core.Metric": None, + "prometheus_client.metrics_core.UnknownMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.CounterMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.GaugeMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.SummaryMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.InfoMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.HistogramMetricFamily": ArgLocation("labels", 3), + "prometheus_client.metrics_core.GaugeHistogramMetricFamily": ArgLocation( + "labels", 4 + ), + "prometheus_client.metrics_core.StateSetMetricFamily": ArgLocation("labels", 3), + "synapse.metrics.GaugeHistogramMetricFamilyWithLabels": ArgLocation( + "labelnames", 4 + ), +} +""" +Map from the fullname of the Prometheus `Metric`/`Collector` classes to the keyword +argument name and positional index of the label names. This map is useful because +different metrics have different signatures for passing in label names and we just need +to know where to look. + +This map should include any metrics that we collect with Prometheus. Which corresponds +to anything that inherits from `prometheus_client.registry.Collector` +(`synapse.metrics._types.Collector`) or `prometheus_client.metrics_core.Metric`. The +exhaustiveness of this list is enforced by `analyze_prometheus_metric_classes`. + +The entries with `None` always fail the lint because they don't have a `labelnames` +argument (therefore, no `SERVER_NAME_LABEL`), but we include them here so that people +can notice and manually allow via a type ignore comment as the source of truth +should be in the source code. +""" + +# Unbound at this point because we don't know the mypy version yet. +# This is set in the `plugin(...)` function below. +MypyPydanticPluginClass: Type[Plugin] +MypyZopePluginClass: Type[Plugin] + class SynapsePlugin(Plugin): + def __init__(self, options: Options): + super().__init__(options) + self.mypy_pydantic_plugin = MypyPydanticPluginClass(options) + self.mypy_zope_plugin = MypyZopePluginClass(options) + + def set_modules(self, modules: dict[str, MypyFile]) -> None: + """ + This is called by mypy internals. We have to override this to ensure it's also + called for any other plugins that we're manually handling. + + Here is how mypy describes it: + + > [`self._modules`] can't be set in `__init__` because it is executed too soon + > in `build.py`. Therefore, `build.py` *must* set it later before graph processing + > starts by calling `set_modules()`. + """ + super().set_modules(modules) + self.mypy_pydantic_plugin.set_modules(modules) + self.mypy_zope_plugin.set_modules(modules) + + def get_base_class_hook( + self, fullname: str + ) -> Optional[Callable[[ClassDefContext], None]]: + def _get_base_class_hook(ctx: ClassDefContext) -> None: + # Run any `get_base_class_hook` checks from other plugins first. + # + # Unfortunately, because mypy only chooses the first plugin that returns a + # non-None value (known-limitation, c.f. + # https://github.com/python/mypy/issues/19524), we workaround this by + # putting our custom plugin first in the plugin order and then calling the + # other plugin's hook manually followed by our own checks. + if callback := self.mypy_pydantic_plugin.get_base_class_hook(fullname): + callback(ctx) + if callback := self.mypy_zope_plugin.get_base_class_hook(fullname): + callback(ctx) + + # Now run our own checks + analyze_prometheus_metric_classes(ctx) + + return _get_base_class_hook + def get_function_signature_hook( self, fullname: str ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: - if fullname in ( - "prometheus_client.metrics.Counter", - "prometheus_client.metrics.Histogram", - "prometheus_client.metrics.Gauge", - # TODO: Add other prometheus_client metrics that need checking as we - # refactor, see https://github.com/element-hq/synapse/issues/18592 - ): - return check_prometheus_metric_instantiation + # Strip off the unique identifier for classes that are dynamically created inside + # functions. ex. `synapse.metrics.jemalloc.JemallocCollector@185` (this is the line + # number) + if "@" in fullname: + fullname = fullname.split("@", 1)[0] + + # Look for any Prometheus metrics to make sure they have the `SERVER_NAME_LABEL` + # label. + if fullname in prometheus_metric_fullname_to_label_arg_map.keys(): + # Because it's difficult to determine the `fullname` of the function in the + # callback, let's just pass it in while we have it. + return lambda ctx: check_prometheus_metric_instantiation(ctx, fullname) return None @@ -90,7 +232,44 @@ class SynapsePlugin(Plugin): return None -def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType: +def analyze_prometheus_metric_classes(ctx: ClassDefContext) -> None: + """ + Cross-check the list of Prometheus metric classes against the + `prometheus_metric_fullname_to_label_arg_map` to ensure the list is exhaustive and + up-to-date. + """ + + fullname = ctx.cls.fullname + # Strip off the unique identifier for classes that are dynamically created inside + # functions. ex. `synapse.metrics.jemalloc.JemallocCollector@185` (this is the line + # number) + if "@" in fullname: + fullname = fullname.split("@", 1)[0] + + if any( + ancestor_type.fullname + in ( + # All of the Prometheus metric classes inherit from the `Collector`. + "prometheus_client.registry.Collector", + "synapse.metrics._types.Collector", + # And custom metrics that inherit from `Metric`. + "prometheus_client.metrics_core.Metric", + ) + for ancestor_type in ctx.cls.info.mro + ): + if fullname not in prometheus_metric_fullname_to_label_arg_map: + ctx.api.fail( + f"Expected {fullname} to be in `prometheus_metric_fullname_to_label_arg_map`, " + f"but it was not found. This is a problem with our custom mypy plugin. " + f"Please add it to the map.", + Context(), + code=PROMETHEUS_METRIC_MISSING_FROM_LIST_TO_CHECK, + ) + + +def check_prometheus_metric_instantiation( + ctx: FunctionSigContext, fullname: str +) -> CallableType: """ Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label when instantiated. @@ -103,18 +282,49 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy Python garbage collection, and Twisted reactor tick time, which shouldn't have the `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`. + + Args: + ctx: The `FunctionSigContext` from mypy. + fullname: The fully qualified name of the function being called, + e.g. `"prometheus_client.metrics.Counter"` """ # The true signature, this isn't being modified so this is what will be returned. - signature: CallableType = ctx.default_signature + signature = ctx.default_signature + + # Find where the label names argument is in the function signature. + arg_location = prometheus_metric_fullname_to_label_arg_map.get( + fullname, Sentinel.UNSET_SENTINEL + ) + assert arg_location is not Sentinel.UNSET_SENTINEL, ( + f"Expected to find {fullname} in `prometheus_metric_fullname_to_label_arg_map`, " + f"but it was not found. This is a problem with our custom mypy plugin. " + f"Please add it to the map. Context: {ctx.context}" + ) + # People should be using `# type: ignore[missing-server-name-label]` for + # process-level metrics that should not have the `SERVER_NAME_LABEL`. + if arg_location is None: + ctx.api.fail( + f"{signature.name} does not have a `labelnames`/`labels` argument " + "(if this is untrue, update `prometheus_metric_fullname_to_label_arg_map` " + "in our custom mypy plugin) and should probably have a type ignore comment, " + "e.g. `# type: ignore[missing-server-name-label]`. The reason we don't " + "automatically ignore this is the source of truth should be in the source code.", + ctx.context, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, + ) + return signature # Sanity check the arguments are still as expected in this version of # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)` # # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...] - if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames": + if ( + len(signature.arg_names) < (arg_location.position + 1) + or signature.arg_names[arg_location.position] != arg_location.keyword_name + ): ctx.api.fail( - f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got " - f"{signature.arg_names[2]}", + f"Expected argument number {arg_location.position + 1} of {signature.name} to be `labelnames`/`labels`, " + f"but got {signature.arg_names[arg_location.position]}", ctx.context, ) return signature @@ -137,7 +347,11 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy # ... # ] # ``` - labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None + labelnames_arg_expression = ( + ctx.args[arg_location.position][0] + if len(ctx.args[arg_location.position]) > 0 + else None + ) if isinstance(labelnames_arg_expression, (ListExpr, TupleExpr)): # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`). for labelname_expression in labelnames_arg_expression.items: @@ -476,10 +690,13 @@ def is_cacheable( def plugin(version: str) -> Type[SynapsePlugin]: + global MypyPydanticPluginClass, MypyZopePluginClass # This is the entry point of the plugin, and lets us deal with the fact # that the mypy plugin interface is *not* stable by looking at the version # string. # # However, since we pin the version of mypy Synapse uses in CI, we don't # really care. + MypyPydanticPluginClass = mypy_pydantic_plugin(version) + MypyZopePluginClass = mypy_zope_plugin(version) return SynapsePlugin diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 120e151b7f..11e2551a16 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -170,7 +170,9 @@ class LaterGauge(Collector): ] def collect(self) -> Iterable[Metric]: - g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) + # The decision to add `SERVER_NAME_LABEL` is from the `LaterGauge` usage itself + # (we don't enforce it here, one level up). + g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) # type: ignore[missing-server-name-label] try: calls = self.caller() @@ -304,7 +306,9 @@ class InFlightGauge(Generic[MetricsEntry], Collector): Note: may be called by a separate thread. """ - in_flight = GaugeMetricFamily( + # The decision to add `SERVER_NAME_LABEL` is from the `GaugeBucketCollector` + # usage itself (we don't enforce it here, one level up). + in_flight = GaugeMetricFamily( # type: ignore[missing-server-name-label] self.name + "_total", self.desc, labels=self.labels ) @@ -328,7 +332,9 @@ class InFlightGauge(Generic[MetricsEntry], Collector): yield in_flight for name in self.sub_metrics: - gauge = GaugeMetricFamily( + # The decision to add `SERVER_NAME_LABEL` is from the `InFlightGauge` usage + # itself (we don't enforce it here, one level up). + gauge = GaugeMetricFamily( # type: ignore[missing-server-name-label] "_".join([self.name, name]), "", labels=self.labels ) for key, metrics in metrics_by_key.items(): @@ -483,7 +489,9 @@ class GaugeBucketCollector(Collector): # that bucket or below. accumulated_values = itertools.accumulate(bucket_values) - return GaugeHistogramMetricFamilyWithLabels( + # The decision to add `SERVER_NAME_LABEL` is from the `GaugeBucketCollector` + # usage itself (we don't enforce it here, one level up). + return GaugeHistogramMetricFamilyWithLabels( # type: ignore[missing-server-name-label] name=self._name, documentation=self._documentation, labelnames=self._labelnames, @@ -519,16 +527,19 @@ class CPUMetrics(Collector): line = s.read() raw_stats = line.split(") ", 1)[1].split(" ") - user = GaugeMetricFamily("process_cpu_user_seconds_total", "") + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + user = GaugeMetricFamily("process_cpu_user_seconds_total", "") # type: ignore[missing-server-name-label] user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec) yield user - sys = GaugeMetricFamily("process_cpu_system_seconds_total", "") + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + sys = GaugeMetricFamily("process_cpu_system_seconds_total", "") # type: ignore[missing-server-name-label] sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec) yield sys -REGISTRY.register(CPUMetrics()) +# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. +REGISTRY.register(CPUMetrics()) # type: ignore[missing-server-name-label] # diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 053e2da48e..e7783b05e6 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -83,7 +83,8 @@ gc_time = Histogram( # type: ignore[missing-server-name-label] class GCCounts(Collector): def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) # type: ignore[missing-server-name-label] for n, m in enumerate(gc.get_count()): cm.add_metric([str(n)], m) @@ -102,7 +103,8 @@ def install_gc_manager() -> None: if running_on_pypy: return - REGISTRY.register(GCCounts()) + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + REGISTRY.register(GCCounts()) # type: ignore[missing-server-name-label] gc.disable() @@ -177,7 +179,8 @@ class PyPyGCStats(Collector): # # Total time spent in GC: 0.073 # s.total_gc_time - pypy_gc_time = CounterMetricFamily( + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + pypy_gc_time = CounterMetricFamily( # type: ignore[missing-server-name-label] "pypy_gc_time_seconds_total", "Total time spent in PyPy GC", labels=[], @@ -185,7 +188,8 @@ class PyPyGCStats(Collector): pypy_gc_time.add_metric([], s.total_gc_time / 1000) yield pypy_gc_time - pypy_mem = GaugeMetricFamily( + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + pypy_mem = GaugeMetricFamily( # type: ignore[missing-server-name-label] "pypy_memory_bytes", "Memory tracked by PyPy allocator", labels=["state", "class", "kind"], @@ -209,4 +213,5 @@ class PyPyGCStats(Collector): if running_on_pypy: - REGISTRY.register(PyPyGCStats()) + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + REGISTRY.register(PyPyGCStats()) # type: ignore[missing-server-name-label] diff --git a/synapse/metrics/_reactor_metrics.py b/synapse/metrics/_reactor_metrics.py index e1f8570f3d..9852d0b932 100644 --- a/synapse/metrics/_reactor_metrics.py +++ b/synapse/metrics/_reactor_metrics.py @@ -115,7 +115,8 @@ class ReactorLastSeenMetric(Collector): self._call_wrapper = call_wrapper def collect(self) -> Iterable[Metric]: - cm = GaugeMetricFamily( + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + cm = GaugeMetricFamily( # type: ignore[missing-server-name-label] "python_twisted_reactor_last_seen", "Seconds since the Twisted reactor was last seen", ) @@ -166,4 +167,5 @@ except Exception as e: if wrapper: - REGISTRY.register(ReactorLastSeenMetric(wrapper)) + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + REGISTRY.register(ReactorLastSeenMetric(wrapper)) # type: ignore[missing-server-name-label] diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 9671621f8c..f7f2d88885 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -167,7 +167,9 @@ class _Collector(Collector): yield from m.collect() -REGISTRY.register(_Collector()) +# The `SERVER_NAME_LABEL` is included in the individual metrics added to this registry, +# so we don't need to worry about it on the collector itself. +REGISTRY.register(_Collector()) # type: ignore[missing-server-name-label] class _BackgroundProcess: diff --git a/synapse/metrics/jemalloc.py b/synapse/metrics/jemalloc.py index 321ff58083..fb8adbe060 100644 --- a/synapse/metrics/jemalloc.py +++ b/synapse/metrics/jemalloc.py @@ -188,7 +188,8 @@ def _setup_jemalloc_stats() -> None: def collect(self) -> Iterable[Metric]: stats.refresh_stats() - g = GaugeMetricFamily( + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + g = GaugeMetricFamily( # type: ignore[missing-server-name-label] "jemalloc_stats_app_memory_bytes", "The stats reported by jemalloc", labels=["type"], @@ -230,7 +231,8 @@ def _setup_jemalloc_stats() -> None: yield g - REGISTRY.register(JemallocCollector()) + # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. + REGISTRY.register(JemallocCollector()) # type: ignore[missing-server-name-label] logger.debug("Added jemalloc stats") diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py index 340da5bd4f..710a29e3f0 100644 --- a/synapse/util/caches/__init__.py +++ b/synapse/util/caches/__init__.py @@ -42,7 +42,10 @@ TRACK_MEMORY_USAGE = False # We track cache metrics in a special registry that lets us update the metrics # just before they are returned from the scrape endpoint. -CACHE_METRIC_REGISTRY = DynamicCollectorRegistry() +# +# The `SERVER_NAME_LABEL` is included in the individual metrics added to this registry, +# so we don't need to worry about it on the collector itself. +CACHE_METRIC_REGISTRY = DynamicCollectorRegistry() # type: ignore[missing-server-name-label] cache_size = Gauge( "synapse_util_caches_cache_size", diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index d87f621332..61874564a6 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -59,7 +59,8 @@ class TestMauLimit(unittest.TestCase): foo: int bar: int - gauge: InFlightGauge[MetricEntry] = InFlightGauge( + # This is a test and does not matter if it uses `SERVER_NAME_LABEL`. + gauge: InFlightGauge[MetricEntry] = InFlightGauge( # type: ignore[missing-server-name-label] "test1", "", labels=["test_label"], sub_metrics=["foo", "bar"] )