From be4c95baf11f16a6108da5b73f7140d36de6f55d Mon Sep 17 00:00:00 2001 From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:12:12 +0100 Subject: [PATCH] Replace PyICU with Rust `icu_segmenter` crate (#18553) Co-authored-by: anoa's Codex Agent Co-authored-by: Quentin Gliech --- changelog.d/18553.misc | 1 + docs/development/contributing_guide.md | 2 - docs/development/dependencies.md | 5 +- docs/setup/installation.md | 26 ++-------- docs/upgrade.md | 7 +++ docs/user_directory.md | 13 ++--- flake.nix | 1 - poetry.lock | 17 +------ pyproject.toml | 7 --- rust/Cargo.toml | 1 + rust/src/lib.rs | 2 + rust/src/segmenter.rs | 33 +++++++++++++ .../storage/databases/main/user_directory.py | 49 +++---------------- synapse/synapse_rust/segmenter.pyi | 3 ++ tests/storage/test_user_directory.py | 39 +-------------- 15 files changed, 70 insertions(+), 136 deletions(-) create mode 100644 changelog.d/18553.misc create mode 100644 rust/src/segmenter.rs create mode 100644 synapse/synapse_rust/segmenter.pyi diff --git a/changelog.d/18553.misc b/changelog.d/18553.misc new file mode 100644 index 0000000000..bb143aacfc --- /dev/null +++ b/changelog.d/18553.misc @@ -0,0 +1 @@ +Replace `PyICU` crate with equivalent `icu_segmenter` Rust crate. \ No newline at end of file diff --git a/docs/development/contributing_guide.md b/docs/development/contributing_guide.md index d6efab96cf..eb6f04e301 100644 --- a/docs/development/contributing_guide.md +++ b/docs/development/contributing_guide.md @@ -29,8 +29,6 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`. -Synapse has an optional, improved user search with better Unicode support. 
For that you need the development package of `libicu`. On Debian or Ubuntu Linux, this can be installed with `sudo apt install libicu-dev`. - The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git). For some tests, you will need [a recent version of Docker](https://docs.docker.com/get-docker/). diff --git a/docs/development/dependencies.md b/docs/development/dependencies.md index fa5ff4dcf7..e381b3d155 100644 --- a/docs/development/dependencies.md +++ b/docs/development/dependencies.md @@ -164,10 +164,7 @@ $ poetry cache clear --all . # including the wheel artifacts which is not covered by the above command # (see https://github.com/python-poetry/poetry/issues/10304) # -# This is necessary in order to rebuild or fetch new wheels. For example, if you update -# the `icu` library in on your system, you will need to rebuild the PyICU Python package -# in order to incorporate the correct dynamically linked library locations otherwise you -# will run into errors like: `ImportError: libicui18n.so.75: cannot open shared object file: No such file or directory` +# This is necessary in order to rebuild or fetch new wheels. 
$ rm -rf $(poetry config cache-dir) ``` diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 0853496ab7..0840f532b0 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -286,7 +286,7 @@ Installing prerequisites on Ubuntu or Debian: ```sh sudo apt install build-essential python3-dev libffi-dev \ python3-pip python3-setuptools sqlite3 \ - libssl-dev virtualenv libjpeg-dev libxslt1-dev libicu-dev + libssl-dev virtualenv libjpeg-dev libxslt1-dev ``` ##### ArchLinux @@ -295,7 +295,7 @@ Installing prerequisites on ArchLinux: ```sh sudo pacman -S base-devel python python-pip \ - python-setuptools python-virtualenv sqlite3 icu + python-setuptools python-virtualenv sqlite3 ``` ##### CentOS/Fedora @@ -305,8 +305,7 @@ Installing prerequisites on CentOS or Fedora Linux: ```sh sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \ libwebp-devel libxml2-devel libxslt-devel libpq-devel \ - python3-virtualenv libffi-devel openssl-devel python3-devel \ - libicu-devel + python3-virtualenv libffi-devel openssl-devel python3-devel sudo dnf group install "Development Tools" ``` @@ -333,7 +332,7 @@ dnf install python3.12 python3.12-devel ``` Finally, install common prerequisites ```bash -dnf install libicu libicu-devel libpq5 libpq5-devel lz4 pkgconf +dnf install libpq5 libpq5-devel lz4 pkgconf dnf group install "Development Tools" ``` ###### Using venv module instead of virtualenv command @@ -365,20 +364,6 @@ xcode-select --install Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them. -You may need to install icu, and make the icu binaries and libraries accessible. -Please follow [the official instructions of PyICU](https://pypi.org/project/PyICU/) to do so. - -If you're struggling to get icu discovered, and see: -``` - RuntimeError: - Please install pkg-config on your system or set the ICU_VERSION environment - variable to the version of ICU you have installed. 
-``` -despite it being installed and having your `PATH` updated, you can omit this dependency by -not specifying `--extras all` to `poetry`. If using postgres, you can install Synapse via -`poetry install --extras saml2 --extras oidc --extras postgres --extras opentracing --extras redis --extras sentry`. -ICU is not a hard dependency on getting a working installation. - On ARM-based Macs you may also need to install libjpeg and libpq: ```sh brew install jpeg libpq @@ -400,8 +385,7 @@ Installing prerequisites on openSUSE: ```sh sudo zypper in -t pattern devel_basis sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \ - python-devel libffi-devel libopenssl-devel libjpeg62-devel \ - libicu-devel + python-devel libffi-devel libopenssl-devel libjpeg62-devel ``` ##### OpenBSD diff --git a/docs/upgrade.md b/docs/upgrade.md index d508e2231e..ca9ca121f2 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -117,6 +117,13 @@ each upgrade are complete before moving on to the next upgrade, to avoid stacking them up. You can monitor the currently running background updates with [the Admin API](usage/administration/admin_api/background_updates.html#status). +# Upgrading to v1.134.0 + +## ICU bundled with Synapse + +Synapse now uses the Rust `icu` library for improved user search. Installing the +native ICU library on your system is no longer required. + # Upgrading to v1.130.0 ## Documented endpoint which can be delegated to a federation worker diff --git a/docs/user_directory.md b/docs/user_directory.md index be8664a016..75d32af44f 100644 --- a/docs/user_directory.md +++ b/docs/user_directory.md @@ -77,14 +77,11 @@ The user provided search term is lowercased and normalized using [NFKC](https:// this treats the string as case-insensitive, canonicalizes different forms of the same text, and maps some "roughly equivalent" characters together. 
-The search term is then split into words: - -* If [ICU](https://en.wikipedia.org/wiki/International_Components_for_Unicode) is - available, then the system's [default locale](https://unicode-org.github.io/icu/userguide/locale/#default-locales) - will be used to break the search term into words. (See the - [installation instructions](setup/installation.md) for how to install ICU.) -* If unavailable, then runs of ASCII characters, numbers, underscores, and hyphens - are considered words. +The search term is then split into segments using the [`icu_segmenter` +Rust crate](https://crates.io/crates/icu_segmenter). This crate ships with its +own dictionary and Long Short-Term Memory (LSTM) machine learning models +per-language to segment words. Read more [in the crate's +documentation](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_auto). The queries for PostgreSQL and SQLite are detailed below, but their overall goal is to find matching users, preferring users who are "real" (e.g. not bots, diff --git a/flake.nix b/flake.nix index 749c10da1d..4ff6518aed 100644 --- a/flake.nix +++ b/flake.nix @@ -96,7 +96,6 @@ gnumake # Native dependencies for running Synapse. 
- icu libffi libjpeg libpqxx diff --git a/poetry.lock b/poetry.lock index 73cafc414c..329c373d12 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1987,18 +1987,6 @@ files = [ [package.extras] plugins = ["importlib-metadata ; python_version < \"3.8\""] -[[package]] -name = "pyicu" -version = "2.14" -description = "Python extension wrapping the ICU C++ API" -optional = true -python-versions = "*" -groups = ["main"] -markers = "extra == \"all\" or extra == \"user-search\"" -files = [ - {file = "PyICU-2.14.tar.gz", hash = "sha256:acc7eb92bd5c554ed577249c6978450a4feda0aa6f01470152b3a7b382a02132"}, -] - [[package]] name = "pyjwt" version = "2.6.0" @@ -3393,7 +3381,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3406,9 +3394,8 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] url-preview = ["lxml"] -user-search = ["pyicu"] [metadata] lock-version = "2.1" python-versions = "^3.9.0" -content-hash = "9824e42dfc0e128129ee0c8641f7fe639bf47574cdd3f052dd995941abc6e44b" +content-hash = "457f188ae22af9663b2ed21f2586720ce5014edc7c34a697787f16aad733ea41" diff --git a/pyproject.toml b/pyproject.toml index da9bbc6151..ecd601f03b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -254,7 +254,6 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true 
} -pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -277,10 +276,6 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] -# Allows for better search for international characters in the user directory. This -# requires libicu's development headers installed on the system (e.g. libicu-dev on -# Debian-based distributions). -user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: @@ -312,8 +307,6 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", - # improved user search - "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 3a47f7ddf7..dab32c8952 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -43,6 +43,7 @@ sha2 = "0.10.8" serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" ulid = "1.1.2" +icu_segmenter = "2.0.0" reqwest = { version = "0.12.15", default-features = false, features = [ "http2", "stream", diff --git a/rust/src/lib.rs b/rust/src/lib.rs index e33a8cc44c..6522148fa1 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -13,6 +13,7 @@ pub mod identifier; pub mod matrix_const; pub mod push; pub mod rendezvous; +pub mod segmenter; lazy_static! 
{ static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); } @@ -53,6 +54,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { events::register_module(py, m)?; http_client::register_module(py, m)?; rendezvous::register_module(py, m)?; + segmenter::register_module(py, m)?; Ok(()) } diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs new file mode 100644 index 0000000000..135b3c1779 --- /dev/null +++ b/rust/src/segmenter.rs @@ -0,0 +1,33 @@ +use icu_segmenter::options::WordBreakInvariantOptions; +use icu_segmenter::WordSegmenter; +use pyo3::prelude::*; + +#[pyfunction] +pub fn parse_words(text: &str) -> PyResult<Vec<String>> { + let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default()); + let mut parts = Vec::new(); + let mut last = 0usize; + + // `segment_str` gives us word boundaries as a vector of indexes. Use that + // to build a vector of words, and return. + for boundary in segmenter.segment_str(text) { + if boundary > last { + parts.push(text[last..boundary].to_string()); + } + last = boundary; + } + Ok(parts) +} + +pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let child_module = PyModule::new(py, "segmenter")?; + child_module.add_function(wrap_pyfunction!(parse_words, m)?)?; + + m.add_submodule(&child_module)?; + + py.import("sys")? + .getattr("modules")? + .set_item("synapse.synapse_rust.segmenter", child_module)?; + + Ok(()) +} diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 09671abcad..9deb9ab73c 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -37,16 +37,8 @@ from typing import ( import attr -try: - # Figure out if ICU support is available for searching users. 
- import icu - - USE_ICU = True -except ModuleNotFoundError: - # except ModuleNotFoundError: - USE_ICU = False - from synapse.api.errors import StoreError +from synapse.synapse_rust import segmenter as icu from synapse.util.stringutils import non_null_str_or_none if TYPE_CHECKING: @@ -1226,7 +1218,7 @@ def _filter_text_for_index(text: str) -> str: def _parse_query_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. We use this so that we can add prefix matching, which isn't something that is supported by default. @@ -1242,7 +1234,7 @@ def _parse_query_sqlite(search_term: str) -> str: def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. We use this so that we can add prefix matching, which isn't something that is supported by default. """ @@ -1272,12 +1264,7 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: def _parse_words(search_term: str) -> List[str]: - """Split the provided search string into a list of its words. - - If support for ICU (International Components for Unicode) is available, use it. - Otherwise, fall back to using a regex to detect word boundaries. This latter - solution works well enough for most latin-based languages, but doesn't work as well - with other languages. + """Split the provided search string into a list of its words using ICU. Args: search_term: The search string. @@ -1285,18 +1272,7 @@ def _parse_words(search_term: str) -> List[str]: Returns: A list of the words in the search string. 
""" - if USE_ICU: - return _parse_words_with_icu(search_term) - - return _parse_words_with_regex(search_term) - - -def _parse_words_with_regex(search_term: str) -> List[str]: - """ - Break down search term into words, when we don't have ICU available. - See: `_parse_words` - """ - return re.findall(r"([\w-]+)", search_term, re.UNICODE) + return _parse_words_with_icu(search_term) def _parse_words_with_icu(search_term: str) -> List[str]: @@ -1310,22 +1286,13 @@ def _parse_words_with_icu(search_term: str) -> List[str]: A list of the words in the search string. """ results = [] - breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) - breaker.setText(search_term) - i = 0 - while True: - j = breaker.nextBoundary() - if j < 0: - break - + for part in icu.parse_words(search_term): # We want to make sure that we split on `@` and `:` specifically, as # they occur in user IDs. - for result in re.split(r"[@:]+", search_term[i:j]): + for result in re.split(r"[@:]+", part): results.append(result.strip()) - i = j - - # libicu will break up words that have punctuation in them, but to handle + # icu will break up words that have punctuation in them, but to handle # cases where user IDs have '-', '.' and '_' in them we want to *not* break # those into words and instead allow the DB to tokenise them how it wants. # diff --git a/synapse/synapse_rust/segmenter.pyi b/synapse/synapse_rust/segmenter.pyi new file mode 100644 index 0000000000..5f36765947 --- /dev/null +++ b/synapse/synapse_rust/segmenter.pyi @@ -0,0 +1,3 @@ +from typing import List + +def parse_words(text: str) -> List[str]: ... 
diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index c26932069f..f97ca12d84 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -32,10 +32,8 @@ from synapse.rest.client import login, register, room from synapse.server import HomeServer from synapse.storage import DataStore from synapse.storage.background_updates import _BackgroundUpdateHandler -from synapse.storage.databases.main import user_directory from synapse.storage.databases.main.user_directory import ( _parse_words_with_icu, - _parse_words_with_regex, ) from synapse.storage.roommember import ProfileInfo from synapse.util import Clock @@ -44,12 +42,6 @@ from tests.server import ThreadedMemoryReactorClock from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config -try: - import icu -except ImportError: - icu = None # type: ignore - - ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -438,8 +430,6 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase): class UserDirectoryStoreTestCase(HomeserverTestCase): - use_icu = False - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main @@ -451,12 +441,6 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) - self._restore_use_icu = user_directory.USE_ICU - user_directory.USE_ICU = self.use_icu - - def tearDown(self) -> None: - user_directory.USE_ICU = self._restore_use_icu - def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find # bob because bobby doesn't share a room with her. 
@@ -648,24 +632,14 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore -class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): - use_icu = True - - if not icu: - skip = "Requires PyICU" - - class UserDirectoryICUTestCase(HomeserverTestCase): - if not icu: - skip = "Requires PyICU" - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main self.user_dir_helper = GetUserDirectoryTables(self.store) def test_icu_word_boundary(self) -> None: - """Tests that we correctly detect word boundaries when ICU (International - Components for Unicode) support is available. + """Tests that we correctly detect word boundaries with ICU + (International Components for Unicode). """ display_name = "Gáo" @@ -714,12 +688,3 @@ class UserDirectoryICUTestCase(HomeserverTestCase): self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"]) self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"]) self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"]) - - def test_regex_word_boundary_punctuation(self) -> None: - """ - Tests the behaviour of punctuation with the non-ICU tokeniser - """ - self.assertEqual( - _parse_words_with_regex("lazy'fox jumped:over the.dog"), - ["lazy", "fox", "jumped", "over", "the", "dog"], - )