From be4c95baf11f16a6108da5b73f7140d36de6f55d Mon Sep 17 00:00:00 2001 From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:12:12 +0100 Subject: [PATCH] Replace PyICU with Rust `icu_segmenter` crate (#18553) Co-authored-by: anoa's Codex Agent Co-authored-by: Quentin Gliech --- changelog.d/18553.misc | 1 + docs/development/contributing_guide.md | 2 - docs/development/dependencies.md | 5 +- docs/setup/installation.md | 26 ++-------- docs/upgrade.md | 7 +++ docs/user_directory.md | 13 ++--- flake.nix | 1 - poetry.lock | 17 +------ pyproject.toml | 7 --- rust/Cargo.toml | 1 + rust/src/lib.rs | 2 + rust/src/segmenter.rs | 33 +++++++++++++ .../storage/databases/main/user_directory.py | 49 +++---------------- synapse/synapse_rust/segmenter.pyi | 3 ++ tests/storage/test_user_directory.py | 39 +-------------- 15 files changed, 70 insertions(+), 136 deletions(-) create mode 100644 changelog.d/18553.misc create mode 100644 rust/src/segmenter.rs create mode 100644 synapse/synapse_rust/segmenter.pyi diff --git a/changelog.d/18553.misc b/changelog.d/18553.misc new file mode 100644 index 0000000000..bb143aacfc --- /dev/null +++ b/changelog.d/18553.misc @@ -0,0 +1 @@ +Replace `PyICU` crate with equivalent `icu_segmenter` Rust crate. \ No newline at end of file diff --git a/docs/development/contributing_guide.md b/docs/development/contributing_guide.md index d6efab96cf..eb6f04e301 100644 --- a/docs/development/contributing_guide.md +++ b/docs/development/contributing_guide.md @@ -29,8 +29,6 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`. -Synapse has an optional, improved user search with better Unicode support. 
For that you need the development package of `libicu`. On Debian or Ubuntu Linux, this can be installed with `sudo apt install libicu-dev`. - The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git). For some tests, you will need [a recent version of Docker](https://docs.docker.com/get-docker/). diff --git a/docs/development/dependencies.md b/docs/development/dependencies.md index fa5ff4dcf7..e381b3d155 100644 --- a/docs/development/dependencies.md +++ b/docs/development/dependencies.md @@ -164,10 +164,7 @@ $ poetry cache clear --all . # including the wheel artifacts which is not covered by the above command # (see https://github.com/python-poetry/poetry/issues/10304) # -# This is necessary in order to rebuild or fetch new wheels. For example, if you update -# the `icu` library in on your system, you will need to rebuild the PyICU Python package -# in order to incorporate the correct dynamically linked library locations otherwise you -# will run into errors like: `ImportError: libicui18n.so.75: cannot open shared object file: No such file or directory` +# This is necessary in order to rebuild or fetch new wheels. 
$ rm -rf $(poetry config cache-dir) ``` diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 0853496ab7..0840f532b0 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -286,7 +286,7 @@ Installing prerequisites on Ubuntu or Debian: ```sh sudo apt install build-essential python3-dev libffi-dev \ python3-pip python3-setuptools sqlite3 \ - libssl-dev virtualenv libjpeg-dev libxslt1-dev libicu-dev + libssl-dev virtualenv libjpeg-dev libxslt1-dev ``` ##### ArchLinux @@ -295,7 +295,7 @@ Installing prerequisites on ArchLinux: ```sh sudo pacman -S base-devel python python-pip \ - python-setuptools python-virtualenv sqlite3 icu + python-setuptools python-virtualenv sqlite3 ``` ##### CentOS/Fedora @@ -305,8 +305,7 @@ Installing prerequisites on CentOS or Fedora Linux: ```sh sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \ libwebp-devel libxml2-devel libxslt-devel libpq-devel \ - python3-virtualenv libffi-devel openssl-devel python3-devel \ - libicu-devel + python3-virtualenv libffi-devel openssl-devel python3-devel sudo dnf group install "Development Tools" ``` @@ -333,7 +332,7 @@ dnf install python3.12 python3.12-devel ``` Finally, install common prerequisites ```bash -dnf install libicu libicu-devel libpq5 libpq5-devel lz4 pkgconf +dnf install libpq5 libpq5-devel lz4 pkgconf dnf group install "Development Tools" ``` ###### Using venv module instead of virtualenv command @@ -365,20 +364,6 @@ xcode-select --install Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them. -You may need to install icu, and make the icu binaries and libraries accessible. -Please follow [the official instructions of PyICU](https://pypi.org/project/PyICU/) to do so. - -If you're struggling to get icu discovered, and see: -``` - RuntimeError: - Please install pkg-config on your system or set the ICU_VERSION environment - variable to the version of ICU you have installed. 
-``` -despite it being installed and having your `PATH` updated, you can omit this dependency by -not specifying `--extras all` to `poetry`. If using postgres, you can install Synapse via -`poetry install --extras saml2 --extras oidc --extras postgres --extras opentracing --extras redis --extras sentry`. -ICU is not a hard dependency on getting a working installation. - On ARM-based Macs you may also need to install libjpeg and libpq: ```sh brew install jpeg libpq @@ -400,8 +385,7 @@ Installing prerequisites on openSUSE: ```sh sudo zypper in -t pattern devel_basis sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \ - python-devel libffi-devel libopenssl-devel libjpeg62-devel \ - libicu-devel + python-devel libffi-devel libopenssl-devel libjpeg62-devel ``` ##### OpenBSD diff --git a/docs/upgrade.md b/docs/upgrade.md index d508e2231e..ca9ca121f2 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -117,6 +117,13 @@ each upgrade are complete before moving on to the next upgrade, to avoid stacking them up. You can monitor the currently running background updates with [the Admin API](usage/administration/admin_api/background_updates.html#status). +# Upgrading to v1.134.0 + +## ICU bundled with Synapse + +Synapse now uses the Rust `icu` library for improved user search. Installing the +native ICU library on your system is no longer required. + # Upgrading to v1.130.0 ## Documented endpoint which can be delegated to a federation worker diff --git a/docs/user_directory.md b/docs/user_directory.md index be8664a016..75d32af44f 100644 --- a/docs/user_directory.md +++ b/docs/user_directory.md @@ -77,14 +77,11 @@ The user provided search term is lowercased and normalized using [NFKC](https:// this treats the string as case-insensitive, canonicalizes different forms of the same text, and maps some "roughly equivalent" characters together. 
-The search term is then split into words: - -* If [ICU](https://en.wikipedia.org/wiki/International_Components_for_Unicode) is - available, then the system's [default locale](https://unicode-org.github.io/icu/userguide/locale/#default-locales) - will be used to break the search term into words. (See the - [installation instructions](setup/installation.md) for how to install ICU.) -* If unavailable, then runs of ASCII characters, numbers, underscores, and hyphens - are considered words. +The search term is then split into segments using the [`icu_segmenter` +Rust crate](https://crates.io/crates/icu_segmenter). This crate ships with its +own dictionary and Long Short-Term Memory (LSTM) machine learning models +per-language to segment words. Read more [in the crate's +documentation](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_auto). The queries for PostgreSQL and SQLite are detailed below, but their overall goal is to find matching users, preferring users who are "real" (e.g. not bots, diff --git a/flake.nix b/flake.nix index 749c10da1d..4ff6518aed 100644 --- a/flake.nix +++ b/flake.nix @@ -96,7 +96,6 @@ gnumake # Native dependencies for running Synapse. 
- icu libffi libjpeg libpqxx diff --git a/poetry.lock b/poetry.lock index 73cafc414c..329c373d12 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1987,18 +1987,6 @@ files = [ [package.extras] plugins = ["importlib-metadata ; python_version < \"3.8\""] -[[package]] -name = "pyicu" -version = "2.14" -description = "Python extension wrapping the ICU C++ API" -optional = true -python-versions = "*" -groups = ["main"] -markers = "extra == \"all\" or extra == \"user-search\"" -files = [ - {file = "PyICU-2.14.tar.gz", hash = "sha256:acc7eb92bd5c554ed577249c6978450a4feda0aa6f01470152b3a7b382a02132"}, -] - [[package]] name = "pyjwt" version = "2.6.0" @@ -3393,7 +3381,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3406,9 +3394,8 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] url-preview = ["lxml"] -user-search = ["pyicu"] [metadata] lock-version = "2.1" python-versions = "^3.9.0" -content-hash = "9824e42dfc0e128129ee0c8641f7fe639bf47574cdd3f052dd995941abc6e44b" +content-hash = "457f188ae22af9663b2ed21f2586720ce5014edc7c34a697787f16aad733ea41" diff --git a/pyproject.toml b/pyproject.toml index da9bbc6151..ecd601f03b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -254,7 +254,6 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true 
} -pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -277,10 +276,6 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] -# Allows for better search for international characters in the user directory. This -# requires libicu's development headers installed on the system (e.g. libicu-dev on -# Debian-based distributions). -user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: @@ -312,8 +307,6 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", - # improved user search - "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 3a47f7ddf7..dab32c8952 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -43,6 +43,7 @@ sha2 = "0.10.8" serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" ulid = "1.1.2" +icu_segmenter = "2.0.0" reqwest = { version = "0.12.15", default-features = false, features = [ "http2", "stream", diff --git a/rust/src/lib.rs b/rust/src/lib.rs index e33a8cc44c..6522148fa1 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -13,6 +13,7 @@ pub mod identifier; pub mod matrix_const; pub mod push; pub mod rendezvous; +pub mod segmenter; lazy_static! 
{ static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); } @@ -53,6 +54,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { events::register_module(py, m)?; http_client::register_module(py, m)?; rendezvous::register_module(py, m)?; + segmenter::register_module(py, m)?; Ok(()) } diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs new file mode 100644 index 0000000000..135b3c1779 --- /dev/null +++ b/rust/src/segmenter.rs @@ -0,0 +1,33 @@ +use icu_segmenter::options::WordBreakInvariantOptions; +use icu_segmenter::WordSegmenter; +use pyo3::prelude::*; + +#[pyfunction] +pub fn parse_words(text: &str) -> PyResult<Vec<String>> { + let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default()); + let mut parts = Vec::new(); + let mut last = 0usize; + + // `segment_str` gives us word boundaries as a vector of indexes. Use that + // to build a vector of words, and return. + for boundary in segmenter.segment_str(text) { + if boundary > last { + parts.push(text[last..boundary].to_string()); + } + last = boundary; + } + Ok(parts) +} + +pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let child_module = PyModule::new(py, "segmenter")?; + child_module.add_function(wrap_pyfunction!(parse_words, m)?)?; + + m.add_submodule(&child_module)?; + + py.import("sys")? + .getattr("modules")? + .set_item("synapse.synapse_rust.segmenter", child_module)?; + + Ok(()) +} diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 09671abcad..9deb9ab73c 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -37,16 +37,8 @@ from typing import ( import attr -try: - # Figure out if ICU support is available for searching users. 
- import icu - - USE_ICU = True -except ModuleNotFoundError: - # except ModuleNotFoundError: - USE_ICU = False - from synapse.api.errors import StoreError +from synapse.synapse_rust import segmenter as icu from synapse.util.stringutils import non_null_str_or_none if TYPE_CHECKING: @@ -1226,7 +1218,7 @@ def _filter_text_for_index(text: str) -> str: def _parse_query_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. We use this so that we can add prefix matching, which isn't something that is supported by default. @@ -1242,7 +1234,7 @@ def _parse_query_sqlite(search_term: str) -> str: def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. We use this so that we can add prefix matching, which isn't something that is supported by default. """ @@ -1272,12 +1264,7 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: def _parse_words(search_term: str) -> List[str]: - """Split the provided search string into a list of its words. - - If support for ICU (International Components for Unicode) is available, use it. - Otherwise, fall back to using a regex to detect word boundaries. This latter - solution works well enough for most latin-based languages, but doesn't work as well - with other languages. + """Split the provided search string into a list of its words using ICU. Args: search_term: The search string. @@ -1285,18 +1272,7 @@ def _parse_words(search_term: str) -> List[str]: Returns: A list of the words in the search string. 
""" - if USE_ICU: - return _parse_words_with_icu(search_term) - - return _parse_words_with_regex(search_term) - - -def _parse_words_with_regex(search_term: str) -> List[str]: - """ - Break down search term into words, when we don't have ICU available. - See: `_parse_words` - """ - return re.findall(r"([\w-]+)", search_term, re.UNICODE) + return _parse_words_with_icu(search_term) def _parse_words_with_icu(search_term: str) -> List[str]: @@ -1310,22 +1286,13 @@ def _parse_words_with_icu(search_term: str) -> List[str]: A list of the words in the search string. """ results = [] - breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) - breaker.setText(search_term) - i = 0 - while True: - j = breaker.nextBoundary() - if j < 0: - break - + for part in icu.parse_words(search_term): # We want to make sure that we split on `@` and `:` specifically, as # they occur in user IDs. - for result in re.split(r"[@:]+", search_term[i:j]): + for result in re.split(r"[@:]+", part): results.append(result.strip()) - i = j - - # libicu will break up words that have punctuation in them, but to handle + # icu will break up words that have punctuation in them, but to handle # cases where user IDs have '-', '.' and '_' in them we want to *not* break # those into words and instead allow the DB to tokenise them how it wants. # diff --git a/synapse/synapse_rust/segmenter.pyi b/synapse/synapse_rust/segmenter.pyi new file mode 100644 index 0000000000..5f36765947 --- /dev/null +++ b/synapse/synapse_rust/segmenter.pyi @@ -0,0 +1,3 @@ +from typing import List + +def parse_words(text: str) -> List[str]: ... 
diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index c26932069f..f97ca12d84 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -32,10 +32,8 @@ from synapse.rest.client import login, register, room from synapse.server import HomeServer from synapse.storage import DataStore from synapse.storage.background_updates import _BackgroundUpdateHandler -from synapse.storage.databases.main import user_directory from synapse.storage.databases.main.user_directory import ( _parse_words_with_icu, - _parse_words_with_regex, ) from synapse.storage.roommember import ProfileInfo from synapse.util import Clock @@ -44,12 +42,6 @@ from tests.server import ThreadedMemoryReactorClock from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config -try: - import icu -except ImportError: - icu = None # type: ignore - - ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -438,8 +430,6 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase): class UserDirectoryStoreTestCase(HomeserverTestCase): - use_icu = False - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main @@ -451,12 +441,6 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) - self._restore_use_icu = user_directory.USE_ICU - user_directory.USE_ICU = self.use_icu - - def tearDown(self) -> None: - user_directory.USE_ICU = self._restore_use_icu - def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find # bob because bobby doesn't share a room with her. 
@@ -648,24 +632,14 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore -class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): - use_icu = True - - if not icu: - skip = "Requires PyICU" - - class UserDirectoryICUTestCase(HomeserverTestCase): - if not icu: - skip = "Requires PyICU" - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main self.user_dir_helper = GetUserDirectoryTables(self.store) def test_icu_word_boundary(self) -> None: - """Tests that we correctly detect word boundaries when ICU (International - Components for Unicode) support is available. + """Tests that we correctly detect word boundaries with ICU + (International Components for Unicode). """ display_name = "Gáo" @@ -714,12 +688,3 @@ class UserDirectoryICUTestCase(HomeserverTestCase): self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"]) self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"]) self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"]) - - def test_regex_word_boundary_punctuation(self) -> None: - """ - Tests the behaviour of punctuation with the non-ICU tokeniser - """ - self.assertEqual( - _parse_words_with_regex("lazy'fox jumped:over the.dog"), - ["lazy", "fox", "jumped", "over", "the", "dog"], - )