Replace PyICU with Rust icu_segmenter crate (#18553)
Co-authored-by: anoa's Codex Agent <codex@amorgan.xyz> Co-authored-by: Quentin Gliech <quenting@element.io>
This commit is contained in:
1
changelog.d/18553.misc
Normal file
1
changelog.d/18553.misc
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Replace the `PyICU` Python library with the equivalent `icu_segmenter` Rust crate.
|
||||||
@@ -29,8 +29,6 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r
|
|||||||
|
|
||||||
Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`.
|
Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`.
|
||||||
|
|
||||||
Synapse has an optional, improved user search with better Unicode support. For that you need the development package of `libicu`. On Debian or Ubuntu Linux, this can be installed with `sudo apt install libicu-dev`.
|
|
||||||
|
|
||||||
The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git).
|
The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git).
|
||||||
|
|
||||||
For some tests, you will need [a recent version of Docker](https://docs.docker.com/get-docker/).
|
For some tests, you will need [a recent version of Docker](https://docs.docker.com/get-docker/).
|
||||||
|
|||||||
@@ -164,10 +164,7 @@ $ poetry cache clear --all .
|
|||||||
# including the wheel artifacts which is not covered by the above command
|
# including the wheel artifacts which is not covered by the above command
|
||||||
# (see https://github.com/python-poetry/poetry/issues/10304)
|
# (see https://github.com/python-poetry/poetry/issues/10304)
|
||||||
#
|
#
|
||||||
# This is necessary in order to rebuild or fetch new wheels. For example, if you update
|
# This is necessary in order to rebuild or fetch new wheels.
|
||||||
# the `icu` library in on your system, you will need to rebuild the PyICU Python package
|
|
||||||
# in order to incorporate the correct dynamically linked library locations otherwise you
|
|
||||||
# will run into errors like: `ImportError: libicui18n.so.75: cannot open shared object file: No such file or directory`
|
|
||||||
$ rm -rf $(poetry config cache-dir)
|
$ rm -rf $(poetry config cache-dir)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -286,7 +286,7 @@ Installing prerequisites on Ubuntu or Debian:
|
|||||||
```sh
|
```sh
|
||||||
sudo apt install build-essential python3-dev libffi-dev \
|
sudo apt install build-essential python3-dev libffi-dev \
|
||||||
python3-pip python3-setuptools sqlite3 \
|
python3-pip python3-setuptools sqlite3 \
|
||||||
libssl-dev virtualenv libjpeg-dev libxslt1-dev libicu-dev
|
libssl-dev virtualenv libjpeg-dev libxslt1-dev
|
||||||
```
|
```
|
||||||
|
|
||||||
##### ArchLinux
|
##### ArchLinux
|
||||||
@@ -295,7 +295,7 @@ Installing prerequisites on ArchLinux:
|
|||||||
|
|
||||||
```sh
|
```sh
|
||||||
sudo pacman -S base-devel python python-pip \
|
sudo pacman -S base-devel python python-pip \
|
||||||
python-setuptools python-virtualenv sqlite3 icu
|
python-setuptools python-virtualenv sqlite3
|
||||||
```
|
```
|
||||||
|
|
||||||
##### CentOS/Fedora
|
##### CentOS/Fedora
|
||||||
@@ -305,8 +305,7 @@ Installing prerequisites on CentOS or Fedora Linux:
|
|||||||
```sh
|
```sh
|
||||||
sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \
|
sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \
|
||||||
libwebp-devel libxml2-devel libxslt-devel libpq-devel \
|
libwebp-devel libxml2-devel libxslt-devel libpq-devel \
|
||||||
python3-virtualenv libffi-devel openssl-devel python3-devel \
|
python3-virtualenv libffi-devel openssl-devel python3-devel
|
||||||
libicu-devel
|
|
||||||
sudo dnf group install "Development Tools"
|
sudo dnf group install "Development Tools"
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -333,7 +332,7 @@ dnf install python3.12 python3.12-devel
|
|||||||
```
|
```
|
||||||
Finally, install common prerequisites
|
Finally, install common prerequisites
|
||||||
```bash
|
```bash
|
||||||
dnf install libicu libicu-devel libpq5 libpq5-devel lz4 pkgconf
|
dnf install libpq5 libpq5-devel lz4 pkgconf
|
||||||
dnf group install "Development Tools"
|
dnf group install "Development Tools"
|
||||||
```
|
```
|
||||||
###### Using venv module instead of virtualenv command
|
###### Using venv module instead of virtualenv command
|
||||||
@@ -365,20 +364,6 @@ xcode-select --install
|
|||||||
|
|
||||||
Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them.
|
Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them.
|
||||||
|
|
||||||
You may need to install icu, and make the icu binaries and libraries accessible.
|
|
||||||
Please follow [the official instructions of PyICU](https://pypi.org/project/PyICU/) to do so.
|
|
||||||
|
|
||||||
If you're struggling to get icu discovered, and see:
|
|
||||||
```
|
|
||||||
RuntimeError:
|
|
||||||
Please install pkg-config on your system or set the ICU_VERSION environment
|
|
||||||
variable to the version of ICU you have installed.
|
|
||||||
```
|
|
||||||
despite it being installed and having your `PATH` updated, you can omit this dependency by
|
|
||||||
not specifying `--extras all` to `poetry`. If using postgres, you can install Synapse via
|
|
||||||
`poetry install --extras saml2 --extras oidc --extras postgres --extras opentracing --extras redis --extras sentry`.
|
|
||||||
ICU is not a hard dependency on getting a working installation.
|
|
||||||
|
|
||||||
On ARM-based Macs you may also need to install libjpeg and libpq:
|
On ARM-based Macs you may also need to install libjpeg and libpq:
|
||||||
```sh
|
```sh
|
||||||
brew install jpeg libpq
|
brew install jpeg libpq
|
||||||
@@ -400,8 +385,7 @@ Installing prerequisites on openSUSE:
|
|||||||
```sh
|
```sh
|
||||||
sudo zypper in -t pattern devel_basis
|
sudo zypper in -t pattern devel_basis
|
||||||
sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \
|
sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \
|
||||||
python-devel libffi-devel libopenssl-devel libjpeg62-devel \
|
python-devel libffi-devel libopenssl-devel libjpeg62-devel
|
||||||
libicu-devel
|
|
||||||
```
|
```
|
||||||
|
|
||||||
##### OpenBSD
|
##### OpenBSD
|
||||||
|
|||||||
@@ -117,6 +117,13 @@ each upgrade are complete before moving on to the next upgrade, to avoid
|
|||||||
stacking them up. You can monitor the currently running background updates with
|
stacking them up. You can monitor the currently running background updates with
|
||||||
[the Admin API](usage/administration/admin_api/background_updates.html#status).
|
[the Admin API](usage/administration/admin_api/background_updates.html#status).
|
||||||
|
|
||||||
|
# Upgrading to v1.134.0
|
||||||
|
|
||||||
|
## ICU bundled with Synapse
|
||||||
|
|
||||||
|
Synapse now uses the Rust `icu_segmenter` crate for improved user search. Installing the
|
||||||
|
native ICU library on your system is no longer required.
|
||||||
|
|
||||||
# Upgrading to v1.130.0
|
# Upgrading to v1.130.0
|
||||||
|
|
||||||
## Documented endpoint which can be delegated to a federation worker
|
## Documented endpoint which can be delegated to a federation worker
|
||||||
|
|||||||
@@ -77,14 +77,11 @@ The user provided search term is lowercased and normalized using [NFKC](https://
|
|||||||
this treats the string as case-insensitive, canonicalizes different forms of the
|
this treats the string as case-insensitive, canonicalizes different forms of the
|
||||||
same text, and maps some "roughly equivalent" characters together.
|
same text, and maps some "roughly equivalent" characters together.
|
||||||
|
|
||||||
The search term is then split into words:
|
The search term is then split into segments using the [`icu_segmenter`
|
||||||
|
Rust crate](https://crates.io/crates/icu_segmenter). This crate ships with its
|
||||||
* If [ICU](https://en.wikipedia.org/wiki/International_Components_for_Unicode) is
|
own dictionary and Long Short-Term Memory (LSTM) machine learning models
|
||||||
available, then the system's [default locale](https://unicode-org.github.io/icu/userguide/locale/#default-locales)
|
per-language to segment words. Read more [in the crate's
|
||||||
will be used to break the search term into words. (See the
|
documentation](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_auto).
|
||||||
[installation instructions](setup/installation.md) for how to install ICU.)
|
|
||||||
* If unavailable, then runs of ASCII characters, numbers, underscores, and hyphens
|
|
||||||
are considered words.
|
|
||||||
|
|
||||||
The queries for PostgreSQL and SQLite are detailed below, but their overall goal
|
The queries for PostgreSQL and SQLite are detailed below, but their overall goal
|
||||||
is to find matching users, preferring users who are "real" (e.g. not bots,
|
is to find matching users, preferring users who are "real" (e.g. not bots,
|
||||||
|
|||||||
@@ -96,7 +96,6 @@
|
|||||||
gnumake
|
gnumake
|
||||||
|
|
||||||
# Native dependencies for running Synapse.
|
# Native dependencies for running Synapse.
|
||||||
icu
|
|
||||||
libffi
|
libffi
|
||||||
libjpeg
|
libjpeg
|
||||||
libpqxx
|
libpqxx
|
||||||
|
|||||||
17
poetry.lock
generated
17
poetry.lock
generated
@@ -1987,18 +1987,6 @@ files = [
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
plugins = ["importlib-metadata ; python_version < \"3.8\""]
|
plugins = ["importlib-metadata ; python_version < \"3.8\""]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pyicu"
|
|
||||||
version = "2.14"
|
|
||||||
description = "Python extension wrapping the ICU C++ API"
|
|
||||||
optional = true
|
|
||||||
python-versions = "*"
|
|
||||||
groups = ["main"]
|
|
||||||
markers = "extra == \"all\" or extra == \"user-search\""
|
|
||||||
files = [
|
|
||||||
{file = "PyICU-2.14.tar.gz", hash = "sha256:acc7eb92bd5c554ed577249c6978450a4feda0aa6f01470152b3a7b382a02132"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyjwt"
|
name = "pyjwt"
|
||||||
version = "2.6.0"
|
version = "2.6.0"
|
||||||
@@ -3393,7 +3381,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
|
|||||||
test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]
|
test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"]
|
all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "sentry-sdk", "txredisapi"]
|
||||||
cache-memory = ["Pympler"]
|
cache-memory = ["Pympler"]
|
||||||
jwt = ["authlib"]
|
jwt = ["authlib"]
|
||||||
matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]
|
matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]
|
||||||
@@ -3406,9 +3394,8 @@ sentry = ["sentry-sdk"]
|
|||||||
systemd = ["systemd-python"]
|
systemd = ["systemd-python"]
|
||||||
test = ["idna", "parameterized"]
|
test = ["idna", "parameterized"]
|
||||||
url-preview = ["lxml"]
|
url-preview = ["lxml"]
|
||||||
user-search = ["pyicu"]
|
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.1"
|
lock-version = "2.1"
|
||||||
python-versions = "^3.9.0"
|
python-versions = "^3.9.0"
|
||||||
content-hash = "9824e42dfc0e128129ee0c8641f7fe639bf47574cdd3f052dd995941abc6e44b"
|
content-hash = "457f188ae22af9663b2ed21f2586720ce5014edc7c34a697787f16aad733ea41"
|
||||||
|
|||||||
@@ -254,7 +254,6 @@ hiredis = { version = "*", optional = true }
|
|||||||
Pympler = { version = "*", optional = true }
|
Pympler = { version = "*", optional = true }
|
||||||
parameterized = { version = ">=0.7.4", optional = true }
|
parameterized = { version = ">=0.7.4", optional = true }
|
||||||
idna = { version = ">=2.5", optional = true }
|
idna = { version = ">=2.5", optional = true }
|
||||||
pyicu = { version = ">=2.10.2", optional = true }
|
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
|
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
|
||||||
@@ -277,10 +276,6 @@ redis = ["txredisapi", "hiredis"]
|
|||||||
# Required to use experimental `caches.track_memory_usage` config option.
|
# Required to use experimental `caches.track_memory_usage` config option.
|
||||||
cache-memory = ["pympler"]
|
cache-memory = ["pympler"]
|
||||||
test = ["parameterized", "idna"]
|
test = ["parameterized", "idna"]
|
||||||
# Allows for better search for international characters in the user directory. This
|
|
||||||
# requires libicu's development headers installed on the system (e.g. libicu-dev on
|
|
||||||
# Debian-based distributions).
|
|
||||||
user-search = ["pyicu"]
|
|
||||||
|
|
||||||
# The duplication here is awful. I hate hate hate hate hate it. However, for now I want
|
# The duplication here is awful. I hate hate hate hate hate it. However, for now I want
|
||||||
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
|
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
|
||||||
@@ -312,8 +307,6 @@ all = [
|
|||||||
"txredisapi", "hiredis",
|
"txredisapi", "hiredis",
|
||||||
# cache-memory
|
# cache-memory
|
||||||
"pympler",
|
"pympler",
|
||||||
# improved user search
|
|
||||||
"pyicu",
|
|
||||||
# omitted:
|
# omitted:
|
||||||
# - test: it's useful to have this separate from dev deps in the olddeps job
|
# - test: it's useful to have this separate from dev deps in the olddeps job
|
||||||
# - systemd: this is a system-based requirement
|
# - systemd: this is a system-based requirement
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ sha2 = "0.10.8"
|
|||||||
serde = { version = "1.0.144", features = ["derive"] }
|
serde = { version = "1.0.144", features = ["derive"] }
|
||||||
serde_json = "1.0.85"
|
serde_json = "1.0.85"
|
||||||
ulid = "1.1.2"
|
ulid = "1.1.2"
|
||||||
|
icu_segmenter = "2.0.0"
|
||||||
reqwest = { version = "0.12.15", default-features = false, features = [
|
reqwest = { version = "0.12.15", default-features = false, features = [
|
||||||
"http2",
|
"http2",
|
||||||
"stream",
|
"stream",
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ pub mod identifier;
|
|||||||
pub mod matrix_const;
|
pub mod matrix_const;
|
||||||
pub mod push;
|
pub mod push;
|
||||||
pub mod rendezvous;
|
pub mod rendezvous;
|
||||||
|
pub mod segmenter;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init();
|
static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init();
|
||||||
@@ -53,6 +54,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|||||||
events::register_module(py, m)?;
|
events::register_module(py, m)?;
|
||||||
http_client::register_module(py, m)?;
|
http_client::register_module(py, m)?;
|
||||||
rendezvous::register_module(py, m)?;
|
rendezvous::register_module(py, m)?;
|
||||||
|
segmenter::register_module(py, m)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
33
rust/src/segmenter.rs
Normal file
33
rust/src/segmenter.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
use icu_segmenter::options::WordBreakInvariantOptions;
|
||||||
|
use icu_segmenter::WordSegmenter;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
/// Split `text` into consecutive segments at the boundaries reported by
/// `icu_segmenter::WordSegmenter`.
///
/// Every non-empty span between two successive boundaries is copied into an
/// owned `String` and returned in order. Nothing is filtered out here, so the
/// result may contain spans that are not words in the everyday sense
/// (presumably whitespace and punctuation runs; confirm against the
/// `icu_segmenter` documentation). Callers must do any further clean-up.
///
/// A fresh segmenter is constructed on every call. The body only ever returns
/// `Ok`; the `PyResult` wrapper is part of the signature exposed to Python.
#[pyfunction]
|
||||||
|
pub fn parse_words(text: &str) -> PyResult<Vec<String>> {
|
||||||
|
let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
|
||||||
|
let mut parts = Vec::new();
|
||||||
|
let mut last = 0usize;
|
||||||
|
|
||||||
|
// `segment_str` yields word boundaries as byte indexes into `text`. Slice
|
||||||
|
// between consecutive boundaries, skipping empty spans, to build the result.
|
||||||
|
for boundary in segmenter.segment_str(text) {
|
||||||
|
if boundary > last {
|
||||||
|
parts.push(text[last..boundary].to_string());
|
||||||
|
}
|
||||||
|
last = boundary;
|
||||||
|
}
|
||||||
|
Ok(parts)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create the `segmenter` Python submodule, add `parse_words` to it, and attach
/// it to the parent module `m`.
///
/// The submodule is also inserted into `sys.modules` under the key
/// `synapse.synapse_rust.segmenter`, presumably so that a plain
/// `import synapse.synapse_rust.segmenter` resolves to it (verify against the
/// other `register_module` functions in this crate).
///
/// Any Python error raised while creating or registering the module is
/// propagated to the caller via `?`.
pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||||
|
let child_module = PyModule::new(py, "segmenter")?;
|
||||||
|
child_module.add_function(wrap_pyfunction!(parse_words, m)?)?;
|
||||||
|
|
||||||
|
m.add_submodule(&child_module)?;
|
||||||
|
|
||||||
|
py.import("sys")?
|
||||||
|
.getattr("modules")?
|
||||||
|
.set_item("synapse.synapse_rust.segmenter", child_module)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -37,16 +37,8 @@ from typing import (
|
|||||||
|
|
||||||
import attr
|
import attr
|
||||||
|
|
||||||
try:
|
|
||||||
# Figure out if ICU support is available for searching users.
|
|
||||||
import icu
|
|
||||||
|
|
||||||
USE_ICU = True
|
|
||||||
except ModuleNotFoundError:
|
|
||||||
# except ModuleNotFoundError:
|
|
||||||
USE_ICU = False
|
|
||||||
|
|
||||||
from synapse.api.errors import StoreError
|
from synapse.api.errors import StoreError
|
||||||
|
from synapse.synapse_rust import segmenter as icu
|
||||||
from synapse.util.stringutils import non_null_str_or_none
|
from synapse.util.stringutils import non_null_str_or_none
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -1226,7 +1218,7 @@ def _filter_text_for_index(text: str) -> str:
|
|||||||
|
|
||||||
def _parse_query_sqlite(search_term: str) -> str:
|
def _parse_query_sqlite(search_term: str) -> str:
|
||||||
"""Takes a plain unicode string from the user and converts it into a form
|
"""Takes a plain unicode string from the user and converts it into a form
|
||||||
that can be passed to database.
|
that can be passed to the database.
|
||||||
We use this so that we can add prefix matching, which isn't something
|
We use this so that we can add prefix matching, which isn't something
|
||||||
that is supported by default.
|
that is supported by default.
|
||||||
|
|
||||||
@@ -1242,7 +1234,7 @@ def _parse_query_sqlite(search_term: str) -> str:
|
|||||||
|
|
||||||
def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
|
def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
|
||||||
"""Takes a plain unicode string from the user and converts it into a form
|
"""Takes a plain unicode string from the user and converts it into a form
|
||||||
that can be passed to database.
|
that can be passed to the database.
|
||||||
We use this so that we can add prefix matching, which isn't something
|
We use this so that we can add prefix matching, which isn't something
|
||||||
that is supported by default.
|
that is supported by default.
|
||||||
"""
|
"""
|
||||||
@@ -1272,12 +1264,7 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_words(search_term: str) -> List[str]:
|
def _parse_words(search_term: str) -> List[str]:
|
||||||
"""Split the provided search string into a list of its words.
|
"""Split the provided search string into a list of its words using ICU.
|
||||||
|
|
||||||
If support for ICU (International Components for Unicode) is available, use it.
|
|
||||||
Otherwise, fall back to using a regex to detect word boundaries. This latter
|
|
||||||
solution works well enough for most latin-based languages, but doesn't work as well
|
|
||||||
with other languages.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
search_term: The search string.
|
search_term: The search string.
|
||||||
@@ -1285,19 +1272,8 @@ def _parse_words(search_term: str) -> List[str]:
|
|||||||
Returns:
|
Returns:
|
||||||
A list of the words in the search string.
|
A list of the words in the search string.
|
||||||
"""
|
"""
|
||||||
if USE_ICU:
|
|
||||||
return _parse_words_with_icu(search_term)
|
return _parse_words_with_icu(search_term)
|
||||||
|
|
||||||
return _parse_words_with_regex(search_term)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_words_with_regex(search_term: str) -> List[str]:
|
|
||||||
"""
|
|
||||||
Break down search term into words, when we don't have ICU available.
|
|
||||||
See: `_parse_words`
|
|
||||||
"""
|
|
||||||
return re.findall(r"([\w-]+)", search_term, re.UNICODE)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_words_with_icu(search_term: str) -> List[str]:
|
def _parse_words_with_icu(search_term: str) -> List[str]:
|
||||||
"""Break down the provided search string into its individual words using ICU
|
"""Break down the provided search string into its individual words using ICU
|
||||||
@@ -1310,22 +1286,13 @@ def _parse_words_with_icu(search_term: str) -> List[str]:
|
|||||||
A list of the words in the search string.
|
A list of the words in the search string.
|
||||||
"""
|
"""
|
||||||
results = []
|
results = []
|
||||||
breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
|
for part in icu.parse_words(search_term):
|
||||||
breaker.setText(search_term)
|
|
||||||
i = 0
|
|
||||||
while True:
|
|
||||||
j = breaker.nextBoundary()
|
|
||||||
if j < 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
# We want to make sure that we split on `@` and `:` specifically, as
|
# We want to make sure that we split on `@` and `:` specifically, as
|
||||||
# they occur in user IDs.
|
# they occur in user IDs.
|
||||||
for result in re.split(r"[@:]+", search_term[i:j]):
|
for result in re.split(r"[@:]+", part):
|
||||||
results.append(result.strip())
|
results.append(result.strip())
|
||||||
|
|
||||||
i = j
|
# icu will break up words that have punctuation in them, but to handle
|
||||||
|
|
||||||
# libicu will break up words that have punctuation in them, but to handle
|
|
||||||
# cases where user IDs have '-', '.' and '_' in them we want to *not* break
|
# cases where user IDs have '-', '.' and '_' in them we want to *not* break
|
||||||
# those into words and instead allow the DB to tokenise them how it wants.
|
# those into words and instead allow the DB to tokenise them how it wants.
|
||||||
#
|
#
|
||||||
|
|||||||
3
synapse/synapse_rust/segmenter.pyi
Normal file
3
synapse/synapse_rust/segmenter.pyi
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
# Type stub for the Rust-implemented `parse_words` (see rust/src/segmenter.rs),
# which is registered on the Python side as `synapse.synapse_rust.segmenter`.
def parse_words(text: str) -> List[str]: ...
|
||||||
@@ -32,10 +32,8 @@ from synapse.rest.client import login, register, room
|
|||||||
from synapse.server import HomeServer
|
from synapse.server import HomeServer
|
||||||
from synapse.storage import DataStore
|
from synapse.storage import DataStore
|
||||||
from synapse.storage.background_updates import _BackgroundUpdateHandler
|
from synapse.storage.background_updates import _BackgroundUpdateHandler
|
||||||
from synapse.storage.databases.main import user_directory
|
|
||||||
from synapse.storage.databases.main.user_directory import (
|
from synapse.storage.databases.main.user_directory import (
|
||||||
_parse_words_with_icu,
|
_parse_words_with_icu,
|
||||||
_parse_words_with_regex,
|
|
||||||
)
|
)
|
||||||
from synapse.storage.roommember import ProfileInfo
|
from synapse.storage.roommember import ProfileInfo
|
||||||
from synapse.util import Clock
|
from synapse.util import Clock
|
||||||
@@ -44,12 +42,6 @@ from tests.server import ThreadedMemoryReactorClock
|
|||||||
from tests.test_utils.event_injection import inject_member_event
|
from tests.test_utils.event_injection import inject_member_event
|
||||||
from tests.unittest import HomeserverTestCase, override_config
|
from tests.unittest import HomeserverTestCase, override_config
|
||||||
|
|
||||||
try:
|
|
||||||
import icu
|
|
||||||
except ImportError:
|
|
||||||
icu = None # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
ALICE = "@alice:a"
|
ALICE = "@alice:a"
|
||||||
BOB = "@bob:b"
|
BOB = "@bob:b"
|
||||||
BOBBY = "@bobby:a"
|
BOBBY = "@bobby:a"
|
||||||
@@ -438,8 +430,6 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase):
|
|||||||
|
|
||||||
|
|
||||||
class UserDirectoryStoreTestCase(HomeserverTestCase):
|
class UserDirectoryStoreTestCase(HomeserverTestCase):
|
||||||
use_icu = False
|
|
||||||
|
|
||||||
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
|
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
|
||||||
self.store = hs.get_datastores().main
|
self.store = hs.get_datastores().main
|
||||||
|
|
||||||
@@ -451,12 +441,6 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
|
|||||||
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
|
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
|
||||||
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))
|
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))
|
||||||
|
|
||||||
self._restore_use_icu = user_directory.USE_ICU
|
|
||||||
user_directory.USE_ICU = self.use_icu
|
|
||||||
|
|
||||||
def tearDown(self) -> None:
|
|
||||||
user_directory.USE_ICU = self._restore_use_icu
|
|
||||||
|
|
||||||
def test_search_user_dir(self) -> None:
|
def test_search_user_dir(self) -> None:
|
||||||
# normally when alice searches the directory she should just find
|
# normally when alice searches the directory she should just find
|
||||||
# bob because bobby doesn't share a room with her.
|
# bob because bobby doesn't share a room with her.
|
||||||
@@ -648,24 +632,14 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
|
|||||||
test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore
|
test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore
|
||||||
|
|
||||||
|
|
||||||
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
|
|
||||||
use_icu = True
|
|
||||||
|
|
||||||
if not icu:
|
|
||||||
skip = "Requires PyICU"
|
|
||||||
|
|
||||||
|
|
||||||
class UserDirectoryICUTestCase(HomeserverTestCase):
|
class UserDirectoryICUTestCase(HomeserverTestCase):
|
||||||
if not icu:
|
|
||||||
skip = "Requires PyICU"
|
|
||||||
|
|
||||||
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
|
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
|
||||||
self.store = hs.get_datastores().main
|
self.store = hs.get_datastores().main
|
||||||
self.user_dir_helper = GetUserDirectoryTables(self.store)
|
self.user_dir_helper = GetUserDirectoryTables(self.store)
|
||||||
|
|
||||||
def test_icu_word_boundary(self) -> None:
|
def test_icu_word_boundary(self) -> None:
|
||||||
"""Tests that we correctly detect word boundaries when ICU (International
|
"""Tests that we correctly detect word boundaries with ICU
|
||||||
Components for Unicode) support is available.
|
(International Components for Unicode).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
display_name = "Gáo"
|
display_name = "Gáo"
|
||||||
@@ -714,12 +688,3 @@ class UserDirectoryICUTestCase(HomeserverTestCase):
|
|||||||
self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"])
|
self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"])
|
||||||
self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"])
|
self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"])
|
||||||
self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"])
|
self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"])
|
||||||
|
|
||||||
def test_regex_word_boundary_punctuation(self) -> None:
|
|
||||||
"""
|
|
||||||
Tests the behaviour of punctuation with the non-ICU tokeniser
|
|
||||||
"""
|
|
||||||
self.assertEqual(
|
|
||||||
_parse_words_with_regex("lazy'fox jumped:over the.dog"),
|
|
||||||
["lazy", "fox", "jumped", "over", "the", "dog"],
|
|
||||||
)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user