Replace PyICU with Rust icu_segmenter crate (#18553)

Co-authored-by: anoa's Codex Agent <codex@amorgan.xyz>
Co-authored-by: Quentin Gliech <quenting@element.io>
This commit is contained in:
Andrew Morgan
2025-07-03 11:12:12 +01:00
committed by GitHub
parent 832690e746
commit be4c95baf1
15 changed files with 70 additions and 136 deletions

View File

@@ -43,6 +43,7 @@ sha2 = "0.10.8"
serde = { version = "1.0.144", features = ["derive"] }
serde_json = "1.0.85"
ulid = "1.1.2"
icu_segmenter = "2.0.0"
reqwest = { version = "0.12.15", default-features = false, features = [
"http2",
"stream",

View File

@@ -13,6 +13,7 @@ pub mod identifier;
pub mod matrix_const;
pub mod push;
pub mod rendezvous;
pub mod segmenter;
lazy_static! {
static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init();
@@ -53,6 +54,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
events::register_module(py, m)?;
http_client::register_module(py, m)?;
rendezvous::register_module(py, m)?;
segmenter::register_module(py, m)?;
Ok(())
}

33
rust/src/segmenter.rs Normal file
View File

@@ -0,0 +1,33 @@
use icu_segmenter::options::WordBreakInvariantOptions;
use icu_segmenter::WordSegmenter;
use pyo3::prelude::*;
#[pyfunction]
pub fn parse_words(text: &str) -> PyResult<Vec<String>> {
let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
let mut parts = Vec::new();
let mut last = 0usize;
// `segment_str` gives us word boundaries as a vector of indexes. Use that
// to build a vector of words, and return.
for boundary in segmenter.segment_str(text) {
if boundary > last {
parts.push(text[last..boundary].to_string());
}
last = boundary;
}
Ok(parts)
}
pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
let child_module = PyModule::new(py, "segmenter")?;
child_module.add_function(wrap_pyfunction!(parse_words, m)?)?;
m.add_submodule(&child_module)?;
py.import("sys")?
.getattr("modules")?
.set_item("synapse.synapse_rust.segmenter", child_module)?;
Ok(())
}