Replace PyICU with Rust icu_segmenter crate (#18553)
Co-authored-by: anoa's Codex Agent <codex@amorgan.xyz> Co-authored-by: Quentin Gliech <quenting@element.io>
This commit is contained in:
@@ -43,6 +43,7 @@ sha2 = "0.10.8"
|
||||
serde = { version = "1.0.144", features = ["derive"] }
|
||||
serde_json = "1.0.85"
|
||||
ulid = "1.1.2"
|
||||
icu_segmenter = "2.0.0"
|
||||
reqwest = { version = "0.12.15", default-features = false, features = [
|
||||
"http2",
|
||||
"stream",
|
||||
|
||||
@@ -13,6 +13,7 @@ pub mod identifier;
|
||||
pub mod matrix_const;
|
||||
pub mod push;
|
||||
pub mod rendezvous;
|
||||
pub mod segmenter;
|
||||
|
||||
lazy_static! {
|
||||
static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init();
|
||||
@@ -53,6 +54,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
events::register_module(py, m)?;
|
||||
http_client::register_module(py, m)?;
|
||||
rendezvous::register_module(py, m)?;
|
||||
segmenter::register_module(py, m)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
33
rust/src/segmenter.rs
Normal file
33
rust/src/segmenter.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use icu_segmenter::options::WordBreakInvariantOptions;
|
||||
use icu_segmenter::WordSegmenter;
|
||||
use pyo3::prelude::*;
|
||||
|
||||
#[pyfunction]
|
||||
pub fn parse_words(text: &str) -> PyResult<Vec<String>> {
|
||||
let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
|
||||
let mut parts = Vec::new();
|
||||
let mut last = 0usize;
|
||||
|
||||
// `segment_str` gives us word boundaries as a vector of indexes. Use that
|
||||
// to build a vector of words, and return.
|
||||
for boundary in segmenter.segment_str(text) {
|
||||
if boundary > last {
|
||||
parts.push(text[last..boundary].to_string());
|
||||
}
|
||||
last = boundary;
|
||||
}
|
||||
Ok(parts)
|
||||
}
|
||||
|
||||
pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
let child_module = PyModule::new(py, "segmenter")?;
|
||||
child_module.add_function(wrap_pyfunction!(parse_words, m)?)?;
|
||||
|
||||
m.add_submodule(&child_module)?;
|
||||
|
||||
py.import("sys")?
|
||||
.getattr("modules")?
|
||||
.set_item("synapse.synapse_rust.segmenter", child_module)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user