segmenter_word.rs (16826B)
1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16}; 11 12 #[cfg(feature = "buffer_provider")] 13 use crate::unstable::provider::ffi::DataProvider; 14 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 15 use crate::unstable::{errors::ffi::DataError, locale_core::ffi::Locale}; 16 17 #[diplomat::enum_convert(icu_segmenter::options::WordType, needs_wildcard)] 18 #[diplomat::rust_link(icu::segmenter::options::WordType, Enum)] 19 pub enum SegmenterWordType { 20 None = 0, 21 Number = 1, 22 Letter = 2, 23 } 24 25 #[diplomat::opaque] 26 /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings. 27 #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)] 28 #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed, Struct, hidden)] 29 #[diplomat::demo(custom_func = "../../npm/demo_gen_custom/WordSegmenter.mjs")] 30 pub struct WordSegmenter(icu_segmenter::WordSegmenter); 31 32 #[diplomat::opaque] 33 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)] 34 pub struct WordBreakIteratorUtf8<'a>( 35 icu_segmenter::iterators::WordBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>, 36 ); 37 38 #[diplomat::opaque] 39 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)] 40 pub struct WordBreakIteratorUtf16<'a>( 41 icu_segmenter::iterators::WordBreakIterator<'a, 'a, Utf16>, 42 ); 43 #[diplomat::opaque] 44 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)] 45 pub struct WordBreakIteratorLatin1<'a>( 46 icu_segmenter::iterators::WordBreakIterator<'a, 'a, Latin1>, 47 ); 48 49 impl SegmenterWordType { 50 #[diplomat::rust_link(icu::segmenter::options::WordType::is_word_like, FnInEnum)] 51 #[diplomat::attr(auto, getter)] 52 pub fn is_word_like(self) -> bool { 53 icu_segmenter::options::WordType::from(self).is_word_like() 54 } 55 } 56 57 impl WordSegmenter { 58 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 59 /// or dictionary payload data, using compiled data. This does not assume any content locale. 60 /// 61 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 62 /// Khmer, Lao, and Thai. 63 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)] 64 #[diplomat::rust_link(icu::segmenter::options::WordBreakInvariantOptions, Struct, hidden)] 65 #[diplomat::attr(auto, named_constructor = "auto")] 66 #[cfg(feature = "compiled_data")] 67 pub fn create_auto() -> Box<WordSegmenter> { 68 Box::new(WordSegmenter( 69 icu_segmenter::WordSegmenter::new_auto(Default::default()).static_to_owned(), 70 )) 71 } 72 73 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 74 /// or dictionary payload data, using compiled data. 75 /// 76 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 77 /// Khmer, Lao, and Thai. 78 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)] 79 #[diplomat::rust_link(icu::segmenter::options::WordBreakOptions, Struct, hidden)] 80 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale")] 81 #[cfg(feature = "compiled_data")] 82 pub fn create_auto_with_content_locale( 83 locale: &Locale, 84 ) -> Result<Box<WordSegmenter>, DataError> { 85 Ok(Box::new(WordSegmenter( 86 icu_segmenter::WordSegmenter::try_new_auto(locale.into())?, 87 ))) 88 } 89 90 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 91 /// or dictionary payload data, using a particular data source. 92 /// 93 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 94 /// Khmer, Lao, and Thai. 95 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)] 96 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale_and_provider")] 97 #[cfg(feature = "buffer_provider")] 98 pub fn create_auto_with_content_locale_and_provider( 99 provider: &DataProvider, 100 locale: &Locale, 101 ) -> Result<Box<WordSegmenter>, DataError> { 102 Ok(Box::new(WordSegmenter( 103 icu_segmenter::WordSegmenter::try_new_auto_with_buffer_provider( 104 provider.get()?, 105 locale.into(), 106 )?, 107 ))) 108 } 109 110 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 111 /// Thai, using compiled data. This does not assume any content locale. 112 /// 113 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 114 /// Khmer, Lao, and Thai. 115 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)] 116 #[diplomat::attr(auto, named_constructor = "lstm")] 117 #[cfg(feature = "compiled_data")] 118 pub fn create_lstm() -> Box<WordSegmenter> { 119 Box::new(WordSegmenter( 120 icu_segmenter::WordSegmenter::new_lstm(Default::default()).static_to_owned(), 121 )) 122 } 123 124 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 125 /// Thai, using compiled data. 126 /// 127 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 128 /// Khmer, Lao, and Thai. 129 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)] 130 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale")] 131 #[cfg(feature = "compiled_data")] 132 pub fn create_lstm_with_content_locale( 133 locale: &Locale, 134 ) -> Result<Box<WordSegmenter>, DataError> { 135 Ok(Box::new(WordSegmenter( 136 icu_segmenter::WordSegmenter::try_new_lstm(locale.into())?, 137 ))) 138 } 139 140 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 141 /// Thai, using a particular data source. 142 /// 143 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 144 /// Khmer, Lao, and Thai. 145 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)] 146 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale_and_provider")] 147 #[cfg(feature = "buffer_provider")] 148 pub fn create_lstm_with_content_locale_and_provider( 149 provider: &DataProvider, 150 locale: &Locale, 151 ) -> Result<Box<WordSegmenter>, DataError> { 152 Ok(Box::new(WordSegmenter( 153 icu_segmenter::WordSegmenter::try_new_lstm_with_buffer_provider( 154 provider.get()?, 155 locale.into(), 156 )?, 157 ))) 158 } 159 160 /// Construct an [`WordSegmenter`] with with dictionary payload data for Chinese, Japanese, 161 /// Burmese, Khmer, Lao, and Thai, using compiled data. This does not assume any content locale. 162 /// 163 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 164 /// Khmer, Lao, and Thai. 165 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)] 166 #[diplomat::attr(auto, named_constructor = "dictionary")] 167 #[cfg(feature = "compiled_data")] 168 pub fn create_dictionary() -> Box<WordSegmenter> { 169 Box::new(WordSegmenter( 170 icu_segmenter::WordSegmenter::new_dictionary(Default::default()).static_to_owned(), 171 )) 172 } 173 174 /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese, 175 /// Burmese, Khmer, Lao, and Thai, using compiled data. 176 /// 177 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 178 /// Khmer, Lao, and Thai. 179 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)] 180 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale")] 181 #[cfg(feature = "compiled_data")] 182 pub fn create_dictionary_with_content_locale( 183 locale: &Locale, 184 ) -> Result<Box<WordSegmenter>, DataError> { 185 Ok(Box::new(WordSegmenter( 186 icu_segmenter::WordSegmenter::try_new_dictionary(locale.into())?, 187 ))) 188 } 189 190 /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese, 191 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 192 /// 193 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 194 /// Khmer, Lao, and Thai. 195 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)] 196 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale_and_provider")] 197 #[cfg(feature = "buffer_provider")] 198 pub fn create_dictionary_with_content_locale_and_provider( 199 provider: &DataProvider, 200 locale: &Locale, 201 ) -> Result<Box<WordSegmenter>, DataError> { 202 Ok(Box::new(WordSegmenter( 203 icu_segmenter::WordSegmenter::try_new_dictionary_with_buffer_provider( 204 provider.get()?, 205 locale.into(), 206 )?, 207 ))) 208 } 209 /// Segments a string. 210 /// 211 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 212 /// to the WHATWG Encoding Standard. 213 #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_utf8, FnInStruct)] 214 #[diplomat::rust_link( 215 icu::segmenter::WordSegmenterBorrowed::segment_str, 216 FnInStruct, 217 hidden 218 )] 219 #[diplomat::attr(not(supports = utf8_strings), disable)] 220 #[diplomat::attr(*, rename = "segment")] 221 pub fn segment_utf8<'a>( 222 &'a self, 223 input: &'a DiplomatStr, 224 ) -> Box<WordBreakIteratorUtf8<'a>> { 225 Box::new(WordBreakIteratorUtf8( 226 self.0.as_borrowed().segment_utf8(input), 227 )) 228 } 229 230 /// Segments a string. 231 /// 232 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 233 /// to the WHATWG Encoding Standard. 234 #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_utf16, FnInStruct)] 235 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 236 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] 237 pub fn segment_utf16<'a>( 238 &'a self, 239 input: &'a DiplomatStr16, 240 ) -> Box<WordBreakIteratorUtf16<'a>> { 241 Box::new(WordBreakIteratorUtf16( 242 self.0.as_borrowed().segment_utf16(input), 243 )) 244 } 245 246 /// Segments a Latin-1 string. 247 #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_latin1, FnInStruct)] 248 #[diplomat::attr(not(supports = utf8_strings), disable)] 249 pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>> { 250 Box::new(WordBreakIteratorLatin1( 251 self.0.as_borrowed().segment_latin1(input), 252 )) 253 } 254 } 255 256 impl<'a> WordBreakIteratorUtf8<'a> { 257 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 258 /// out of range of a 32-bit signed integer. 259 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)] 260 pub fn next(&mut self) -> i32 { 261 self.0 262 .next() 263 .and_then(|u| i32::try_from(u).ok()) 264 .unwrap_or(-1) 265 } 266 267 /// Return the status value of break boundary. 268 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)] 269 #[diplomat::rust_link( 270 icu::segmenter::iterators::WordBreakIteratorWithWordType, 271 Struct, 272 hidden 273 )] 274 #[diplomat::rust_link( 275 icu::segmenter::iterators::WordBreakIteratorWithWordType::next, 276 FnInStruct, 277 hidden 278 )] 279 #[diplomat::attr(auto, getter)] 280 pub fn word_type(&self) -> SegmenterWordType { 281 self.0.word_type().into() 282 } 283 284 /// Return true when break boundary is word-like such as letter/number/CJK 285 #[diplomat::rust_link( 286 icu::segmenter::iterators::WordBreakIterator::is_word_like, 287 FnInStruct 288 )] 289 #[diplomat::attr(auto, getter)] 290 pub fn is_word_like(&self) -> bool { 291 self.0.is_word_like() 292 } 293 } 294 295 impl<'a> WordBreakIteratorUtf16<'a> { 296 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 297 /// out of range of a 32-bit signed integer. 298 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)] 299 #[diplomat::rust_link( 300 icu::segmenter::iterators::WordBreakIterator::Item, 301 AssociatedTypeInStruct, 302 hidden 303 )] 304 pub fn next(&mut self) -> i32 { 305 self.0 306 .next() 307 .and_then(|u| i32::try_from(u).ok()) 308 .unwrap_or(-1) 309 } 310 311 /// Return the status value of break boundary. 312 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)] 313 #[diplomat::rust_link( 314 icu::segmenter::iterators::WordBreakIterator::iter_with_word_type, 315 FnInStruct, 316 hidden 317 )] 318 #[diplomat::attr(auto, getter)] 319 pub fn word_type(&self) -> SegmenterWordType { 320 self.0.word_type().into() 321 } 322 323 /// Return true when break boundary is word-like such as letter/number/CJK 324 #[diplomat::rust_link( 325 icu::segmenter::iterators::WordBreakIterator::is_word_like, 326 FnInStruct 327 )] 328 #[diplomat::attr(auto, getter)] 329 pub fn is_word_like(&self) -> bool { 330 self.0.is_word_like() 331 } 332 } 333 334 impl<'a> WordBreakIteratorLatin1<'a> { 335 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 336 /// out of range of a 32-bit signed integer. 337 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)] 338 #[diplomat::rust_link( 339 icu::segmenter::iterators::WordBreakIterator::Item, 340 AssociatedTypeInStruct, 341 hidden 342 )] 343 pub fn next(&mut self) -> i32 { 344 self.0 345 .next() 346 .and_then(|u| i32::try_from(u).ok()) 347 .unwrap_or(-1) 348 } 349 350 /// Return the status value of break boundary. 351 #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)] 352 #[diplomat::attr(auto, getter)] 353 pub fn word_type(&self) -> SegmenterWordType { 354 self.0.word_type().into() 355 } 356 357 /// Return true when break boundary is word-like such as letter/number/CJK 358 #[diplomat::rust_link( 359 icu::segmenter::iterators::WordBreakIterator::is_word_like, 360 FnInStruct 361 )] 362 #[diplomat::attr(auto, getter)] 363 pub fn is_word_like(&self) -> bool { 364 self.0.is_word_like() 365 } 366 } 367 } 368 369 impl<'a> From<&'a crate::unstable::locale_core::ffi::Locale> 370 for icu_segmenter::options::WordBreakOptions<'a> 371 { 372 fn from(other: &'a crate::unstable::locale_core::ffi::Locale) -> Self { 373 let mut options = icu_segmenter::options::WordBreakOptions::default(); 374 options.content_locale = Some(&other.0.id); 375 options 376 } 377 }