segmenter_line.rs (16183B)
1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16}; 11 12 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 13 use crate::unstable::locale_core::ffi::Locale; 14 #[cfg(feature = "buffer_provider")] 15 use crate::unstable::{errors::ffi::DataError, provider::ffi::DataProvider}; 16 use diplomat_runtime::DiplomatOption; 17 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 18 use icu_segmenter::options::LineBreakOptions; 19 20 #[diplomat::opaque] 21 /// An ICU4X line-break segmenter, capable of finding breakpoints in strings. 22 #[diplomat::rust_link(icu::segmenter::LineSegmenter, Struct)] 23 #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed, Struct, hidden)] 24 pub struct LineSegmenter(icu_segmenter::LineSegmenter); 25 26 #[diplomat::rust_link(icu::segmenter::options::LineBreakStrictness, Enum)] 27 #[diplomat::enum_convert(icu_segmenter::options::LineBreakStrictness, needs_wildcard)] 28 pub enum LineBreakStrictness { 29 Loose, 30 Normal, 31 Strict, 32 Anywhere, 33 } 34 35 #[diplomat::rust_link(icu::segmenter::options::LineBreakWordOption, Enum)] 36 #[diplomat::enum_convert(icu_segmenter::options::LineBreakWordOption, needs_wildcard)] 37 pub enum LineBreakWordOption { 38 Normal, 39 BreakAll, 40 KeepAll, 41 } 42 43 #[diplomat::rust_link(icu::segmenter::options::LineBreakOptions, Struct)] 44 #[diplomat::attr(supports = non_exhaustive_structs, rename = "LineBreakOptions")] 45 pub struct LineBreakOptionsV2 { 46 pub strictness: DiplomatOption<LineBreakStrictness>, 47 pub word_option: DiplomatOption<LineBreakWordOption>, 48 } 49 50 #[diplomat::opaque] 51 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)] 52 pub struct LineBreakIteratorUtf8<'a>( 53 icu_segmenter::iterators::LineBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>, 54 ); 55 56 #[diplomat::opaque] 57 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)] 58 pub struct LineBreakIteratorUtf16<'a>( 59 icu_segmenter::iterators::LineBreakIterator<'a, 'a, Utf16>, 60 ); 61 62 #[diplomat::opaque] 63 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)] 64 pub struct LineBreakIteratorLatin1<'a>( 65 icu_segmenter::iterators::LineBreakIterator<'a, 'a, Latin1>, 66 ); 67 68 impl LineSegmenter { 69 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) using compiled data. It automatically loads the best 70 /// available payload data for Burmese, Khmer, Lao, and Thai. 71 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 72 #[diplomat::attr(auto, named_constructor = "auto")] 73 #[cfg(feature = "compiled_data")] 74 pub fn create_auto() -> Box<LineSegmenter> { 75 Box::new(LineSegmenter( 76 icu_segmenter::LineSegmenter::new_auto(Default::default()).static_to_owned(), 77 )) 78 } 79 80 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and LSTM payload data for 81 /// Burmese, Khmer, Lao, and Thai, using compiled data. 82 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 83 #[diplomat::attr(auto, named_constructor = "lstm")] 84 #[cfg(feature = "compiled_data")] 85 pub fn create_lstm() -> Box<LineSegmenter> { 86 Box::new(LineSegmenter( 87 icu_segmenter::LineSegmenter::new_lstm(Default::default()).static_to_owned(), 88 )) 89 } 90 91 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and dictionary payload data for 92 /// Burmese, Khmer, Lao, and Thai, using compiled data 93 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 94 #[diplomat::attr(auto, named_constructor = "dictionary")] 95 #[cfg(feature = "compiled_data")] 96 pub fn create_dictionary() -> Box<LineSegmenter> { 97 Box::new(LineSegmenter( 98 icu_segmenter::LineSegmenter::new_dictionary(Default::default()).static_to_owned(), 99 )) 100 } 101 102 /// Construct a [`LineSegmenter`] with custom options using compiled data. It automatically loads the best 103 /// available payload data for Burmese, Khmer, Lao, and Thai. 104 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 105 #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options")] 106 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "auto_with_options")] 107 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "auto_with_options_v2")] 108 #[cfg(feature = "compiled_data")] 109 pub fn create_auto_with_options_v2( 110 content_locale: Option<&Locale>, 111 options: LineBreakOptionsV2, 112 ) -> Box<LineSegmenter> { 113 let mut options: LineBreakOptions = options.into(); 114 options.content_locale = content_locale.map(|c| &c.0.id); 115 Box::new(LineSegmenter( 116 icu_segmenter::LineSegmenter::new_auto(options).static_to_owned(), 117 )) 118 } 119 /// Construct a [`LineSegmenter`] with custom options. It automatically loads the best 120 /// available payload data for Burmese, Khmer, Lao, and Thai, using a particular data source. 121 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 122 #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options_and_provider")] 123 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_and_provider")] 124 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_v2_and_provider")] 125 #[cfg(feature = "buffer_provider")] 126 pub fn create_auto_with_options_v2_and_provider( 127 provider: &DataProvider, 128 content_locale: Option<&Locale>, 129 options: LineBreakOptionsV2, 130 ) -> Result<Box<LineSegmenter>, DataError> { 131 let mut options: LineBreakOptions = options.into(); 132 options.content_locale = content_locale.map(|c| &c.0.id); 133 134 Ok(Box::new(LineSegmenter( 135 icu_segmenter::LineSegmenter::try_new_auto_with_buffer_provider( 136 provider.get()?, 137 options, 138 )?, 139 ))) 140 } 141 /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for 142 /// Burmese, Khmer, Lao, and Thai, using compiled data. 143 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 144 #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options")] 145 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "lstm_with_options")] 146 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "lstm_with_options_v2")] 147 #[cfg(feature = "compiled_data")] 148 pub fn create_lstm_with_options_v2( 149 content_locale: Option<&Locale>, 150 options: LineBreakOptionsV2, 151 ) -> Box<LineSegmenter> { 152 let mut options: LineBreakOptions = options.into(); 153 options.content_locale = content_locale.map(|c| &c.0.id); 154 155 Box::new(LineSegmenter( 156 icu_segmenter::LineSegmenter::new_lstm(options).static_to_owned(), 157 )) 158 } 159 /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for 160 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 161 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 162 #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options_and_provider")] 163 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_and_provider")] 164 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_v2_and_provider")] 165 #[cfg(feature = "buffer_provider")] 166 pub fn create_lstm_with_options_v2_and_provider( 167 provider: &DataProvider, 168 content_locale: Option<&Locale>, 169 options: LineBreakOptionsV2, 170 ) -> Result<Box<LineSegmenter>, DataError> { 171 let mut options: LineBreakOptions = options.into(); 172 options.content_locale = content_locale.map(|c| &c.0.id); 173 174 Ok(Box::new(LineSegmenter( 175 icu_segmenter::LineSegmenter::try_new_lstm_with_buffer_provider( 176 provider.get()?, 177 options, 178 )?, 179 ))) 180 } 181 /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for 182 /// Burmese, Khmer, Lao, and Thai, using compiled data. 183 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 184 #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options")] 185 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "dictionary_with_options")] 186 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "dictionary_with_options_v2")] 187 #[cfg(feature = "compiled_data")] 188 pub fn create_dictionary_with_options_v2( 189 content_locale: Option<&Locale>, 190 options: LineBreakOptionsV2, 191 ) -> Box<LineSegmenter> { 192 let mut options: LineBreakOptions = options.into(); 193 options.content_locale = content_locale.map(|c| &c.0.id); 194 195 Box::new(LineSegmenter( 196 icu_segmenter::LineSegmenter::new_dictionary(options).static_to_owned(), 197 )) 198 } 199 /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for 200 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 201 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 202 #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options_and_provider")] 203 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_and_provider")] 204 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_v2_and_provider")] 205 #[cfg(feature = "buffer_provider")] 206 pub fn create_dictionary_with_options_v2_and_provider( 207 provider: &DataProvider, 208 content_locale: Option<&Locale>, 209 options: LineBreakOptionsV2, 210 ) -> Result<Box<LineSegmenter>, DataError> { 211 let mut options: LineBreakOptions = options.into(); 212 options.content_locale = content_locale.map(|c| &c.0.id); 213 214 Ok(Box::new(LineSegmenter( 215 icu_segmenter::LineSegmenter::try_new_dictionary_with_buffer_provider( 216 provider.get()?, 217 options, 218 )?, 219 ))) 220 } 221 /// Segments a string. 222 /// 223 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 224 /// to the WHATWG Encoding Standard. 225 #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_utf8, FnInStruct)] 226 #[diplomat::rust_link( 227 icu::segmenter::LineSegmenterBorrowed::segment_str, 228 FnInStruct, 229 hidden 230 )] 231 #[diplomat::attr(not(supports = utf8_strings), disable)] 232 #[diplomat::attr(*, rename = "segment")] 233 pub fn segment_utf8<'a>( 234 &'a self, 235 input: &'a DiplomatStr, 236 ) -> Box<LineBreakIteratorUtf8<'a>> { 237 Box::new(LineBreakIteratorUtf8( 238 self.0.as_borrowed().segment_utf8(input), 239 )) 240 } 241 242 /// Segments a string. 243 /// 244 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 245 /// to the WHATWG Encoding Standard. 246 #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_utf16, FnInStruct)] 247 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 248 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] 249 pub fn segment_utf16<'a>( 250 &'a self, 251 input: &'a DiplomatStr16, 252 ) -> Box<LineBreakIteratorUtf16<'a>> { 253 Box::new(LineBreakIteratorUtf16( 254 self.0.as_borrowed().segment_utf16(input), 255 )) 256 } 257 258 /// Segments a Latin-1 string. 259 #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_latin1, FnInStruct)] 260 #[diplomat::attr(not(supports = utf8_strings), disable)] 261 pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<LineBreakIteratorLatin1<'a>> { 262 Box::new(LineBreakIteratorLatin1( 263 self.0.as_borrowed().segment_latin1(input), 264 )) 265 } 266 } 267 268 impl<'a> LineBreakIteratorUtf8<'a> { 269 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 270 /// out of range of a 32-bit signed integer. 271 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)] 272 pub fn next(&mut self) -> i32 { 273 self.0 274 .next() 275 .and_then(|u| i32::try_from(u).ok()) 276 .unwrap_or(-1) 277 } 278 } 279 280 impl<'a> LineBreakIteratorUtf16<'a> { 281 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 282 /// out of range of a 32-bit signed integer. 283 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)] 284 #[diplomat::rust_link( 285 icu::segmenter::iterators::LineBreakIterator::Item, 286 AssociatedTypeInStruct, 287 hidden 288 )] 289 pub fn next(&mut self) -> i32 { 290 self.0 291 .next() 292 .and_then(|u| i32::try_from(u).ok()) 293 .unwrap_or(-1) 294 } 295 } 296 297 impl<'a> LineBreakIteratorLatin1<'a> { 298 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 299 /// out of range of a 32-bit signed integer. 300 #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)] 301 #[diplomat::rust_link( 302 icu::segmenter::iterators::LineBreakIterator::Item, 303 AssociatedTypeInStruct, 304 hidden 305 )] 306 pub fn next(&mut self) -> i32 { 307 self.0 308 .next() 309 .and_then(|u| i32::try_from(u).ok()) 310 .unwrap_or(-1) 311 } 312 } 313 } 314 315 impl From<ffi::LineBreakOptionsV2> for icu_segmenter::options::LineBreakOptions<'_> { 316 fn from(other: ffi::LineBreakOptionsV2) -> Self { 317 let mut options = icu_segmenter::options::LineBreakOptions::default(); 318 options.strictness = other.strictness.into_converted_option(); 319 options.word_option = other.word_option.into_converted_option(); 320 options 321 } 322 }