segmenter_sentence.rs (8040B)
1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16}; 11 12 #[cfg(feature = "buffer_provider")] 13 use crate::unstable::provider::ffi::DataProvider; 14 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 15 use crate::unstable::{errors::ffi::DataError, locale_core::ffi::Locale}; 16 17 #[diplomat::opaque] 18 /// An ICU4X sentence-break segmenter, capable of finding sentence breakpoints in strings. 19 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter, Struct)] 20 #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed, Struct, hidden)] 21 pub struct SentenceSegmenter(icu_segmenter::SentenceSegmenter); 22 23 #[diplomat::opaque] 24 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)] 25 pub struct SentenceBreakIteratorUtf8<'a>( 26 icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>, 27 ); 28 29 #[diplomat::opaque] 30 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)] 31 pub struct SentenceBreakIteratorUtf16<'a>( 32 icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, Utf16>, 33 ); 34 35 #[diplomat::opaque] 36 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)] 37 pub struct SentenceBreakIteratorLatin1<'a>( 38 icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, Latin1>, 39 ); 40 41 impl SentenceSegmenter { 42 /// Construct a [`SentenceSegmenter`] using compiled data. This does not assume any content locale. 43 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::new, FnInStruct)] 44 #[diplomat::rust_link( 45 icu::segmenter::options::SentenceBreakInvariantOptions, 46 Struct, 47 hidden 48 )] 49 #[diplomat::attr(auto, constructor)] 50 #[cfg(feature = "compiled_data")] 51 pub fn create() -> Box<SentenceSegmenter> { 52 Box::new(SentenceSegmenter( 53 icu_segmenter::SentenceSegmenter::new(Default::default()).static_to_owned(), 54 )) 55 } 56 /// Construct a [`SentenceSegmenter`] for content known to be of a given locale, using compiled data. 57 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)] 58 #[diplomat::rust_link(icu::segmenter::options::SentenceBreakOptions, Struct, hidden)] 59 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale")] 60 #[cfg(feature = "compiled_data")] 61 pub fn create_with_content_locale( 62 locale: &Locale, 63 ) -> Result<Box<SentenceSegmenter>, DataError> { 64 Ok(Box::new(SentenceSegmenter( 65 icu_segmenter::SentenceSegmenter::try_new(locale.into())?, 66 ))) 67 } 68 69 /// Construct a [`SentenceSegmenter`] for content known to be of a given locale, using a particular data source. 70 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)] 71 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale_and_provider")] 72 #[cfg(feature = "buffer_provider")] 73 pub fn create_with_content_locale_and_provider( 74 provider: &DataProvider, 75 locale: &Locale, 76 ) -> Result<Box<SentenceSegmenter>, DataError> { 77 Ok(Box::new(SentenceSegmenter( 78 icu_segmenter::SentenceSegmenter::try_new_with_buffer_provider( 79 provider.get()?, 80 locale.into(), 81 )?, 82 ))) 83 } 84 85 /// Segments a string. 86 /// 87 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 88 /// to the WHATWG Encoding Standard. 89 #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed::segment_utf8, FnInStruct)] 90 #[diplomat::rust_link( 91 icu::segmenter::SentenceSegmenterBorrowed::segment_str, 92 FnInStruct, 93 hidden 94 )] 95 #[diplomat::attr(not(supports = utf8_strings), disable)] 96 #[diplomat::attr(*, rename = "segment")] 97 pub fn segment_utf8<'a>( 98 &'a self, 99 input: &'a DiplomatStr, 100 ) -> Box<SentenceBreakIteratorUtf8<'a>> { 101 Box::new(SentenceBreakIteratorUtf8( 102 self.0.as_borrowed().segment_utf8(input), 103 )) 104 } 105 106 /// Segments a string. 107 /// 108 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 109 /// to the WHATWG Encoding Standard. 110 #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed::segment_utf16, FnInStruct)] 111 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 112 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] 113 pub fn segment_utf16<'a>( 114 &'a self, 115 input: &'a DiplomatStr16, 116 ) -> Box<SentenceBreakIteratorUtf16<'a>> { 117 Box::new(SentenceBreakIteratorUtf16( 118 self.0.as_borrowed().segment_utf16(input), 119 )) 120 } 121 122 /// Segments a Latin-1 string. 123 #[diplomat::rust_link( 124 icu::segmenter::SentenceSegmenterBorrowed::segment_latin1, 125 FnInStruct 126 )] 127 #[diplomat::attr(not(supports = utf8_strings), disable)] 128 pub fn segment_latin1<'a>( 129 &'a self, 130 input: &'a [u8], 131 ) -> Box<SentenceBreakIteratorLatin1<'a>> { 132 Box::new(SentenceBreakIteratorLatin1( 133 self.0.as_borrowed().segment_latin1(input), 134 )) 135 } 136 } 137 138 impl<'a> SentenceBreakIteratorUtf8<'a> { 139 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 140 /// out of range of a 32-bit signed integer. 141 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)] 142 pub fn next(&mut self) -> i32 { 143 self.0 144 .next() 145 .and_then(|u| i32::try_from(u).ok()) 146 .unwrap_or(-1) 147 } 148 } 149 150 impl<'a> SentenceBreakIteratorUtf16<'a> { 151 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 152 /// out of range of a 32-bit signed integer. 153 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)] 154 pub fn next(&mut self) -> i32 { 155 self.0 156 .next() 157 .and_then(|u| i32::try_from(u).ok()) 158 .unwrap_or(-1) 159 } 160 } 161 162 impl<'a> SentenceBreakIteratorLatin1<'a> { 163 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 164 /// out of range of a 32-bit signed integer. 165 #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)] 166 #[diplomat::rust_link( 167 icu::segmenter::iterators::SentenceBreakIterator::Item, 168 AssociatedTypeInStruct, 169 hidden 170 )] 171 pub fn next(&mut self) -> i32 { 172 self.0 173 .next() 174 .and_then(|u| i32::try_from(u).ok()) 175 .unwrap_or(-1) 176 } 177 } 178 } 179 180 impl<'a> From<&'a crate::unstable::locale_core::ffi::Locale> 181 for icu_segmenter::options::SentenceBreakOptions<'a> 182 { 183 fn from(other: &'a crate::unstable::locale_core::ffi::Locale) -> Self { 184 let mut options = icu_segmenter::options::SentenceBreakOptions::default(); 185 options.content_locale = Some(&other.0.id); 186 options 187 } 188 }