segmenter_grapheme.rs (7236B)
1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16}; 11 12 #[cfg(feature = "buffer_provider")] 13 use crate::unstable::{errors::ffi::DataError, provider::ffi::DataProvider}; 14 15 #[diplomat::opaque] 16 /// An ICU4X grapheme-cluster-break segmenter, capable of finding grapheme cluster breakpoints 17 /// in strings. 18 #[diplomat::rust_link(icu::segmenter::GraphemeClusterSegmenter, Struct)] 19 #[diplomat::rust_link(icu::segmenter::GraphemeClusterSegmenterBorrowed, Struct, hidden)] 20 pub struct GraphemeClusterSegmenter(icu_segmenter::GraphemeClusterSegmenter); 21 22 #[diplomat::opaque] 23 #[diplomat::rust_link(icu::segmenter::iterators::GraphemeClusterBreakIterator, Struct)] 24 pub struct GraphemeClusterBreakIteratorUtf8<'a>( 25 icu_segmenter::iterators::GraphemeClusterBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>, 26 ); 27 28 #[diplomat::opaque] 29 #[diplomat::rust_link(icu::segmenter::iterators::GraphemeClusterBreakIterator, Struct)] 30 pub struct GraphemeClusterBreakIteratorUtf16<'a>( 31 icu_segmenter::iterators::GraphemeClusterBreakIterator<'a, 'a, Utf16>, 32 ); 33 34 #[diplomat::opaque] 35 #[diplomat::rust_link(icu::segmenter::iterators::GraphemeClusterBreakIterator, Struct)] 36 pub struct GraphemeClusterBreakIteratorLatin1<'a>( 37 icu_segmenter::iterators::GraphemeClusterBreakIterator<'a, 'a, Latin1>, 38 ); 39 40 impl GraphemeClusterSegmenter { 41 /// Construct an [`GraphemeClusterSegmenter`] using compiled data. 42 #[diplomat::rust_link(icu::segmenter::GraphemeClusterSegmenter::new, FnInStruct)] 43 #[diplomat::attr(auto, constructor)] 44 #[cfg(feature = "compiled_data")] 45 pub fn create() -> Box<GraphemeClusterSegmenter> { 46 Box::new(GraphemeClusterSegmenter( 47 icu_segmenter::GraphemeClusterSegmenter::new().static_to_owned(), 48 )) 49 } 50 /// Construct an [`GraphemeClusterSegmenter`]. 51 #[diplomat::rust_link(icu::segmenter::GraphemeClusterSegmenter::new, FnInStruct)] 52 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_provider")] 53 #[cfg(feature = "buffer_provider")] 54 pub fn create_with_provider( 55 provider: &DataProvider, 56 ) -> Result<Box<GraphemeClusterSegmenter>, DataError> { 57 Ok(Box::new(GraphemeClusterSegmenter( 58 icu_segmenter::GraphemeClusterSegmenter::try_new_with_buffer_provider( 59 provider.get()?, 60 )?, 61 ))) 62 } 63 /// Segments a string. 64 /// 65 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 66 /// to the WHATWG Encoding Standard. 67 #[diplomat::rust_link( 68 icu::segmenter::GraphemeClusterSegmenterBorrowed::segment_str, 69 FnInStruct, 70 hidden 71 )] 72 #[diplomat::rust_link( 73 icu::segmenter::GraphemeClusterSegmenterBorrowed::segment_utf8, 74 FnInStruct 75 )] 76 #[diplomat::attr(not(supports = utf8_strings), disable)] 77 #[diplomat::attr(*, rename = "segment")] 78 pub fn segment_utf8<'a>( 79 &'a self, 80 input: &'a DiplomatStr, 81 ) -> Box<GraphemeClusterBreakIteratorUtf8<'a>> { 82 Box::new(GraphemeClusterBreakIteratorUtf8( 83 self.0.as_borrowed().segment_utf8(input), 84 )) 85 } 86 87 /// Segments a string. 88 /// 89 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 90 /// to the WHATWG Encoding Standard. 91 #[diplomat::rust_link( 92 icu::segmenter::GraphemeClusterSegmenterBorrowed::segment_utf16, 93 FnInStruct 94 )] 95 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 96 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] 97 pub fn segment_utf16<'a>( 98 &'a self, 99 input: &'a DiplomatStr16, 100 ) -> Box<GraphemeClusterBreakIteratorUtf16<'a>> { 101 Box::new(GraphemeClusterBreakIteratorUtf16( 102 self.0.as_borrowed().segment_utf16(input), 103 )) 104 } 105 106 /// Segments a Latin-1 string. 107 #[diplomat::rust_link( 108 icu::segmenter::GraphemeClusterSegmenterBorrowed::segment_latin1, 109 FnInStruct 110 )] 111 #[diplomat::attr(not(supports = utf8_strings), disable)] 112 pub fn segment_latin1<'a>( 113 &'a self, 114 input: &'a [u8], 115 ) -> Box<GraphemeClusterBreakIteratorLatin1<'a>> { 116 Box::new(GraphemeClusterBreakIteratorLatin1( 117 self.0.as_borrowed().segment_latin1(input), 118 )) 119 } 120 } 121 122 impl<'a> GraphemeClusterBreakIteratorUtf8<'a> { 123 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 124 /// out of range of a 32-bit signed integer. 125 #[diplomat::rust_link( 126 icu::segmenter::iterators::GraphemeClusterBreakIterator::next, 127 FnInStruct 128 )] 129 #[diplomat::rust_link( 130 icu::segmenter::iterators::GraphemeClusterBreakIterator::Item, 131 AssociatedTypeInStruct, 132 hidden 133 )] 134 pub fn next(&mut self) -> i32 { 135 self.0 136 .next() 137 .and_then(|u| i32::try_from(u).ok()) 138 .unwrap_or(-1) 139 } 140 } 141 142 impl<'a> GraphemeClusterBreakIteratorUtf16<'a> { 143 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 144 /// out of range of a 32-bit signed integer. 145 #[diplomat::rust_link( 146 icu::segmenter::iterators::GraphemeClusterBreakIterator::next, 147 FnInStruct 148 )] 149 #[diplomat::rust_link( 150 icu::segmenter::iterators::GraphemeClusterBreakIterator::Item, 151 AssociatedTypeInStruct, 152 hidden 153 )] 154 pub fn next(&mut self) -> i32 { 155 self.0 156 .next() 157 .and_then(|u| i32::try_from(u).ok()) 158 .unwrap_or(-1) 159 } 160 } 161 162 impl<'a> GraphemeClusterBreakIteratorLatin1<'a> { 163 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 164 /// out of range of a 32-bit signed integer. 165 #[diplomat::rust_link( 166 icu::segmenter::iterators::GraphemeClusterBreakIterator::next, 167 FnInStruct 168 )] 169 #[diplomat::rust_link( 170 icu::segmenter::iterators::GraphemeClusterBreakIterator::Item, 171 AssociatedTypeInStruct, 172 hidden 173 )] 174 pub fn next(&mut self) -> i32 { 175 self.0 176 .next() 177 .and_then(|u| i32::try_from(u).ok()) 178 .unwrap_or(-1) 179 } 180 } 181 }