tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

segmenter_line.rs (16183B)


      1 // This file is part of ICU4X. For terms of use, please see the file
      2 // called LICENSE at the top level of the ICU4X source tree
      3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
      4 
      5 #[diplomat::bridge]
      6 #[diplomat::abi_rename = "icu4x_{0}_mv1"]
      7 #[diplomat::attr(auto, namespace = "icu4x")]
      8 pub mod ffi {
      9    use alloc::boxed::Box;
     10    use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16};
     11 
     12    #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))]
     13    use crate::unstable::locale_core::ffi::Locale;
     14    #[cfg(feature = "buffer_provider")]
     15    use crate::unstable::{errors::ffi::DataError, provider::ffi::DataProvider};
     16    use diplomat_runtime::DiplomatOption;
     17    #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))]
     18    use icu_segmenter::options::LineBreakOptions;
     19 
     20    #[diplomat::opaque]
     21    /// An ICU4X line-break segmenter, capable of finding breakpoints in strings.
     22    #[diplomat::rust_link(icu::segmenter::LineSegmenter, Struct)]
     23    #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed, Struct, hidden)]
     24    pub struct LineSegmenter(icu_segmenter::LineSegmenter);
     25 
     26    #[diplomat::rust_link(icu::segmenter::options::LineBreakStrictness, Enum)]
     27    #[diplomat::enum_convert(icu_segmenter::options::LineBreakStrictness, needs_wildcard)]
     28    pub enum LineBreakStrictness {
     29        Loose,
     30        Normal,
     31        Strict,
     32        Anywhere,
     33    }
     34 
     35    #[diplomat::rust_link(icu::segmenter::options::LineBreakWordOption, Enum)]
     36    #[diplomat::enum_convert(icu_segmenter::options::LineBreakWordOption, needs_wildcard)]
     37    pub enum LineBreakWordOption {
     38        Normal,
     39        BreakAll,
     40        KeepAll,
     41    }
     42 
     43    #[diplomat::rust_link(icu::segmenter::options::LineBreakOptions, Struct)]
     44    #[diplomat::attr(supports = non_exhaustive_structs, rename = "LineBreakOptions")]
     45    pub struct LineBreakOptionsV2 {
     46        pub strictness: DiplomatOption<LineBreakStrictness>,
     47        pub word_option: DiplomatOption<LineBreakWordOption>,
     48    }
     49 
     50    #[diplomat::opaque]
     51    #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)]
     52    pub struct LineBreakIteratorUtf8<'a>(
     53        icu_segmenter::iterators::LineBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>,
     54    );
     55 
     56    #[diplomat::opaque]
     57    #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)]
     58    pub struct LineBreakIteratorUtf16<'a>(
     59        icu_segmenter::iterators::LineBreakIterator<'a, 'a, Utf16>,
     60    );
     61 
     62    #[diplomat::opaque]
     63    #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator, Struct)]
     64    pub struct LineBreakIteratorLatin1<'a>(
     65        icu_segmenter::iterators::LineBreakIterator<'a, 'a, Latin1>,
     66    );
     67 
     68    impl LineSegmenter {
     69        /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) using compiled data. It automatically loads the best
     70        /// available payload data for Burmese, Khmer, Lao, and Thai.
     71        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)]
     72        #[diplomat::attr(auto, named_constructor = "auto")]
     73        #[cfg(feature = "compiled_data")]
     74        pub fn create_auto() -> Box<LineSegmenter> {
     75            Box::new(LineSegmenter(
     76                icu_segmenter::LineSegmenter::new_auto(Default::default()).static_to_owned(),
     77            ))
     78        }
     79 
     80        /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and LSTM payload data for
     81        /// Burmese, Khmer, Lao, and Thai, using compiled data.
     82        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)]
     83        #[diplomat::attr(auto, named_constructor = "lstm")]
     84        #[cfg(feature = "compiled_data")]
     85        pub fn create_lstm() -> Box<LineSegmenter> {
     86            Box::new(LineSegmenter(
     87                icu_segmenter::LineSegmenter::new_lstm(Default::default()).static_to_owned(),
     88            ))
     89        }
     90 
     91        /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and dictionary payload data for
     92        /// Burmese, Khmer, Lao, and Thai, using compiled data
     93        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)]
     94        #[diplomat::attr(auto, named_constructor = "dictionary")]
     95        #[cfg(feature = "compiled_data")]
     96        pub fn create_dictionary() -> Box<LineSegmenter> {
     97            Box::new(LineSegmenter(
     98                icu_segmenter::LineSegmenter::new_dictionary(Default::default()).static_to_owned(),
     99            ))
    100        }
    101 
    102        /// Construct a [`LineSegmenter`] with custom options using compiled data. It automatically loads the best
    103        /// available payload data for Burmese, Khmer, Lao, and Thai.
    104        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)]
    105        #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options")]
    106        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "auto_with_options")]
    107        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "auto_with_options_v2")]
    108        #[cfg(feature = "compiled_data")]
    109        pub fn create_auto_with_options_v2(
    110            content_locale: Option<&Locale>,
    111            options: LineBreakOptionsV2,
    112        ) -> Box<LineSegmenter> {
    113            let mut options: LineBreakOptions = options.into();
    114            options.content_locale = content_locale.map(|c| &c.0.id);
    115            Box::new(LineSegmenter(
    116                icu_segmenter::LineSegmenter::new_auto(options).static_to_owned(),
    117            ))
    118        }
    119        /// Construct a [`LineSegmenter`] with custom options. It automatically loads the best
    120        /// available payload data for Burmese, Khmer, Lao, and Thai, using a particular data source.
    121        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)]
    122        #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options_and_provider")]
    123        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_and_provider")]
    124        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_v2_and_provider")]
    125        #[cfg(feature = "buffer_provider")]
    126        pub fn create_auto_with_options_v2_and_provider(
    127            provider: &DataProvider,
    128            content_locale: Option<&Locale>,
    129            options: LineBreakOptionsV2,
    130        ) -> Result<Box<LineSegmenter>, DataError> {
    131            let mut options: LineBreakOptions = options.into();
    132            options.content_locale = content_locale.map(|c| &c.0.id);
    133 
    134            Ok(Box::new(LineSegmenter(
    135                icu_segmenter::LineSegmenter::try_new_auto_with_buffer_provider(
    136                    provider.get()?,
    137                    options,
    138                )?,
    139            )))
    140        }
    141        /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for
    142        /// Burmese, Khmer, Lao, and Thai, using compiled data.
    143        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)]
    144        #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options")]
    145        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "lstm_with_options")]
    146        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "lstm_with_options_v2")]
    147        #[cfg(feature = "compiled_data")]
    148        pub fn create_lstm_with_options_v2(
    149            content_locale: Option<&Locale>,
    150            options: LineBreakOptionsV2,
    151        ) -> Box<LineSegmenter> {
    152            let mut options: LineBreakOptions = options.into();
    153            options.content_locale = content_locale.map(|c| &c.0.id);
    154 
    155            Box::new(LineSegmenter(
    156                icu_segmenter::LineSegmenter::new_lstm(options).static_to_owned(),
    157            ))
    158        }
    159        /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for
    160        /// Burmese, Khmer, Lao, and Thai, using a particular data source.
    161        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)]
    162        #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options_and_provider")]
    163        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_and_provider")]
    164        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_v2_and_provider")]
    165        #[cfg(feature = "buffer_provider")]
    166        pub fn create_lstm_with_options_v2_and_provider(
    167            provider: &DataProvider,
    168            content_locale: Option<&Locale>,
    169            options: LineBreakOptionsV2,
    170        ) -> Result<Box<LineSegmenter>, DataError> {
    171            let mut options: LineBreakOptions = options.into();
    172            options.content_locale = content_locale.map(|c| &c.0.id);
    173 
    174            Ok(Box::new(LineSegmenter(
    175                icu_segmenter::LineSegmenter::try_new_lstm_with_buffer_provider(
    176                    provider.get()?,
    177                    options,
    178                )?,
    179            )))
    180        }
    181        /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for
    182        /// Burmese, Khmer, Lao, and Thai, using compiled data.
    183        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)]
    184        #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options")]
    185        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "dictionary_with_options")]
    186        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "dictionary_with_options_v2")]
    187        #[cfg(feature = "compiled_data")]
    188        pub fn create_dictionary_with_options_v2(
    189            content_locale: Option<&Locale>,
    190            options: LineBreakOptionsV2,
    191        ) -> Box<LineSegmenter> {
    192            let mut options: LineBreakOptions = options.into();
    193            options.content_locale = content_locale.map(|c| &c.0.id);
    194 
    195            Box::new(LineSegmenter(
    196                icu_segmenter::LineSegmenter::new_dictionary(options).static_to_owned(),
    197            ))
    198        }
    199        /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for
    200        /// Burmese, Khmer, Lao, and Thai, using a particular data source.
    201        #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)]
    202        #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options_and_provider")]
    203        #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_and_provider")]
    204        #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_v2_and_provider")]
    205        #[cfg(feature = "buffer_provider")]
    206        pub fn create_dictionary_with_options_v2_and_provider(
    207            provider: &DataProvider,
    208            content_locale: Option<&Locale>,
    209            options: LineBreakOptionsV2,
    210        ) -> Result<Box<LineSegmenter>, DataError> {
    211            let mut options: LineBreakOptions = options.into();
    212            options.content_locale = content_locale.map(|c| &c.0.id);
    213 
    214            Ok(Box::new(LineSegmenter(
    215                icu_segmenter::LineSegmenter::try_new_dictionary_with_buffer_provider(
    216                    provider.get()?,
    217                    options,
    218                )?,
    219            )))
    220        }
    221        /// Segments a string.
    222        ///
    223        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
    224        /// to the WHATWG Encoding Standard.
    225        #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_utf8, FnInStruct)]
    226        #[diplomat::rust_link(
    227            icu::segmenter::LineSegmenterBorrowed::segment_str,
    228            FnInStruct,
    229            hidden
    230        )]
    231        #[diplomat::attr(not(supports = utf8_strings), disable)]
    232        #[diplomat::attr(*, rename = "segment")]
    233        pub fn segment_utf8<'a>(
    234            &'a self,
    235            input: &'a DiplomatStr,
    236        ) -> Box<LineBreakIteratorUtf8<'a>> {
    237            Box::new(LineBreakIteratorUtf8(
    238                self.0.as_borrowed().segment_utf8(input),
    239            ))
    240        }
    241 
    242        /// Segments a string.
    243        ///
    244        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
    245        /// to the WHATWG Encoding Standard.
    246        #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_utf16, FnInStruct)]
    247        #[diplomat::attr(not(supports = utf8_strings), rename = "segment")]
    248        #[diplomat::attr(supports = utf8_strings, rename = "segment16")]
    249        pub fn segment_utf16<'a>(
    250            &'a self,
    251            input: &'a DiplomatStr16,
    252        ) -> Box<LineBreakIteratorUtf16<'a>> {
    253            Box::new(LineBreakIteratorUtf16(
    254                self.0.as_borrowed().segment_utf16(input),
    255            ))
    256        }
    257 
    258        /// Segments a Latin-1 string.
    259        #[diplomat::rust_link(icu::segmenter::LineSegmenterBorrowed::segment_latin1, FnInStruct)]
    260        #[diplomat::attr(not(supports = utf8_strings), disable)]
    261        pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<LineBreakIteratorLatin1<'a>> {
    262            Box::new(LineBreakIteratorLatin1(
    263                self.0.as_borrowed().segment_latin1(input),
    264            ))
    265        }
    266    }
    267 
    268    impl<'a> LineBreakIteratorUtf8<'a> {
    269        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
    270        /// out of range of a 32-bit signed integer.
    271        #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)]
    272        pub fn next(&mut self) -> i32 {
    273            self.0
    274                .next()
    275                .and_then(|u| i32::try_from(u).ok())
    276                .unwrap_or(-1)
    277        }
    278    }
    279 
    280    impl<'a> LineBreakIteratorUtf16<'a> {
    281        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
    282        /// out of range of a 32-bit signed integer.
    283        #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)]
    284        #[diplomat::rust_link(
    285            icu::segmenter::iterators::LineBreakIterator::Item,
    286            AssociatedTypeInStruct,
    287            hidden
    288        )]
    289        pub fn next(&mut self) -> i32 {
    290            self.0
    291                .next()
    292                .and_then(|u| i32::try_from(u).ok())
    293                .unwrap_or(-1)
    294        }
    295    }
    296 
    297    impl<'a> LineBreakIteratorLatin1<'a> {
    298        /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
    299        /// out of range of a 32-bit signed integer.
    300        #[diplomat::rust_link(icu::segmenter::iterators::LineBreakIterator::next, FnInStruct)]
    301        #[diplomat::rust_link(
    302            icu::segmenter::iterators::LineBreakIterator::Item,
    303            AssociatedTypeInStruct,
    304            hidden
    305        )]
    306        pub fn next(&mut self) -> i32 {
    307            self.0
    308                .next()
    309                .and_then(|u| i32::try_from(u).ok())
    310                .unwrap_or(-1)
    311        }
    312    }
    313 }
    314 
    315 impl From<ffi::LineBreakOptionsV2> for icu_segmenter::options::LineBreakOptions<'_> {
    316    fn from(other: ffi::LineBreakOptionsV2) -> Self {
    317        let mut options = icu_segmenter::options::LineBreakOptions::default();
    318        options.strictness = other.strictness.into_converted_option();
    319        options.word_option = other.word_option.into_converted_option();
    320        options
    321    }
    322 }