tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

segmenter_word.rs (16826B)


      1 // This file is part of ICU4X. For terms of use, please see the file
      2 // called LICENSE at the top level of the ICU4X source tree
      3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
      4 
      5 #[diplomat::bridge]
      6 #[diplomat::abi_rename = "icu4x_{0}_mv1"]
      7 #[diplomat::attr(auto, namespace = "icu4x")]
      8 pub mod ffi {
      9    use alloc::boxed::Box;
     10    use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16};
     11 
     12    #[cfg(feature = "buffer_provider")]
     13    use crate::unstable::provider::ffi::DataProvider;
     14    #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))]
     15    use crate::unstable::{errors::ffi::DataError, locale_core::ffi::Locale};
     16 
     17    #[diplomat::enum_convert(icu_segmenter::options::WordType, needs_wildcard)]
     18    #[diplomat::rust_link(icu::segmenter::options::WordType, Enum)]
     19    pub enum SegmenterWordType { // FFI-facing counterpart of `icu_segmenter::options::WordType`.
     20        None = 0, // NOTE(review): presumably "no word-type tag"; confirm against `WordType`.
     21        Number = 1, // NOTE(review): presumably a numeric segment; confirm against `WordType`.
     22        Letter = 2, // NOTE(review): presumably a letter segment; confirm against `WordType`.
     23    }
     24 
     25    #[diplomat::opaque]
     26    /// An ICU4X word-break segmenter: finds word breakpoints in UTF-8, UTF-16, or Latin-1 text.
     27    #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)]
     28    #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed, Struct, hidden)]
     29    #[diplomat::demo(custom_func = "../../npm/demo_gen_custom/WordSegmenter.mjs")]
     30    pub struct WordSegmenter(icu_segmenter::WordSegmenter); // wraps the upstream segmenter
     31 
     32    #[diplomat::opaque]
     33    #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)]
     34    pub struct WordBreakIteratorUtf8<'a>(
     35        icu_segmenter::iterators::WordBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>,
     36    ); // Breakpoint iterator returned by `WordSegmenter::segment_utf8`; borrows for `'a`.
     37 
     38    #[diplomat::opaque]
     39    #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)]
     40    pub struct WordBreakIteratorUtf16<'a>(
     41        icu_segmenter::iterators::WordBreakIterator<'a, 'a, Utf16>,
     42    ); // Breakpoint iterator returned by `WordSegmenter::segment_utf16`; borrows for `'a`.
     43    #[diplomat::opaque]
     44    #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator, Struct)]
     45    pub struct WordBreakIteratorLatin1<'a>(
     46        icu_segmenter::iterators::WordBreakIterator<'a, 'a, Latin1>,
     47    ); // Breakpoint iterator returned by `WordSegmenter::segment_latin1`; borrows for `'a`.
     48 
     49    impl SegmenterWordType {
     50        #[diplomat::rust_link(icu::segmenter::options::WordType::is_word_like, FnInEnum)]
     51        #[diplomat::attr(auto, getter)]
     52        pub fn is_word_like(self) -> bool {
     53            Into::<icu_segmenter::options::WordType>::into(self).is_word_like()
     54        }
     55    }
     56 
     57    impl WordSegmenter {
     58        /// Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     59        /// or dictionary payload data, using compiled data. This does not assume any content locale.
     60        ///
     61        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     62        /// Khmer, Lao, and Thai.
     63        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)]
     64        #[diplomat::rust_link(icu::segmenter::options::WordBreakInvariantOptions, Struct, hidden)]
     65        #[diplomat::attr(auto, named_constructor = "auto")]
     66        #[cfg(feature = "compiled_data")]
     67        pub fn create_auto() -> Box<WordSegmenter> {
     68            // Convert the compiled-data segmenter into the owned form held by this wrapper.
     69            let segmenter = icu_segmenter::WordSegmenter::new_auto(Default::default());
     70            Box::new(WordSegmenter(segmenter.static_to_owned()))
     71        }
     72 
     73        /// Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     74        /// or dictionary payload data, using compiled data and the given content locale.
     75        ///
     76        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     77        /// Khmer, Lao, and Thai.
     78        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)]
     79        #[diplomat::rust_link(icu::segmenter::options::WordBreakOptions, Struct, hidden)]
     80        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale")]
     81        #[cfg(feature = "compiled_data")]
     82        pub fn create_auto_with_content_locale(
     83            locale: &Locale,
     84        ) -> Result<Box<WordSegmenter>, DataError> {
     85            // Construction can fail; `?` hands the error back to the caller.
     86            let segmenter = icu_segmenter::WordSegmenter::try_new_auto(locale.into())?;
     87            Ok(Box::new(WordSegmenter(segmenter)))
     88        }
     89 
     90        /// Construct a [`WordSegmenter`] that automatically selects the best available LSTM
     91        /// or dictionary payload data, using a particular data source and the given content locale.
     92        ///
     93        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
     94        /// Khmer, Lao, and Thai.
     95        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)]
     96        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale_and_provider")]
     97        #[cfg(feature = "buffer_provider")]
     98        pub fn create_auto_with_content_locale_and_provider(
     99            provider: &DataProvider,
    100            locale: &Locale,
    101        ) -> Result<Box<WordSegmenter>, DataError> {
    102            // Both the provider lookup and the construction can fail; `?` propagates either error.
    103            let segmenter = icu_segmenter::WordSegmenter::try_new_auto_with_buffer_provider(
    104                provider.get()?,
    105                locale.into(),
    106            )?;
    107            Ok(Box::new(WordSegmenter(segmenter)))
    108        }
    109 
    110        /// Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
    111        /// Thai, using compiled data. This does not assume any content locale.
    112        ///
    113        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
    114        /// Khmer, Lao, and Thai.
    115        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)]
    116        #[diplomat::attr(auto, named_constructor = "lstm")]
    117        #[cfg(feature = "compiled_data")]
    118        pub fn create_lstm() -> Box<WordSegmenter> {
    119            // Convert the compiled-data segmenter into the owned form held by this wrapper.
    120            let segmenter = icu_segmenter::WordSegmenter::new_lstm(Default::default());
    121            Box::new(WordSegmenter(segmenter.static_to_owned()))
    122        }
    123 
    124        /// Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
    125        /// Thai, using compiled data and the given content locale.
    126        ///
    127        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
    128        /// Khmer, Lao, and Thai.
    129        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)]
    130        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale")]
    131        #[cfg(feature = "compiled_data")]
    132        pub fn create_lstm_with_content_locale(
    133            locale: &Locale,
    134        ) -> Result<Box<WordSegmenter>, DataError> {
    135            // Construction can fail; `?` hands the error back to the caller.
    136            let segmenter = icu_segmenter::WordSegmenter::try_new_lstm(locale.into())?;
    137            Ok(Box::new(WordSegmenter(segmenter)))
    138        }
    139 
    140        /// Construct a [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
    141        /// Thai, using a particular data source and the given content locale.
    142        ///
    143        /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
    144        /// Khmer, Lao, and Thai.
    145        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)]
    146        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale_and_provider")]
    147        #[cfg(feature = "buffer_provider")]
    148        pub fn create_lstm_with_content_locale_and_provider(
    149            provider: &DataProvider,
    150            locale: &Locale,
    151        ) -> Result<Box<WordSegmenter>, DataError> {
    152            // Both the provider lookup and the construction can fail; `?` propagates either error.
    153            let segmenter = icu_segmenter::WordSegmenter::try_new_lstm_with_buffer_provider(
    154                provider.get()?,
    155                locale.into(),
    156            )?;
    157            Ok(Box::new(WordSegmenter(segmenter)))
    158        }
    159 
    160        /// Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
    161        /// Burmese, Khmer, Lao, and Thai, using compiled data. This does not assume any content locale.
    162        ///
    163        /// Note: currently, it uses dictionary data for every listed language: Chinese, Japanese,
    164        /// Burmese, Khmer, Lao, and Thai.
    165        #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)]
    166        #[diplomat::attr(auto, named_constructor = "dictionary")]
    167        #[cfg(feature = "compiled_data")]
    168        pub fn create_dictionary() -> Box<WordSegmenter> {
    169            // Convert the compiled-data segmenter into the owned form held by this wrapper.
    170            let segmenter = icu_segmenter::WordSegmenter::new_dictionary(Default::default());
    171            Box::new(WordSegmenter(segmenter.static_to_owned()))
    172        }
    173 
    174        /// Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
    175        /// Burmese, Khmer, Lao, and Thai, using compiled data and the given content locale.
    176        ///
    177        /// Note: currently, it uses dictionary data for every listed language: Chinese, Japanese,
    178        /// Burmese, Khmer, Lao, and Thai.
    179        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)]
    180        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale")]
    181        #[cfg(feature = "compiled_data")]
    182        pub fn create_dictionary_with_content_locale(
    183            locale: &Locale,
    184        ) -> Result<Box<WordSegmenter>, DataError> {
    185            // Construction can fail; `?` hands the error back to the caller.
    186            let segmenter = icu_segmenter::WordSegmenter::try_new_dictionary(locale.into())?;
    187            Ok(Box::new(WordSegmenter(segmenter)))
    188        }
    189 
    190        /// Construct a [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
    191        /// Burmese, Khmer, Lao, and Thai, using a particular data source and the given content locale.
    192        ///
    193        /// Note: currently, it uses dictionary data for every listed language: Chinese, Japanese,
    194        /// Burmese, Khmer, Lao, and Thai.
    195        #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)]
    196        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale_and_provider")]
    197        #[cfg(feature = "buffer_provider")]
    198        pub fn create_dictionary_with_content_locale_and_provider(
    199            provider: &DataProvider,
    200            locale: &Locale,
    201        ) -> Result<Box<WordSegmenter>, DataError> {
    202            // Both the provider lookup and the construction can fail; `?` propagates either error.
    203            let segmenter = icu_segmenter::WordSegmenter::try_new_dictionary_with_buffer_provider(
    204                provider.get()?,
    205                locale.into(),
    206            )?;
    207            Ok(Box::new(WordSegmenter(segmenter)))
    208        }
    209        /// Segments a UTF-8 string, returning an iterator over its word breakpoints.
    210        ///
    211        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
    212        /// to the WHATWG Encoding Standard.
    213        #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_utf8, FnInStruct)]
    214        #[diplomat::rust_link(
    215            icu::segmenter::WordSegmenterBorrowed::segment_str,
    216            FnInStruct,
    217            hidden
    218        )]
    219        #[diplomat::attr(not(supports = utf8_strings), disable)]
    220        #[diplomat::attr(*, rename = "segment")]
    221        pub fn segment_utf8<'a>(
    222            &'a self,
    223            input: &'a DiplomatStr,
    224        ) -> Box<WordBreakIteratorUtf8<'a>> {
    225            // The returned iterator borrows both this segmenter and `input` for `'a`.
    226            let breaks = self.0.as_borrowed().segment_utf8(input);
    227            Box::new(WordBreakIteratorUtf8(breaks))
    228        }
    229 
    230        /// Segments a UTF-16 string, returning an iterator over its word breakpoints.
    231        ///
    232        /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
    233        /// to the WHATWG Encoding Standard.
    234        #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_utf16, FnInStruct)]
    235        #[diplomat::attr(not(supports = utf8_strings), rename = "segment")]
    236        #[diplomat::attr(supports = utf8_strings, rename = "segment16")]
    237        pub fn segment_utf16<'a>(
    238            &'a self,
    239            input: &'a DiplomatStr16,
    240        ) -> Box<WordBreakIteratorUtf16<'a>> {
    241            // The returned iterator borrows both this segmenter and `input` for `'a`.
    242            let breaks = self.0.as_borrowed().segment_utf16(input);
    243            Box::new(WordBreakIteratorUtf16(breaks))
    244        }
    245 
    246        /// Segments a Latin-1 string, returning an iterator over its word breakpoints.
    247        #[diplomat::rust_link(icu::segmenter::WordSegmenterBorrowed::segment_latin1, FnInStruct)]
    248        #[diplomat::attr(not(supports = utf8_strings), disable)]
    249        pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>> {
    250            // The returned iterator borrows both this segmenter and `input` for `'a`.
    251            let breaks = self.0.as_borrowed().segment_latin1(input);
    252            Box::new(WordBreakIteratorLatin1(breaks))
    253        }
    254    }
    255 
    256    impl<'a> WordBreakIteratorUtf8<'a> {
    257        /// Moves to the next breakpoint and returns its index, or -1 if the end of the string was
    258        /// reached or the index is out of range of a 32-bit signed integer.
    259        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)]
    260        pub fn next(&mut self) -> i32 {
    261            match self.0.next() {
    262                Some(index) => i32::try_from(index).unwrap_or(-1),
    263                None => -1,
    264            }
    265        }
    266 
    267        /// Returns the word type (status value) of the current break boundary.
    268        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)]
    269        #[diplomat::rust_link(
    270            icu::segmenter::iterators::WordBreakIteratorWithWordType,
    271            Struct,
    272            hidden
    273        )]
    274        #[diplomat::rust_link(
    275            icu::segmenter::iterators::WordBreakIteratorWithWordType::next,
    276            FnInStruct,
    277            hidden
    278        )]
    279        #[diplomat::attr(auto, getter)]
    280        pub fn word_type(&self) -> SegmenterWordType {
    281            SegmenterWordType::from(self.0.word_type())
    282        }
    283 
    284        /// Returns `true` when the current break boundary is word-like, such as letter/number/CJK.
    285        #[diplomat::rust_link(
    286            icu::segmenter::iterators::WordBreakIterator::is_word_like,
    287            FnInStruct
    288        )]
    289        #[diplomat::attr(auto, getter)]
    290        pub fn is_word_like(&self) -> bool {
    291            self.0.is_word_like()
    292        }
    293    }
    294 
    295    impl<'a> WordBreakIteratorUtf16<'a> {
    296        /// Moves to the next breakpoint and returns its index, or -1 if the end of the string was
    297        /// reached or the index is out of range of a 32-bit signed integer.
    298        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)]
    299        #[diplomat::rust_link(
    300            icu::segmenter::iterators::WordBreakIterator::Item,
    301            AssociatedTypeInStruct,
    302            hidden
    303        )]
    304        pub fn next(&mut self) -> i32 {
    305            match self.0.next() {
    306                Some(index) => i32::try_from(index).unwrap_or(-1),
    307                None => -1,
    308            }
    309        }
    310 
    311        /// Returns the word type (status value) of the current break boundary.
    312        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)]
    313        #[diplomat::rust_link(
    314            icu::segmenter::iterators::WordBreakIterator::iter_with_word_type,
    315            FnInStruct,
    316            hidden
    317        )]
    318        #[diplomat::attr(auto, getter)]
    319        pub fn word_type(&self) -> SegmenterWordType {
    320            SegmenterWordType::from(self.0.word_type())
    321        }
    322 
    323        /// Returns `true` when the current break boundary is word-like, such as letter/number/CJK.
    324        #[diplomat::rust_link(
    325            icu::segmenter::iterators::WordBreakIterator::is_word_like,
    326            FnInStruct
    327        )]
    328        #[diplomat::attr(auto, getter)]
    329        pub fn is_word_like(&self) -> bool {
    330            self.0.is_word_like()
    331        }
    332    }
    333 
    334    impl<'a> WordBreakIteratorLatin1<'a> {
    335        /// Moves to the next breakpoint and returns its index, or -1 if the end of the string was
    336        /// reached or the index is out of range of a 32-bit signed integer.
    337        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::next, FnInStruct)]
    338        #[diplomat::rust_link(
    339            icu::segmenter::iterators::WordBreakIterator::Item,
    340            AssociatedTypeInStruct,
    341            hidden
    342        )]
    343        pub fn next(&mut self) -> i32 {
    344            match self.0.next() {
    345                Some(index) => i32::try_from(index).unwrap_or(-1),
    346                None => -1,
    347            }
    348        }
    349 
    350        /// Returns the word type (status value) of the current break boundary.
    351        #[diplomat::rust_link(icu::segmenter::iterators::WordBreakIterator::word_type, FnInStruct)]
    352        #[diplomat::attr(auto, getter)]
    353        pub fn word_type(&self) -> SegmenterWordType {
    354            SegmenterWordType::from(self.0.word_type())
    355        }
    356 
    357        /// Returns `true` when the current break boundary is word-like, such as letter/number/CJK.
    358        #[diplomat::rust_link(
    359            icu::segmenter::iterators::WordBreakIterator::is_word_like,
    360            FnInStruct
    361        )]
    362        #[diplomat::attr(auto, getter)]
    363        pub fn is_word_like(&self) -> bool {
    364            self.0.is_word_like()
    365        }
    366    }
    367 }
    368 
    369 impl<'a> From<&'a crate::unstable::locale_core::ffi::Locale>
    370    for icu_segmenter::options::WordBreakOptions<'a>
    371 {
    372    fn from(other: &'a crate::unstable::locale_core::ffi::Locale) -> Self {
    373        let mut word_break_options = Self::default(); // every other option keeps its default
    374        word_break_options.content_locale = Some(&other.0.id); // borrowed from `other` for `'a`
    375        word_break_options
    376    }
    377 }