tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

segmenter_sentence.rs (8040B)


      1 // This file is part of ICU4X. For terms of use, please see the file
      2 // called LICENSE at the top level of the ICU4X source tree
      3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
      4 
      5 #[diplomat::bridge]
      6 #[diplomat::abi_rename = "icu4x_{0}_mv1"]
      7 #[diplomat::attr(auto, namespace = "icu4x")]
      8 pub mod ffi {
      9    use alloc::boxed::Box;
     10    use icu_segmenter::scaffold::{Latin1, PotentiallyIllFormedUtf8, Utf16};
     11 
     12    #[cfg(feature = "buffer_provider")]
     13    use crate::unstable::provider::ffi::DataProvider;
     14    #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))]
     15    use crate::unstable::{errors::ffi::DataError, locale_core::ffi::Locale};
     16 
     17    #[diplomat::opaque]
     18    /// An ICU4X sentence-break segmenter, capable of finding sentence breakpoints in strings.
           ///
           /// Opaque wrapper around `icu_segmenter::SentenceSegmenter`. Its `segment_utf8`,
           /// `segment_utf16` and `segment_latin1` methods return break iterators that borrow from it.
     19    #[diplomat::rust_link(icu::segmenter::SentenceSegmenter, Struct)]
     20    #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed, Struct, hidden)]
     21    pub struct SentenceSegmenter(icu_segmenter::SentenceSegmenter);
     22 
     23    #[diplomat::opaque]
           /// Iterator over the sentence breakpoints of a potentially ill-formed UTF-8 string.
           ///
           /// Returned by `SentenceSegmenter::segment_utf8`. The single lifetime `'a` is used for both
           /// lifetime parameters of the wrapped ICU4X iterator.
     24    #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)]
     25    pub struct SentenceBreakIteratorUtf8<'a>(
     26        icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, PotentiallyIllFormedUtf8>,
     27    );
     28 
     29    #[diplomat::opaque]
           /// Iterator over the sentence breakpoints of a UTF-16 string.
           ///
           /// Returned by `SentenceSegmenter::segment_utf16`. The single lifetime `'a` is used for both
           /// lifetime parameters of the wrapped ICU4X iterator.
     30    #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)]
     31    pub struct SentenceBreakIteratorUtf16<'a>(
     32        icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, Utf16>,
     33    );
     34 
     35    #[diplomat::opaque]
           /// Iterator over the sentence breakpoints of a Latin-1 string.
           ///
           /// Returned by `SentenceSegmenter::segment_latin1`. The single lifetime `'a` is used for both
           /// lifetime parameters of the wrapped ICU4X iterator.
     36    #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator, Struct)]
     37    pub struct SentenceBreakIteratorLatin1<'a>(
     38        icu_segmenter::iterators::SentenceBreakIterator<'a, 'a, Latin1>,
     39    );
     40 
     41    impl SentenceSegmenter {
     42        /// Builds a [`SentenceSegmenter`] backed by compiled data, with no content locale assumed.
     43        #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::new, FnInStruct)]
     44        #[diplomat::rust_link(
     45            icu::segmenter::options::SentenceBreakInvariantOptions,
     46            Struct,
     47            hidden
     48        )]
     49        #[diplomat::attr(auto, constructor)]
     50        #[cfg(feature = "compiled_data")]
     51        pub fn create() -> Box<SentenceSegmenter> {
     52            let segmenter = icu_segmenter::SentenceSegmenter::new(Default::default());
     53            let owned = segmenter.static_to_owned();
     54            Box::new(SentenceSegmenter(owned))
     55        }
     56        /// Builds a [`SentenceSegmenter`] from compiled data for content in a known locale.
     57        #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)]
     58        #[diplomat::rust_link(icu::segmenter::options::SentenceBreakOptions, Struct, hidden)]
     59        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale")]
     60        #[cfg(feature = "compiled_data")]
     61        pub fn create_with_content_locale(
     62            locale: &Locale,
     63        ) -> Result<Box<SentenceSegmenter>, DataError> {
     64            // The locale is converted into segmenter options by `into()`; errors propagate via `?`.
     65            let inner = icu_segmenter::SentenceSegmenter::try_new(locale.into())?;
     66            Ok(Box::new(SentenceSegmenter(inner)))
     67        }
     68 
     69        /// Builds a [`SentenceSegmenter`] for content in a known locale, using the given provider.
     70        #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)]
     71        #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale_and_provider")]
     72        #[cfg(feature = "buffer_provider")]
     73        pub fn create_with_content_locale_and_provider(
     74            provider: &DataProvider,
     75            locale: &Locale,
     76        ) -> Result<Box<SentenceSegmenter>, DataError> {
     77            let buffer_provider = provider.get()?;
     78            let inner = icu_segmenter::SentenceSegmenter::try_new_with_buffer_provider(
     79                buffer_provider,
     80                locale.into(),
     81            )?;
     82            Ok(Box::new(SentenceSegmenter(inner)))
     83        }
     84 
     85        /// Segments a string, returning an iterator over its sentence breakpoints.
     86        ///
     87        /// Ill-formed input is handled as though every error had been replaced with a
     88        /// REPLACEMENT CHARACTER, as specified by the WHATWG Encoding Standard.
     89        #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed::segment_utf8, FnInStruct)]
     90        #[diplomat::rust_link(
     91            icu::segmenter::SentenceSegmenterBorrowed::segment_str,
     92            FnInStruct,
     93            hidden
     94        )]
     95        #[diplomat::attr(not(supports = utf8_strings), disable)]
     96        #[diplomat::attr(*, rename = "segment")]
     97        pub fn segment_utf8<'a>(
     98            &'a self,
     99            input: &'a DiplomatStr,
    100        ) -> Box<SentenceBreakIteratorUtf8<'a>> {
    101            let borrowed = self.0.as_borrowed();
    102            let breaks = borrowed.segment_utf8(input);
    103            Box::new(SentenceBreakIteratorUtf8(breaks))
    104        }
    105 
    106        /// Segments a UTF-16 string, returning an iterator over its sentence breakpoints.
    107        ///
    108        /// Ill-formed input is handled as though every error had been replaced with a
    109        /// REPLACEMENT CHARACTER, as specified by the WHATWG Encoding Standard.
    110        #[diplomat::rust_link(icu::segmenter::SentenceSegmenterBorrowed::segment_utf16, FnInStruct)]
    111        #[diplomat::attr(not(supports = utf8_strings), rename = "segment")]
    112        #[diplomat::attr(supports = utf8_strings, rename = "segment16")]
    113        pub fn segment_utf16<'a>(
    114            &'a self,
    115            input: &'a DiplomatStr16,
    116        ) -> Box<SentenceBreakIteratorUtf16<'a>> {
    117            let segmenter = self.0.as_borrowed();
    118            let iter = SentenceBreakIteratorUtf16(segmenter.segment_utf16(input));
    119            Box::new(iter)
    120        }
    121 
    122        /// Segments a Latin-1 string, returning an iterator over its sentence breakpoints.
    123        #[diplomat::rust_link(
    124            icu::segmenter::SentenceSegmenterBorrowed::segment_latin1,
    125            FnInStruct
    126        )]
    127        #[diplomat::attr(not(supports = utf8_strings), disable)]
    128        pub fn segment_latin1<'a>(
    129            &'a self,
    130            input: &'a [u8],
    131        ) -> Box<SentenceBreakIteratorLatin1<'a>> {
    132            let breaks = self.0.as_borrowed().segment_latin1(input);
    133            let wrapped = SentenceBreakIteratorLatin1(breaks);
    134            Box::new(wrapped)
    135        }
    136    }
    137 
    138    impl<'a> SentenceBreakIteratorUtf8<'a> {
    139        /// Advances to the next breakpoint and returns it. Yields -1 once the string is exhausted
    140        /// or when the index cannot be represented as a 32-bit signed integer.
    141        #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)]
    142        pub fn next(&mut self) -> i32 {
    143            match self.0.next() {
    144                Some(idx) => i32::try_from(idx).unwrap_or(-1),
    145                None => -1,
    146            }
    147        }
    148    }
    149 
    150    impl<'a> SentenceBreakIteratorUtf16<'a> {
    151        /// Advances to the next breakpoint and returns it. Yields -1 once the string is exhausted
    152        /// or when the index cannot be represented as a 32-bit signed integer.
    153        #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)]
    154        pub fn next(&mut self) -> i32 {
    155            if let Some(idx) = self.0.next() {
    156                return i32::try_from(idx).unwrap_or(-1);
    157            }
    158            -1
    159        }
    160    }
    161 
    162    impl<'a> SentenceBreakIteratorLatin1<'a> {
    163        /// Advances to the next breakpoint and returns it. Yields -1 once the string is exhausted
    164        /// or when the index cannot be represented as a 32-bit signed integer.
    165        #[diplomat::rust_link(icu::segmenter::iterators::SentenceBreakIterator::next, FnInStruct)]
    166        #[diplomat::rust_link(
    167            icu::segmenter::iterators::SentenceBreakIterator::Item,
    168            AssociatedTypeInStruct,
    169            hidden
    170        )]
    171        pub fn next(&mut self) -> i32 {
    172            // Exhaustion and an index that does not fit in `i32` both collapse to -1.
    173            self.0
    174                .next()
    175                .map_or(-1, |idx| i32::try_from(idx).unwrap_or(-1))
    176        }
    177    }
    178 }
    179 
    180 impl<'a> From<&'a crate::unstable::locale_core::ffi::Locale>
    181    for icu_segmenter::options::SentenceBreakOptions<'a>
    182 {
           /// Builds default sentence-break options whose content locale borrows from `other`.
    183    fn from(other: &'a crate::unstable::locale_core::ffi::Locale) -> Self {
    184        let mut opts = Self::default();
    185        opts.content_locale = Some(&other.0.id);
    186        opts
    187    }
    188 }