tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TestSegmenterPerf.cpp (7603B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      #include <fstream>
      #include <iterator>
      #include <string>

      #include "gtest/gtest.h"
      #include "gtest/MozGTestBench.h"  // For MOZ_GTEST_BENCH
      #include "mozilla/intl/LineBreaker.h"
      #include "mozilla/intl/Segmenter.h"
      #include "mozilla/Preferences.h"
      #include "nsAtom.h"
      #include "nsLineBreaker.h"
      #include "nsString.h"
      #include "nsTArray.h"
     18 
     19 namespace mozilla::intl {
     20 
     21 using mozilla::intl::LineBreakRule;
     22 using mozilla::intl::WordBreakRule;
     23 
     // Default number of times each benchmark helper repeats its segmentation
     // pass when the caller does not supply an explicit count.
     constexpr size_t kIterations{100};
     25 
     /**
      * Reads the whole file at aPath (opened in text mode) and returns its
      * contents as a byte string.
      *
      * @param aPath Path handed to std::ifstream; may be null.
      * @return The file contents, or an empty string when aPath is null or the
      *         file cannot be opened.
      */
     static std::string ReadFileIntoString(const char* aPath) {
       // Constructing an ifstream from a null path is not valid, so bail out
       // before touching the stream.
       if (!aPath) {
         return std::string();
       }

       std::ifstream file(aPath);
       if (!file.is_open()) {
         return std::string();
       }

       // Build the result straight from the stream buffer instead of going
       // through an intermediate std::stringstream and copying a second time.
       return std::string(std::istreambuf_iterator<char>(file),
                          std::istreambuf_iterator<char>());
     }
     32 
     33 class SegmenterPerf : public ::testing::Test {
     34 protected:
     35  void SetUp() override {
     36    // Test files are into xpcom/tests/gtest/wikipedia
     37    mArUtf8 = ReadFileIntoString("ar.txt");
     38    mDeUtf8 = ReadFileIntoString("de.txt");
     39    mJaUtf8 = ReadFileIntoString("ja.txt");
     40    mRuUtf8 = ReadFileIntoString("ru.txt");
     41    mThUtf8 = ReadFileIntoString("th.txt");
     42    mTrUtf8 = ReadFileIntoString("tr.txt");
     43    mViUtf8 = ReadFileIntoString("vi.txt");
     44 
     45    CopyUTF8toUTF16(mArUtf8, mArUtf16);
     46    CopyUTF8toUTF16(mDeUtf8, mDeUtf16);
     47    CopyUTF8toUTF16(mJaUtf8, mJaUtf16);
     48    CopyUTF8toUTF16(mRuUtf8, mRuUtf16);
     49    CopyUTF8toUTF16(mThUtf8, mThUtf16);
     50    CopyUTF8toUTF16(mTrUtf8, mTrUtf16);
     51    CopyUTF8toUTF16(mViUtf8, mViUtf16);
     52 
     53    mAr = NS_Atomize(u"ar");
     54    mDe = NS_Atomize(u"de");
     55    mJa = NS_Atomize(u"ja");
     56    mRu = NS_Atomize(u"ru");
     57    mTh = NS_Atomize(u"th");
     58    mTr = NS_Atomize(u"tr");
     59    mVi = NS_Atomize(u"vi");
     60  }
     61 
     62 public:
     63  std::string mArUtf8;
     64  std::string mDeUtf8;
     65  std::string mJaUtf8;
     66  std::string mRuUtf8;
     67  std::string mThUtf8;
     68  std::string mTrUtf8;
     69  std::string mViUtf8;
     70 
     71  nsString mArUtf16;
     72  nsString mDeUtf16;
     73  nsString mJaUtf16;
     74  nsString mRuUtf16;
     75  nsString mThUtf16;
     76  nsString mTrUtf16;
     77  nsString mViUtf16;
     78 
     79  RefPtr<nsAtom> mAr;
     80  RefPtr<nsAtom> mDe;
     81  RefPtr<nsAtom> mJa;
     82  RefPtr<nsAtom> mRu;
     83  RefPtr<nsAtom> mTh;
     84  RefPtr<nsAtom> mTr;
     85  RefPtr<nsAtom> mVi;
     86 };
     87 
     88 class AutoSetSegmenter final {
     89 public:
     90  explicit AutoSetSegmenter(bool aValue) {
     91    nsresult rv =
     92        mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", aValue);
     93    EXPECT_TRUE(rv == NS_OK);
     94  }
     95 
     96  ~AutoSetSegmenter() {
     97    mozilla::Preferences::ClearUser("intl.icu4x.segmenter.enabled");
     98  }
     99 };
    100 
    101 static void TestSegmenterBench(const nsString& aStr, bool aIsJaOrZh,
    102                               size_t aCount = kIterations) {
    103  nsTArray<uint8_t> breakState;
    104  breakState.SetLength(aStr.Length());
    105 
    106  for (size_t i = 0; i < aCount; i++) {
    107    LineBreaker::ComputeBreakPositions(
    108        aStr.get(), aStr.Length(), WordBreakRule::Normal, LineBreakRule::Strict,
    109        aIsJaOrZh, breakState.Elements());
    110  }
    111 }
    112 
    113 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAROld, [this] {
    114  AutoSetSegmenter set(false);
    115  TestSegmenterBench(mArUtf16, false);
    116 });
    117 
    118 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDEOld, [this] {
    119  AutoSetSegmenter set(false);
    120  TestSegmenterBench(mDeUtf16, false);
    121 });
    122 
    123 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJAOld, [this] {
    124  AutoSetSegmenter set(false);
    125  TestSegmenterBench(mJaUtf16, true);
    126 });
    127 
    128 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRUOld, [this] {
    129  AutoSetSegmenter set(false);
    130  TestSegmenterBench(mRuUtf16, false);
    131 });
    132 
    133 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTHOld, [this] {
    134  AutoSetSegmenter set(false);
    135  TestSegmenterBench(mThUtf16, false);
    136 });
    137 
    138 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTROld, [this] {
    139  AutoSetSegmenter set(false);
    140  TestSegmenterBench(mTrUtf16, false);
    141 });
    142 
    143 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVIOld, [this] {
    144  AutoSetSegmenter set(false);
    145  TestSegmenterBench(mViUtf16, false);
    146 });
    147 
    148 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAR, [this] {
    149  AutoSetSegmenter set(false);
    150  TestSegmenterBench(mArUtf16, false);
    151 });
    152 
    153 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDE, [this] {
    154  AutoSetSegmenter set(true);
    155  TestSegmenterBench(mDeUtf16, false);
    156 });
    157 
    158 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJA, [this] {
    159  AutoSetSegmenter set(true);
    160  TestSegmenterBench(mJaUtf16, true);
    161 });
    162 
    163 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRU, [this] {
    164  AutoSetSegmenter set(true);
    165  TestSegmenterBench(mRuUtf16, false);
    166 });
    167 
    168 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTH, [this] {
    169  AutoSetSegmenter set(true);
    170  // LSTM segmenter is too slow
    171  TestSegmenterBench(mThUtf16, false, 3);
    172 });
    173 
    174 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTR, [this] {
    175  AutoSetSegmenter set(true);
    176  TestSegmenterBench(mTrUtf16, false);
    177 });
    178 
    179 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVI, [this] {
    180  AutoSetSegmenter set(true);
    181  TestSegmenterBench(mViUtf16, false);
    182 });
    183 
    184 class LBSink final : public nsILineBreakSink {
    185 public:
    186  LBSink() = default;
    187  ~LBSink() = default;
    188 
    189  virtual void SetBreaks(uint32_t, uint32_t, uint8_t*) override {}
    190  virtual void SetCapitalization(uint32_t, uint32_t, bool*) override {}
    191 };
    192 
    193 static void TestDOMSegmenterBench(const nsString& aStr, nsAtom* aLang,
    194                                  size_t aCount = kIterations) {
    195  LBSink sink;
    196  bool trailingBreak;
    197 
    198  for (size_t i = 0; i < aCount; i++) {
    199    nsLineBreaker breaker;
    200    breaker.AppendText(aLang, aStr.get(), aStr.Length(), 0, &sink);
    201    breaker.Reset(&trailingBreak);
    202  }
    203 }
    204 
    205 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAROld, [this] {
    206  AutoSetSegmenter set(false);
    207  TestDOMSegmenterBench(mArUtf16, mAr);
    208 });
    209 
    210 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDEOld, [this] {
    211  AutoSetSegmenter set(false);
    212  TestDOMSegmenterBench(mDeUtf16, mDe);
    213 });
    214 
    215 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJAOld, [this] {
    216  AutoSetSegmenter set(false);
    217  TestDOMSegmenterBench(mJaUtf16, mJa);
    218 });
    219 
    220 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRUOld, [this] {
    221  AutoSetSegmenter set(false);
    222  TestDOMSegmenterBench(mRuUtf16, mRu);
    223 });
    224 
    225 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTHOld, [this] {
    226  AutoSetSegmenter set(false);
    227  TestDOMSegmenterBench(mThUtf16, mTh);
    228 });
    229 
    230 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTROld, [this] {
    231  AutoSetSegmenter set(false);
    232  TestDOMSegmenterBench(mTrUtf16, mTr);
    233 });
    234 
    235 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVIOld, [this] {
    236  AutoSetSegmenter set(false);
    237  TestDOMSegmenterBench(mViUtf16, mVi);
    238 });
    239 
    240 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAR, [this] {
    241  AutoSetSegmenter set(true);
    242  TestDOMSegmenterBench(mArUtf16, mAr);
    243 });
    244 
    245 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDE, [this] {
    246  AutoSetSegmenter set(true);
    247  TestDOMSegmenterBench(mDeUtf16, mDe);
    248 });
    249 
    250 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJA, [this] {
    251  AutoSetSegmenter set(true);
    252  TestDOMSegmenterBench(mJaUtf16, mJa);
    253 });
    254 
    255 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRU, [this] {
    256  AutoSetSegmenter set(true);
    257  TestDOMSegmenterBench(mRuUtf16, mRu);
    258 });
    259 
    260 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTH, [this] {
    261  AutoSetSegmenter set(true);
    262  // LSTM segmenter is too slow
    263  TestDOMSegmenterBench(mThUtf16, mTh, 3);
    264 });
    265 
    266 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTR, [this] {
    267  AutoSetSegmenter set(true);
    268  TestDOMSegmenterBench(mTrUtf16, mTr);
    269 });
    270 
    271 MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVI, [this] {
    272  AutoSetSegmenter set(true);
    273  TestDOMSegmenterBench(mViUtf16, mVi);
    274 });
    275 
    276 }  // namespace mozilla::intl