tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

is_combining_diacritic.py (3289B)


      1 # This Source Code Form is subject to the terms of the Mozilla Public
      2 # License, v. 2.0. If a copy of the MPL was not distributed with this
      3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 
      5 from unicodedata import combining
      6 
# Exclusive upper bound of the code point range scanned by main()
# (the highest Unicode code point is U+10FFFF).
UNICODE_LIMIT = 0x110000

# Canonical combining class values that is_combining_diacritic() excludes
# by name.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9
     12 
     13 
     14 # Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
     15 def is_combining_diacritic(char):
     16    return combining(char) not in (
     17        UNICODE_COMBINING_CLASS_NOT_REORDERED,
     18        UNICODE_COMBINING_CLASS_KANA_VOICING,
     19        UNICODE_COMBINING_CLASS_VIRAMA,
     20        91,
     21        129,
     22        130,
     23        132,
     24    )
     25 
     26 
     27 # See gfxFontUtils.h for the SharedBitSet that we're creating a const instance of here.
# Number of bytes stored in one bitset block.
BLOCK_SIZE = 32
# Number of bits in one bitset block (eight per byte of BLOCK_SIZE).
BLOCK_SIZE_BITS = BLOCK_SIZE * 8
     30 
     31 
     32 def main(header):
     33    blockIndex = []
     34    blocks = []
     35 
     36    # Figure out the contents of each 256-char block, and see if it is unique
     37    # or can share an already-allocated block.
     38    block = [0] * BLOCK_SIZE
     39    byte = 0
     40    bit = 0x01
     41    for char in range(UNICODE_LIMIT):
     42        if is_combining_diacritic(chr(char)):
     43            block[byte] |= bit
     44        bit <<= 1
     45        if bit == 0x100:
     46            bit = 0x01
     47            byte += 1
     48        if byte == BLOCK_SIZE:
     49            found = False
     50            for b in range(len(blocks)):
     51                if block == blocks[b]:
     52                    blockIndex.append(b)
     53                    found = True
     54                    break
     55            if not found:
     56                blockIndex.append(len(blocks))
     57                blocks.append(block)
     58            byte = 0
     59            block = [0] * BLOCK_SIZE
     60 
     61    # Strip trailing empty blocks from the index.
     62    while blockIndex[len(blockIndex) - 1] == 0:
     63        del blockIndex[len(blockIndex) - 1]
     64 
     65    # Write the SharedBitSet as data in a C++ header file.
     66    header.write("/* !GENERATED DATA -- DO NOT EDIT! */\n")
     67    header.write("/* (see is_combining_diacritic.py) */\n")
     68    header.write("\n")
     69    header.write('#include "gfxFontUtils.h"\n')
     70    header.write("\n")
     71 
     72    header.write("typedef struct {\n")
     73    header.write("  uint16_t mBlockIndexCount;\n")
     74    header.write("  uint16_t mBlockCount;\n")
     75    header.write("  uint16_t mBlockIndex[" + str(len(blockIndex)) + "];\n")
     76    header.write("  uint8_t mBlockData[" + str(len(blocks) * BLOCK_SIZE) + "];\n")
     77    header.write("} CombiningDiacriticsBitset_t;\n")
     78    header.write("\n")
     79 
     80    header.write(
     81        "static const CombiningDiacriticsBitset_t COMBINING_DIACRITICS_BITSET_DATA = {\n"
     82    )
     83    header.write("  " + str(len(blockIndex)) + ",\n")
     84    header.write("  " + str(len(blocks)) + ",\n")
     85    header.write("  {\n")
     86    for b in blockIndex:
     87        header.write("    " + str(b) + ",\n")
     88    header.write("  },\n")
     89    header.write("  {\n")
     90    for b in blocks:
     91        header.write("    ")
     92        for i in b:
     93            header.write(str(i) + ",")
     94        header.write("\n")
     95    header.write("  },\n")
     96    header.write("};\n")
     97    header.write("\n")
     98    header.write("static const SharedBitSet* sCombiningDiacriticsSet =\n")
     99    header.write(
    100        "    reinterpret_cast<const SharedBitSet*>(&COMBINING_DIACRITICS_BITSET_DATA);\n"
    101    )
    102    header.write("\n")