[ tor-browser ].git.dasho

inputext.cpp (4588B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2005-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "inputext.h"
     15 
     16 #include "cmemory.h"
     17 #include "cstring.h"
     18 
     19 #include <string.h>
     20 
     21 U_NAMESPACE_BEGIN
     22 
     // Capacity, in bytes, of the buffer that receives the text to be examined.
     #define BUFFER_SIZE 8192

     // Small wrappers around the uprv_ allocator.  NEW_ARRAY requests room for
     // `count` objects of `type` (callers in this file test the result against
     // nullptr); DELETE_ARRAY gives such a block back.
     #define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc((count) * sizeof(type)))
     #define DELETE_ARRAY(array) uprv_free((void *) (array))
     27 
     28 InputText::InputText(UErrorCode &status)
     29    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
     30                                                 //   removed if appropriate.
     31      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
     32                                                 //   Value is percent, not absolute.
     33      fDeclaredEncoding(nullptr),
     34      fRawInput(nullptr),
     35      fRawLength(0)
     36 {
     37    if (fInputBytes == nullptr || fByteStats == nullptr) {
     38        status = U_MEMORY_ALLOCATION_ERROR;
     39    }
     40 }
     41 
     42 InputText::~InputText()
     43 {
     44    DELETE_ARRAY(fDeclaredEncoding);
     45    DELETE_ARRAY(fByteStats);
     46    DELETE_ARRAY(fInputBytes);
     47 }
     48 
     49 void InputText::setText(const char *in, int32_t len)
     50 {
     51    fInputLen  = 0;
     52    fC1Bytes   = false;
     53    fRawInput = reinterpret_cast<const uint8_t*>(in);
     54    fRawLength = len == -1 ? static_cast<int32_t>(uprv_strlen(in)) : len;
     55 }
     56 
     57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
     58 {
     59    if(encoding) {
     60        if (len == -1) {
     61            len = static_cast<int32_t>(uprv_strlen(encoding));
     62        }
     63 
     64        len += 1;     // to make place for the \0 at the end.
     65        uprv_free(fDeclaredEncoding);
     66        fDeclaredEncoding = NEW_ARRAY(char, len);
     67        uprv_strncpy(fDeclaredEncoding, encoding, len);
     68    }
     69 }
     70 
     71 UBool InputText::isSet() const 
     72 {
     73    return fRawInput != nullptr;
     74 }
     75 
     76 /**
     77 *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
     78 *               it by removing what appears to be html markup.
     79 * 
     80 * @internal
     81 */
     82 void InputText::MungeInput(UBool fStripTags) {
     83    int     srci = 0;
     84    int     dsti = 0;
     85    uint8_t b;
     86    bool    inMarkup = false;
     87    int32_t openTags = 0;
     88    int32_t badTags  = 0;
     89 
     90    //
     91    //  html / xml markup stripping.
     92    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
     93    //     discard everything within < brackets >
     94    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
     95    //     guess as to whether the input was actually marked up at all.
     96    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
     97    if (fStripTags) {
     98        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
     99            b = fRawInput[srci];
    100 
    101            if (b == static_cast<uint8_t>(0x3C)) { /* Check for the ASCII '<' */
    102                if (inMarkup) {
    103                    badTags += 1;
    104                }
    105 
    106                inMarkup = true;
    107                openTags += 1;
    108            }
    109 
    110            if (! inMarkup) {
    111                fInputBytes[dsti++] = b;
    112            }
    113 
    114            if (b == static_cast<uint8_t>(0x3E)) { /* Check for the ASCII '>' */
    115                inMarkup = false;
    116            }
    117        }
    118 
    119        fInputLen = dsti;
    120    }
    121 
    122    //
    123    //  If it looks like this input wasn't marked up, or if it looks like it's
    124    //    essentially nothing but markup abandon the markup stripping.
    125    //    Detection will have to work on the unstripped input.
    126    //
    127    if (openTags<5 || openTags/5 < badTags || 
    128        (fInputLen < 100 && fRawLength>600))
    129    {
    130        int32_t limit = fRawLength;
    131 
    132        if (limit > BUFFER_SIZE) {
    133            limit = BUFFER_SIZE;
    134        }
    135 
    136        for (srci=0; srci<limit; srci++) {
    137            fInputBytes[srci] = fRawInput[srci];
    138        }
    139 
    140        fInputLen = srci;
    141    }
    142 
    143    //
    144    // Tally up the byte occurrence statistics.
    145    // These are available for use by the various detectors.
    146    //
    147 
    148    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
    149 
    150    for (srci = 0; srci < fInputLen; srci += 1) {
    151        fByteStats[fInputBytes[srci]] += 1;
    152    }
    153 
    154    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
    155        if (fByteStats[i] != 0) {
    156            fC1Bytes = true;
    157            break;
    158        }
    159    }
    160 }
    161 
    162 U_NAMESPACE_END
    163 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE