tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

parser.py (18918B)


      1 """
      2 A direct translation of the webvtt file parsing algorithm.
      3 
      4 See https://w3c.github.io/webvtt/#file-parsing for documentation
      5 """
      6 import re
      7 import string
      8 
      9 SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']
     10 SPACE_SPLIT_PATTERN = r"[{}]*".format(''.join(SPACE_CHARACTERS))
     11 DIGITS = string.digits
     12 
     13 class DictInit:
     14    def __init__(self, **dict):
     15        self.__dict__.update(dict)
     16 
     17 class VTTCue(DictInit): pass
     18 class VTTRegion(DictInit): pass
     19 class Stylesheet(DictInit): pass
     20 
     21 class W3CParser:
     22    input = None
     23    position = None
     24 
     25    def collect_characters(self, condition):
     26        result = ""
     27        while self.position < len(self.input) and condition(self.input[self.position]):
     28            result += self.input[self.position]
     29            self.position += 1
     30        return result
     31 
     32    def skip_whitespace(self):
     33        self.collect_characters(lambda c: c in SPACE_CHARACTERS)
     34 
     35    def parse_percentage_string(self, input):
     36        'parse a percentage string'
     37 
     38        # 1.
     39        input = input
     40 
     41        # 2.
     42        if not re.match(r'^\d+(\.\d+)?%$', input):
     43            return None
     44 
     45        # 3.
     46        percentage = float(input[:-1])
     47 
     48        # 4.
     49        if percentage < 0 or percentage > 100:
     50            return None
     51 
     52        # 5.
     53        return percentage
     54 
     55 class VTTParser(W3CParser):
     56    def __init__(self, input):
     57        self.input = input
     58        self.position = 0
     59        self.seen_cue = False
     60 
     61        self.text_tracks = []
     62        self.stylesheets = []
     63        self.regions = []
     64        self.errors = []
     65 
     66    def parse(self):
     67        'WebVTT parser algorithm'
     68 
     69        # 1.
     70        self.input = self.input.replace('\0', '\ufffd').replace('\r\n', '\n').replace('\r', '\n')
     71 
     72        # 2.
     73        self.position = 0
     74 
     75        # 3.
     76        self.seen_cue = False
     77 
     78        # 4.
     79        if len(self.input) < 6:
     80            self.errors.append('input too small for webvtt')
     81            return
     82 
     83        # 5.
     84        if len(self.input) == 6 and self.input != 'WEBVTT':
     85            self.errors.append('invalid webvtt header')
     86            return
     87 
     88        # 6.
     89        if len(self.input) > 6:
     90            if not (self.input[0:6] == 'WEBVTT' and self.input[6] in ['\u0020', '\u0009', '\u000A']):
     91                self.errors.append('invalid webvtt header')
     92                return
     93 
     94        # 7.
     95        self.collect_characters(lambda c: c != '\n')
     96 
     97        # 8.
     98        if self.position >= len(self.input):
     99            return
    100 
    101        # 9.
    102        if self.input[self.position] == '\n':
    103            self.position += 1
    104 
    105        # 10.
    106        if self.position >= len(self.input):
    107            return
    108 
    109        # 11.
    110        if self.input[self.position] != '\n':
    111            self.collect_block(in_header = True)
    112        else:
    113            self.position += 1
    114 
    115        # 12.
    116        self.collect_characters(lambda c: c == '\n')
    117 
    118        # 13.
    119        self.regions = []
    120 
    121        # 14.
    122        while self.position < len(self.input):
    123            # 1.
    124            block = self.collect_block()
    125 
    126            # 2.
    127            if isinstance(block, VTTCue):
    128                self.text_tracks.append(block)
    129 
    130            # 3.
    131            elif isinstance(block, Stylesheet):
    132                self.stylesheets.append(block)
    133 
    134            # 4.
    135            elif isinstance(block, VTTRegion):
    136                self.regions.append(block)
    137 
    138            # 5.
    139            self.collect_characters(lambda c: c == '\n')
    140 
    141        # 15.
    142        return
    143 
    144    def collect_block(self, in_header = False):
    145        'collect a WebVTT block'
    146 
    147        # 1. (done by class)
    148 
    149        line_count = 0                    # 2.
    150        previous_position = self.position # 3.
    151        line = ""                         # 4.
    152        buffer = ""                       # 5.
    153        seen_eof = False                  # 6.
    154        seen_arrow = False                # 7.
    155        cue = None                        # 8.
    156        stylesheet = None                 # 9.
    157        region = None                     # 10.
    158 
    159        # 11.
    160        while True:
    161            # 1.
    162            line = self.collect_characters(lambda c: c != '\n')
    163 
    164            # 2.
    165            line_count += 1
    166 
    167            # 3.
    168            if self.position >= len(self.input):
    169                seen_eof = True
    170            else:
    171                self.position += 1
    172 
    173            # 4.
    174            if '-->' in line:
    175                # 1.
    176                if not in_header and (line_count == 1 or line_count == 2 and not seen_arrow):
    177                    # 1.
    178                    seen_arrow = True
    179 
    180                    # 2.
    181                    previous_position = self.position
    182 
    183                    # 3.
    184                    cue = VTTCue(
    185                        id = buffer,
    186                        pause_on_exit = False,
    187                        region = None,
    188                        writing_direction = 'horizontal',
    189                        snap_to_lines = True,
    190                        line = 'auto',
    191                        line_alignment = 'start alignment',
    192                        position = 'auto',
    193                        position_alignment = 'auto',
    194                        cue_size = 100,
    195                        text_alignment = 'center',
    196                        text = '',
    197                    )
    198 
    199                    # 4.
    200                    if not VTTCueParser(self, line, cue).collect_cue_timings_and_settings():
    201                        cue = None
    202                    else:
    203                        buffer = ''
    204                        self.seen_cue = True # DIFFERENCE
    205 
    206                else:
    207                    self.errors.append('invalid webvtt cue block')
    208                    self.position = previous_position
    209                    break
    210 
    211            # 5.
    212            elif line == '':
    213                break
    214 
    215            # 6.
    216            else:
    217                # 1.
    218                if not in_header and line_count == 2:
    219                    # 1.
    220                    if not self.seen_cue and re.match(r'^STYLE\s*$', buffer):
    221                        stylesheet = Stylesheet(
    222                            location = None,
    223                            parent = None,
    224                            owner_node = None,
    225                            owner_rule = None,
    226                            media = None,
    227                            title = None,
    228                            alternate = False,
    229                            origin_clean = True,
    230                            source = None,
    231                        )
    232                        buffer = ''
    233                    # 2.
    234                    elif not self.seen_cue and re.match(r'^REGION\s*$', buffer):
    235                        region = VTTRegion(
    236                            id = '',
    237                            width = 100,
    238                            lines = 3,
    239                            anchor_point = (0, 100),
    240                            viewport_anchor_point = (0, 100),
    241                            scroll_value = None,
    242                        )
    243                        buffer = ''
    244 
    245                # 2.
    246                if buffer != '':
    247                    buffer += '\n'
    248 
    249                # 3.
    250                buffer += line
    251 
    252                # 4.
    253                previous_position = self.position
    254 
    255            # 7.
    256            if seen_eof:
    257                break
    258 
    259        # 12.
    260        if cue is not None:
    261            cue.text = buffer
    262            return cue
    263 
    264        # 13.
    265        elif stylesheet is not None:
    266            stylesheet.source = buffer
    267            return stylesheet
    268 
    269        # 14.
    270        elif region is not None:
    271            self.collect_region_settings(region, buffer)
    272            return region
    273 
    274        # 15.
    275        return None
    276 
    277    def collect_region_settings(self, region, input):
    278        'collect WebVTT region settings'
    279 
    280        # 1.
    281        settings = re.split(SPACE_SPLIT_PATTERN, input)
    282 
    283        # 2.
    284        for setting in settings:
    285            # 1.
    286            if ':' not in setting:
    287                continue
    288 
    289            index = setting.index(':')
    290            if index in [0, len(setting) - 1]:
    291                continue
    292 
    293            # 2.
    294            name = setting[:index]
    295 
    296            # 3.
    297            value = setting[index + 1:]
    298 
    299            # 4.
    300            if name == "id":
    301                region.id = value
    302 
    303            elif name == "width":
    304                percentage = self.parse_percentage_string(value)
    305                if percentage is not None:
    306                    region.width = percentage
    307 
    308            elif name == "lines":
    309                # 1.
    310                if not re.match(r'^\d+$', value):
    311                    continue
    312 
    313                # 2.
    314                number = int(value)
    315 
    316                # 3.
    317                region.lines = number
    318 
    319            elif name == "regionanchor":
    320                # 1.
    321                if ',' not in value:
    322                    continue
    323 
    324                #. 2.
    325                index = value.index(',')
    326                anchorX = value[:index]
    327 
    328                # 3.
    329                anchorY = value[index + 1:]
    330 
    331                # 4.
    332                percentageX = self.parse_percentage_string(anchorX)
    333                percentageY = self.parse_percentage_string(anchorY)
    334                if None in [percentageX, percentageY]:
    335                    continue
    336 
    337                # 5.
    338                region.anchor_point = (percentageX, percentageY)
    339 
    340            elif name == "viewportanchor":
    341                # 1.
    342                if ',' not in value:
    343                    continue
    344 
    345                #. 2.
    346                index = value.index(',')
    347                viewportanchorX = value[:index]
    348 
    349                # 3.
    350                viewportanchorY = value[index + 1:]
    351 
    352                # 4.
    353                percentageX = self.parse_percentage_string(viewportanchorX)
    354                percentageY = self.parse_percentage_string(viewportanchorY)
    355                if None in [percentageX, percentageY]:
    356                    continue
    357 
    358                # 5.
    359                region.viewport_anchor_point = (percentageX, percentageY)
    360 
    361            elif name == "scroll":
    362                # 1.
    363                if value == "up":
    364                    region.scroll_value = "up"
    365 
    366            # 5.
    367            continue
    368 
    369 
    370 class VTTCueParser(W3CParser):
    371    def __init__(self, parent, input, cue):
    372        self.parent = parent
    373        self.errors = self.parent.errors
    374        self.input = input
    375        self.position = 0
    376        self.cue = cue
    377 
    378    def collect_cue_timings_and_settings(self):
    379        'collect WebVTT cue timings and settings'
    380 
    381        # 1. (handled by class)
    382 
    383        # 2.
    384        self.position = 0
    385 
    386        # 3.
    387        self.skip_whitespace()
    388 
    389        # 4.
    390        timestamp = self.collect_timestamp()
    391        if timestamp is None:
    392            self.errors.append('invalid start time for VTTCue')
    393            return False
    394        self.cue.start_time = timestamp
    395 
    396        # 5.
    397        self.skip_whitespace()
    398 
    399        # 6.
    400        if self.input[self.position] != '-':
    401            return False
    402        self.position += 1
    403 
    404        # 7.
    405        if self.input[self.position] != '-':
    406            return False
    407        self.position += 1
    408 
    409        # 8.
    410        if self.input[self.position] != '>':
    411            return False
    412        self.position += 1
    413 
    414        # 9.
    415        self.skip_whitespace()
    416 
    417        # 10.
    418        timestamp = self.collect_timestamp()
    419        if timestamp is None:
    420            self.errors.append('invalid end time for VTTCue')
    421            return False
    422        self.cue.end_time = timestamp
    423 
    424        # 11.
    425        remainder = self.input[self.position:]
    426 
    427        # 12.
    428        self.parse_settings(remainder)
    429 
    430        # Extra
    431        return True
    432 
    433    def parse_settings(self, input):
    434        'parse the WebVTT cue settings'
    435 
    436        # 1.
    437 
    438        settings = re.split(SPACE_SPLIT_PATTERN, input)
    439 
    440        # 2.
    441        for setting in settings:
    442            # 1.
    443            if ':' not in setting:
    444                continue
    445 
    446            index = setting.index(':')
    447            if index in [0, len(setting) - 1]:
    448                continue
    449 
    450            # 2.
    451            name = setting[:index]
    452 
    453            # 3.
    454            value = setting[index + 1:]
    455 
    456            # 4.
    457            if name == 'region':
    458                # 1.
    459                last_regions = (region for region in reversed(self.parent.regions) if region.id == value)
    460                self.cue.region = next(last_regions, None)
    461 
    462            elif name == 'vertical':
    463                # 1. and 2.
    464                if value in ['rl', 'lr']:
    465                    self.cue.writing_direction = value
    466 
    467            elif name == 'line':
    468                # 1.
    469                if ',' in value:
    470                    index = value.index(',')
    471                    linepos = value[:index]
    472                    linealign = value[index + 1:]
    473 
    474                # 2.
    475                else:
    476                    linepos = value
    477                    linealign = None
    478 
    479                # 3.
    480                if not re.search(r'\d', linepos):
    481                    continue
    482 
    483                # 4.
    484                if linepos[-1] == '%':
    485                    number = self.parse_percentage_string(linepos)
    486                    if number is None:
    487                        continue
    488                else:
    489                    # 1.
    490                    if not re.match(r'^[-\.\d]*$', linepos):
    491                        continue
    492 
    493                    # 2.
    494                    if '-' in linepos[1:]:
    495                        continue
    496 
    497                    # 3.
    498                    if linepos.count('.') > 1:
    499                        continue
    500 
    501                    # 4.
    502                    if '.' in linepos:
    503                        if not re.search(r'\d\.\d', linepos):
    504                            continue
    505 
    506                    # 5.
    507                    number = float(linepos)
    508 
    509                # 5.
    510                if linealign == "start":
    511                    self.cue.line_alignment = 'start'
    512 
    513                # 6.
    514                elif linealign == "center":
    515                    self.cue.line_alignment = 'center'
    516 
    517                # 7.
    518                elif linealign == "end":
    519                    self.cue.line_alignment = 'end'
    520 
    521                # 8.
    522                elif linealign != None:
    523                    continue
    524 
    525                # 9.
    526                self.cue.line = number
    527 
    528                # 10.
    529                if linepos[-1] == '%':
    530                    self.cue.snap_to_lines = False
    531                else:
    532                    self.cue.snap_to_lines = True
    533 
    534            elif name == 'position':
    535                # 1.
    536                if ',' in value:
    537                    index = value.index(',')
    538                    colpos = value[:index]
    539                    colalign = value[index + 1:]
    540 
    541                # 2.
    542                else:
    543                    colpos = value
    544                    colalign = None
    545 
    546                # 3.
    547                number = self.parse_percentage_string(colpos)
    548                if number is None:
    549                    continue
    550 
    551                # 4.
    552                if colalign == "line-left":
    553                    self.cue.line_alignment = 'line-left'
    554 
    555                # 5.
    556                elif colalign == "center":
    557                    self.cue.line_alignment = 'center'
    558 
    559                # 6.
    560                elif colalign == "line-right":
    561                    self.cue.line_alignment = 'line-right'
    562 
    563                # 7.
    564                elif colalign != None:
    565                    continue
    566 
    567                # 8.
    568                self.cue.position = number
    569 
    570            elif name == 'size':
    571                # 1.
    572                number = self.parse_percentage_string(value)
    573                if number is None:
    574                    continue
    575 
    576                # 2.
    577                self.cue.cue_size = number
    578 
    579            elif name == 'align':
    580                # 1.
    581                if value == 'start':
    582                    self.cue.text_alignment = 'start'
    583 
    584                # 2.
    585                if value == 'center':
    586                    self.cue.text_alignment = 'center'
    587 
    588                # 3.
    589                if value == 'end':
    590                    self.cue.text_alignment = 'end'
    591 
    592                # 4.
    593                if value == 'left':
    594                    self.cue.text_alignment = 'left'
    595 
    596                # 5.
    597                if value == 'right':
    598                    self.cue.text_alignment = 'right'
    599 
    600            # 5.
    601            continue
    602 
    603    def collect_timestamp(self):
    604        'collect a WebVTT timestamp'
    605 
    606        # 1. (handled by class)
    607 
    608        # 2.
    609        most_significant_units = 'minutes'
    610 
    611        # 3.
    612        if self.position >= len(self.input):
    613            return None
    614 
    615        # 4.
    616        if self.input[self.position] not in DIGITS:
    617            return None
    618 
    619        # 5.
    620        string = self.collect_characters(lambda c: c in DIGITS)
    621 
    622        # 6.
    623        value_1 = int(string)
    624 
    625        # 7.
    626        if len(string) != 2 or value_1 > 59:
    627            most_significant_units = 'hours'
    628 
    629        # 8.
    630        if self.position >= len(self.input) or self.input[self.position] != ':':
    631            return None
    632        self.position += 1
    633 
    634        # 9.
    635        string = self.collect_characters(lambda c: c in DIGITS)
    636 
    637        # 10.
    638        if len(string) != 2:
    639            return None
    640 
    641        # 11.
    642        value_2 = int(string)
    643 
    644        # 12.
    645        if most_significant_units == 'hours' or self.position < len(self.input) and self.input[self.position] == ':':
    646            # 1.
    647            if self.position >= len(self.input) or self.input[self.position] != ':':
    648                return None
    649            self.position += 1
    650 
    651            # 2.
    652            string = self.collect_characters(lambda c: c in DIGITS)
    653 
    654            # 3.
    655            if len(string) != 2:
    656                return None
    657 
    658            # 4.
    659            value_3 = int(string)
    660        else:
    661            value_3 = value_2
    662            value_2 = value_1
    663            value_1 = 0
    664 
    665        # 13.
    666        if self.position >= len(self.input) or self.input[self.position] != '.':
    667            return None
    668        self.position += 1
    669 
    670        # 14.
    671        string = self.collect_characters(lambda c: c in DIGITS)
    672 
    673        # 15.
    674        if len(string) != 3:
    675            return None
    676 
    677        # 16.
    678        value_4 = int(string)
    679 
    680        # 17.
    681        if value_2 >= 59 or value_3 >= 59:
    682            return None
    683 
    684        # 18.
    685        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000
    686 
    687        # 19.
    688        return result
    689 
    690 
    691 def main(argv):
    692    files = [open(path, 'r') for path in argv[1:]]
    693 
    694    try:
    695        for file in files:
    696            parser = VTTParser(file.read())
    697            parser.parse()
    698 
    699            print("Results: {}".format(file))
    700            print("  Cues: {}".format(parser.text_tracks))
    701            print("  StyleSheets: {}".format(parser.stylesheets))
    702            print("  Regions: {}".format(parser.regions))
    703            print("  Errors: {}".format(parser.errors))
    704    finally:
    705        for file in files:
    706            file.close()
    707 
    708 if __name__ == '__main__':
    709    import sys
    710    main(sys.argv);