parser.py (18918B)
"""
A direct translation of the webvtt file parsing algorithm.

See https://w3c.github.io/webvtt/#file-parsing for documentation
"""
import re
import string

SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']
# One *or more* whitespace characters.  The quantifier must not allow an
# empty match: since Python 3.7 `re.split` also splits on empty matches,
# which would cut the input between every single character.
SPACE_SPLIT_PATTERN = r"[{}]+".format(''.join(SPACE_CHARACTERS))
DIGITS = string.digits


class DictInit:
    """Base class that stores every keyword argument as an instance attribute."""

    def __init__(self, **attributes):
        self.__dict__.update(attributes)


class VTTCue(DictInit):
    """Attribute bag for a parsed cue (see `VTTParser.collect_block`)."""


class VTTRegion(DictInit):
    """Attribute bag for a parsed region (see `VTTParser.collect_block`)."""


class Stylesheet(DictInit):
    """Attribute bag for a parsed STYLE block (see `VTTParser.collect_block`)."""


class W3CParser:
    """Shared helpers operating on `self.input` at offset `self.position`."""

    input = None
    position = None

    def collect_characters(self, condition):
        """Consume characters while `condition(char)` is true.

        Advances `self.position` past the consumed run and returns the run
        as a string (possibly empty).
        """
        start = self.position
        end = len(self.input)
        while self.position < end and condition(self.input[self.position]):
            self.position += 1
        return self.input[start:self.position]

    def skip_whitespace(self):
        """Advance `self.position` past any run of `SPACE_CHARACTERS`."""
        self.collect_characters(lambda c: c in SPACE_CHARACTERS)

    def parse_percentage_string(self, input):
        """Parse a percentage string such as ``"12.5%"``.

        Returns the value as a float, or None when `input` is not of the
        form digits[.digits]% or the value lies outside 0..100.
        """
        # 1. (the argument is already the input)

        # 2.
        if not re.match(r'^\d+(\.\d+)?%$', input):
            return None

        # 3.
        percentage = float(input[:-1])

        # 4.
        if percentage < 0 or percentage > 100:
            return None

        # 5.
        return percentage


class VTTParser(W3CParser):
    """WebVTT file parser.

    After `parse()` the results are available in `text_tracks` (cues),
    `stylesheets`, `regions` and `errors` (list of message strings).
    """

    def __init__(self, input):
        self.input = input
        self.position = 0
        self.seen_cue = False

        self.text_tracks = []
        self.stylesheets = []
        self.regions = []
        self.errors = []

    def parse(self):
        """WebVTT parser algorithm.

        Validates the ``WEBVTT`` signature, skips the header and then
        collects blocks until the input is exhausted.  Returns None;
        problems are reported through `self.errors`.
        """
        # 1.
        self.input = (
            self.input
            .replace('\0', '\ufffd')
            .replace('\r\n', '\n')
            .replace('\r', '\n')
        )

        # 2.
        self.position = 0

        # 3.
        self.seen_cue = False

        # 4.
        if len(self.input) < 6:
            self.errors.append('input too small for webvtt')
            return

        # 5.
        if len(self.input) == 6 and self.input != 'WEBVTT':
            self.errors.append('invalid webvtt header')
            return

        # 6.
        if len(self.input) > 6:
            if not (self.input[0:6] == 'WEBVTT'
                    and self.input[6] in ['\u0020', '\u0009', '\u000A']):
                self.errors.append('invalid webvtt header')
                return

        # 7.
        self.collect_characters(lambda c: c != '\n')

        # 8.
        if self.position >= len(self.input):
            return

        # 9.
        if self.input[self.position] == '\n':
            self.position += 1

        # 10.
        if self.position >= len(self.input):
            return

        # 11.
        if self.input[self.position] != '\n':
            self.collect_block(in_header=True)
        else:
            self.position += 1

        # 12.
        self.collect_characters(lambda c: c == '\n')

        # 13.
        self.regions = []

        # 14.
        while self.position < len(self.input):
            # 1.
            block = self.collect_block()

            # 2.
            if isinstance(block, VTTCue):
                self.text_tracks.append(block)

            # 3.
            elif isinstance(block, Stylesheet):
                self.stylesheets.append(block)

            # 4.
            elif isinstance(block, VTTRegion):
                self.regions.append(block)

            # 5.
            self.collect_characters(lambda c: c == '\n')

        # 15.
        return

    def collect_block(self, in_header=False):
        """Collect a WebVTT block starting at `self.position`.

        Returns a `VTTCue`, `Stylesheet` or `VTTRegion`, or None when the
        block is none of these.  With `in_header` set, cue, style and region
        detection are all disabled and a line containing ``-->`` ends the
        block with an error.
        """
        # 1. (done by class)

        line_count = 0                       # 2.
        previous_position = self.position    # 3.
        line = ""                            # 4.
        buffer = ""                          # 5.
        seen_eof = False                     # 6.
        seen_arrow = False                   # 7.
        cue = None                           # 8.
        stylesheet = None                    # 9.
        region = None                        # 10.

        # 11.
        while True:
            # 1.
            line = self.collect_characters(lambda c: c != '\n')

            # 2.
            line_count += 1

            # 3.
            if self.position >= len(self.input):
                seen_eof = True
            else:
                self.position += 1

            # 4.
            if '-->' in line:
                # 1.
                if not in_header and (
                        line_count == 1
                        or (line_count == 2 and not seen_arrow)):
                    # 1.
                    seen_arrow = True

                    # 2.
                    previous_position = self.position

                    # 3.
                    cue = VTTCue(
                        id=buffer,
                        pause_on_exit=False,
                        region=None,
                        writing_direction='horizontal',
                        snap_to_lines=True,
                        line='auto',
                        line_alignment='start alignment',
                        position='auto',
                        position_alignment='auto',
                        cue_size=100,
                        text_alignment='center',
                        text='',
                    )

                    # 4.
                    if not VTTCueParser(self, line, cue).collect_cue_timings_and_settings():
                        cue = None
                    else:
                        buffer = ''
                        self.seen_cue = True  # DIFFERENCE

                else:
                    # A second arrow line belongs to the next block: rewind
                    # so that the caller sees it again.
                    self.errors.append('invalid webvtt cue block')
                    self.position = previous_position
                    break

            # 5.
            elif line == '':
                break

            # 6.
            else:
                # 1.
                if not in_header and line_count == 2:
                    # 1.
                    if not self.seen_cue and re.match(r'^STYLE\s*$', buffer):
                        stylesheet = Stylesheet(
                            location=None,
                            parent=None,
                            owner_node=None,
                            owner_rule=None,
                            media=None,
                            title=None,
                            alternate=False,
                            origin_clean=True,
                            source=None,
                        )
                        buffer = ''
                    # 2.
                    elif not self.seen_cue and re.match(r'^REGION\s*$', buffer):
                        region = VTTRegion(
                            id='',
                            width=100,
                            lines=3,
                            anchor_point=(0, 100),
                            viewport_anchor_point=(0, 100),
                            scroll_value=None,
                        )
                        buffer = ''

                # 2.
                if buffer != '':
                    buffer += '\n'

                # 3.
                buffer += line

                # 4.
                previous_position = self.position

            # 7.
            if seen_eof:
                break

        # 12.
        if cue is not None:
            cue.text = buffer
            return cue

        # 13.
        elif stylesheet is not None:
            stylesheet.source = buffer
            return stylesheet

        # 14.
        elif region is not None:
            self.collect_region_settings(region, buffer)
            return region

        # 15.
        return None

    def collect_region_settings(self, region, input):
        """Collect WebVTT region settings.

        Splits `input` on whitespace and applies every recognised
        ``name:value`` setting to `region`; anything malformed or unknown
        is skipped without recording an error.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)

        # 2.
        for setting in settings:
            # 1.
            if ':' not in setting:
                continue

            index = setting.index(':')
            if index in [0, len(setting) - 1]:
                continue

            # 2.
            name = setting[:index]

            # 3.
            value = setting[index + 1:]

            # 4.
            if name == "id":
                region.id = value

            elif name == "width":
                percentage = self.parse_percentage_string(value)
                if percentage is not None:
                    region.width = percentage

            elif name == "lines":
                # 1.
                if not re.match(r'^\d+$', value):
                    continue

                # 2.
                number = int(value)

                # 3.
                region.lines = number

            elif name == "regionanchor":
                # 1.
                if ',' not in value:
                    continue

                # 2.
                index = value.index(',')
                anchorX = value[:index]

                # 3.
                anchorY = value[index + 1:]

                # 4.
                percentageX = self.parse_percentage_string(anchorX)
                percentageY = self.parse_percentage_string(anchorY)
                if None in [percentageX, percentageY]:
                    continue

                # 5.
                region.anchor_point = (percentageX, percentageY)

            elif name == "viewportanchor":
                # 1.
                if ',' not in value:
                    continue

                # 2.
                index = value.index(',')
                viewportanchorX = value[:index]

                # 3.
                viewportanchorY = value[index + 1:]

                # 4.
                percentageX = self.parse_percentage_string(viewportanchorX)
                percentageY = self.parse_percentage_string(viewportanchorY)
                if None in [percentageX, percentageY]:
                    continue

                # 5.
                region.viewport_anchor_point = (percentageX, percentageY)

            elif name == "scroll":
                # 1.
                if value == "up":
                    region.scroll_value = "up"

            # 5. (continue with the next setting)


class VTTCueParser(W3CParser):
    """Parser for one cue timing line (``start --> end settings``).

    Shares the `errors` list of `parent` (a `VTTParser`) and looks cue
    regions up in `parent.regions`.
    """

    def __init__(self, parent, input, cue):
        self.parent = parent
        self.errors = self.parent.errors
        self.input = input
        self.position = 0
        self.cue = cue

    def collect_cue_timings_and_settings(self):
        """Collect WebVTT cue timings and settings.

        Sets `start_time` and `end_time` on the cue and applies the
        settings that follow the end timestamp.  Returns True on success
        and False when the line is malformed.
        """
        # 1. (handled by class)

        # 2.
        self.position = 0

        # 3.
        self.skip_whitespace()

        # 4.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid start time for VTTCue')
            return False
        self.cue.start_time = timestamp

        # 5.
        self.skip_whitespace()

        # 6. - 8.  Expect the literal "-->".  The bounds check keeps a
        # truncated line from raising IndexError.
        for expected in '-->':
            if (self.position >= len(self.input)
                    or self.input[self.position] != expected):
                return False
            self.position += 1

        # 9.
        self.skip_whitespace()

        # 10.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid end time for VTTCue')
            return False
        self.cue.end_time = timestamp

        # 11.
        remainder = self.input[self.position:]

        # 12.
        self.parse_settings(remainder)

        # Extra
        return True

    def parse_settings(self, input):
        """Parse the WebVTT cue settings.

        Splits `input` on whitespace and applies every recognised
        ``name:value`` setting to `self.cue`; anything malformed or unknown
        is skipped without recording an error.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)

        # 2.
        for setting in settings:
            # 1.
            if ':' not in setting:
                continue

            index = setting.index(':')
            if index in [0, len(setting) - 1]:
                continue

            # 2.
            name = setting[:index]

            # 3.
            value = setting[index + 1:]

            # 4.
            if name == 'region':
                # 1. The last region with a matching id wins.
                last_regions = (
                    region for region in reversed(self.parent.regions)
                    if region.id == value
                )
                self.cue.region = next(last_regions, None)

            elif name == 'vertical':
                # 1. and 2.
                if value in ['rl', 'lr']:
                    self.cue.writing_direction = value

            elif name == 'line':
                # 1.
                if ',' in value:
                    index = value.index(',')
                    linepos = value[:index]
                    linealign = value[index + 1:]

                # 2.
                else:
                    linepos = value
                    linealign = None

                # 3.
                if not re.search(r'\d', linepos):
                    continue

                # 4.
                if linepos[-1] == '%':
                    number = self.parse_percentage_string(linepos)
                    if number is None:
                        continue
                else:
                    # 1.
                    if not re.match(r'^[-\.\d]*$', linepos):
                        continue

                    # 2.
                    if '-' in linepos[1:]:
                        continue

                    # 3.
                    if linepos.count('.') > 1:
                        continue

                    # 4.
                    if '.' in linepos:
                        if not re.search(r'\d\.\d', linepos):
                            continue

                    # 5.
                    number = float(linepos)

                # 5.
                if linealign == "start":
                    self.cue.line_alignment = 'start'

                # 6.
                elif linealign == "center":
                    self.cue.line_alignment = 'center'

                # 7.
                elif linealign == "end":
                    self.cue.line_alignment = 'end'

                # 8.
                elif linealign is not None:
                    continue

                # 9.
                self.cue.line = number

                # 10.
                if linepos[-1] == '%':
                    self.cue.snap_to_lines = False
                else:
                    self.cue.snap_to_lines = True

            elif name == 'position':
                # 1.
                if ',' in value:
                    index = value.index(',')
                    colpos = value[:index]
                    colalign = value[index + 1:]

                # 2.
                else:
                    colpos = value
                    colalign = None

                # 3.
                number = self.parse_percentage_string(colpos)
                if number is None:
                    continue

                # 4. - 6.  This is the *position* alignment; the line
                # alignment belongs to the 'line' setting only.
                if colalign == "line-left":
                    self.cue.position_alignment = 'line-left'

                # 5.
                elif colalign == "center":
                    self.cue.position_alignment = 'center'

                # 6.
                elif colalign == "line-right":
                    self.cue.position_alignment = 'line-right'

                # 7.
                elif colalign is not None:
                    continue

                # 8.
                self.cue.position = number

            elif name == 'size':
                # 1.
                number = self.parse_percentage_string(value)
                if number is None:
                    continue

                # 2.
                self.cue.cue_size = number

            elif name == 'align':
                # 1. - 5.
                if value in ('start', 'center', 'end', 'left', 'right'):
                    self.cue.text_alignment = value

            # 5. (continue with the next setting)

    def collect_timestamp(self):
        """Collect a WebVTT timestamp at `self.position`.

        Accepts ``mm:ss.ttt`` and ``hh:mm:ss.ttt``.  Returns the time in
        seconds, or None when the text at the current position is not a
        valid timestamp (`self.position` may already have advanced).
        """
        # 1. (handled by class)

        # 2.
        most_significant_units = 'minutes'

        # 3.
        if self.position >= len(self.input):
            return None

        # 4.
        if self.input[self.position] not in DIGITS:
            return None

        # 5.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 6.
        value_1 = int(digits)

        # 7.
        if len(digits) != 2 or value_1 > 59:
            most_significant_units = 'hours'

        # 8.
        if self.position >= len(self.input) or self.input[self.position] != ':':
            return None
        self.position += 1

        # 9.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 10.
        if len(digits) != 2:
            return None

        # 11.
        value_2 = int(digits)

        # 12.
        if most_significant_units == 'hours' or (
                self.position < len(self.input)
                and self.input[self.position] == ':'):
            # 1.
            if self.position >= len(self.input) or self.input[self.position] != ':':
                return None
            self.position += 1

            # 2.
            digits = self.collect_characters(lambda c: c in DIGITS)

            # 3.
            if len(digits) != 2:
                return None

            # 4.
            value_3 = int(digits)
        else:
            # Only two components were given: shift them to minutes/seconds.
            value_3 = value_2
            value_2 = value_1
            value_1 = 0

        # 13.
        if self.position >= len(self.input) or self.input[self.position] != '.':
            return None
        self.position += 1

        # 14.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 15.
        if len(digits) != 3:
            return None

        # 16.
        value_4 = int(digits)

        # 17.  Minutes and seconds may be 0..59 inclusive.
        if value_2 > 59 or value_3 > 59:
            return None

        # 18.
        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000

        # 19.
        return result


def main(argv):
    """Parse every file named in `argv[1:]` and print the results.

    Files are opened one at a time so that a failure to open one of them
    cannot leak handles of the others.
    """
    for path in argv[1:]:
        with open(path, 'r') as file:
            parser = VTTParser(file.read())
            parser.parse()

            print("Results: {}".format(file))
            print("  Cues: {}".format(parser.text_tracks))
            print("  StyleSheets: {}".format(parser.stylesheets))
            print("  Regions: {}".format(parser.regions))
            print("  Errors: {}".format(parser.errors))


if __name__ == '__main__':
    import sys
    main(sys.argv)