tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dexdump.py (11112B)


      1 # Copyright 2016 The Chromium Authors
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import os
      6 import re
      7 import shutil
      8 import sys
      9 import tempfile
     10 from xml.etree import ElementTree
     11 from collections import namedtuple
     12 from typing import Dict
     13 
     14 from devil.utils import cmd_helper
     15 from pylib import constants
     16 
     17 sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'gyp'))
     18 from util import build_utils
     19 
     20 DEXDUMP_PATH = os.path.join(constants.ANDROID_SDK_TOOLS, 'dexdump')
     21 
     22 
     23 # Annotations dict format:
     24 #   {
     25 #     'empty-annotation-class-name': None,
     26 #     'annotation-class-name': {
     27 #       'fieldA': 'primitive-value',
     28 #       'fieldB': [ 'array-item-1', 'array-item-2', ... ],
     29 #       'fieldC': {  # CURRENTLY UNSUPPORTED.
     30 #         /* Object value */
     31 #         'field': 'primitive-value',
     32 #         'field': [ 'array-item-1', 'array-item-2', ... ],
     33 #         'field': { /* Object value */ }
     34 #       }
     35 #     }
     36 #   }
     37 Annotations = namedtuple('Annotations',
     38                         ['classAnnotations', 'methodsAnnotations'])
     39 
     40 # Finds each space-separated "foo=..." (where ... can contain spaces).
     41 _ANNOTATION_VALUE_MATCHER = re.compile(r'\w+=.*?(?:$|(?= \w+=))')
     42 
     43 
     44 def Dump(apk_path):
     45  """Dumps class and method information from a APK into a dict via dexdump.
     46 
     47  Args:
     48    apk_path: An absolute path to an APK file to dump.
     49  Returns:
     50    A dict in the following format:
     51      {
     52        <package_name>: {
     53          'classes': {
     54            <class_name>: {
     55              'methods': [<method_1>, <method_2>],
     56              'superclass': <string>,
     57              'is_abstract': <boolean>,
     58              'annotations': <Annotations>
     59            }
     60          }
     61        }
     62      }
     63  """
     64  try:
     65    dexfile_dir = tempfile.mkdtemp()
     66    parsed_dex_files = []
     67    for dex_file in build_utils.ExtractAll(apk_path,
     68                                           dexfile_dir,
     69                                           pattern='*classes*.dex'):
     70      output_xml = cmd_helper.GetCmdOutput(
     71          [DEXDUMP_PATH, '-a', '-j', '-l', 'xml', dex_file])
     72      # Dexdump doesn't escape its XML output very well; decode it as utf-8 with
     73      # invalid sequences replaced, then remove forbidden characters and
     74      # re-encode it (as etree expects a byte string as input so it can figure
     75      # out the encoding itself from the XML declaration)
     76      BAD_XML_CHARS = re.compile(
     77          u'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x84\x86-\x9f' +
     78          u'\ud800-\udfff\ufdd0-\ufddf\ufffe-\uffff]')
     79 
     80      # Line duplicated to avoid pylint redefined-variable-type error.
     81      clean_xml = BAD_XML_CHARS.sub(u'\ufffd', output_xml)
     82 
     83      # Constructors are referenced as "<init>" in our annotations
     84      # which will result in in the ElementTree failing to parse
     85      # our xml as it won't find a closing tag for this
     86      clean_xml = clean_xml.replace('<init>', 'constructor')
     87 
     88      annotations = _ParseAnnotations(clean_xml)
     89 
     90      parsed_dex_files.append(
     91          _ParseRootNode(ElementTree.fromstring(clean_xml.encode('utf-8')),
     92                         annotations))
     93    return parsed_dex_files
     94  finally:
     95    shutil.rmtree(dexfile_dir)
     96 
     97 
     98 def _ParseAnnotationValues(values_str):
     99  if not values_str:
    100    return None
    101  ret = {}
    102  for key_value in _ANNOTATION_VALUE_MATCHER.findall(values_str):
    103    key, value_str = key_value.split('=', 1)
    104    # TODO: support for dicts if ever needed.
    105    if value_str.startswith('{ ') and value_str.endswith(' }'):
    106      value = value_str[2:-2].split()
    107    else:
    108      value = value_str
    109    ret[key] = value
    110  return ret
    111 
    112 
    113 def _ParseAnnotations(dexRaw: str) -> Dict[int, Annotations]:
    114  """ Parse XML strings and return a list of Annotations mapped to
    115  classes by index.
    116 
    117  Annotations are written to the dex dump as human readable blocks of text
    118  The only prescription is that they appear before the class in our xml file
    119  They are not required to be nested within the package as our classes
    120  It is simpler to parse for all the annotations and then associate them
    121  back to the
    122  classes
    123 
    124  Example:
    125  Class #12 annotations:
    126  Annotations on class
    127    VISIBILITY_RUNTIME Ldalvik/annotation/EnclosingClass; value=...
    128  Annotations on method #512 'example'
    129    VISIBILITY_SYSTEM Ldalvik/annotation/Signature; value=...
    130    VISIBILITY_RUNTIME Landroidx/test/filters/SmallTest;
    131    VISIBILITY_RUNTIME Lorg/chromium/base/test/util/Feature; value={ Cronet }
    132    VISIBILITY_RUNTIME LFoo; key1={ A B } key2=4104 key3=null
    133  """
    134 
    135  # We want to find the lines matching the annotations header pattern
    136  # Eg: Class #12 annotations -> true
    137  annotationsBlockMatcher = re.compile(u'^Class #.*annotations:$')
    138  # We want to retrieve the index of the class
    139  # Eg: Class #12 annotations -> 12
    140  classIndexMatcher = re.compile(u'(?<=#)[0-9]*')
    141  # We want to retrieve the method name from between the quotes
    142  # of the annotations line
    143  # Eg: Annotations on method #512 'example'  -> example
    144  methodMatcher = re.compile(u"(?<=')[^']*")
    145  # We want to match everything after the last slash until before the semi colon
    146  # Eg: Ldalvik/annotation/Signature; -> Signature
    147  annotationMatcher = re.compile(u'([^/]+); ?(.*)?')
    148 
    149  annotations = {}
    150  currentAnnotationsForClass = None
    151  currentAnnotationsBlock: Dict[str, None] = None
    152 
    153  # This loop does four things
    154  # 1. It looks for a line telling us we are describing annotations for
    155  #  a new class
    156  # 2. It looks for a line telling us if the annotations we find will be
    157  #  for the class or for any of it's methods; we will keep reference to
    158  #  this
    159  # 3. It adds the annotations to whatever we are holding reference to
    160  # 4. It looks for a line to see if we should start looking for a
    161  #  new class again
    162  for line in dexRaw.splitlines():
    163    if currentAnnotationsForClass is None:
    164      # Step 1
    165      # We keep searching until we find an annotation descriptor
    166      # This lets us know that we are storing annotations for a new class
    167      if annotationsBlockMatcher.match(line):
    168        currentClassIndex = int(classIndexMatcher.findall(line)[0])
    169        currentAnnotationsForClass = Annotations(classAnnotations={},
    170                                                 methodsAnnotations={})
    171        annotations[currentClassIndex] = currentAnnotationsForClass
    172    else:
    173      # Step 2
    174      # If we find a descriptor indicating we are tracking annotations
    175      # for the class or it's methods, we'll keep a reference of this
    176      # block for when we start finding annotation references
    177      if line.startswith(u'Annotations on class'):
    178        currentAnnotationsBlock = currentAnnotationsForClass.classAnnotations
    179      elif line.startswith(u'Annotations on method'):
    180        method = methodMatcher.findall(line)[0]
    181        currentAnnotationsBlock = {}
    182        currentAnnotationsForClass.methodsAnnotations[
    183            method] = currentAnnotationsBlock
    184 
    185      # If we match against any other type of annotations
    186      # we will ignore them
    187      elif line.startswith(u'Annotations on'):
    188        currentAnnotationsBlock = None
    189 
    190      # Step 3
    191      # We are only adding runtime annotations as those are the types
    192      # that will affect if we should run tests or not (where this is
    193      # being used)
    194      elif currentAnnotationsBlock is not None and line.strip().startswith(
    195          'VISIBILITY_RUNTIME'):
    196        annotationName, annotationValuesStr = annotationMatcher.findall(line)[0]
    197        annotationValues = _ParseAnnotationValues(annotationValuesStr)
    198 
    199        # Our instrumentation tests expect a mapping of "Annotation: Value"
    200        # We aren't using the value for anything and this would increase
    201        # the complexity of this parser so just mapping these to None
    202        currentAnnotationsBlock.update({annotationName: annotationValues})
    203 
    204      # Step 4
    205      # Empty lines indicate that the annotation descriptions are complete
    206      # and we should look for new classes
    207      elif not line.strip():
    208        currentAnnotationsForClass = None
    209        currentAnnotationsBlock = None
    210 
    211  return annotations
    212 
    213 
    214 def _ParseRootNode(root, annotations: Dict[int, Annotations]):
    215  """Parses the XML output of dexdump. This output is in the following format.
    216 
    217  This is a subset of the information contained within dexdump output.
    218 
    219  <api>
    220    <package name="foo.bar">
    221      <class name="Class" extends="foo.bar.SuperClass">
    222        <field name="Field">
    223        </field>
    224        <constructor name="Method">
    225          <parameter name="Param" type="int">
    226          </parameter>
    227        </constructor>
    228        <method name="Method">
    229          <parameter name="Param" type="int">
    230          </parameter>
    231        </method>
    232      </class>
    233    </package>
    234  </api>
    235  """
    236  results = {}
    237 
    238  # Annotations are referenced by the class order
    239  # To match them, we need to keep track of the class number and
    240  # match it to the appropriate annotation at that stage
    241  classCount = 0
    242 
    243  for child in root:
    244    if child.tag == 'package':
    245      package_name = child.attrib['name']
    246      parsed_node, classCount = _ParsePackageNode(child, classCount,
    247                                                  annotations)
    248      if package_name in results:
    249        results[package_name]['classes'].update(parsed_node['classes'])
    250      else:
    251        results[package_name] = parsed_node
    252  return results
    253 
    254 
    255 def _ParsePackageNode(package_node, classCount: int,
    256                      annotations: Dict[int, Annotations]):
    257  """Parses a <package> node from the dexdump xml output.
    258 
    259  Returns:
    260    A tuple in the format:
    261      (classes: {
    262        'classes': {
    263          <class_1>: {
    264            'methods': [<method_1>, <method_2>],
    265            'superclass': <string>,
    266            'is_abstract': <boolean>,
    267            'annotations': <Annotations or None>
    268          },
    269          <class_2>: {
    270            'methods': [<method_1>, <method_2>],
    271            'superclass': <string>,
    272            'is_abstract': <boolean>,
    273            'annotations': <Annotations or None>
    274          },
    275        }
    276      }, classCount: number)
    277  """
    278  classes = {}
    279  for child in package_node:
    280    if child.tag == 'class':
    281      classes[child.attrib['name']] = _ParseClassNode(child, classCount,
    282                                                      annotations)
    283      classCount += 1
    284  return ({'classes': classes}, classCount)
    285 
    286 
    287 def _ParseClassNode(class_node, classIndex: int,
    288                    annotations: Dict[int, Annotations]):
    289  """Parses a <class> node from the dexdump xml output.
    290 
    291  Returns:
    292    A dict in the format:
    293      {
    294        'methods': [<method_1>, <method_2>],
    295        'superclass': <string>,
    296        'is_abstract': <boolean>
    297      }
    298  """
    299  methods = []
    300  for child in class_node:
    301    if child.tag == 'method' and child.attrib['visibility'] == 'public':
    302      methods.append(child.attrib['name'])
    303  return {
    304      'methods':
    305      methods,
    306      'superclass':
    307      class_node.attrib['extends'],
    308      'is_abstract':
    309      class_node.attrib.get('abstract') == 'true',
    310      'annotations':
    311      annotations.get(classIndex,
    312                      Annotations(classAnnotations={}, methodsAnnotations={}))
    313  }