bytecode.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tools for searching bytecode for key statements that indicate the need for additional resources, such as data files
  4. and package metadata.
  5. By *bytecode* I mean the ``code`` object given by ``compile()``, accessible from the ``__code__`` attribute of any
  6. non-builtin function or, in PyInstallerLand, the ``PyiModuleGraph.node("some.module").code`` attribute. The best
  7. guide for bytecode format I have found is the disassembler reference: https://docs.python.org/3/library/dis.html
  8. This parser implementation aims to combine the flexibility and speed of regex with the clarity of the output of
  9. ``dis.dis(code)``. It has not achieved the 2nd, but C'est la vie...
  10. The biggest clarity killer here is the ``EXTENDED_ARG`` opcode which can appear almost anywhere and therefore needs
  11. to be tiptoed around at every step. If this code needs to expand significantly, I would recommend an upgrade to a
  12. regex-based grammar parsing library such as Reparse. This way, little steps like unpacking ``EXTENDED_ARGS`` can be
  13. defined once then simply referenced forming a nice hierarchy rather than copied everywhere its needed.
  14. """
  15. import dis
  16. import re
  17. from types import CodeType
  18. from typing import Pattern
  19. def _instruction_to_regex(x: str):
  20. """
  21. Get a regex-escaped opcode byte from its human readable name.
  22. """
  23. if x not in dis.opname: # pragma: no cover
  24. # These opcodes are available only in Python >=3.7. For our purposes, these aliases will do.
  25. if x == "LOAD_METHOD":
  26. x = "LOAD_ATTR"
  27. elif x == "CALL_METHOD":
  28. x = "CALL_FUNCTION"
  29. return re.escape(bytes([dis.opmap[x]]))
  30. def bytecode_regex(pattern: bytes, flags=re.VERBOSE | re.DOTALL):
  31. """
  32. A regex-powered Python bytecode matcher.
  33. ``bytecode_regex`` provides a very thin wrapper around :func:`re.compile`.
  34. * Any opcode names wrapped in backticks are substituted for their corresponding opcode bytes.
  35. * Patterns are compiled in VERBOSE mode by default so that whitespace and comments may be used.
  36. This aims to mirror the output of :func:`dis.dis`, which is far more readable than looking at raw byte strings.
  37. """
  38. assert isinstance(pattern, bytes)
  39. # Replace anything wrapped in backticks with regex-escaped opcodes.
  40. pattern = re.sub(
  41. rb"`(\w+)`",
  42. lambda m: _instruction_to_regex(m[1].decode()),
  43. pattern,
  44. )
  45. return re.compile(pattern, flags=flags)
  46. def finditer(pattern: Pattern, string):
  47. """
  48. Call ``pattern.finditer(string)``, but remove any matches beginning on an odd byte (i.e., matches where
  49. match.start() is not a multiple of 2).
  50. This should be used to avoid false positive matches where a bytecode pair's argument is mistaken for an opcode.
  51. """
  52. matches = pattern.finditer(string)
  53. while True:
  54. for match in matches:
  55. if match.start() % 2 == 0:
  56. # All is good. This match starts on an OPCODE.
  57. yield match
  58. else:
  59. # This match has started on an odd byte, meaning that it is a false positive and should be skipped.
  60. # There is a very slim chance that a genuine match overlaps this one and, because re.finditer() does not
  61. # allow overlapping matches, it would be lost. To avoid that, restart the regex scan, starting at the
  62. # next even byte.
  63. matches = pattern.finditer(string, match.start() + 1)
  64. break
  65. else:
  66. break
  67. # language=PythonVerboseRegExp
  68. _call_function_bytecode = bytecode_regex(
  69. rb"""
  70. # Matches `global_function('some', 'constant', 'arguments')`.
  71. # Load the global function. In code with >256 of names, this may require extended name references.
  72. ((?:`EXTENDED_ARG`.)*
  73. (?:`LOAD_NAME`|`LOAD_GLOBAL`|`LOAD_FAST`).)
  74. # For foo.bar.whizz(), the above is the 'foo', below is the 'bar.whizz'.
  75. ((?:(?:`EXTENDED_ARG`.)*
  76. (?:`LOAD_METHOD`|`LOAD_ATTR`).)*)
  77. # Load however many arguments it takes. These (for now) must all be constants.
  78. # Again, code with >256 constants may need extended enumeration.
  79. ((?:(?:`EXTENDED_ARG`.)*
  80. `LOAD_CONST`.)*)
  81. # Call the function. The parameter is the argument count (which may also be >256) if CALL_FUNCTION or CALL_METHOD
  82. # are used. For CALL_FUNCTION_EX, the parameter are flags.
  83. ((?:`EXTENDED_ARG`.)*
  84. (?:`CALL_FUNCTION`|`CALL_METHOD`|`CALL_FUNCTION_EX`).)
  85. """
  86. )
  87. # language=PythonVerboseRegExp
  88. _extended_arg_bytecode = bytecode_regex(
  89. rb"""(
  90. # Arbitrary number of EXTENDED_ARG pairs.
  91. (?:`EXTENDED_ARG`.)*
  92. # Followed by some other instruction (usually a LOAD).
  93. [^`EXTENDED_ARG`].
  94. )"""
  95. )
  96. def extended_arguments(extended_args: bytes):
  97. """
  98. Unpack the (extended) integer used to reference names or constants.
  99. The input should be a bytecode snippet of the following form::
  100. EXTENDED_ARG ? # Repeated 0-4 times.
  101. LOAD_xxx ? # Any of LOAD_NAME/LOAD_CONST/LOAD_METHOD/...
  102. Each ? byte combined together gives the number we want.
  103. """
  104. return int.from_bytes(extended_args[1::2], "big")
  105. def load(raw: bytes, code: CodeType) -> str:
  106. """
  107. Parse an (extended) LOAD_xxx instruction.
  108. """
  109. # Get the enumeration.
  110. index = extended_arguments(raw)
  111. # Work out what that enumeration was for (constant/local var/global var).
  112. # If the last instruction byte is a LOAD_FAST:
  113. if raw[-2] == dis.opmap["LOAD_FAST"]:
  114. # Then this is a local variable.
  115. return code.co_varnames[index]
  116. # Or if it is a LOAD_CONST:
  117. if raw[-2] == dis.opmap["LOAD_CONST"]:
  118. # Then this is a literal.
  119. return code.co_consts[index]
  120. # Otherwise, it is a global name.
  121. return code.co_names[index]
  122. def loads(raw: bytes, code: CodeType) -> list:
  123. """
  124. Parse multiple consecutive LOAD_xxx instructions. Or load() in a for loop.
  125. May be used to unpack a function's parameters or nested attributes ``(foo.bar.pop.whack)``.
  126. """
  127. return [load(i, code) for i in _extended_arg_bytecode.findall(raw)]
  128. def function_calls(code: CodeType) -> list:
  129. """
  130. Scan a code object for all function calls on constant arguments.
  131. """
  132. match: re.Match
  133. out = []
  134. for match in finditer(_call_function_bytecode, code.co_code):
  135. function_root, methods, args, function_call = match.groups()
  136. # For foo():
  137. # `function_root` contains 'foo' and `methods` is empty.
  138. # For foo.bar.whizz():
  139. # `function_root` contains 'foo' and `methods` contains the rest.
  140. function_root = load(function_root, code)
  141. methods = loads(methods, code)
  142. function = ".".join([function_root] + methods)
  143. args = loads(args, code)
  144. if function_call[0] == dis.opmap['CALL_FUNCTION_EX']:
  145. flags = extended_arguments(function_call)
  146. if flags != 0:
  147. # Keyword arguments present. Unhandled at the moment.
  148. continue
  149. # In calls with const arguments, args contains a single
  150. # tuple with all values.
  151. if len(args) != 1 or not isinstance(args[0], tuple):
  152. continue
  153. args = list(args[0])
  154. else:
  155. arg_count = extended_arguments(function_call)
  156. if arg_count != len(args):
  157. # This happens if there are variable or keyword arguments. Bail out in either case.
  158. continue
  159. out.append((function, args))
  160. return out
  161. def search_recursively(search: callable, code: CodeType, _memo=None) -> dict:
  162. """
  163. Apply a search function to a code object, recursing into child code objects (function definitions).
  164. """
  165. if _memo is None:
  166. _memo = {}
  167. if code not in _memo:
  168. _memo[code] = search(code)
  169. for const in code.co_consts:
  170. if isinstance(const, CodeType):
  171. search_recursively(search, const, _memo)
  172. return _memo
  173. def recursive_function_calls(code: CodeType) -> dict:
  174. """
  175. Scan a code object for function calls on constant arguments, recursing into function definitions and bodies of
  176. comprehension loops.
  177. """
  178. return search_recursively(function_calls, code)
  179. def any_alias(full_name: str):
  180. """List possible aliases of a fully qualified Python name.
  181. >>> list(any_alias("foo.bar.wizz"))
  182. ['foo.bar.wizz', 'bar.wizz', 'wizz']
  183. This crudely allows us to capture uses of wizz() under any of
  184. ::
  185. import foo
  186. foo.bar.wizz()
  187. ::
  188. from foo import bar
  189. bar.wizz()
  190. ::
  191. from foo.bar import wizz
  192. wizz()
  193. However, it will fail for any form of aliases and quite likely find false matches.
  194. """
  195. parts = full_name.split('.')
  196. while parts:
  197. yield ".".join(parts)
  198. parts = parts[1:]