diff --git a/python/ql/lib/change-notes/2026-06-30-instance-attribute-typetracking-performance.md b/python/ql/lib/change-notes/2026-06-30-instance-attribute-typetracking-performance.md new file mode 100644 index 000000000000..53532f6e65ce --- /dev/null +++ b/python/ql/lib/change-notes/2026-06-30-instance-attribute-typetracking-performance.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Type tracking of values stored in instance attributes and read from outside the class (for example `instance.attr` where the value was assigned to `self.attr` in a method) no longer relies on a dedicated instance type-tracker. This avoids a structural mutual recursion that could cause catastrophic query slowdowns on some OOP-heavy code bases. Such reads are now resolved using local flow from the constructor call, which is slightly less precise for instances that flow across a call or return before being read. diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll b/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll index 02fae4611f4f..865c4767aba1 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll @@ -170,13 +170,7 @@ module TypeTrackingInput implements Shared::TypeTrackingInput { /** Holds if there is a level step from `nodeFrom` to `nodeTo`, which may depend on the call graph. */ predicate levelStepCall(Node nodeFrom, LocalSourceNode nodeTo) { - // HOTFIX: `instanceFieldStep` is temporarily disabled (via `and none()`). - // It uses `classInstanceTracker(cls)` -- itself a type-tracker run -- - // from inside `levelStepCall`, creating a structural mutual recursion - // that causes catastrophic query slowdowns on some OOP-heavy Python - // codebases (e.g. mypy and dask). The `and none()` should be removed - // once that recursion is redesigned. - instanceFieldStep(nodeFrom, nodeTo) and none() + instanceFieldStep(nodeFrom, nodeTo) or inheritedFieldStep(nodeFrom, nodeTo) } @@ -355,11 +349,23 @@ module TypeTrackingInput implements Shared::TypeTrackingInput { * `instance.attr`, where `instance` is a reference to an instance of `cls`). * * This complements `selfAttrRef`, which only handles `self.attr` accesses inside the - * methods of `cls`. Unlike `selfAttrRef`, this depends on the call graph (via - * `classInstanceTracker`), so steps using it must be reported as `levelStepCall`. + * methods of `cls`. The instance is identified using *local* flow from a constructor + * call `cls(...)` (resolved via the call graph by `resolveClassCall`), rather than a + * dedicated instance type-tracker (`classInstanceTracker`). + * + * Using `classInstanceTracker` here would make `levelStepCall` mutually recursive with + * `classInstanceTracker` -- itself a full type-tracker run -- which caused catastrophic + * query slowdowns on some OOP-heavy Python code bases (e.g. `mypy` and `dask`). Relying + * on local flow from a resolved constructor call instead depends only on `classTracker` + * (the same call-graph machinery already used by `inheritedFieldStep`), avoiding that + * blow-up. The trade-off is reduced precision: instances that flow across a call or + * return before being read are no longer covered by this step. */ private predicate instanceAttrRead(Class cls, string attr, DataFlowPublic::AttrRead read) { - read.getObject() = DataFlowDispatch::classInstanceTracker(cls) and + exists(DataFlowPublic::CallCfgNode construction | + DataFlowDispatch::resolveClassCall(construction.asCfgNode(), cls) and + read.getObject().getALocalSource() = construction + ) and read.mayHaveAttributeName(attr) } @@ -438,9 +444,9 @@ module TypeTrackingInput implements Shared::TypeTrackingInput { * This is the cross-instance counterpart of `localFieldStep`: it relates a write of * `self.attr` inside a class to a read of `attr` on a reference to an instance of that * class or one of its subclasses. Identifying instances relies on the call graph (via - * `classInstanceTracker`), so this step is reported as `levelStepCall` rather than - * `levelStepNoCall`. The write may occur in the instance's own class or in any of its - * superclasses, since those methods are inherited. + * `resolveClassCall`, see `instanceAttrRead`), so this step is reported as + * `levelStepCall` rather than `levelStepNoCall`. The write may occur in the instance's + * own class or in any of its superclasses, since those methods are inherited. * * Like `localFieldStep`, this is an over-approximation: it is both instance-insensitive * and order-insensitive. diff --git a/python/ql/test/library-tests/dataflow/typetracking/attribute_tests.py b/python/ql/test/library-tests/dataflow/typetracking/attribute_tests.py index 09fed01398ed..724be1091a4a 100644 --- a/python/ql/test/library-tests/dataflow/typetracking/attribute_tests.py +++ b/python/ql/test/library-tests/dataflow/typetracking/attribute_tests.py @@ -157,10 +157,22 @@ def possibly_uncalled_method(self): # $ MISSING: tracked=foo print(self.foo) # $ tracked MISSING: tracked=foo instance = MyClass2() -print(instance.foo) # $ MISSING: tracked=foo tracked +print(instance.foo) # $ tracked MISSING: tracked=foo instance.print_foo() # $ MISSING: tracked=foo +# attribute set in method, but the instance flows across a call/return before the read. +# `instanceFieldStep` identifies the instance using only local flow from the constructor +# call, so a value stored on `self.foo` is not seen once the instance has crossed a +# function boundary. + +def make_my_class2(): + return MyClass2() + +returned_instance = make_my_class2() +print(returned_instance.foo) # $ MISSING: tracked + + # attribute set from outside of class class MyClass3(object): @@ -195,7 +207,7 @@ def read_foo(self): # $ MISSING: tracked=foo sub1 = Sub1() sub1.read_foo() -print(sub1.foo) # $ MISSING: tracked=foo tracked +print(sub1.foo) # $ tracked MISSING: tracked=foo # attribute written in a subclass method, read in an inherited base class method @@ -210,7 +222,7 @@ def __init__(self): # $ tracked=bar sub2 = Sub2() sub2.read_bar() -print(sub2.bar) # $ MISSING: tracked=bar tracked +print(sub2.bar) # $ tracked MISSING: tracked=bar # attribute written in a base class method, read on an instance of the subclass @@ -223,4 +235,4 @@ class Sub3(Base3): pass sub3 = Sub3() -print(sub3.baz) # $ MISSING: tracked=baz tracked +print(sub3.baz) # $ tracked MISSING: tracked=baz diff --git a/python/ql/test/query-tests/Security/CWE-089-SqlInjection/SqlInjection.expected b/python/ql/test/query-tests/Security/CWE-089-SqlInjection/SqlInjection.expected index 4cbcb33440ba..8f60394d8a2b 100644 --- a/python/ql/test/query-tests/Security/CWE-089-SqlInjection/SqlInjection.expected +++ b/python/ql/test/query-tests/Security/CWE-089-SqlInjection/SqlInjection.expected @@ -1,6 +1,7 @@ #select | app.py:23:20:23:24 | ControlFlowNode for query | app.py:20:18:20:21 | ControlFlowNode for name | app.py:23:20:23:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:20:18:20:21 | ControlFlowNode for name | user-provided value | | app.py:30:20:30:24 | ControlFlowNode for query | app.py:27:19:27:22 | ControlFlowNode for name | app.py:30:20:30:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:27:19:27:22 | ControlFlowNode for name | user-provided value | +| app.py:37:20:37:24 | ControlFlowNode for query | app.py:34:19:34:22 | ControlFlowNode for name | app.py:37:20:37:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:34:19:34:22 | ControlFlowNode for name | user-provided value | | app.py:44:20:44:24 | ControlFlowNode for query | app.py:41:19:41:22 | ControlFlowNode for name | app.py:44:20:44:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:41:19:41:22 | ControlFlowNode for name | user-provided value | | app.py:51:20:51:24 | ControlFlowNode for query | app.py:48:19:48:22 | ControlFlowNode for name | app.py:51:20:51:24 | ControlFlowNode for query | This SQL query depends on a $@. | app.py:48:19:48:22 | ControlFlowNode for name | user-provided value | | sql_injection.py:21:24:21:77 | ControlFlowNode for BinaryExpr | sql_injection.py:14:15:14:22 | ControlFlowNode for username | sql_injection.py:21:24:21:77 | ControlFlowNode for BinaryExpr | This SQL query depends on a $@. | sql_injection.py:14:15:14:22 | ControlFlowNode for username | user-provided value | @@ -24,6 +25,8 @@ edges | app.py:21:5:21:9 | ControlFlowNode for query | app.py:23:20:23:24 | ControlFlowNode for query | provenance | | | app.py:27:19:27:22 | ControlFlowNode for name | app.py:28:5:28:9 | ControlFlowNode for query | provenance | | | app.py:28:5:28:9 | ControlFlowNode for query | app.py:30:20:30:24 | ControlFlowNode for query | provenance | | +| app.py:34:19:34:22 | ControlFlowNode for name | app.py:35:5:35:9 | ControlFlowNode for query | provenance | | +| app.py:35:5:35:9 | ControlFlowNode for query | app.py:37:20:37:24 | ControlFlowNode for query | provenance | | | app.py:41:19:41:22 | ControlFlowNode for name | app.py:42:5:42:9 | ControlFlowNode for query | provenance | | | app.py:42:5:42:9 | ControlFlowNode for query | app.py:44:20:44:24 | ControlFlowNode for query | provenance | | | app.py:48:19:48:22 | ControlFlowNode for name | app.py:49:5:49:9 | ControlFlowNode for query | provenance | | @@ -51,6 +54,9 @@ nodes | app.py:27:19:27:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name | | app.py:28:5:28:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | | app.py:30:20:30:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | +| app.py:34:19:34:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name | +| app.py:35:5:35:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | +| app.py:37:20:37:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | | app.py:41:19:41:22 | ControlFlowNode for name | semmle.label | ControlFlowNode for name | | app.py:42:5:42:9 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | | app.py:44:20:44:24 | ControlFlowNode for query | semmle.label | ControlFlowNode for query | diff --git a/python/ql/test/query-tests/Security/CWE-089-SqlInjection/app.py b/python/ql/test/query-tests/Security/CWE-089-SqlInjection/app.py index 8046f1ef52ed..4de61346d8f5 100644 --- a/python/ql/test/query-tests/Security/CWE-089-SqlInjection/app.py +++ b/python/ql/test/query-tests/Security/CWE-089-SqlInjection/app.py @@ -31,10 +31,10 @@ async def unsafe2(name: str): # $ Source cursor.close() @app.get("/unsafe3/") -async def unsafe3(name: str): # $ MISSING: Source +async def unsafe3(name: str): # $ Source query = "select * from users where name=" + name cursor = hdb_con3.cursor() - cursor.execute(query) # $ MISSING: Alert + cursor.execute(query) # $ Alert cursor.close() @app.get("/unsafe4/")