Have SkyQuery tolerate the situation where a done node has a transitive rdep from a not-done node.

This situation can easily arise when `query` and `build` are interspersed. SkyQuery wasn't designed with this use-case in mind [1], but some Bazel users are doing this (see https://github.com/bazelbuild/bazel/issues/8582), so we can at least tolerate it. See added code comments and test for details.

This bug was diagnosed jointly by janakr@ and nharmata@ in https://github.com/bazelbuild/bazel/issues/8582#issuecomment-615021266 and https://github.com/bazelbuild/bazel/issues/8582#issuecomment-615026057.

[1] It was originally planned to be query-only. In fact, the more general issue here points to an inefficiency in the interspersed usage case. I explained in the code comment and added a TODO.

Fixes #8582

RELNOTES: None
PiperOrigin-RevId: 307925028
diff --git a/src/main/java/com/google/devtools/build/skyframe/DelegatingWalkableGraph.java b/src/main/java/com/google/devtools/build/skyframe/DelegatingWalkableGraph.java
index 78b5e6d..56ec78f 100644
--- a/src/main/java/com/google/devtools/build/skyframe/DelegatingWalkableGraph.java
+++ b/src/main/java/com/google/devtools/build/skyframe/DelegatingWalkableGraph.java
@@ -109,6 +109,8 @@
     Map<SkyKey, ? extends NodeEntry> entries = getBatch(null, Reason.WALKABLE_GRAPH_DEPS, keys);
     Map<SkyKey, Iterable<SkyKey>> result = new HashMap<>(entries.size());
     for (Map.Entry<SkyKey, ? extends NodeEntry> entry : entries.entrySet()) {
+      // Note that the situation described in #getReverseDeps doesn't apply here. If the nodes for
+      // `keys` are done, then their direct deps must be done too.
       Preconditions.checkState(entry.getValue().isDone(), entry);
       result.put(entry.getKey(), entry.getValue().getDirectDeps());
     }
@@ -119,6 +121,8 @@
   public Iterable<SkyKey> getDirectDeps(SkyKey key) throws InterruptedException {
     NodeEntry entry = getEntryForValue(key);
     Preconditions.checkNotNull(entry, key);
+    // Note that the situation described in #getReverseDeps doesn't apply here. If the node for
+    // `key` is done, then its direct deps must be done too.
     Preconditions.checkState(entry.isDone(), "Node %s (with key %s) isn't done yet.", entry, key);
     return entry.getDirectDeps();
   }
@@ -129,8 +133,19 @@
     Map<SkyKey, ? extends NodeEntry> entries = getBatch(null, Reason.WALKABLE_GRAPH_RDEPS, keys);
     Map<SkyKey, Iterable<SkyKey>> result = new HashMap<>(entries.size());
     for (Map.Entry<SkyKey, ? extends NodeEntry> entry : entries.entrySet()) {
-      Preconditions.checkState(entry.getValue().isDone(), entry);
-      result.put(entry.getKey(), entry.getValue().getReverseDepsForDoneEntry());
+      // SkyQuery may be operating on a Skyframe graph that contains more nodes and edges than its
+      // universe. In this situation, Blaze's eager invalidation strategy may mean here we can
+      // observe a rdep edge from a not-done node (because that node may have been invalidated but
+      // not re-evaluated). Therefore, we tolerate this case gracefully.
+      //
+      // More generally, the fact that the Skyframe graph may be larger than SkyQuery's universe
+      // means that SkyQuery may be traversing edges irrelevant for query evaluation.
+      // TODO(bazel-team): Get rid of this wasted work. One approach is to hardcode the Skyframe
+      // *type* graph structure, and follow only edges for relevant node types. This would work, but
+      // is brittle so we'd want a strong regression testing story.
+      if (entry.getValue().isDone()) {
+        result.put(entry.getKey(), entry.getValue().getReverseDepsForDoneEntry());
+      }
     }
     return result;
   }
@@ -149,12 +164,14 @@
     Map<SkyKey, Pair<SkyValue, Iterable<SkyKey>>> result =
         Maps.newHashMapWithExpectedSize(entries.size());
     for (Map.Entry<SkyKey, ? extends NodeEntry> entry : entries.entrySet()) {
-      Preconditions.checkState(entry.getValue().isDone(), entry);
-      result.put(
-          entry.getKey(),
-          Pair.of(
-              getValueFromNodeEntry(entry.getValue()),
-              entry.getValue().getReverseDepsForDoneEntry()));
+      // See comment in #getReverseDeps.
+      if (entry.getValue().isDone()) {
+        result.put(
+            entry.getKey(),
+            Pair.of(
+                getValueFromNodeEntry(entry.getValue()),
+                entry.getValue().getReverseDepsForDoneEntry()));
+      }
     }
     return result;
   }
diff --git a/src/test/shell/integration/bazel_query_test.sh b/src/test/shell/integration/bazel_query_test.sh
index e4a4c92..1228bd2 100755
--- a/src/test/shell/integration/bazel_query_test.sh
+++ b/src/test/shell/integration/bazel_query_test.sh
@@ -552,4 +552,51 @@
   expect_log "$expected_error_msg"
 }
 
+# Regression test for https://github.com/bazelbuild/bazel/issues/8582.
+function test_rbuildfiles_can_handle_non_loading_phase_edges() {
+  mkdir -p foo
+  # When we have a package //foo whose BUILD file
+  cat > foo/BUILD <<EOF
+  # Defines a target //foo:foo, with input file foo/foo.sh,
+sh_library(name = 'foo', srcs = ['foo.sh'])
+EOF
+  # And foo/foo.sh has some initial contents.
+  echo "bar" > foo/foo.sh
+
+  # Then `rbuildfiles` correctly thinks //foo "depends" on foo/BUILD,
+  bazel query \
+    --universe_scope=//foo:foo \
+    --order_output=no \
+    "rbuildfiles(foo/BUILD)" >& $TEST_log || fail "Expected success"
+  expect_log //foo:BUILD
+  # And that no package "depends" on foo/foo.sh.
+  bazel query \
+    --universe_scope=//foo:foo \
+    --order_output=no \
+    "rbuildfiles(foo/foo.sh)" >& $TEST_log || fail "Expected success"
+  expect_not_log //foo:BUILD
+
+  # But then, after we *build* //foo:foo (thus priming the Skyframe graph with
+  # a transitive dep path from the input ArtifactValue for foo/foo.sh to the
+  # FileStateValue for foo/foo.sh),
+  bazel build //foo:foo >& $TEST_log || fail "Expected success"
+
+  # And we modify the contents of foo/foo.sh,
+  echo "baz" > foo/foo.sh
+
+  # And we again do a `rbuildfiles(foo/foo.sh)`, Bazel again correctly thinks
+  # no package "depends" on foo/foo.sh.
+  #
+  # Historically, Bazel would crash here because it would first invalidate the
+  # UTC of FileStateValue for foo/foo.sh (invalidating the ArtifactValue for
+  # foo/foo.sh), and then evaluate the DTC of the *skyquery-land* universe of
+  # //foo:foo (*not* evaluating that ArtifactValue), and then observe an rdep
+  # edge on the not-done ArtifactValue, and then crash.
+  bazel query \
+    --universe_scope=//foo:foo \
+    --order_output=no \
+    "rbuildfiles(foo/foo.sh)" >& $TEST_log || fail "Expected success"
+  expect_not_log //foo:BUILD
+}
+
 run_suite "${PRODUCT_NAME} query tests"