Add caching of computed file digests based on file metadata.

This change modifies DigestUtils to add a cache of (path, inode, mtime,
size) to the digest of the file for those digests that are computed by
reading the file contents.

The cache itself is optional because relying on file metadata to cache
the file digests could lead to correctness issues.  Enabling the cache
is exposed via a new (undocumented) --cache_computed_file_digests flag
that we can use post-release to tune the built-in values if they prove
to be incorrect or problematic.

For Bazel, enable this cache unconditionally because the rest of
Bazel already relies on mtimes and other file metadata to determine
changes to files.

The rationale for this change is performance: once we have lost the
in-memory file metadata (e.g. because of a flag flip), Bazel has to
redigest all of the output files to recompute action cache keys.  For
a pathological case of rebuilding a large app with 5GB of outputs and
then flipping the --[no]check_visibility flag on the command line, we
get the following numbers before this change:

____Elapsed time: 11.170s, Critical Path: 8.34s
____Elapsed time: 11.027s, Critical Path: 8.20s
____Elapsed time: 11.084s, Critical Path: 7.46s
____Elapsed time: 11.051s, Critical Path: 6.61s
____Elapsed time: 11.211s, Critical Path: 7.81s
____Elapsed time: 10.884s, Critical Path: 8.20s
____Elapsed time: 11.385s, Critical Path: 8.12s
____Elapsed time: 11.723s, Critical Path: 8.18s
____Elapsed time: 11.327s, Critical Path: 7.73s
____Elapsed time: 11.028s, Critical Path: 7.89s

And after this change:

____Elapsed time: 4.294s, Critical Path: 0.27s
____Elapsed time: 4.376s, Critical Path: 0.83s
____Elapsed time: 8.083s, Critical Path: 0.52s
____Elapsed time: 4.302s, Critical Path: 0.64s
____Elapsed time: 4.282s, Critical Path: 0.37s
____Elapsed time: 4.219s, Critical Path: 0.61s
____Elapsed time: 4.214s, Critical Path: 0.97s
____Elapsed time: 4.185s, Critical Path: 0.71s
____Elapsed time: 7.962s, Critical Path: 4.30s
____Elapsed time: 4.149s, Critical Path: 1.03s

--
PiperOrigin-RevId: 150351444
MOS_MIGRATED_REVID=150351444
diff --git a/src/main/java/com/google/devtools/build/lib/actions/cache/DigestUtils.java b/src/main/java/com/google/devtools/build/lib/actions/cache/DigestUtils.java
index 508ee89..aaf8b1d 100644
--- a/src/main/java/com/google/devtools/build/lib/actions/cache/DigestUtils.java
+++ b/src/main/java/com/google/devtools/build/lib/actions/cache/DigestUtils.java
@@ -13,14 +13,21 @@
 // limitations under the License.
 package com.google.devtools.build.lib.actions.cache;
 
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheStats;
 import com.google.common.io.BaseEncoding;
+import com.google.common.primitives.Longs;
 import com.google.devtools.build.lib.profiler.Profiler;
 import com.google.devtools.build.lib.profiler.ProfilerTask;
 import com.google.devtools.build.lib.util.BlazeClock;
 import com.google.devtools.build.lib.util.Fingerprint;
 import com.google.devtools.build.lib.util.LoggingUtil;
+import com.google.devtools.build.lib.util.Preconditions;
 import com.google.devtools.build.lib.util.VarInt;
+import com.google.devtools.build.lib.vfs.FileStatus;
 import com.google.devtools.build.lib.vfs.Path;
+import com.google.devtools.build.lib.vfs.PathFragment;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
@@ -31,6 +38,12 @@
 /**
  * Utility class for getting md5 digests of files.
  *
+ * <p>This class implements an optional cache of file digests when the computation of the digests is
+ * costly (i.e. when {@link Path#getFastDigest()} is not available). The cache can be enabled via
+ * the {@link #configureCache(long)} function, but note that enabling this cache might have an
+ * impact on correctness because not all changes to files can be purely detected from their
+ * metadata.
+ *
  * <p>Note that this class is responsible for digesting file metadata in an order-independent
  * manner. Care must be taken to do this properly. The digest must be a function of the set of
  * (path, metadata) tuples. While the order of these pairs must not matter, it would <b>not</b> be
@@ -45,6 +58,76 @@
   private static final Object DIGEST_LOCK = new Object();
   private static final AtomicBoolean MULTI_THREADED_DIGEST = new AtomicBoolean(false);
 
+  /**
+   * Keys used to cache the values of the digests for files where we don't have fast digests.
+   *
+   * <p>The cache keys are derived from many properties of the file metadata in an attempt to be
+   * able to detect most file changes.
+   */
+  private static class CacheKey {
+    /** Path to the file. */
+    private final PathFragment path;
+
+    /** File system identifier of the file (typically the inode number). */
+    private final long nodeId;
+
+    /** Last modification time of the file. */
+    private final long modifiedTime;
+
+    /** Size of the file. */
+    private final long size;
+
+    /**
+     * Constructs a new cache key.
+     *
+     * @param path path to the file
+     * @param status file status data from which to obtain the cache key properties
+     * @throws IOException if reading the file status data fails
+     */
+    public CacheKey(Path path, FileStatus status) throws IOException {
+      this.path = path.asFragment();
+      this.nodeId = status.getNodeId();
+      this.modifiedTime = status.getLastModifiedTime();
+      this.size = status.getSize();
+    }
+
+    @Override
+    public boolean equals(Object object) {
+      if (object == this) {
+        return true;
+      } else if (!(object instanceof CacheKey)) {
+        return false;
+      } else {
+        CacheKey key = (CacheKey) object;
+        return path.equals(key.path)
+            && nodeId == key.nodeId
+            && modifiedTime == key.modifiedTime
+            && size == key.size;
+      }
+    }
+
+    @Override
+    public int hashCode() {
+      int result = 17;
+      result = 31 * result + path.hashCode();
+      result = 31 * result + Longs.hashCode(nodeId);
+      result = 31 * result + Longs.hashCode(modifiedTime);
+      result = 31 * result + Longs.hashCode(size);
+      return result;
+    }
+  }
+
+  /**
+   * Global cache of files to their digests.
+   *
+   * <p>This is null when the cache is disabled.
+   *
+   * <p>Note that we do not use a {@link com.google.common.cache.LoadingCache} because our keys
+   * represent the paths as strings, not as {@link Path} instances. As a result, the loading
+   * function cannot actually compute the digests of the files so we have to handle this externally.
+   */
+  private static volatile Cache<CacheKey, byte[]> globalCache = null;
+
   /** Private constructor to prevent instantiation of utility class. */
   private DigestUtils() {}
 
@@ -76,6 +159,35 @@
   }
 
   /**
+   * Enables the caching of file digests based on file status data.
+   *
+   * <p>If the cache was already enabled, this causes the cache to be reinitialized thus losing all
+   * contents. If the given size is zero, the cache is disabled altogether.
+   *
+   * @param maximumSize maximumSize of the cache in number of entries
+   */
+  public static void configureCache(long maximumSize) {
+    if (maximumSize == 0) {
+      globalCache = null;
+    } else {
+      globalCache = CacheBuilder.newBuilder().maximumSize(maximumSize).recordStats().build();
+    }
+  }
+
+  /**
+   * Obtains cache statistics.
+   *
+   * <p>The cache must have previously been enabled by a call to {@link #configureCache(long)}.
+   *
+   * @return an immutable snapshot of the cache statistics
+   */
+  public static CacheStats getCacheStats() {
+    Cache<CacheKey, byte[]> cache = globalCache;
+    Preconditions.checkNotNull(cache, "configureCache() must have been called with a size >= 0");
+    return cache.stats();
+  }
+
+  /**
    * Enable or disable multi-threaded digesting even for large files.
    */
   public static void setMultiThreadedDigest(boolean multiThreadedDigest) {
@@ -104,19 +216,45 @@
       digest = null;
     }
 
+    // At this point, either we could not get a fast digest or the fast digest we got is corrupt.
+    // Attempt a cache lookup if the cache is enabled and return the cached digest if found.
+    Cache<CacheKey, byte[]> cache = globalCache;
+    CacheKey key = null;
+    if (cache != null && digest == null) {
+      key = new CacheKey(path, path.stat());
+      digest = cache.getIfPresent(key);
+    }
     if (digest != null) {
       return digest;
-    } else if (fileSize > 4096 && !MULTI_THREADED_DIGEST.get()) {
+    }
+
+    // All right, we have neither a fast nor a cached digest. Let's go through the costly process of
+    // computing it from the file contents.
+    if (fileSize > 4096 && !MULTI_THREADED_DIGEST.get()) {
       // We'll have to read file content in order to calculate the digest. In that case
       // it would be beneficial to serialize those calculations since there is a high
       // probability that MD5 will be requested for multiple output files simultaneously.
       // Exception is made for small (<=4K) files since they will not likely to introduce
       // significant delays (at worst they will result in two extra disk seeks by
       // interrupting other reads).
-      return getDigestInExclusiveMode(path);
+      digest = getDigestInExclusiveMode(path);
     } else {
-      return getDigestInternal(path);
+      digest = getDigestInternal(path);
     }
+
+    Preconditions.checkNotNull(
+        digest,
+        "We should have gotten a digest for %s at this point but we still don't have one",
+        path);
+    if (cache != null) {
+      Preconditions.checkNotNull(
+          key,
+          "We should have computed a cache key earlier for %s because the cache is enabled and we"
+              + " did not get a fast digest for this file, but we don't have a key here",
+          path);
+      cache.put(key, digest);
+    }
+    return digest;
   }
 
   /**
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/BazelMain.java b/src/main/java/com/google/devtools/build/lib/bazel/BazelMain.java
index c32e383..336715a 100644
--- a/src/main/java/com/google/devtools/build/lib/bazel/BazelMain.java
+++ b/src/main/java/com/google/devtools/build/lib/bazel/BazelMain.java
@@ -45,6 +45,7 @@
           com.google.devtools.build.lib.ssd.SsdModule.class,
           com.google.devtools.build.lib.worker.WorkerModule.class,
           com.google.devtools.build.lib.remote.RemoteModule.class,
+          com.google.devtools.build.lib.runtime.CacheFileDigestsModule.class,
           com.google.devtools.build.lib.standalone.StandaloneModule.class,
           com.google.devtools.build.lib.sandbox.SandboxModule.class,
           com.google.devtools.build.lib.runtime.BuildSummaryStatsModule.class,
diff --git a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java
index b8b2ac2..7766b6c 100644
--- a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java
+++ b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java
@@ -205,4 +205,17 @@
     help = "Print the contents of the SpawnActionContext and ContextProviders maps."
   )
   public boolean debugPrintActionContexts;
+
+  @Option(
+    name = "cache_computed_file_digests",
+    defaultValue = "50000",
+    category = "undocumented",
+    help =
+        "If greater than 0, configures Blaze to cache file digests in memory based on their "
+            + "metadata instead of recomputing the digests from disk every time they are needed. "
+            + "Setting this to 0 ensures correctness because not all file changes can be noted "
+            + "from file metadata. When not 0, the number indicates the size of the cache as the "
+            + "number of file digests to be cached."
+  )
+  public long cacheSizeForComputedFileDigests;
 }
diff --git a/src/main/java/com/google/devtools/build/lib/runtime/CacheFileDigestsModule.java b/src/main/java/com/google/devtools/build/lib/runtime/CacheFileDigestsModule.java
new file mode 100644
index 0000000..1c94bd8
--- /dev/null
+++ b/src/main/java/com/google/devtools/build/lib/runtime/CacheFileDigestsModule.java
@@ -0,0 +1,93 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.devtools.build.lib.runtime;
+
+import com.google.common.cache.CacheStats;
+import com.google.devtools.build.lib.actions.cache.DigestUtils;
+import com.google.devtools.build.lib.buildtool.BuildRequest;
+import com.google.devtools.build.lib.exec.ExecutionOptions;
+import com.google.devtools.build.lib.exec.ExecutorBuilder;
+import com.google.devtools.build.lib.util.Preconditions;
+import java.util.logging.Logger;
+
+/** Enables the caching of file digests in {@link DigestUtils}. */
+public class CacheFileDigestsModule extends BlazeModule {
+
+  private static final Logger log = Logger.getLogger(CacheFileDigestsModule.class.getName());
+
+  /** Stats gathered at the beginning of a command, to compute deltas on completion. */
+  private CacheStats stats;
+
+  /**
+   * Last known size of the cache. Changes to this value cause the cache to be reinitialized. null
+   * if we don't know anything about the last value yet (i.e. before any command has been run).
+   */
+  private Long lastKnownCacheSize;
+
+  public CacheFileDigestsModule() {}
+
+  /**
+   * Adds a line to the log with cache statistics.
+   *
+   * @param message message to prefix to the written line
+   * @param stats the cache statistics to be logged
+   */
+  private static void logStats(String message, CacheStats stats) {
+    log.info(
+        message
+            + ": hit count="
+            + stats.hitCount()
+            + ", miss count="
+            + stats.missCount()
+            + ", hit rate="
+            + stats.hitRate()
+            + ", eviction count="
+            + stats.evictionCount());
+  }
+
+  @Override
+  public void executorInit(CommandEnvironment env, BuildRequest request, ExecutorBuilder builder) {
+    super.executorInit(env, request, builder);
+
+    ExecutionOptions options = request.getOptions(ExecutionOptions.class);
+    if (lastKnownCacheSize == null
+        || options.cacheSizeForComputedFileDigests != lastKnownCacheSize) {
+      log.info("Reconfiguring cache with size=" + options.cacheSizeForComputedFileDigests);
+      DigestUtils.configureCache(options.cacheSizeForComputedFileDigests);
+      lastKnownCacheSize = options.cacheSizeForComputedFileDigests;
+    }
+
+    if (options.cacheSizeForComputedFileDigests == 0) {
+      stats = null;
+      log.info("Disabled cache");
+    } else {
+      stats = DigestUtils.getCacheStats();
+      logStats("Accumulated cache stats before command", stats);
+    }
+  }
+
+  @Override
+  public void afterCommand() {
+    super.afterCommand();
+
+    if (stats != null) {
+      CacheStats newStats = DigestUtils.getCacheStats();
+      Preconditions.checkNotNull(newStats, "The cache is enabled so we must get some stats back");
+      logStats("Accumulated cache stats after command", newStats);
+      logStats("Cache stats for finished command", newStats.minus(stats));
+      stats = null; // Silence stats until next command that uses the executor.
+    }
+  }
+}