blob: b14263d6ddb3c74667ec59e1424ad61ff59f2140 [file] [log] [blame]
// Copyright 2026 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.remote;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.devtools.build.lib.actions.Action;
import com.google.devtools.build.lib.actions.ActionExecutionMetadata;
import com.google.devtools.build.lib.actions.ActionLookupData;
import com.google.devtools.build.lib.actions.Artifact;
import com.google.devtools.build.lib.actions.Artifact.DerivedArtifact;
import com.google.devtools.build.lib.actions.InputMetadataProvider;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.ProfilerTask;
import com.google.devtools.build.lib.profiler.SilentCloseable;
import com.google.devtools.build.lib.vfs.OutputService.RewoundActionSynchronizer;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.StampedLock;
import javax.annotation.Nullable;
/**
* A {@link RewoundActionSynchronizer} implementation for Bazel's remote filesystem, which is backed
* by actual files on disk and requires synchronization to ensure that action outputs aren't deleted
* while they are being read.
*/
final class RemoteRewoundActionSynchronizer implements RewoundActionSynchronizer {
/** A task with a cancellation callback. */
public interface Cancellable {
void cancel() throws InterruptedException;
}
private final RemoteActionInputFetcher actionInputFetcher;
private final ConcurrentHashMap<ActionExecutionMetadata, Cancellable> outputUploadTasks =
new ConcurrentHashMap<>();
// A single coarse lock is used to synchronize rewound actions (writers) and both rewound and
// non-rewound actions (readers) as long as no rewound action has attempted to prepare for its
// execution.
// This ensures high throughput and low memory footprint for the common case of no rewound
// actions. In this case, there won't be any writers and the performance characteristics of a
// ReentrantReadWriteLock are comparable to that of an atomic counter. A StampedLock would not be
// a good fit as its performance regresses with 127 or more concurrent readers.
// Note that it wouldn't be correct to only start using this lock once an action is rewound,
// because a non-rewound action consuming its non-lost outputs could have already started
// executing.
@Nullable private volatile ReadWriteLock coarseLock = new ReentrantReadWriteLock();
// A fine-grained lock structure that is switched to when the first rewound action attempts to
// prepare for its execution. This structure is used to ensure that rewound actions do not
// delete their outputs while they are being read by other actions, while still allowing
// rewound actions and non-rewound actions to run concurrently (i.e., not force the equivalent
// of --jobs=1 for as long as a rewound action is running, as the coarse lock would).
// A rewound action will acquire a write lock on its lookup data before it prepares for
// execution, while any action will acquire a read lock on the lookup data of any generating
// action of its inputs before it starts executing.
// The values of this cache are weakly referenced to ensure that locks are cleaned up when they
// are no longer needed.
@Nullable private volatile LoadingCache<ActionLookupData, ReadWriteLock> fineLocks;
public RemoteRewoundActionSynchronizer(RemoteActionInputFetcher actionInputFetcher) {
this.actionInputFetcher = actionInputFetcher;
}
/*
Proof of deadlock freedom:
As long as the coarse lock is used, there can't be any deadlock because there is only a single
read-write lock.
Now assume that there is a deadlock while the fine locks are used. First, note that the logic in
ImportantOutputHandler that is guarded by enterProcessOutputsAndGetLostArtifacts does not block
on any (rewound or non-rewound) action executions while it holds read locks and can thus be
ignored in the following. Consider the directed labeled "wait-for" graph defined as follows:
* Nodes are given by the currently active Skyframe action execution threads, each of which is
identified with the action it is (or will be) executing. Actions are in one-to-one
correspondence with the ActionLookupData that is used as the key in the fine locks map.
* For each pair of actions A_1 and A_2, there is an edge from A_1 to A_2 labeled with XY(A_3)
if A_1 is waiting for the X lock of A_3 and A_2 currently holds the Y lock of A_3, where X and
Y are either R (for read) or W (for write). The resulting graph may have parallel edges with
distinct labels.
Let C be any directed cycle in the graph representing a deadlock, let A_1 -[XY(A_3)]-> A_2 be an
edge in C and consider the following cases for the pair XY:
* RR: Since a read-write lock whose read lock is held by at least one thread doesn't
block any other thread from acquiring its read lock, this case doesn't occur.
* WW: The write lock of A_3 is only ever (attempted to be) acquired by A_3 itself when it is
rewound, which means that the edge would necessarily be of the shape A_3 -[WW(A_3)]-> A_3.
But this isn't possible since the write lock for an action is only acquired in one place (
enterActionPreparationForRewinding) and not recursively.
* WR: In this case, A_1 attempts to acquire a write lock, which only happens when A_1 is a
rewound action about to prepare for its (re-)execution. This means that the edge is
necessarily of the shape A_1 -[WR(A_1)]-> A_2. While a rewound action is waiting for its
own write lock in enterActionPreparation, it doesn't hold any locks since
enterActionExecution hasn't been called yet in SkyframeActionExecutor and all past
executions of the action have released all their locks due to use of try-with-resources.
This means that A_1 can't have any incoming edges in the wait-for graph, which is a
contradiction to the assumption that it is contained in the directed cycle C.
We conclude that XY = RW. Since the write lock of A_3 is only ever acquired by A_3 itself, all
edges in C are of the form A_1 -[RW(A_2)]-> A_2. But by construction of inputKeysFor, the
action A_1 is attempting to acquire the read locks of all its inputs' generating actions, and
thus the action A_1 depends on one of the outputs of A_2 (*).
Applied to all edges of C, we conclude that there is a corresponding directed cycle in the
action graph, which is a contradiction since Bazel disallows dependency cycles.
Notes:
* The proof would not go through at (*) if fineLocks were replaced by a Striped lock structure
with a fixed number of locks. In fact, this gives rise to a deadlock if the number of stripes
is at least 2, but low enough that distinct generating actions hash to the same stripe.
*/
@Override
public SilentCloseable enterActionPreparation(Action action, boolean wasRewound)
throws InterruptedException {
// Skyframe schedules non-rewound actions such that they never run concurrently with actions
// that consume their outputs.
if (!wasRewound) {
return () -> {};
}
try (SilentCloseable c =
Profiler.instance().profile(ProfilerTask.ACTION_LOCK, "action.enterActionPreparation")) {
return enterActionPreparationForRewinding(action);
}
}
private SilentCloseable enterActionPreparationForRewinding(Action action)
throws InterruptedException {
var localCoarseLock = coarseLock;
if (localCoarseLock != null) {
// This is the first time a rewound action has attempted to prepare for its execution.
// Switch to using the fine locks under the protection of the coarse write lock.
localCoarseLock.writeLock().lockInterruptibly();
try {
// Check again under the lock to avoid a race between multiple rewound actions attempting
// to prepare for execution at the same time.
if (fineLocks == null) {
fineLocks =
Caffeine.newBuilder()
.weakValues()
// ReentrantReadWriteLock would not work here as its individual read and write
// locks do not strongly reference the parent lock, which would lead to locks
// being cleaned up while they are still held
// (https://bugs.openjdk.org/browse/JDK-8189598). This can be worked around by
// using a construction similar to Guava's Striped helpers. StampedLock is both
// more memory-efficient and its views do strongly reference the parent lock
// (https://github.com/openjdk/jdk/blob/b349f661ea5f14b258191134714a7e712c90ef3e/src/java.base/share/classes/java/util/concurrent/locks/StampedLock.java#L1039),
// TODO: Investigate the effect of fair locks on build wall time.
.build((ActionLookupData unused) -> new StampedLock().asReadWriteLock());
coarseLock = null;
}
} finally {
localCoarseLock.writeLock().unlock();
}
}
var writeLock = fineLocks.get(outputKeyFor(action)).writeLock();
writeLock.lockInterruptibly();
prepareOutputsForRewinding(action);
return writeLock::unlock;
}
/**
* Cancels all async tasks that operate on the action's outputs and resets any cached data about
* their prefetching state.
*/
private void prepareOutputsForRewinding(Action action) throws InterruptedException {
Cancellable task = outputUploadTasks.remove(action);
if (task != null) {
task.cancel();
}
actionInputFetcher.handleRewoundActionOutputs(action.getOutputs());
}
@Override
public SilentCloseable enterActionExecution(Action action, InputMetadataProvider metadataProvider)
throws InterruptedException {
try (SilentCloseable c =
Profiler.instance().profile(ProfilerTask.ACTION_LOCK, "action.enterActionExecution")) {
return lockArtifactsForConsumption(
() -> action.getInputs().toList().iterator(), metadataProvider);
}
}
/**
* Guards a call to {@link
* com.google.devtools.build.lib.remote.RemoteImportantOutputHandler#processOutputsAndGetLostArtifacts}.
*/
public SilentCloseable enterProcessOutputsAndGetLostArtifacts(
Iterable<Artifact> importantOutputs, InputMetadataProvider fullMetadataProvider)
throws InterruptedException {
try (SilentCloseable c =
Profiler.instance()
.profile(ProfilerTask.ACTION_LOCK, "action.enterProcessOutputsAndGetLostArtifacts")) {
return lockArtifactsForConsumption(importantOutputs, fullMetadataProvider);
}
}
/**
* Registers a cancellation callback for an upload of action outputs that may still be running
* after the action has completed.
*/
public void registerOutputUploadTask(ActionExecutionMetadata action, Cancellable task) {
// We don't expect to have multiple output upload tasks for the same action registered at the
// same time.
outputUploadTasks.merge(
action,
task,
(oldTask, newTask) -> {
throw new IllegalStateException(
"Attempted to register multiple output upload tasks for %s: %s and %s"
.formatted(action, oldTask, newTask));
});
}
private SilentCloseable lockArtifactsForConsumption(
Iterable<Artifact> artifacts, InputMetadataProvider metadataProvider)
throws InterruptedException {
var localCoarseLock = coarseLock;
if (localCoarseLock != null) {
// Common case for builds without any rewound actions: acquire the single lock that is never
// acquired by a writer.
localCoarseLock.readLock().lockInterruptibly();
}
// Read the fine locks after acquiring the coarse lock to allow the fine locks to be inflated
// lazily.
var localFineLocks = fineLocks;
if (localFineLocks == null) {
// Continuation of the common case for builds without any rewound actions: the fine locks
// have not been inflated.
return localCoarseLock.readLock()::unlock;
}
// At this point, there has been at least one rewound action that has inflated the fine locks.
// We need to switch to it.
if (localCoarseLock != null) {
localCoarseLock.readLock().unlock();
}
var allReadWriteLocks =
localFineLocks.getAll(inputKeysFor(artifacts, metadataProvider)).values();
var locksToUnlockBuilder =
ImmutableList.<Lock>builderWithExpectedSize(allReadWriteLocks.size());
try {
for (var readWriteLock : allReadWriteLocks) {
var readLock = readWriteLock.readLock();
readLock.lockInterruptibly();
locksToUnlockBuilder.add(readLock);
}
} catch (InterruptedException e) {
for (var readLock : locksToUnlockBuilder.build()) {
readLock.unlock();
}
throw e;
}
var locksToUnlock = locksToUnlockBuilder.build();
return () -> locksToUnlock.forEach(Lock::unlock);
}
private static Iterable<ActionLookupData> inputKeysFor(
Iterable<Artifact> artifacts, InputMetadataProvider metadataProvider) {
var allArtifacts =
Iterables.concat(
artifacts,
Iterables.concat(
Iterables.transform(
metadataProvider.getRunfilesTrees(),
runfilesTree -> runfilesTree.getArtifacts().toList())));
return Iterables.transform(
Iterables.filter(allArtifacts, artifact -> artifact instanceof DerivedArtifact),
artifact -> ((DerivedArtifact) artifact).getGeneratingActionKey());
}
private static ActionLookupData outputKeyFor(Action action) {
return ((DerivedArtifact) action.getPrimaryOutput()).getGeneratingActionKey();
}
}