blob: 258ade469b65e262030d64a1db5641553aa4b8b1 [file] [log] [blame]
// Copyright 2014 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.util;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadCompatible;
import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadSafe;
import java.util.ArrayList;
/**
* Provides memory-efficient bidirectional mapping String <-> unique integer.
* Uses byte-wise compressed prefix trie internally.
* <p>
* Class allows index retrieval for the given string, addition of the new
* index and string retrieval for the given index. It also allows efficient
* serialization of the internal data structures.
* <p>
* Internally class stores list of nodes with each node containing byte[]
* representation of compressed trie node:
* <pre>
* varint32 parentIndex; // index of the parent node
* varint32 keylen; // length of the node key
* byte[keylen] key; // node key data
* repeated jumpEntry { // Zero or more jump entries, referencing child nodes
* byte key // jump key (first byte of the child node key)
* varint32 nodeIndex // child index
* }
* <p>
* Note that jumpEntry key byte is actually duplicated in the child node
* instance. This is done to improve performance of the index->string
* lookup (so we can avoid jump table parsing during this lookup).
* <p>
* Root node of the trie must have parent id pointing to itself.
* <p>
* TODO(bazel-team): (2010) Consider more fine-tuned locking mechanism - e.g.
* distinguishing between read and write locks.
*/
@ThreadSafe
public class CompactStringIndexer extends AbstractIndexer {
private static final int NOT_FOUND = -1;
private ArrayList<byte[]> nodes; // Compressed prefix trie nodes.
private int rootId; // Root node id.
/*
* Creates indexer instance.
*/
public CompactStringIndexer (int expectedCapacity) {
Preconditions.checkArgument(expectedCapacity > 0);
nodes = Lists.newArrayListWithExpectedSize(expectedCapacity);
rootId = NOT_FOUND;
}
/**
* Allocates new node index. Must be called only from
* synchronized methods.
*/
private int allocateIndex() {
nodes.add(null);
return nodes.size() - 1;
}
/**
* Replaces given node record with the new one. Must be called only from
* synchronized methods.
* <p>
* Subclasses can override this method to be notified when an update actually
* takes place.
*/
@ThreadCompatible
protected void updateNode(int index, byte[] content) {
nodes.set(index, content);
}
/**
* Returns parent id for the given node content.
*
* @return parent node id
*/
private int getParentId(byte[] content) {
int[] intHolder = new int[1];
VarInt.getVarInt(content, 0, intHolder);
return intHolder[0];
}
/**
* Creates new node using specified key suffix. Must be called from
* synchronized methods.
*
* @param parentNode parent node id
* @param key original key that is being added to the indexer
* @param offset node key offset in the original key.
*
* @return new node id corresponding to the given key
*/
private int createNode(int parentNode, byte[] key, int offset) {
int index = allocateIndex();
int len = key.length - offset;
Preconditions.checkState(len >= 0);
// Content consists of parent id, key length and key. There are no jump records.
byte[] content = new byte[VarInt.varIntSize(parentNode) + VarInt.varIntSize(len) + len];
// Add parent id.
int contentOffset = VarInt.putVarInt(parentNode, content, 0);
// Add node key length.
contentOffset = VarInt.putVarInt(len, content, contentOffset);
// Add node key content.
System.arraycopy(key, offset, content, contentOffset, len);
updateNode(index, content);
return index;
}
/**
* Updates jump entry index in the given node.
*
* @param node node id to update
* @param oldIndex old jump entry index
* @param newIndex updated jump entry index
*/
private void updateJumpEntry(int node, int oldIndex, int newIndex) {
byte[] content = nodes.get(node);
int[] intHolder = new int[1];
int offset = VarInt.getVarInt(content, 0, intHolder); // parent id
offset = VarInt.getVarInt(content, offset, intHolder); // key length
offset += intHolder[0]; // Offset now points to the first jump entry.
while (offset < content.length) {
int next = VarInt.getVarInt(content, offset + 1, intHolder); // jump index
if (intHolder[0] == oldIndex) {
// Substitute oldIndex value with newIndex.
byte[] newContent =
new byte[content.length + VarInt.varIntSize(newIndex) - VarInt.varIntSize(oldIndex)];
System.arraycopy(content, 0, newContent, 0, offset + 1);
offset = VarInt.putVarInt(newIndex, newContent, offset + 1);
System.arraycopy(content, next, newContent, offset, content.length - next);
updateNode(node, newContent);
return;
} else {
offset = next;
}
}
StringBuilder builder = new StringBuilder().append("Index ").append(oldIndex)
.append(" is not present in the node ").append(node).append(", ");
dumpNodeContent(builder, content);
throw new IllegalArgumentException(builder.toString());
}
/**
* Creates new branch node content at the predefined location, splitting
* prefix from the given node and optionally adding another child node
* jump entry.
*
* @param originalNode node that will be split
* @param newBranchNode new branch node id
* @param splitOffset offset at which to split original node key
* @param indexKey optional additional jump key
* @param childIndex optional additional jump index. Optional jump entry will
* be skipped if this index is set to NOT_FOUND.
*/
private void createNewBranchNode(int originalNode, int newBranchNode, int splitOffset,
byte indexKey, int childIndex) {
byte[] content = nodes.get(originalNode);
int[] intHolder = new int[1];
int keyOffset = VarInt.getVarInt(content, 0, intHolder); // parent id
// If original node is a root node, new branch node will become new root. So set parent id
// appropriately (for root node it is set to the node's own id).
int parentIndex = (originalNode == intHolder[0] ? newBranchNode : intHolder[0]);
keyOffset = VarInt.getVarInt(content, keyOffset, intHolder); // key length
Preconditions.checkState(intHolder[0] >= splitOffset);
// Calculate new content size.
int newSize = VarInt.varIntSize(parentIndex)
+ VarInt.varIntSize(splitOffset) + splitOffset
+ 1 + VarInt.varIntSize(originalNode)
+ (childIndex != NOT_FOUND ? 1 + VarInt.varIntSize(childIndex) : 0);
// New content consists of parent id, new key length, truncated key and two jump records.
byte[] newContent = new byte[newSize];
// Add parent id.
int contentOffset = VarInt.putVarInt(parentIndex, newContent, 0);
// Add adjusted key length.
contentOffset = VarInt.putVarInt(splitOffset, newContent, contentOffset);
// Add truncated key content and first jump key.
System.arraycopy(content, keyOffset, newContent, contentOffset, splitOffset + 1);
// Add index for the first jump key.
contentOffset = VarInt.putVarInt(originalNode, newContent, contentOffset + splitOffset + 1);
// If requested, add additional jump entry.
if (childIndex != NOT_FOUND) {
// Add second jump key.
newContent[contentOffset] = indexKey;
// Add index for the second jump key.
VarInt.putVarInt(childIndex, newContent, contentOffset + 1);
}
updateNode(newBranchNode, newContent);
}
/**
* Inject newly created branch node into the trie data structure. Method
* will update parent node jump entry to point to the new branch node (or
* will update root id if branch node becomes new root) and will truncate
* key prefix from the original node that was split (that prefix now
* resides in the branch node).
*
* @param originalNode node that will be split
* @param newBranchNode new branch node id
* @param commonPrefixLength how many bytes should be split into the new branch node.
*/
private void injectNewBranchNode(int originalNode, int newBranchNode, int commonPrefixLength) {
byte[] content = nodes.get(originalNode);
int parentId = getParentId(content);
if (originalNode == parentId) {
rootId = newBranchNode; // update root index
} else {
updateJumpEntry(parentId, originalNode, newBranchNode);
}
// Truncate prefix from the original node and set its parent to the our new branch node.
int[] intHolder = new int[1];
int suffixOffset = VarInt.getVarInt(content, 0, intHolder); // parent id
suffixOffset = VarInt.getVarInt(content, suffixOffset, intHolder); // key length
int len = intHolder[0] - commonPrefixLength;
Preconditions.checkState(len >= 0);
suffixOffset += commonPrefixLength;
// New content consists of parent id, new key length and duplicated key suffix.
byte[] newContent = new byte[VarInt.varIntSize(newBranchNode) + VarInt.varIntSize(len) +
(content.length - suffixOffset)];
// Add parent id.
int contentOffset = VarInt.putVarInt(newBranchNode, newContent, 0);
// Add new key length.
contentOffset = VarInt.putVarInt(len, newContent, contentOffset);
// Add key and jump table.
System.arraycopy(content, suffixOffset, newContent, contentOffset,
content.length - suffixOffset);
updateNode(originalNode, newContent);
}
/**
* Adds new child node (that uses specified key suffix) to the given
* current node.
* Example:
* <pre>
* Had "ab". Adding "abcd".
*
* 1:"ab",'c'->2
* 1:"ab" -> \
* 2:"cd"
* </pre>
*/
private int addChildNode(int parentNode, byte[] key, int keyOffset) {
int child = createNode(parentNode, key, keyOffset);
byte[] content = nodes.get(parentNode);
// Add jump table entry to the parent node.
int entryOffset = content.length;
// New content consists of original content and additional jump record.
byte[] newContent = new byte[entryOffset + 1 + VarInt.varIntSize(child)];
// Copy original content.
System.arraycopy(content, 0, newContent, 0, entryOffset);
// Add jump key.
newContent[entryOffset] = key[keyOffset];
// Add jump index.
VarInt.putVarInt(child, newContent, entryOffset + 1);
updateNode(parentNode, newContent);
return child;
}
/**
* Splits node into two at the specified offset.
* Example:
* <pre>
* Had "abcd". Adding "ab".
*
* 2:"ab",'c'->1
* 1:"abcd" -> \
* 1:"cd"
* </pre>
*/
private int splitNodeSuffix(int nodeToSplit, int commonPrefixLength) {
int newBranchNode = allocateIndex();
// Create new node with truncated key.
createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength, (byte) 0, NOT_FOUND);
injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength);
return newBranchNode;
}
/**
* Splits node into two at the specified offset and adds another leaf.
* Example:
* <pre>
* Had "abcd". Adding "abef".
*
* 3:"ab",'c'->1,'e'->2
* 1:"abcd" -> / \
* 1:"cd" 2:"ef"
* </pre>
*/
private int addBranch(int nodeToSplit, byte[] key, int offset, int commonPrefixLength) {
int newBranchNode = allocateIndex();
int child = createNode(newBranchNode, key, offset + commonPrefixLength);
// Create new node with the truncated key and reference to the new child node.
createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength,
key[offset + commonPrefixLength], child);
injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength);
return child;
}
private int findOrCreateIndexInternal(int node, byte[] key, int offset,
boolean createIfNotFound) {
byte[] content = nodes.get(node);
int[] intHolder = new int[1];
int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id
contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length
int skyKeyLen = intHolder[0];
int remainingKeyLen = key.length - offset;
int minKeyLen = Math.min(skyKeyLen, remainingKeyLen);
// Compare given key/offset content with the node key. Skip first key byte for recursive
// calls - this byte is equal to the byte in the jump entry and was already compared.
for (int i = (offset > 0 ? 1 : 0); i < minKeyLen; i++) { // compare key
if (key[offset + i] != content[contentOffset + i]) {
// Mismatch found somewhere in the middle of the node key. If requested, node
// should be split and another leaf added for the new key.
return createIfNotFound ? addBranch(node, key, offset, i) : NOT_FOUND;
}
}
if (remainingKeyLen > minKeyLen) {
// Node key matched portion of the key - find appropriate jump entry. If found - recursion.
// If not - mismatch (we will add new child node if requested).
contentOffset += skyKeyLen;
while (contentOffset < content.length) {
if (key[offset + skyKeyLen] == content[contentOffset]) { // compare index value
VarInt.getVarInt(content, contentOffset + 1, intHolder);
// Found matching jump entry - recursively compare the child.
return findOrCreateIndexInternal(intHolder[0], key, offset + skyKeyLen,
createIfNotFound);
} else {
// Jump entry key does not match. Skip rest of the entry data.
contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder);
}
}
// There are no matching jump entries - report mismatch or create a new leaf if necessary.
return createIfNotFound ? addChildNode(node, key, offset + skyKeyLen) : NOT_FOUND;
} else if (skyKeyLen > minKeyLen) {
// Key suffix is a subset of the node key. Report mismatch or split the node if requested).
return createIfNotFound ? splitNodeSuffix(node, minKeyLen) : NOT_FOUND;
} else {
// Node key exactly matches key suffix - return associated index value.
return node;
}
}
private synchronized int findOrCreateIndex(byte[] key, boolean createIfNotFound) {
if (rootId == NOT_FOUND) {
// Root node does not seem to exist - create it if needed.
if (createIfNotFound) {
rootId = createNode(0, key, 0);
Preconditions.checkState(rootId == 0);
return 0;
} else {
return NOT_FOUND;
}
}
return findOrCreateIndexInternal(rootId, key, 0, createIfNotFound);
}
private byte[] reconstructKeyInternal(int node, int suffixSize) {
byte[] content = nodes.get(node);
Preconditions.checkNotNull(content);
int[] intHolder = new int[1];
int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id
int parentNode = intHolder[0];
contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length
int len = intHolder[0];
byte[] key;
if (node != parentNode) {
// We haven't reached root node yet. Make a recursive call, adjusting suffix length.
key = reconstructKeyInternal(parentNode, suffixSize + len);
} else {
// We are in a root node. Finally allocate array for the key. It will be filled up
// on our way back from recursive call tree.
key = new byte[suffixSize + len];
}
// Fill appropriate portion of the full key with the node key content.
System.arraycopy(content, contentOffset, key, key.length - suffixSize - len, len);
return key;
}
private byte[] reconstructKey(int node) {
return reconstructKeyInternal(node, 0);
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#clear()
*/
@Override
public synchronized void clear() {
nodes.clear();
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#size()
*/
@Override
public synchronized int size() {
return nodes.size();
}
protected int getOrCreateIndexForBytes(byte[] bytes) {
return findOrCreateIndex(bytes, true);
}
protected synchronized boolean addBytes(byte[] bytes) {
int count = nodes.size();
int index = getOrCreateIndexForBytes(bytes);
return index >= count;
}
protected int getIndexForBytes(byte[] bytes) {
return findOrCreateIndex(bytes, false);
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#getOrCreateIndex(java.lang.String)
*/
@Override
public int getOrCreateIndex(String s) {
return getOrCreateIndexForBytes(string2bytes(s));
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#getIndex(java.lang.String)
*/
@Override
public int getIndex(String s) {
return getIndexForBytes(string2bytes(s));
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#addString(java.lang.String)
*/
@Override
public boolean addString(String s) {
return addBytes(string2bytes(s));
}
protected synchronized byte[] getBytesForIndex(int i) {
Preconditions.checkArgument(i >= 0);
if (i >= nodes.size()) {
return null;
}
return reconstructKey(i);
}
/* (non-Javadoc)
* @see com.google.devtools.build.lib.util.StringIndexer#getStringForIndex(int)
*/
@Override
public String getStringForIndex(int i) {
byte[] bytes = getBytesForIndex(i);
return bytes != null ? bytes2string(bytes) : null;
}
private void dumpNodeContent(StringBuilder builder, byte[] content) {
int[] intHolder = new int[1];
int offset = VarInt.getVarInt(content, 0, intHolder);
builder.append("parent: ").append(intHolder[0]);
offset = VarInt.getVarInt(content, offset, intHolder);
int len = intHolder[0];
builder.append(", len: ").append(len).append(", key: \"")
.append(new String(content, offset, len, UTF_8)).append('"');
offset += len;
while (offset < content.length) {
builder.append(", '").append(new String(content, offset, 1, UTF_8)).append("': ");
offset = VarInt.getVarInt(content, offset + 1, intHolder);
builder.append(intHolder[0]);
}
builder.append(", size: ").append(content.length);
}
private int dumpContent(StringBuilder builder, int node, int indent, boolean[] seen) {
for(int i = 0; i < indent; i++) {
builder.append(" ");
}
builder.append(node).append(": ");
if (node >= nodes.size()) {
builder.append("OUT_OF_BOUNDS\n");
return 0;
} else if (seen[node]) {
builder.append("ALREADY_SEEN\n");
return 0;
}
seen[node] = true;
byte[] content = nodes.get(node);
if (content == null) {
builder.append("NULL\n");
return 0;
}
dumpNodeContent(builder, content);
builder.append("\n");
int contentSize = content.length;
int[] intHolder = new int[1];
int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id
contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length
contentOffset += intHolder[0];
while (contentOffset < content.length) {
contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder);
contentSize += dumpContent(builder, intHolder[0], indent + 1, seen);
}
return contentSize;
}
@Override
public synchronized String toString() {
StringBuilder builder = new StringBuilder();
builder.append("size = ").append(nodes.size()).append("\n");
if (!nodes.isEmpty()) {
int contentSize = dumpContent(builder, rootId, 0, new boolean[nodes.size()]);
builder.append("contentSize = ").append(contentSize).append("\n");
}
return builder.toString();
}
}