blob: 331a3b9bebf20c66595dc0590b44395d06e74016 [file] [log] [blame]
// Copyright 2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internal helper functions for finding optimal edit transformations
// between strings.
#include "gtest/gtest.h"
#include <functional>
#include <list>
#include <ostream> // NOLINT
#include <queue>
#include <vector>
namespace testing {
namespace internal {
namespace {
// The following implement data structures and code for a Dijkstra-search
// based implementation of optimal edit distance.
// Posible states a node can be in. Either a node is unsettled (it hasn't been
// drawn from the priority queue yet), or it is settled and a back-link to its
// parent node is fixed.
enum EditSearchState {
kUnsettled,
kMatchParent,
kAddParent,
kRemoveParent,
kReplaceParent
};
// Custom container for search states. This is smaller and faster than a hash
// map, because the used states are dense along diagonals.
// Specifically, each state requires only 1 byte, whereas a hash_map would
// require storing the key, which would come to at least 8 bytes. std::map has
// an extra 32 bytes per node (3 pointers + 1 byte, padded), so even though
// there are circumstances where this class can have kBlockSize overhead per
// state, on average it does better than 40 bytes of overhead per state.
// In addition, in unopt builds (the usual way tests are run) the fewer
// allocations + better locality has this method running 10-50x faster than
// std::map for inputs that are large enough to measure.
class EditSearchMap {
public:
EditSearchMap(size_t left_size, size_t right_size)
: left_size_(left_size), right_size_(right_size) {
GTEST_CHECK_(left_size_ == left_size && right_size_ == right_size)
<< "Overflow in size: Arguments too large";
}
// Gets a mutable reference to a state - this is actually of type
// EditSearchState - inserting if it does not exist.
unsigned char& insert(UInt32 left, UInt32 right) {
std::vector<UInt32>* vec;
size_t index1;
size_t index2;
if (left > right) {
vec = &left_nodes_;
index1 = left - right - 1;
index2 = right;
} else {
vec = &right_nodes_;
index1 = right - left;
index2 = left;
}
if (vec->size() <= index1) {
GTEST_CHECK_(vec->size() == index1)
<< "Array diagonals should only grow by one " << vec->size() << " vs "
<< index1;
vec->push_back(block_indices_.size());
// Round up
block_indices_.resize(
block_indices_.size() +
(DiagonalLength(left, right) + kBlockSize - 1) / kBlockSize,
kUnallocatedBlock);
}
const size_t bucket = index2 / kBlockSize;
const size_t pos_in_bucket = index2 % kBlockSize;
UInt32& level2 = block_indices_[(*vec)[index1] + bucket];
if (level2 == kUnallocatedBlock) {
level2 = nodes_.size();
size_t diagonal_length = DiagonalLength(left, right);
GTEST_CHECK_(diagonal_length > index2)
<< diagonal_length << " " << index2;
size_t block_size = kBlockSize;
if (diagonal_length / kBlockSize == bucket) {
// We can never get here if diagonal_length is a multiple of
// kBlockSize, which is what we want, since this would evaluate to 0.
block_size = diagonal_length % kBlockSize;
}
nodes_.resize(nodes_.size() + block_size);
}
return nodes_[level2 + pos_in_bucket];
}
size_t MemoryUsage() const {
return nodes_.capacity() +
sizeof(UInt32) * (left_nodes_.capacity() + right_nodes_.capacity() +
block_indices_.capacity());
}
private:
enum { kBlockSize = 1024, kUnallocatedBlock = 0xFFFFFFFFul };
size_t DiagonalLength(UInt32 left, UInt32 right) const {
return std::min(left_size_ - left, right_size_ - right) +
(left < right ? left : right);
}
// The state space is conceptually a left_size_ by right_size_ sparse matrix
// of EditSearchStates. However, due to the access pattern of the search, it
// is much better to store the nodes per diagonal rather than per row.
UInt32 left_size_;
UInt32 right_size_;
// The nodes are stored by diagonals, split in two: Those to the left of the
// main diagonal are in left_nodes_, and everything else is in right_nodes_.
// The values are indices into block_indices_.
std::vector<UInt32> left_nodes_;
std::vector<UInt32> right_nodes_;
// Every entry here is an offset into the beginning of a kBlockSize-sized
// block in nodes_. An entire diagonal is allocated together here; for a
// diagonal of length <= kBlockSize, that's just a single entry, but for
// longer diagonals multiple contiguous index entries will be reserved at
// once. Unused entries will be assigned kUnallocatedBlock; this
// double-indirect scheme is used to save memory in the cases when an entire
// diagonal isn't needed.
std::vector<UInt32> block_indices_;
// This stores the actual EditSearchState data, pointed to by block_indices_.
std::vector<unsigned char> nodes_;
};
struct EditHeapEntry {
EditHeapEntry(UInt32 l, UInt32 r, UInt64 c, EditSearchState s)
: left(l), right(r), cost(c), state(s) {}
UInt32 left;
UInt32 right;
UInt64 cost : 61;
// The state that the node will get when this entry is settled. Therefore,
// this can never be kUnsettled.
UInt64 state : 3;
bool operator>(const EditHeapEntry& other) const { return cost > other.cost; }
};
// Need a min-queue, so invert the comparator.
typedef std::priority_queue<EditHeapEntry, std::vector<EditHeapEntry>,
std::greater<EditHeapEntry>>
EditHeap;
} // namespace
std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
const std::vector<size_t>& right,
size_t* memory_usage) {
const UInt64 kBaseCost = 100000;
// We make replace a little more expensive than add/remove to lower
// their priority.
const UInt64 kReplaceCost = 100001;
// In the common case where the vectors are the same (or almost the same)
// size, we know that an add will have to be followed by some later remove
// (or vice versa) in order to get the lengths to balance. We "borrow" some
// of the cost of the later operation and bring it forward into the earlier
// operation, to increase the cost of exploring (usually fruitlessly) around
// the beginning of the graph.
// However, there is a trade-off: This cheapens the cost of exploring around
// the beginning of the graph (in one direction) when the vectors are
// unequal in length. So we don't steal *all* the cost.
// You can view this as a form of A*, using an admissable heuristic that has
// been re-cast as a cost function that can be used in Dijkstra.
const UInt64 kTowardsGoalCost = 50003;
const UInt64 kAwayFromGoalCost = 2 * kBaseCost - kTowardsGoalCost;
EditSearchMap node_map(left.size() + 1, right.size() + 1);
EditHeap heap;
heap.push(EditHeapEntry(0, 0, 0, kReplaceParent));
while (!heap.empty()) {
const EditHeapEntry current_entry = heap.top();
heap.pop();
UInt32 left_pos = current_entry.left;
UInt32 right_pos = current_entry.right;
unsigned char& current_state = node_map.insert(left_pos, right_pos);
if (current_state != kUnsettled) {
// Node was already settled by a previous entry in the priority queue,
// this is a suboptimal path that should be ignored.
continue;
}
current_state = current_entry.state;
if (left_pos == left.size() && right_pos == right.size()) {
// This is the normal exit point; if we terminate due to the heap being
// empty, we'll fail a check later.
break;
}
// Special case: Since the cost of a match is zero, we can immediately
// settle the new node without putting it in the queue, since nothing can
// have a smaller cost than it. Furthermore, we don't need to relax the
// other two edges, since we know we don't need them: Any path from this
// node that would use them has an path via the match that is at least as
// cheap. Together, this means we can loop here until we stop matching.
while (left_pos < left.size() && right_pos < right.size() &&
left[left_pos] == right[right_pos]) {
left_pos++;
right_pos++;
unsigned char& fast_forward_state = node_map.insert(left_pos, right_pos);
if (fast_forward_state != kUnsettled) {
// The search reached around and settled this node before settling the
// base node. This means we're completely done with this iteration;
// abort to the outer loop.
goto outer_loop_bottom;
// Otherwise, when can settle this node, even if it was created from
// another state - we know the cost of settling it now is optimal.
}
fast_forward_state = kMatchParent;
}
// Relax adjacent nodes. We have no way to find or lower the cost of
// existing entries in the heap, so we just push new entries and throw
// them out at the top if the node is already settled. We *could* check to
// see if they're already settled before pushing, but it turns out to be
// ~not any faster, and more complicated to do so.
//
// If we're at an edge, there's only one node to relax.
if (left_pos >= left.size()) {
if (right_pos >= right.size()) {
break; // Can happen due to the fast-path loop above.
}
heap.push(EditHeapEntry(left_pos, right_pos + 1,
current_entry.cost + kTowardsGoalCost,
kAddParent));
continue;
}
if (right_pos >= right.size()) {
heap.push(EditHeapEntry(left_pos + 1, right_pos,
current_entry.cost + kTowardsGoalCost,
kRemoveParent));
continue;
}
// General case: Relax 3 edges.
heap.push(EditHeapEntry(
left_pos, right_pos + 1,
current_entry.cost + (right.size() + left_pos > right_pos + left.size()
? kTowardsGoalCost
: kAwayFromGoalCost),
kAddParent));
heap.push(EditHeapEntry(
left_pos + 1, right_pos,
current_entry.cost + (right.size() + left_pos < right_pos + left.size()
? kTowardsGoalCost
: kAwayFromGoalCost),
kRemoveParent));
heap.push(EditHeapEntry(left_pos + 1, right_pos + 1,
current_entry.cost + kReplaceCost, kReplaceParent));
outer_loop_bottom : {} // Need the curlies to form a statement.
}
// Reconstruct the best path. We do it in reverse order.
std::vector<EditType> best_path;
UInt32 left_pos = left.size();
UInt32 right_pos = right.size();
while (left_pos != 0 || right_pos != 0) {
GTEST_CHECK_(left_pos <= left.size() && right_pos <= right.size());
// The node must already exist, but if it somehow doesn't, it will be
// added as kUnsettled, which will crash below.
const unsigned char state = node_map.insert(left_pos, right_pos);
switch (state) {
case kAddParent:
right_pos--;
break;
case kRemoveParent:
left_pos--;
break;
case kMatchParent:
case kReplaceParent:
left_pos--;
right_pos--;
break;
default:
GTEST_LOG_(FATAL) << "Unsettled node at " << left_pos << ","
<< right_pos;
}
best_path.push_back(static_cast<EditType>(state - 1));
}
std::reverse(best_path.begin(), best_path.end());
if (memory_usage != NULL) {
*memory_usage = node_map.MemoryUsage();
}
return best_path;
}
namespace {
// Helper class to convert string into ids with deduplication.
class InternalStrings {
public:
size_t GetId(const std::string* str) {
IdMap::iterator it = ids_.find(str);
if (it != ids_.end()) return it->second;
size_t id = ids_.size();
return ids_[str] = id;
}
private:
struct IdMapCmp {
bool operator()(const std::string* first, const std::string* second) const {
return *first < *second;
}
};
typedef std::map<const std::string*, size_t, IdMapCmp> IdMap;
IdMap ids_;
};
} // namespace
std::vector<EditType> CalculateOptimalEdits(
const std::vector<std::string>& left,
const std::vector<std::string>& right) {
std::vector<size_t> left_ids, right_ids;
{
InternalStrings intern_table;
for (size_t i = 0; i < left.size(); ++i) {
left_ids.push_back(intern_table.GetId(&left[i]));
}
for (size_t i = 0; i < right.size(); ++i) {
right_ids.push_back(intern_table.GetId(&right[i]));
}
}
return CalculateOptimalEdits(left_ids, right_ids);
}
namespace {
// Helper class that holds the state for one hunk and prints it out to the
// stream.
// It reorders adds/removes when possible to group all removes before all
// adds. It also adds the hunk header before printing into the stream.
class Hunk {
public:
Hunk(size_t left_start, size_t right_start)
: left_start_(left_start),
right_start_(right_start),
adds_(),
removes_(),
common_() {}
void PushLine(char edit, const char* line) {
switch (edit) {
case ' ':
++common_;
FlushEdits();
hunk_.push_back(std::make_pair(' ', line));
break;
case '-':
++removes_;
hunk_removes_.push_back(std::make_pair('-', line));
break;
case '+':
++adds_;
hunk_adds_.push_back(std::make_pair('+', line));
break;
}
}
void PrintTo(std::ostream* os) {
PrintHeader(os);
FlushEdits();
for (std::list<std::pair<char, const char*> >::const_iterator it =
hunk_.begin();
it != hunk_.end(); ++it) {
*os << it->first << it->second << "\n";
}
}
bool has_edits() const { return adds_ || removes_; }
private:
void FlushEdits() {
hunk_.splice(hunk_.end(), hunk_removes_);
hunk_.splice(hunk_.end(), hunk_adds_);
}
// Print a unified diff header for one hunk.
// The format is
// "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
// where the left/right parts are omitted if unnecessary.
void PrintHeader(std::ostream* ss) const {
*ss << "@@ ";
if (removes_) {
*ss << "-" << left_start_ << "," << (removes_ + common_);
}
if (removes_ && adds_) {
*ss << " ";
}
if (adds_) {
*ss << "+" << right_start_ << "," << (adds_ + common_);
}
*ss << " @@\n";
}
size_t left_start_, right_start_;
size_t adds_, removes_, common_;
std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
};
} // namespace
// Create a list of diff hunks in Unified diff format.
// Each hunk has a header generated by PrintHeader above plus a body with
// lines prefixed with ' ' for no change, '-' for deletion and '+' for
// addition.
// 'context' represents the desired unchanged prefix/suffix around the diff.
// If two hunks are close enough that their contexts overlap, then they are
// joined into one hunk.
std::string CreateUnifiedDiff(const std::vector<std::string>& left,
const std::vector<std::string>& right,
size_t context) {
const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
size_t l_i = 0, r_i = 0, edit_i = 0;
std::stringstream ss;
while (edit_i < edits.size()) {
// Find first edit.
while (edit_i < edits.size() && edits[edit_i] == kEditMatch) {
++l_i;
++r_i;
++edit_i;
}
// Find the first line to include in the hunk.
const size_t prefix_context = std::min(l_i, context);
Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
for (size_t i = prefix_context; i > 0; --i) {
hunk.PushLine(' ', left[l_i - i].c_str());
}
// Iterate the edits until we found enough suffix for the hunk or the input
// is over.
size_t n_suffix = 0;
for (; edit_i < edits.size(); ++edit_i) {
if (n_suffix >= context) {
// Continue only if the next hunk is very close.
std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
while (it != edits.end() && *it == kEditMatch) ++it;
if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
// There is no next edit or it is too far away.
break;
}
}
EditType edit = edits[edit_i];
// Reset count when a non match is found.
n_suffix = edit == kEditMatch ? n_suffix + 1 : 0;
if (edit == kEditMatch || edit == kEditRemove || edit == kEditReplace) {
hunk.PushLine(edit == kEditMatch ? ' ' : '-', left[l_i].c_str());
}
if (edit == kEditAdd || edit == kEditReplace) {
hunk.PushLine('+', right[r_i].c_str());
}
// Advance indices, depending on edit type.
l_i += edit != kEditAdd;
r_i += edit != kEditRemove;
}
if (!hunk.has_edits()) {
// We are done. We don't want this hunk.
break;
}
hunk.PrintTo(&ss);
}
return ss.str();
}
} // namespace internal
} // namespace testing