Skip to content

Commit

Permalink
Rewrite IntSet to avoid std::vector entirely
Browse files: browse the repository at this point in the history
This reduces the time to prepare the chunk index from 7s to 5.8s on a 1.5GB input.
  • Loading branch information
zeux committed Nov 13, 2022
1 parent d29c3ba commit c622811
Showing 1 changed file with 25 additions and 10 deletions.
35 changes: 25 additions & 10 deletions src/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "workqueue.hpp"
#include "blockingqueue.hpp"

#include <algorithm>
#include <vector>
#include <list>
#include <numeric>
Expand Down Expand Up @@ -344,34 +345,48 @@ static unsigned int getIndexHashIterations(unsigned int indexSize, unsigned int

struct IntSet
{
std::vector<unsigned int> data;
unsigned int size;
unsigned int* data;
size_t capacity;
size_t size;

IntSet(size_t capacity = 0): data(capacity), size(0)
IntSet(size_t capacity = 0): data(new unsigned int[capacity]), capacity(capacity), size(0)
{
assert((capacity & (capacity - 1)) == 0);

memset(data, 0, capacity * sizeof(unsigned int));
}

~IntSet()
{
delete[] data;
}

IntSet(const IntSet&) = delete;
IntSet(IntSet&&) = delete;
IntSet& operator=(const IntSet&) = delete;
IntSet& operator=(IntSet&&) = delete;

void grow()
{
IntSet res(std::max(data.size() * 2, size_t(16)));
IntSet res(std::max(capacity * 2, size_t(16)));

for (size_t i = 0; i < data.size(); ++i)
for (size_t i = 0; i < capacity; ++i)
if (data[i])
res.insert(data[i]);

data.swap(res.data);
std::swap(data, res.data);
std::swap(capacity, res.capacity);
assert(size == res.size);
}

void insert(unsigned int key)
{
assert(key != 0);

if (size >= data.size() / 2)
if (size >= capacity / 2)
grow();

unsigned int m = data.size() - 1;
unsigned int m = capacity - 1;
unsigned int h = bloomHash2(key) & m;
unsigned int i = 0;

Expand Down Expand Up @@ -434,8 +449,8 @@ static ChunkIndex prepareChunkIndex(const char* data, size_t size)

memset(index, 0, indexSize);

for (auto n: ngrams.data)
if (n != 0)
for (size_t i = 0; i < ngrams.capacity; ++i)
if (unsigned int n = ngrams.data[i])
bloomFilterUpdate(index, indexSize, n, iterations);

return result;
Expand Down

0 comments on commit c622811

Please sign in to comment.