feat(libs): add incremental version of murmurhash2 calculation
This does two passes for a given file, which is kinda slow, but I don't know how else to get the size excluding the filtered ones :< Signed-off-by: flow <flowlnlnln@gmail.com>
This commit is contained in:
parent
15ec1abb6a
commit
f95bcf45ad
@ -1,86 +1,110 @@
|
||||
//-----------------------------------------------------------------------------
|
||||
// MurmurHash2 was written by Austin Appleby, and is placed in the public
|
||||
// domain. The author hereby disclaims copyright to this source code.
|
||||
|
||||
// Note - This code makes a few assumptions about how your machine behaves -
|
||||
|
||||
// 1. We can read a 4-byte value from any address without crashing
|
||||
// 2. sizeof(int) == 4
|
||||
|
||||
// And it has a few limitations -
|
||||
|
||||
// 1. It will not work incrementally.
|
||||
// 2. It will not produce the same results on little-endian and big-endian
|
||||
// machines.
|
||||
//
|
||||
// This was modified as to possibilitate it's usage incrementally.
|
||||
// Those modifications are also placed in the public domain, and the author of
|
||||
// such modifications hereby disclaims copyright to this source code.
|
||||
|
||||
#include "MurmurHash2.h"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Platform-specific functions and macros
|
||||
|
||||
// Microsoft Visual Studio
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define BIG_CONSTANT(x) (x)
|
||||
|
||||
// Other compilers
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#define BIG_CONSTANT(x) (x##LLU)
|
||||
|
||||
#endif // !defined(_MSC_VER)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed )
|
||||
{
|
||||
// 'm' and 'r' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
|
||||
const uint32_t m = 0x5bd1e995;
|
||||
const int r = 24;
|
||||
|
||||
// Initialize the hash to a 'random' value
|
||||
uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out)
|
||||
{
|
||||
auto* buffer = new char[buffer_size];
|
||||
char data[4];
|
||||
|
||||
uint32_t h = seed ^ len;
|
||||
int read = 0;
|
||||
uint32_t size = 0;
|
||||
|
||||
// We need the size without the filtered out characters before actually calculating the hash,
|
||||
// to setup the initial value for the hash.
|
||||
do {
|
||||
file_stream.read(buffer, buffer_size);
|
||||
read = file_stream.gcount();
|
||||
for (int i = 0; i < read; i++) {
|
||||
if (!filter_out(buffer[i]))
|
||||
size += 1;
|
||||
}
|
||||
} while (!file_stream.eof());
|
||||
|
||||
file_stream.clear();
|
||||
file_stream.seekg(0, file_stream.beg);
|
||||
|
||||
int index = 0;
|
||||
|
||||
// This forces a seed of 1.
|
||||
IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size };
|
||||
do {
|
||||
file_stream.read(buffer, buffer_size);
|
||||
read = file_stream.gcount();
|
||||
for (int i = 0; i < read; i++) {
|
||||
char c = buffer[i];
|
||||
|
||||
if (filter_out(c))
|
||||
continue;
|
||||
|
||||
data[index] = c;
|
||||
index = (index + 1) % 4;
|
||||
|
||||
// Mix 4 bytes at a time into the hash
|
||||
const auto* data = (const unsigned char*) key;
|
||||
while(len >= 4)
|
||||
if (index == 0)
|
||||
FourBytes_MurmurHash2((unsigned char*)&data, info);
|
||||
}
|
||||
} while (!file_stream.eof());
|
||||
|
||||
// Do one last bit shuffle in the hash
|
||||
FourBytes_MurmurHash2((unsigned char*)&data, info);
|
||||
|
||||
delete[] buffer;
|
||||
|
||||
file_stream.close();
|
||||
return info.h;
|
||||
}
|
||||
|
||||
void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev)
|
||||
{
|
||||
if (prev.len >= 4) {
|
||||
// Not the final mix
|
||||
uint32_t k = *(uint32_t*)data;
|
||||
|
||||
k *= m;
|
||||
k ^= k >> r;
|
||||
k *= m;
|
||||
|
||||
h *= m;
|
||||
h ^= k;
|
||||
prev.h *= m;
|
||||
prev.h ^= k;
|
||||
|
||||
data += 4*sizeof(char);
|
||||
len -= 4;
|
||||
}
|
||||
prev.len -= 4;
|
||||
} else {
|
||||
// The final mix
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
|
||||
switch(len)
|
||||
{
|
||||
case 3: h ^= data[2] << 16;
|
||||
case 2: h ^= data[1] << 8;
|
||||
case 1: h ^= data[0];
|
||||
h *= m;
|
||||
switch (prev.len) {
|
||||
case 3:
|
||||
prev.h ^= data[2] << 16;
|
||||
case 2:
|
||||
prev.h ^= data[1] << 8;
|
||||
case 1:
|
||||
prev.h ^= data[0];
|
||||
prev.h *= m;
|
||||
};
|
||||
|
||||
// Do a few final mixes of the hash to ensure the last few
|
||||
// bytes are well-incorporated.
|
||||
|
||||
h ^= h >> 13;
|
||||
h *= m;
|
||||
h ^= h >> 15;
|
||||
prev.h ^= prev.h >> 13;
|
||||
prev.h *= m;
|
||||
prev.h ^= prev.h >> 15;
|
||||
|
||||
return h;
|
||||
prev.len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -1,30 +1,30 @@
|
||||
//-----------------------------------------------------------------------------
|
||||
// MurmurHash2 was written by Austin Appleby, and is placed in the public
|
||||
// domain. The author hereby disclaims copyright to this source code.
|
||||
// The original MurmurHash2 was written by Austin Appleby, and is placed in the
|
||||
// public domain. The author hereby disclaims copyright to this source code.
|
||||
//
|
||||
// This was modified as to possibilitate it's usage incrementally.
|
||||
// Those modifications are also placed in the public domain, and the author of
|
||||
// such modifications hereby disclaims copyright to this source code.
|
||||
|
||||
#pragma once
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Platform-specific functions and macros
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
|
||||
// Microsoft Visual Studio
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1600)
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
|
||||
// Other compilers
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#endif // !defined(_MSC_VER)
|
||||
#include <functional>
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed = 1 );
|
||||
uint32_t MurmurHash2(
|
||||
std::ifstream&& file_stream,
|
||||
std::size_t buffer_size = 4096,
|
||||
std::function<bool(char)> filter_out = [](char) { return true; });
|
||||
|
||||
struct IncrementalHashInfo {
|
||||
uint32_t h;
|
||||
uint32_t len;
|
||||
};
|
||||
|
||||
void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev);
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
Loading…
x
Reference in New Issue
Block a user