feat(libs): add incremental version of murmurhash2 calculation
This does two passes for a given file, which is kinda slow, but I don't know how else to get the size excluding the filtered ones :< Signed-off-by: flow <flowlnlnln@gmail.com>
This commit is contained in:
		| @@ -1,86 +1,110 @@ | |||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
| // MurmurHash2 was written by Austin Appleby, and is placed in the public | // MurmurHash2 was written by Austin Appleby, and is placed in the public | ||||||
| // domain. The author hereby disclaims copyright to this source code. | // domain. The author hereby disclaims copyright to this source code. | ||||||
|  | // | ||||||
| // Note - This code makes a few assumptions about how your machine behaves - | // This was modified to make it usable incrementally. | ||||||
|  | // Those modifications are also placed in the public domain, and the author of | ||||||
| // 1. We can read a 4-byte value from any address without crashing | // such modifications hereby disclaims copyright to this source code. | ||||||
| // 2. sizeof(int) == 4 |  | ||||||
|  |  | ||||||
| // And it has a few limitations - |  | ||||||
|  |  | ||||||
| // 1. It will not work incrementally. |  | ||||||
| // 2. It will not produce the same results on little-endian and big-endian |  | ||||||
| //    machines. |  | ||||||
|  |  | ||||||
#include "MurmurHash2.h"

#include <vector>
|  |  | ||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
| // Platform-specific functions and macros |  | ||||||
|  |  | ||||||
// Mixing constants of MurmurHash2. They were generated offline; there is
// nothing 'magic' about them, they simply happen to work well.
constexpr uint32_t m = 0x5bd1e995;  // multiplier applied in every mixing step
constexpr int r = 24;               // right shift applied to each 4-byte word
|  |  | ||||||
| #if defined(_MSC_VER) | uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out) | ||||||
|  |  | ||||||
| #define BIG_CONSTANT(x) (x) |  | ||||||
|  |  | ||||||
| // Other compilers |  | ||||||
|  |  | ||||||
| #else	// defined(_MSC_VER) |  | ||||||
|  |  | ||||||
| #define BIG_CONSTANT(x) (x##LLU) |  | ||||||
|  |  | ||||||
| #endif // !defined(_MSC_VER) |  | ||||||
|  |  | ||||||
| //----------------------------------------------------------------------------- |  | ||||||
|  |  | ||||||
| uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed ) |  | ||||||
| { | { | ||||||
|   // 'm' and 'r' are mixing constants generated offline. |     auto* buffer = new char[buffer_size]; | ||||||
|   // They're not really 'magic', they just happen to work well. |     char data[4]; | ||||||
|  |  | ||||||
|   const uint32_t m = 0x5bd1e995; |     int read = 0; | ||||||
|   const int r = 24; |     uint32_t size = 0; | ||||||
|  |  | ||||||
|   // Initialize the hash to a 'random' value |     // We need the size without the filtered out characters before actually calculating the hash, | ||||||
|  |     // to setup the initial value for the hash. | ||||||
|  |     do { | ||||||
|  |         file_stream.read(buffer, buffer_size); | ||||||
|  |         read = file_stream.gcount(); | ||||||
|  |         for (int i = 0; i < read; i++) { | ||||||
|  |             if (!filter_out(buffer[i])) | ||||||
|  |                 size += 1; | ||||||
|  |         } | ||||||
|  |     } while (!file_stream.eof()); | ||||||
|  |  | ||||||
|   uint32_t h = seed ^ len; |     file_stream.clear(); | ||||||
|  |     file_stream.seekg(0, file_stream.beg); | ||||||
|  |  | ||||||
|  |     int index = 0; | ||||||
|  |  | ||||||
|  |     // This forces a seed of 1. | ||||||
|  |     IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size }; | ||||||
|  |     do { | ||||||
|  |         file_stream.read(buffer, buffer_size); | ||||||
|  |         read = file_stream.gcount(); | ||||||
|  |         for (int i = 0; i < read; i++) { | ||||||
|  |             char c = buffer[i]; | ||||||
|  |  | ||||||
|  |             if (filter_out(c)) | ||||||
|  |                 continue; | ||||||
|  |  | ||||||
|  |             data[index] = c; | ||||||
|  |             index = (index + 1) % 4; | ||||||
|  |  | ||||||
|             // Mix 4 bytes at a time into the hash |             // Mix 4 bytes at a time into the hash | ||||||
|   const auto* data = (const unsigned char*) key; |             if (index == 0) | ||||||
|   while(len >= 4) |                 FourBytes_MurmurHash2((unsigned char*)&data, info); | ||||||
|   { |         } | ||||||
|  |     } while (!file_stream.eof()); | ||||||
|  |  | ||||||
|  |     // Do one last bit shuffle in the hash | ||||||
|  |     FourBytes_MurmurHash2((unsigned char*)&data, info); | ||||||
|  |  | ||||||
|  |     delete[] buffer; | ||||||
|  |  | ||||||
|  |     file_stream.close(); | ||||||
|  |     return info.h; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev) | ||||||
|  | { | ||||||
|  |     if (prev.len >= 4) { | ||||||
|  |         // Not the final mix | ||||||
|         uint32_t k = *(uint32_t*)data; |         uint32_t k = *(uint32_t*)data; | ||||||
|  |  | ||||||
|         k *= m; |         k *= m; | ||||||
|         k ^= k >> r; |         k ^= k >> r; | ||||||
|         k *= m; |         k *= m; | ||||||
|  |  | ||||||
|     h *= m; |         prev.h *= m; | ||||||
|     h ^= k; |         prev.h ^= k; | ||||||
|  |  | ||||||
|     data += 4*sizeof(char); |         prev.len -= 4; | ||||||
|     len -= 4; |     } else { | ||||||
|   } |         // The final mix | ||||||
|  |  | ||||||
|         // Handle the last few bytes of the input array |         // Handle the last few bytes of the input array | ||||||
|  |         switch (prev.len) { | ||||||
|   switch(len) |             case 3: | ||||||
|   { |                 prev.h ^= data[2] << 16; | ||||||
|   case 3: h ^= data[2] << 16; |             case 2: | ||||||
|   case 2: h ^= data[1] << 8; |                 prev.h ^= data[1] << 8; | ||||||
|   case 1: h ^= data[0]; |             case 1: | ||||||
|       h *= m; |                 prev.h ^= data[0]; | ||||||
|  |                 prev.h *= m; | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // Do a few final mixes of the hash to ensure the last few |         // Do a few final mixes of the hash to ensure the last few | ||||||
|         // bytes are well-incorporated. |         // bytes are well-incorporated. | ||||||
|  |  | ||||||
|   h ^= h >> 13; |         prev.h ^= prev.h >> 13; | ||||||
|   h *= m; |         prev.h *= m; | ||||||
|   h ^= h >> 15; |         prev.h ^= prev.h >> 15; | ||||||
|  |  | ||||||
|   return h; |         prev.len = 0; | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
|   | |||||||
| @@ -1,30 +1,30 @@ | |||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
| // MurmurHash2 was written by Austin Appleby, and is placed in the public | // The original MurmurHash2 was written by Austin Appleby, and is placed in the | ||||||
| // domain. The author hereby disclaims copyright to this source code. | // public domain. The author hereby disclaims copyright to this source code. | ||||||
|  | // | ||||||
|  | // This was modified to make it usable incrementally. | ||||||
|  | // Those modifications are also placed in the public domain, and the author of | ||||||
|  | // such modifications hereby disclaims copyright to this source code. | ||||||
|  |  | ||||||
| #pragma once | #pragma once | ||||||
|  |  | ||||||
| //----------------------------------------------------------------------------- | #include <cstdint> | ||||||
| // Platform-specific functions and macros | #include <fstream> | ||||||
|  |  | ||||||
| // Microsoft Visual Studio | #include <functional> | ||||||
|  |  | ||||||
| #if defined(_MSC_VER) && (_MSC_VER < 1600) |  | ||||||
|  |  | ||||||
| typedef unsigned char uint8_t; |  | ||||||
| typedef unsigned int uint32_t; |  | ||||||
| typedef unsigned __int64 uint64_t; |  | ||||||
|  |  | ||||||
| // Other compilers |  | ||||||
|  |  | ||||||
| #else	// defined(_MSC_VER) |  | ||||||
|  |  | ||||||
| #include <stdint.h> |  | ||||||
|  |  | ||||||
| #endif // !defined(_MSC_VER) |  | ||||||
|  |  | ||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
|  |  | ||||||
// Computes the 32-bit MurmurHash2 (seed 1) of the contents of `file_stream`.
//
// file_stream  stream to hash; it is read in two passes and closed afterwards
// buffer_size  size of the chunk read from the stream at a time
// filter_out   returns true for every byte that must be left out of the
//              hash; by default no byte is left out
uint32_t MurmurHash2(
    std::ifstream&& file_stream,
    std::size_t buffer_size = 4096,
    std::function<bool(char)> filter_out = [](char) { return false; });
|  |  | ||||||
// State carried between the steps of an incremental MurmurHash2 run.
struct IncrementalHashInfo {
    // Hash value accumulated so far.
    std::uint32_t h;
    // Number of input bytes that have not been consumed yet.
    std::uint32_t len;
};

// Performs one step of the incremental hash on the bytes at `data`,
// updating `prev` in place. While prev.len is 4 or more, a 4-byte group is
// mixed in; once it is below 4, the remaining bytes are folded in and the
// hash is finalized.
void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev);
|  |  | ||||||
| //----------------------------------------------------------------------------- | //----------------------------------------------------------------------------- | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 flow
					flow