feat(libs): add incremental version of murmurhash2 calculation
This does two passes for a given file, which is kinda slow, but I don't know how else to get the size excluding the filtered ones :< Signed-off-by: flow <flowlnlnln@gmail.com>
This commit is contained in:
		| @@ -1,86 +1,110 @@ | ||||
| //----------------------------------------------------------------------------- | ||||
| // MurmurHash2 was written by Austin Appleby, and is placed in the public | ||||
| // domain. The author hereby disclaims copyright to this source code. | ||||
|  | ||||
| // Note - This code makes a few assumptions about how your machine behaves - | ||||
|  | ||||
| // 1. We can read a 4-byte value from any address without crashing | ||||
| // 2. sizeof(int) == 4 | ||||
|  | ||||
| // And it has a few limitations - | ||||
|  | ||||
| // 1. It will not work incrementally. | ||||
| // 2. It will not produce the same results on little-endian and big-endian | ||||
| //    machines. | ||||
| // | ||||
| // This was modified as to possibilitate it's usage incrementally. | ||||
| // Those modifications are also placed in the public domain, and the author of | ||||
| // such modifications hereby disclaims copyright to this source code. | ||||
|  | ||||
| #include "MurmurHash2.h" | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
| // Platform-specific functions and macros | ||||
|  | ||||
| // Microsoft Visual Studio | ||||
| // 'm' and 'r' are mixing constants generated offline. | ||||
| // They're not really 'magic', they just happen to work well. | ||||
| const uint32_t m = 0x5bd1e995; | ||||
| const int r = 24; | ||||
|  | ||||
| #if defined(_MSC_VER) | ||||
|  | ||||
| #define BIG_CONSTANT(x) (x) | ||||
|  | ||||
| // Other compilers | ||||
|  | ||||
| #else	// defined(_MSC_VER) | ||||
|  | ||||
| #define BIG_CONSTANT(x) (x##LLU) | ||||
|  | ||||
| #endif // !defined(_MSC_VER) | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
|  | ||||
| uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed ) | ||||
| uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out) | ||||
| { | ||||
|   // 'm' and 'r' are mixing constants generated offline. | ||||
|   // They're not really 'magic', they just happen to work well. | ||||
|     auto* buffer = new char[buffer_size]; | ||||
|     char data[4]; | ||||
|  | ||||
|   const uint32_t m = 0x5bd1e995; | ||||
|   const int r = 24; | ||||
|     int read = 0; | ||||
|     uint32_t size = 0; | ||||
|  | ||||
|   // Initialize the hash to a 'random' value | ||||
|     // We need the size without the filtered out characters before actually calculating the hash, | ||||
|     // to setup the initial value for the hash. | ||||
|     do { | ||||
|         file_stream.read(buffer, buffer_size); | ||||
|         read = file_stream.gcount(); | ||||
|         for (int i = 0; i < read; i++) { | ||||
|             if (!filter_out(buffer[i])) | ||||
|                 size += 1; | ||||
|         } | ||||
|     } while (!file_stream.eof()); | ||||
|  | ||||
|   uint32_t h = seed ^ len; | ||||
|     file_stream.clear(); | ||||
|     file_stream.seekg(0, file_stream.beg); | ||||
|  | ||||
|   // Mix 4 bytes at a time into the hash | ||||
|   const auto* data = (const unsigned char*) key; | ||||
|   while(len >= 4) | ||||
|   { | ||||
|     uint32_t k = *(uint32_t*)data; | ||||
|     int index = 0; | ||||
|  | ||||
|     k *= m; | ||||
|     k ^= k >> r; | ||||
|     k *= m; | ||||
|     // This forces a seed of 1. | ||||
|     IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size }; | ||||
|     do { | ||||
|         file_stream.read(buffer, buffer_size); | ||||
|         read = file_stream.gcount(); | ||||
|         for (int i = 0; i < read; i++) { | ||||
|             char c = buffer[i]; | ||||
|  | ||||
|     h *= m; | ||||
|     h ^= k; | ||||
|             if (filter_out(c)) | ||||
|                 continue; | ||||
|  | ||||
|     data += 4*sizeof(char); | ||||
|     len -= 4; | ||||
|   } | ||||
|             data[index] = c; | ||||
|             index = (index + 1) % 4; | ||||
|  | ||||
|   // Handle the last few bytes of the input array | ||||
|             // Mix 4 bytes at a time into the hash | ||||
|             if (index == 0) | ||||
|                 FourBytes_MurmurHash2((unsigned char*)&data, info); | ||||
|         } | ||||
|     } while (!file_stream.eof()); | ||||
|  | ||||
|   switch(len) | ||||
|   { | ||||
|   case 3: h ^= data[2] << 16; | ||||
|   case 2: h ^= data[1] << 8; | ||||
|   case 1: h ^= data[0]; | ||||
|       h *= m; | ||||
|   }; | ||||
|     // Do one last bit shuffle in the hash | ||||
|     FourBytes_MurmurHash2((unsigned char*)&data, info); | ||||
|  | ||||
|   // Do a few final mixes of the hash to ensure the last few | ||||
|   // bytes are well-incorporated. | ||||
|     delete[] buffer; | ||||
|  | ||||
|   h ^= h >> 13; | ||||
|   h *= m; | ||||
|   h ^= h >> 15; | ||||
|     file_stream.close(); | ||||
|     return info.h; | ||||
| } | ||||
|  | ||||
|   return h; | ||||
| void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev) | ||||
| { | ||||
|     if (prev.len >= 4) { | ||||
|         // Not the final mix | ||||
|         uint32_t k = *(uint32_t*)data; | ||||
|  | ||||
|         k *= m; | ||||
|         k ^= k >> r; | ||||
|         k *= m; | ||||
|  | ||||
|         prev.h *= m; | ||||
|         prev.h ^= k; | ||||
|  | ||||
|         prev.len -= 4; | ||||
|     } else { | ||||
|         // The final mix | ||||
|  | ||||
|         // Handle the last few bytes of the input array | ||||
|         switch (prev.len) { | ||||
|             case 3: | ||||
|                 prev.h ^= data[2] << 16; | ||||
|             case 2: | ||||
|                 prev.h ^= data[1] << 8; | ||||
|             case 1: | ||||
|                 prev.h ^= data[0]; | ||||
|                 prev.h *= m; | ||||
|         }; | ||||
|  | ||||
|         // Do a few final mixes of the hash to ensure the last few | ||||
|         // bytes are well-incorporated. | ||||
|  | ||||
|         prev.h ^= prev.h >> 13; | ||||
|         prev.h *= m; | ||||
|         prev.h ^= prev.h >> 15; | ||||
|  | ||||
|         prev.len = 0; | ||||
|     } | ||||
| } | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
|   | ||||
| @@ -1,30 +1,30 @@ | ||||
| //----------------------------------------------------------------------------- | ||||
| // MurmurHash2 was written by Austin Appleby, and is placed in the public | ||||
| // domain. The author hereby disclaims copyright to this source code. | ||||
| // The original MurmurHash2 was written by Austin Appleby, and is placed in the | ||||
| // public domain. The author hereby disclaims copyright to this source code. | ||||
| // | ||||
| // This was modified as to possibilitate it's usage incrementally. | ||||
| // Those modifications are also placed in the public domain, and the author of | ||||
| // such modifications hereby disclaims copyright to this source code. | ||||
|  | ||||
| #pragma once | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
| // Platform-specific functions and macros | ||||
| #include <cstdint> | ||||
| #include <fstream> | ||||
|  | ||||
| // Microsoft Visual Studio | ||||
|  | ||||
| #if defined(_MSC_VER) && (_MSC_VER < 1600) | ||||
|  | ||||
| typedef unsigned char uint8_t; | ||||
| typedef unsigned int uint32_t; | ||||
| typedef unsigned __int64 uint64_t; | ||||
|  | ||||
| // Other compilers | ||||
|  | ||||
| #else	// defined(_MSC_VER) | ||||
|  | ||||
| #include <stdint.h> | ||||
|  | ||||
| #endif // !defined(_MSC_VER) | ||||
| #include <functional> | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
|  | ||||
| uint64_t MurmurHash2        ( const void* key, int len, uint32_t seed = 1 ); | ||||
| uint32_t MurmurHash2( | ||||
|     std::ifstream&& file_stream, | ||||
|     std::size_t buffer_size = 4096, | ||||
|     std::function<bool(char)> filter_out = [](char) { return true; }); | ||||
|  | ||||
| struct IncrementalHashInfo { | ||||
|     uint32_t h; | ||||
|     uint32_t len; | ||||
| }; | ||||
|  | ||||
| void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev); | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 flow
					flow