diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b54f6b8..d6a6c496 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,11 +99,21 @@ file(GLOB tilemaker_src_files src/helpers.cpp src/osm_lua_processing.cpp src/osm_store.cpp + src/mmap_allocator.cpp src/pbf_blocks.cpp src/read_shp.cpp src/shp_mem_tiles.cpp src/tilemaker.cpp src/write_geometry.cpp + src/node_stores.cpp + src/coordinates_geom.cpp + src/sorted_node_store.cpp + src/sorted_way_store.cpp + src/way_stores.cpp + src/external/streamvbyte_decode.cc + src/external/streamvbyte_encode.cc + src/external/streamvbyte_zigzag.cc + ) add_executable(tilemaker vector_tile.pb.cc osmformat.pb.cc ${tilemaker_src_files}) target_include_directories(tilemaker PRIVATE include) diff --git a/Makefile b/Makefile index 6a510b38..47207833 100644 --- a/Makefile +++ b/Makefile @@ -88,12 +88,54 @@ LIB := -L$(PLATFORM_PATH)/lib -lz $(LUA_LIBS) -lboost_program_options -lsqlite3 INC := -I$(PLATFORM_PATH)/include -isystem ./include -I./src $(LUA_CFLAGS) # Targets +.PHONY: test all: tilemaker -tilemaker: include/osmformat.pb.o include/vector_tile.pb.o src/mbtiles.o src/pbf_blocks.o src/coordinates.o src/osm_store.o src/helpers.o src/output_object.o src/read_shp.o src/read_pbf.o src/osm_lua_processing.o src/write_geometry.o src/shared_data.o src/tile_worker.o src/tile_data.o src/osm_mem_tiles.o src/shp_mem_tiles.o src/attribute_store.o src/tilemaker.o src/geom.o +tilemaker: \ + include/osmformat.pb.o \ + include/vector_tile.pb.o \ + src/attribute_store.o \ + src/coordinates_geom.o \ + src/coordinates.o \ + src/external/streamvbyte_decode.o \ + src/external/streamvbyte_encode.o \ + src/external/streamvbyte_zigzag.o \ + src/geom.o \ + src/helpers.o \ + src/mbtiles.o \ + src/mmap_allocator.o \ + src/node_stores.o \ + src/osm_lua_processing.o \ + src/osm_mem_tiles.o \ + src/osm_store.o \ + src/output_object.o \ + src/pbf_blocks.o \ + src/read_pbf.o \ + src/read_shp.o \ + src/shared_data.o \ + src/shp_mem_tiles.o \ + 
src/sorted_node_store.o \ + src/sorted_way_store.o \ + src/tile_data.o \ + src/tilemaker.o \ + src/tile_worker.o \ + src/way_stores.o \ + src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) +test: test_sorted_way_store + +test_sorted_way_store: \ + src/external/streamvbyte_decode.o \ + src/external/streamvbyte_encode.o \ + src/external/streamvbyte_zigzag.o \ + src/mmap_allocator.o \ + src/sorted_way_store.o \ + src/sorted_way_store.test.o + $(CXX) $(CXXFLAGS) -o test $^ $(INC) $(LIB) $(LDFLAGS) && ./test + + %.o: %.cpp $(CXX) $(CXXFLAGS) -o $@ -c $< $(INC) @@ -110,6 +152,6 @@ install: install docs/man/tilemaker.1 ${DESTDIR}${MANPREFIX}/man1/ clean: - rm -f tilemaker src/*.o include/*.o include/*.pb.h + rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h .PHONY: install diff --git a/README.md b/README.md index 367d821a..0094a1b0 100644 --- a/README.md +++ b/README.md @@ -90,4 +90,13 @@ Formatting: braces and indents as shown, hard tabs (4sp). (Yes, I know.) Please Tilemaker is maintained by Richard Fairhurst and supported by [many contributors](https://github.com/systemed/tilemaker/graphs/contributors). -Copyright tilemaker contributors, 2015-2023. The tilemaker code is licensed as FTWPL; you may do anything you like with this code and there is no warranty. The included sqlite_modern_cpp (Amin Roosta) is MIT; [kaguya](https://github.com/satoren/kaguya) is licensed under the Boost Software Licence. +Copyright tilemaker contributors, 2015-2023. + +The tilemaker code is licensed as FTWPL; you may do anything you like with this code and there is no warranty. 
+ +Licenses of third-party libraries: + +- sqlite_modern_cpp (Amin Roosta) is licensed under MIT +- [kaguya](https://github.com/satoren/kaguya) is licensed under the Boost Software Licence +- [libpopcnt](https://github.com/kimwalisch/libpopcnt) is licensed under BSD 2-clause +- [streamvbyte](https://github.com/lemire/streamvbyte) is licensed under Apache 2 diff --git a/include/coordinates.h b/include/coordinates.h index fe5fd30d..3ca3b8cf 100644 --- a/include/coordinates.h +++ b/include/coordinates.h @@ -2,11 +2,27 @@ #ifndef _COORDINATES_H #define _COORDINATES_H -#include -#include "geom.h" +// Lightweight types and functions for coordinates, for classes that don't +// need to pull in boost::geometry. +// +// Things that pull in boost::geometry should go in coordinates_geom.h + +#include #include +#include +#include #include +// A 36-bit integer can store all OSM node IDs; we represent this as 16 collections +// of 32-bit integers. +#define NODE_SHARDS 16 +typedef uint32_t ShardedNodeID; +typedef uint64_t NodeID; +typedef uint64_t WayID; + +typedef std::vector WayVec; + + #ifdef FAT_TILE_INDEX // Supports up to z22 typedef uint32_t TileCoordinate; @@ -92,18 +108,18 @@ double lat2latp(double lat); double latp2lat(double latp); // Tile conversions -double lon2tilexf(double lon, uint z); -double latp2tileyf(double latp, uint z); -double lat2tileyf(double lat, uint z); -uint lon2tilex(double lon, uint z); -uint latp2tiley(double latp, uint z); -uint lat2tiley(double lat, uint z); -double tilex2lon(uint x, uint z); -double tiley2latp(uint y, uint z); -double tiley2lat(uint y, uint z); +double lon2tilexf(double lon, uint8_t z); +double latp2tileyf(double latp, uint8_t z); +double lat2tileyf(double lat, uint8_t z); +uint32_t lon2tilex(double lon, uint8_t z); +uint32_t latp2tiley(double latp, uint8_t z); +uint32_t lat2tiley(double lat, uint8_t z); +double tilex2lon(uint32_t x, uint8_t z); +double tiley2latp(uint32_t y, uint8_t z); +double tiley2lat(uint32_t y, uint8_t z); 
// Get a tile index -TileCoordinates latpLon2index(LatpLon ll, uint baseZoom); +TileCoordinates latpLon2index(LatpLon ll, uint8_t baseZoom); // Earth's (mean) radius // http://nssdc.gsfc.nasa.gov/planetary/factsheet/earthfact.html @@ -115,36 +131,8 @@ double degp2meter(double degp, double latp); double meter2degp(double meter, double latp); -void insertIntermediateTiles(Linestring const &points, uint baseZoom, std::unordered_set &tileSet); -void insertIntermediateTiles(Ring const &points, uint baseZoom, std::unordered_set &tileSet); - // the range between smallest y and largest y is filled, for each x void fillCoveredTiles(std::unordered_set &tileSet); -// ------------------------------------------------------ -// Helper class for dealing with spherical Mercator tiles - -class TileBbox { - -public: - double minLon, maxLon, minLat, maxLat, minLatp, maxLatp; - double xmargin, ymargin, xscale, yscale; - TileCoordinates index; - uint zoom; - bool hires; - bool endZoom; - Box clippingBox; - - TileBbox(TileCoordinates i, uint z, bool h, bool e); - - std::pair scaleLatpLon(double latp, double lon) const; - std::vector scaleRing(Ring const &src) const; - MultiPolygon scaleGeometry(MultiPolygon const &src) const; - std::pair floorLatpLon(double latp, double lon) const; - - Box getTileBox() const; - Box getExtendBox() const; -}; - #endif //_COORDINATES_H diff --git a/include/coordinates_geom.h b/include/coordinates_geom.h new file mode 100644 index 00000000..279de459 --- /dev/null +++ b/include/coordinates_geom.h @@ -0,0 +1,35 @@ +#ifndef _COORDINATES_GEOM_H +#define _COORDINATES_GEOM_H + +#include "coordinates.h" +#include "geom.h" + +void insertIntermediateTiles(Linestring const &points, uint baseZoom, std::unordered_set &tileSet); +void insertIntermediateTiles(Ring const &points, uint baseZoom, std::unordered_set &tileSet); + +// ------------------------------------------------------ +// Helper class for dealing with spherical Mercator tiles +class TileBbox { + +public: + 
double minLon, maxLon, minLat, maxLat, minLatp, maxLatp; + double xmargin, ymargin, xscale, yscale; + TileCoordinates index; + uint zoom; + bool hires; + bool endZoom; + Box clippingBox; + + TileBbox(TileCoordinates i, uint z, bool h, bool e); + + std::pair scaleLatpLon(double latp, double lon) const; + std::vector scaleRing(Ring const &src) const; + MultiPolygon scaleGeometry(MultiPolygon const &src) const; + std::pair floorLatpLon(double latp, double lon) const; + + Box getTileBox() const; + Box getExtendBox() const; +}; + + +#endif diff --git a/include/README_sqlite_cpp.md b/include/external/README_sqlite_cpp.md similarity index 100% rename from include/README_sqlite_cpp.md rename to include/external/README_sqlite_cpp.md diff --git a/include/kaguya.hpp b/include/external/kaguya.hpp similarity index 100% rename from include/kaguya.hpp rename to include/external/kaguya.hpp diff --git a/include/external/libpopcnt.h b/include/external/libpopcnt.h new file mode 100644 index 00000000..ffcd976b --- /dev/null +++ b/include/external/libpopcnt.h @@ -0,0 +1,798 @@ +/* + * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit + * population count) in an array as quickly as possible using + * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. + * + * Copyright (c) 2016 - 2020, Kim Walisch + * Copyright (c) 2016 - 2018, Wojciech Muła + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LIBPOPCNT_H +#define LIBPOPCNT_H + +#include +#include + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +#ifdef __GNUC__ + #define GNUC_PREREQ(x, y) \ + (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) +#else + #define GNUC_PREREQ(x, y) 0 +#endif + +#ifdef __clang__ + #define CLANG_PREREQ(x, y) \ + (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) +#else + #define CLANG_PREREQ(x, y) 0 +#endif + +#if (_MSC_VER < 1900) && \ + !defined(__cplusplus) + #define inline __inline +#endif + +#if (defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64)) + #define X86_OR_X64 +#endif + +#if GNUC_PREREQ(4, 2) || \ + __has_builtin(__builtin_popcount) + #define HAVE_BUILTIN_POPCOUNT +#endif + +#if GNUC_PREREQ(4, 2) || \ + CLANG_PREREQ(3, 0) + #define HAVE_ASM_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + (defined(HAVE_ASM_POPCNT) || \ + defined(_MSC_VER)) + #define HAVE_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(4, 9) + #define HAVE_AVX2 +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(5, 0) + #define 
HAVE_AVX512 +#endif + +#if defined(X86_OR_X64) + /* MSVC compatible compilers (Windows) */ + #if defined(_MSC_VER) + /* clang-cl (LLVM 10 from 2020) requires /arch:AVX2 or + * /arch:AVX512 to enable vector instructions */ + #if defined(__clang__) + #if defined(__AVX2__) + #define HAVE_AVX2 + #endif + #if defined(__AVX512__) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* MSVC 2017 or later does not require + * /arch:AVX2 or /arch:AVX512 */ + #elif _MSC_VER >= 1910 + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* Clang (Unix-like OSes) */ + #elif CLANG_PREREQ(3, 8) && \ + __has_attribute(target) && \ + (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif +#endif + +/* + * Only enable CPUID runtime checks if this is really + * needed. E.g. do not enable if user has compiled + * using -march=native on a CPU that supports AVX512. + */ +#if defined(X86_OR_X64) && \ + (defined(__cplusplus) || \ + defined(_MSC_VER) || \ + (GNUC_PREREQ(4, 2) || \ + __has_builtin(__sync_val_compare_and_swap))) && \ + ((defined(HAVE_AVX512) && !(defined(__AVX512__) || defined(__AVX512BW__))) || \ + (defined(HAVE_AVX2) && !defined(__AVX2__)) || \ + (defined(HAVE_POPCNT) && !defined(__POPCNT__))) + #define HAVE_CPUID +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This uses fewer arithmetic operations than any other known + * implementation on machines with fast multiplication. + * It uses 12 arithmetic operations, one of which is a multiply. 
+ * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation + */ +static inline uint64_t popcount64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +#if defined(HAVE_ASM_POPCNT) && \ + defined(__x86_64__) + +static inline uint64_t popcnt64(uint64_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +#elif defined(HAVE_ASM_POPCNT) && \ + defined(__i386__) + +static inline uint32_t popcnt32(uint32_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcnt32((uint32_t) x) + + popcnt32((uint32_t)(x >> 32)); +} + +#elif defined(_MSC_VER) && \ + defined(_M_X64) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u64(x); +} + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u32((uint32_t) x) + + _mm_popcnt_u32((uint32_t)(x >> 32)); +} + +/* non x86 CPUs */ +#elif defined(HAVE_BUILTIN_POPCOUNT) + +static inline uint64_t popcnt64(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* no hardware POPCNT, + * use pure integer algorithm */ +#else + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcount64(x); +} + +#endif + +#if defined(HAVE_CPUID) + +#if defined(_MSC_VER) + #include + #include +#endif + +/* %ecx bit flags */ +#define bit_POPCNT (1 << 23) + +/* %ebx bit flags */ +#define bit_AVX2 (1 << 5) +#define bit_AVX512 (1 << 30) + +/* xgetbv bit flags */ +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +static inline void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if 
defined(__i386__) && \ + defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #else + __asm__ ("cpuid;" + : "+b" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + +static inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +#endif + +static inline int get_cpuid() +{ + int flags = 0; + int abcd[4]; + + run_cpuid(1, 0, abcd); + + if ((abcd[2] & bit_POPCNT) == bit_POPCNT) + flags |= bit_POPCNT; + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + + int osxsave_mask = (1 << 27); + + /* ensure OS supports extended processor state management */ + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return 0; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + if ((xcr0 & ymm_mask) == ymm_mask) + { + run_cpuid(7, 0, abcd); + + if ((abcd[1] & bit_AVX2) == bit_AVX2) + flags |= bit_AVX2; + + if ((xcr0 & zmm_mask) == zmm_mask) + { + if ((abcd[1] & bit_AVX512) == bit_AVX512) + flags |= bit_AVX512; + } + } + +#endif + + return flags; +} + +#endif /* cpuid */ + +#if defined(HAVE_AVX2) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline __m256i popcnt256(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 
6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = _mm256_and_si256(v, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + +/* + * AVX2 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size) +{ + __m256i cnt = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsA, &fours, fours, foursA, foursB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9)); + 
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsB, &fours, fours, foursA, foursB); + CSA256(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); + } + + cnt = _mm256_slli_epi64(cnt, 4); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); + cnt = _mm256_add_epi64(cnt, popcnt256(ones)); + + for(; i < size; i++) + cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3]; +} + +#endif + +#if defined(HAVE_AVX512) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline __m512i popcnt512(__m512i v) +{ + __m512i m1 = _mm512_set1_epi8(0x55); + __m512i m2 = _mm512_set1_epi8(0x33); + __m512i m4 = _mm512_set1_epi8(0x0F); + __m512i vm = _mm512_and_si512(_mm512_srli_epi16(v, 1), m1); + __m512i t1 = _mm512_sub_epi8(v, vm); + __m512i tm = _mm512_and_si512(t1, m2); + __m512i tm2 = _mm512_and_si512(_mm512_srli_epi16(t1, 2), m2); + __m512i t2 = _mm512_add_epi8(tm, tm2); + __m512i tt = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)); + __m512i t3 = _mm512_and_si512(tt, m4); + + return _mm512_sad_epu8(t3, _mm512_setzero_si512()); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) +{ + *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); + *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); +} + 
+/* + * AVX512 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline uint64_t popcnt_avx512(const __m512i* ptr, const uint64_t size) +{ + __m512i cnt = _mm512_setzero_si512(); + __m512i ones = _mm512_setzero_si512(); + __m512i twos = _mm512_setzero_si512(); + __m512i fours = _mm512_setzero_si512(); + __m512i eights = _mm512_setzero_si512(); + __m512i sixteens = _mm512_setzero_si512(); + __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 0), _mm512_loadu_si512(ptr + i + 1)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 2), _mm512_loadu_si512(ptr + i + 3)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 4), _mm512_loadu_si512(ptr + i + 5)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 6), _mm512_loadu_si512(ptr + i + 7)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsA, &fours, fours, foursA, foursB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 8), _mm512_loadu_si512(ptr + i + 9)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 10), _mm512_loadu_si512(ptr + i + 11)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 12), _mm512_loadu_si512(ptr + i + 13)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 14), _mm512_loadu_si512(ptr + i + 15)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsB, &fours, fours, foursA, foursB); + CSA512(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm512_add_epi64(cnt, 
popcnt512(sixteens)); + } + + cnt = _mm512_slli_epi64(cnt, 4); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); + cnt = _mm512_add_epi64(cnt, popcnt512(ones)); + + for(; i < size; i++) + cnt = _mm512_add_epi64(cnt, popcnt512(_mm512_loadu_si512(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3] + + cnt64[4] + + cnt64[5] + + cnt64[6] + + cnt64[7]; +} + +#endif + +/* x86 CPUs */ +#if defined(X86_OR_X64) + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + +/* + * CPUID runtime checks are only enabled if this is needed. + * E.g. CPUID is disabled when a user compiles his + * code using -march=native on a CPU with AVX512. 
+ */ +#if defined(HAVE_CPUID) + #if defined(__cplusplus) + /* C++11 thread-safe singleton */ + static const int cpuid = get_cpuid(); + #else + static int cpuid_ = -1; + int cpuid = cpuid_; + if (cpuid == -1) + { + cpuid = get_cpuid(); + + #if defined(_MSC_VER) + _InterlockedCompareExchange(&cpuid_, cpuid, -1); + #else + __sync_val_compare_and_swap(&cpuid_, -1, cpuid); + #endif + } + #endif +#endif + +#if defined(HAVE_AVX512) + #if defined(__AVX512__) || defined(__AVX512BW__) + /* AVX512 requires arrays >= 1024 bytes */ + if (i + 1024 <= size) + #else + if ((cpuid & bit_AVX512) && + i + 1024 <= size) + #endif + { + const __m512i* ptr512 = (const __m512i*)(ptr + i); + cnt += popcnt_avx512(ptr512, (size - i) / 64); + i = size - size % 64; + } +#endif + +#if defined(HAVE_AVX2) + #if defined(__AVX2__) + /* AVX2 requires arrays >= 512 bytes */ + if (i + 512 <= size) + #else + if ((cpuid & bit_AVX2) && + i + 512 <= size) + #endif + { + const __m256i* ptr256 = (const __m256i*)(ptr + i); + cnt += popcnt_avx2(ptr256, (size - i) / 32); + i = size - size % 32; + } +#endif + +#if defined(HAVE_POPCNT) + /* + * The user has compiled without -mpopcnt. + * Unfortunately the MSVC compiler does not have + * a POPCNT macro so we cannot get rid of the + * runtime check for MSVC. + */ + #if !defined(__POPCNT__) + if (cpuid & bit_POPCNT) + #endif + { + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; + } +#endif + +#if !defined(HAVE_POPCNT) || \ + !defined(__POPCNT__) + /* + * Pure integer popcount algorithm. + * We use unaligned memory accesses here to improve performance. 
+ */ + for (; i < size - size % 8; i += 8) + cnt += popcount64(*(const uint64_t*)(ptr + i)); + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +#endif +} + +#elif defined(__ARM_NEON) || \ + defined(__aarch64__) + +#include + +static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) +{ + return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + uint64_t chunk_size = 64; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= chunk_size) + { + uint64_t iters = size / chunk_size; + uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); + uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); + + do + { + uint8x16_t t0 = zero; + uint8x16_t t1 = zero; + uint8x16_t t2 = zero; + uint8x16_t t3 = zero; + + /* + * After every 31 iterations we need to add the + * temporary sums (t0, t1, t2, t3) to the total sum. + * We must ensure that the temporary sums <= 255 + * and 31 * 8 bits = 248 which is OK. + */ + uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; + + /* Each iteration processes 64 bytes */ + for (; i < limit; i++) + { + uint8x16x4_t input = vld4q_u8(ptr); + ptr += chunk_size; + + t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); + t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); + t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); + t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); + } + + sum = vpadalq(sum, t0); + sum = vpadalq(sum, t1); + sum = vpadalq(sum, t2); + sum = vpadalq(sum, t3); + } + while (i < iters); + + i = 0; + size %= chunk_size; + + uint64_t tmp[2]; + vst1q_u64(tmp, sum); + cnt += tmp[0]; + cnt += tmp[1]; + } + +#if defined(__ARM_FEATURE_UNALIGNED) + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); +#else + if (i + 8 <= size) + { + /* Align memory to an 8 byte boundary; the byte-wise prologue can leave i off a multiple of 8, so the word loop is bounded by i + 8 <= size to stay inside the buffer */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i + 8 <= size; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } +#endif + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +} + +/* all other CPUs */ +#else + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= 8) + { + /* + * Since we don't know whether this CPU architecture + * supports unaligned memory accesses we align + * memory to an 8 byte boundary. Aligning byte-wise can leave i off a multiple of 8, so the word loop is bounded by i + 8 <= size and the trailing byte loop counts whatever remains. 
+ */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i + 8 <= size; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } + + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LIBPOPCNT_H */ diff --git a/include/sqlite_modern_cpp.h b/include/external/sqlite_modern_cpp.h similarity index 100% rename from include/sqlite_modern_cpp.h rename to include/external/sqlite_modern_cpp.h diff --git a/include/external/streamvbyte.h b/include/external/streamvbyte.h new file mode 100644 index 00000000..e88ab080 --- /dev/null +++ b/include/external/streamvbyte.h @@ -0,0 +1,73 @@ +#ifndef INCLUDE_STREAMVBYTE_H_ +#define INCLUDE_STREAMVBYTE_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define STREAMVBYTE_PADDING 16 + +// Encode an array of a given length read from in to out in varint format. +// Returns the number of bytes written. +// The number of values being stored (length) is not encoded in the compressed stream, +// the caller is responsible for keeping a record of this length. +// The pointer "in" should point to "length" values of size uint32_t +// there is no alignment requirement on the out pointer +// For safety, the out pointer should point to at least streamvbyte_max_compressedbytes(length) +// bytes. +// Uses 1,2,3 or 4 bytes per value + the decoding keys. +size_t streamvbyte_encode(const uint32_t* in, uint32_t length, uint8_t* out); + +// same as streamvbyte_encode but 0,1,2 or 4 bytes per value (plus decoding keys) instead of +// using 1,2,3 or 4 bytes. This might be useful when there's a lot of zeros in the input array. 
+size_t streamvbyte_encode_0124(const uint32_t* in, uint32_t length, uint8_t* out); + +// return the maximum number of compressed bytes given length input integers +// in the worst case we overestimate data bytes required by four, see below +// for a function you can run upfront over your data to compute allocations +// It includes the STREAMVBYTE_PADDING bytes. +static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) { + // number of control bytes: one per group of four values, rounded up + size_t cb = (length + 3) / 4; + // maximum number of data bytes: every value stored at its full sizeof(uint32_t) width + size_t db = (size_t)length * sizeof(uint32_t); + return cb + db + STREAMVBYTE_PADDING; +} + +// return the exact number of compressed bytes given length input integers +// runtime in O(n) wrt. in; use streamvbyte_max_compressedbytes if you +// care about speed more than potentially over-allocating memory +// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond +// the compressed data: the user needs to ensure that this region is allocated, and it +// is not included by streamvbyte_compressedbytes. +size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length); + +// return the exact number of compressed bytes given length input integers, for the 0124 format (see streamvbyte_encode_0124) +// runtime in O(n) wrt. in; use streamvbyte_max_compressedbytes if you +// care about speed more than potentially over-allocating memory +// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond +// the compressed data: the user needs to ensure that this region is allocated, and it +// is not included by streamvbyte_compressedbytes_0124. +size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length); + +// Read "length" 32-bit integers in varint format from in, storing the result in out. +// Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes +// from the input buffer (these bytes are read but never used). 
+// The caller is responsible for knowing how many integers ("length") are to be read: +// this information ought to be stored somehow. +// There is no alignment requirement on the "in" pointer. +// The out pointer should point to length * sizeof(uint32_t) bytes. +size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length); + +// Same as streamvbyte_decode but is meant to be used for streams encoded with +// streamvbyte_encode_0124. +size_t streamvbyte_decode_0124(const uint8_t* in, uint32_t* out, uint32_t length); + +#ifdef __cplusplus +} +#endif + +#endif /* INCLUDE_STREAMVBYTE_H_ */ diff --git a/include/external/streamvbyte_zigzag.h b/include/external/streamvbyte_zigzag.h new file mode 100644 index 00000000..9791ae7f --- /dev/null +++ b/include/external/streamvbyte_zigzag.h @@ -0,0 +1,35 @@ +#ifndef INCLUDE_STREAMVBYTE_ZIGZAG_H_ +#define INCLUDE_STREAMVBYTE_ZIGZAG_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Convert N signed integers to N unsigned integers, using zigzag encoding. + */ +void zigzag_encode(const int32_t* in, uint32_t* out, size_t N); + +/** + * Convert N signed integers to N unsigned integers, using zigzag delta encoding. + */ +void zigzag_delta_encode(const int32_t* in, uint32_t* out, size_t N, int32_t prev); + +/** + * Convert N unsigned integers to N signed integers, using zigzag encoding. + */ +void zigzag_decode(const uint32_t* in, int32_t* out, size_t N); + +/** + * Convert N unsigned integers to N signed integers, using zigzag delta encoding. 
+ */ +void zigzag_delta_decode(const uint32_t* in, int32_t* out, size_t N, int32_t prev); + +#ifdef __cplusplus +} +#endif + +#endif /* INCLUDE_STREAMVBYTE_ZIGZAG_H_ */ diff --git a/include/external/streamvbytedelta.h b/include/external/streamvbytedelta.h new file mode 100644 index 00000000..9ee80403 --- /dev/null +++ b/include/external/streamvbytedelta.h @@ -0,0 +1,35 @@ +#ifndef INCLUDE_STREAMVBYTEDELTA_H_ +#define INCLUDE_STREAMVBYTEDELTA_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Encode an array of a given length read from in to out in StreamVByte format. +// Returns the number of bytes written. +// The number of values being stored (length) is not encoded in the compressed stream, +// the caller is responsible for keeping a record of this length. The pointer "in" should +// point to "length" values of size uint32_t, there is no alignment requirement on the out +// pointer. This version uses differential coding (coding differences between values) starting +// at prev (you can often set prev to zero). For safety, the out pointer should point to at least +// streamvbyte_max_compressedbytes(length) bytes (see streamvbyte.h) +size_t streamvbyte_delta_encode(const uint32_t* in, uint32_t length, uint8_t* out, uint32_t prev); + +// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out. +// Returns the number of bytes read. +// We may read up to STREAMVBYTE_PADDING extra bytes from the input buffer (these bytes are +// read but never used). The caller is responsible for knowing how many integers ("length") +// are to be read: this information ought to be stored somehow. There is no alignment requirement +// on the "in" pointer. The out pointer should point to length * sizeof(uint32_t) bytes. This +// version uses differential coding (coding differences between values) starting at prev +// (you can often set prev to zero).
+size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t prev); + +#ifdef __cplusplus +} +#endif + +#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */ diff --git a/include/geom.h b/include/geom.h index 986b2b0a..d47ef1ed 100644 --- a/include/geom.h +++ b/include/geom.h @@ -40,15 +40,6 @@ typedef boost::variant Geometry; typedef std::pair IndexValue; typedef boost::geometry::index::rtree< IndexValue, boost::geometry::index::quadratic<16> > RTree; -// A 36-bit integer can store all OSM node IDs; we represent this as 16 collections -// of 32-bit integers. -#define NODE_SHARDS 16 -typedef uint32_t ShardedNodeID; -typedef uint64_t NodeID; -typedef uint64_t WayID; - -typedef std::vector WayVec; - // Perform self-intersection aware simplification of geometry types Linestring simplify(Linestring const &ls, double max_distance); Polygon simplify(Polygon const &p, double max_distance); diff --git a/include/mbtiles.h b/include/mbtiles.h index 8ed4c670..d81626e9 100644 --- a/include/mbtiles.h +++ b/include/mbtiles.h @@ -5,7 +5,7 @@ #include #include #include -#include "sqlite_modern_cpp.h" +#include "external/sqlite_modern_cpp.h" /** \brief Write to MBTiles (sqlite) database * diff --git a/include/minunit.h b/include/minunit.h new file mode 100644 index 00000000..b019c346 --- /dev/null +++ b/include/minunit.h @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2012 David Siñuela Pastor, siu.4coders@gmail.com + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef MINUNIT_MINUNIT_H +#define MINUNIT_MINUNIT_H + +#ifdef __cplusplus + extern "C" { +#endif + +#if defined(_WIN32) +#include +#if defined(_MSC_VER) && _MSC_VER < 1900 + #define snprintf _snprintf + #define __func__ __FUNCTION__ +#endif + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) + +/* Change POSIX C SOURCE version for pure c99 compilers */ +#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200112L +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200112L +#endif + +#include /* POSIX flags */ +#include /* clock_gettime(), time() */ +#include /* gethrtime(), gettimeofday() */ +#include +#include +#include + +#if defined(__MACH__) && defined(__APPLE__) +#include +#include +#endif + +#if __GNUC__ >= 5 && !defined(__STDC_VERSION__) +#define __func__ __extension__ __FUNCTION__ +#endif + +#else +#error "Unable to define timers for an unknown OS." +#endif + +#include +#include + +/* Maximum length of last message */ +#define MINUNIT_MESSAGE_LEN 1024 +/* Accuracy with which floats are compared */ +#define MINUNIT_EPSILON 1E-12 + +/* Misc. 
counters */ +static int minunit_run = 0; +static int minunit_assert = 0; +static int minunit_fail = 0; +static int minunit_status = 0; + +/* Timers */ +static double minunit_real_timer = 0; +static double minunit_proc_timer = 0; + +/* Last message */ +static char minunit_last_message[MINUNIT_MESSAGE_LEN]; + +/* Test setup and teardown function pointers */ +static void (*minunit_setup)(void) = NULL; +static void (*minunit_teardown)(void) = NULL; + +/* Definitions */ +#define MU_TEST(method_name) static void method_name(void) +#define MU_TEST_SUITE(suite_name) static void suite_name(void) + +#define MU__SAFE_BLOCK(block) do {\ + block\ +} while(0) + +/* Run test suite and unset setup and teardown functions */ +#define MU_RUN_SUITE(suite_name) MU__SAFE_BLOCK(\ + suite_name();\ + minunit_setup = NULL;\ + minunit_teardown = NULL;\ +) + +/* Configure setup and teardown functions */ +#define MU_SUITE_CONFIGURE(setup_fun, teardown_fun) MU__SAFE_BLOCK(\ + minunit_setup = setup_fun;\ + minunit_teardown = teardown_fun;\ +) + +/* Test runner */ +#define MU_RUN_TEST(test) MU__SAFE_BLOCK(\ + if (minunit_real_timer==0 && minunit_proc_timer==0) {\ + minunit_real_timer = mu_timer_real();\ + minunit_proc_timer = mu_timer_cpu();\ + }\ + if (minunit_setup) (*minunit_setup)();\ + minunit_status = 0;\ + test();\ + minunit_run++;\ + if (minunit_status) {\ + minunit_fail++;\ + printf("F");\ + printf("\n%s\n", minunit_last_message);\ + }\ + fflush(stdout);\ + if (minunit_teardown) (*minunit_teardown)();\ +) + +/* Report */ +#define MU_REPORT() MU__SAFE_BLOCK(\ + double minunit_end_real_timer;\ + double minunit_end_proc_timer;\ + printf("\n\n%d tests, %d assertions, %d failures\n", minunit_run, minunit_assert, minunit_fail);\ + minunit_end_real_timer = mu_timer_real();\ + minunit_end_proc_timer = mu_timer_cpu();\ + printf("\nFinished in %.8f seconds (real) %.8f seconds (proc)\n\n",\ + minunit_end_real_timer - minunit_real_timer,\ + minunit_end_proc_timer - minunit_proc_timer);\ +) +#define 
MU_EXIT_CODE minunit_fail + +/* Assertions */ +#define mu_check(test) MU__SAFE_BLOCK(\ + minunit_assert++;\ + if (!(test)) {\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", __func__, __FILE__, __LINE__, #test);\ + minunit_status = 1;\ + return;\ + } else {\ + printf(".");\ + }\ +) + +#define mu_fail(message) MU__SAFE_BLOCK(\ + minunit_assert++;\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", __func__, __FILE__, __LINE__, message);\ + minunit_status = 1;\ + return;\ +) + +#define mu_assert(test, message) MU__SAFE_BLOCK(\ + minunit_assert++;\ + if (!(test)) {\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %s", __func__, __FILE__, __LINE__, message);\ + minunit_status = 1;\ + return;\ + } else {\ + printf(".");\ + }\ +) + +#define mu_assert_int_eq(expected, result) MU__SAFE_BLOCK(\ + int minunit_tmp_e;\ + int minunit_tmp_r;\ + minunit_assert++;\ + minunit_tmp_e = (expected);\ + minunit_tmp_r = (result);\ + if (minunit_tmp_e != minunit_tmp_r) {\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %d expected but was %d", __func__, __FILE__, __LINE__, minunit_tmp_e, minunit_tmp_r);\ + minunit_status = 1;\ + return;\ + } else {\ + printf(".");\ + }\ +) + +#define mu_assert_double_eq(expected, result) MU__SAFE_BLOCK(\ + double minunit_tmp_e;\ + double minunit_tmp_r;\ + minunit_assert++;\ + minunit_tmp_e = (expected);\ + minunit_tmp_r = (result);\ + if (fabs(minunit_tmp_e-minunit_tmp_r) > MINUNIT_EPSILON) {\ + int minunit_significant_figures = 1 - log10(MINUNIT_EPSILON);\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: %.*g expected but was %.*g", __func__, __FILE__, __LINE__, minunit_significant_figures, minunit_tmp_e, minunit_significant_figures, minunit_tmp_r);\ + minunit_status = 1;\ + return;\ + } else {\ + printf(".");\ + }\ +) + +#define mu_assert_string_eq(expected, result) MU__SAFE_BLOCK(\ + const char* minunit_tmp_e 
= expected;\ + const char* minunit_tmp_r = result;\ + minunit_assert++;\ + if (!minunit_tmp_e) {\ + minunit_tmp_e = "";\ + }\ + if (!minunit_tmp_r) {\ + minunit_tmp_r = "";\ + }\ + if(strcmp(minunit_tmp_e, minunit_tmp_r)) {\ + snprintf(minunit_last_message, MINUNIT_MESSAGE_LEN, "%s failed:\n\t%s:%d: '%s' expected but was '%s'", __func__, __FILE__, __LINE__, minunit_tmp_e, minunit_tmp_r);\ + minunit_status = 1;\ + return;\ + } else {\ + printf(".");\ + }\ +) + +/* + * The following two functions were written by David Robert Nadeau + * from http://NadeauSoftware.com/ and distributed under the + * Creative Commons Attribution 3.0 Unported License + */ + +/** + * Returns the real time, in seconds, or -1.0 if an error occurred. + * + * Time is measured since an arbitrary and OS-dependent start time. + * The returned real time is only useful for computing an elapsed time + * between two calls to this function. + */ +static double mu_timer_real(void) +{ +#if defined(_WIN32) + /* Windows 2000 and later. ---------------------------------- */ + LARGE_INTEGER Time; + LARGE_INTEGER Frequency; + + QueryPerformanceFrequency(&Frequency); + QueryPerformanceCounter(&Time); + + Time.QuadPart *= 1000000; + Time.QuadPart /= Frequency.QuadPart; + + return (double)Time.QuadPart / 1000000.0; + +#elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__))) + /* HP-UX, Solaris. ------------------------------------------ */ + return (double)gethrtime( ) / 1000000000.0; + +#elif defined(__MACH__) && defined(__APPLE__) + /* OSX. ----------------------------------------------------- */ + static double timeConvert = 0.0; + if ( timeConvert == 0.0 ) + { + mach_timebase_info_data_t timeBase; + (void)mach_timebase_info( &timeBase ); + timeConvert = (double)timeBase.numer / + (double)timeBase.denom / + 1000000000.0; + } + return (double)mach_absolute_time( ) * timeConvert; + +#elif defined(_POSIX_VERSION) + /* POSIX. 
--------------------------------------------------- */ + struct timeval tm; +#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) + { + struct timespec ts; +#if defined(CLOCK_MONOTONIC_PRECISE) + /* BSD. --------------------------------------------- */ + const clockid_t id = CLOCK_MONOTONIC_PRECISE; +#elif defined(CLOCK_MONOTONIC_RAW) + /* Linux. ------------------------------------------- */ + const clockid_t id = CLOCK_MONOTONIC_RAW; +#elif defined(CLOCK_HIGHRES) + /* Solaris. ----------------------------------------- */ + const clockid_t id = CLOCK_HIGHRES; +#elif defined(CLOCK_MONOTONIC) + /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ + const clockid_t id = CLOCK_MONOTONIC; +#elif defined(CLOCK_REALTIME) + /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ + const clockid_t id = CLOCK_REALTIME; +#else + const clockid_t id = (clockid_t)-1; /* Unknown. */ +#endif /* CLOCK_* */ + if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 ) + return (double)ts.tv_sec + + (double)ts.tv_nsec / 1000000000.0; + /* Fall thru. */ + } +#endif /* _POSIX_TIMERS */ + + /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ + gettimeofday( &tm, NULL ); + return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; +#else + return -1.0; /* Failed. */ +#endif +} + +/** + * Returns the amount of CPU time used by the current process, + * in seconds, or -1.0 if an error occurred. + */ +static double mu_timer_cpu(void) +{ +#if defined(_WIN32) + /* Windows -------------------------------------------------- */ + FILETIME createTime; + FILETIME exitTime; + FILETIME kernelTime; + FILETIME userTime; + + /* This approach has a resolution of 1/64 second. 
Unfortunately, Windows' API does not offer better */ + if ( GetProcessTimes( GetCurrentProcess( ), + &createTime, &exitTime, &kernelTime, &userTime ) != 0 ) + { + ULARGE_INTEGER userSystemTime; + memcpy(&userSystemTime, &userTime, sizeof(ULARGE_INTEGER)); + return (double)userSystemTime.QuadPart / 10000000.0; + } + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) + /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, and Solaris --------- */ + +#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) + /* Prefer high-res POSIX timers, when available. */ + { + clockid_t id; + struct timespec ts; +#if _POSIX_CPUTIME > 0 + /* Clock ids vary by OS. Query the id, if possible. */ + if ( clock_getcpuclockid( 0, &id ) == -1 ) +#endif +#if defined(CLOCK_PROCESS_CPUTIME_ID) + /* Use known clock id for AIX, Linux, or Solaris. */ + id = CLOCK_PROCESS_CPUTIME_ID; +#elif defined(CLOCK_VIRTUAL) + /* Use known clock id for BSD or HP-UX. */ + id = CLOCK_VIRTUAL; +#else + id = (clockid_t)-1; +#endif + if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 ) + return (double)ts.tv_sec + + (double)ts.tv_nsec / 1000000000.0; + } +#endif + +#if defined(RUSAGE_SELF) + { + struct rusage rusage; + if ( getrusage( RUSAGE_SELF, &rusage ) != -1 ) + return (double)rusage.ru_utime.tv_sec + + (double)rusage.ru_utime.tv_usec / 1000000.0; + } +#endif + +#if defined(_SC_CLK_TCK) + { + const double ticks = (double)sysconf( _SC_CLK_TCK ); + struct tms tms; + if ( times( &tms ) != (clock_t)-1 ) + return (double)tms.tms_utime / ticks; + } +#endif + +#if defined(CLOCKS_PER_SEC) + { + clock_t cl = clock( ); + if ( cl != (clock_t)-1 ) + return (double)cl / (double)CLOCKS_PER_SEC; + } +#endif + +#endif + + return -1; /* Failed. 
*/ +} + +#ifdef __cplusplus +} +#endif + +#endif /* MINUNIT_MINUNIT_H */ \ No newline at end of file diff --git a/include/mmap_allocator.h b/include/mmap_allocator.h new file mode 100644 index 00000000..1130c80f --- /dev/null +++ b/include/mmap_allocator.h @@ -0,0 +1,66 @@ +#ifndef _MMAP_ALLOCATOR_H +#define _MMAP_ALLOCATOR_H + +#include +#include + +class void_mmap_allocator +{ +public: + typedef std::size_t size_type; + + static void *allocate(size_type n, const void *hint = 0); + static void deallocate(void *p, size_type n); + static void destroy(void *p); + static void shutdown(); + static void reportStoreSize(std::ostringstream &str); + static void openMmapFile(const std::string& mmapFilename); +}; + +template +class mmap_allocator +{ +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T *pointer; + typedef const T *const_pointer; + typedef const T &const_reference; + typedef T value_type; + + template + struct rebind + { + typedef mmap_allocator other; + }; + + mmap_allocator() = default; + + template + mmap_allocator(OtherT &) + { } + + pointer allocate(size_type n, const void *hint = 0) + { + return reinterpret_cast(void_mmap_allocator::allocate(n * sizeof(T), hint)); + } + + void deallocate(pointer p, size_type n) + { + void_mmap_allocator::deallocate(p, n); + } + + void construct(pointer p, const_reference val) + { + new((void *)p) T(val); + } + + void destroy(pointer p) { void_mmap_allocator::destroy(p); } +}; + +template +static inline bool operator==(mmap_allocator &, mmap_allocator &) { return true; } +template +static inline bool operator!=(mmap_allocator &, mmap_allocator &) { return false; } + +#endif diff --git a/include/node_store.h b/include/node_store.h new file mode 100644 index 00000000..cc84aba2 --- /dev/null +++ b/include/node_store.h @@ -0,0 +1,31 @@ +#ifndef _NODE_STORE_H +#define _NODE_STORE_H + +#include "coordinates.h" +#include + +class NodeStore +{ +public: + using element_t = std::pair; + + // Polymorphic base: a virtual destructor makes destroying a derived store through a NodeStore pointer well-defined. + virtual ~NodeStore() = default; + + 
// Mutators + virtual void insert(const std::vector& elements) = 0; + virtual void clear() = 0; + virtual void reopen() = 0; + // Run on each thread when a batch of blocks is started. Only + // meaningful for SortedNodeStore + virtual void batchStart() = 0; + + // Run on a single-thread, after all nodes have been inserted. + virtual void finalize(size_t threadNum) = 0; + + // Accessors + virtual size_t size() const = 0; + virtual LatpLon at(NodeID i) const = 0; +}; + +#endif diff --git a/include/node_stores.h b/include/node_stores.h new file mode 100644 index 00000000..c5151bec --- /dev/null +++ b/include/node_stores.h @@ -0,0 +1,72 @@ +#ifndef _NODE_STORES_H +#define _NODE_STORES_H + +#include +#include +#include "node_store.h" +#include "sorted_node_store.h" +#include "mmap_allocator.h" + +class BinarySearchNodeStore : public NodeStore +{ + +public: + using internal_element_t = std::pair; + using map_t = std::deque>; + + void reopen() override; + void finalize(size_t threadNum) override; + LatpLon at(NodeID i) const override; + size_t size() const override; + void insert(const std::vector& elements) override; + void clear() override { + reopen(); + } + void batchStart() override {} + +private: + mutable std::mutex mutex; + std::vector> mLatpLons; + + uint32_t shardPart(NodeID id) const { + uint32_t rv = id >> 32; + return rv; + } + + uint32_t idPart(NodeID id) const { return id; } +}; + +class CompactNodeStore : public NodeStore +{ + +public: + using element_t = std::pair; + using map_t = std::deque>; + + void reopen() override; + LatpLon at(NodeID i) const override; + size_t size() const override; + void insert(const std::vector& elements) override; + void clear() override; + void finalize(size_t numThreads) override {} + void batchStart() override {} + +private: + // @brief Insert a latp/lon pair. 
+ // @param i OSM ID of a node + // @param coord a latp/lon pair to be inserted + // @invariant The OSM ID i must be larger than previously inserted OSM IDs of nodes + // (though unnecessarily for current impl, future impl may impose that) + void insert_back(NodeID i, LatpLon coord) { + if(i >= mLatpLons->size()) + mLatpLons->resize(i + 1); + (*mLatpLons)[i] = coord; + } + + + mutable std::mutex mutex; + std::shared_ptr mLatpLons; +}; + + +#endif diff --git a/include/osm_lua_processing.h b/include/osm_lua_processing.h index 6fd28cb0..291ea0b3 100644 --- a/include/osm_lua_processing.h +++ b/include/osm_lua_processing.h @@ -23,7 +23,7 @@ extern "C" { #include "lauxlib.h" } -#include "kaguya.hpp" +#include "external/kaguya.hpp" // FIXME: why is this global ? extern bool verbose; diff --git a/include/osm_store.h b/include/osm_store.h index 3debeb0f..11158bb2 100644 --- a/include/osm_store.h +++ b/include/osm_store.h @@ -4,6 +4,7 @@ #include "geom.h" #include "coordinates.h" +#include "mmap_allocator.h" #include #include @@ -13,219 +14,12 @@ extern bool verbose; -class void_mmap_allocator -{ -public: - typedef std::size_t size_type; - - static void *allocate(size_type n, const void *hint = 0); - static void deallocate(void *p, size_type n); - static void destroy(void *p); - static void shutdown(); -}; - -template -class mmap_allocator -{ - -public: - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T *pointer; - typedef const T *const_pointer; - typedef const T &const_reference; - typedef T value_type; - - template - struct rebind - { - typedef mmap_allocator other; - }; - - mmap_allocator() = default; - - template - mmap_allocator(OtherT &) - { } - - pointer allocate(size_type n, const void *hint = 0) - { - return reinterpret_cast(void_mmap_allocator::allocate(n * sizeof(T), hint)); - } - - void deallocate(pointer p, size_type n) - { - void_mmap_allocator::deallocate(p, n); - } - - void construct(pointer p, const_reference val) - { - 
new((void *)p) T(val); - } - - void destroy(pointer p) { void_mmap_allocator::destroy(p); } -}; - -template -static inline bool operator==(mmap_allocator &, mmap_allocator &) { return true; } -template -static inline bool operator!=(mmap_allocator &, mmap_allocator &) { return false; } +class NodeStore; +class WayStore; // // Internal data structures. // -class NodeStore -{ - -public: - using element_t = std::pair; - using internal_element_t = std::pair; - using map_t = std::deque>; - - void reopen() - { - std::lock_guard lock(mutex); - for (auto i = 0; i < mLatpLons.size(); i++) - mLatpLons[i]->clear(); - - mLatpLons.clear(); - for (auto i = 0; i < NODE_SHARDS; i++) { - mLatpLons.push_back(std::make_unique()); - } - } - - // @brief Lookup a latp/lon pair - // @param i OSM ID of a node - // @return Latp/lon pair - // @exception NotFound - LatpLon at(NodeID i) const { - auto shard = mLatpLons[shardPart(i)]; - auto id = idPart(i); - - auto iter = std::lower_bound(shard->begin(), shard->end(), id, [](auto const &e, auto i) { - return e.first < i; - }); - - if(iter == shard->end() || iter->first != id) - throw std::out_of_range("Could not find node with id " + std::to_string(i)); - - return iter->second; - } - - // @brief Return the number of stored items - size_t size() const { - std::lock_guard lock(mutex); - uint64_t size = 0; - for (auto i = 0; i < mLatpLons.size(); i++) - size += mLatpLons[i]->size(); - - return size; - } - - void insert_back(std::vector const &element) { - uint32_t newEntries[NODE_SHARDS] = {}; - std::vector iterators; - - // Before taking the lock, do a pass to find out how much - // to grow each backing collection - for (auto it = element.begin(); it != element.end(); it++) { - newEntries[shardPart(it->first)]++; - } - - std::lock_guard lock(mutex); - for (auto i = 0; i < NODE_SHARDS; i++) { - auto size = mLatpLons[i]->size(); - mLatpLons[i]->resize(size + newEntries[i]); - iterators.push_back(mLatpLons[i]->begin() + size); - } - - for (auto it 
= element.begin(); it != element.end(); it++) { - uint32_t shard = shardPart(it->first); - uint32_t id = idPart(it->first); - - *iterators[shard] = std::make_pair(id, it->second); - iterators[shard]++; - } - } - - // @brief Make the store empty - void clear() { - reopen(); - } - - void sort(unsigned int threadNum); - -private: - mutable std::mutex mutex; - std::vector> mLatpLons; - - uint32_t shardPart(NodeID id) const { - uint32_t rv = id >> 32; - return rv; - } - - uint32_t idPart(NodeID id) const { - return id; - } -}; - -class CompactNodeStore -{ - -public: - using element_t = std::pair; - using map_t = std::deque>; - - void reopen() - { - std::lock_guard lock(mutex); - mLatpLons = std::make_unique(); - } - - // @brief Lookup a latp/lon pair - // @param i OSM ID of a node - // @return Latp/lon pair - // @exception NotFound - LatpLon at(NodeID i) const { - if(i >= mLatpLons->size()) - throw std::out_of_range("Could not find node with id " + std::to_string(i)); - return mLatpLons->at(i); - } - - // @brief Return the number of stored items - size_t size() const { - std::lock_guard lock(mutex); - return mLatpLons->size(); - } - - // @brief Insert a latp/lon pair. 
- // @param i OSM ID of a node - // @param coord a latp/lon pair to be inserted - // @invariant The OSM ID i must be larger than previously inserted OSM IDs of nodes - // (though unnecessarily for current impl, future impl may impose that) - void insert_back(NodeID i, LatpLon coord) { - if(i >= mLatpLons->size()) - mLatpLons->resize(i + 1); - (*mLatpLons)[i] = coord; - } - - void insert_back(std::vector const &elements) { - std::lock_guard lock(mutex); - for(auto const &i: elements) - insert_back(i.first, i.second); - } - - // @brief Make the store empty - void clear() { - std::lock_guard lock(mutex); - mLatpLons->clear(); - } - -private: - mutable std::mutex mutex; - std::shared_ptr mLatpLons; -}; - // list of ways used by relations // by noting these in advance, we don't need to store all ways in the store class UsedWays { @@ -238,6 +32,9 @@ class UsedWays { bool inited = false; // Size the vector to a reasonable estimate, to avoid resizing on the fly + // TODO: it'd be preferable if UsedWays didn't know about compact mode -- + // instead, use an efficient data structure if numNodes < 1B, otherwise + // use a large bitvector void reserve(bool compact, size_t numNodes) { std::lock_guard lock(mutex); if (inited) return; @@ -308,64 +105,6 @@ class RelationScanStore { } }; -// way store -class WayStore { - -public: - using latplon_vector_t = std::vector>; - using element_t = std::pair; - using map_t = std::deque>; - - void reopen() { - mLatpLonLists = std::make_unique(); - } - - // @brief Lookup a node list - // @param i OSM ID of a way - // @return A node list - // @exception NotFound - latplon_vector_t const &at(WayID wayid) const { - std::lock_guard lock(mutex); - - auto iter = std::lower_bound(mLatpLonLists->begin(), mLatpLonLists->end(), wayid, [](auto const &e, auto wayid) { - return e.first < wayid; - }); - - if(iter == mLatpLonLists->end() || iter->first != wayid) - throw std::out_of_range("Could not find way with id " + std::to_string(wayid)); - - return 
iter->second; - } - - // @brief Insert a node list. - // @param i OSM ID of a way - // @param llVec a coordinate vector to be inserted - // @invariant The OSM ID i must be larger than previously inserted OSM IDs of ways - // (though unnecessarily for current impl, future impl may impose that) - void insert_back(std::vector &new_ways) { - std::lock_guard lock(mutex); - auto i = mLatpLonLists->size(); - mLatpLonLists->resize(i + new_ways.size()); - std::copy(std::make_move_iterator(new_ways.begin()), std::make_move_iterator(new_ways.end()), mLatpLonLists->begin() + i); - } - - // @brief Make the store empty - void clear() { - std::lock_guard lock(mutex); - mLatpLonLists->clear(); - } - - std::size_t size() const { - std::lock_guard lock(mutex); - return mLatpLonLists->size(); - } - - void sort(unsigned int threadNum); - -private: - mutable std::mutex mutex; - std::unique_ptr mLatpLonLists; -}; // relation store // (this isn't currently used as we don't need to store relations for later processing, but may be needed for nested relations) @@ -425,57 +164,33 @@ class RelationStore { */ class OSMStore { +public: + NodeStore& nodes; + WayStore& ways; + protected: - NodeStore nodes; - CompactNodeStore compact_nodes; bool use_compact_nodes = false; bool require_integrity = true; - WayStore ways; RelationStore relations; // unused UsedWays used_ways; RelationScanStore scanned_relations; - void reopen() { - nodes.reopen(); - compact_nodes.reopen(); - ways.reopen(); - relations.reopen(); - } - public: - OSMStore() + OSMStore(NodeStore& nodes, WayStore& ways): nodes(nodes), ways(ways) { reopen(); } + void reopen(); + void open(std::string const &osm_store_filename); - void use_compact_store(bool use = true) { use_compact_nodes = use; } - void enforce_integrity(bool ei = true) { require_integrity = ei; } + void use_compact_store(bool use) { use_compact_nodes = use; } + void enforce_integrity(bool ei) { require_integrity = ei; } bool integrity_enforced() { return 
require_integrity; } - void nodes_insert_back(std::vector const &new_nodes) { - if(!use_compact_nodes) - nodes.insert_back(new_nodes); - else - compact_nodes.insert_back(new_nodes); - } - void nodes_sort(unsigned int threadNum); - std::size_t nodes_size() { - return use_compact_nodes ? compact_nodes.size() : nodes.size(); - } - - LatpLon nodes_at(NodeID i) const { - return use_compact_nodes ? compact_nodes.at(i) : nodes.at(i); - } - - void ways_insert_back(std::vector &new_ways) { - ways.insert_back(new_ways); - } - void ways_sort(unsigned int threadNum); - void relations_insert_front(std::vector &new_relations) { relations.insert_front(new_relations); } @@ -483,10 +198,9 @@ class OSMStore void mark_way_used(WayID i) { used_ways.insert(i); } bool way_is_used(WayID i) { return used_ways.at(i); } - void ensure_used_ways_inited() { - if (!used_ways.inited) used_ways.reserve(use_compact_nodes, nodes_size()); - } - + + void ensureUsedWaysInited(); + using tag_map_t = boost::container::flat_map; void relation_contains_way(WayID relid, WayID wayid) { scanned_relations.relation_contains_way(relid,wayid); } void store_relation_tags(WayID relid, const tag_map_t &tags) { scanned_relations.store_relation_tags(relid,tags); } @@ -494,15 +208,7 @@ class OSMStore std::vector relations_for_way(WayID wayid) { return scanned_relations.relations_for_way(wayid); } std::string get_relation_tag(WayID relid, const std::string &key) { return scanned_relations.get_relation_tag(relid, key); } - void clear() { - nodes.clear(); - compact_nodes.clear(); - ways.clear(); - relations.clear(); - used_ways.clear(); - } - - void reportStoreSize(std::ostringstream &str); + void clear(); void reportSize() const; // Relation -> MultiPolygon or MultiLinestring diff --git a/include/read_pbf.h b/include/read_pbf.h index 7761dc58..89b88a15 100644 --- a/include/read_pbf.h +++ b/include/read_pbf.h @@ -14,6 +14,28 @@ class OsmLuaProcessing; +extern const std::string OptionSortTypeThenID; +extern const 
std::string OptionLocationsOnWays; + +struct BlockMetadata { + long int offset; + google::protobuf::int32 length; + bool hasNodes; + bool hasWays; + bool hasRelations; + + // We use blocks as the unit of parallelism. Sometimes, a PBF only + // has a few blocks with relations. In this case, to keep all cores + // busy, we'll subdivide the block into chunks, and each thread + // will only process a chunk of the block. + size_t chunk; + size_t chunks; +}; + +struct IndexedBlockMetadata: BlockMetadata { + size_t index; +}; + /** *\brief Reads a PBF OSM file and returns objects as a stream of events to a class derived from OsmLuaProcessing * @@ -22,16 +44,20 @@ class OsmLuaProcessing; class PbfReader { public: - enum class ReadPhase { Nodes = 1, Ways = 2, Relations = 4, RelationScan = 8, All = 15 }; + enum class ReadPhase { Nodes = 1, Ways = 2, Relations = 4, RelationScan = 8 }; PbfReader(OSMStore &osmStore); using pbfreader_generate_output = std::function< std::shared_ptr () >; using pbfreader_generate_stream = std::function< std::shared_ptr () >; - int ReadPbfFile(std::unordered_set const &nodeKeys, unsigned int threadNum, - pbfreader_generate_stream const &generate_stream, - pbfreader_generate_output const &generate_output); + int ReadPbfFile( + bool hasSortTypeThenID, + const std::unordered_set& nodeKeys, + unsigned int threadNum, + const pbfreader_generate_stream& generate_stream, + const pbfreader_generate_output& generate_output + ); // Read tags into a map from a way/node/relation using tag_map_t = boost::container::flat_map; @@ -46,13 +72,24 @@ class PbfReader } private: - bool ReadBlock(std::istream &infile, OsmLuaProcessing &output, std::pair progress, std::size_t datasize, - std::unordered_set const &nodeKeys, bool locationsOnWays, ReadPhase phase = ReadPhase::All); + bool ReadBlock( + std::istream &infile, + OsmLuaProcessing &output, + const BlockMetadata& blockMetadata, + const std::unordered_set& nodeKeys, + bool locationsOnWays, + ReadPhase phase + ); 
bool ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const std::unordered_set &nodeKeyPositions); bool ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays); bool ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb); - bool ReadRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb); + bool ReadRelations( + OsmLuaProcessing& output, + PrimitiveGroup& pg, + const PrimitiveBlock& pb, + const BlockMetadata& blockMetadata + ); inline bool RelationIsType(Relation const &rel, int typeKey, int val) { if (typeKey==-1 || val==-1) return false; @@ -71,4 +108,6 @@ class PbfReader int ReadPbfBoundingBox(const std::string &inputFile, double &minLon, double &maxLon, double &minLat, double &maxLat, bool &hasClippingBox); +bool PbfHasOptionalFeature(const std::string& inputFile, const std::string& feature); + #endif //_READ_PBF_H diff --git a/include/read_shp.h b/include/read_shp.h index 5c24af8e..ed3a0f2a 100644 --- a/include/read_shp.h +++ b/include/read_shp.h @@ -9,7 +9,6 @@ #include "geom.h" #include "output_object.h" #include "osm_lua_processing.h" -#include "kaguya.hpp" #include "attribute_store.h" // Shapelib diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h new file mode 100644 index 00000000..5c156ad3 --- /dev/null +++ b/include/sorted_node_store.h @@ -0,0 +1,89 @@ +#ifndef _SORTED_NODE_STORE_H +#define _SORTED_NODE_STORE_H + +#include "node_store.h" +#include "mmap_allocator.h" +#include +#include +#include + +// SortedNodeStore requires the Sort.Type_then_ID property on the source PBF. +// +// It stores nodes in chunks of 256, and chunks in groups of 256. +// Access to a node given its NodeID is constant time. +// +// Additional memory usage varies, approaching 1% for very large PBFs. + +namespace SortedNodeStoreTypes { + struct ChunkInfoBase { + // If high bit is set, this is a compressed chunk. 
+ // Bits 0..9 are the length of the compressed lons. + // Bits 10..19 are the length of the compressed lats. + // The upper-most bit should be set iff this is a compressed chunk. + uint32_t flags; + // A bitmask indicating how many nodes are in this chunk. + uint8_t nodeMask[32]; + }; + + struct CompressedChunkInfo: ChunkInfoBase { + // streamvbyte_decode needs N, the size of the original array. + // N is popcnt(nodeMask) - 1. + // data is zigzag delta encoded, so we need firstLatp and firstLon to recover it. + int32_t firstLatp; + int32_t firstLon; + uint8_t data[0]; + }; + + struct UncompressedChunkInfo: ChunkInfoBase { + LatpLon nodes[0]; + }; + + struct GroupInfo { + // A bitmask indicating how many chunks are in this group. + uint8_t chunkMask[32]; + + // There is an entry for each set bit in chunkMask. They identify + // the address of a ChunkInfo. The address is relative to the end + // of the GroupInfo struct + // + // e.g. given an offset 12, the chunk is located at + // &chunkOffsets[popcnt(chunkMask)] + offset * 8. + uint16_t chunkOffsets[0]; + }; +} + + +class SortedNodeStore : public NodeStore +{ + +public: + SortedNodeStore(bool compressNodes); + ~SortedNodeStore(); + void reopen() override; + void finalize(size_t threadNum) override; + LatpLon at(NodeID i) const override; + size_t size() const override; + void batchStart() override; + void insert(const std::vector& elements) override; + void clear() { + reopen(); + } + +private: + // When true, store chunks compressed. Only store compressed if the + // chunk is sufficiently large. + bool compressNodes; + + mutable std::mutex orphanageMutex; + std::vector groups; + std::vector> allocatedMemory; + + // The orphanage stores nodes that come from groups that may be worked on by + // multiple threads.
They'll get folded into the index during finalize() + std::map> orphanage; + std::vector> workerBuffers; + void collectOrphans(const std::vector& orphans); + void publishGroup(const std::vector& nodes); +}; + +#endif diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h new file mode 100644 index 00000000..145e467b --- /dev/null +++ b/include/sorted_way_store.h @@ -0,0 +1,124 @@ +#ifndef _SORTED_WAY_STORE_H +#define _SORTED_WAY_STORE_H + +#include +#include +#include +#include "way_store.h" +#include "mmap_allocator.h" + +class NodeStore; + +// Like SortedNodeStore, but for ways. +// +// Ways are variable length, whereas nodes are a fixed 8 bytes. +// +// This is important for two reasons: +// - we were able to directly calculate the offset of the node in a chunk (the size is fixed) +// - we could fit the offsets of chunk in a short (the size is small) +// +// Per https://wiki.openstreetmap.org/wiki/Way, a way can have at most 2,000 nodes. +// +// In practice, most ways have far fewer than 2,000 nodes. +// for NS: p50=7, p90=32, p95=54, p99=161 +// for GB: p50=5, p90=19, p95=30, p99=82 +// for ON: p50=8, p90=31, p95=54, p99=172 +// +// That is, 50% of the time, ways have 8 or fewer nodes. 90% of the time, +// they have 32 or fewer nodes. + +namespace SortedWayStoreTypes { + + struct EncodedWay { + // A way can have 2000 nodes. + // Bits 0..10 track how many nodes are in this way. + // That leaves 5 bits for activities: + // ab0xx: bits 31..34 of node ID are interwoven as bytes. + // ab1xx: bits 31..34 of node ID are the same, stored as first byte + // + // 1xxxx: This way is stored zigzag encoded. + // z1zzz: This is a closed way, repeat the first node as the last node. + // + // When it's compressed, we still handle high bits the same, + // but the low bytes are compressed. + // + // We'd need to add a compressedLength, but otherwise it'd + // be the same. 
+ uint16_t flags; + // Data could be: + // (if interwoven bit) N/2 bytes: interwoven high bits + // (if compression bit) 2 bytes: compressed length + // (if compression bit) 4 bytes: first 32-bit value + // N 32-bit ints: the N low ints + uint8_t data[0]; + }; + + struct ChunkInfo { + // Bitmasks indicating which ways are in this chunk. + // Small ways: these are ways that can be stored in 256 bytes or less, + // they can be identified with a scale of 1 relative to end of wayOffsets. + // + // We expect 60-80% of ways to be small ways. + // + // Big ways: these are ways that require more than 256 bytes, + // they can be identified with a scale of 64 relative to start of chunk. + uint8_t smallWayMask[32]; + uint8_t bigWayMask[32]; + + uint16_t wayOffsets[0]; + }; + + struct GroupInfo { + // A bitmask indicating how many chunks are in this group. + uint8_t chunkMask[32]; + + // There is an entry for each set bit in chunkMask. They identify + // the address of a ChunkInfo. The address is relative to the end + // of the GroupInfo struct. 
+ uint32_t chunkOffsets[0]; + }; +} + +class SortedWayStore: public WayStore { + +public: + SortedWayStore(bool compressWays, const NodeStore& nodeStore); + ~SortedWayStore(); + void reopen() override; + void batchStart() override; + std::vector at(WayID wayid) const override; + bool requiresNodes() const override { return true; } + void insertLatpLons(std::vector &newWays) override; + const void insertNodes(const std::vector>>& newWays) override; + void clear() override; + std::size_t size() const override; + void finalize(unsigned int threadNum) override; + + static uint16_t encodeWay( + const std::vector& way, + std::vector& output, + bool compress + ); + + static std::vector decodeWay(uint16_t flags, const uint8_t* input); + +private: + bool compressWays; + const NodeStore& nodeStore; + mutable std::mutex orphanageMutex; + std::vector groups; + std::vector> allocatedMemory; + + // The orphanage stores ways that come from groups that may be worked on by + // multiple threads. They'll get folded into the index during finalize() + std::map>>> orphanage; + std::vector>>> workerBuffers; + void collectOrphans(const std::vector>>& orphans); + void publishGroup(const std::vector>>& ways); +}; + + +// TODO: consider extracting this for SortedNodeStore if we rewrite that class +void populateMask(uint8_t* mask, const std::vector& ids); + +#endif diff --git a/include/tile_data.h b/include/tile_data.h index caaf0392..c453b72e 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -13,6 +13,8 @@ typedef std::set TileCoordinatesSet; typedef std::vector SourceList; +class TileBbox; + // We cluster output objects by z6 tile #define CLUSTER_ZOOM 6 #define CLUSTER_ZOOM_WIDTH (1 << CLUSTER_ZOOM) diff --git a/include/way_store.h b/include/way_store.h new file mode 100644 index 00000000..8650cbea --- /dev/null +++ b/include/way_store.h @@ -0,0 +1,26 @@ +#ifndef _WAY_STORE_H +#define _WAY_STORE_H + +#include +#include "coordinates.h" +#include "mmap_allocator.h" + +class
WayStore { +public: + using latplon_vector_t = std::vector>; + using ll_element_t = std::pair; + + virtual void reopen() = 0; + // Run on each thread when a batch of blocks is started. Only + // meaningful for SortedWayStore + virtual void batchStart() = 0; + virtual std::vector at(WayID wayid) const = 0; + virtual bool requiresNodes() const = 0; + virtual void insertLatpLons(std::vector& newWays) = 0; + virtual const void insertNodes(const std::vector>>& newWays) = 0; + virtual void clear() = 0; + virtual std::size_t size() const = 0; + virtual void finalize(unsigned int threadNum) = 0; +}; + +#endif diff --git a/include/way_stores.h b/include/way_stores.h new file mode 100644 index 00000000..dfb5f74c --- /dev/null +++ b/include/way_stores.h @@ -0,0 +1,29 @@ +#ifndef _WAY_STORES_H +#define _WAY_STORES_H + +#include +#include +#include "way_store.h" +#include "sorted_way_store.h" + +class BinarySearchWayStore: public WayStore { + +public: + using map_t = std::deque>; + + void reopen() override; + void batchStart() override {} + std::vector at(WayID wayid) const override; + bool requiresNodes() const override { return false; } + void insertLatpLons(std::vector &newWays) override; + const void insertNodes(const std::vector>>& newWays) override; + void clear() override; + std::size_t size() const override; + void finalize(unsigned int threadNum) override; + +private: + mutable std::mutex mutex; + std::unique_ptr mLatpLonLists; +}; + +#endif diff --git a/include/write_geometry.h b/include/write_geometry.h index 8ec29f23..8d1d014b 100644 --- a/include/write_geometry.h +++ b/include/write_geometry.h @@ -6,6 +6,7 @@ #include #include #include "coordinates.h" +#include "coordinates_geom.h" // Protobuf #include "osmformat.pb.h" @@ -13,6 +14,8 @@ typedef std::vector > XYString; +class TileBbox; + /** \brief WriteGeometryVisitor takes a boost::geometry object and writes it into a tile */ diff --git a/src/coordinates.cpp b/src/coordinates.cpp index e5c107d8..3e8ea2f2 100644 
--- a/src/coordinates.cpp +++ b/src/coordinates.cpp @@ -1,8 +1,6 @@ #include "coordinates.h" #include - -using namespace std; -namespace geom = boost::geometry; +#include TileCoordinates_::TileCoordinates_() { this->x = 0; @@ -26,18 +24,18 @@ double lat2latp(double lat) { return rad2deg(log(tan(deg2rad(clamp(lat,85.06)+90 double latp2lat(double latp) { return rad2deg(atan(exp(deg2rad(latp)))*2.0)-90.0; } // Tile conversions -double lon2tilexf(double lon, uint z) { return scalbn((lon+180.0) * (1/360.0), (int)z); } -double latp2tileyf(double latp, uint z) { return scalbn((180.0-latp) * (1/360.0), (int)z); } -double lat2tileyf(double lat, uint z) { return latp2tileyf(lat2latp(lat), z); } -uint lon2tilex(double lon, uint z) { return lon2tilexf(lon, z); } -uint latp2tiley(double latp, uint z) { return latp2tileyf(latp, z); } -uint lat2tiley(double lat, uint z) { return lat2tileyf(lat, z); } -double tilex2lon(uint x, uint z) { return scalbn(x, -(int)z) * 360.0 - 180.0; } -double tiley2latp(uint y, uint z) { return 180.0 - scalbn(y, -(int)z) * 360.0; } -double tiley2lat(uint y, uint z) { return latp2lat(tiley2latp(y, z)); } +double lon2tilexf(double lon, uint8_t z) { return scalbn((lon+180.0) * (1/360.0), (int)z); } +double latp2tileyf(double latp, uint8_t z) { return scalbn((180.0-latp) * (1/360.0), (int)z); } +double lat2tileyf(double lat, uint8_t z) { return latp2tileyf(lat2latp(lat), z); } +uint32_t lon2tilex(double lon, uint8_t z) { return lon2tilexf(lon, z); } +uint32_t latp2tiley(double latp, uint8_t z) { return latp2tileyf(latp, z); } +uint32_t lat2tiley(double lat, uint8_t z) { return lat2tileyf(lat, z); } +double tilex2lon(uint32_t x, uint8_t z) { return scalbn(x, -(int)z) * 360.0 - 180.0; } +double tiley2latp(uint32_t y, uint8_t z) { return 180.0 - scalbn(y, -(int)z) * 360.0; } +double tiley2lat(uint32_t y, uint8_t z) { return latp2lat(tiley2latp(y, z)); } // Get a tile index -TileCoordinates latpLon2index(LatpLon ll, uint baseZoom) { +TileCoordinates 
latpLon2index(LatpLon ll, uint8_t baseZoom) { return TileCoordinates(lon2tilex(ll.lon /10000000.0, baseZoom), latp2tiley(ll.latp/10000000.0, baseZoom)); } @@ -51,9 +49,9 @@ double meter2degp(double meter, double latp) { } // the range between smallest y and largest y is filled, for each x -void fillCoveredTiles(unordered_set &tileSet) { - vector tileList(tileSet.begin(), tileSet.end()); - sort(tileList.begin(), tileList.end(), TileCoordinatesCompare()); +void fillCoveredTiles(std::unordered_set& tileSet) { + std::vector tileList(tileSet.begin(), tileSet.end()); + std::sort(tileList.begin(), tileList.end(), TileCoordinatesCompare()); TileCoordinate prevX = 0, prevY = static_cast(-2); for (TileCoordinates index: tileList) { @@ -67,189 +65,3 @@ void fillCoveredTiles(unordered_set &tileSet) { prevX = tileX, prevY = tileY; } } - - -// ------------------------------------------------------ -// Helper class for dealing with spherical Mercator tiles - -TileBbox::TileBbox(TileCoordinates i, uint z, bool h, bool e) { - zoom = z; - index = i; - hires = h; - endZoom = e; - minLon = tilex2lon(i.x ,zoom); - minLat = tiley2lat(i.y+1,zoom); - maxLon = tilex2lon(i.x+1,zoom); - maxLat = tiley2lat(i.y ,zoom); - minLatp = lat2latp(minLat); - maxLatp = lat2latp(maxLat); - xmargin = (maxLon -minLon )/200.0; - ymargin = (maxLatp-minLatp)/200.0; - xscale = (maxLon -minLon )/(hires ? 8192.0 : 4096.0); - yscale = (maxLatp-minLatp)/(hires ? 
8192.0 : 4096.0); - clippingBox = Box(geom::make(minLon-xmargin, minLatp-ymargin), - geom::make(maxLon+xmargin, maxLatp+ymargin)); -} - -pair TileBbox::scaleLatpLon(double latp, double lon) const { - int x = floor( (lon - minLon) / xscale ); - int y = floor( (maxLatp - latp) / yscale ); - return pair(x,y); -} - -// Scaling with naive self-intersection check - if we've added the new point -// within the last 5 points, then backtrack to the last time we added it -std::vector TileBbox::scaleRing(Ring const &src) const { - std::vector points; - points.reserve(src.size()); - for(auto &i: src) { - auto scaled = scaleLatpLon(i.y(), i.x()); // -> .first is x, .second is y - bool found = false; - for (size_t j=1; j<5; j++) { - if (points.size() < 1+j) break; - Point check = points[points.size()-j]; - if (check.x()==scaled.first && check.y()==scaled.second) { - points.resize(points.size()-j+1); found=true; break; - } - } - if (!found) points.push_back(Point(scaled.first,scaled.second)); - } - return points; -} - -MultiPolygon TileBbox::scaleGeometry(MultiPolygon const &src) const { - MultiPolygon dst; - for(auto poly: src) { - Polygon p; - - // Copy the outer ring - std::vector points = scaleRing(poly.outer()); - if (points.size()<4) continue; - Ring outer; - geom::append(outer,points); - geom::append(p,outer); - - // Copy the inner rings - int num_rings = 0; - for(auto &r: poly.inners()) { - points = scaleRing(r); - if (points.size()<4) continue; - Ring inner; - geom::append(inner,points); - num_rings++; - geom::interior_rings(p).resize(num_rings); - geom::append(p, inner, num_rings-1); - } - - // Add to multipolygon - dst.push_back(p); - } - return dst; -} - -pair TileBbox::floorLatpLon(double latp, double lon) const { - auto p = scaleLatpLon(latp, lon); - return std::make_pair( -(p.second * yscale - maxLatp), p.first * xscale + minLon); -} - -Box TileBbox::getTileBox() const { - double xmargin = (maxLon -minLon )/8192.0; - double ymargin = (maxLatp-minLatp)/8192.0; - 
return Box(geom::make(minLon+xmargin, minLatp+ymargin), geom::make(maxLon-xmargin, maxLatp-ymargin)); -} - -Box TileBbox::getExtendBox() const { - return Box( - geom::make( minLon-(maxLon-minLon)*2.0, minLatp-(maxLatp-minLatp)*(8191.0/8192.0)), - geom::make( maxLon+(maxLon-minLon)*(8191.0/8192.0), maxLatp+(maxLatp-minLatp)*2.0)); -} - -template -void impl_insertIntermediateTiles(T const &points, uint baseZoom, std::unordered_set &tileSet) { - Point p2(0, 0); - for (auto it = points.begin(); it != points.end(); ++it) { - // Line is from p1 to p2 - Point p1 = p2; - p2 = *it; - - // Calculate p2 tile, and mark it - double tileXf2 = lon2tilexf(p2.x(), baseZoom), tileYf2 = latp2tileyf(p2.y(), baseZoom); - TileCoordinate tileX2 = static_cast(tileXf2), tileY2 = static_cast(tileYf2); - tileSet.insert(TileCoordinates(tileX2, tileY2)); - if (it == points.begin()) continue; // first point, so no line - - // Calculate p1 tile - double tileXf1 = lon2tilexf(p1.x(), baseZoom), tileYf1 = latp2tileyf(p1.y(), baseZoom); - TileCoordinate tileX1 = static_cast(tileXf1), tileY1 = static_cast(tileYf1); - tileSet.insert(TileCoordinates(tileX1,tileY1)); - - // Supercover line algorithm from http://eugen.dedu.free.fr/projects/bresenham/ - int i; // loop counter - int ystep, xstep; // the step on y and x axis - int error; // the error accumulated during the increment - int errorprev; // *vision the previous value of the error variable - int y = tileY1, x = tileX1; // the line points - int ddy, ddx; // compulsory variables: the double values of dy and dx - int dx = tileX2 - tileX1; - int dy = tileY2 - tileY1; - - if (dy < 0) { ystep = -1; dy = -dy; } else { ystep = 1; } - if (dx < 0) { xstep = -1; dx = -dx; } else { xstep = 1; } - - ddy = 2 * dy; // work with double values for full precision - ddx = 2 * dx; - if (ddx >= ddy) { // first octant (0 <= slope <= 1) - // compulsory initialization (even for errorprev, needed when dx==dy) - errorprev = error = dx; // start in the middle of the square 
- for (i=0 ; i < dx ; i++) { // do not use the first point (already done) - x += xstep; - error += ddy; - if (error > ddx){ // increment y if AFTER the middle ( > ) - y += ystep; - error -= ddx; - // three cases (octant == right->right-top for directions below): - if (error + errorprev < ddx) // bottom square also - tileSet.insert(TileCoordinates(x, y-ystep)); - else if (error + errorprev > ddx) // left square also - tileSet.insert(TileCoordinates(x-xstep, y)); - else { // corner: bottom and left squares also - tileSet.insert(TileCoordinates(x, y-ystep)); - tileSet.insert(TileCoordinates(x-xstep, y)); - } - } - tileSet.insert(TileCoordinates(x, y)); - errorprev = error; - } - } else { // the same as above - errorprev = error = dy; - for (i=0 ; i < dy ; i++){ - y += ystep; - error += ddx; - if (error > ddy){ - x += xstep; - error -= ddy; - if (error + errorprev < ddy) - tileSet.insert(TileCoordinates(x-xstep, y)); - else if (error + errorprev > ddy) - tileSet.insert(TileCoordinates(x, y-ystep)); - else{ - tileSet.insert(TileCoordinates(x-xstep, y)); - tileSet.insert(TileCoordinates(x, y-ystep)); - } - } - tileSet.insert(TileCoordinates(x, y)); - errorprev = error; - } - } - } -} - -void insertIntermediateTiles(Linestring const &points, uint baseZoom, std::unordered_set &tileSet) -{ - impl_insertIntermediateTiles(points, baseZoom, tileSet); -} - -void insertIntermediateTiles(Ring const &points, uint baseZoom, std::unordered_set &tileSet) -{ - impl_insertIntermediateTiles(points, baseZoom, tileSet); -} diff --git a/src/coordinates_geom.cpp b/src/coordinates_geom.cpp new file mode 100644 index 00000000..0a5b2410 --- /dev/null +++ b/src/coordinates_geom.cpp @@ -0,0 +1,189 @@ +#include "coordinates_geom.h" + +using namespace std; +namespace geom = boost::geometry; + +// ------------------------------------------------------ +// Helper class for dealing with spherical Mercator tiles + +TileBbox::TileBbox(TileCoordinates i, uint z, bool h, bool e) { + zoom = z; + index = 
i; + hires = h; + endZoom = e; + minLon = tilex2lon(i.x ,zoom); + minLat = tiley2lat(i.y+1,zoom); + maxLon = tilex2lon(i.x+1,zoom); + maxLat = tiley2lat(i.y ,zoom); + minLatp = lat2latp(minLat); + maxLatp = lat2latp(maxLat); + xmargin = (maxLon -minLon )/200.0; + ymargin = (maxLatp-minLatp)/200.0; + xscale = (maxLon -minLon )/(hires ? 8192.0 : 4096.0); + yscale = (maxLatp-minLatp)/(hires ? 8192.0 : 4096.0); + clippingBox = Box(geom::make(minLon-xmargin, minLatp-ymargin), + geom::make(maxLon+xmargin, maxLatp+ymargin)); +} + +pair TileBbox::scaleLatpLon(double latp, double lon) const { + int x = floor( (lon - minLon) / xscale ); + int y = floor( (maxLatp - latp) / yscale ); + return pair(x,y); +} + +// Scaling with naive self-intersection check - if we've added the new point +// within the last 5 points, then backtrack to the last time we added it +std::vector TileBbox::scaleRing(Ring const &src) const { + std::vector points; + points.reserve(src.size()); + for(auto &i: src) { + auto scaled = scaleLatpLon(i.y(), i.x()); // -> .first is x, .second is y + bool found = false; + for (size_t j=1; j<5; j++) { + if (points.size() < 1+j) break; + Point check = points[points.size()-j]; + if (check.x()==scaled.first && check.y()==scaled.second) { + points.resize(points.size()-j+1); found=true; break; + } + } + if (!found) points.push_back(Point(scaled.first,scaled.second)); + } + return points; +} + +MultiPolygon TileBbox::scaleGeometry(MultiPolygon const &src) const { + MultiPolygon dst; + for(auto poly: src) { + Polygon p; + + // Copy the outer ring + std::vector points = scaleRing(poly.outer()); + if (points.size()<4) continue; + Ring outer; + geom::append(outer,points); + geom::append(p,outer); + + // Copy the inner rings + int num_rings = 0; + for(auto &r: poly.inners()) { + points = scaleRing(r); + if (points.size()<4) continue; + Ring inner; + geom::append(inner,points); + num_rings++; + geom::interior_rings(p).resize(num_rings); + geom::append(p, inner, num_rings-1); + 
} + + // Add to multipolygon + dst.push_back(p); + } + return dst; +} + +pair TileBbox::floorLatpLon(double latp, double lon) const { + auto p = scaleLatpLon(latp, lon); + return std::make_pair( -(p.second * yscale - maxLatp), p.first * xscale + minLon); +} + +Box TileBbox::getTileBox() const { + double xmargin = (maxLon -minLon )/8192.0; + double ymargin = (maxLatp-minLatp)/8192.0; + return Box(geom::make(minLon+xmargin, minLatp+ymargin), geom::make(maxLon-xmargin, maxLatp-ymargin)); +} + +Box TileBbox::getExtendBox() const { + return Box( + geom::make( minLon-(maxLon-minLon)*2.0, minLatp-(maxLatp-minLatp)*(8191.0/8192.0)), + geom::make( maxLon+(maxLon-minLon)*(8191.0/8192.0), maxLatp+(maxLatp-minLatp)*2.0)); +} + +template +void impl_insertIntermediateTiles(T const &points, uint baseZoom, std::unordered_set &tileSet) { + Point p2(0, 0); + for (auto it = points.begin(); it != points.end(); ++it) { + // Line is from p1 to p2 + Point p1 = p2; + p2 = *it; + + // Calculate p2 tile, and mark it + double tileXf2 = lon2tilexf(p2.x(), baseZoom), tileYf2 = latp2tileyf(p2.y(), baseZoom); + TileCoordinate tileX2 = static_cast(tileXf2), tileY2 = static_cast(tileYf2); + tileSet.insert(TileCoordinates(tileX2, tileY2)); + if (it == points.begin()) continue; // first point, so no line + + // Calculate p1 tile + double tileXf1 = lon2tilexf(p1.x(), baseZoom), tileYf1 = latp2tileyf(p1.y(), baseZoom); + TileCoordinate tileX1 = static_cast(tileXf1), tileY1 = static_cast(tileYf1); + tileSet.insert(TileCoordinates(tileX1,tileY1)); + + // Supercover line algorithm from http://eugen.dedu.free.fr/projects/bresenham/ + int i; // loop counter + int ystep, xstep; // the step on y and x axis + int error; // the error accumulated during the increment + int errorprev; // *vision the previous value of the error variable + int y = tileY1, x = tileX1; // the line points + int ddy, ddx; // compulsory variables: the double values of dy and dx + int dx = tileX2 - tileX1; + int dy = tileY2 - tileY1; + 
+ if (dy < 0) { ystep = -1; dy = -dy; } else { ystep = 1; } + if (dx < 0) { xstep = -1; dx = -dx; } else { xstep = 1; } + + ddy = 2 * dy; // work with double values for full precision + ddx = 2 * dx; + if (ddx >= ddy) { // first octant (0 <= slope <= 1) + // compulsory initialization (even for errorprev, needed when dx==dy) + errorprev = error = dx; // start in the middle of the square + for (i=0 ; i < dx ; i++) { // do not use the first point (already done) + x += xstep; + error += ddy; + if (error > ddx){ // increment y if AFTER the middle ( > ) + y += ystep; + error -= ddx; + // three cases (octant == right->right-top for directions below): + if (error + errorprev < ddx) // bottom square also + tileSet.insert(TileCoordinates(x, y-ystep)); + else if (error + errorprev > ddx) // left square also + tileSet.insert(TileCoordinates(x-xstep, y)); + else { // corner: bottom and left squares also + tileSet.insert(TileCoordinates(x, y-ystep)); + tileSet.insert(TileCoordinates(x-xstep, y)); + } + } + tileSet.insert(TileCoordinates(x, y)); + errorprev = error; + } + } else { // the same as above + errorprev = error = dy; + for (i=0 ; i < dy ; i++){ + y += ystep; + error += ddx; + if (error > ddy){ + x += xstep; + error -= ddy; + if (error + errorprev < ddy) + tileSet.insert(TileCoordinates(x-xstep, y)); + else if (error + errorprev > ddy) + tileSet.insert(TileCoordinates(x, y-ystep)); + else{ + tileSet.insert(TileCoordinates(x-xstep, y)); + tileSet.insert(TileCoordinates(x, y-ystep)); + } + } + tileSet.insert(TileCoordinates(x, y)); + errorprev = error; + } + } + } +} + +void insertIntermediateTiles(Linestring const &points, uint baseZoom, std::unordered_set &tileSet) +{ + impl_insertIntermediateTiles(points, baseZoom, tileSet); +} + +void insertIntermediateTiles(Ring const &points, uint baseZoom, std::unordered_set &tileSet) +{ + impl_insertIntermediateTiles(points, baseZoom, tileSet); +} diff --git a/src/external/streamvbyte_0124_decode.c 
b/src/external/streamvbyte_0124_decode.c new file mode 100644 index 00000000..524cfe70 --- /dev/null +++ b/src/external/streamvbyte_0124_decode.c @@ -0,0 +1,185 @@ +#include "streamvbyte.h" +#include "streamvbyte_isadetection.h" + +#ifdef STREAMVBYTE_X64 +#include "streamvbyte_shuffle_tables_0124_decode.h" +#endif + +#include // for memcpy + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +#ifdef STREAMVBYTE_X64 +STREAMVBYTE_TARGET_SSE41 +static inline __m128i svb_decode_sse41(uint32_t key, + const uint8_t *__restrict__ *dataPtrPtr) { + uint8_t len; + __m128i Data = _mm_loadu_si128((const __m128i *)*dataPtrPtr); + uint8_t *pshuf = (uint8_t *) &shuffleTable[key]; + __m128i Shuf = *(__m128i *)pshuf; + len = lengthTable[key]; + Data = _mm_shuffle_epi8(Data, Shuf); + *dataPtrPtr += len; + return Data; +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +static inline void svb_write_sse41(uint32_t *out, __m128i Vec) { + _mm_storeu_si128((__m128i *)out, Vec); +} +STREAMVBYTE_UNTARGET_REGION + +#endif // STREAMVBYTE_X64 + +static inline uint32_t svb_decode_data(const uint8_t **dataPtrPtr, uint8_t code) { + const uint8_t *dataPtr = *dataPtrPtr; + uint32_t val; + + if (code == 0) { // 0 byte + val = 0; + } else if (code == 1) { // 1 bytes + val = (uint32_t)*dataPtr; + dataPtr += 1; + } else if (code == 2) { // 2 bytes + val = 0; + memcpy(&val, dataPtr, 2); // assumes little endian + dataPtr += 2; + } else { // code == 3, 4 bytes + memcpy(&val, dataPtr, 4); + dataPtr += 4; + } + + *dataPtrPtr = dataPtr; + return val; +} + +static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr, + const uint8_t *dataPtr, + uint32_t count) { + if (count == 0) + return dataPtr; // no reads or writes if no data + + uint8_t shift = 0; + uint32_t key = *keyPtr++; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; + } + uint32_t 
val = svb_decode_data(&dataPtr, (key >> shift) & 0x3); + *outPtr++ = val; + shift += 2; + } + + return dataPtr; // pointer to first unused byte after end +} + +#ifdef STREAMVBYTE_X64 +STREAMVBYTE_TARGET_SSE41 +static const uint8_t *svb_decode_sse41_simple(uint32_t *out, + const uint8_t *__restrict__ keyPtr, + const uint8_t *__restrict__ dataPtr, + uint64_t count) { + + uint64_t keybytes = count / 4; // number of key bytes + __m128i Data; + if (keybytes >= 8) { + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset + 1, sizeof(nextkeys)); + + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 4, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 8, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 12, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 16, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 20, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 24, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 28, Data); + + out += 32; + } + { + uint64_t keys = nextkeys; + + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 4, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 8, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 12, Data); + + keys >>= 16; + 
Data = svb_decode_sse41((keys & 0xFF), &dataPtr);
+      svb_write_sse41(out + 16, Data);
+      Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr);
+      svb_write_sse41(out + 20, Data);
+
+      keys >>= 16;
+      Data = svb_decode_sse41((keys & 0xFF), &dataPtr);
+      svb_write_sse41(out + 24, Data);
+      Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr);
+      svb_write_sse41(out + 28, Data);
+
+      out += 32;
+    }
+  }
+
+  return dataPtr;
+}
+STREAMVBYTE_UNTARGET_REGION
+
+
+#endif
+
+// Read count 32-bit integers in maskedvbyte format from in, storing the result
+// in out. Returns the number of bytes read.
+size_t streamvbyte_decode_0124(const uint8_t *in, uint32_t *out, uint32_t count) {
+  if (count == 0)
+    return 0;
+
+
+  const uint8_t *keyPtr = in; // full list of keys is next
+  uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
+  const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
+
+#ifdef STREAMVBYTE_X64
+  if(streamvbyte_sse41()) {
+    dataPtr = svb_decode_sse41_simple(out, keyPtr, dataPtr, count);
+    out += count & ~ 31U;
+    keyPtr += (count/4) & ~ 7U;
+    count &= 31;
+  }
+#endif
+
+  return (size_t)(svb_decode_scalar(out, keyPtr, dataPtr, count) - in);
+
+}
diff --git a/src/external/streamvbyte_0124_encode.c b/src/external/streamvbyte_0124_encode.c
new file mode 100644
index 00000000..376d673c
--- /dev/null
+++ b/src/external/streamvbyte_0124_encode.c
@@ -0,0 +1,113 @@
+#include "streamvbyte.h"
+#include "streamvbyte_isadetection.h"
+#include "streamvbyte_shuffle_tables_0124_encode.h"
+
+#include <string.h> // for memcpy
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wcast-align"
+#pragma clang diagnostic ignored "-Wdeclaration-after-statement"
+#endif
+
+static uint8_t svb_encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
+  uint8_t *dataPtr = *dataPtrPtr;
+  uint8_t code;
+
+  if (val == 0) { // 0 bytes
+    code = 0;
+  } else if (val < (1 << 8)) { // 1 bytes
+    *dataPtr = (uint8_t)(val);
+    *dataPtrPtr += 1;
+    code = 1;
+  } else if (val < (1 << 16)) { // 2 bytes
+    memcpy(dataPtr, &val, 2); // assumes little endian
+    *dataPtrPtr += 2;
+    code = 2;
+  } else { // 4 bytes
+    memcpy(dataPtr, &val, sizeof(uint32_t));
+    *dataPtrPtr += sizeof(uint32_t);
+    code = 3;
+  }
+  return code;
+}
+
+static uint8_t *svb_encode_scalar(const uint32_t *in,
+                                  uint8_t *__restrict__ keyPtr,
+                                  uint8_t *__restrict__ dataPtr,
+                                  uint32_t count) {
+  if (count == 0)
+    return dataPtr; // exit immediately if no data
+
+  uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
+  uint8_t key = 0;
+  for (uint32_t c = 0; c < count; c++) {
+    if (shift == 8) {
+      shift = 0;
+      *keyPtr++ = key;
+      key = 0;
+    }
+    uint32_t val = in[c];
+    uint8_t code = svb_encode_data(val, &dataPtr);
+    key |= code << shift;
+    shift += 2;
+  }
+
+  *keyPtr = key; // write last key (no increment needed)
+  return dataPtr; // pointer to first unused data byte
+}
+
+#ifdef STREAMVBYTE_X64
+STREAMVBYTE_TARGET_SSE41
+static size_t streamvbyte_encode4(__m128i in, uint8_t *outData, uint8_t *outCode) {
+  const __m128i Ones = _mm_set1_epi32(0x01010101);
+  const __m128i GatherBits = _mm_set1_epi32(0x08040102);
+  const __m128i CodeTable = _mm_set_epi32(0x03030303, 0x03030303, 0x03030303, 0x02020100);
+  const __m128i GatherBytes = _mm_set_epi32(0, 0, 0x0D090501, 0x0D090501);
+  const __m128i Aggregators = _mm_set_epi32(0, 0, 0x01010101, 0x10400104);
+
+  __m128i m0, m1;
+  m0 = _mm_min_epu8(in, Ones); // set byte to 1 if it is not zero
+  m0 = _mm_madd_epi16(m0, GatherBits); // gather bits 8,16,24 to bits 8,9,10
+  m1 = _mm_shuffle_epi8(CodeTable, m0); // translate to a 2-bit encoded symbol
+  m1 = _mm_shuffle_epi8(m1, GatherBytes); // gather bytes holding symbols; 2 copies
+  m1 = _mm_madd_epi16(m1, Aggregators); // sum dword_1, pack dword_0
+
+  size_t code = (size_t)_mm_extract_epi8(m1, 1);
+  size_t length = lengthTable[code];
+
+  __m128i* shuf = (__m128i*)(((uint8_t*)encodingShuffleTable) + code * 16);
+  __m128i out = _mm_shuffle_epi8(in, _mm_loadu_si128(shuf)); // todo: aligned access
+
+  _mm_storeu_si128((__m128i *)outData, out);
+  *outCode = (uint8_t)code;
+  return length;
+}
+STREAMVBYTE_UNTARGET_REGION
+
+STREAMVBYTE_TARGET_SSE41
+static size_t streamvbyte_encode_quad(const uint32_t *in, uint8_t *outData, uint8_t *outKey) {
+  __m128i vin = _mm_loadu_si128((const __m128i *) in );
+  return streamvbyte_encode4(vin, outData, outKey);
+}
+STREAMVBYTE_UNTARGET_REGION
+
+#endif
+
+size_t streamvbyte_encode_0124(const uint32_t *in, uint32_t count, uint8_t *out) {
+  uint8_t *keyPtr = out;
+  uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
+  uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
+#ifdef STREAMVBYTE_X64
+  if(streamvbyte_sse41()) {
+    uint32_t count_quads = count / 4;
+    count -= 4 * count_quads;
+    for (uint32_t c = 0; c < count_quads; c++) {
+      dataPtr += streamvbyte_encode_quad(in, dataPtr, keyPtr);
+      keyPtr++;
+      in += 4;
+    }
+  }
+#endif
+  return (size_t)(svb_encode_scalar(in, keyPtr, dataPtr, count) - out);
+
+}
diff --git a/src/external/streamvbyte_arm_decode.c b/src/external/streamvbyte_arm_decode.c
new file mode 100644
index 00000000..02caabab
--- /dev/null
+++ b/src/external/streamvbyte_arm_decode.c
@@ -0,0 +1,54 @@
+// Textually #included by streamvbyte_decode.cc, so this is compiled as C++: the C99
+// keyword `restrict` is not available there; the `__restrict__` spelling is used instead.
+#include "streamvbyte_isadetection.h"
+#ifdef STREAMVBYTE_ARM
+#include "streamvbyte_shuffle_tables_decode.h"
+#ifdef __aarch64__
+typedef uint8x16_t decode_t;
+#else
+typedef uint8x8x2_t decode_t;
+#endif
+static inline decode_t _decode_neon(const uint8_t key,
+                                    const uint8_t * __restrict__ *dataPtrPtr) {
+
+  uint8_t len;
+  uint8_t *pshuf = (uint8_t *)&shuffleTable[key];
+  uint8x16_t decodingShuffle = vld1q_u8(pshuf);
+
+  uint8x16_t compressed = vld1q_u8(*dataPtrPtr);
+#ifdef AVOIDLENGTHLOOKUP
+  // this avoids the dependency on lengthTable,
+  // see https://github.com/lemire/streamvbyte/issues/12
+  len = pshuf[12 + (key >> 6)] + 1;
+#else
+  len = lengthTable[key];
+#endif
+#ifdef __aarch64__
+  uint8x16_t data = vqtbl1q_u8(compressed, decodingShuffle);
+#else
+  uint8x8x2_t codehalves = {{vget_low_u8(compressed), vget_high_u8(compressed)}};
+
+  uint8x8x2_t data = {{vtbl2_u8(codehalves, vget_low_u8(decodingShuffle)),
+                       vtbl2_u8(codehalves, vget_high_u8(decodingShuffle))}};
+#endif
+  *dataPtrPtr += len; // advance the caller's data pointer past the bytes consumed by this key
+  return data;
+}
+
+static void streamvbyte_decode_quad( const uint8_t * __restrict__ *dataPtrPtr, uint8_t key, uint32_t * __restrict__ out ) {
+  decode_t data =_decode_neon( key, dataPtrPtr );
+#ifdef __aarch64__
+  vst1q_u8((uint8_t *) out, data);
+#else
+  vst1_u8((uint8_t *) out, data.val[0]);
+  vst1_u8((uint8_t *) (out + 2), data.val[1]);
+#endif
+}
+
+static const uint8_t *svb_decode_vector(uint32_t *out, const uint8_t *keyPtr, const uint8_t *dataPtr, uint32_t count) {
+  for(uint32_t i = 0; i < count/4; i++)
+    streamvbyte_decode_quad( &dataPtr, keyPtr[i], out + 4*i );
+
+  return dataPtr;
+}
+#endif
diff --git a/src/external/streamvbyte_arm_encode.c b/src/external/streamvbyte_arm_encode.c
new file mode 100644
index 00000000..4b77cf52
--- /dev/null
+++ b/src/external/streamvbyte_arm_encode.c
@@ -0,0 +1,57 @@
+#include "streamvbyte_isadetection.h"
+#include "streamvbyte_shuffle_tables_encode.h"
+#ifdef STREAMVBYTE_ARM
+static const uint8_t pgatherlo[] = {12, 8, 4, 0, 12, 8, 4, 0}; // apparently only used in streamvbyte_encode4
+#define concat (1 | 1 << 10 | 1 << 20 | 1 << 30)
+#define sum (1 | 1 << 8 | 1 << 16 | 1 << 24)
+static const uint32_t pAggregators[2] = {concat, sum}; // apparently only used in streamvbyte_encode4
+
+static inline size_t streamvbyte_encode4(uint32x4_t data, uint8_t *__restrict__ outData, uint8_t *__restrict__ outCode) {
+
+  const uint8x8_t gatherlo = vld1_u8(pgatherlo);
+  const uint32x2_t Aggregators = vld1_u32(pAggregators);
+
+  // lane code is 3 - (saturating sub) (clz(data)/8)
+  uint32x4_t clzbytes = vshrq_n_u32(vclzq_u32(data), 3);
+  uint32x4_t lanecodes = vqsubq_u32(vdupq_n_u32(3), clzbytes);
+
+  // nops
+  uint8x16_t lanebytes = vreinterpretq_u8_u32(lanecodes);
+#ifdef __aarch64__
+  uint8x8_t lobytes = vqtbl1_u8( lanebytes, gatherlo );
+#else
+  uint8x8x2_t twohalves = {{vget_low_u8(lanebytes), vget_high_u8(lanebytes)}};
+
+  // shuffle lsbytes into two copies of an int
+  uint8x8_t lobytes = vtbl2_u8(twohalves, gatherlo);
+#endif
+
+  uint32x2_t mulshift = vreinterpret_u32_u8(lobytes);
+
+  uint32_t codeAndLength[2];
+  vst1_u32(codeAndLength, vmul_u32(mulshift, Aggregators));
+
+  uint32_t code = codeAndLength[0] >> 24;
+  size_t length = 4 + (codeAndLength[1] >> 24);
+
+  // shuffle in 8-byte chunks
+  uint8x16_t databytes = vreinterpretq_u8_u32(data);
+  uint8x16_t encodingShuffle = vld1q_u8((uint8_t *) &encodingShuffleTable[code]);
+#ifdef __aarch64__
+  vst1q_u8(outData, vqtbl1q_u8(databytes, encodingShuffle));
+#else
+  uint8x8x2_t datahalves = {{vget_low_u8(databytes), vget_high_u8(databytes)}};
+  vst1_u8(outData, vtbl2_u8(datahalves, vget_low_u8(encodingShuffle)));
+  vst1_u8(outData + 8, vtbl2_u8(datahalves, vget_high_u8(encodingShuffle)));
+#endif
+
+  *outCode = (uint8_t) code;
+  return length;
+}
+
+static inline size_t streamvbyte_encode_quad(const uint32_t *__restrict__ in, uint8_t *__restrict__ outData, uint8_t *__restrict__ outCode) {
+  uint32x4_t inq = vld1q_u32(in);
+
+  return streamvbyte_encode4(inq, outData, outCode);
+}
+#endif
diff --git a/src/external/streamvbyte_decode.cc b/src/external/streamvbyte_decode.cc
new file mode 100644
index 00000000..68883e7a
--- /dev/null
+++ b/src/external/streamvbyte_decode.cc
@@ -0,0 +1,88 @@
+#include "external/streamvbyte.h"
+#include "streamvbyte_isadetection.h"
+
+#include <string.h> // for memcpy
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wdeclaration-after-statement"
+#endif
+
+#ifdef __ARM_NEON__
+#include "streamvbyte_arm_decode.c"
+#endif
+
+#ifdef STREAMVBYTE_X64
+#include "streamvbyte_x64_decode.c"
+#endif // STREAMVBYTE_X64
+
+static inline uint32_t svb_decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
+  const uint8_t *dataPtr = *dataPtrPtr;
+  uint32_t val;
+
+  if (code == 0) { // 1 byte
+    val = (uint32_t)*dataPtr;
+    dataPtr += 1;
+  } else if (code == 1) { // 2 bytes
+    val = 0;
+    memcpy(&val, dataPtr, 2); // assumes little endian
+    dataPtr += 2;
+  } else if (code == 2) { // 3 bytes
+    val = 0;
+    memcpy(&val, dataPtr, 3); // assumes little endian
+    dataPtr += 3;
+  } else { // code == 3
+    memcpy(&val, dataPtr, 4);
+    dataPtr += 4;
+  }
+
+  *dataPtrPtr = dataPtr;
+  return val;
+}
+static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
+                                        const uint8_t *dataPtr,
+                                        uint32_t count) {
+  if (count == 0)
+    return dataPtr; // no reads or writes if no data
+
+  uint8_t shift = 0;
+  uint32_t key = *keyPtr++;
+  for (uint32_t c = 0; c < count; c++) {
+    if (shift == 8) {
+      shift = 0;
+      key = *keyPtr++;
+    }
+    uint32_t val = svb_decode_data(&dataPtr, (key >> shift) & 0x3);
+    *outPtr++ = val;
+    shift += 2;
+  }
+
+  return dataPtr; // pointer to first unused byte after end
+}
+
+// Read count 32-bit integers in maskedvbyte format from in, storing the result
+// in out. Returns the number of bytes read.
+size_t streamvbyte_decode(const uint8_t *in, uint32_t *out, uint32_t count) {
+  if (count == 0)
+    return 0;
+
+  const uint8_t *keyPtr = in; // full list of keys is next
+  uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
+  const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
+
+#ifdef STREAMVBYTE_X64
+  if(streamvbyte_sse41()) {
+    dataPtr = svb_decode_sse41_simple(out, keyPtr, dataPtr, count);
+    out += count & ~ 31U;
+    keyPtr += (count/4) & ~ 7U;
+    count &= 31;
+  }
+#elif defined(__ARM_NEON__)
+  dataPtr = svb_decode_vector(out, keyPtr, dataPtr, count);
+  out += count - (count & 3);
+  keyPtr += count/4;
+  count &= 3;
+#endif
+
+  return (size_t)(svb_decode_scalar(out, keyPtr, dataPtr, count) - in);
+
+}
diff --git a/src/external/streamvbyte_encode.cc b/src/external/streamvbyte_encode.cc
new file mode 100644
index 00000000..2bae6a50
--- /dev/null
+++ b/src/external/streamvbyte_encode.cc
@@ -0,0 +1,137 @@
+#include "external/streamvbyte.h"
+#include "streamvbyte_isadetection.h"
+
+#include <string.h> // for memcpy
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wdeclaration-after-statement"
+#endif
+
+#ifdef STREAMVBYTE_X64
+#include "streamvbyte_x64_encode.c"
+#endif
+
+static uint8_t svb_encode_data(uint32_t val, uint8_t * *dataPtrPtr) {
+  uint8_t *dataPtr = *dataPtrPtr;
+  uint8_t code;
+
+  if (val < (1 << 8)) { // 1 byte
+    *dataPtr = (uint8_t)(val);
+    *dataPtrPtr += 1;
+    code = 0;
+  } else if (val < (1 << 16)) { // 2 bytes
+    memcpy(dataPtr, &val, 2); // assumes little endian
+    *dataPtrPtr += 2;
+    code = 1;
+  } else if (val < (1 << 24)) { // 3 bytes
+    memcpy(dataPtr, &val, 3); // assumes little endian
+    *dataPtrPtr += 3;
+    code = 2;
+  } else { // 4 bytes
+    memcpy(dataPtr, &val, sizeof(uint32_t));
+    *dataPtrPtr += sizeof(uint32_t);
+    code = 3;
+  }
+
+  return code;
+}
+
+static uint8_t *svb_encode_scalar(const uint32_t *in,
+                                  uint8_t * keyPtr,
+                                  uint8_t * dataPtr,
+                                  uint32_t count) {
+  if (count == 0)
+    return dataPtr; // exit immediately if no data
+
+  uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
+  uint8_t key = 0;
+  for (uint32_t c = 0; c < count; c++) {
+    if (shift == 8) {
+      shift = 0;
+      *keyPtr++ = key;
+      key = 0;
+    }
+    uint32_t val = in[c];
+    uint8_t code = svb_encode_data(val, &dataPtr);
+    key |= code << shift;
+    shift += 2;
+  }
+
+  *keyPtr = key; // write last key (no increment needed)
+  return dataPtr; // pointer to first unused data byte
+}
+
+
+#ifdef __ARM_NEON__
+#include "streamvbyte_arm_encode.c"
+#endif
+
+static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) {
+  size_t db = 0;
+  for (uint32_t c = 0; c < length; c++) {
+    uint32_t val = in[c];
+
+    uint32_t bytes = 1 + (val > 0x000000FF) + (val > 0x0000FFFF) + (val > 0x00FFFFFF);
+    db += bytes;
+  }
+  return db;
+}
+
+static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) {
+  size_t db = 0;
+  for (uint32_t c = 0; c < length; c++) {
+    uint32_t val = in[c];
+
+    uint32_t bytes = (val > 0x00000000) + (val > 0x000000FF) + (val > 0x0000FFFF) * 2;
+    db += bytes;
+  }
+  return db;
+}
+
+size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
+  // number of control bytes:
+  size_t cb = (length + 3) / 4;
+
+#ifdef STREAMVBYTE_X64
+  if (streamvbyte_sse41()) {
+    return cb + svb_data_bytes_SSE41(in, length);
+  }
+#endif
+  return cb + svb_data_bytes_scalar(in, length);
+}
+
+size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
+  // number of control bytes:
+  size_t cb = (length + 3) / 4;
+
+  return cb + svb_data_bytes_0124_scalar(in, length);
+}
+
+
+// Encode an array of a given length read from in to out in streamvbyte format.
+// Returns the number of bytes written.
+size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
+#ifdef STREAMVBYTE_X64
+  if(streamvbyte_sse41()) {
+    return streamvbyte_encode_SSE41(in,count,out);
+  }
+#endif
+  uint8_t *keyPtr = out;
+  uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
+  uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
+
+#if defined(__ARM_NEON__)
+
+  uint32_t count_quads = count / 4;
+  count -= 4 * count_quads;
+
+  for (uint32_t c = 0; c < count_quads; c++) {
+    dataPtr += streamvbyte_encode_quad(in, dataPtr, keyPtr);
+    keyPtr++;
+    in += 4;
+  }
+
+#endif
+
+  return (size_t)(svb_encode_scalar(in, keyPtr, dataPtr, count) - out);
+}
diff --git a/src/external/streamvbyte_isadetection.h b/src/external/streamvbyte_isadetection.h
new file mode 100644
index 00000000..aee4c7cf
--- /dev/null
+++ b/src/external/streamvbyte_isadetection.h
@@ -0,0 +1,315 @@
+/* From
+https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
+Highly modified.
+Copyright (c) 2016- Facebook, Inc (Adam Paszke)
+Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
+(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
+Samy Bengio, Johnny Mariethoz)
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#ifndef STREAMVBYTE_ISADETECTION_H
+#define STREAMVBYTE_ISADETECTION_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+
+#if defined(_MSC_VER)
+/* Microsoft C/C++-compatible compiler */
+#include <intrin.h>
+#include <iso646.h>
+#include <stdint.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* GCC-compatible compiler, targeting x86/x86-64 */
+#include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+/* GCC-compatible compiler, targeting ARM with NEON */
+#include <arm_neon.h>
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+/* GCC-compatible compiler, targeting ARM with WMMX */
+#include <mmintrin.h>
+#elif (defined(__GNUC__) || defined(__xlC__)) && \
+    (defined(__VEC__) || defined(__ALTIVEC__))
+/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+#include <altivec.h>
+#elif defined(__GNUC__) && defined(__SPE__)
+/* GCC-compatible compiler, targeting PowerPC with SPE */
+#include <spe.h>
+#endif
+
+#if defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+#include <cpuid.h>
+#endif // defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+
+// Instruction-set flags; each enumerator is a distinct bit so values can be OR-ed together.
+enum streamvbyte_instruction_set {
+  streamvbyte_DEFAULT = 0x0,
+  streamvbyte_NEON = 0x1,
+  streamvbyte_SSSE3 = 0x2,
+  streamvbyte_AVX2 = 0x4,
+  streamvbyte_SSE42 = 0x8,
+  streamvbyte_PCLMULQDQ = 0x10,
+  streamvbyte_BMI1 = 0x20,
+  streamvbyte_BMI2 = 0x40,
+  streamvbyte_ALTIVEC = 0x80,
+  streamvbyte_SSE41 = 0x100,
+  streamvbyte_UNINITIALIZED = 0x8000
+};
+
+#if defined(__PPC64__)
+
+static inline uint32_t dynamic_streamvbyte_detect_supported_architectures(void) {
+  return streamvbyte_ALTIVEC;
+}
+
+#elif defined(__arm__) || defined(__aarch64__) // incl.
armel, armhf, arm64
+
+#if defined(__ARM_NEON)
+
+static inline uint32_t dynamic_streamvbyte_detect_supported_architectures(void) {
+  return streamvbyte_NEON;
+}
+
+#else // ARM without NEON
+
+static inline uint32_t dynamic_streamvbyte_detect_supported_architectures(void) {
+  return streamvbyte_DEFAULT;
+}
+
+#endif
+
+#elif defined(__x86_64__) || defined(_M_AMD64) // x64
+
+
+// Runs CPUID for the leaf given in *eax and writes the resulting register values back
+// through the four pointers. The inline-asm path also passes *ecx in as the sub-leaf.
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+                         uint32_t *edx) {
+
+#if defined(_MSC_VER)
+  int cpu_info[4];
+  __cpuid(cpu_info, (int)*eax);
+  *eax = (uint32_t)cpu_info[0];
+  *ebx = (uint32_t)cpu_info[1];
+  *ecx = (uint32_t)cpu_info[2];
+  *edx = (uint32_t)cpu_info[3];
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+  uint32_t level = *eax;
+  __get_cpuid(level, eax, ebx, ecx, edx);
+#else
+  uint32_t a = *eax, b, c = *ecx, d;
+  __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
+#endif
+}
+
+static inline uint32_t dynamic_streamvbyte_detect_supported_architectures(void) {
+  uint32_t eax, ebx, ecx, edx;
+  uint32_t host_isa = 0x0;
+  // Can be found on Intel ISA Reference for CPUID
+  static uint32_t cpuid_ssse3_bit = 1 << 9; ///< @private bit 9 of ECX for EAX=0x1 (bit 1 is PCLMULQDQ)
+  static uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
+  static uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
+  static uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
+  static uint32_t cpuid_sse41_bit = 1 << 19; ///< @private bit 19 of ECX for EAX=0x1
+  static uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
+  static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
+  // EBX for EAX=0x7 (sub-leaf ECX=0x0)
+  eax = 0x7;
+  ecx = 0x0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ebx & cpuid_avx2_bit) {
+    host_isa |= streamvbyte_AVX2;
+  }
+  if (ebx & cpuid_bmi1_bit) {
+    host_isa |= streamvbyte_BMI1;
+  }
+
+  if (ebx & cpuid_bmi2_bit) {
+    host_isa |= streamvbyte_BMI2;
+  }
+
+  // ECX for EAX=0x1
+  eax = 0x1;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ecx & cpuid_ssse3_bit) {
+    host_isa |= streamvbyte_SSSE3;
+  }
+  if (ecx & cpuid_sse42_bit) {
+    host_isa |= streamvbyte_SSE42;
+  }
+  if (ecx & cpuid_sse41_bit) {
+    host_isa |= streamvbyte_SSE41;
+  }
+  if (ecx & cpuid_pclmulqdq_bit) {
+    host_isa |= streamvbyte_PCLMULQDQ;
+  }
+
+  return host_isa;
+}
+#else // fallback
+
+
+static inline uint32_t dynamic_streamvbyte_detect_supported_architectures(void) {
+  return streamvbyte_DEFAULT;
+}
+
+
+#endif // end SIMD extension detection code
+
+// On x64, cache the detection result in a function-local static so CPUID runs only until it is filled.
+#if defined(__x86_64__) || defined(_M_AMD64) // x64
+#define STREAMVBYTE_X64
+#if defined(__cplusplus)
+#include <atomic>
+static inline uint32_t streamvbyte_detect_supported_architectures(void) {
+  static std::atomic<uint32_t> buffer{streamvbyte_UNINITIALIZED};
+  if(buffer == streamvbyte_UNINITIALIZED) {
+    buffer = dynamic_streamvbyte_detect_supported_architectures();
+  }
+  return buffer;
+}
+#elif defined(_MSC_VER) && !defined(__clang__)
+// Visual Studio does not support C11 atomics.
+static inline uint32_t streamvbyte_detect_supported_architectures(void) {
+  static int buffer = streamvbyte_UNINITIALIZED;
+  if(buffer == streamvbyte_UNINITIALIZED) {
+    buffer = dynamic_streamvbyte_detect_supported_architectures();
+  }
+  return buffer;
+}
+#else // plain C, and not (MSVC without clang)
+#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>
+#endif
+
+static inline uint32_t streamvbyte_detect_supported_architectures(void) {
+#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+  static _Atomic uint32_t buffer = streamvbyte_UNINITIALIZED;
+#else
+  static int buffer = streamvbyte_UNINITIALIZED;
+#endif
+
+#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+  uint32_t result = atomic_load_explicit(&buffer, memory_order_acquire);
+  if(result == streamvbyte_UNINITIALIZED) {
+    result = dynamic_streamvbyte_detect_supported_architectures();
+    atomic_store_explicit(&buffer, result, memory_order_release);
+  }
+  return result;
+#else
+  if (buffer == streamvbyte_UNINITIALIZED) {
+    buffer = dynamic_streamvbyte_detect_supported_architectures();
+  }
+  return buffer;
+#endif
+}
+#endif // defined(_MSC_VER) && !defined(__clang__)
+
+// __SSE4_1__ is the macro GCC and Clang predefine when SSE4.1 is enabled for the build; no runtime check is needed then.
+#if defined(__SSE4_1__)
+static inline bool streamvbyte_sse41(void) {
+  return true;
+}
+#else
+static inline bool streamvbyte_sse41(void) {
+  return (streamvbyte_detect_supported_architectures() & streamvbyte_SSE41) == streamvbyte_SSE41;
+}
+#endif
+
+
+#else // defined(__x86_64__) || defined(_M_AMD64) // x64
+
+static inline bool streamvbyte_sse41(void) {
+  return false;
+}
+
+static inline uint32_t streamvbyte_detect_supported_architectures(void) {
+  // no runtime dispatch
+  return dynamic_streamvbyte_detect_supported_architectures();
+}
+#endif
+
+#ifdef __ARM_NEON__
+#define STREAMVBYTE_ARM
+#endif
+
+#ifdef STREAMVBYTE_X64
+// this is almost standard?
+#undef STRINGIFY_IMPLEMENTATION_ +#undef STRINGIFY +#define STRINGIFY_IMPLEMENTATION_(a) #a +#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) + +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so STREAMVBYTE_TARGET_REGION and STREAMVBYTE_UNTARGET_REGION must be *outside* of a +// namespace. +#define STREAMVBYTE_TARGET_REGION(T) \ + _Pragma(STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define STREAMVBYTE_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define STREAMVBYTE_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define STREAMVBYTE_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + + +// Default target region macros don't do anything. +#ifndef STREAMVBYTE_TARGET_REGION +#define STREAMVBYTE_TARGET_REGION(T) +#define STREAMVBYTE_UNTARGET_REGION +#endif + +#define STREAMVBYTE_TARGET_SSE41 STREAMVBYTE_TARGET_REGION("sse4.1") + +#ifdef __sse41___ +#undef STREAMVBYTE_TARGET_SSE41 +#define STREAMVBYTE_TARGET_SSE41 +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define STREAMVBYTE_ASSUME_ALIGNED(P, A) __builtin_assume_aligned((P), (A)) +#else +#define STREAMVBYTE_ASSUME_ALIGNED(P, A) +#endif + +#endif // STREAMVBYTE_IS_X64 + +#endif // STREAMVBYTE_ISADETECTION_H diff --git a/src/external/streamvbyte_shuffle_tables_0124_decode.h b/src/external/streamvbyte_shuffle_tables_0124_decode.h new file mode 100644 index 00000000..af593acc --- /dev/null +++ b/src/external/streamvbyte_shuffle_tables_0124_decode.h @@ -0,0 +1,279 @@ +// using 0,1,2,4 bytes per value +static uint8_t lengthTable[256] ={ + 0, 1, 2, 4, 1, 2, 3, 5, 2, 3, 4, 6, 4, 5, 6, 8, + 1, 2, 3, 5, 2, 3, 4, 6, 3, 4, 5, 7, 5, 6, 7, 9, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 4, 5, 6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 1, 2, 3, 5, 2, 3, 4, 6, 3, 4, 5, 
7, 5, 6, 7, 9, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 3, 4, 5, 7, 4, 5, 6, 8, 5, 6, 7, 9, 7, 8, 9, 11, + 5, 6, 7, 9, 6, 7, 8, 10, 7, 8, 9, 11, 9, 10, 11, 13, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 3, 4, 5, 7, 4, 5, 6, 8, 5, 6, 7, 9, 7, 8, 9, 11, + 4, 5, 6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 6, 7, 8, 10, 7, 8, 9, 11, 8, 9, 10, 12, 10, 11, 12, 14, + 4, 5, 6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 5, 6, 7, 9, 6, 7, 8, 10, 7, 8, 9, 11, 9, 10, 11, 13, + 6, 7, 8, 10, 7, 8, 9, 11, 8, 9, 10, 12, 10, 11, 12, 14, + 8, 9, 10, 12, 9, 10, 11, 13, 10, 11, 12, 14, 12, 13, 14, 16, + }; + +// decoding: +static int8_t shuffleTable[256][16] = { + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0000 + { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1000 + { 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2000 + { 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3000 + { -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0100 + { 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1100 + { 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2100 + { 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3100 + { -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0200 + { 0, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1200 + { 0, 1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2200 + { 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3200 + { -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0300 + { 0, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1300 + { 0, 1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2300 + { 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3300 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1 }, // 0010 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 
-1, -1, -1 }, // 1010 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1 }, // 2010 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1 }, // 3010 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1 }, // 0110 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1 }, // 1110 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1 }, // 2110 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1 }, // 3110 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1 }, // 0210 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1 }, // 1210 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1 }, // 2210 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1 }, // 3210 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1 }, // 0310 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1 }, // 1310 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1 }, // 2310 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1 }, // 3310 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1 }, // 0020 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1 }, // 1020 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 }, // 2020 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1 }, // 3020 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1 }, // 0120 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 }, // 1120 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, -1, -1, -1, -1 }, // 2120 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, -1, -1, -1, -1 }, // 3120 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 }, // 0220 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, -1, -1, -1, -1 }, // 1220 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1 }, // 2220 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 }, // 3220 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1 }, // 0320 + { 
0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1 }, // 1320 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1 }, // 2320 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1 }, // 3320 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1 }, // 0030 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1 }, // 1030 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1 }, // 2030 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1 }, // 3030 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1 }, // 0130 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1 }, // 1130 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, -1, -1, -1, -1 }, // 2130 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, -1, -1, -1, -1 }, // 3130 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1 }, // 0230 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, -1, -1, -1, -1 }, // 1230 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, -1, -1, -1, -1 }, // 2230 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, -1, -1, -1, -1 }, // 3230 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1 }, // 0330 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1 }, // 1330 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1 }, // 2330 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1 }, // 3330 + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1 }, // 0001 + { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1 }, // 1001 + { 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1 }, // 2001 + { 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1 }, // 3001 + { -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1 }, // 0101 + { 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1 }, // 1101 + { 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1 }, // 2101 + { 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1 }, // 3101 + { -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1 }, 
// 0201 + { 0, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1 }, // 1201 + { 0, 1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1 }, // 2201 + { 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1 }, // 3201 + { -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, 4, -1, -1, -1 }, // 0301 + { 0, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1, 5, -1, -1, -1 }, // 1301 + { 0, 1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1, 6, -1, -1, -1 }, // 2301 + { 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 8, -1, -1, -1 }, // 3301 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1 }, // 0011 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1 }, // 1011 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1 }, // 2011 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3011 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1 }, // 0111 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1 }, // 1111 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3111 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1 }, // 0211 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3211 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1 }, // 0311 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1311 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2311 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 3311 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1, -1 }, // 0021 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1 }, // 1021 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 2021 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3021 + { -1, -1, -1, -1, 0, -1, -1, -1, 
1, 2, -1, -1, 3, -1, -1, -1 }, // 0121 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3121 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 0221 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3221 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1 }, // 0321 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1321 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2321 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 3321 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, -1, -1, -1 }, // 0031 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1 }, // 1031 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 2031 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3031 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1 }, // 0131 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1131 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2131 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3131 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 0231 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1231 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2231 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3231 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1 }, // 0331 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1331 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2331 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 3331 + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1 
}, // 0002 + { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1 }, // 1002 + { 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1 }, // 2002 + { 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, -1, -1 }, // 3002 + { -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1 }, // 0102 + { 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1 }, // 1102 + { 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, 3, 4, -1, -1 }, // 2102 + { 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, 5, 6, -1, -1 }, // 3102 + { -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1 }, // 0202 + { 0, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1, 3, 4, -1, -1 }, // 1202 + { 0, 1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 4, 5, -1, -1 }, // 2202 + { 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1 }, // 3202 + { -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1 }, // 0302 + { 0, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1, 5, 6, -1, -1 }, // 1302 + { 0, 1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1, 6, 7, -1, -1 }, // 2302 + { 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1 }, // 3302 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, -1, -1 }, // 0012 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1 }, // 1012 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 2012 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3012 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1 }, // 0112 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3112 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 0212 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3212 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 
-1, -1, -1, 5, 6, -1, -1 }, // 0312 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1312 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2312 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 3312 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, -1, -1 }, // 0022 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1 }, // 1022 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 2022 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3022 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1 }, // 0122 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3122 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 0222 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3222 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1 }, // 0322 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1322 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2322 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 3322 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, -1, -1 }, // 0032 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1 }, // 1032 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 2032 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3032 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1 }, // 0132 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1132 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2132 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3132 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 
0232 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1232 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2232 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3232 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1 }, // 0332 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1332 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2332 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 3332 + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3 }, // 0003 + { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4 }, // 1003 + { 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5 }, // 2003 + { 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 6, 7 }, // 3003 + { -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4 }, // 0103 + { 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5 }, // 1103 + { 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, 3, 4, 5, 6 }, // 2103 + { 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, 5, 6, 7, 8 }, // 3103 + { -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, 4, 5 }, // 0203 + { 0, -1, -1, -1, 1, 2, -1, -1, -1, -1, -1, -1, 3, 4, 5, 6 }, // 1203 + { 0, 1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 4, 5, 6, 7 }, // 2203 + { 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, 8, 9 }, // 3203 + { -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, 6, 7 }, // 0303 + { 0, -1, -1, -1, 1, 2, 3, 4, -1, -1, -1, -1, 5, 6, 7, 8 }, // 1303 + { 0, 1, -1, -1, 2, 3, 4, 5, -1, -1, -1, -1, 6, 7, 8, 9 }, // 2303 + { 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 8, 9, 10, 11 }, // 3303 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, 3, 4 }, // 0013 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5 }, // 1013 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 2013 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3013 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5 }, // 0113 + { 0, -1, -1, -1, 1, -1, -1, 
-1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1113 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2113 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3113 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 0213 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1213 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2213 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3213 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8 }, // 0313 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1313 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2313 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 3313 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, 4, 5 }, // 0023 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6 }, // 1023 + { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 2023 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3023 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6 }, // 0123 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1123 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2123 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3123 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 0223 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1223 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2223 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3223 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9 }, // 0323 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1323 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2323 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 3323 + { -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7 }, // 0033 + { 0, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8 }, // 1033 + { 0, 1, -1, -1, -1, 
-1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 2033 + { 0, 1, 2, 3, -1, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3033 + { -1, -1, -1, -1, 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8 }, // 0133 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1133 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2133 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3133 + { -1, -1, -1, -1, 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 0233 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1233 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2233 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3233 + { -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, // 0333 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1333 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2333 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // 3333 +}; diff --git a/src/external/streamvbyte_shuffle_tables_0124_encode.h b/src/external/streamvbyte_shuffle_tables_0124_encode.h new file mode 100644 index 00000000..e26ebfe6 --- /dev/null +++ b/src/external/streamvbyte_shuffle_tables_0124_encode.h @@ -0,0 +1,282 @@ +#ifndef STREAMVBYTE_SHUFFLE_TABLES_0124_ENCODE_H +#define STREAMVBYTE_SHUFFLE_TABLES_0124_ENCODE_H +// using 0,1,2,4 bytes per value +static uint8_t lengthTable[256] ={ + 0, 1, 2, 4, 1, 2, 3, 5, 2, 3, 4, 6, 4, 5, 6, 8, + 1, 2, 3, 5, 2, 3, 4, 6, 3, 4, 5, 7, 5, 6, 7, 9, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 4, 5, 6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 1, 2, 3, 5, 2, 3, 4, 6, 3, 4, 5, 7, 5, 6, 7, 9, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 3, 4, 5, 7, 4, 5, 6, 8, 5, 6, 7, 9, 7, 8, 9, 11, + 5, 6, 7, 9, 6, 7, 8, 10, 7, 8, 9, 11, 9, 10, 11, 13, + 2, 3, 4, 6, 3, 4, 5, 7, 4, 5, 6, 8, 6, 7, 8, 10, + 3, 4, 5, 7, 4, 5, 6, 8, 5, 6, 7, 9, 7, 8, 9, 11, + 4, 5, 6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 6, 7, 8, 10, 7, 8, 9, 11, 8, 9, 10, 12, 10, 11, 12, 14, + 4, 5, 
6, 8, 5, 6, 7, 9, 6, 7, 8, 10, 8, 9, 10, 12, + 5, 6, 7, 9, 6, 7, 8, 10, 7, 8, 9, 11, 9, 10, 11, 13, + 6, 7, 8, 10, 7, 8, 9, 11, 8, 9, 10, 12, 10, 11, 12, 14, + 8, 9, 10, 12, 9, 10, 11, 13, 10, 11, 12, 14, 12, 13, 14, 16, + }; + +// encoding: +static int8_t encodingShuffleTable[256][16] = { + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0000 + { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1000 + { 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2000 + { 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3000 + { 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0100 + { 0, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1100 + { 0, 1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2100 + { 0, 1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3100 + { 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0200 + { 0, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1200 + { 0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2200 + { 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3200 + { 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0300 + { 0, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1300 + { 0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2300 + { 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3300 + { 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0010 + { 0, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1010 + { 0, 1, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2010 + { 0, 1, 2, 3, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3010 + { 4, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0110 + { 0, 4, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1110 + { 0, 1, 4, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 
}, // 2110 + { 0, 1, 2, 3, 4, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3110 + { 4, 5, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0210 + { 0, 4, 5, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1210 + { 0, 1, 4, 5, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2210 + { 0, 1, 2, 3, 4, 5, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3210 + { 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0310 + { 0, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1310 + { 0, 1, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2310 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1 }, // 3310 + { 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0020 + { 0, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1020 + { 0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2020 + { 0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3020 + { 4, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0120 + { 0, 4, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1120 + { 0, 1, 4, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2120 + { 0, 1, 2, 3, 4, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3120 + { 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0220 + { 0, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1220 + { 0, 1, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2220 + { 0, 1, 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3220 + { 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0320 + { 0, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1320 + { 0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2320 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1 }, // 3320 + { 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0030 + { 0, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1030 + { 0, 1, 8, 9, 10, 11, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2030 + { 0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3030 + { 4, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0130 + { 0, 4, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1130 + { 0, 1, 4, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2130 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1 }, // 3130 + { 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0230 + { 0, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1230 + { 0, 1, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2230 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 }, // 3230 + { 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0330 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1 }, // 1330 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 }, // 2330 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1 }, // 3330 + { 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0001 + { 0, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1001 + { 0, 1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2001 + { 0, 1, 2, 3, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3001 + { 4, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0101 + { 0, 4, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1101 + { 0, 1, 4, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2101 + { 0, 1, 2, 3, 4, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3101 + { 4, 5, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0201 + { 0, 4, 5, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1201 + { 0, 1, 4, 5, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2201 + { 0, 1, 2, 3, 4, 5, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3201 + { 4, 5, 6, 7, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0301 + { 0, 4, 5, 6, 7, 12, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1 }, // 1301 + { 0, 1, 4, 5, 6, 7, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2301 + { 0, 1, 2, 3, 4, 5, 6, 7, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3301 + { 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0011 + { 0, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1011 + { 0, 1, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2011 + { 0, 1, 2, 3, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3011 + { 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0111 + { 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1111 + { 0, 1, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2111 + { 0, 1, 2, 3, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3111 + { 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0211 + { 0, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1211 + { 0, 1, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2211 + { 0, 1, 2, 3, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3211 + { 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0311 + { 0, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1311 + { 0, 1, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2311 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1 }, // 3311 + { 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0021 + { 0, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1021 + { 0, 1, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2021 + { 0, 1, 2, 3, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3021 + { 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0121 + { 0, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1121 + { 0, 1, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2121 + { 0, 1, 2, 3, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3121 + { 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1 }, // 0221 + { 0, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1221 + { 0, 1, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2221 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3221 + { 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0321 + { 0, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1321 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1 }, // 2321 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1 }, // 3321 + { 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0031 + { 0, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1031 + { 0, 1, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2031 + { 0, 1, 2, 3, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3031 + { 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0131 + { 0, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1131 + { 0, 1, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2131 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 3131 + { 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0231 + { 0, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1231 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 2231 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1 }, // 3231 + { 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 0331 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 1331 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1 }, // 2331 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 3331 + { 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0002 + { 0, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1002 + { 0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2002 + { 0, 1, 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3002 + { 4, 12, 13, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0102 + { 0, 4, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1102 + { 0, 1, 4, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2102 + { 0, 1, 2, 3, 4, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3102 + { 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0202 + { 0, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1202 + { 0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2202 + { 0, 1, 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3202 + { 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0302 + { 0, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1302 + { 0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2302 + { 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3302 + { 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0012 + { 0, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1012 + { 0, 1, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2012 + { 0, 1, 2, 3, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3012 + { 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0112 + { 0, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1112 + { 0, 1, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2112 + { 0, 1, 2, 3, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3112 + { 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0212 + { 0, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1212 + { 0, 1, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2212 + { 0, 1, 2, 3, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 3212 + { 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0312 + { 0, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1312 + { 0, 1, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2312 + { 0, 1, 2, 3, 4, 5, 6, 
7, 8, 12, 13, -1, -1, -1, -1, -1 }, // 3312 + { 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0022 + { 0, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1022 + { 0, 1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2022 + { 0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3022 + { 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0122 + { 0, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1122 + { 0, 1, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2122 + { 0, 1, 2, 3, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 3122 + { 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0222 + { 0, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1222 + { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2222 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3222 + { 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0322 + { 0, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 1322 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 }, // 2322 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1 }, // 3322 + { 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0032 + { 0, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1032 + { 0, 1, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2032 + { 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3032 + { 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0132 + { 0, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1132 + { 0, 1, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2132 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 3132 + { 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0232 + { 0, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 1232 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, 
-1 }, // 2232 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 }, // 3232 + { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 }, // 0332 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 1332 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 }, // 2332 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 3332 + { 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0003 + { 0, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1003 + { 0, 1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2003 + { 0, 1, 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3003 + { 4, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0103 + { 0, 4, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1103 + { 0, 1, 4, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2103 + { 0, 1, 2, 3, 4, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 3103 + { 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0203 + { 0, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1203 + { 0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2203 + { 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 3203 + { 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0303 + { 0, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 1303 + { 0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2303 + { 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3303 + { 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0013 + { 0, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1013 + { 0, 1, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2013 + { 0, 1, 2, 3, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 3013 + { 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0113 + { 0, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 
}, // 1113 + { 0, 1, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2113 + { 0, 1, 2, 3, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 3113 + { 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0213 + { 0, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1213 + { 0, 1, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 2213 + { 0, 1, 2, 3, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 3213 + { 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 0313 + { 0, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1313 + { 0, 1, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2313 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1 }, // 3313 + { 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0023 + { 0, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1023 + { 0, 1, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2023 + { 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 3023 + { 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0123 + { 0, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1123 + { 0, 1, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 2123 + { 0, 1, 2, 3, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 3123 + { 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0223 + { 0, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 1223 + { 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2223 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3223 + { 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 0323 + { 0, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 1323 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 }, // 2323 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1 }, // 3323 + { 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 0033 + { 0, 8, 9, 10, 11, 12, 13, 
14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 1033 + { 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2033 + { 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3033 + { 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 0133 + { 0, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1133 + { 0, 1, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2133 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1 }, // 3133 + { 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 0233 + { 0, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 1233 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 2233 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 }, // 3233 + { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 0333 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1 }, // 1333 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 }, // 2333 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // 3333 +}; +#endif diff --git a/src/external/streamvbyte_shuffle_tables_decode.h b/src/external/streamvbyte_shuffle_tables_decode.h new file mode 100644 index 00000000..7aa9ba8b --- /dev/null +++ b/src/external/streamvbyte_shuffle_tables_decode.h @@ -0,0 +1,283 @@ +#ifndef STREAMVBYTE_SHUFFLE_TABLES_H +#define STREAMVBYTE_SHUFFLE_TABLES_H +#include +// using 1,2,3,4 bytes per value +static uint8_t lengthTable[256] ={ + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 
10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, + 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16, + }; + +// decoding: +static int8_t shuffleTable[256][16] = { + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1 }, // 0000 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1000 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2000 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3000 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 0100 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1100 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2100 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3100 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 0200 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1200 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2200 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3200 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 0300 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 1300 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 2300 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 3300 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 0010 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1010 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2010 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3010 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 0110 + { 0, 1, -1, -1, 2, 
3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1110 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2110 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3110 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 0210 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1210 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2210 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3210 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 0310 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 1310 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 2310 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 3310 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 0020 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1020 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2020 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 3020 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 0120 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1120 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2120 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3120 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 0220 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1220 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2220 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3220 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 0320 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 1320 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 2320 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 3320 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 0030 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1030 + { 0, 
1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2030 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3030 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 0130 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1130 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2130 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3130 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 0230 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1230 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2230 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3230 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 0330 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 1330 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 2330 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 3330 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 0001 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1001 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2001 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3001 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 0101 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1101 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2101 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3101 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 0201 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1201 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2201 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3201 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 0301 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 1301 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 
2301 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 3301 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 0011 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1011 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2011 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3011 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 0111 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1111 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2111 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3111 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 0211 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1211 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2211 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3211 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 0311 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 1311 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 2311 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 3311 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 0021 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1021 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2021 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3021 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 0121 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1121 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2121 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3121 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 0221 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1221 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2221 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, 
// 3221 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 0321 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 1321 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 2321 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 3321 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 0031 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1031 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2031 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3031 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 0131 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1131 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2131 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3131 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 0231 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1231 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2231 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3231 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 0331 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 1331 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 2331 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 3331 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 0002 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1002 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2002 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3002 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 0102 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1102 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2102 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3102 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 
0202 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1202 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2202 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3202 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 0302 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 1302 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 2302 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 3302 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 0012 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1012 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2012 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3012 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 0112 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1112 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2112 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3112 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 0212 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1212 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2212 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3212 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 0312 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 1312 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 2312 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 3312 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 0022 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1022 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2022 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3022 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 0122 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1122 + { 0, 
1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2122 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3122 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 0222 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1222 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2222 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3222 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 0322 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 1322 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 2322 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 3322 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 0032 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1032 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2032 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3032 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 0132 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1132 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2132 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3132 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 0232 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1232 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2232 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3232 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 0332 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 1332 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 2332 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 3332 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 0003 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1003 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2003 + { 0, 1, 2, 3, 4, -1, -1, 
-1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3003 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 0103 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1103 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2103 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3103 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 0203 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1203 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2203 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3203 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 0303 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 1303 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 2303 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 3303 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 0013 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1013 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2013 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3013 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 0113 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1113 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2113 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3113 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 0213 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1213 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2213 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3213 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 0313 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 1313 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 2313 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 3313 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 
5, 6, 7, 8 }, // 0023 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1023 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2023 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3023 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 0123 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1123 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2123 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3123 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 0223 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1223 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2223 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3223 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 0323 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 1323 + { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 2323 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 3323 + { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 0033 + { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1033 + { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2033 + { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3033 + { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 0133 + { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1133 + { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2133 + { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3133 + { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 0233 + { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1233 + { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2233 + { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3233 + { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 0333 + { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 1333 + { 
0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 2333 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // 3333 +}; +#endif diff --git a/src/external/streamvbyte_shuffle_tables_encode.h b/src/external/streamvbyte_shuffle_tables_encode.h new file mode 100644 index 00000000..f72e02eb --- /dev/null +++ b/src/external/streamvbyte_shuffle_tables_encode.h @@ -0,0 +1,355 @@ +#ifndef STREAMVBYTE_SHUFFLE_TABLES_ENCODE_H +#define STREAMVBYTE_SHUFFLE_TABLES_ENCODE_H +#include "streamvbyte_isadetection.h" + +#include +#ifdef STREAMVBYTE_X64 +// encoding: +static const uint8_t shuf_lut[64*16] = { + 0x00, 0x04, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 
0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x04, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x08, 0x09, 0x0A, 0x0C, 
0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, + 0x00, 0x04, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 
0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, + 0x00, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, + 0x00, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +}; + + +static const uint8_t len_lut[256] = { + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, + 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, + 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, + 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16, +}; +#endif +#ifdef STREAMVBYTE_ARM +static int8_t encodingShuffleTable[256][16] = { + { 0, 4, 8, 12, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1111 + { 0, 1, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2111 + { 0, 1, 2, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3111 + { 0, 1, 2, 3, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 4111 + { 0, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1211 + { 0, 1, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2211 + { 0, 1, 2, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3211 + { 0, 1, 2, 3, 4, 5, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 4211 + { 0, 4, 5, 6, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1311 + { 0, 1, 4, 5, 6, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2311 + { 0, 1, 2, 4, 5, 6, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3311 + { 0, 1, 2, 3, 4, 5, 6, 8, 12, -1, -1, -1, -1, -1, -1, -1 }, // 4311 + { 0, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1411 + { 0, 1, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2411 + { 0, 1, 2, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3411 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, -1, -1, -1, -1, -1, -1 }, // 4411 + { 0, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1121 + { 0, 1, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2121 + { 0, 1, 2, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3121 + { 0, 1, 2, 3, 4, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 4121 + { 0, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1221 + { 0, 1, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2221 + { 0, 1, 2, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3221 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1 }, // 4221 + { 0, 4, 5, 6, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1321 + { 0, 1, 4, 5, 6, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2321 + { 0, 1, 2, 4, 5, 6, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3321 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, -1, -1, -1, -1, -1, -1 }, // 4321 + { 0, 4, 5, 
6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1421 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1, -1 }, // 2421 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1, -1 }, // 3421 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, -1, -1, -1, -1, -1 }, // 4421 + { 0, 4, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1131 + { 0, 1, 4, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2131 + { 0, 1, 2, 4, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3131 + { 0, 1, 2, 3, 4, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1 }, // 4131 + { 0, 4, 5, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1231 + { 0, 1, 4, 5, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2231 + { 0, 1, 2, 4, 5, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3231 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1 }, // 4231 + { 0, 4, 5, 6, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1331 + { 0, 1, 4, 5, 6, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1 }, // 2331 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1 }, // 3331 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, -1, -1, -1, -1, -1 }, // 4331 + { 0, 4, 5, 6, 7, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1, -1 }, // 1431 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 12, -1, -1, -1, -1, -1, -1 }, // 2431 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, -1, -1, -1, -1, -1 }, // 3431 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, -1, -1, -1, -1 }, // 4431 + { 0, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1141 + { 0, 1, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2141 + { 0, 1, 2, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 3141 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 4141 + { 0, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1241 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 2241 + { 0, 1, 2, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 3241 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1 }, // 4241 + { 0, 4, 5, 6, 8, 9, 
10, 11, 12, -1, -1, -1, -1, -1, -1, -1 }, // 1341 + { 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 2341 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1 }, // 3341 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, -1, -1, -1, -1 }, // 4341 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1 }, // 1441 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1 }, // 2441 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, -1 }, // 3441 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441 + { 0, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1112 + { 0, 1, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2112 + { 0, 1, 2, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3112 + { 0, 1, 2, 3, 4, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 4112 + { 0, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1212 + { 0, 1, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2212 + { 0, 1, 2, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3212 + { 0, 1, 2, 3, 4, 5, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 4212 + { 0, 4, 5, 6, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1312 + { 0, 1, 4, 5, 6, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2312 + { 0, 1, 2, 4, 5, 6, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 3312 + { 0, 1, 2, 3, 4, 5, 6, 8, 12, 13, -1, -1, -1, -1, -1, -1 }, // 4312 + { 0, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1412 + { 0, 1, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2412 + { 0, 1, 2, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3412 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, -1, -1, -1, -1, -1 }, // 4412 + { 0, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1122 + { 0, 1, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2122 + { 0, 1, 2, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3122 + { 0, 1, 2, 3, 4, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 4122 + { 0, 4, 5, 8, 9, 
12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1222 + { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2222 + { 0, 1, 2, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 3222 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 }, // 4222 + { 0, 4, 5, 6, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1322 + { 0, 1, 4, 5, 6, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2322 + { 0, 1, 2, 4, 5, 6, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3322 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 13, -1, -1, -1, -1, -1 }, // 4322 + { 0, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 1422 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 }, // 2422 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1 }, // 3422 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1 }, // 4422 + { 0, 4, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1132 + { 0, 1, 4, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2132 + { 0, 1, 2, 4, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 3132 + { 0, 1, 2, 3, 4, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1 }, // 4132 + { 0, 4, 5, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1232 + { 0, 1, 4, 5, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2232 + { 0, 1, 2, 4, 5, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3232 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1 }, // 4232 + { 0, 4, 5, 6, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 1332 + { 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1 }, // 2332 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1 }, // 3332 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, -1, -1, -1, -1 }, // 4332 + { 0, 4, 5, 6, 7, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1, -1 }, // 1432 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, -1, -1, -1, -1, -1 }, // 2432 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, -1, -1, -1, -1 }, // 3432 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, -1, -1, -1 }, // 4432 + { 0, 4, 8, 9, 10, 11, 12, 13, -1, 
-1, -1, -1, -1, -1, -1, -1 }, // 1142 + { 0, 1, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 2142 + { 0, 1, 2, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 }, // 3142 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 4142 + { 0, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1 }, // 1242 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 }, // 2242 + { 0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 3242 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 }, // 4242 + { 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 }, // 1342 + { 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 2342 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 }, // 3342 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, -1, -1, -1 }, // 4342 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1 }, // 1442 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 }, // 2442 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1 }, // 3442 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442 + { 0, 4, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1113 + { 0, 1, 4, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2113 + { 0, 1, 2, 4, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 3113 + { 0, 1, 2, 3, 4, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 4113 + { 0, 4, 5, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1213 + { 0, 1, 4, 5, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2213 + { 0, 1, 2, 4, 5, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 3213 + { 0, 1, 2, 3, 4, 5, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 4213 + { 0, 4, 5, 6, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1313 + { 0, 1, 4, 5, 6, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 2313 + { 0, 1, 2, 4, 5, 6, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 3313 + { 0, 1, 2, 3, 4, 5, 6, 8, 12, 13, 14, -1, -1, -1, -1, -1 }, // 4313 + { 0, 4, 5, 6, 7, 8, 12, 13, 
14, -1, -1, -1, -1, -1, -1, -1 }, // 1413 + { 0, 1, 4, 5, 6, 7, 8, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 2413 + { 0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, -1, -1, -1, -1, -1 }, // 3413 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, -1, -1, -1, -1 }, // 4413 + { 0, 4, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1123 + { 0, 1, 4, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2123 + { 0, 1, 2, 4, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 3123 + { 0, 1, 2, 3, 4, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 4123 + { 0, 4, 5, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1223 + { 0, 1, 4, 5, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 2223 + { 0, 1, 2, 4, 5, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 3223 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1 }, // 4223 + { 0, 4, 5, 6, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 1323 + { 0, 1, 4, 5, 6, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 2323 + { 0, 1, 2, 4, 5, 6, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1 }, // 3323 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 13, 14, -1, -1, -1, -1 }, // 4323 + { 0, 4, 5, 6, 7, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 1423 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, -1, -1, -1, -1, -1 }, // 2423 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 14, -1, -1, -1, -1 }, // 3423 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, -1, -1, -1 }, // 4423 + { 0, 4, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1133 + { 0, 1, 4, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 2133 + { 0, 1, 2, 4, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 3133 + { 0, 1, 2, 3, 4, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1 }, // 4133 + { 0, 4, 5, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 1233 + { 0, 1, 4, 5, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 2233 + { 0, 1, 2, 4, 5, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1 }, // 3233 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1 }, // 4233 + { 0, 4, 5, 6, 8, 9, 10, 12, 13, 
14, -1, -1, -1, -1, -1, -1 }, // 1333 + { 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1 }, // 2333 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1 }, // 3333 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1 }, // 4333 + { 0, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1 }, // 1433 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1 }, // 2433 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, -1, -1, -1 }, // 3433 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, -1, -1 }, // 4433 + { 0, 4, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1 }, // 1143 + { 0, 1, 4, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 2143 + { 0, 1, 2, 4, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1 }, // 3143 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1 }, // 4143 + { 0, 4, 5, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1 }, // 1243 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1 }, // 2243 + { 0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1 }, // 3243 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1 }, // 4243 + { 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1, -1 }, // 1343 + { 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1 }, // 2343 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1 }, // 3343 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, -1, -1 }, // 4343 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1, -1 }, // 1443 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1, -1, -1 }, // 2443 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1, -1 }, // 3443 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443 + { 0, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1114 + { 0, 1, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 2114 + { 0, 1, 2, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 3114 + { 0, 1, 2, 3, 4, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 4114 + { 0, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, 
-1, -1, -1, -1, -1 }, // 1214 + { 0, 1, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 2214 + { 0, 1, 2, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 3214 + { 0, 1, 2, 3, 4, 5, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 4214 + { 0, 4, 5, 6, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 1314 + { 0, 1, 4, 5, 6, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2314 + { 0, 1, 2, 4, 5, 6, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 3314 + { 0, 1, 2, 3, 4, 5, 6, 8, 12, 13, 14, 15, -1, -1, -1, -1 }, // 4314 + { 0, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1414 + { 0, 1, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2414 + { 0, 1, 2, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3414 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, -1, -1, -1 }, // 4414 + { 0, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 }, // 1124 + { 0, 1, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 2124 + { 0, 1, 2, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 3124 + { 0, 1, 2, 3, 4, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 4124 + { 0, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1 }, // 1224 + { 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2224 + { 0, 1, 2, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 3224 + { 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 }, // 4224 + { 0, 4, 5, 6, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1324 + { 0, 1, 4, 5, 6, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2324 + { 0, 1, 2, 4, 5, 6, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3324 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 13, 14, 15, -1, -1, -1 }, // 4324 + { 0, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 1424 + { 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 }, // 2424 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1 }, // 3424 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1 }, // 4424 + { 0, 4, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, 
-1, -1, -1, -1 }, // 1134 + { 0, 1, 4, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 2134 + { 0, 1, 2, 4, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 3134 + { 0, 1, 2, 3, 4, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1 }, // 4134 + { 0, 4, 5, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1234 + { 0, 1, 4, 5, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2234 + { 0, 1, 2, 4, 5, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3234 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1 }, // 4234 + { 0, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 1334 + { 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1 }, // 2334 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1 }, // 3334 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, -1, -1 }, // 4334 + { 0, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1, -1 }, // 1434 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, -1, -1, -1 }, // 2434 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, -1, -1 }, // 3434 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, -1 }, // 4434 + { 0, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 }, // 1144 + { 0, 1, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 2144 + { 0, 1, 2, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 3144 + { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1 }, // 4144 + { 0, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1 }, // 1244 + { 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 2244 + { 0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1 }, // 3244 + { 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 }, // 4244 + { 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 }, // 1344 + { 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1 }, // 2344 + { 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 }, // 3344 + { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, -1 }, // 4344 + { 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, 
-1, -1 }, // 1444 + { 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 }, // 2444 + { 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 }, // 3444 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // 4444 +}; +#endif +#endif diff --git a/src/external/streamvbyte_x64_decode.c b/src/external/streamvbyte_x64_decode.c new file mode 100644 index 00000000..a31d859f --- /dev/null +++ b/src/external/streamvbyte_x64_decode.c @@ -0,0 +1,114 @@ +#include "streamvbyte_isadetection.h" +#include "streamvbyte_shuffle_tables_decode.h" + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#endif + +#ifdef STREAMVBYTE_X64 +STREAMVBYTE_TARGET_SSE41 +static inline __m128i svb_decode_sse41(uint32_t key, + const uint8_t * *dataPtrPtr) { + uint8_t len; + __m128i Data = _mm_loadu_si128((const __m128i *)*dataPtrPtr); + uint8_t *pshuf = (uint8_t *) &shuffleTable[key]; + __m128i Shuf = *(__m128i *)pshuf; +#ifdef AVOIDLENGTHLOOKUP + // this avoids the dependency on lengthTable, + // see https://github.com/lemire/streamvbyte/issues/12 + len = pshuf[12 + (key >> 6)] + 1; +#else + len = lengthTable[key]; +#endif + Data = _mm_shuffle_epi8(Data, Shuf); + *dataPtrPtr += len; + return Data; +} +STREAMVBYTE_UNTARGET_REGION + + +STREAMVBYTE_TARGET_SSE41 +static inline void svb_write_sse41(uint32_t *out, __m128i Vec) { + _mm_storeu_si128((__m128i *)out, Vec); +} +STREAMVBYTE_UNTARGET_REGION + + + +STREAMVBYTE_TARGET_SSE41 +static inline const uint8_t *svb_decode_sse41_simple(uint32_t *out, + const uint8_t * keyPtr, + const uint8_t * dataPtr, + uint64_t count) { + + uint64_t keybytes = count / 4; // number of key bytes + __m128i Data; + if (keybytes >= 8) { + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset + 1, 
sizeof(nextkeys)); + + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 4, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 8, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 12, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 16, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 20, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 24, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 28, Data); + + out += 32; + } + { + uint64_t keys = nextkeys; + + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 4, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 8, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 12, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 16, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 20, Data); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0xFF), &dataPtr); + svb_write_sse41(out + 24, Data); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + svb_write_sse41(out + 28, Data); + + out += 32; + } + } + return dataPtr; +} +STREAMVBYTE_UNTARGET_REGION +#endif diff --git a/src/external/streamvbyte_x64_encode.c b/src/external/streamvbyte_x64_encode.c new file mode 100644 index 00000000..a5121a49 --- /dev/null +++ b/src/external/streamvbyte_x64_encode.c @@ -0,0 +1,102 @@ +#include "streamvbyte_isadetection.h" +#include "streamvbyte_shuffle_tables_encode.h" + 
+#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +#ifdef STREAMVBYTE_X64 +// contributed by aqrit + +static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length); + +STREAMVBYTE_TARGET_SSE41 +static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) { + const __m128i mask_01 = _mm_set1_epi8(0x01); + const __m128i mask_7F00 = _mm_set1_epi16(0x7F00); + + __m128i m0, m1; + size_t keys; + + m0 = _mm_min_epu8(mask_01, lo); + m1 = _mm_min_epu8(mask_01, hi); + m0 = _mm_packus_epi16(m0, m1); + m0 = _mm_min_epi16(m0, mask_01); // convert 0x01FF to 0x0101 + m0 = _mm_adds_epu16(m0, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF + keys = (size_t)_mm_movemask_epi8(m0); + return keys; +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +static size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) { + size_t dataLen = 0; + + for (const uint32_t* end = &in[(count & ~7U)]; in != end; in += 8) + { + __m128i r0, r1; + size_t keys; + + r0 = _mm_loadu_si128((const __m128i *) &in[0]); + r1 = _mm_loadu_si128((const __m128i *) &in[4]); + + keys = svb_control_SSE41(r0, r1); + dataLen += len_lut[keys & 0xFF]; + dataLen += len_lut[keys >> 8]; + } + + dataLen += svb_data_bytes_scalar(in, count & 7); + return dataLen; +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +static size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) { + uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundary +// cldellow: NB `restrict` is only available in C99, not c++14. 
+// uint8_t *restrict keyPtr = &out[0]; +// uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys + uint8_t * keyPtr = &out[0]; + uint8_t * dataPtr = &out[keyLen]; // variable length data after keys + + for (const uint32_t* end = &in[(count & ~7U)]; in != end; in += 8) + { + __m128i r0, r1, r2, r3; + size_t keys; + + r0 = _mm_loadu_si128((const __m128i*)&in[0]); + r1 = _mm_loadu_si128((const __m128i*)&in[4]); + + keys = svb_control_SSE41(r0, r1); + + r2 = _mm_loadu_si128((const __m128i*)&shuf_lut[(keys << 4) & 0x03F0]); + r3 = _mm_loadu_si128((const __m128i*)&shuf_lut[(keys >> 4) & 0x03F0]); + r0 = _mm_shuffle_epi8(r0, r2); + r1 = _mm_shuffle_epi8(r1, r3); + + _mm_storeu_si128((__m128i *)dataPtr, r0); + dataPtr += len_lut[keys & 0xFF]; + _mm_storeu_si128((__m128i *)dataPtr, r1); + dataPtr += len_lut[keys >> 8]; + + *((uint16_t*)keyPtr) = (uint16_t)keys; + keyPtr += 2; + } + + // do remaining + uint32_t key = 0; + for(size_t i = 0; i < (count & 7); i++) + { + uint32_t dw = in[i]; + uint32_t symbol = (dw > 0x000000FF) + (dw > 0x0000FFFF) + (dw > 0x00FFFFFF); + key |= symbol << (i + i); + *((uint32_t*)dataPtr) = dw; + dataPtr += 1 + symbol; + } + memcpy(keyPtr, &key, ((count & 7) + 3) >> 2); + + return (size_t)(dataPtr - out); +} +STREAMVBYTE_UNTARGET_REGION +#endif diff --git a/src/external/streamvbyte_zigzag.cc b/src/external/streamvbyte_zigzag.cc new file mode 100644 index 00000000..27fd61ef --- /dev/null +++ b/src/external/streamvbyte_zigzag.cc @@ -0,0 +1,38 @@ +#include "external/streamvbyte_zigzag.h" + +static inline +uint32_t svb_zigzag_encode_32 (uint32_t val) { + return (val + val) ^ (uint32_t)((int32_t)val >> 31); +} + +void zigzag_encode(const int32_t * in, uint32_t * out, size_t N) { + for(size_t i = 0; i < N; i++) + out[i] = svb_zigzag_encode_32((uint32_t)in[i]); +} + +void zigzag_delta_encode(const int32_t * in, uint32_t * out, size_t N, int32_t prev) { + for (size_t i = 0; i < N; i++) { + out[i] = 
svb_zigzag_encode_32((uint32_t)(in[i] - prev)); + prev = in[i]; + } +} + +static inline +int32_t svb_zigzag_decode_32 (uint32_t val) { + return (val >> 1) ^ (0-(val & 1)); +} + + +void zigzag_decode(const uint32_t * in, int32_t * out, size_t N) { + for(size_t i = 0; i < N; i++) + out[i] = svb_zigzag_decode_32(in[i]); +} + + +void zigzag_delta_decode(const uint32_t * in, int32_t * out, size_t N, int32_t prev) { + for(size_t i = 0; i < N; i++) { + int32_t val =svb_zigzag_decode_32(in[i]); + out[i] = val + prev; + prev += val; + } +} diff --git a/src/external/streamvbytedelta.c b/src/external/streamvbytedelta.c new file mode 100644 index 00000000..c9d7a8b5 --- /dev/null +++ b/src/external/streamvbytedelta.c @@ -0,0 +1,342 @@ +#include "streamvbytedelta.h" +#include "streamvbyte_isadetection.h" + + +#ifdef STREAMVBYTE_X64 +#include "streamvbyte_shuffle_tables.h" +size_t streamvbyte_encode4(__m128i in, uint8_t *outData, uint8_t *outCode); +#endif + +#include // for memcpy + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +static uint8_t svb_encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) { + uint8_t *dataPtr = *dataPtrPtr; + uint8_t code; + + if (val < (1 << 8)) { // 1 byte + *dataPtr = (uint8_t)(val); + *dataPtrPtr += 1; + code = 0; + } else if (val < (1 << 16)) { // 2 bytes + memcpy(dataPtr, &val, 2); // assumes little endian + *dataPtrPtr += 2; + code = 1; + } else if (val < (1 << 24)) { // 3 bytes + memcpy(dataPtr, &val, 3); // assumes little endian + *dataPtrPtr += 3; + code = 2; + } else { // 4 bytes + memcpy(dataPtr, &val, sizeof(uint32_t)); + *dataPtrPtr += sizeof(uint32_t); + code = 3; + } + + return code; +} + +static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in, + uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, + uint32_t count, uint32_t prev) { + if (count == 0) + return dataPtr; // exit immediately if no data + + uint8_t shift 
= 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... + uint8_t key = 0; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + *keyPtr++ = key; + key = 0; + } + uint32_t val = in[c] - prev; + prev = in[c]; + uint8_t code = svb_encode_data(val, &dataPtr); + key |= code << shift; + shift += 2; + } + + *keyPtr = key; // write last key (no increment needed) + return dataPtr; // pointer to first unused data byte +} + +#ifdef STREAMVBYTE_X64 +// from streamvbyte.c +size_t streamvbyte_encode_quad(__m128i in, uint8_t *outData, uint8_t *outCode); + +static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_alignr_epi8(curr, prev, 12)); +} + +static uint8_t *svb_encode_vector_d1_init(const uint32_t *in, + uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, + uint32_t count, uint32_t prev) { + + uint8_t *outData = dataPtr; + uint8_t *outKey = keyPtr; + + uint32_t count4 = count / 4; + __m128i Prev = _mm_set1_epi32((int32_t)prev); + + for (uint32_t c = 0; c < count4; c++) { + __m128i vin = _mm_loadu_si128((const __m128i *)(in + 4 * c)); + __m128i deltain = Delta(vin, Prev); + Prev = vin; + outData += streamvbyte_encode4(deltain, outData, outKey); + outKey++; + } + prev = (uint32_t)_mm_extract_epi32(Prev, 3); // we grab the last*/ + outData = svb_encode_scalar_d1_init(in + 4 * count4, outKey, outData, + count - 4 * count4, prev); + // outData = svb_encode_scalar_d1_init(in, outKey, outData, count, prev); + return outData; +} + +#endif + +size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out, + uint32_t prev) { + uint8_t *keyPtr = out; // keys come immediately after 32-bit count + uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte + uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys +#ifdef STREAMVBYTE_X64 + if(streamvbyte_sse41()) { + return (size_t)(svb_encode_vector_d1_init(in, keyPtr, dataPtr, count, prev) - out); + } +#endif + return 
(size_t)(svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out); +} + +#ifdef STREAMVBYTE_X64 +static inline __m128i svb_decode_sse41(uint32_t key, + const uint8_t *__restrict__ *dataPtrPtr) { + uint8_t len = lengthTable[key]; + __m128i Data = _mm_loadu_si128((const __m128i *)*dataPtrPtr); + __m128i Shuf = *(__m128i *)&shuffleTable[key]; + + Data = _mm_shuffle_epi8(Data, Shuf); + *dataPtrPtr += len; + + return Data; +} +#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element + +static inline void svb_write_sse41(uint32_t *out, __m128i Vec) { + _mm_storeu_si128((__m128i *)out, Vec); +} + +static __m128i svb_write_sse41_d1(uint32_t *out, __m128i Vec, __m128i Prev) { + __m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] + Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] + Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] + Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] + Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] + + svb_write_sse41(out, Vec); + return Vec; +} + +#ifndef _MSC_VER +static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C}; +#else +static __m128i High16To32 = {8, 9, -1, -1, 10, 11, -1, -1, + 12, 13, -1, -1, 14, 15, -1, -1}; +#endif + +static inline __m128i svb_write_16bit_sse41_d1(uint32_t *out, __m128i Vec, + __m128i Prev) { + // vec == [A B C D E F G H] (16 bit values) + __m128i Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) + Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] + Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] + Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] + __m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) + V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit) + __m128i V2 = + _mm_shuffle_epi8(Vec, High16To32); // 
[BCDE CDEF DEFG EFGH] (32-bit) + V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) + svb_write_sse41(out, V1); + svb_write_sse41(out + 4, V2); + return V2; +} +#endif + +static inline uint32_t svb_decode_data(const uint8_t **dataPtrPtr, uint8_t code) { + const uint8_t *dataPtr = *dataPtrPtr; + uint32_t val; + + if (code == 0) { // 1 byte + val = (uint32_t)*dataPtr; + dataPtr += 1; + } else if (code == 1) { // 2 bytes + val = 0; + memcpy(&val, dataPtr, 2); // assumes little endian + dataPtr += 2; + } else if (code == 2) { // 3 bytes + val = 0; + memcpy(&val, dataPtr, 3); // assumes little endian + dataPtr += 3; + } else { // code == 3 + memcpy(&val, dataPtr, 4); + dataPtr += 4; + } + + *dataPtrPtr = dataPtr; + return val; +} + +static const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, + const uint8_t *keyPtr, + const uint8_t *dataPtr, uint32_t count, + uint32_t prev) { + if (count == 0) + return dataPtr; // no reads or writes if no data + + uint8_t shift = 0; + uint32_t key = *keyPtr++; + + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; + } + uint32_t val = svb_decode_data(&dataPtr, (key >> shift) & 0x3); + val += prev; + *outPtr++ = val; + prev = val; + shift += 2; + } + + return dataPtr; // pointer to first unused byte after end +} + +#ifdef STREAMVBYTE_X64 +static const uint8_t *svb_decode_sse41_d1_init(uint32_t *out, + const uint8_t *__restrict__ keyPtr, + const uint8_t *__restrict__ dataPtr, + uint64_t count, uint32_t prev) { + uint64_t keybytes = count / 4; // number of key bytes + if (keybytes >= 8) { + __m128i Prev = _mm_set1_epi32((int32_t)prev); + __m128i Data; + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset + 1, sizeof(nextkeys)); + // 
faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr))); + Prev = svb_write_16bit_sse41_d1(out, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 8))); + Prev = svb_write_16bit_sse41_d1(out + 8, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 16))); + Prev = svb_write_16bit_sse41_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 24))); + Prev = svb_write_16bit_sse41_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + continue; + } + + Data = svb_decode_sse41(keys & 0x00FF, &dataPtr); + Prev = svb_write_sse41_d1(out, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 8, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 16, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 24, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 28, Data, Prev); + + out += 32; + } + { + uint64_t keys = nextkeys; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr))); + Prev = svb_write_16bit_sse41_d1(out, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 8))); + Prev = svb_write_16bit_sse41_d1(out + 8, Data, Prev); + Data = 
_mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 16))); + Prev = svb_write_16bit_sse41_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(dataPtr + 24))); + Prev = svb_write_16bit_sse41_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + + } else { + + Data = svb_decode_sse41(keys & 0x00FF, &dataPtr); + Prev = svb_write_sse41_d1(out, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 8, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 16, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 24, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 28, Data, Prev); + + out += 32; + } + } + prev = out[-1]; + } + uint64_t consumedkeys = keybytes - (keybytes & 7); + return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, + count & 31, prev); +} +#endif + +size_t streamvbyte_delta_decode(const uint8_t *in, uint32_t *out, + uint32_t count, uint32_t prev) { + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + const uint8_t *keyPtr = in; + const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys +#ifdef STREAMVBYTE_X64 + if(streamvbyte_sse41()) { + return (size_t)(svb_decode_sse41_d1_init(out, keyPtr, dataPtr, count, prev) - in); + } +#endif + return (size_t)(svb_decode_scalar_d1_init(out, keyPtr, dataPtr, count, prev) - in); +} diff --git a/src/external/streamvbytedelta_decode.c 
b/src/external/streamvbytedelta_decode.c new file mode 100644 index 00000000..b081904b --- /dev/null +++ b/src/external/streamvbytedelta_decode.c @@ -0,0 +1,74 @@ +#include "streamvbytedelta.h" +#include "streamvbyte_isadetection.h" + +#include // for memcpy + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +static inline uint32_t svb_decode_data(const uint8_t **dataPtrPtr, uint8_t code) { + const uint8_t *dataPtr = *dataPtrPtr; + uint32_t val; + + if (code == 0) { // 1 byte + val = (uint32_t)*dataPtr; + dataPtr += 1; + } else if (code == 1) { // 2 bytes + val = 0; + memcpy(&val, dataPtr, 2); // assumes little endian + dataPtr += 2; + } else if (code == 2) { // 3 bytes + val = 0; + memcpy(&val, dataPtr, 3); // assumes little endian + dataPtr += 3; + } else { // code == 3 + memcpy(&val, dataPtr, 4); + dataPtr += 4; + } + + *dataPtrPtr = dataPtr; + return val; +} + +static const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, + const uint8_t *keyPtr, + const uint8_t *dataPtr, uint32_t count, + uint32_t prev) { + if (count == 0) + return dataPtr; // no reads or writes if no data + + uint8_t shift = 0; + uint32_t key = *keyPtr++; + + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; + } + uint32_t val = svb_decode_data(&dataPtr, (key >> shift) & 0x3); + val += prev; + *outPtr++ = val; + prev = val; + shift += 2; + } + + return dataPtr; // pointer to first unused byte after end +} + +#ifdef STREAMVBYTE_X64 +#include "streamvbytedelta_x64_decode.c" +#endif + +size_t streamvbyte_delta_decode(const uint8_t *in, uint32_t *out, + uint32_t count, uint32_t prev) { + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + const uint8_t *keyPtr = in; + const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys +#ifdef STREAMVBYTE_X64 + if(streamvbyte_sse41()) { + return (size_t)(svb_decode_sse41_d1_init(out, keyPtr, dataPtr, count, prev) - in); + } +#endif + 
return (size_t)(svb_decode_scalar_d1_init(out, keyPtr, dataPtr, count, prev) - in); +} diff --git a/src/external/streamvbytedelta_encode.c b/src/external/streamvbytedelta_encode.c new file mode 100644 index 00000000..b745fa1c --- /dev/null +++ b/src/external/streamvbytedelta_encode.c @@ -0,0 +1,76 @@ +#include "streamvbytedelta.h" +#include "streamvbyte_isadetection.h" + +#include // for memcpy + +#ifdef STREAMVBYTE_X64 +#include "streamvbytedelta_x64_encode.c" +#endif + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +static uint8_t svb_encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) { + uint8_t *dataPtr = *dataPtrPtr; + uint8_t code; + + if (val < (1 << 8)) { // 1 byte + *dataPtr = (uint8_t)(val); + *dataPtrPtr += 1; + code = 0; + } else if (val < (1 << 16)) { // 2 bytes + memcpy(dataPtr, &val, 2); // assumes little endian + *dataPtrPtr += 2; + code = 1; + } else if (val < (1 << 24)) { // 3 bytes + memcpy(dataPtr, &val, 3); // assumes little endian + *dataPtrPtr += 3; + code = 2; + } else { // 4 bytes + memcpy(dataPtr, &val, sizeof(uint32_t)); + *dataPtrPtr += sizeof(uint32_t); + code = 3; + } + + return code; +} + +static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in, + uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, + uint32_t count, uint32_t prev) { + if (count == 0) + return dataPtr; // exit immediately if no data + + uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... 
+ uint8_t key = 0; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + *keyPtr++ = key; + key = 0; + } + uint32_t val = in[c] - prev; + prev = in[c]; + uint8_t code = svb_encode_data(val, &dataPtr); + key |= code << shift; + shift += 2; + } + + *keyPtr = key; // write last key (no increment needed) + return dataPtr; // pointer to first unused data byte +} + +size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out, + uint32_t prev) { +#ifdef STREAMVBYTE_X64 + if(streamvbyte_sse41()) { + return streamvbyte_encode_SSE41_d1_init(in,count,out,prev); + } +#endif + uint8_t *keyPtr = out; // keys come immediately after 32-bit count + uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte + uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys + return (size_t)(svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out); +} diff --git a/src/external/streamvbytedelta_x64_decode.c b/src/external/streamvbytedelta_x64_decode.c new file mode 100644 index 00000000..4cf85e78 --- /dev/null +++ b/src/external/streamvbytedelta_x64_decode.c @@ -0,0 +1,177 @@ +#include // for memcpy +#include "streamvbyte_shuffle_tables_decode.h" +#include "streamvbyte_isadetection.h" +#ifdef STREAMVBYTE_X64 + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +STREAMVBYTE_TARGET_SSE41 +static inline __m128i svb_decode_sse41(uint32_t key, + const uint8_t *__restrict__ *dataPtrPtr) { + uint8_t len = lengthTable[key]; + __m128i Data = _mm_loadu_si128((const __m128i *)*dataPtrPtr); + __m128i Shuf = *(__m128i *)&shuffleTable[key]; + + Data = _mm_shuffle_epi8(Data, Shuf); + *dataPtrPtr += len; + + return Data; +} +STREAMVBYTE_UNTARGET_REGION +#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element +STREAMVBYTE_TARGET_SSE41 +static inline void svb_write_sse41(uint32_t *out, __m128i Vec) { + 
_mm_storeu_si128((__m128i *)out, Vec); +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +static __m128i svb_write_sse41_d1(uint32_t *out, __m128i Vec, __m128i Prev) { + __m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] + Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] + Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] + Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] + Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] + + svb_write_sse41(out, Vec); + return Vec; +} +STREAMVBYTE_UNTARGET_REGION + + +STREAMVBYTE_TARGET_SSE41 +static inline __m128i svb_write_16bit_sse41_d1(uint32_t *out, __m128i Vec, + __m128i Prev) { + __m128i High16To32 = _mm_set_epi64x(0xFFFF0F0EFFFF0D0CLL, + 0xFFFF0B0AFFFF0908LL); + // vec == [A B C D E F G H] (16 bit values) + __m128i Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) + Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] + Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] + Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] + __m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) + V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit) + __m128i V2 = + _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit) + V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) + svb_write_sse41(out, V1); + svb_write_sse41(out + 4, V2); + return V2; +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +static const uint8_t *svb_decode_sse41_d1_init(uint32_t *out, + const uint8_t *__restrict__ keyPtr, + const uint8_t *__restrict__ dataPtr, + uint64_t count, uint32_t prev) { + uint64_t keybytes = count / 4; // number of key bytes + if (keybytes >= 8) { + __m128i Prev = _mm_set1_epi32((int32_t)prev); + __m128i Data; + + int64_t Offset = 
-(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset, sizeof(nextkeys)); + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + memcpy(&nextkeys, keyPtr64 + Offset + 1, sizeof(nextkeys)); + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr))); + Prev = svb_write_16bit_sse41_d1(out, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 8))); + Prev = svb_write_16bit_sse41_d1(out + 8, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 16))); + Prev = svb_write_16bit_sse41_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 24))); + Prev = svb_write_16bit_sse41_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + continue; + } + + Data = svb_decode_sse41(keys & 0x00FF, &dataPtr); + Prev = svb_write_sse41_d1(out, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 8, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 16, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 24, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 28, Data, Prev); + + out += 32; + } + { + uint64_t keys = nextkeys; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { 
// 32 1-byte ints in a row + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr))); + Prev = svb_write_16bit_sse41_d1(out, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 8))); + Prev = svb_write_16bit_sse41_d1(out + 8, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)(dataPtr + 16))); + Prev = svb_write_16bit_sse41_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(dataPtr + 24))); + Prev = svb_write_16bit_sse41_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + + } else { + + Data = svb_decode_sse41(keys & 0x00FF, &dataPtr); + Prev = svb_write_sse41_d1(out, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 8, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 16, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = svb_decode_sse41((keys & 0x00FF), &dataPtr); + Prev = svb_write_sse41_d1(out + 24, Data, Prev); + Data = svb_decode_sse41((keys & 0xFF00) >> 8, &dataPtr); + Prev = svb_write_sse41_d1(out + 28, Data, Prev); + + out += 32; + } + } + prev = out[-1]; + } + uint64_t consumedkeys = keybytes - (keybytes & 7); + return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, + count & 31, prev); +} +STREAMVBYTE_UNTARGET_REGION +#endif diff --git a/src/external/streamvbytedelta_x64_encode.c b/src/external/streamvbytedelta_x64_encode.c new file mode 100644 index 00000000..543bbc52 --- /dev/null +++ b/src/external/streamvbytedelta_x64_encode.c @@ -0,0 +1,77 @@ + +#include 
"streamvbyte_shuffle_tables_encode.h" +#include "streamvbyte_isadetection.h" + +#ifdef STREAMVBYTE_X64 + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcast-align" +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +STREAMVBYTE_TARGET_SSE41 +static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_alignr_epi8(curr, prev, 12)); +} +STREAMVBYTE_UNTARGET_REGION + +// based on code by aqrit (streamvbyte_encode_SSE41) +STREAMVBYTE_TARGET_SSE41 +static size_t streamvbyte_encode_SSE41_d1_init (const uint32_t* in, uint32_t count, uint8_t* out, uint32_t prev) { + __m128i Prev = _mm_set1_epi32((int32_t)prev); + uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundary + uint8_t *restrict keyPtr = &out[0]; + uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys + + const __m128i mask_01 = _mm_set1_epi8(0x01); + const __m128i mask_7F00 = _mm_set1_epi16(0x7F00); + + for (const uint32_t* end = &in[(count & ~7U)]; in != end; in += 8) + { + __m128i rawr0, r0, rawr1, r1, r2, r3; + size_t keys; + + rawr0 = _mm_loadu_si128((const __m128i*)&in[0]); + r0 = Delta(rawr0, Prev); + Prev = rawr0; + rawr1 = _mm_loadu_si128((const __m128i*)&in[4]); + r1 = Delta(rawr1, Prev); + Prev = rawr1; + + r2 = _mm_min_epu8(mask_01, r0); + r3 = _mm_min_epu8(mask_01, r1); + r2 = _mm_packus_epi16(r2, r3); + r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101 + r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF + keys = (size_t)_mm_movemask_epi8(r2); + + r2 = _mm_loadu_si128((const __m128i*)&shuf_lut[(keys << 4) & 0x03F0]); + r3 = _mm_loadu_si128((const __m128i*)&shuf_lut[(keys >> 4) & 0x03F0]); + r0 = _mm_shuffle_epi8(r0, r2); + r1 = _mm_shuffle_epi8(r1, r3); + + _mm_storeu_si128((__m128i *)dataPtr, r0); + dataPtr += len_lut[keys & 0xFF]; + _mm_storeu_si128((__m128i *)dataPtr, r1); + dataPtr += len_lut[keys >> 8]; + + 
*((uint16_t*)keyPtr) = (uint16_t)keys; + keyPtr += 2; + } + prev = (uint32_t)_mm_extract_epi32(Prev,3); + // do remaining + uint32_t key = 0; + for(size_t i = 0; i < (count & 7); i++) + { + uint32_t dw = in[i] - prev; prev = in[i]; + uint32_t symbol = (dw > 0x000000FF) + (dw > 0x0000FFFF) + (dw > 0x00FFFFFF); + key |= symbol << (i + i); + *((uint32_t*)dataPtr) = dw; + dataPtr += 1 + symbol; + } + memcpy(keyPtr, &key, ((count & 7) + 3) >> 2); + + return (size_t)(dataPtr - out); +} +STREAMVBYTE_UNTARGET_REGION +#endif diff --git a/src/mmap_allocator.cpp b/src/mmap_allocator.cpp new file mode 100644 index 00000000..dc71f687 --- /dev/null +++ b/src/mmap_allocator.cpp @@ -0,0 +1,269 @@ +#include "mmap_allocator.h" +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +struct mmap_file +{ + std::string filename; + + std::mutex mutex; + boost::interprocess::file_mapping mapping; + boost::interprocess::mapped_region region; + boost::interprocess::managed_external_buffer buffer; + + mmap_file(std::string const &filename, std::size_t offset = 0); + ~mmap_file(); + + void remove(); +}; + +using mmap_file_ptr = std::shared_ptr; + +struct mmap_dir_t +{ + static constexpr std::size_t increase = 1024000000; + static constexpr std::size_t alignment = 32; + + std::string mmap_dir_filename; + bool mmap_dir_created = false; + +public: + ~mmap_dir_t(); + + bool is_open(); + void open_mmap_file(std::string const &filename, size_t file_size = increase); + void resize_mmap_file(size_t add_size); + + size_t mmap_file_size = 0; + + std::vector files; +}; + +thread_local mmap_file_ptr mmap_file_thread_ptr; + +struct mmap_shm +{ + std::mutex mutex; + std::vector region; + boost::interprocess::managed_external_buffer buffer; + + static size_t mmap_file_size; + + mmap_shm(size_t size); + + static void open(size_t add_size); + static void close(); +}; + +using mmap_shm_ptr = std::shared_ptr; + +using allocator_t = 
boost::interprocess::allocator; + +static mmap_dir_t mmap_dir; + +std::vector< mmap_shm_ptr > mmap_shm_regions; +size_t mmap_shm::mmap_file_size = 0; +thread_local mmap_shm_ptr mmap_shm_thread_region_ptr; + +std::mutex mmap_allocator_mutex; + +mmap_file::mmap_file(std::string const &filename, std::size_t offset) + : mapping(filename.c_str(), boost::interprocess::read_write) + , region(mapping, boost::interprocess::read_write) + , buffer(boost::interprocess::create_only, reinterpret_cast(region.get_address()) + offset, region.get_size() - offset) + , filename(filename) +{ } + +mmap_file::~mmap_file() +{ + mapping = boost::interprocess::file_mapping(); + region = boost::interprocess::mapped_region(); + buffer = boost::interprocess::managed_external_buffer(); + + remove(); +} + +void mmap_file::remove() +{ + if(!filename.empty()) { + try { + boost::filesystem::remove(filename.c_str()); + } catch(boost::filesystem::filesystem_error &e) { + std::cout << e.what() << std::endl; + } + } +} + +void mmap_dir_t::open_mmap_file(std::string const &dir_filename, size_t file_size) +{ + mmap_dir_filename = dir_filename; + mmap_file_size = file_size; + + mmap_dir_created |= boost::filesystem::create_directory(dir_filename); + + std::string new_filename = mmap_dir_filename + "/mmap_" + std::to_string(mmap_dir.files.size()) + ".dat"; + std::cout << "Filename: " << new_filename << ", size: " << mmap_file_size << std::endl; + if(std::ofstream(new_filename.c_str()).fail()) + throw std::runtime_error("Failed to open mmap file"); + boost::filesystem::resize_file(new_filename.c_str(), 0); + boost::filesystem::resize_file(new_filename.c_str(), mmap_file_size); + mmap_file_thread_ptr = std::make_shared(new_filename.c_str()); + + mmap_dir.files.emplace_back(mmap_file_thread_ptr); +} + +void mmap_dir_t::resize_mmap_file(size_t add_size) +{ + auto offset = mmap_file_size; + auto size = increase + (add_size + alignment) - (add_size % alignment); + + std::string new_filename = mmap_dir_filename + 
"/mmap_" + std::to_string(mmap_dir.files.size()) + ".dat"; + if(std::ofstream(new_filename.c_str()).fail()) + throw std::runtime_error("Failed to open mmap file"); + boost::filesystem::resize_file(new_filename.c_str(), size); + mmap_file_thread_ptr = std::make_shared(new_filename.c_str(), 0); + + mmap_dir.files.emplace_back(mmap_file_thread_ptr); + mmap_file_size = offset + size; +} + +bool mmap_dir_t::is_open() +{ + return !mmap_dir.files.empty(); +} + + mmap_dir_t::~mmap_dir_t() +{ + if(!mmap_dir_filename.empty()) { + try { + files.clear(); + + if(mmap_dir_created) { + boost::filesystem::remove(mmap_dir_filename.c_str()); + } + } catch(boost::filesystem::filesystem_error &e) { + std::cout << e.what() << std::endl; + } + } +} + +mmap_shm::mmap_shm(size_t size) + : region(size) + , buffer(boost::interprocess::create_only, region.data(), region.size()) +{ } + +void mmap_shm::open(size_t add_size) +{ + constexpr std::size_t increase = 64000000; + constexpr std::size_t alignment = 32; + + auto size = increase + (add_size + alignment) - (add_size % alignment); + mmap_shm_thread_region_ptr = std::make_shared(size); + mmap_shm_regions.emplace_back(mmap_shm_thread_region_ptr); + mmap_file_size += size; +} + +void mmap_shm::close() +{ + mmap_shm_regions.clear(); + mmap_shm_thread_region_ptr.reset(); +} + +bool void_mmap_allocator_shutdown = false; + +void void_mmap_allocator::shutdown() { void_mmap_allocator_shutdown = true; } + +void * void_mmap_allocator::allocate(size_type n, const void *hint) +{ + while(true) { + try { + if(mmap_dir.is_open() && mmap_file_thread_ptr != nullptr) { + auto &i = *mmap_file_thread_ptr; + std::lock_guard lock(i.mutex); + allocator_t allocator(i.buffer.get_segment_manager()); + return &(*allocator.allocate(n, hint)); + } else if(mmap_shm_thread_region_ptr != nullptr) { + auto &i = *mmap_shm_thread_region_ptr; + std::lock_guard lock(i.mutex); + allocator_t allocator(i.buffer.get_segment_manager()); + return &(*allocator.allocate(n, hint)); + } 
+ } catch(boost::interprocess::bad_alloc &e) { + // This mmap file is full + } + + std::lock_guard lock(mmap_allocator_mutex); + if(mmap_dir.is_open()) + mmap_dir.resize_mmap_file(n); + else + mmap_shm::open(n); + } +} + +void void_mmap_allocator::deallocate(void *p, size_type n) +{ + destroy(p); +} + +void void_mmap_allocator::destroy(void *p) +{ + if(void_mmap_allocator_shutdown) return; + + if(mmap_shm_thread_region_ptr != nullptr) { + auto &i = *mmap_shm_thread_region_ptr; + if(p >= (void const *)i.region.data() && p < reinterpret_cast(reinterpret_cast(i.region.data()) + i.region.size())) { + allocator_t allocator(i.buffer.get_segment_manager()); + return allocator.destroy(reinterpret_cast(p)); + } + } + + if(mmap_file_thread_ptr != nullptr) { + auto &i = *mmap_file_thread_ptr; + if(p >= i.region.get_address() && p < reinterpret_cast(reinterpret_cast(i.region.get_address()) + i.region.get_size())) { + allocator_t allocator(i.buffer.get_segment_manager()); + allocator.destroy(reinterpret_cast(p)); + return; + } + } + + std::lock_guard lock(mmap_allocator_mutex); + for(auto &i: mmap_shm_regions) { + if(p >= (void const *)i->region.data() && p < reinterpret_cast(reinterpret_cast(i->region.data()) + i->region.size())) { + std::lock_guard lock(i->mutex); + allocator_t allocator(i->buffer.get_segment_manager()); + allocator.destroy(reinterpret_cast(p)); + return; + } + } + + for(auto &i: mmap_dir.files) { + if(p >= i->region.get_address() && p < reinterpret_cast(reinterpret_cast(i->region.get_address()) + i->region.get_size())) { + std::lock_guard lock(i->mutex); + allocator_t allocator(i->buffer.get_segment_manager()); + allocator.destroy(reinterpret_cast(p)); + return; + } + } +} + +void void_mmap_allocator::reportStoreSize(std::ostringstream &str) { + if (mmap_dir.mmap_file_size>0) { str << "Store size " << (mmap_dir.mmap_file_size / 1000000000) << "G | "; } +} + +void void_mmap_allocator::openMmapFile(const std::string& mmapFilename) { + 
mmap_dir.open_mmap_file(mmapFilename); +} + + + diff --git a/src/node_stores.cpp b/src/node_stores.cpp new file mode 100644 index 00000000..8c84b811 --- /dev/null +++ b/src/node_stores.cpp @@ -0,0 +1,105 @@ +#include +#include +#include "node_stores.h" + +void BinarySearchNodeStore::reopen() +{ + std::lock_guard lock(mutex); + for (auto i = 0; i < mLatpLons.size(); i++) + mLatpLons[i]->clear(); + + mLatpLons.clear(); + for (auto i = 0; i < NODE_SHARDS; i++) { + mLatpLons.push_back(std::make_unique()); + } +} + +LatpLon BinarySearchNodeStore::at(NodeID i) const { + auto shard = mLatpLons[shardPart(i)]; + auto id = idPart(i); + + auto iter = std::lower_bound(shard->begin(), shard->end(), id, [](auto const &e, auto i) { + return e.first < i; + }); + + if(iter == shard->end() || iter->first != id) + throw std::out_of_range("Could not find node with id " + std::to_string(i)); + + return iter->second; +} + +size_t BinarySearchNodeStore::size() const { + std::lock_guard lock(mutex); + uint64_t size = 0; + for (auto i = 0; i < mLatpLons.size(); i++) + size += mLatpLons[i]->size(); + + return size; +} + +void BinarySearchNodeStore::insert(const std::vector& elements) { + uint32_t newEntries[NODE_SHARDS] = {}; + std::vector iterators; + + // Before taking the lock, do a pass to find out how much + // to grow each backing collection + for (auto it = elements.begin(); it != elements.end(); it++) { + newEntries[shardPart(it->first)]++; + } + + std::lock_guard lock(mutex); + for (auto i = 0; i < NODE_SHARDS; i++) { + auto size = mLatpLons[i]->size(); + mLatpLons[i]->resize(size + newEntries[i]); + iterators.push_back(mLatpLons[i]->begin() + size); + } + + for (auto it = elements.begin(); it != elements.end(); it++) { + uint32_t shard = shardPart(it->first); + uint32_t id = idPart(it->first); + + *iterators[shard] = std::make_pair(id, it->second); + iterators[shard]++; + } +} + +void BinarySearchNodeStore::finalize(size_t threadNum) { + std::lock_guard lock(mutex); + for (auto i 
= 0; i < NODE_SHARDS; i++) { + boost::sort::block_indirect_sort( + mLatpLons[i]->begin(), mLatpLons[i]->end(), + [](auto const &a, auto const &b) { return a.first < b.first; }, + threadNum); + } +} + +void CompactNodeStore::reopen() +{ + std::lock_guard lock(mutex); + mLatpLons = std::make_unique(); +} + +LatpLon CompactNodeStore::at(NodeID i) const { + if(i >= mLatpLons->size()) + throw std::out_of_range("Could not find node with id " + std::to_string(i)); + return mLatpLons->at(i); +} + +size_t CompactNodeStore::size() const { + std::lock_guard lock(mutex); + return mLatpLons->size(); +} + +void CompactNodeStore::insert(const std::vector& elements) { + std::lock_guard lock(mutex); + for(auto const &i: elements) + insert_back(i.first, i.second); +} + +// @brief Make the store empty +void CompactNodeStore::clear() { + std::lock_guard lock(mutex); + mLatpLons->clear(); +} + + diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp index d8c1dd65..80d7c387 100644 --- a/src/osm_lua_processing.cpp +++ b/src/osm_lua_processing.cpp @@ -1,7 +1,9 @@ +#include + #include "osm_lua_processing.h" #include "attribute_store.h" #include "helpers.h" -#include +#include "coordinates_geom.h" using namespace std; diff --git a/src/osm_store.cpp b/src/osm_store.cpp index 5682a58c..98449917 100644 --- a/src/osm_store.cpp +++ b/src/osm_store.cpp @@ -5,300 +5,22 @@ #include #include -#include -#include -#include - #include -#include #include +#include "node_store.h" +#include "way_store.h" using namespace std; namespace bg = boost::geometry; -struct mmap_file -{ - std::string filename; - - std::mutex mutex; - boost::interprocess::file_mapping mapping; - boost::interprocess::mapped_region region; - boost::interprocess::managed_external_buffer buffer; - - mmap_file(std::string const &filename, std::size_t offset = 0); - ~mmap_file(); - - void remove(); -}; - -using mmap_file_ptr = std::shared_ptr; - -struct mmap_dir_t -{ - static constexpr std::size_t increase = 1024000000; - 
static constexpr std::size_t alignment = 32; - - std::string mmap_dir_filename; - bool mmap_dir_created = false; - -public: - ~mmap_dir_t(); - - bool is_open(); - void open_mmap_file(std::string const &filename, size_t file_size = increase); - void resize_mmap_file(size_t add_size); - - size_t mmap_file_size = 0; - - std::vector files; -}; - -thread_local mmap_file_ptr mmap_file_thread_ptr; - -struct mmap_shm -{ - std::mutex mutex; - std::vector region; - boost::interprocess::managed_external_buffer buffer; - - static size_t mmap_file_size; - - mmap_shm(size_t size); - - static void open(size_t add_size); - static void close(); -}; - -using mmap_shm_ptr = std::shared_ptr; - -using allocator_t = boost::interprocess::allocator; - -static mmap_dir_t mmap_dir; - -std::vector< mmap_shm_ptr > mmap_shm_regions; -size_t mmap_shm::mmap_file_size = 0; -thread_local mmap_shm_ptr mmap_shm_thread_region_ptr; - -std::mutex mmap_allocator_mutex; - -mmap_file::mmap_file(std::string const &filename, std::size_t offset) - : mapping(filename.c_str(), boost::interprocess::read_write) - , region(mapping, boost::interprocess::read_write) - , buffer(boost::interprocess::create_only, reinterpret_cast(region.get_address()) + offset, region.get_size() - offset) - , filename(filename) -{ } - -mmap_file::~mmap_file() -{ - mapping = boost::interprocess::file_mapping(); - region = boost::interprocess::mapped_region(); - buffer = boost::interprocess::managed_external_buffer(); - - remove(); -} - -void mmap_file::remove() -{ - if(!filename.empty()) { - try { - boost::filesystem::remove(filename.c_str()); - } catch(boost::filesystem::filesystem_error &e) { - std::cout << e.what() << std::endl; - } - } -} - -void mmap_dir_t::open_mmap_file(std::string const &dir_filename, size_t file_size) -{ - mmap_dir_filename = dir_filename; - mmap_file_size = file_size; - - mmap_dir_created |= boost::filesystem::create_directory(dir_filename); - - std::string new_filename = mmap_dir_filename + "/mmap_" + 
to_string(mmap_dir.files.size()) + ".dat"; - std::cout << "Filename: " << new_filename << ", size: " << mmap_file_size << std::endl; - if(std::ofstream(new_filename.c_str()).fail()) - throw std::runtime_error("Failed to open mmap file"); - boost::filesystem::resize_file(new_filename.c_str(), 0); - boost::filesystem::resize_file(new_filename.c_str(), mmap_file_size); - mmap_file_thread_ptr = std::make_shared(new_filename.c_str()); - - mmap_dir.files.emplace_back(mmap_file_thread_ptr); -} - -void mmap_dir_t::resize_mmap_file(size_t add_size) -{ - auto offset = mmap_file_size; - auto size = increase + (add_size + alignment) - (add_size % alignment); - - std::string new_filename = mmap_dir_filename + "/mmap_" + to_string(mmap_dir.files.size()) + ".dat"; - if(std::ofstream(new_filename.c_str()).fail()) - throw std::runtime_error("Failed to open mmap file"); - boost::filesystem::resize_file(new_filename.c_str(), size); - mmap_file_thread_ptr = std::make_shared(new_filename.c_str(), 0); - - mmap_dir.files.emplace_back(mmap_file_thread_ptr); - mmap_file_size = offset + size; -} - -bool mmap_dir_t::is_open() -{ - return !mmap_dir.files.empty(); -} - - mmap_dir_t::~mmap_dir_t() -{ - if(!mmap_dir_filename.empty()) { - try { - files.clear(); - - if(mmap_dir_created) { - boost::filesystem::remove(mmap_dir_filename.c_str()); - } - } catch(boost::filesystem::filesystem_error &e) { - std::cout << e.what() << std::endl; - } - } -} - -mmap_shm::mmap_shm(size_t size) - : region(size) - , buffer(boost::interprocess::create_only, region.data(), region.size()) -{ } - -void mmap_shm::open(size_t add_size) -{ - constexpr std::size_t increase = 64000000; - constexpr std::size_t alignment = 32; - - auto size = increase + (add_size + alignment) - (add_size % alignment); - mmap_shm_thread_region_ptr = std::make_shared(size); - mmap_shm_regions.emplace_back(mmap_shm_thread_region_ptr); - mmap_file_size += size; -} - -void mmap_shm::close() -{ - mmap_shm_regions.clear(); - 
mmap_shm_thread_region_ptr.reset(); -} - -bool void_mmap_allocator_shutdown = false; - -void void_mmap_allocator::shutdown() { void_mmap_allocator_shutdown = true; } - -void * void_mmap_allocator::allocate(size_type n, const void *hint) -{ - while(true) { - try { - if(mmap_dir.is_open() && mmap_file_thread_ptr != nullptr) { - auto &i = *mmap_file_thread_ptr; - std::lock_guard lock(i.mutex); - allocator_t allocator(i.buffer.get_segment_manager()); - return &(*allocator.allocate(n, hint)); - } else if(mmap_shm_thread_region_ptr != nullptr) { - auto &i = *mmap_shm_thread_region_ptr; - std::lock_guard lock(i.mutex); - allocator_t allocator(i.buffer.get_segment_manager()); - return &(*allocator.allocate(n, hint)); - } - } catch(boost::interprocess::bad_alloc &e) { - // This mmap file is full - } - - std::lock_guard lock(mmap_allocator_mutex); - if(mmap_dir.is_open()) - mmap_dir.resize_mmap_file(n); - else - mmap_shm::open(n); - } -} - -void void_mmap_allocator::deallocate(void *p, size_type n) -{ - destroy(p); -} - -void void_mmap_allocator::destroy(void *p) -{ - if(void_mmap_allocator_shutdown) return; - - if(mmap_shm_thread_region_ptr != nullptr) { - auto &i = *mmap_shm_thread_region_ptr; - if(p >= (void const *)i.region.data() && p < reinterpret_cast(reinterpret_cast(i.region.data()) + i.region.size())) { - allocator_t allocator(i.buffer.get_segment_manager()); - return allocator.destroy(reinterpret_cast(p)); - } - } - - if(mmap_file_thread_ptr != nullptr) { - auto &i = *mmap_file_thread_ptr; - if(p >= i.region.get_address() && p < reinterpret_cast(reinterpret_cast(i.region.get_address()) + i.region.get_size())) { - allocator_t allocator(i.buffer.get_segment_manager()); - allocator.destroy(reinterpret_cast(p)); - return; - } - } - - std::lock_guard lock(mmap_allocator_mutex); - for(auto &i: mmap_shm_regions) { - if(p >= (void const *)i->region.data() && p < reinterpret_cast(reinterpret_cast(i->region.data()) + i->region.size())) { - std::lock_guard lock(i->mutex); - 
allocator_t allocator(i->buffer.get_segment_manager()); - allocator.destroy(reinterpret_cast(p)); - return; - } - } - - for(auto &i: mmap_dir.files) { - if(p >= i->region.get_address() && p < reinterpret_cast(reinterpret_cast(i->region.get_address()) + i->region.get_size())) { - std::lock_guard lock(i->mutex); - allocator_t allocator(i->buffer.get_segment_manager()); - allocator.destroy(reinterpret_cast(p)); - return; - } - } -} - -void NodeStore::sort(unsigned int threadNum) { - std::lock_guard lock(mutex); - for (auto i = 0; i < NODE_SHARDS; i++) { - boost::sort::block_indirect_sort( - mLatpLons[i]->begin(), mLatpLons[i]->end(), - [](auto const &a, auto const &b) { return a.first < b.first; }, - threadNum); - } -} - -void WayStore::sort(unsigned int threadNum) { - std::lock_guard lock(mutex); - boost::sort::block_indirect_sort( - mLatpLonLists->begin(), mLatpLonLists->end(), - [](auto const &a, auto const &b) { return a.first < b.first; }, - threadNum); -} - -static inline bool isClosed(WayStore::latplon_vector_t const &way) { +static inline bool isClosed(const std::vector& way) { return way.begin() == way.end(); } void OSMStore::open(std::string const &osm_store_filename) { - mmap_dir.open_mmap_file(osm_store_filename); + void_mmap_allocator::openMmapFile(osm_store_filename); reopen(); - mmap_shm::close(); -} - -void OSMStore::nodes_sort(unsigned int threadNum) -{ - std::cout << "\nSorting nodes" << std::endl; - if(!use_compact_nodes) - nodes.sort(threadNum); -} - -void OSMStore::ways_sort(unsigned int threadNum) { - std::cout << "\nSorting ways" << std::endl; - ways.sort(threadNum); } MultiPolygon OSMStore::wayListMultiPolygon(WayVec::const_iterator outerBegin, WayVec::const_iterator outerEnd, WayVec::const_iterator innerBegin, WayVec::const_iterator innerEnd) const { @@ -466,10 +188,25 @@ void OSMStore::mergeMultiPolygonWays(std::vector &results, std::ma }; -void OSMStore::reportStoreSize(std::ostringstream &str) { - if (mmap_dir.mmap_file_size>0) { str << 
"Store size " << (mmap_dir.mmap_file_size / 1000000000) << "G | "; } -} - void OSMStore::reportSize() const { std::cout << "Stored " << nodes.size() << " nodes, " << ways.size() << " ways, " << relations.size() << " relations" << std::endl; } + +void OSMStore::reopen() { + nodes.reopen(); + ways.reopen(); + relations.reopen(); +} + +void OSMStore::ensureUsedWaysInited() { + if (!used_ways.inited) used_ways.reserve(use_compact_nodes, nodes.size()); +} + +void OSMStore::clear() { + nodes.clear(); + ways.clear(); + relations.clear(); + used_ways.clear(); +} + + diff --git a/src/output_object.cpp b/src/output_object.cpp index cf050283..b68fb27f 100644 --- a/src/output_object.cpp +++ b/src/output_object.cpp @@ -4,6 +4,7 @@ #include "output_object.h" #include "helpers.h" +#include "coordinates_geom.h" #include using namespace std; namespace geom = boost::geometry; diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp index 99a245f8..c5a3afb6 100644 --- a/src/read_pbf.cpp +++ b/src/read_pbf.cpp @@ -7,10 +7,17 @@ #include #include +#include "node_store.h" +#include "way_store.h" #include "osm_lua_processing.h" +#include "mmap_allocator.h" using namespace std; +const std::string OptionSortTypeThenID = "Sort.Type_then_ID"; +const std::string OptionLocationsOnWays = "LocationsOnWays"; +std::atomic blocksProcessed(0), blocksToProcess(0); + PbfReader::PbfReader(OSMStore &osmStore) : osmStore(osmStore) { } @@ -60,7 +67,7 @@ bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitiv } - osmStore.nodes_insert_back(nodes); + osmStore.nodes.insert(nodes); return true; } return false; @@ -72,7 +79,8 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive if (pg.ways_size() > 0) { Way pbfWay; - std::vector ways; + std::vector llWays; + std::vector>> nodeWays; for (int j=0; j nodeVec; if (locationsOnWays) { int lat=0, lon=0; for (int k=0; k(nodeId))); + llVec.push_back(osmStore.nodes.at(static_cast(nodeId))); + nodeVec.push_back(nodeId); } 
catch (std::out_of_range &err) { if (osmStore.integrity_enforced()) throw err; } @@ -108,7 +118,8 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive // If we need it for later, store the way's coordinates in the global way store if (osmStore.way_is_used(wayId)) { - ways.push_back(std::make_pair(wayId, WayStore::latplon_vector_t(llVec.begin(), llVec.end()))); + llWays.push_back(std::make_pair(wayId, WayStore::latplon_vector_t(llVec.begin(), llVec.end()))); + nodeWays.push_back(std::make_pair(wayId, nodeVec)); } output.setWay(static_cast(pbfWay.id()), llVec, tags); @@ -119,7 +130,12 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive } - osmStore.ways_insert_back(ways); + if (osmStore.ways.requiresNodes()) { + osmStore.ways.insertNodes(nodeWays); + } else { + osmStore.ways.insertLatpLons(llWays); + } + return true; } return false; @@ -157,7 +173,12 @@ bool PbfReader::ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, Prim return true; } -bool PbfReader::ReadRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb) { +bool PbfReader::ReadRelations( + OsmLuaProcessing& output, + PrimitiveGroup& pg, + const PrimitiveBlock& pb, + const BlockMetadata& blockMetadata +) { // ---- Read relations if (pg.relations_size() > 0) { @@ -170,6 +191,9 @@ bool PbfReader::ReadRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, Prim int outerKey= findStringPosition(pb, "outer"); if (typeKey >-1 && mpKey>-1) { for (int j=0; j progress, std::size_t datasize, - unordered_set const &nodeKeys, bool locationsOnWays, ReadPhase phase) +bool PbfReader::ReadBlock( + std::istream& infile, + OsmLuaProcessing& output, + const BlockMetadata& blockMetadata, + const unordered_set& nodeKeys, + bool locationsOnWays, + ReadPhase phase +) { + infile.seekg(blockMetadata.offset); + PrimitiveBlock pb; - readBlock(&pb, datasize, infile); + readBlock(&pb, blockMetadata.length, infile); if (infile.eof()) { 
return true; } @@ -232,13 +264,13 @@ bool PbfReader::ReadBlock(std::istream &infile, OsmLuaProcessing &output, std::p auto output_progress = [&]() { std::ostringstream str; - osmStore.reportStoreSize(str); - str << "Block " << progress.first << "/" << progress.second << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " \r"; + void_mmap_allocator::reportStoreSize(str); + str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " ways " << pg.ways_size() << " relations " << pg.relations_size() << " \r"; std::cout << str.str(); std::cout.flush(); }; - if(phase == ReadPhase::Nodes || phase == ReadPhase::All) { + if(phase == ReadPhase::Nodes) { bool done = ReadNodes(output, pg, pb, nodeKeyPositions); if(done) { output_progress(); @@ -247,17 +279,17 @@ bool PbfReader::ReadBlock(std::istream &infile, OsmLuaProcessing &output, std::p } } - if(phase == ReadPhase::RelationScan || phase == ReadPhase::All) { - osmStore.ensure_used_ways_inited(); + if(phase == ReadPhase::RelationScan) { + osmStore.ensureUsedWaysInited(); bool done = ScanRelations(output, pg, pb); if(done) { - std::cout << "(Scanning for ways used in relations: " << (100*progress.first/progress.second) << "%)\r"; + std::cout << "(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%)\r"; std::cout.flush(); continue; } } - if(phase == ReadPhase::Ways || phase == ReadPhase::All) { + if(phase == ReadPhase::Ways) { bool done = ReadWays(output, pg, pb, locationsOnWays); if(done) { output_progress(); @@ -266,8 +298,8 @@ bool PbfReader::ReadBlock(std::istream &infile, OsmLuaProcessing &output, std::p } } - if(phase == ReadPhase::Relations || phase == ReadPhase::All) { - bool done = ReadRelations(output, pg, pb); + if(phase == ReadPhase::Relations) { + bool done = ReadRelations(output, pg, pb, blockMetadata); if(done) { output_progress(); ++read_groups; @@ -288,11 +320,44 @@ bool PbfReader::ReadBlock(std::istream &infile, 
OsmLuaProcessing &output, std::p return false; } + // We can only delete blocks if we're confident we've processed everything, + // which is not possible in the case of subdivided blocks. + return blockMetadata.chunks == 1; +} + +bool blockHasPrimitiveGroupSatisfying( + std::istream& infile, + const BlockMetadata block, + std::function test +) { + PrimitiveBlock pb; + + // We may have previously read to EOF, so clear the internal error state + infile.clear(); + infile.seekg(block.offset); + readBlock(&pb, block.length, infile); + if (infile.eof()) { + throw std::runtime_error("blockHasPrimitiveGroupSatisfying got unexpected eof"); + } + + for (int i=0; i const &nodeKeys, unsigned int threadNum, - pbfreader_generate_stream const &generate_stream, pbfreader_generate_output const &generate_output) +int PbfReader::ReadPbfFile( + bool hasSortTypeThenID, + unordered_set const& nodeKeys, + unsigned int threadNum, + const pbfreader_generate_stream& generate_stream, + const pbfreader_generate_output& generate_output +) { auto infile = generate_stream(); @@ -303,13 +368,13 @@ int PbfReader::ReadPbfFile(unordered_set const &nodeKeys, unsigned int t readBlock(&block, readHeader(*infile).datasize(), *infile); bool locationsOnWays = false; for (std::string option : block.optional_features()) { - if (option=="LocationsOnWays") { + if (option == OptionLocationsOnWays) { std::cout << ".osm.pbf file has locations on ways" << std::endl; locationsOnWays = true; } } - std::map > blocks; + std::map blocks; // Track the filesize - note that we can't rely on tellg(), as // its meant to be an opaque token useful only for seeking. 
@@ -321,15 +386,51 @@ int PbfReader::ReadPbfFile(unordered_set const &nodeKeys, unsigned int t break; } - blocks[blocks.size()] = std::make_pair(infile->tellg(), bh.datasize()); + blocks[blocks.size()] = { (long int)infile->tellg(), bh.datasize(), true, true, true, 0, 1 }; infile->seekg(bh.datasize(), std::ios_base::cur); - } + if (hasSortTypeThenID) { + // The PBF's blocks are sorted by type, then ID. We can do a binary search + // to learn where the blocks transition between object types, which + // enables a more efficient partitioning of work for reading. + std::vector indexes; + for (int i = 0; i < blocks.size(); i++) + indexes.push_back(i); + + const auto& waysStart = std::lower_bound( + indexes.begin(), + indexes.end(), + 0, + [&blocks, &infile](const auto &i, const auto &ignored) { + return blockHasPrimitiveGroupSatisfying( + *infile, + blocks[i], + [](const PrimitiveGroup&pg) { return pg.ways_size() > 0 || pg.relations_size() > 0; } + ); + } + ); + + const auto& relationsStart = std::lower_bound( + indexes.begin(), + indexes.end(), + 0, + [&blocks, &infile](const auto &i, const auto &ignored) { + return blockHasPrimitiveGroupSatisfying( + *infile, + blocks[i], + [](const PrimitiveGroup&pg) { return pg.relations_size() > 0; } + ); + } + ); - std::mutex block_mutex; + for (auto it = indexes.begin(); it != indexes.end(); it++) { + blocks[*it].hasNodes = it <= waysStart; + blocks[*it].hasWays = it >= waysStart && it <= relationsStart; + blocks[*it].hasRelations = it >= relationsStart; + } + } - std::size_t total_blocks = blocks.size(); // PBFs generated by Osmium have 8,000 entities per block, // and each block is about 64KB. @@ -340,7 +441,7 @@ int PbfReader::ReadPbfFile(unordered_set const &nodeKeys, unsigned int t // Osmium PBFs seem to be processed about 3x faster than osmconvert // PBFs, so try to hint to the user when they could speed up their // pipeline. 
- if (filesize / total_blocks > 1000000) { + if (filesize / blocks.size() > 1000000) { std::cout << "warning: PBF has very large blocks, which may slow processing" << std::endl; std::cout << " to fix: osmium cat -f pbf your-file.osm.pbf -o optimized.osm.pbf" << std::endl; } @@ -350,18 +451,76 @@ int PbfReader::ReadPbfFile(unordered_set const &nodeKeys, unsigned int t for(auto phase: all_phases) { // Launch the pool with threadNum threads boost::asio::thread_pool pool(threadNum); + std::mutex block_mutex; + + // If we're in ReadPhase::Relations and there aren't many blocks left + // to read, increase parallelism by letting each thread only process + // a portion of the block. + if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) { + std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl; + std::map moreBlocks; + for (const auto& block : blocks) { + BlockMetadata newBlock = block.second; + newBlock.chunks = threadNum; + for (size_t i = 0; i < threadNum; i++) { + newBlock.chunk = i; + moreBlocks[moreBlocks.size()] = newBlock; + } + } + blocks = moreBlocks; + } + + std::deque> blockRanges; + std::map filteredBlocks; + for (const auto& entry : blocks) { + if ((phase == ReadPhase::Nodes && entry.second.hasNodes) || + (phase == ReadPhase::RelationScan && entry.second.hasRelations) || + (phase == ReadPhase::Ways && entry.second.hasWays) || + (phase == ReadPhase::Relations && entry.second.hasRelations)) + filteredBlocks[entry.first] = entry.second; + } + + blocksToProcess = filteredBlocks.size(); + blocksProcessed = 0; + + // When processing blocks, we try to give each worker large batches + // of contiguous blocks, so that they might benefit from long runs + // of sorted indexes, and locality of nearby IDs. 
+ const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1; + + size_t consumed = 0; + auto it = filteredBlocks.begin(); + while(it != filteredBlocks.end()) { + std::vector blockRange; + blockRange.reserve(batchSize); + size_t max = consumed + batchSize; + for (; consumed < max && it != filteredBlocks.end(); consumed++) { + IndexedBlockMetadata ibm; + memcpy(&ibm, &it->second, sizeof(BlockMetadata)); + ibm.index = it->first; + blockRange.push_back(ibm); + it++; + } + blockRanges.push_back(blockRange); + } { - const std::lock_guard lock(block_mutex); - for(auto const &block: blocks) { - boost::asio::post(pool, [=, progress=std::make_pair(block.first, total_blocks), block=block.second, &blocks, &block_mutex, &nodeKeys]() { - auto infile = generate_stream(); - auto output = generate_output(); - - infile->seekg(block.first); - if(ReadBlock(*infile, *output, progress, block.second, nodeKeys, locationsOnWays, phase)) { - const std::lock_guard lock(block_mutex); - blocks.erase(progress.first); + for(const std::vector& blockRange: blockRanges) { + boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() { + if (phase == ReadPhase::Nodes) + osmStore.nodes.batchStart(); + if (phase == ReadPhase::Ways) + osmStore.ways.batchStart(); + + for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) { + auto infile = generate_stream(); + auto output = generate_output(); + + if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase)) { + const std::lock_guard lock(block_mutex); + blocks.erase(indexedBlockMetadata.index); + blocksProcessed++; + } } }); } @@ -370,10 +529,10 @@ int PbfReader::ReadPbfFile(unordered_set const &nodeKeys, unsigned int t pool.join(); if(phase == ReadPhase::Nodes) { - osmStore.nodes_sort(threadNum); + osmStore.nodes.finalize(threadNum); } if(phase == ReadPhase::Ways) { - osmStore.ways_sort(threadNum); + osmStore.ways.finalize(threadNum); } } return 0; @@ -409,3 +568,15 @@ int 
ReadPbfBoundingBox(const std::string &inputFile, double &minLon, double &max return 0; } +bool PbfHasOptionalFeature(const std::string& inputFile, const std::string& feature) { + fstream infile(inputFile, ios::in | ios::binary); + if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; } + HeaderBlock block; + readBlock(&block, readHeader(infile).datasize(), infile); + + for (const std::string& option: block.optional_features()) + if (option == feature) + return true; + + return false; +} diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp new file mode 100644 index 00000000..811d4e23 --- /dev/null +++ b/src/sorted_node_store.cpp @@ -0,0 +1,564 @@ +#include +#include +#include +#include +#include +#include +#include +#include "sorted_node_store.h" +#include "external/libpopcnt.h" +#include "external/streamvbyte.h" +#include "external/streamvbyte_zigzag.h" + +namespace SortedNodeStoreTypes { + const uint16_t GroupSize = 256; + const uint16_t ChunkSize = 256; + const uint16_t ChunkAlignment = 16; + const uint32_t ChunkCompressed = 1 << 31; + + std::atomic totalGroups; + std::atomic totalNodes; + std::atomic totalGroupSpace; + std::atomic totalChunks; + std::atomic chunkSizeFreqs[257]; + std::atomic groupSizeFreqs[257]; + + + // When SortedNodeStore first starts, it's not confident that it has seen an + // entire segment, so it's in "collecting orphans" mode. Once it crosses a + // threshold of 64K elements, it ceases to be in this mode. + // + // Orphans are rounded up across multiple threads, and dealt with in + // the finalize step. 
+ thread_local bool collectingOrphans = true; + thread_local uint64_t groupStart = -1; + thread_local std::vector* localNodes = nullptr; + + thread_local int64_t cachedChunk = -1; + thread_local std::vector cacheChunkLons; + thread_local std::vector cacheChunkLatps; + + thread_local uint32_t arenaSpace = 0; + thread_local char* arenaPtr = nullptr; +} + +using namespace SortedNodeStoreTypes; + +SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { + // Each group can store 64K nodes. If we allocate 256K slots + // for groups, we support 2^34 = 17B nodes, or about twice + // the number used by OSM as of November 2023. + groups.resize(256 * 1024); +} + +void SortedNodeStore::reopen() +{ + for (const auto entry: allocatedMemory) + void_mmap_allocator::deallocate(entry.first, entry.second); + allocatedMemory.clear(); + + totalNodes = 0; + totalGroups = 0; + totalGroupSpace = 0; + totalChunks = 0; + memset(chunkSizeFreqs, 0, sizeof(chunkSizeFreqs)); + memset(groupSizeFreqs, 0, sizeof(groupSizeFreqs)); + orphanage.clear(); + workerBuffers.clear(); + groups.clear(); + groups.resize(256 * 1024); +} + +SortedNodeStore::~SortedNodeStore() { + for (const auto entry: allocatedMemory) + void_mmap_allocator::deallocate(entry.first, entry.second); +} + +LatpLon SortedNodeStore::at(const NodeID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t nodeMaskByte = (id % ChunkSize) / 8; + const uint64_t nodeMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) { + throw std::out_of_range("SortedNodeStore::at(" + std::to_string(id) + ") uses non-existent group " + std::to_string(groupIndex)); + } + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = 
groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += popcnt(&maskByte, 1); + + if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no chunk"); + } + + uint16_t scaledOffset = groupPtr->chunkOffsets[chunkOffset]; + ChunkInfoBase* basePtr = (ChunkInfoBase*)(((char *)(groupPtr->chunkOffsets + popcnt(groupPtr->chunkMask, 32))) + (scaledOffset * ChunkAlignment)); + + if (basePtr->flags & ChunkCompressed) { + CompressedChunkInfo* ptr = (CompressedChunkInfo*)basePtr; + size_t latpSize = (ptr->flags >> 10) & ((1 << 10) - 1); + // TODO: we don't actually need the lonSize to decompress the data. + // May as well store it as a sanity check for now. + size_t lonSize = ptr->flags & ((1 << 10) - 1); + size_t n = popcnt(ptr->nodeMask, 32) - 1; + + const size_t neededChunk = groupIndex * ChunkSize + chunk; + + // Really naive caching strategy - just cache the last-used chunk. + // Probably good enough? 
+ if (cachedChunk != neededChunk) { + cachedChunk = neededChunk; + cacheChunkLons.reserve(256); + cacheChunkLatps.reserve(256); + + uint8_t* latpData = ptr->data; + uint8_t* lonData = ptr->data + latpSize; + uint32_t recovdata[256] = {0}; + + streamvbyte_decode(latpData, recovdata, n); + cacheChunkLatps[0] = ptr->firstLatp; + zigzag_delta_decode(recovdata, &cacheChunkLatps[1], n, cacheChunkLatps[0]); + + streamvbyte_decode(lonData, recovdata, n); + cacheChunkLons[0] = ptr->firstLon; + zigzag_delta_decode(recovdata, &cacheChunkLons[1], n, cacheChunkLons[0]); + } + + size_t nodeOffset = 0; + nodeOffset = popcnt(ptr->nodeMask, nodeMaskByte); + uint8_t maskByte = ptr->nodeMask[nodeMaskByte]; + maskByte = maskByte & ((1 << nodeMaskBit) - 1); + nodeOffset += popcnt(&maskByte, 1); + if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) + throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node"); + + return { cacheChunkLatps[nodeOffset], cacheChunkLons[nodeOffset] }; + } + + UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr; + size_t nodeOffset = 0; + nodeOffset = popcnt(ptr->nodeMask, nodeMaskByte); + uint8_t maskByte = ptr->nodeMask[nodeMaskByte]; + maskByte = maskByte & ((1 << nodeMaskBit) - 1); + nodeOffset += popcnt(&maskByte, 1); + if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) + throw std::out_of_range("SortedNodeStore: node " + std::to_string(id) + " missing, no node"); + + return ptr->nodes[nodeOffset]; +} + +size_t SortedNodeStore::size() const { + // In general, use our atomic counter - it's fastest. + return totalNodes.load(); + + /* + // This code can be useful when debugging changes to the internal structure. 
+ size_t rv = 0; + size_t totalChunks = 0; + for (const GroupInfo* group: groups) { + if (group != nullptr) { + uint64_t chunks = popcnt(group->chunkMask, 32); + totalChunks += chunks; + + for (size_t i = 0; i < chunks; i++) { + size_t rawOffset = group->chunkOffsets[i] * ChunkAlignment; + ChunkInfo* chunk = (ChunkInfo*)(((char*)(&group->chunkOffsets[chunks])) + rawOffset); + rv += popcnt(chunk->nodeMask, 32); + } + } + } + + std::cout << "SortedNodeStore::size(): totalChunks=" << totalChunks << ", size=" << rv << " (actual nodes: " << totalNodes.load() << ")" << std::endl; + return rv; + */ +} + +void SortedNodeStore::insert(const std::vector& elements) { + if (localNodes == nullptr) { + std::lock_guard lock(orphanageMutex); + if (workerBuffers.size() == 0) + workerBuffers.reserve(256); + else if (workerBuffers.size() == workerBuffers.capacity()) + throw std::runtime_error("SortedNodeStore doesn't support more than 256 cores"); + workerBuffers.push_back(std::vector()); + localNodes = &workerBuffers.back(); + } + + if (groupStart == -1) { + // Mark where the first full group starts, so we know when to transition + // out of collecting orphans. + groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + } + + int i = 0; + while (collectingOrphans && i < elements.size()) { + const element_t& el = elements[i]; + if (el.first >= groupStart + (GroupSize * ChunkSize)) { + collectingOrphans = false; + // Calculate new groupStart, rounding to previous boundary. 
+ groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*localNodes); + localNodes->clear(); + } + localNodes->push_back(el); + i++; + } + + while(i < elements.size()) { + const element_t& el = elements[i]; + + if (el.first >= groupStart + (GroupSize * ChunkSize)) { + publishGroup(*localNodes); + localNodes->clear(); + groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + } + + localNodes->push_back(el); + i++; + } +} + +void SortedNodeStore::batchStart() { + collectingOrphans = true; + groupStart = -1; + if (localNodes == nullptr || localNodes->size() == 0) + return; + + collectOrphans(*localNodes); + localNodes->clear(); +} + +void SortedNodeStore::finalize(size_t threadNum) { + for (const auto& buffer: workerBuffers) { + if (buffer.size() > 0) { + collectOrphans(buffer); + } + } + workerBuffers.clear(); + + // Empty the orphanage into the index. + std::vector copy; + for (const auto& entry: orphanage) { + for (const auto& orphan: entry.second) + copy.push_back(orphan); + + // Orphans may come from different workers, and thus be unsorted. 
+ std::sort( + copy.begin(), + copy.end(), + [](auto const &a, auto const &b) { return a.first < b.first; } + ); + publishGroup(copy); + copy.clear(); + } + + orphanage.clear(); + + std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes" << std::endl; + /* + for (int i = 0; i < 257; i++) + std::cout << "chunkSizeFreqs[ " << i << " ]= " << chunkSizeFreqs[i].load() << std::endl; + for (int i = 0; i < 257; i++) + std::cout << "groupSizeFreqs[ " << i << " ]= " << groupSizeFreqs[i].load() << std::endl; + */ +} + +void SortedNodeStore::collectOrphans(const std::vector& orphans) { + std::lock_guard lock(orphanageMutex); + size_t groupIndex = orphans[0].first / (GroupSize * ChunkSize); + + std::vector& vec = orphanage[groupIndex]; + const size_t i = vec.size(); + vec.resize(i + orphans.size()); + std::copy(orphans.begin(), orphans.end(), vec.begin() + i); +} + +void SortedNodeStore::publishGroup(const std::vector& nodes) { + totalNodes += nodes.size(); + if (nodes.size() == 0) { + throw std::runtime_error("SortedNodeStore: group is empty"); + } + size_t groupIndex = nodes[0].first / (GroupSize * ChunkSize); + if (groupIndex >= groups.size()) + throw std::runtime_error("SortedNodeStore: unexpected groupIndex " + std::to_string(groupIndex)); + + if (nodes.size() > ChunkSize * GroupSize) { + std::cout << "groupIndex=" << groupIndex << ", first ID=" << nodes[0].first << ", nodes.size() = " << nodes.size() << std::endl; + throw std::runtime_error("SortedNodeStore: group is too big"); + } + + totalGroups++; + + // Calculate the space we need for this group's chunks. + + // Build up the lat/lons for each chunk; we use this to + // calculate if a compressed version is more efficient. 
+ int32_t tmpLatpLons[257 * 2] = {0}; + uint32_t tmpLatpLonsZigzag[257 * 2] = {0}; + // NB that we're storing sparse indexes -- so if we had + // chunks 3, 6 and 7, only the first 3 indexes (0, 1, 2) would be set. + // compressed[chunkIndex] = 0 => no chunk, else it's the compressed size + // (or ~0 to skip compression) + uint32_t compressedLatpSize[256] = {0}; + uint32_t compressedLonSize[256] = {0}; + int64_t lastChunk = -1; + int64_t currentChunkIndex = 0; + int64_t currentNodeIndex = 0; + uint16_t numberNodesInChunk[256] = {0}; + uint8_t compressedBuffer[256 * 4 * 2]; + + for (size_t i = 0; i <= nodes.size(); i++) { + int64_t currentChunk = -1; + + if (i != nodes.size()) { + const element_t& node = nodes[i]; + currentChunk = (node.first % (GroupSize * ChunkSize)) / ChunkSize; + } + + if (lastChunk != currentChunk) { + if (lastChunk != -1) { + numberNodesInChunk[currentChunkIndex] = currentNodeIndex; + compressedLatpSize[currentChunkIndex] = ~0; + compressedLonSize[currentChunkIndex] = ~0; + + if (compressNodes) { + // Check to see if compression would help. + // Zigzag-delta-encode the lats/lons, then compress them. 
+ tmpLatpLonsZigzag[0] = tmpLatpLons[0]; + tmpLatpLonsZigzag[256] = tmpLatpLons[256]; + zigzag_delta_encode(tmpLatpLons + 1, tmpLatpLonsZigzag + 1, currentNodeIndex - 1, tmpLatpLons[0]); + zigzag_delta_encode(tmpLatpLons + 256 + 1, tmpLatpLonsZigzag + 256 + 1, currentNodeIndex - 1, tmpLatpLons[256]); + + size_t latsCompressedSize = streamvbyte_encode(tmpLatpLonsZigzag + 1, currentNodeIndex - 1, compressedBuffer); + size_t lonsCompressedSize = streamvbyte_encode(tmpLatpLonsZigzag + 256 + 1, currentNodeIndex - 1, compressedBuffer); + + size_t uncompressedSize = currentNodeIndex * 8; + size_t totalCompressedSize = + latsCompressedSize + lonsCompressedSize + // The compressed buffers + 2 * 4; // The initial delta + + // We only allot 10 bits for storing the size of the compressed array-- + // if we need more than 10 bits, we haven't actually been able to + // compress the array. + if (totalCompressedSize < uncompressedSize && latsCompressedSize < 1024 && lonsCompressedSize < 1024) { + compressedLatpSize[currentChunkIndex] = latsCompressedSize; + compressedLonSize[currentChunkIndex] = lonsCompressedSize; + } + } + + currentChunkIndex++; + currentNodeIndex = 0; + } + + lastChunk = currentChunk; + } + + tmpLatpLons[currentNodeIndex] = nodes[i].second.latp; + tmpLatpLons[currentNodeIndex + 256] = nodes[i].second.lon; + currentNodeIndex++; + } + + uint64_t chunks = currentChunkIndex; + totalChunks += chunks; + + size_t groupSpace = + sizeof(GroupInfo) + // Every group needs a GroupInfo + chunks * sizeof(uint16_t); // Offsets for each chunk in GroupInfo + + for (currentChunkIndex = 0; currentChunkIndex < 256; currentChunkIndex++) { + if (compressedLatpSize[currentChunkIndex] == 0) + break; + + size_t chunkSpace = 0; + if (compressedLatpSize[currentChunkIndex] == ~0) { + // Store uncompressed. 
+ chunkSpace = + sizeof(UncompressedChunkInfo) + + numberNodesInChunk[currentChunkIndex] * sizeof(LatpLon); + } else { + chunkSpace = + sizeof(CompressedChunkInfo) + + compressedLatpSize[currentChunkIndex] + compressedLonSize[currentChunkIndex]; + } + + // We require that chunks align on 16-byte boundaries + if (chunkSpace % ChunkAlignment != 0) + chunkSpace += ChunkAlignment - (chunkSpace % ChunkAlignment); + groupSpace += chunkSpace; + } + + // Per https://github.com/lemire/streamvbyte: + // During decoding, the library may read up to STREAMVBYTE_PADDING extra + // bytes from the input buffer (these bytes are read but never used). + // + // Thus, we need to reserve at least that much extra to ensure we don't + // have an out-of-bounds access. We could also allocate from an arena + // to amortize the cost across many groups, but with 256K groups, + // the overhead is only 4M, so who cares. + groupSpace += STREAMVBYTE_PADDING; + totalGroupSpace += groupSpace; + + GroupInfo* groupInfo = nullptr; + + if (groupSpace < 1024) { + // Avoid malloc for small groups, use a shared arena of memory + if (arenaSpace < groupSpace) { + arenaSpace = 32768; + arenaPtr = (char*)void_mmap_allocator::allocate(arenaSpace); + if (arenaPtr == nullptr) + throw std::runtime_error("SortedNodeStore: failed to allocate arena"); + std::lock_guard lock(orphanageMutex); + allocatedMemory.push_back(std::make_pair((void*)arenaPtr, arenaSpace)); + } + + arenaSpace -= groupSpace; + groupInfo = (GroupInfo*)arenaPtr; + arenaPtr += groupSpace; + } else { + groupInfo = (GroupInfo*)void_mmap_allocator::allocate(groupSpace); + if (groupInfo == nullptr) + throw std::runtime_error("SortedNodeStore: failed to allocate space for group"); + + std::lock_guard lock(orphanageMutex); + allocatedMemory.push_back(std::make_pair((void*)groupInfo, groupSpace)); + } + if (groups[groupIndex] != nullptr) + throw std::runtime_error("SortedNodeStore: group already present"); + groups[groupIndex] = groupInfo; + + lastChunk 
= -1; + uint8_t chunkMask[32], nodeMask[32]; + memset(chunkMask, 0, 32); + memset(nodeMask, 0, 32); + + currentChunkIndex = 0; + size_t numNodesInChunk = 0; + size_t chunkNodeStartIndex = 0; + + char* nextChunkInfo = (char*)&(groupInfo->chunkOffsets[chunks]); + + // NB: `i` goes past the end of `nodes` in order that we have + // the chance to publish the final ChunkInfo. We take care + // not to read past the end of `nodes`, though. + for (size_t i = 0; i <= nodes.size(); i++) { + int64_t currentChunk = -1; + + if (i != nodes.size()) { + const element_t& node = nodes[i]; + currentChunk = (node.first % (GroupSize * ChunkSize)) / ChunkSize; + } + + if (currentChunk != lastChunk) { + if (lastChunk != -1) { + // Publish a ChunkInfo. + + const size_t rawOffset = nextChunkInfo - (char*)(&groupInfo->chunkOffsets[chunks]); + const size_t scaledOffset = rawOffset / ChunkAlignment; + if (rawOffset % ChunkAlignment != 0) + throw std::runtime_error("SortedNodeStore: invalid scaledOffset for chunk"); + if (scaledOffset > 65535) + throw std::runtime_error("SortedNodeStore: scaledOffset too big (" + std::to_string(scaledOffset) + "), groupIndex=" + std::to_string(groupIndex)); + + groupInfo->chunkOffsets[currentChunkIndex] = (uint16_t)(scaledOffset); + + memcpy(((ChunkInfoBase*)nextChunkInfo)->nodeMask, nodeMask, 32); + if (compressedLatpSize[currentChunkIndex] == ~0) { + // Store uncompressed. + ((ChunkInfoBase*)nextChunkInfo)->flags = 0; + for (size_t j = chunkNodeStartIndex; j < i; j++) { + UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)nextChunkInfo; + ptr->nodes[j - chunkNodeStartIndex] = nodes[j].second; + } + } else { + // Store compressed. 
+ CompressedChunkInfo* ptr = (CompressedChunkInfo*)nextChunkInfo; + ptr->flags = ChunkCompressed | (compressedLatpSize[currentChunkIndex] << 10) | compressedLonSize[currentChunkIndex]; + + ptr->firstLatp = nodes[chunkNodeStartIndex].second.latp; + ptr->firstLon = nodes[chunkNodeStartIndex].second.lon; + for (size_t j = chunkNodeStartIndex; j < i; j++) { + tmpLatpLons[j - chunkNodeStartIndex] = nodes[j].second.latp; + tmpLatpLons[j - chunkNodeStartIndex + 256] = nodes[j].second.lon; + } + + tmpLatpLonsZigzag[0] = tmpLatpLons[0]; + tmpLatpLonsZigzag[256] = tmpLatpLons[256]; + currentNodeIndex = i - chunkNodeStartIndex; + zigzag_delta_encode(tmpLatpLons + 1, tmpLatpLonsZigzag + 1, currentNodeIndex - 1, tmpLatpLons[0]); + zigzag_delta_encode(tmpLatpLons + 256 + 1, tmpLatpLonsZigzag + 256 + 1, currentNodeIndex - 1, tmpLatpLons[256]); + + size_t latsCompressedSize = streamvbyte_encode(tmpLatpLonsZigzag + 1, currentNodeIndex - 1, ptr->data); + + if (latsCompressedSize != compressedLatpSize[currentChunkIndex]) + throw std::runtime_error("unexpected latsCompressedSize"); + size_t lonsCompressedSize = streamvbyte_encode(tmpLatpLonsZigzag + 256 + 1, currentNodeIndex - 1, ptr->data + latsCompressedSize); + if (lonsCompressedSize != compressedLonSize[currentChunkIndex]) + throw std::runtime_error("unexpected lonsCompressedSize"); + } + + size_t chunkSpace = 0; + if (compressedLatpSize[currentChunkIndex] == ~0) { + // Store uncompressed. 
+ chunkSpace = + sizeof(UncompressedChunkInfo) + + numberNodesInChunk[currentChunkIndex] * sizeof(LatpLon); + } else { + chunkSpace = + sizeof(CompressedChunkInfo) + + compressedLatpSize[currentChunkIndex] + compressedLonSize[currentChunkIndex]; + } + + // We require that chunks align on 16-byte boundaries + if (chunkSpace % ChunkAlignment != 0) + chunkSpace += ChunkAlignment - (chunkSpace % ChunkAlignment); + + nextChunkInfo += chunkSpace; + chunkSizeFreqs[numNodesInChunk]++; + + numNodesInChunk = 0; + memset(nodeMask, 0, 32); + + const uint64_t chunkMaskByte = lastChunk / 8; + const uint64_t chunkMaskBit = lastChunk % 8; + + chunkMask[chunkMaskByte] |= 1 << chunkMaskBit; + if (currentChunk != -1) + currentChunkIndex++; + } + + lastChunk = currentChunk; + chunkNodeStartIndex = i; + } + numNodesInChunk++; + + if (i != nodes.size()) { + const element_t& node = nodes[i]; + + const uint64_t nodeMaskByte = (node.first % ChunkSize) / 8; + const uint64_t nodeMaskBit = node.first % 8; + nodeMask[nodeMaskByte] |= 1 << nodeMaskBit; + } + } + + groupSizeFreqs[currentChunkIndex]++; + memcpy(groupInfo->chunkMask, chunkMask, 32); + + /* + // debug: verify that we can read every node we just wrote + for (const auto& node: nodes) { + const auto rv = at(node.first); + + if (rv.latp != node.second.latp || rv.lon != node.second.lon) + throw std::runtime_error("failed to roundtrip node ID " + std::to_string(node.first)); + } + */ +} diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp new file mode 100644 index 00000000..8fdaa806 --- /dev/null +++ b/src/sorted_way_store.cpp @@ -0,0 +1,587 @@ +#include +#include +#include +#include +#include +#include "external/libpopcnt.h" +#include "external/streamvbyte.h" +#include "external/streamvbyte_zigzag.h" +#include "sorted_way_store.h" +#include "node_store.h" + +namespace SortedWayStoreTypes { + const uint16_t GroupSize = 256; + const uint16_t ChunkSize = 256; + const size_t LargeWayAlignment = 64; + + // We encode some things 
in the length of a way's unused upper bits. + const uint16_t CompressedWay = 1 << 15; + const uint16_t ClosedWay = 1 << 14; + const uint16_t UniformUpperBits = 1 << 13; + + thread_local bool collectingOrphans = true; + thread_local uint64_t groupStart = -1; + thread_local std::vector>>* localWays = NULL; + + thread_local std::vector encodedWay; + + // C++ doesn't support variable length arrays declared on stack. + // g++ and clang support it, but msvc doesn't. Rather than pay the + // cost of a vector for every decode, we use a thread_local with room for at + // least 2,000 nodes. + thread_local uint64_t highBytes[2000]; + thread_local uint32_t uint32Buffer[2000]; + thread_local int32_t int32Buffer[2000]; + thread_local uint8_t uint8Buffer[8192]; + + std::atomic totalWays; + std::atomic totalNodes; + std::atomic totalGroups; + std::atomic totalGroupSpace; + std::atomic totalChunks; +} + +using namespace SortedWayStoreTypes; + +SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) { + // Each group can store 64K ways. If we allocate 32K slots, + // we support 2^31 = 2B ways, or about twice the number used + // by OSM as of December 2023. 
+ groups.resize(32 * 1024); +} + +SortedWayStore::~SortedWayStore() { + for (const auto entry: allocatedMemory) + void_mmap_allocator::deallocate(entry.first, entry.second); +} + +void SortedWayStore::reopen() { + for (const auto entry: allocatedMemory) + void_mmap_allocator::deallocate(entry.first, entry.second); + allocatedMemory.clear(); + + totalWays = 0; + totalNodes = 0; + totalGroups = 0; + totalGroupSpace = 0; + totalChunks = 0; + orphanage.clear(); + workerBuffers.clear(); + groups.clear(); + groups.resize(256 * 1024); + +} + +std::vector SortedWayStore::at(WayID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t wayMaskByte = (id % ChunkSize) / 8; + const uint64_t wayMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) { + throw std::out_of_range("SortedWayStore::at(" + std::to_string(id) + ") uses non-existent group " + std::to_string(groupIndex)); + } + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += popcnt(&maskByte, 1); + + if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + throw std::out_of_range("SortedWayStore: way " + std::to_string(id) + " missing, no chunk"); + } + + ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]); + const size_t numWays = popcnt(chunkPtr->smallWayMask, 32) + popcnt(chunkPtr->bigWayMask, 32); + + uint8_t* const endOfWayOffsetPtr = (uint8_t*)(chunkPtr->wayOffsets + numWays); + EncodedWay* wayPtr = nullptr; + + { + size_t wayOffset = 0; + wayOffset = popcnt(chunkPtr->smallWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->smallWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << 
wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (chunkPtr->smallWayMask[wayMaskByte] & (1 << wayMaskBit)) { + wayPtr = (EncodedWay*)(endOfWayOffsetPtr + chunkPtr->wayOffsets[wayOffset]); + } + } + + // If we didn't find it in small ways, look in big ways. + if (wayPtr == nullptr) { + size_t wayOffset = 0; + wayOffset += popcnt(chunkPtr->smallWayMask, 32); + wayOffset += popcnt(chunkPtr->bigWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->bigWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (!(chunkPtr->bigWayMask[wayMaskByte] & (1 << wayMaskBit))) + throw std::out_of_range("SortedWayStore: way " + std::to_string(id) + " missing, no way"); + + wayPtr = (EncodedWay*)(endOfWayOffsetPtr + chunkPtr->wayOffsets[wayOffset] * LargeWayAlignment); + } + + std::vector nodes = SortedWayStore::decodeWay(wayPtr->flags, wayPtr->data); + std::vector rv; + for (const NodeID& node : nodes) + rv.push_back(nodeStore.at(node)); + return rv; +} + +void SortedWayStore::insertLatpLons(std::vector &newWays) { + throw std::runtime_error("SortedWayStore does not support insertLatpLons"); +} + +const void SortedWayStore::insertNodes(const std::vector>>& newWays) { + // read_pbf can call with an empty array if the only ways it read were unable to + // be processed due to missing nodes, so be robust against empty way vector. + if (newWays.empty()) + return; + + if (localWays == nullptr) { + std::lock_guard lock(orphanageMutex); + if (workerBuffers.size() == 0) + workerBuffers.reserve(256); + else if (workerBuffers.size() == workerBuffers.capacity()) + throw std::runtime_error("SortedWayStore doesn't support more than 256 cores"); + workerBuffers.push_back(std::vector>>()); + localWays = &workerBuffers.back(); + } + + if (groupStart == -1) { + // Mark where the first full group starts, so we know when to transition + // out of collecting orphans. 
+ groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + } + + int i = 0; + while (collectingOrphans && i < newWays.size()) { + const auto& el = newWays[i]; + if (el.first >= groupStart + (GroupSize * ChunkSize)) { + collectingOrphans = false; + // Calculate new groupStart, rounding to previous boundary. + groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*localWays); + localWays->clear(); + } + localWays->push_back(el); + i++; + } + + while(i < newWays.size()) { + const auto& el = newWays[i]; + + if (el.first >= groupStart + (GroupSize * ChunkSize)) { + publishGroup(*localWays); + localWays->clear(); + groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + } + + localWays->push_back(el); + i++; + } +} + +void SortedWayStore::clear() { + // TODO: why does this function exist in addition to reopen? + reopen(); +} + +std::size_t SortedWayStore::size() const { + return totalWays.load(); +} + +void SortedWayStore::finalize(unsigned int threadNum) { + for (const auto& buffer: workerBuffers) { + if (buffer.size() > 0) { + collectOrphans(buffer); + } + } + workerBuffers.clear(); + + // Empty the orphanage into the index. + std::vector>> copy; + for (const auto& entry: orphanage) { + for (const auto& orphan: entry.second) + copy.push_back(orphan); + + // Orphans may come from different workers, and thus be unsorted. 
+ std::sort( + copy.begin(), + copy.end(), + [](auto const &a, auto const &b) { return a.first < b.first; } + ); + publishGroup(copy); + copy.clear(); + } + + orphanage.clear(); + + std::cout << "SortedWayStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalWays.load() << " ways, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes" << std::endl; +} + +void SortedWayStore::batchStart() { + collectingOrphans = true; + groupStart = -1; + if (localWays == nullptr || localWays->size() == 0) + return; + + collectOrphans(*localWays); + localWays->clear(); +} + +void SortedWayStore::collectOrphans(const std::vector>>& orphans) { + std::lock_guard lock(orphanageMutex); + size_t groupIndex = orphans[0].first / (GroupSize * ChunkSize); + + std::vector>>& vec = orphanage[groupIndex]; + const size_t i = vec.size(); + vec.resize(i + orphans.size()); + std::copy(orphans.begin(), orphans.end(), vec.begin() + i); +} + +std::vector SortedWayStore::decodeWay(uint16_t flags, const uint8_t* input) { + std::vector rv; + + bool isCompressed = flags & CompressedWay; + bool isClosed = flags & ClosedWay; + + const uint16_t length = flags & 0b0000011111111111; + + if (!(flags & UniformUpperBits)) { + // The nodes don't all share the same upper int; unpack which + // bits are set on a per-node basis. 
+ for (int i = 0; i <= (length - 1) / 2; i++) { + uint8_t byte = *input; + for (int j = i * 2; j < std::min(length, i * 2 + 2); j++) { + uint64_t highByte = 0; + highByte |= (byte & 0b00001111); + byte = byte >> 4; + highBytes[j] = (highByte << 31); + } + input++; + } + } else { + uint8_t setBits = *(uint8_t*)input; + input++; + uint64_t highByte = setBits; + highByte = highByte << 31; + for (int i = 0; i < length; i++) + highBytes[i] = highByte; + } + + if (!isCompressed) { + // Decode the low ints + uint32_t* lowIntData = (uint32_t*)input; + for (int i = 0; i < length; i++) + rv.push_back(highBytes[i] | lowIntData[i]); + } else { + uint16_t compressedLength = *(uint16_t*)input; + input += 2; + + uint32_t firstInt = *(uint32_t*)(input); + input += 4; + rv.push_back(highBytes[0] | firstInt); + + streamvbyte_decode(input, uint32Buffer, length - 1); + zigzag_delta_decode(uint32Buffer, int32Buffer, length - 1, firstInt); + for (int i = 1; i < length; i++) { + uint32_t tmp = int32Buffer[i - 1]; + rv.push_back(highBytes[i] | tmp); + } + } + + if (isClosed) + rv.push_back(rv[0]); + return rv; +}; + +uint16_t SortedWayStore::encodeWay(const std::vector& way, std::vector& output, bool compress) { + if (way.size() == 0) + throw std::runtime_error("Cannot encode an empty way"); + + if (way.size() > 2000) + throw std::runtime_error("Way had more than 2,000 nodes"); + + bool isClosed = way.size() > 1 && way[0] == way[way.size() - 1]; + output.clear(); + + // When the way is closed, store that in a single bit and omit + // the final point. + const int max = isClosed ? way.size() - 1 : way.size(); + + uint16_t rv = max; + + if (compress) + rv |= CompressedWay; + + if (isClosed) + rv |= ClosedWay; + + bool pushUpperBits = false; + + // zigzag encoding can only be done on ints, not uints, so we shift + // 31 bits, not 32. 
+ uint32_t upperInt = way[0] >> 31; + for (int i = 1; i < way.size(); i++) { + if (way[i] >> 31 != upperInt) { + pushUpperBits = true; + break; + } + } + + if (pushUpperBits) { + for (int i = 0; i <= (max - 1) / 2; i++) { + uint8_t byte = 0; + + bool first = true; + for (int j = std::min(max, i * 2 + 2) - 1; j >= i * 2; j--) { + if (!first) + byte = byte << 4; + first = false; + const uint64_t upper4Bits = way[j] >> 31; // keep full width: an 8-bit temporary would wrap before the range check below + if (upper4Bits > 15) + throw std::runtime_error("unexpectedly high node ID: " + std::to_string(way[j])); + byte |= upper4Bits; + } + + output.push_back(byte); + } + } else { + if (upperInt > 15) + throw std::runtime_error("unexpectedly high node ID"); + + rv |= UniformUpperBits; + output.push_back(upperInt); + } + + // Push the low bytes. + if (!compress) { + const size_t oldSize = output.size(); + output.resize(output.size() + max * 4); + uint32_t* dataStart = (uint32_t*)(output.data() + oldSize); + for (int i = 0; i < max; i++) { + uint32_t lowBits = way[i]; + lowBits = lowBits & 0x7FFFFFFF; + dataStart[i] = lowBits; + } + } else { + for (int i = 0; i < max; i++) { + uint32_t truncated = way[i]; + truncated = truncated & 0x7FFFFFFF; + int32Buffer[i] = truncated; + } + + zigzag_delta_encode(int32Buffer + 1, uint32Buffer, max - 1, int32Buffer[0]); + + size_t compressedSize = streamvbyte_encode(uint32Buffer, max - 1, uint8Buffer); + + const size_t oldSize = output.size(); + output.resize(output.size() + 2 /* compressed size */ + 4 /* first 32-bit value */ + compressedSize); + *(uint16_t*)(output.data() + oldSize) = compressedSize; + *(uint32_t*)(output.data() + oldSize + 2) = way[0]; + *(uint32_t*)(output.data() + oldSize + 2) &= 0x7FFFFFFF; + + memcpy(output.data() + oldSize + 2 + 4, uint8Buffer, compressedSize); + } + + return rv; +} + +void populateMask(uint8_t* mask, const std::vector& ids) { + // mask should be a 32-byte array of uint8_t + memset(mask, 0, 32); + for (const uint8_t id : ids) { + const uint64_t maskByte = id / 8; + const uint64_t 
maskBit = id % 8; + + mask[maskByte] |= 1 << maskBit; + } +} + +void SortedWayStore::publishGroup(const std::vector>>& ways) { + totalWays += ways.size(); + if (ways.size() == 0) { + throw std::runtime_error("SortedWayStore: group is empty"); + } + size_t groupIndex = ways[0].first / (GroupSize * ChunkSize); + + if (groupIndex >= groups.size()) + throw std::runtime_error("SortedWayStore: unexpected groupIndex " + std::to_string(groupIndex)); + + if (ways.size() > ChunkSize * GroupSize) { + std::cout << "groupIndex=" << groupIndex << ", first ID=" << ways[0].first << ", ways.size() = " << ways.size() << std::endl; + throw std::runtime_error("SortedWayStore: group is too big"); + } + + totalGroups++; + + struct ChunkData { + uint8_t id; + std::vector wayIds; + std::vector wayFlags; + std::deque> encodedWays; + }; + + std::deque chunks; + + + ChunkData* lastChunk = nullptr; + + // Encode the ways and group by chunk - don't allocate final memory yet. + uint32_t seenNodes = 0; + for (const auto& way : ways) { + seenNodes += way.second.size(); + const uint8_t currentChunk = (way.first % (GroupSize * ChunkSize)) / ChunkSize; + + if (lastChunk == nullptr || lastChunk->id != currentChunk) { + totalChunks++; + chunks.push_back({}); + lastChunk = &chunks.back(); + lastChunk->id = currentChunk; + } + const WayID id = way.first; + lastChunk->wayIds.push_back(id % ChunkSize); + + uint16_t flags = encodeWay(way.second, encodedWay, compressWays && way.second.size() >= 4); + lastChunk->wayFlags.push_back(flags); + + std::vector encoded; + encoded.resize(encodedWay.size()); + memcpy(encoded.data(), encodedWay.data(), encodedWay.size()); + + lastChunk->encodedWays.push_back(std::move(encoded)); + } + totalNodes += seenNodes; + + // We now have the sizes of everything, so we can generate the final memory layout. + + // 1. 
compute the memory that is needed + size_t groupSpace = sizeof(GroupInfo); // every group needs a GroupInfo + groupSpace += chunks.size() * sizeof(uint32_t); // every chunk needs a 32-bit offset + groupSpace += chunks.size() * sizeof(ChunkInfo); // every chunk needs a ChunkInfo + for (const auto& chunk : chunks) { + groupSpace += chunk.wayIds.size() * sizeof(uint16_t); // every way needs a 16-bit offset + + // Ways that are < 256 bytes get stored in the small ways buffer with + // no wasted space. Ways that are >= 256 bytes are stored in the large ways + // buffer with some wasted space. + + size_t smallWaySize = 0; + size_t largeWaySize = 0; + for (int i = 0; i < chunk.wayIds.size(); i++) { + size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay); + if (waySize < 256) { + smallWaySize += waySize; + } else { + largeWaySize += (((waySize - 1) / LargeWayAlignment) + 1) * LargeWayAlignment; + } + } + + groupSpace += smallWaySize; + + if (smallWaySize % LargeWayAlignment != 0) + groupSpace += LargeWayAlignment - (smallWaySize % LargeWayAlignment); + groupSpace += largeWaySize; + } + // During decoding, the library may read up to STREAMVBYTE_PADDING extra + // bytes -- ensure that won't cause out-of-bounds reads. + groupSpace += STREAMVBYTE_PADDING; + + totalGroupSpace += groupSpace; + + // 2. allocate and track the memory + GroupInfo* groupInfo = nullptr; + { + groupInfo = (GroupInfo*)void_mmap_allocator::allocate(groupSpace); + if (groupInfo == nullptr) + throw std::runtime_error("SortedWayStore: failed to allocate space for group"); + std::lock_guard lock(orphanageMutex); + allocatedMemory.push_back(std::make_pair((void*)groupInfo, groupSpace)); + } + + if (groups[groupIndex] != nullptr) + throw std::runtime_error("SortedWayStore: group already present"); + groups[groupIndex] = groupInfo; + + // 3. 
populate the masks and offsets + std::vector chunkIds; + chunkIds.reserve(chunks.size()); + for (const auto& chunk : chunks) + chunkIds.push_back(chunk.id); + populateMask(groupInfo->chunkMask, chunkIds); + + ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupInfo->chunkOffsets + (sizeof(uint32_t) * chunks.size())); + + for (size_t chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) { + groupInfo->chunkOffsets[chunkIndex] = (char*)chunkPtr - (char*)groupInfo; + + // Populate: smallWayMask, bigWayMask, wayOffsets + std::vector smallWays; + std::vector bigWays; + + const ChunkData& chunk = chunks[chunkIndex]; + const size_t numWays = chunk.wayIds.size(); + for (int i = 0; i < numWays; i++) { + const size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay); + if (waySize < 256) { + smallWays.push_back(chunk.wayIds[i]); + } else { + bigWays.push_back(chunk.wayIds[i]); + } + } + populateMask(chunkPtr->smallWayMask, smallWays); + populateMask(chunkPtr->bigWayMask, bigWays); + + // Publish the small ways + uint8_t* const endOfWayOffsetPtr = (uint8_t*)(chunkPtr->wayOffsets + numWays); + uint8_t* wayStartPtr = endOfWayOffsetPtr; + int offsetIndex = 0; + for (int i = 0; i < numWays; i++) { + const size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay); + if (waySize < 256) { + chunkPtr->wayOffsets[offsetIndex] = wayStartPtr - endOfWayOffsetPtr; + EncodedWay* wayPtr = (EncodedWay*)wayStartPtr; + wayPtr->flags = chunk.wayFlags[i]; + memcpy(wayPtr->data, chunk.encodedWays[i].data(), chunk.encodedWays[i].size()); + + wayStartPtr += sizeof(EncodedWay) + chunk.encodedWays[i].size(); + offsetIndex++; + } + } + + // Publish the big ways + // Offset is scaled for big ways, so make sure we're on a multiple of LargeWayAlignment + if ((wayStartPtr - endOfWayOffsetPtr) % LargeWayAlignment != 0) + wayStartPtr += LargeWayAlignment - ((wayStartPtr - endOfWayOffsetPtr) % LargeWayAlignment); + for (int i = 0; i < numWays; i++) { + const size_t waySize = 
chunk.encodedWays[i].size() + sizeof(EncodedWay); + if (waySize >= 256) { + uint32_t spaceNeeded = (((waySize - 1) / LargeWayAlignment) + 1) * LargeWayAlignment; + uint32_t offset = wayStartPtr - endOfWayOffsetPtr; + if (offset % LargeWayAlignment != 0) + throw std::runtime_error("big way alignment error"); + + chunkPtr->wayOffsets[offsetIndex] = offset / LargeWayAlignment; + EncodedWay* wayPtr = (EncodedWay*)wayStartPtr; + wayPtr->flags = chunk.wayFlags[i]; + memcpy(wayPtr->data, chunk.encodedWays[i].data(), chunk.encodedWays[i].size()); + + wayStartPtr += spaceNeeded; + offsetIndex++; + } + } + + + // Update chunkPtr + chunkPtr = (ChunkInfo*)wayStartPtr; + } +} diff --git a/src/sorted_way_store.test.cpp b/src/sorted_way_store.test.cpp new file mode 100644 index 00000000..dcf06cc5 --- /dev/null +++ b/src/sorted_way_store.test.cpp @@ -0,0 +1,193 @@ +#include +#include "minunit.h" +#include "sorted_way_store.h" +#include "node_store.h" + +class TestNodeStore : public NodeStore { + void clear() override {} + void reopen() override {} + void batchStart() override {} + void finalize(size_t threadNum) override {} + size_t size() const override { return 1; } + LatpLon at(NodeID id) const override { + return { (int32_t)id, -(int32_t)id }; + } + void insert(const std::vector>& elements) override {} +}; + +void roundtripWay(const std::vector& way) { + bool compress = false; + + for (int i = 0; i < 2; i++) { + std::vector output; + uint16_t flags = SortedWayStore::encodeWay(way, output, compress); + + if (false) { + std::cout << "input="; + for (const auto& node : way) { + std::cout << node << " "; + } + std::cout << std::endl; + std::cout << "flags=" << flags << ", output.size()=" << output.size() << ", "; + + for (const uint8_t byte : output) + std::cout << " " << std::to_string(byte); + std::cout << std::endl; + } + + const std::vector roundtrip = SortedWayStore::decodeWay(flags, &output[0]); + + mu_check(roundtrip.size() == way.size()); + for (int i = 0; i < way.size(); 
i++) { + //std::cout << "roundtrip[" << i << "]=" << roundtrip[i] << ", way[" << i << "]=" << way[i] << std::endl; + mu_check(roundtrip[i] == way[i]); + } + compress = !compress; + } +} + +MU_TEST(test_encode_way) { + roundtripWay({ 1 }); + roundtripWay({ 1, 2 }); + roundtripWay({ 1, 2, 1 }); + roundtripWay({ 1, 2, 3, 4 }); + roundtripWay({ 4294967295, 4294967297, 8589934592, 4, 5 }); + // 11386679771 uses the full lower 32-bits, so is a good test case that + // zigzag encoding hasn't broken anything. + roundtripWay({ 5056880431, 538663248, 538663257, 538663260, 538663263, 11386679771, 538663266 }); + + // When the high bytes are all the same, it should take + // less space to encode. + { + std::vector output; + SortedWayStore::encodeWay({ 1, 2, 3, 4 }, output, false); + const uint16_t l1 = output.size(); + + SortedWayStore::encodeWay({ 1, 8589934592, 3, 4 }, output, false); + const uint16_t l2 = output.size(); + + mu_check(l1 < l2); + } +} + +MU_TEST(test_way_store) { + TestNodeStore ns; + SortedWayStore sws(true, ns); + sws.batchStart(); + + std::vector>> ways; + std::vector shortWay; + shortWay.push_back(123); + ways.push_back(std::make_pair(1, shortWay)); + ways.push_back(std::make_pair(2, shortWay)); + ways.push_back(std::make_pair(513, shortWay)); + + std::vector longWay; + for(int i = 200; i < 300; i++) + longWay.push_back(i); + ways.push_back(std::make_pair(65536, longWay)); + ways.push_back(std::make_pair(131072, longWay)); + + sws.insertNodes(ways); + sws.finalize(1); + + mu_check(sws.size() == 5); + + { + const auto& rv = sws.at(1); + mu_check(rv.size() == 1); + mu_check(rv[0].latp == 123); + } + + { + const auto& rv = sws.at(2); + mu_check(rv.size() == 1); + mu_check(rv[0].latp == 123); + } + + { + const auto& rv = sws.at(513); + mu_check(rv.size() == 1); + mu_check(rv[0].latp == 123); + } + + { + const auto& rv = sws.at(65536); + mu_check(rv.size() == 100); + mu_check(rv[0].latp == 200); + mu_check(rv[99].latp == 299); + } + + { + const auto& rv = 
sws.at(131072); + mu_check(rv.size() == 100); + mu_check(rv[0].latp == 200); + mu_check(rv[99].latp == 299); + } + + // missing things should throw std::out_of_range + + bool threw = false; + try { + sws.at(123123123); + } catch (std::out_of_range &e) { + threw = true; + } catch (...) {} + mu_check(threw == true); + + threw = false; + try { + sws.at(3); + } catch (std::out_of_range &e) { + threw = true; + } catch (...) {} + mu_check(threw == true); + +} + +MU_TEST(test_populate_mask) { + uint8_t mask[32]; + std::vector ids; + + { + // No ids: all 0s + populateMask(mask, ids); + for(int i = 0; i < 32; i++) + mu_check(mask[i] == 0); + } + + { + // Every id: all 1s + for(int i = 0; i < 256; i++) + ids.push_back(i); + populateMask(mask, ids); + for(int i = 0; i < 32; i++) + mu_check(mask[i] == 255); + } + + { + // Every other ID + ids.clear(); + for (int i = 0; i < 256; i += 2) + ids.push_back(i); + populateMask(mask, ids); + for(int i = 0; i < 32; i++) + mu_check(mask[i] == 0b01010101); + } +} + +MU_TEST_SUITE(test_suite_sorted_way_store) { + MU_RUN_TEST(test_encode_way); + MU_RUN_TEST(test_way_store); +} + +MU_TEST_SUITE(test_suite_bitmask) { + MU_RUN_TEST(test_populate_mask); +} + +int main() { + MU_RUN_SUITE(test_suite_sorted_way_store); + MU_RUN_SUITE(test_suite_bitmask); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/src/tile_data.cpp b/src/tile_data.cpp index 4438955f..1061b4c0 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -1,6 +1,7 @@ #include #include #include "tile_data.h" +#include "coordinates_geom.h" #include diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 9643f226..8d7a4242 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -35,16 +35,18 @@ #endif #include "geom.h" +#include "node_stores.h" +#include "way_stores.h" // Tilemaker code #include "helpers.h" #include "coordinates.h" +#include "coordinates_geom.h" #include "attribute_store.h" #include "output_object.h" #include "osm_lua_processing.h" #include "mbtiles.h" 
-#include "write_geometry.h" #include "shared_data.h" #include "read_pbf.h" @@ -170,7 +172,7 @@ int main(int argc, char* argv[]) { uint threadNum; string outputFile; string bbox; - bool _verbose = false, sqlite= false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false; + bool _verbose = false, sqlite= false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false; po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); desc.add_options() @@ -183,6 +185,8 @@ int main(int argc, char* argv[]) { ("process",po::value< string >(&luaFile)->default_value("process.lua"), "tag-processing Lua file") ("store", po::value< string >(&osmStoreFile), "temporary storage for node/ways/relations data") ("compact",po::bool_switch(&osmStoreCompact), "Reduce overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)") + ("no-compress-nodes", po::bool_switch(&osmStoreUncompressedNodes), "Store nodes uncompressed") + ("no-compress-ways", po::bool_switch(&osmStoreUncompressedWays), "Store ways uncompressed") ("verbose",po::bool_switch(&_verbose), "verbose error output") ("skip-integrity",po::bool_switch(&skipIntegrity), "don't enforce way/node integrity") ("threads",po::value< uint >(&threadNum)->default_value(0), "number of threads (automatically detected if 0)"); @@ -279,7 +283,35 @@ int main(int argc, char* argv[]) { } // For each tile, objects to be used in processing - OSMStore osmStore; + shared_ptr nodeStore; + + bool allPbfsHaveSortTypeThenID = true; + bool anyPbfHasLocationsOnWays = false; + + for (const std::string& file: inputFiles) { + if (ends_with(file, ".pbf")) { + allPbfsHaveSortTypeThenID = allPbfsHaveSortTypeThenID && PbfHasOptionalFeature(file, OptionSortTypeThenID); + anyPbfHasLocationsOnWays = 
anyPbfHasLocationsOnWays || PbfHasOptionalFeature(file, OptionLocationsOnWays); + } + } + + if (osmStoreCompact) + nodeStore = make_shared(); + else { + if (allPbfsHaveSortTypeThenID) + nodeStore = make_shared(!osmStoreUncompressedNodes); + else + nodeStore = make_shared(); + } + + shared_ptr wayStore; + if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { + wayStore = make_shared(!osmStoreUncompressedWays, *nodeStore.get()); // way compression is controlled by --no-compress-ways, independently of the node flag + } else { + wayStore = make_shared(); + } + + OSMStore osmStore(*nodeStore.get(), *wayStore.get()); osmStore.use_compact_store(osmStoreCompact); osmStore.enforce_integrity(!skipIntegrity); if(!osmStoreFile.empty()) { @@ -335,15 +367,20 @@ int main(int argc, char* argv[]) { ifstream infile(inputFile, ios::in | ios::binary); if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; } - int ret = pbfReader.ReadPbfFile(nodeKeys, threadNum, - [&]() { + const bool hasSortTypeThenID = PbfHasOptionalFeature(inputFile, OptionSortTypeThenID); + int ret = pbfReader.ReadPbfFile( + hasSortTypeThenID, + nodeKeys, + threadNum, + [&]() { thread_local std::shared_ptr pbfStream(new ifstream(inputFile, ios::in | ios::binary)); return pbfStream; }, [&]() { thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore)); return osmLuaProcessing; - }); + } + ); if (ret != 0) return ret; } attributeStore.finalize(); @@ -420,13 +457,17 @@ int main(int argc, char* argv[]) { cout << "Reading tile " << srcZ << ": " << srcX << "," << srcY << " (" << (run+1) << "/" << runs << ")" << endl; vector pbf = mapsplitFile.readTile(srcZ,srcX,tmsY); - int ret = pbfReader.ReadPbfFile(nodeKeys, 1, - [&]() { + int ret = pbfReader.ReadPbfFile( + false, + nodeKeys, + 1, + [&]() { return make_unique(pbf.data(), pbf.size(), ios::in | ios::binary); }, [&]() { return std::make_unique(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore); - }); + } + ); 
if (ret != 0) return ret; tileList.pop_back(); diff --git a/src/way_stores.cpp b/src/way_stores.cpp new file mode 100644 index 00000000..05d884d0 --- /dev/null +++ b/src/way_stores.cpp @@ -0,0 +1,54 @@ +#include + +#include "way_stores.h" + +void BinarySearchWayStore::finalize(unsigned int threadNum) { + std::lock_guard lock(mutex); + boost::sort::block_indirect_sort( + mLatpLonLists->begin(), mLatpLonLists->end(), + [](auto const &a, auto const &b) { return a.first < b.first; }, + threadNum); +} + +void BinarySearchWayStore::reopen() { + mLatpLonLists = std::make_unique(); +} + +std::vector BinarySearchWayStore::at(WayID wayid) const { + std::lock_guard lock(mutex); + + auto iter = std::lower_bound(mLatpLonLists->begin(), mLatpLonLists->end(), wayid, [](auto const &e, auto wayid) { + return e.first < wayid; + }); + + if(iter == mLatpLonLists->end() || iter->first != wayid) + throw std::out_of_range("Could not find way with id " + std::to_string(wayid)); + + std::vector rv; + rv.reserve(iter->second.size()); + // TODO: copy iter->second to rv more efficiently + for (const LatpLon& el : iter->second) + rv.push_back(el); + return rv; +} + +void BinarySearchWayStore::insertLatpLons(std::vector &newWays) { + std::lock_guard lock(mutex); + auto i = mLatpLonLists->size(); + mLatpLonLists->resize(i + newWays.size()); + std::copy(std::make_move_iterator(newWays.begin()), std::make_move_iterator(newWays.end()), mLatpLonLists->begin() + i); +} + +const void BinarySearchWayStore::insertNodes(const std::vector>>& newWays) { + throw std::runtime_error("BinarySearchWayStore does not support insertNodes"); +} + +void BinarySearchWayStore::clear() { + std::lock_guard lock(mutex); + mLatpLonLists->clear(); +} + +std::size_t BinarySearchWayStore::size() const { + std::lock_guard lock(mutex); + return mLatpLonLists->size(); +}