From fee6184c8246b29bd034f90d0bf9cd4c0bca587b Mon Sep 17 00:00:00 2001
From: Jan Wassenberg <janwas@google.com>
Date: Thu, 1 Sep 2022 01:44:22 -0700
Subject: [PATCH] 1.007-1.012x AVX2 sort speedup: FirstN from compress table

PiperOrigin-RevId: 471468074
---
 hwy/ops/x86_256-inl.h      | 247 ++++++++++++++++++++-----------------
 hwy/ops/x86_512-inl.h      |  10 +-
 hwy/tests/compress_test.cc |  30 ++++-
 3 files changed, 165 insertions(+), 122 deletions(-)
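Background for reviewers: each 32-bit packed_array entry below encodes eight
4-bit fields, one per u32 lane. The low 3 bits of field i select the source
lane for _mm256_permutevar8x32_epi32, and (new in this change) the field's MSB
is set for the first PopCount(mask_bits) output lanes, i.e. exactly the lanes
FirstN would cover. The following scalar sketch mirrors the
PrintCompress32x8Tables generator changed at the end of this patch; it is
illustrative only, not part of the patch:

  #include <cstddef>
  #include <cstdint>

  // Builds the packed entry for one mask; e.g. mask_bits = 0b101 (lanes 0
  // and 2 active) yields 0x765431a8, matching packed_array[5] below.
  uint32_t PackedEntry(uint64_t mask_bits) {
    uint32_t indices[8];
    size_t pos = 0;
    for (uint32_t i = 0; i < 8; ++i) {  // compressed (mask=true) lanes first
      if (mask_bits & (1ull << i)) indices[pos++] = i;
    }
    const size_t count = pos;  // = PopCount(mask_bits)
    for (uint32_t i = 0; i < 8; ++i) {  // then the remaining lanes
      if (!(mask_bits & (1ull << i))) indices[pos++] = i;
    }
    uint32_t packed = 0;
    for (size_t i = 0; i < 8; ++i) {
      // New: set the nibble MSB for the first `count` lanes (FirstN).
      packed |= (indices[i] | (i < count ? 8u : 0u)) << (4 * i);
    }
    return packed;
  }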
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index b4c29a7dd1..3d5abb0049 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -4915,8 +4915,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
 namespace detail {
 
 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
-                                                uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
   const RebindToUnsigned<decltype(d)> d32;
   // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
   // of SetTableIndices would require 8 KiB, a large part of L1D. The other
@@ -4925,49 +4924,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
   // bits, for a total of 1 KiB.
   alignas(16) constexpr uint32_t packed_array[256] = {
       // PrintCompress32x8Tables
-      0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
-      0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
-      0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
-      0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
-      0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
-      0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
-      0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
-      0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
-      0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
-      0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
-      0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
-      0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
-      0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
-      0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
-      0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
-      0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
-      0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
-      0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
-      0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
-      0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
-      0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
-      0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
-      0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
-      0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
-      0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
-      0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
-      0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
-      0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
-      0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
-      0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
-      0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
-      0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
-      0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
-      0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
-      0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
-      0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
-      0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
-      0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
-      0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
-      0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
-      0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
-      0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
-      0x10765432, 0x17654320, 0x07654321, 0x76543210};
+      0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
+      0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
+      0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
+      0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
+      0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
+      0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
+      0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
+      0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
+      0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
+      0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
+      0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
+      0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
+      0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
+      0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
+      0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
+      0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
+      0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
+      0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
+      0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
+      0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
+      0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
+      0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
+      0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
+      0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
+      0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
+      0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
+      0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
+      0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
+      0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
+      0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
+      0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
+      0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
+      0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
+      0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
+      0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
+      0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
+      0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
+      0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
+      0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
+      0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
+      0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
+      0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
+      0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
 
   // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
   // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
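For reference while reading the tables: the decode broadcasts one packed_array
entry to all eight u32 lanes and right-shifts lane i by 4*i, so each lane ends
with its own nibble in the low bits. The permute then uses only bits 0..2,
which is why the new MSB flag is harmless there. A scalar model (illustrative,
not part of the patch):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t packed = 0x765431a8;  // packed_array[0b101]
    for (uint32_t i = 0; i < 8; ++i) {
      const uint32_t lane = packed >> (4 * i);  // per-lane variable shift
      // The permute reads lane & 7; CompressBlendedStore reads bit 3 (FirstN).
      std::printf("lane %u: src %u, FirstN %u\n", i, lane & 7u, (lane >> 3) & 1u);
    }
    return 0;  // lanes 0 and 1 report FirstN=1, i.e. PopCount(0b101) = 2
  }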
@@ -4975,12 +4974,11 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
   // latency, it may be faster to use LoadDup128 and PSHUFB.
   const auto packed = Set(d32, packed_array[mask_bits]);
   alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
+  return packed >> Load(d32, shifts);
 }
 
 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
-                                                uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
   const Repartition<uint32_t, decltype(d)> d32;
 
   // For 64-bit, we still need 32-bit indices because there is no 64-bit
@@ -4988,18 +4986,20 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
   // unpacking and load the entire index vector directly.
   alignas(32) constexpr uint32_t u32_indices[128] = {
       // PrintCompress64x4PairTables
-      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
-      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
-      2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
-      0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
-      0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
-      2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
-  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  2,  3,  4,  5,  6,  7,
+      10, 11, 0,  1,  4,  5,  6,  7,  8,  9,  10, 11, 4,  5,  6,  7,
+      12, 13, 0,  1,  2,  3,  6,  7,  8,  9,  12, 13, 2,  3,  6,  7,
+      10, 11, 12, 13, 0,  1,  6,  7,  8,  9,  10, 11, 12, 13, 6,  7,
+      14, 15, 0,  1,  2,  3,  4,  5,  8,  9,  14, 15, 2,  3,  4,  5,
+      10, 11, 14, 15, 0,  1,  4,  5,  8,  9,  10, 11, 14, 15, 4,  5,
+      12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  12, 13, 14, 15, 2,  3,
+      10, 11, 12, 13, 14, 15, 0,  1,  8,  9,  10, 11, 12, 13, 14, 15};
+  return Load(d32, u32_indices + 8 * mask_bits);
 }
 
 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
-                                                   uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
+                                               uint64_t mask_bits) {
   const RebindToUnsigned<decltype(d)> d32;
   // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
   // of SetTableIndices would require 8 KiB, a large part of L1D. The other
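In the 64-bit variants above (and their Not counterparts below), each 64-bit
lane expands into a pair of u32 indices (2i, 2i+1), and the FirstN flag
(value 8) is added to both halves so that the later ShiftLeft<28> sets the MSB
of both 32-bit halves of a selected lane. A scalar sketch mirroring
PrintCompress64x4PairTables (illustrative names, not part of the patch):

  #include <cstddef>
  #include <cstdint>

  // Produces one 8-entry row of u32_indices for a 4-bit mask.
  void PairRow(uint64_t mask_bits, uint32_t row[8]) {
    uint32_t lanes[4];
    size_t pos = 0;
    for (uint32_t i = 0; i < 4; ++i) {  // compressed (mask=true) lanes first
      if (mask_bits & (1ull << i)) lanes[pos++] = i;
    }
    const size_t count = pos;
    for (uint32_t i = 0; i < 4; ++i) {  // then the remaining lanes
      if (!(mask_bits & (1ull << i))) lanes[pos++] = i;
    }
    for (size_t i = 0; i < 4; ++i) {
      const uint32_t first_n = (i < count) ? 8 : 0;  // nibble MSB
      row[2 * i + 0] = 2 * lanes[i] + first_n;
      row[2 * i + 1] = 2 * lanes[i] + first_n + 1;
    }
  }

For example, mask_bits = 1 yields {8, 9, 2, 3, 4, 5, 6, 7}, the second row of
u32_indices above.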
@@ -5008,49 +5008,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
   // bits, for a total of 1 KiB.
   alignas(16) constexpr uint32_t packed_array[256] = {
       // PrintCompressNot32x8Tables
-      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
-      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
-      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
-      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
-      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
-      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
-      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
-      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
-      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
-      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
-      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
-      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
-      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
-      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
-      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
-      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
-      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
-      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
-      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
-      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
-      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
-      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
-      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
-      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
-      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
-      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
-      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
-      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
-      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
-      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
-      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
-      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
-      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
-      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
-      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
-      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
-      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
-      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
-      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
-      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
-      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
-      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
-      0x76543210, 0x76543201, 0x76543210, 0x76543210};
+      0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
+      0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
+      0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
+      0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
+      0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
+      0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
+      0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
+      0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
+      0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
+      0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
+      0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
+      0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
+      0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
+      0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
+      0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
+      0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
+      0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
+      0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
+      0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
+      0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
+      0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
+      0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
+      0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
+      0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
+      0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
+      0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
+      0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
+      0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
+      0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
+      0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
+      0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
+      0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
+      0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
+      0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
+      0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
+      0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
+      0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
+      0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
+      0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
+      0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
+      0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
+      0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
+      0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
 
   // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
   // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@@ -5058,12 +5058,12 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
   // latency, it may be faster to use LoadDup128 and PSHUFB.
   const auto packed = Set(d32, packed_array[mask_bits]);
   alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
+  return packed >> Load(d32, shifts);
 }
 
 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
-                                                   uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
+                                               uint64_t mask_bits) {
   const Repartition<uint32_t, decltype(d)> d32;
 
   // For 64-bit, we still need 32-bit indices because there is no 64-bit
@@ -5071,13 +5071,15 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
   // unpacking and load the entire index vector directly.
   alignas(32) constexpr uint32_t u32_indices[128] = {
       // PrintCompressNot64x4PairTables
-      0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
-      2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
-      0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
-      2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
-      4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
-      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
+      8,  9,  10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8,  9,
+      8,  9,  12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8,  9,  10, 11,
+      8,  9,  10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8,  9,  12, 13,
+      8,  9,  14, 15, 10, 11, 12, 13, 14, 15, 8,  9,  10, 11, 12, 13,
+      8,  9,  10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8,  9,  14, 15,
+      8,  9,  12, 13, 10, 11, 14, 15, 12, 13, 8,  9,  10, 11, 14, 15,
+      8,  9,  10, 11, 12, 13, 14, 15, 10, 11, 8,  9,  12, 13, 14, 15,
+      8,  9,  10, 11, 12, 13, 14, 15, 8,  9,  10, 11, 12, 13, 14, 15};
+  return Load(d32, u32_indices + 8 * mask_bits);
 }
 template <typename T>
 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
@@ -5085,7 +5087,9 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
   const Repartition<uint32_t, decltype(d)> du32;
 
   HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
-  const auto indices = IndicesFromBits(d, mask_bits);
+  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+  // no instruction for 4x64).
+  const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
   return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
 }
 
@@ -5135,7 +5139,9 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
   const Repartition<uint32_t, decltype(d)> du32;
 
   HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
-  const auto indices = IndicesFromNotBits(d, mask_bits);
+  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+  // no instruction for 4x64).
+  const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
   return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
 }
 
@@ -5199,7 +5205,22 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                     T* HWY_RESTRICT unaligned) {
   const uint64_t mask_bits = detail::BitsFromMask(m);
   const size_t count = PopCount(mask_bits);
-  BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
+
+  const Repartition<uint32_t, decltype(d)> du32;
+  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
+  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+  // no instruction for 4x64). Nibble MSB encodes FirstN.
+  const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
+  // Shift nibble MSB into MSB
+  const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
+  // First cast to unsigned (RebindMask cannot change lane size)
+  const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
+  const Mask256<T> mask = RebindMask(d, mask_u);
+  const Vec256<T> compressed =
+      BitCast(d, TableLookupLanes(BitCast(du32, v),
+                                  Indices256<uint32_t>{idx_and_mask.raw}));
+
+  BlendedStore(compressed, mask, d, unaligned);
   // Workaround for MSAN not marking output as initialized (b/233326619)
 #if HWY_IS_MSAN
   __msan_unpoison(unaligned, count * sizeof(T));
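The CompressBlendedStore hunk above is presumably where the 1.007-1.012x
speedup in the title comes from: a single table lookup now yields both the
permute indices and the store mask, so the separate FirstN(d, count) is off
the critical path. A scalar model of the new store (illustrative only; the
real code derives the mask in-register via ShiftLeft<28> and MaskFromVec):

  #include <cstddef>
  #include <cstdint>

  // packed = packed_array[mask_bits] as in IndicesFromBits. Writes only the
  // first PopCount(mask_bits) lanes of out, as the maskstore does.
  void BlendedStoreModel(const uint32_t v[8], uint32_t packed,
                         uint32_t out[8]) {
    for (size_t i = 0; i < 8; ++i) {
      const uint32_t nibble = (packed >> (4 * i)) & 0xF;
      // Nibble MSB = FirstN flag; low 3 bits = source lane for the permute.
      if (nibble & 8) out[i] = v[nibble & 7];
    }
  }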
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index f2500e3246..62a5405d84 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3672,7 +3672,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
   // See CompressIsPartition. u64 is faster than u32.
   alignas(16) constexpr uint64_t packed_array[256] = {
-      // PrintCompress32x8Tables
+      // From PrintCompress32x8Tables, without the FirstN extension (there is
+      // no benefit to including it because 64-bit CompressStore is anyway
+      // masked, but also no harm because TableLookupLanes ignores the MSB).
       0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
       0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
       0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@@ -3781,7 +3783,7 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
-  const auto idx = LoadU(du, iota + 32 - num0);
+  const Vec512<uint16_t> idx = LoadU(du, iota + 32 - num0);
   const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
       demoted0.raw, m_upper, idx.raw, demoted1.raw)};
 #endif  // HWY_TARGET == HWY_AVX3_DL
@@ -3800,7 +3802,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
 HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
   // See CompressIsPartition. u64 is faster than u32.
   alignas(16) constexpr uint64_t packed_array[256] = {
-      // PrintCompressNot32x8Tables
+      // From PrintCompressNot32x8Tables, without the FirstN extension (there
+      // is no benefit to including it because 64-bit CompressStore is anyway
+      // masked, but also no harm because TableLookupLanes ignores the MSB).
       0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
       0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
       0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
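For contrast, the 512-bit target does not need the FirstN bits: AVX-512 has a
native masked compress-store, so the store mask never has to be materialized
from a table. A minimal sketch of that instruction (illustrative, outside
Highway):

  #include <immintrin.h>
  #include <cstdint>

  // Writes exactly PopCount(k) contiguous u64 lanes of v to out; remaining
  // memory is untouched, which is what CompressBlendedStore requires.
  void NativeCompressStore(__m512i v, __mmask8 k, uint64_t* out) {
    _mm512_mask_compressstoreu_epi64(out, k, v);
  }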
diff --git a/hwy/tests/compress_test.cc b/hwy/tests/compress_test.cc
index bcb370ab60..e2d0ef0ba9 100644
--- a/hwy/tests/compress_test.cc
+++ b/hwy/tests/compress_test.cc
@@ -305,11 +305,13 @@ void PrintCompressNot16x8Tables() {
   printf("\n");
 }
 
-// Compressed to nibbles, unpacked via variable right shift
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
 void PrintCompress32x8Tables() {
   printf("======================================= 32/64x8\n");
   constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
   for (uint64_t code = 0; code < (1ull << N); ++code) {
+    const size_t count = PopCount(code);
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
@@ -330,6 +332,10 @@
     uint64_t packed = 0;
     for (size_t i = 0; i < N; ++i) {
       HWY_ASSERT(indices[i] < N);
+      if (i < count) {
+        indices[i] |= N;
+        HWY_ASSERT(indices[i] < 0x10);
+      }
       packed += indices[i] << (i * 4);
     }
 
@@ -344,6 +350,7 @@ void PrintCompressNot32x8Tables() {
   constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
   for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
     const uint64_t code = ~not_code;
+    const size_t count = PopCount(code);
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
@@ -364,6 +371,10 @@
     uint64_t packed = 0;
     for (size_t i = 0; i < N; ++i) {
       HWY_ASSERT(indices[i] < N);
+      if (i < count) {
+        indices[i] |= N;
+        HWY_ASSERT(indices[i] < 0x10);
+      }
       packed += indices[i] << (i * 4);
     }
 
@@ -504,11 +515,13 @@ void PrintCompressNot64x4Tables() {
   printf("\n");
 }
 
-// Same as above, but prints pairs of u32 indices (for AVX2)
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
 void PrintCompress64x4PairTables() {
   printf("======================================= 64x4 u32 index\n");
   constexpr size_t N = 4;  // AVX2
   for (uint64_t code = 0; code < (1ull << N); ++code) {
+    const size_t count = PopCount(code);
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
@@ -530,8 +543,10 @@
     // interpreted modulo N. Compression is not worth the extra shift+AND
     // because the table is anyway only 512 bytes.
     for (size_t i = 0; i < N; ++i) {
-      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
-             static_cast<int>(2 * indices[i]) + 1);
+      const int first_n_bit = i < count ? 8 : 0;
+      const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+      HWY_ASSERT(low < 0x10);
+      printf("%d, %d, ", low, low + 1);
     }
   }
   printf("\n");
@@ -542,6 +557,7 @@
   constexpr size_t N = 4;  // AVX2
   for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
     const uint64_t code = ~not_code;
+    const size_t count = PopCount(code);
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
@@ -563,8 +579,10 @@
     // interpreted modulo N. Compression is not worth the extra shift+AND
     // because the table is anyway only 512 bytes.
    for (size_t i = 0; i < N; ++i) {
-      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
-             static_cast<int>(2 * indices[i]) + 1);
+      const int first_n_bit = i < count ? 8 : 0;
+      const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+      HWY_ASSERT(low < 0x10);
+      printf("%d, %d, ", low, low + 1);
     }
   }
   printf("\n");
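Finally, a usage sketch of the public API whose AVX2 path this patch
accelerates (assumes static dispatch to a single target and n a multiple of
the vector length; illustrative, not from the patch):

  #include <cstddef>
  #include <cstdint>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Writes all elements < 100 to out, densely packed; returns their number.
  // Only the selected elements of out are written (blended store).
  size_t FilterSmall(const int32_t* in, size_t n, int32_t* out) {
    const hn::ScalableTag<int32_t> d;
    size_t written = 0;
    for (size_t i = 0; i < n; i += hn::Lanes(d)) {
      const auto v = hn::LoadU(d, in + i);
      written += hn::CompressBlendedStore(v, hn::Lt(v, hn::Set(d, 100)), d,
                                          out + written);
    }
    return written;
  }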