ziglang · ominitay · Apr 9, 2023 · Apr 11, 2023 · Apr 12, 2023 · Apr 12, 2023
diff --git a/doc/langref.html.in b/doc/langref.html.in
@@ -8558,6 +8558,33 @@ test "main" {
       {#see_also|@cVaArg|@cVaCopy|@cVaEnd#}
       {#header_close#}
 
+      {#header_open|@depositBits#}
+      <pre>{#syntax#}@depositBits(source: T, mask: T) T{#endsyntax#}</pre>
+      <p>
+      {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be non-negative). {#syntax#}T{#endsyntax#} is determined by peer type resolution.
+      </p>
+      <p>
+      Transfers the contiguous low-order bits of the {#syntax#}source{#endsyntax#} operand to the bit positions of the destination that are set in {#syntax#}mask{#endsyntax#}, in order from least to most significant. All other bits in the destination are zeroed.
+      </p>
+      <p>
+      Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PDEP) in microcode. It may be faster to use an alternative method in both of these cases.
+      </p>
+      <p>
+      Example:
+      </p>
+
+      {#code_begin|test|test_depositbits_builtin#}
+const std = @import("std");
+
+test "deposit bits" {
+    comptime {
+        // `expectEqual` takes the expected value first, then the actual value.
+        // The low four nibbles of the source (4, 3, 2, 1) land in the four nibbles selected by the mask.
+        try std.testing.expectEqual(0x10203040, @depositBits(0x00001234, 0xf0f0f0f0));
+    }
+}
+      {#code_end#}
+      {#see_also|@extractBits#}
+      {#header_close#}
+
       {#header_open|@divExact#}
       <pre>{#syntax#}@divExact(numerator: T, denominator: T) T{#endsyntax#}</pre>
       <p>
@@ -8726,6 +8753,33 @@ export fn @"A function name that is a complete sentence."() void {}
       {#see_also|@export#}
       {#header_close#}
 
+      {#header_open|@extractBits#}
+      <pre>{#syntax#}@extractBits(source: T, mask: T) T{#endsyntax#}</pre>
+      <p>
+      {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be non-negative). {#syntax#}T{#endsyntax#} is determined by peer type resolution.
+      </p>
+      <p>
+      Transfers the bits of the {#syntax#}source{#endsyntax#} operand at the positions that are set in {#syntax#}mask{#endsyntax#} to the destination, packing them into its contiguous low-order bits in order from least to most significant. The remaining upper bits of the destination are zeroed.
+      </p>
+      <p>
+      Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PEXT) in microcode. It may be faster to use an alternative method in both of these cases.
+      </p>
+      <p>
+      Example:
+      </p>
+
+      {#code_begin|test|test_extractbits_builtin#}
+const std = @import("std");
+
+test "extract bits" {
+    comptime {
+        // `expectEqual` takes the expected value first, then the actual value.
+        // The four source nibbles selected by the mask (1, 3, 5, 7) are packed into the low 16 bits.
+        try std.testing.expectEqual(0x00001357, @extractBits(0x12345678, 0xf0f0f0f0));
+    }
+}
+      {#code_end#}
+      {#see_also|@depositBits#}
+      {#header_close#}
+
       {#header_open|@fence#}
       <pre>{#syntax#}@fence(order: AtomicOrder) void{#endsyntax#}</pre>
       <p>

diff --git a/lib/compiler_rt.zig b/lib/compiler_rt.zig
@@ -9,6 +9,7 @@ comptime {
     _ = @import("compiler_rt/popcount.zig");
     _ = @import("compiler_rt/bswap.zig");
     _ = @import("compiler_rt/cmp.zig");
+    _ = @import("compiler_rt/pdeppext.zig");
 
     _ = @import("compiler_rt/shift.zig");
     _ = @import("compiler_rt/negXi2.zig");

diff --git a/lib/compiler_rt/pdeppext.zig b/lib/compiler_rt/pdeppext.zig
@@ -0,0 +1,177 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const common = @import("common.zig");
+
+const Limb = u32;
+const Log2Limb = u5;
+
+// Export the software implementations of bit deposit (pdep) and bit extract (pext)
+// under their symbol names. Linkage and visibility are taken from `common.zig`.
+comptime {
+    // Bit deposit: arbitrary-width (limb slice) variant followed by the fixed-width variants.
+    @export(__pdep_bigint, .{ .name = "__pdep_bigint", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pdep_u32, .{ .name = "__pdep_u32", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pdep_u64, .{ .name = "__pdep_u64", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pdep_u128, .{ .name = "__pdep_u128", .linkage = common.linkage, .visibility = common.visibility });
+
+    // Bit extract: arbitrary-width (limb slice) variant followed by the fixed-width variants.
+    @export(__pext_bigint, .{ .name = "__pext_bigint", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pext_u32, .{ .name = "__pext_u32", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pext_u64, .{ .name = "__pext_u64", .linkage = common.linkage, .visibility = common.visibility });
+    @export(__pext_u128, .{ .name = "__pext_u128", .linkage = common.linkage, .visibility = common.visibility });
+}
+
+const endian = builtin.cpu.arch.endian();
+
+/// Returns logical limb `i` of `x`.
+/// On little-endian targets this is `x[i]`; otherwise the index is mirrored
+/// so that it counts from the end of the slice.
+inline fn limb(x: []const Limb, i: usize) Limb {
+    const index = if (endian == .little) i else x.len - 1 - i;
+    return x[index];
+}
+
+/// Returns a pointer to logical limb `i` of `x`.
+/// On little-endian targets this is `&x[i]`; otherwise the index is mirrored
+/// so that it counts from the end of the slice.
+inline fn limb_ptr(x: []Limb, i: usize) *Limb {
+    const index = if (endian == .little) i else x.len - 1 - i;
+    return &x[index];
+}
+
+/// Stores `v` into logical limb `i` of `x`.
+/// On little-endian targets this writes `x[i]`; otherwise the index is mirrored
+/// so that it counts from the end of the slice.
+inline fn limb_set(x: []Limb, i: usize, v: Limb) void {
+    const index = if (endian == .little) i else x.len - 1 - i;
+    x[index] = v;
+}
+
+// Code for bigint pdep and pext largely taken from std.math.big.int.depositBits and extractBits
+
+/// Software bit deposit for integers stored as a slice of limbs.
+///
+/// Walks the set bits of `mask` from least to most significant; the n-th set
+/// mask bit receives bit n of `source`. Every other bit of `result` is zero.
+/// Limbs are accessed through `limb`/`limb_ptr`, which resolve the limb order
+/// for the target's endianness.
+///
+/// `result` is indexed with mask limb indices, so it must be at least as long
+/// as `mask`. Empty slices are accepted and leave nothing to do.
+inline fn pdep_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void {
+    @memset(result, 0);
+
+    // A zero-width operand has no limbs; reading `limb(mask, 0)` below would be out of bounds.
+    if (mask.len == 0) return;
+
+    const limb_bits = @bitSizeOf(Limb);
+
+    var mask_limb: Limb = limb(mask, 0);
+    var mask_limb_index: usize = 0;
+    var i: usize = 0;
+
+    outer: while (true) : (i += 1) {
+        // Find the lowest set bit remaining in the mask, moving on to the next
+        // limb whenever the current one has no set bits left.
+        const mask_limb_bit: Log2Limb = limb_bit: while (true) {
+            const mask_limb_tz = @ctz(mask_limb);
+            if (mask_limb_tz != limb_bits) {
+                const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz);
+                // Clear the bit so the next search finds the following one.
+                mask_limb ^= @as(Limb, 1) << cast_limb_bit;
+                break :limb_bit cast_limb_bit;
+            }
+
+            mask_limb_index += 1;
+            // Every mask limb has been consumed: the deposit is complete.
+            if (mask_limb_index >= mask.len) break :outer;
+
+            mask_limb = limb(mask, mask_limb_index);
+        };
+
+        // `i` counts the mask bits handled so far, which is also the index of
+        // the source bit to deposit.
+        const i_limb_index = i / limb_bits;
+        const i_limb_bit: Log2Limb = @truncate(i);
+
+        // Past the end of `source`: the remaining source bits are treated as zero.
+        if (i_limb_index >= source.len) break;
+
+        const source_bit_set = limb(source, i_limb_index) & (@as(Limb, 1) << i_limb_bit) != 0;
+
+        limb_ptr(result, mask_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit;
+    }
+}
+
+/// Exported bit deposit for integers of arbitrary bit width.
+///
+/// `bits` is the bit width of the operands. `r`, `s` and `m` are each sliced to
+/// `ceil(bits / 32)` limbs and passed to `pdep_bigint` as result, source and mask.
+pub fn __pdep_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void {
+    // All three operands share the same limb count, so compute it once.
+    const limb_count = std.math.divCeil(usize, bits, 32) catch unreachable;
+
+    pdep_bigint(r[0..limb_count], s[0..limb_count], m[0..limb_count]);
+}
+
+/// Software bit extract for integers stored as a slice of limbs.
+///
+/// Walks the set bits of `mask` from least to most significant; the source bit
+/// at the position of the n-th set mask bit is written to bit n of `result`.
+/// Every other bit of `result` is zero.
+/// Limbs are accessed through `limb`/`limb_ptr`, which resolve the limb order
+/// for the target's endianness.
+///
+/// `source` is indexed with mask limb indices, so it must be at least as long
+/// as `mask`. Empty slices are accepted and leave nothing to do.
+inline fn pext_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void {
+    @memset(result, 0);
+
+    // A zero-width operand has no limbs; reading `limb(mask, 0)` below would be out of bounds.
+    if (mask.len == 0) return;
+
+    const limb_bits = @bitSizeOf(Limb);
+
+    var mask_limb: Limb = limb(mask, 0);
+    var mask_limb_index: usize = 0;
+    var i: usize = 0;
+
+    outer: while (true) : (i += 1) {
+        // Find the lowest set bit remaining in the mask, moving on to the next
+        // limb whenever the current one has no set bits left.
+        const mask_limb_bit: Log2Limb = limb_bit: while (true) {
+            const mask_limb_tz = @ctz(mask_limb);
+            if (mask_limb_tz != limb_bits) {
+                const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz);
+                // Clear the bit so the next search finds the following one.
+                mask_limb ^= @as(Limb, 1) << cast_limb_bit;
+                break :limb_bit cast_limb_bit;
+            }
+
+            mask_limb_index += 1;
+            // Every mask limb has been consumed: the extraction is complete.
+            if (mask_limb_index >= mask.len) break :outer;
+
+            mask_limb = limb(mask, mask_limb_index);
+        };
+
+        // `i` counts the mask bits handled so far, which is also the index of
+        // the result bit to write.
+        const i_limb_index = i / limb_bits;
+        const i_limb_bit: Log2Limb = @truncate(i);
+
+        // Stop once the output position would fall outside the operand width.
+        if (i_limb_index >= source.len) break;
+
+        const source_bit_set = limb(source, mask_limb_index) & (@as(Limb, 1) << mask_limb_bit) != 0;
+
+        limb_ptr(result, i_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit;
+    }
+}
+
+/// Exported bit extract for integers of arbitrary bit width.
+///
+/// `bits` is the bit width of the operands. `r`, `s` and `m` are each sliced to
+/// `ceil(bits / 32)` limbs and passed to `pext_bigint` as result, source and mask.
+pub fn __pext_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void {
+    // All three operands share the same limb count, so compute it once.
+    const limb_count = std.math.divCeil(usize, bits, 32) catch unreachable;
+
+    pext_bigint(r[0..limb_count], s[0..limb_count], m[0..limb_count]);
+}
+
+/// Software bit deposit for a fixed-width unsigned integer type `T`.
+///
+/// Walks the set bits of `mask_` from least to most significant; the n-th set
+/// mask bit receives bit n of `source`. Every other bit of the result is zero.
+inline fn pdep_uX(comptime T: type, source: T, mask_: T) T {
+    // Selects the next source bit to deposit; starts at bit 0.
+    var source_select: T = 1;
+    var result: T = 0;
+    var mask = mask_;
+
+    while (mask != 0) {
+        // Isolate the lowest set bit of the mask, then remove it from the mask.
+        const lowest = mask & ~(mask - 1);
+        mask &= ~lowest;
+
+        if ((source & source_select) != 0) result |= lowest;
+
+        // Wrapping add: when every bit of the mask is set, the selector is
+        // doubled once more after the top bit has been handled, which would
+        // overflow a plain `+=`. The wrapped value is never read.
+        source_select +%= source_select;
+    }
+
+    return result;
+}
+
+/// Exported software bit deposit for `u32`; forwards to `pdep_uX`.
+pub fn __pdep_u32(source: u32, mask: u32) callconv(.C) u32 {
+    return pdep_uX(u32, source, mask);
+}
+
+/// Exported software bit deposit for `u64`; forwards to `pdep_uX`.
+pub fn __pdep_u64(source: u64, mask: u64) callconv(.C) u64 {
+    return pdep_uX(u64, source, mask);
+}
+
+/// Exported software bit deposit for `u128`; forwards to `pdep_uX`.
+pub fn __pdep_u128(source: u128, mask: u128) callconv(.C) u128 {
+    return pdep_uX(u128, source, mask);
+}
+
+/// Software bit extract for a fixed-width unsigned integer type `T`.
+///
+/// Walks the set bits of `mask_` from least to most significant; the source
+/// bit at the position of the n-th set mask bit is written to bit n of the
+/// result. Every other bit of the result is zero.
+inline fn pext_uX(comptime T: type, source: T, mask_: T) T {
+    // Selects the next result bit to write; starts at bit 0.
+    var result_select: T = 1;
+    var result: T = 0;
+    var mask = mask_;
+
+    while (mask != 0) {
+        // Isolate the lowest set bit of the mask, then remove it from the mask.
+        const lowest = mask & ~(mask - 1);
+        mask &= ~lowest;
+
+        if ((source & lowest) != 0) result |= result_select;
+
+        // Wrapping add: when every bit of the mask is set, the selector is
+        // doubled once more after the top bit has been handled, which would
+        // overflow a plain `+=`. The wrapped value is never read.
+        result_select +%= result_select;
+    }
+
+    return result;
+}
+
+/// Exported software bit extract for `u32`; forwards to `pext_uX`.
+pub fn __pext_u32(source: u32, mask: u32) callconv(.C) u32 {
+    return pext_uX(u32, source, mask);
+}
+
+/// Exported software bit extract for `u64`; forwards to `pext_uX`.
+pub fn __pext_u64(source: u64, mask: u64) callconv(.C) u64 {
+    return pext_uX(u64, source, mask);
+}
+
+/// Exported software bit extract for `u128`; forwards to `pext_uX`.
+pub fn __pext_u128(source: u128, mask: u128) callconv(.C) u128 {
+    return pext_uX(u128, source, mask);
+}
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig
@@ -1735,6 +1735,98 @@ pub const Mutable = struct {
         y.shiftRight(y.toConst(), norm_shift);
     }
 
+    // TODO this function is quite inefficient and could be optimised
+    /// r = @depositBits(source, mask)
+    ///
+    /// Walks the set bits of `mask` from least to most significant; the n-th set
+    /// mask bit receives bit n of `source`. Every other bit of `r` is zero.
+    /// The result is marked positive and normalized before returning.
+    ///
+    /// Asserts that `source` and `mask` are positive.
+    ///
+    /// NOTE(review): `r.limbs` is indexed with mask limb indices and no capacity check is
+    /// performed here, so it presumably must hold at least as many limbs as are reached in
+    /// `mask` - confirm against callers.
+    pub fn depositBits(r: *Mutable, source: Const, mask: Const) void {
+        assert(source.positive);
+        assert(mask.positive);
+
+        r.positive = true;
+        // Start from an all-zero result; only bits selected below are set.
+        @memset(r.limbs, 0);
+
+        var mask_limb: Limb = mask.limbs[0];
+        var mask_limb_index: Limb = 0;
+        // Number of mask bits handled so far, i.e. the index of the next source bit to deposit.
+        var i: usize = 0;
+        outer: while (true) : (i += 1) {
+            // Find next bit in mask
+            const mask_limb_bit: Log2Limb = limb_bit: while (true) {
+                const mask_limb_tz = @ctz(mask_limb);
+                // A trailing-zero count equal to the limb width means the limb has no set bits left.
+                if (mask_limb_tz != @sizeOf(Limb) * 8) {
+                    const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz);
+                    // Clear the bit so the next search finds the following one.
+                    mask_limb ^= @as(Limb, 1) << cast_limb_bit;
+                    break :limb_bit cast_limb_bit;
+                }
+
+                mask_limb_index += 1;
+                // No more limbs, we've finished iterating the mask
+                if (mask_limb_index >= mask.limbs.len) {
+                    break :outer;
+                }
+
+                mask_limb = mask.limbs[mask_limb_index];
+            };
+
+            // Split the source bit index into a limb index and a bit offset within that limb.
+            const i_limb_index = i / limb_bits;
+            const i_limb_bit: Log2Limb = @truncate(i);
+
+            if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes)
+
+            const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0;
+
+            // Write the source bit at the position of the mask bit that was just found.
+            r.limbs[mask_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit;
+        }
+
+        r.normalize(r.limbs.len);
+    }
+
+    // TODO this function is quite inefficient and could be optimised
+    /// r = @extractBits(source, mask)
+    ///
+    /// Walks the set bits of `mask` from least to most significant; the source bit at the
+    /// position of the n-th set mask bit is written to bit n of `r`. Every other bit of `r`
+    /// is zero. The result is marked positive and normalized before returning.
+    ///
+    /// Asserts that `source` and `mask` are positive.
+    ///
+    /// NOTE(review): `r.limbs` is indexed by the count of mask bits handled and no capacity
+    /// check is performed here, so it presumably must be large enough to hold one bit per
+    /// set mask bit - confirm against callers.
+    pub fn extractBits(r: *Mutable, source: Const, mask: Const) void {
+        assert(source.positive);
+        assert(mask.positive);
+
+        r.positive = true;
+        // Start from an all-zero result; only bits selected below are set.
+        @memset(r.limbs, 0);
+
+        var mask_limb: Limb = mask.limbs[0];
+        var mask_limb_index: Limb = 0;
+        // Number of mask bits handled so far, i.e. the index of the next result bit to write.
+        var i: usize = 0;
+        outer: while (true) : (i += 1) {
+            // Find next bit in mask
+            const mask_limb_bit: Log2Limb = limb_bit: while (true) {
+                const mask_limb_tz = @ctz(mask_limb);
+                // A trailing-zero count equal to the limb width means the limb has no set bits left.
+                if (mask_limb_tz != @sizeOf(Limb) * 8) {
+                    const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz);
+                    // Clear the bit so the next search finds the following one.
+                    mask_limb ^= @as(Limb, 1) << cast_limb_bit;
+                    break :limb_bit cast_limb_bit;
+                }
+
+                mask_limb_index += 1;
+                // No more limbs, we've finished iterating the mask
+                if (mask_limb_index >= mask.limbs.len) {
+                    break :outer;
+                }
+
+                mask_limb = mask.limbs[mask_limb_index];
+            };
+
+            // Split the result bit index into a limb index and a bit offset within that limb.
+            const i_limb_index = i / limb_bits;
+            const i_limb_bit: Log2Limb = @truncate(i);
+
+            if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes)
+
+            // Read the source bit at the position of the mask bit that was just found.
+            const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0;
+
+            r.limbs[i_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit;
+        }
+
+        r.normalize(r.limbs.len);
+    }
+
     /// If a is positive, this passes through to truncate.
     /// If a is negative, then r is set to positive with the bit pattern ~(a - 1).
     /// r may alias a.