From 8678ca57ac772e79f7fda26aeb2404a92bff3be9 Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Tue, 14 Jan 2025 18:18:23 +0100
Subject: [PATCH 1/6] feat: improve Falcon signature verification

---
 assembly/src/ast/instruction/advice.rs        |   3 +
 assembly/src/parser/grammar.lalrpop           |   2 +
 assembly/src/parser/token.rs                  |   4 +
 core/src/sys_events.rs                        |  20 ++
 .../operations/sys_ops/sys_event_handlers.rs  |  43 ++++
 stdlib/asm/crypto/dsa/rpo_falcon512.masm      | 218 ++++++++++++------
 stdlib/tests/crypto/falcon.rs                 |  50 +++-
 7 files changed, 254 insertions(+), 86 deletions(-)

diff --git a/assembly/src/ast/instruction/advice.rs b/assembly/src/ast/instruction/advice.rs
index f6bfe807d..8ddcef758 100644
--- a/assembly/src/ast/instruction/advice.rs
+++ b/assembly/src/ast/instruction/advice.rs
@@ -13,6 +13,7 @@ use vm_core::sys_events::SystemEvent;
 #[derive(Clone, PartialEq, Eq, Debug)]
 pub enum SystemEventNode {
     PushU64Div,
+    PushFalconDiv,
     PushExt2intt,
     PushSmtPeek,
     PushMapVal,
@@ -30,6 +31,7 @@ impl From<&SystemEventNode> for SystemEvent {
         use SystemEventNode::*;
         match value {
             PushU64Div => Self::U64Div,
+            PushFalconDiv => Self::FalconDiv,
             PushExt2intt => Self::Ext2Intt,
             PushSmtPeek => Self::SmtPeek,
             PushMapVal => Self::MapValueToStack,
@@ -56,6 +58,7 @@ impl fmt::Display for SystemEventNode {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             Self::PushU64Div => write!(f, "push_u64div"),
+            Self::PushFalconDiv => write!(f, "push_falcon_div"),
             Self::PushExt2intt => write!(f, "push_ext2intt"),
             Self::PushSmtPeek => write!(f, "push_smtpeek"),
             Self::PushMapVal => write!(f, "push_mapval"),
diff --git a/assembly/src/parser/grammar.lalrpop b/assembly/src/parser/grammar.lalrpop
index 378ae1ec5..06af15b57 100644
--- a/assembly/src/parser/grammar.lalrpop
+++ b/assembly/src/parser/grammar.lalrpop
@@ -56,6 +56,7 @@ extern {
         "push_sig" => Token::PushSig,
         "push_smtpeek" => Token::PushSmtpeek,
         "push_u64div" => Token::PushU64Div,
+        "push_falcon_div" => Token::PushFalconDiv,
         "and" => Token::And,
         "assert" => Token::Assert,
         "assertz" => Token::Assertz,
@@ -677,6 +678,7 @@ SystemEvent: Instruction = {
     "adv" "." "push_sig" "." <kind:SignatureKind> => Instruction::SysEvent(SystemEventNode::PushSignature { kind }),
     "adv" "." "push_smtpeek" => Instruction::SysEvent(SystemEventNode::PushSmtPeek),
     "adv" "." "push_u64div" => Instruction::SysEvent(SystemEventNode::PushU64Div),
+    "adv" "." "push_falcon_div" => Instruction::SysEvent(SystemEventNode::PushFalconDiv),
 }
 
 #[inline]
diff --git a/assembly/src/parser/token.rs b/assembly/src/parser/token.rs
index 94c4f73fd..688a19f78 100644
--- a/assembly/src/parser/token.rs
+++ b/assembly/src/parser/token.rs
@@ -153,6 +153,7 @@ pub enum Token<'input> {
     PushSmtset,
     PushSmtget,
     PushU64Div,
+    PushFalconDiv,
     And,
     Assert,
     Assertz,
@@ -338,6 +339,7 @@ impl fmt::Display for Token<'_> {
             Token::PushSmtset => write!(f, "push_smtset"),
             Token::PushSmtget => write!(f, "push_smtget"),
             Token::PushU64Div => write!(f, "push_u64div"),
+            Token::PushFalconDiv => write!(f, "push_falcon_div"),
             Token::And => write!(f, "and"),
             Token::Assert => write!(f, "assert"),
             Token::Assertz => write!(f, "assertz"),
@@ -531,6 +533,7 @@ impl<'input> Token<'input> {
                 | Token::PushSmtset
                 | Token::PushSmtget
                 | Token::PushU64Div
+                | Token::PushFalconDiv
                 | Token::And
                 | Token::Assert
                 | Token::Assertz
@@ -676,6 +679,7 @@ impl<'input> Token<'input> {
         ("push_smtset", Token::PushSmtset),
         ("push_smtget", Token::PushSmtget),
         ("push_u64div", Token::PushU64Div),
+        ("push_falcon_div", Token::PushFalconDiv),
         ("and", Token::And),
         ("assert", Token::Assert),
         ("assertz", Token::Assertz),
diff --git a/core/src/sys_events.rs b/core/src/sys_events.rs
index 0a4c8df2a..d55416358 100644
--- a/core/src/sys_events.rs
+++ b/core/src/sys_events.rs
@@ -28,6 +28,7 @@ mod constants {
     pub const EVENT_HDWORD_TO_MAP_WITH_DOMAIN: u32    = 2822590340;
     pub const EVENT_HPERM_TO_MAP: u32                 = 3297060969;
     pub const EVENT_FALCON_SIG_TO_STACK: u32          = 3419226139;
+    pub const EVENT_FALCON_DIV: u32                   = 3419226155;
 }
 
 /// Defines a set of actions which can be initiated from the VM to inject new data into the advice
@@ -119,6 +120,22 @@ pub enum SystemEvent {
     /// the remainder respectively.
     U64Div,
 
+    /// Pushes the result of divison (both the quotient and the remainder) of a [u64] by the Falcon
+    /// prime (M = 12289) onto the advice stack.
+    ///
+    /// Inputs:
+    ///   Operand stack: [a1, a0, ...]
+    ///   Advice stack: [...]
+    ///
+    /// Outputs:
+    ///   Operand stack: [a1, a0, ...]
+    ///   Advice stack: [q0, q1, r, ...]
+    ///
+    /// Where (a0, a1) are the 32-bit limbs of the dividend (with a0 representing the 32 least
+    /// significant bits and a1 representing the 32 most significant bits).
+    /// Similarly, (q0, q1) represent the quotient and r the remainder.
+    FalconDiv,
+
     /// Given an element in a quadratic extension field on the top of the stack (i.e., a0, b1),
     /// computes its multiplicative inverse and push the result onto the advice stack.
     ///
@@ -310,6 +327,7 @@ impl SystemEvent {
             SystemEvent::MapValueToStack => EVENT_MAP_VALUE_TO_STACK,
             SystemEvent::MapValueToStackN => EVENT_MAP_VALUE_TO_STACK_N,
             SystemEvent::U64Div => EVENT_U64_DIV,
+            SystemEvent::FalconDiv => EVENT_FALCON_DIV,
             SystemEvent::Ext2Inv => EVENT_EXT2_INV,
             SystemEvent::Ext2Intt => EVENT_EXT2_INTT,
             SystemEvent::SmtPeek => EVENT_SMT_PEEK,
@@ -335,6 +353,7 @@ impl SystemEvent {
             EVENT_MAP_VALUE_TO_STACK => Some(SystemEvent::MapValueToStack),
             EVENT_MAP_VALUE_TO_STACK_N => Some(SystemEvent::MapValueToStackN),
             EVENT_U64_DIV => Some(SystemEvent::U64Div),
+            EVENT_FALCON_DIV => Some(SystemEvent::FalconDiv),
             EVENT_EXT2_INV => Some(SystemEvent::Ext2Inv),
             EVENT_EXT2_INTT => Some(SystemEvent::Ext2Intt),
             EVENT_SMT_PEEK => Some(SystemEvent::SmtPeek),
@@ -367,6 +386,7 @@ impl fmt::Display for SystemEvent {
             Self::MapValueToStack => write!(f, "map_value_to_stack"),
             Self::MapValueToStackN => write!(f, "map_value_to_stack_with_len"),
             Self::U64Div => write!(f, "div_u64"),
+            Self::FalconDiv => write!(f, "falcon_div"),
             Self::Ext2Inv => write!(f, "ext2_inv"),
             Self::Ext2Intt => write!(f, "ext2_intt"),
             Self::SmtPeek => write!(f, "smt_peek"),
diff --git a/processor/src/operations/sys_ops/sys_event_handlers.rs b/processor/src/operations/sys_ops/sys_event_handlers.rs
index f37612971..beaf30176 100644
--- a/processor/src/operations/sys_ops/sys_event_handlers.rs
+++ b/processor/src/operations/sys_ops/sys_event_handlers.rs
@@ -18,6 +18,9 @@ use crate::{
 /// The offset of the domain value on the stack in the `hdword_to_map_with_domain` system event.
 const HDWORD_TO_MAP_WITH_DOMAIN_DOMAIN_OFFSET: usize = 8;
 
+/// Falcon signature prime.
+const M: u64 = 12289;
+
 impl Process {
     pub(super) fn handle_system_event(
         &self,
@@ -38,6 +41,7 @@ impl Process {
                 copy_map_value_to_adv_stack(advice_provider, process_state, true)
             },
             SystemEvent::U64Div => push_u64_div_result(advice_provider, process_state),
+            SystemEvent::FalconDiv => push_falcon_mod_result(advice_provider, process_state),
             SystemEvent::Ext2Inv => push_ext2_inv_result(advice_provider, process_state),
             SystemEvent::Ext2Intt => push_ext2_intt_result(advice_provider, process_state),
             SystemEvent::SmtPeek => push_smtpeek_result(advice_provider, process_state),
@@ -339,6 +343,45 @@ pub fn push_u64_div_result(
     Ok(())
 }
 
+/// Pushes the result of divison (both the quotient and the remainder) of a [u64] by the Falcon
+/// prime (M = 12289) onto the advice stack.
+///
+/// Inputs:
+///   Operand stack: [a1, a0, ...]
+///   Advice stack: [...]
+///
+/// Outputs:
+///   Operand stack: [a1, a0, ...]
+///   Advice stack: [q0, q1, r, ...]
+///
+/// Where (a0, a1) are the 32-bit limbs of the dividend (with a0 representing the 32 least
+/// significant bits and a1 representing the 32 most significant bits).
+/// Similarly, (q0, q1) represent the quotient and r the remainder.
+///
+/// # Errors
+/// Returns an error if the divisor is ZERO.
+pub fn push_falcon_mod_result(
+    advice_provider: &mut impl AdviceProvider,
+    process: ProcessState,
+) -> Result<(), ExecutionError> {
+    let dividend_hi = process.get_stack_item(0).as_int();
+    let dividend_lo = process.get_stack_item(1).as_int();
+    let dividend = (dividend_hi << 32) + dividend_lo;
+
+    let quotient = dividend / M;
+    let remainder = dividend - quotient * M;
+
+    let (q_hi, q_lo) = u64_to_u32_elements(quotient);
+    let (r_hi, r_lo) = u64_to_u32_elements(remainder);
+    assert_eq!(r_hi, ZERO);
+
+    advice_provider.push_stack(AdviceSource::Value(r_lo))?;
+    advice_provider.push_stack(AdviceSource::Value(q_lo))?;
+    advice_provider.push_stack(AdviceSource::Value(q_hi))?;
+
+    Ok(())
+}
+
 /// Given an element in a quadratic extension field on the top of the stack (i.e., a0, b1),
 /// computes its multiplicative inverse and push the result onto the advice stack.
 ///
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index e142df9ac..2493f6a54 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -1,64 +1,72 @@
 use.std::crypto::hashes::rpo
+use.std::math::u64
 
 # CONSTANTS
 # =================================================================================================
 
 const.J=77321994752
 const.M=12289
+const.M_HALF=6144 # (M-1) / 2
 const.SQUARE_NORM_BOUND=34034726
 
 # MODULAR REDUCTION FALCON PRIME
 # =============================================================================================
 
-#! Given dividend ( i.e. field element a ) on stack top, this routine computes c = a % 12289
+#! Given dividend ( i.e. a u64 given by its lower and higher u32 decomposition ) on the stack,
+#! this routine computes c = a % 12289
 #!
 #! Expected stack state
 #!
-#! [a, ...]
+#! [a_hi, a_lo, ...]
 #!
 #! Output stack state looks like
 #!
 #! [c, ...] | c = a % 12289
+#!
+#! Cycles: 31
 export.mod_12289
-    u32split
-    push.M.0
-
-    adv.push_u64div
+    adv.push_falcon_div
+    # the advice stack contains now [qhi, qlo, r, ...] where q = qhi * 2^32 + qlo is quotient
+    # and r is remainder
 
     adv_push.2
     u32assert2
+    # => [qlo, qhi, a_hi, a_lo, ...]
 
-    swap
     push.M
     u32overflowing_mul
+    # => [overflow, M * qlo % 2^32, qhi, a_hi, a_lo, ...]
 
     movup.2
     push.M
+    # => [M, qhi, overflow, M * qlo % 2^32, a_hi, a_lo, ...]
     u32overflowing_madd
+    # => [t1, t0, M * qlo % 32, a_hi, a_lo, ...] where t = t1 * 2^32 + t0 and t = M * qhi + overflow
+    # Note by the bound on x - r = q * M, we are guaranteed that t1 = 0
     drop
+    # => [M * q / 2^32, (M * q) % 2^32, a_hi, a_lo, ...]
+    # => [res_hi, res_lo, a_hi, a_lo, ...]
 
-    adv_push.2
-    drop
-    u32assert
+    adv_push.1
+    dup u32lt.M assert 
+    # => [r, res_hi, res_lo, a_hi, a_lo, ...]
 
     dup
-
     movup.3
     u32overflowing_add
+    # => [flag, (res_lo + r) % 2^32, r, res_hi, a_hi, a_lo, ...] where u = uhi * 2^32 + ulo and u = (res_lo + r) / 2^32
 
     movup.3
     u32overflowing_add
+    # => [flag, final_res_hi, final_res_lo, r, a_hi, a_lo, ...] flag should be 0 by the bound on inputs
     drop
-
-    movup.5
+    # => [final_res_hi, final_res_lo, r, a_hi, a_lo, ...]
+    
+    movup.3
     assert_eq
-    movup.4
+    movup.2
     assert_eq
-
-    swap
-    drop
-    swap
-    drop
+    # => [r, ...]
 end
 
 # HASH-TO-POINT
@@ -379,7 +387,7 @@ export.probabilistic_product.16
 end
 
 
-# SQUARE NORM OF Z_q[x]/(phi) POLYNOMIALS
+# SQUARE NORM OF Z_M[x]/(phi) POLYNOMIALS
 # =============================================================================================
 
 #! Normalizes an `e` in [0, q) to be in [-(q-1) << 1, (q-1) << 1) and returns its square norm.
@@ -399,20 +407,19 @@ end
 #! Input: [e, ...]
 #! Output [norm(e)^2, ...]
 #!
-#! Cycles: 21
+#! Cycles: 20
 export.norm_sq
 
     dup
     dup mul
-    swap
-    #=> [e, e^2, ...]
+    #=> [e^2, e, ...]
 
-    dup
-    push.6144
-    u32gt
-    #=> [phi, e, e^2, ...]
+    push.M_HALF
+    dup.2
+    u32lt
+    #=> [phi, e^2, e, ...]
 
-    swap
+    movup.2
     mul.24578       # 2*q
     push.151019521  # q^2
     sub
@@ -422,33 +429,61 @@ export.norm_sq
     #=> [norm(e)^2, ...]
 end
 
-#! On input a tuple (u, w, v), the following computes (v - (u + (- w % q) % q) % q).
+#! Given a tuple (u, w, v), we want to compute (v - (u + (- w % q) % q) % q), where:
+#!
+#! 1. v is a field element given by its u32 decomposition i.e., (c_lo, c_hi) such that 
+#!    v = c_hi * 2**32 + c_lo
+#! 2. w is a field element representing the (i+512)-th coefficient of the product polynomial
+#!    pi (i.e., h * s2). We are guaranteed that w is at most 512 * (q-1)^2.
+#! 3. u is a field element representing the i-th coefficient of the product polynomial
+#!    pi (i.e., h * s2). We are guaranteed that u is at most 512 * (q-1)^2.
+#!
 #! We can avoid doing three modular reductions by using the following facts:
 #!
-#! 1. q is much smaller than the Miden prime. Precisely, q * 2^50 < Q
-#! 2. The coefficients of the product polynomial, u and w, are less than J := 512 * q^2
-#! 3. The coefficients of c are less than q.
+#! 1. q is much smaller than the Miden prime Q. Precisely, q * 2^50 < Q
+#! 2. The coefficients of the product polynomial, u and w, are strictly less than J := 512 * q^2.
+#! 3. The coefficients of c are at most q - 1.
 #!
-#! This means that we can substitute (v - (u + (- w % q) % q) % q) with  v + w + J - u without
-#! risking Q-overflow since |v + w + J - u| < 1025 * q^2
+#! This means that we can substitute (v - (u + (- w % q) % q) % q) with  v + w + J - u 
+#! (note J % q= 0) without risking Q-underflow but we can still overflow.
+#! For this reason, we use the u32 decomposition of v and perform the addition of
+#! v and w + J - u as u64. Note that |w + J - u| <= 1024 * (q - 1)^2
+#! and hence there is the possibility of an overflow when we add v and w + J - u as u64.
+#! When there is an overflow, we add 10952, which is equal to 2^32 % q, to the upper u32 limb of 
+#! the result of (v + (w + J - u)). Note that since |w + J - u| <= 1024 * (q-1)^2 < 2^38, and
+#! 10952 < q, we are guaranteed that this final u32 addition to the upper limb will not overflow.
 #!
 #! To get the final result we reduce (v + w + J - u) modulo q.
 #!
-#! Input: [v, w, u, ...]
+#! Input: [pi0, pi512 + J, c_hi, c_lo, ...]
 #! Output: [e, ...]
 #!
-#! Cycles: 44
+#! Cycles: 45
 export.diff_mod_q
 
-    # 1) v + w + J
-    add push.J add
-    #=> [v + w + J, u]
+    # 1) Subtract
+    sub
+    #=> [pi512 + J - pi, c_hi, c_lo, ...]
 
-    # 2) v + w + J - u
-    swap sub
-    #=> [v + w + J - u]
+    # 2) u32split first u64
+    u32split
+    #=> [tmp_hi, tmp_lo, c_hi, c_lo, ...]
+  
+    # 3) Add the two u64-s
+    exec.u64::overflowing_add
+    #=> [flag, res_hi, res_lo, ..]
+    
+    # 4) Handle potential overflow in the u64 addition
+    push.10952 # 2^32 mod q
+    push.0
+    #=> [0, 10952, flag, res_hi, res_lo, ..]
+    swap.2
+    #=> [flag, 10952, 0, res_hi, res_lo, ..]
+    cdrop
+    add
+    #=> [res_hi, res_lo, ..]
 
-    # 3) Reduce modulo q
+    # 5) Reduce modulo q
     exec.mod_12289
     #=> [e, ...]
 end
@@ -484,65 +519,89 @@ end
 #! Input: [pi_ptr, ...]
 #! Output: [norm_sq(s1), ...]
 #!
-#! Cycles: 59000
+#! Cycles: 40454
 export.compute_s1_norm_sq
     repeat.128
         # 1) Load the next 4 * 3 coefficients
-        # load c_i
+        # load the next four pi_i
         padw
-        dup.4 add.5124
+        dup.4
         mem_loadw
 
-        # load pi_{i+512}
+        # load the next four pi_{i+512}
         padw
         dup.8 add.512
         mem_loadw
 
-        # load pi_4
+        # load the next four c_i
         padw
-        dup.12
+        dup.12 add.5124
         mem_loadw
-        #=> [PI, PI_{i+512}, C, pi_ptr, ...]
+        #=> [C, PI_{i+512}, PI, pi_ptr, ...]
 
         # 2) Compute the squared norm of (i + 0)-th coefficient of s1
-        movup.8
-        exec.mod_12289
-        movup.5
-        #=> [v, w, u, ...] where u is the i-th coefficient of `pi`, v is the i-th
-        # coefficient of `c` and w is the (512 + i)-th coefficient of `pi` polynomial.
 
+        u32split
+        #=> [c0_hi, c0_lo, c1, c2, c3, PI_{i+512}, PI, pi_ptr, ...]
+        movup.5
+        push.J add
+        #=> [pi512_0, c0_hi, c0_lo, c1, c2, c3, pi512_1, pi512_2, pi512_3, PI, pi_ptr, ...]
+        movup.9
+        #=> [pi0, pi512_0, c_hi, c_lo, c1, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, ...]
         exec.diff_mod_q
-        #=> [e, ...]
-
+        #=> [e, c1, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, ...]
         exec.norm_sq
         #=> [norm(e)^2, ...]
 
         # Move the result out of the way so that we can process the remaining coefficients
         movdn.10
+        #=> [c1, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, e0, ...]
 
         # 3) Compute the squared norm of (i + 1)-th coefficient of s1
-        movup.6
-        exec.mod_12289
+
+        u32split
+        #=> [c1_hi, c1_lo, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, e0, ...]
         movup.4
+        #=> [pi512_1, c1_hi, c1_lo, c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, ...]
+        push.J add
+        movup.7
+        #=> [pi1, pi512_1, c1_hi, c1_lo, c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, ...]
         exec.diff_mod_q
         exec.norm_sq
+        #=> [e, c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, ...]
+
         movdn.7
+        #=> [c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, e1, ...]
 
         # 4) Compute the squared norm of (i + 2)-th coefficient of s1
-        movup.4
-        exec.mod_12289
+
+        u32split
+        #=> [c2_hi, c2_lo, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, e1, ...]
         movup.3
+        push.J add
+        #=> [pi512_2, c2_hi, c2_lo, c3, pi512_3, pi2, pi3, pi_ptr, e0, e1, ...]
+        movup.5
+        #=> [pi2, pi512_2, c2_hi, c2_lo, c3, pi512_3, pi3, pi_ptr, e0, e1, ...]
         exec.diff_mod_q
         exec.norm_sq
+
         movdn.4
+        #=> [c3, pi512_3, pi3, pi_ptr, e, e, e, ...]
 
         # 5) Compute the squared norm of (i + 3)-th coefficient of s1
-        movup.2
-        exec.mod_12289
-        movup.2
+
+        u32split
+        #=> [c3_hi, c3_lo, pi512_3, pi3, pi_ptr, e0, e1, e2, ...]
+        movup.2 push.J add
+        movup.3
+        #=> [pi3, pi512_3, c3_hi, c3_lo, pi_ptr, e0, e1, e2, ...]
         exec.diff_mod_q
+        #=> [e3, pi_ptr, e0, e1, e2, ...]
         exec.norm_sq
+        #=> [e3, pi_ptr, e0, e1, e2, ...]
+
         swap
+        #=> [pi_ptr, e3, e0, e1, e2, ...]
 
         # 6) Increment the pointer
         add.4
@@ -561,21 +620,28 @@ end
 #! Input: [s2_ptr, ...]
 #! Output: [norm_sq(s2), ...]
 #!
-#! Cycles: ~13500
+#! Cycles: 11137
 export.compute_s2_norm_sq
     repeat.128
         padw
-        dup.4 add.4 swap.5
+        dup.4
         mem_loadw
 
-        repeat.4
-            exec.norm_sq
-            movdn.4
-        end
-
+        exec.norm_sq
+        swap
+        exec.norm_sq
+        add
+        swap
+        exec.norm_sq
+        add
+        swap
+        exec.norm_sq
+        add
+        swap
+        add.4
     end
     drop
-    repeat.511
+    repeat.127
         add
     end
 end
@@ -592,7 +658,7 @@ end
 #! Input: [PK, MSG, ...]
 #! Output: [...]
 #!
-#! Cycles: ~ 90400
+#! Cycles: ~ 70465
 export.verify.6660
 
     # 1) Generate a Falcon signature using the secret key associated to PK on message MSG.
@@ -639,13 +705,13 @@ export.verify.6660
     exec.probabilistic_product
     #=> [...]                                       (Cycles: 2504)
 
-    # 6) Compute the squared norm of s1 := c - h * s2 (in Z_q[x]/(phi))
+    # 6) Compute the squared norm of s1 := c - h * s2 (in Z_M[x]/(phi))
 
     locaddr.1024
     #=> [pi_ptr, ...]
 
     exec.compute_s1_norm_sq
-    #=> [norm_sq(s1), ...]                          (Cycles: 58888)
+    #=> [norm_sq(s1), ...]                          (Cycles: 40454)
 
     # 7) Compute the squared norm of s2
 
@@ -653,7 +719,7 @@ export.verify.6660
     #=> [s2_ptr, norm_sq(s1), ...]
 
     exec.compute_s2_norm_sq
-    #=> [norm_sq(s2), norm_sq(s1), ...]             (Cycles: 13322)
+    #=> [norm_sq(s2), norm_sq(s1), ...]             (Cycles: 11137)
 
     # 8) Check that ||(s1, s2)||^2 < K
 
diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 233ad9a7e..88bdf6f0b 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -14,9 +14,11 @@ use test_utils::{
         MerkleStore, Rpo256,
     },
     expect_exec_error_matches,
+    proptest::proptest,
     rand::{rand_value, rand_vector},
     FieldElement, QuadFelt, Word, WORD_SIZE,
 };
+use vm_core::StarkField;
 
 /// Modulus used for rpo falcon 512.
 const M: u64 = 12289;
@@ -86,21 +88,49 @@ fn test_falcon512_diff_mod_q() {
         exec.rpo_falcon512::diff_mod_q
     end
     ";
-
-    let u = rand::thread_rng().gen_range(0..J);
-    let v = rand::thread_rng().gen_range(Q..M);
+    let v = rand::thread_rng().gen_range(0..Felt::MODULUS);
+    let (v_lo, v_hi) = (v as u32, v >> 32);
     let w = rand::thread_rng().gen_range(0..J);
+    let u = rand::thread_rng().gen_range(0..J);
 
-    let test1 = build_test!(source, &[u, v, w]);
+    let test1 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
 
     // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
-    let expanded_answer = (v as i64
-        - (u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64))
-    .rem_euclid(M as i64);
-    let simplified_answer = (v + w + J - u).rem_euclid(M);
-    assert_eq!(expanded_answer, i64::try_from(simplified_answer).unwrap());
+    let expanded_answer = (v as i128
+        - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
+        .rem_euclid(M as i128);
+    let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
+    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+
+    test1.expect_stack(&[simplified_answer as u64]);
+}
+
+proptest! {
+    #[test]
+    fn diff_mod_q_proptest(v in 0..Felt::MODULUS, w in 0..J, u in 0..J) {
+
+          let source = "
+    use.std::crypto::dsa::rpo_falcon512
+
+    begin
+        exec.rpo_falcon512::diff_mod_q
+    end
+    ";
+
+    let (v_lo, v_hi) = (v as u32, v >> 32);
+
+    let test1 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
+
+    // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
+    let expanded_answer = (v as i128
+        - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
+    .rem_euclid(M as i128);
+    let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
+    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+
+    test1.prop_expect_stack(&[simplified_answer as u64])?;
+    }
 
-    test1.expect_stack(&[simplified_answer]);
 }
 
 #[test]

From 49a2ae47eb762333094dfbdfd9b7360f4efd6875 Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Sun, 19 Jan 2025 21:58:39 +0100
Subject: [PATCH 2/6] chore: address feedback

---
 CHANGELOG.md                             |  1 +
 stdlib/asm/crypto/dsa/rpo_falcon512.masm | 10 +++++++++-
 stdlib/tests/crypto/falcon.rs            | 23 ++++++++++++++++++++---
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 03a183f36..3c469988a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@
 - Optimized the computation of the DEEP queries in the recursive verifier (#1594).
 - Added validity checks for the inputs to the recursive verifier (#1596).
 - Allow multiple memory reads in the same clock cycle (#1626)
+- Improved Falcon signiture verification (#1623).
 
 ## 0.11.0 (2024-11-04)
 
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index 2493f6a54..8bdfb3165 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -8,6 +8,7 @@ const.J=77321994752
 const.M=12289
 const.M_HALF=6144 # (M-1) / 2
 const.SQUARE_NORM_BOUND=34034726
+const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=1281
 
 # MODULAR REDUCTION FALCON PRIME
 # =============================================================================================
@@ -535,7 +536,7 @@ export.compute_s1_norm_sq
 
         # load the next four c_i
         padw
-        dup.12 add.5124
+        dup.12 add.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY
         mem_loadw
         #=> [C, PI_{i+512}, PI, pi_ptr, ...]
 
@@ -626,21 +627,28 @@ export.compute_s2_norm_sq
         padw
         dup.4
         mem_loadw
+        # => [c3, c2, c1, c0, s2_ptr, ...] where ci are coefficients of s2
 
         exec.norm_sq
         swap
+        # => [c2, norm_sq(c3), c1, c0, s2_ptr, ...]
         exec.norm_sq
         add
         swap
+        # => [c1, norm_sq(c2) + norm_sq(c3), c0, s2_ptr, ...]
         exec.norm_sq
         add
         swap
+        # => [c0, norm_sq(c1) + norm_sq(c2) + norm_sq(c3), s2_ptr, ...]
         exec.norm_sq
         add
+        # => [norm_sq(c0) + norm_sq(c1) + norm_sq(c2) + norm_sq(c3), s2_ptr, ...]
         swap
         add.4
+        # => [s2_ptr + 1, norm_sq(c0) + norm_sq(c1) + norm_sq(c2) + norm_sq(c3), ...]
     end
     drop
+    # (512 / 4) - 1 = 127 additions are needed
     repeat.127
         add
     end
diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 88bdf6f0b..0fbc46273 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -88,10 +88,12 @@ fn test_falcon512_diff_mod_q() {
         exec.rpo_falcon512::diff_mod_q
     end
     ";
-    let v = rand::thread_rng().gen_range(0..Felt::MODULUS);
+    let v = Felt::MODULUS - 1;
     let (v_lo, v_hi) = (v as u32, v >> 32);
-    let w = rand::thread_rng().gen_range(0..J);
-    let u = rand::thread_rng().gen_range(0..J);
+
+    // test largest possible value given v
+    let w = J - 1;
+    let u = 0;
 
     let test1 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
 
@@ -103,6 +105,21 @@ fn test_falcon512_diff_mod_q() {
     assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
 
     test1.expect_stack(&[simplified_answer as u64]);
+
+    // test smallest possible value given v
+    let w = 0;
+    let u = J - 1;
+
+    let test2 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
+
+    // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
+    let expanded_answer = (v as i128
+        - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
+        .rem_euclid(M as i128);
+    let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
+    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+
+    test2.expect_stack(&[simplified_answer as u64]);
 }
 
 proptest! {

From 3e31b3548f1c8c245cb0793b674cddacc2eadd37 Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Sun, 19 Jan 2025 22:20:17 +0100
Subject: [PATCH 3/6] fix: clippy

---
 stdlib/tests/crypto/falcon.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 0fbc46273..1996b1f8c 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -95,14 +95,14 @@ fn test_falcon512_diff_mod_q() {
     let w = J - 1;
     let u = 0;
 
-    let test1 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
+    let test1 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
     // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
         .rem_euclid(M as i128);
     let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
-    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+    assert_eq!(expanded_answer, simplified_answer);
 
     test1.expect_stack(&[simplified_answer as u64]);
 
@@ -110,14 +110,14 @@ fn test_falcon512_diff_mod_q() {
     let w = 0;
     let u = J - 1;
 
-    let test2 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
+    let test2 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
     // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
         .rem_euclid(M as i128);
     let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
-    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+    assert_eq!(expanded_answer, simplified_answer);
 
     test2.expect_stack(&[simplified_answer as u64]);
 }
@@ -136,14 +136,14 @@ proptest! {
 
     let (v_lo, v_hi) = (v as u32, v >> 32);
 
-    let test1 = build_test!(source, &[v_lo as u64, v_hi as u64, w + J, u]);
+    let test1 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
     // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
     .rem_euclid(M as i128);
     let simplified_answer = (v as i128 + w as i128 + J as i128 - u as i128).rem_euclid(M as i128);
-    assert_eq!(expanded_answer, i128::try_from(simplified_answer).unwrap());
+    assert_eq!(expanded_answer, simplified_answer);
 
     test1.prop_expect_stack(&[simplified_answer as u64])?;
     }

From 1d8ff5e17f7feb1e88aafac79e04c1358440f52e Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Sun, 26 Jan 2025 13:32:54 +0100
Subject: [PATCH 4/6] chore: address feedback

---
 .../operations/sys_ops/sys_event_handlers.rs  |  5 +-
 stdlib/asm/crypto/dsa/rpo_falcon512.masm      | 80 ++++++++++---------
 stdlib/tests/crypto/falcon.rs                 | 14 ++--
 3 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/processor/src/operations/sys_ops/sys_event_handlers.rs b/processor/src/operations/sys_ops/sys_event_handlers.rs
index beaf30176..7c78a15da 100644
--- a/processor/src/operations/sys_ops/sys_event_handlers.rs
+++ b/processor/src/operations/sys_ops/sys_event_handlers.rs
@@ -352,7 +352,7 @@ pub fn push_u64_div_result(
 ///
 /// Outputs:
 ///   Operand stack: [a1, a0, ...]
-///   Advice stack: [q0, q1, r, ...]
+///   Advice stack: [q1, q0, r, ...]
 ///
 /// Where (a0, a1) are the 32-bit limbs of the dividend (with a0 representing the 32 least
 /// significant bits and a1 representing the 32 most significant bits).
@@ -368,8 +368,7 @@ pub fn push_falcon_mod_result(
     let dividend_lo = process.get_stack_item(1).as_int();
     let dividend = (dividend_hi << 32) + dividend_lo;
 
-    let quotient = dividend / M;
-    let remainder = dividend - quotient * M;
+    let (quotient, remainder) = (dividend / M, dividend % M);
 
     let (q_hi, q_lo) = u64_to_u32_elements(quotient);
     let (r_hi, r_lo) = u64_to_u32_elements(remainder);
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index 8bdfb3165..fd2222430 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -7,6 +7,8 @@ use.std::math::u64
 const.J=77321994752
 const.M=12289
 const.M_HALF=6144 # (M-1) / 2
+const.M_DOUBLE=24578
+const.M_SQUARE=151019521
 const.SQUARE_NORM_BOUND=34034726
 const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=1281
 
@@ -14,7 +16,7 @@ const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=1281
 # =============================================================================================
 
 #! Given dividend ( i.e. a u64 given by its lower and higher u32 decomposition ) on the stack,
-#! this routine computes c = a % 12289
+#! this routine computes c = a % M where M = 12289
 #!
 #! Expected stack state
 #!
@@ -22,9 +24,9 @@ const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=1281
 #!
 #! Output stack state looks like
 #!
-#! [c, ...] | c = a % 12289
+#! [c, ...] | c = a % M
 #!
-#! Cycles: 31
+#! Cycles: 27
 export.mod_12289
     adv.push_falcon_div
     # the advice stack contains now [qhi, qlo, r, ...] where q = qhi * 2^32 + qlo is quotient
@@ -49,7 +51,10 @@ export.mod_12289
     # => [res_hi, res_lo, a_hi, a_lo, ...]
 
     adv_push.1
-    dup u32lt.M assert 
+    dup
+    u32overflowing_sub.M
+    # => [underflow_bit, r - M, r, res_hi, res_lo, a_hi, a_lo, ...]
+    assert drop
     # => [r, res_hi, res_lo, a_hi, a_lo, ...]
 
     dup
@@ -391,15 +396,15 @@ end
 # SQUARE NORM OF Z_M[x]/(phi) POLYNOMIALS
 # =============================================================================================
 
-#! Normalizes an `e` in [0, q) to be in [-(q-1) << 1, (q-1) << 1) and returns its square norm.
+#! Normalizes an `e` in [0, M) to be in [-(M-1) << 1, (M-1) << 1) and returns its square norm.
 #!
 #! We use the following formula to do so:
-#! normalize(e) = e^2 - phi * (2*q*e - q^2) where phi := (e > (q - 1)/2)
+#! normalize(e) = e^2 - phi * (2*M*e - M^2) where phi := (e > (M - 1)/2)
 #!
 #! The formula implements:
 #!
-#! if e > (q-1)/2:
-#!   return (q - e)^2
+#! if e > (M-1)/2:
+#!   return (M - e)^2
 #! else:
 #!   return e^2
 #!
@@ -421,16 +426,17 @@ export.norm_sq
     #=> [phi, e^2, e, ...]
 
     movup.2
-    mul.24578       # 2*q
-    push.151019521  # q^2
+    push.M_DOUBLE       # 2*M
+    mul
+    push.M_SQUARE       # M^2
     sub
-    #=> [2*q*e - q^2, phi, e^2, ...]
+    #=> [2*M*e - M^2, phi, e^2, ...]
 
     mul sub
     #=> [norm(e)^2, ...]
 end
 
-#! Given a tuple (u, w, v), we want to compute (v - (u + (- w % q) % q) % q), where:
+#! Given a tuple (u, w, v), we want to compute (v - (u + (- w % M) % M) % M), where:
 #!
 #! 1. v is a field element given by its u32 decomposition i.e., (c_lo, c_hi) such that 
 #!    v = c_hi * 2**32 + c_lo
@@ -441,26 +447,26 @@ end
 #!
 #! We can avoid doing three modular reductions by using the following facts:
 #!
-#! 1. q is much smaller than the Miden prime Q. Precisely, q * 2^50 < Q
-#! 2. The coefficients of the product polynomial, u and w, are strictly less than J := 512 * q^2.
-#! 3. The coefficients of c are at most q - 1.
+#! 1. M is much smaller than the Miden prime Q. Precisely, M * 2^50 < Q
+#! 2. The coefficients of the product polynomial, u and w, are strictly less than J := 512 * M^2.
+#! 3. The coefficients of c are at most M - 1.
 #!
-#! This means that we can substitute (v - (u + (- w % q) % q) % q) with  v + w + J - u 
-#! (note J % q= 0) without risking Q-underflow but we can still overflow.
+#! This means that we can substitute (v - (u + (- w % M) % M) % M) with  v + w + J - u 
+#! (note J % M = 0) without risking Q-underflow but we can still overflow.
 #! For this reason, we use the u32 decomposition of v and perform the addition of
-#! v and w + J - u as u64. Note that |w + J - u| <= 1024 * (q - 1)^2
+#! v and w + J - u as u64. Note that |w + J - u| <= 1024 * (M - 1)^2
 #! and hence there is the possibility of an overflow when we add v and w + J - u as u64.
-#! When there is an overflow, we add 10952, which is equal to 2^32 % q, to the upper u32 limb of 
-#! the result of (v + (w + J - u)). Note that since |w + J - u| <= 1024 * (q-1)^2 < 2^38, and
-#! 10952 < q, we are guaranteed that this final u32 addition to the upper limb will not overflow.
+#! When there is an overflow, we add 10952, which is equal to 2^32 % M, to the upper u32 limb of 
+#! the result of (v + (w + J - u)). Note that since |w + J - u| <= 1024 * (M-1)^2 < 2^38, and
+#! 10952 < M, we are guaranteed that this final u32 addition to the upper limb will not overflow.
 #!
-#! To get the final result we reduce (v + w + J - u) modulo q.
+#! To get the final result we reduce (v + w + J - u) modulo M.
 #!
 #! Input: [pi0, pi512 + J, c_hi, c_lo, ...]
 #! Output: [e, ...]
 #!
 #! Cycles: 45
-export.diff_mod_q
+export.diff_mod_M
 
     # 1) Subtract
     sub
@@ -475,7 +481,7 @@ export.diff_mod_q
     #=> [flag, res_hi, res_lo, ..]
     
     # 4) Handle potential overflow in the u64 addition
-    push.10952 # 2^32 mod q
+    push.10952 # 2^32 mod M
     push.0
     #=> [0, 10952, flag, res_hi, res_lo, ..]
     swap.2
@@ -484,30 +490,30 @@ export.diff_mod_q
     add
     #=> [res_hi, res_lo, ..]
 
-    # 5) Reduce modulo q
+    # 5) Reduce modulo M
     exec.mod_12289
     #=> [e, ...]
 end
 
 #! Takes a pointer to a polynomial pi of degree less than 1024 with coefficients in Z_Q and
 #! a polynomial c of degree 512 with coefficients also in Z_Q, where Q is the Miden prime.
-#! The goal is to compute s1 = c - pi  = c - h * s2 in Z_q[x]/(phi) where q is the Falcon prime.
+#! The goal is to compute s1 = c - pi  = c - h * s2 in Z_M[x]/(phi) where M is the Falcon prime.
 #! The pointer pi_ptr points both to pi and c through the relation c_ptr = pi_ptr + offset
 #! where offset := 1281.
 #! The naive way to compute s1 would be to first reduce the polynomial pi modulo the Falcon
-#! prime q and then modulo the irreducible polynomial phi = x^512 + 1. Then we would need to negate
-#! the coefficients of pi modulo q and only then can we add these coefficients to the coefficients
-#! of c and then reduce the result modulo q one more time.
+#! prime M and then modulo the irreducible polynomial phi = x^512 + 1. Then we would need to negate
+#! the coefficients of pi modulo M and only then can we add these coefficients to the coefficients
+#! of c and then reduce the result modulo M one more time.
 #! Knowing that the end goal of computing c is to compute its norm squared, we can do better.
 #!
-#! We can compute s1 in a single pass by delaying the q-modular reduction til the end. This can
+#! We can compute s1 in a single pass by delaying the M-modular reduction til the end. This can
 #! be achieved through a careful analysis of the computation of the difference between pi and c.
 #!
 #! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{512 + i}) which is equal to
 #! c_i  + pi_{512 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
-#! J := 512 * q^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
-#! J = 0 modulo q, the addition of J does not affect the final result. It is also important to
-#! note that adding J does not Q-overflow by virtue of q * 2^50 < Q.
+#! J := 512 * M^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
+#! J = 0 mod M, the addition of J does not affect the final result. It is also important to
+#! note that adding J does not Q-overflow by virtue of M * 2^50 < Q.
 #! All of the above implies that we can compute s1_i with only one modular reduction at the end,
 #! in addition to one modular reduction applied to c_i.
 #! Moreover, since we are only interested in the square norm of s1_i, we do not have to store
@@ -549,7 +555,7 @@ export.compute_s1_norm_sq
         #=> [pi512_0, c0_hi, c0_lo, c1, c2, c3, pi512_1, pi512_2, pi512_3, PI, pi_ptr, ...]
         movup.9
         #=> [pi0, pi512_0, c_hi, c_lo, c1, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, ...]
-        exec.diff_mod_q
+        exec.diff_mod_M
         #=> [e, c1, c2, c3, pi512_1, pi512_2, pi512_3, pi1, pi2, pi3, pi_ptr, ...]
         exec.norm_sq
         #=> [norm(e)^2, ...]
@@ -567,7 +573,7 @@ export.compute_s1_norm_sq
         push.J add
         movup.7
         #=> [pi1, pi512_1, c1_hi, c1_lo, c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, ...]
-        exec.diff_mod_q
+        exec.diff_mod_M
         exec.norm_sq
         #=> [e, c2, c3, pi512_2, pi512_3, pi2, pi3, pi_ptr, e0, ...]
 
@@ -583,7 +589,7 @@ export.compute_s1_norm_sq
         #=> [pi512_2, c2_hi, c2_lo, c3, pi512_3, pi2, pi3, pi_ptr, e0, e1, ...]
         movup.5
         #=> [pi2, pi512_2, c2_hi, c2_lo, c3, pi512_3, pi3, pi_ptr, e0, e1, ...]
-        exec.diff_mod_q
+        exec.diff_mod_M
         exec.norm_sq
 
         movdn.4
@@ -596,7 +602,7 @@ export.compute_s1_norm_sq
         movup.2 push.J add
         movup.3
         #=> [pi3, pi512_3, c3_hi, c3_lo, pi_ptr, e0, e1, e2, ...]
-        exec.diff_mod_q
+        exec.diff_mod_M
         #=> [e3, pi_ptr, e0, e1, e2, ...]
         exec.norm_sq
         #=> [e3, pi_ptr, e0, e1, e2, ...]
diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 1996b1f8c..56b79ceef 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -69,7 +69,7 @@ fn test_falcon512_norm_sq() {
     end
     ";
 
-    // normalize(e) = e^2 - phi * (2*q*e - q^2) where phi := (e > (q - 1)/2)
+    // normalize(e) = e^2 - phi * (2*M*e - M^2) where phi := (e > (M - 1)/2)
     let upper = rand::thread_rng().gen_range(Q + 1..M);
     let test_upper = build_test!(source, &[upper]);
     test_upper.expect_stack(&[(M - upper) * (M - upper)]);
@@ -85,7 +85,7 @@ fn test_falcon512_diff_mod_q() {
     use.std::crypto::dsa::rpo_falcon512
 
     begin
-        exec.rpo_falcon512::diff_mod_q
+        exec.rpo_falcon512::diff_mod_M
     end
     ";
     let v = Felt::MODULUS - 1;
@@ -97,7 +97,7 @@ fn test_falcon512_diff_mod_q() {
 
     let test1 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
-    // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
+    // Calculating (v - (u + (- w % M) % M) % M) should be the same as (v + w + J - u) % M.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
         .rem_euclid(M as i128);
@@ -112,7 +112,7 @@ fn test_falcon512_diff_mod_q() {
 
     let test2 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
-    // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
+    // Calculating (v - (u + (- w % M) % M) % M) should be the same as (v + w + J - u) % M.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
         .rem_euclid(M as i128);
@@ -124,13 +124,13 @@ fn test_falcon512_diff_mod_q() {
 
 proptest! {
     #[test]
-    fn diff_mod_q_proptest(v in 0..Felt::MODULUS, w in 0..J, u in 0..J) {
+    fn diff_mod_m_proptest(v in 0..Felt::MODULUS, w in 0..J, u in 0..J) {
 
           let source = "
     use.std::crypto::dsa::rpo_falcon512
 
     begin
-        exec.rpo_falcon512::diff_mod_q
+        exec.rpo_falcon512::diff_mod_M
     end
     ";
 
@@ -138,7 +138,7 @@ proptest! {
 
     let test1 = build_test!(source, &[v_lo as u64, v_hi, w + J, u]);
 
-    // Calculating (v - (u + (- w % q) % q) % q) should be the same as (v + w + J - u) % q.
+    // Calculating (v - (u + (- w % M) % M) % M) should be the same as (v + w + J - u) % M.
     let expanded_answer = (v as i128
         - ((u as i64 + -(w as i64).rem_euclid(M as i64)).rem_euclid(M as i64) as i128))
     .rem_euclid(M as i128);

From 278e285a191f7784cc219132a80600243bd7c33d Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Sun, 26 Jan 2025 18:26:30 +0100
Subject: [PATCH 5/6] chore: fix after rebase

---
 stdlib/asm/crypto/dsa/rpo_falcon512.masm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index fd2222430..1141281ab 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -10,7 +10,7 @@ const.M_HALF=6144 # (M-1) / 2
 const.M_DOUBLE=24578
 const.M_SQUARE=151019521
 const.SQUARE_NORM_BOUND=34034726
-const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=1281
+const.MEMORY_POINTER_OFFSET_OF_HASH_TO_POINT_POLY_FROM_PRODUCT_POLY=5124
 
 # MODULAR REDUCTION FALCON PRIME
 # =============================================================================================

From 80cd4ceda589517034e4bb66de3a7027f9d939cb Mon Sep 17 00:00:00 2001
From: Bobbin Threadbare <bobbinth@protonmail.com>
Date: Mon, 3 Feb 2025 11:51:34 -0800
Subject: [PATCH 6/6] chore: fix typos

---
 assembly/README.md                               | 2 +-
 docs/src/user_docs/assembly/code_organization.md | 2 +-
 processor/README.md                              | 2 +-
 stdlib/asm/crypto/hashes/rpo.masm                | 4 ++--
 stdlib/docs/crypto/hashes/rpo.md                 | 4 ++--
 stdlib/docs/crypto/stark/public_inputs.md        | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/assembly/README.md b/assembly/README.md
index 2814bb79b..06420b6fb 100644
--- a/assembly/README.md
+++ b/assembly/README.md
@@ -44,7 +44,7 @@ let program = assembler.assemble_program(&Path::new("./example.masm")).unwrap();
 As noted above, the default assembler is instantiated with nothing in it but
 the source code you provide. If you want to support more complex programs, you
 will want to factor code into libraries and modules, and then link all of them
-together at once. This can be acheived using a set of builder methods of the
+together at once. This can be achieved using a set of builder methods of the
 `Assembler` struct, e.g. `with_kernel_from_module`, `with_library`, etc.
 
 We'll look at a few of these in more detail below. See the module documentation
diff --git a/docs/src/user_docs/assembly/code_organization.md b/docs/src/user_docs/assembly/code_organization.md
index 626bb58b2..5dd961926 100644
--- a/docs/src/user_docs/assembly/code_organization.md
+++ b/docs/src/user_docs/assembly/code_organization.md
@@ -181,7 +181,7 @@ Miden assembly supports constant declarations. These constants are scoped to the
 
 Constants must be declared right after module imports and before any procedures or program bodies. A constant's name must start with an upper-case letter and can contain any combination of numbers, upper-case ASCII letters, and underscores (`_`). The number of characters in a constant name cannot exceed 100.
 
-A constant's value must be in a decimal or hexidecimal form and be in the range between $0$ and $2^{64} - 2^{32}$ (both inclusive). Value can be defined by an arithmetic expression using `+`, `-`, `*`, `/`, `//`, `(`, `)` operators and references to the previously defined constants if it uses only decimal numbers. Here `/` is a field division and `//` is an integer division. Note that the arithmetic expression cannot contain spaces.
+A constant's value must be in a decimal or hexadecimal form and be in the range between $0$ and $2^{64} - 2^{32}$ (both inclusive). Value can be defined by an arithmetic expression using `+`, `-`, `*`, `/`, `//`, `(`, `)` operators and references to the previously defined constants if it uses only decimal numbers. Here `/` is a field division and `//` is an integer division. Note that the arithmetic expression cannot contain spaces.
 
 ```
 use.std::math::u64
diff --git a/processor/README.md b/processor/README.md
index 92cebdc72..e59178bac 100644
--- a/processor/README.md
+++ b/processor/README.md
@@ -52,7 +52,7 @@ The processor is organized into several components:
 * The range-checker, which is responsible for checking whether values can fit into 16 bits.
 * The chiplets module, which contains specialized chiplets responsible for handling complex computations (e.g., hashing) as well as random access memory.
 
-These components are connected via two busses:
+These components are connected via two buses:
 * The range-checker bus, which links stack and chiplets modules with the range-checker.
 * The chiplet bus, which links stack and the decoder with the chiplets module.
 
diff --git a/stdlib/asm/crypto/hashes/rpo.masm b/stdlib/asm/crypto/hashes/rpo.masm
index f6a132e27..4494948f1 100644
--- a/stdlib/asm/crypto/hashes/rpo.masm
+++ b/stdlib/asm/crypto/hashes/rpo.masm
@@ -4,7 +4,7 @@
 #! consume an amount of data which is a multiple of the rate (2 words).
 #!
 #! Input: []
-#! Ouptut: [PERM, PERM, PERM, ...]
+#! Output: [PERM, PERM, PERM, ...]
 #!
 #! Cycles: 12
 export.init_no_padding
@@ -14,7 +14,7 @@ end
 #! Given the hasher state, returns the hash output.
 #!
 #! Input: [C, B, A, ...]
-#! Ouptut: [HASH, ...]
+#! Output: [HASH, ...]
 #!
 #! Where :
 #! - `A` is the capacity word that will be used by the hashing function.
diff --git a/stdlib/docs/crypto/hashes/rpo.md b/stdlib/docs/crypto/hashes/rpo.md
index da22de54c..85b998935 100644
--- a/stdlib/docs/crypto/hashes/rpo.md
+++ b/stdlib/docs/crypto/hashes/rpo.md
@@ -1,8 +1,8 @@
-Prepares the top of the stack with the hasher initial state.<br /><br />This procedures does not handle padding, therefore, the user is expected to<br />consume an amount of data which is a multiple of the rate (2 words).<br /><br />Input: []<br />Ouptut: [PERM, PERM, PERM, ...]<br />Cycles: 12<br />
+Prepares the top of the stack with the hasher initial state.<br /><br />This procedures does not handle padding, therefore, the user is expected to<br />consume an amount of data which is a multiple of the rate (2 words).<br /><br />Input: []<br />Output: [PERM, PERM, PERM, ...]<br />Cycles: 12<br />
 ## std::crypto::hashes::rpo
 | Procedure | Description |
 | ----------- | ------------- |
-| squeeze_digest | Given the hasher state, returns the hash output.<br /><br />Input: [C, B, A, ...]<br />Ouptut: [HASH, ...]<br />where: For the native RPO hasher HASH is B.<br />Cycles: 9<br /> |
+| squeeze_digest | Given the hasher state, returns the hash output.<br /><br />Input: [C, B, A, ...]<br />Output: [HASH, ...]<br />where: For the native RPO hasher HASH is B.<br />Cycles: 9<br /> |
 | absorb_double_words_from_memory | Hashes the memory `start_addr` to `end_addr` given an RPO state specified by 3 words.<br /><br />This requires that `end_addr=start_addr + 2n + 1`, otherwise the procedure will enter an infinite<br />loop. `end_addr` is not inclusive.<br /><br />Stack transition:<br />Input: [C, B, A, start_addr, end_addr, ...]<br />Output: [C', B', A', end_addr, end_addr ...]<br />Cycles: 4 + 3 * words, where `words` is the `start_addr - end_addr - 1`<br /><br />Where `A` is the capacity word that will be used by the hashing function, and `B'` the hash output.<br /> |
 | hash_memory_words | Hashes the memory `start_addr` to `end_addr`, handles odd number of elements.<br /><br />Requires `start_addr < end_addr`, `end_addr` is not inclusive.<br /><br />Stack transition:<br />Input: [start_addr, end_addr, ...]<br />Output: [H, ...]<br />Cycles:<br />even words: 49 cycles + 3 * words<br />odd words: 61 cycles + 3 * words<br /> |
 | hash_memory | Computes hash of Felt values starting at the specified memory address.<br /><br />This procedure divides the hashing process into two parts: hashing pairs of words using<br />`absorb_double_words_from_memory` procedure and hashing the remaining values using the `hperm`<br />instruction.<br /><br />Inputs:  [ptr, num_elements]<br />Outputs: [HASH]<br />Cycles:<br />- If number of elements divides by 8: 47 cycles + 3 * words<br />- Else: 180 cycles + 3 * words<br /><br />Panics if number of inputs equals 0.<br /> |
diff --git a/stdlib/docs/crypto/stark/public_inputs.md b/stdlib/docs/crypto/stark/public_inputs.md
index 96b67699f..cb6cfb298 100644
--- a/stdlib/docs/crypto/stark/public_inputs.md
+++ b/stdlib/docs/crypto/stark/public_inputs.md
@@ -2,4 +2,4 @@
 ## std::crypto::stark::public_inputs
 | Procedure | Description |
 | ----------- | ------------- |
-| load | Load the public inputs in memory starting from the address referenced by `public_inputs_ptr`.<br />In parallel, compute the hash of the public inputs being loaded. The hashing starts with<br />capacity registers of the hash function set to `C` resulting from hashing the proof context.<br />The ouptut D is the digest of the hashing.<br /><br />Input: [public_inputs_ptr, C]<br />Output: [D]<br />Cycles: 38<br /> |
+| load | Load the public inputs in memory starting from the address referenced by `public_inputs_ptr`.<br />In parallel, compute the hash of the public inputs being loaded. The hashing starts with<br />capacity registers of the hash function set to `C` resulting from hashing the proof context.<br />The output D is the digest of the hashing.<br /><br />Input: [public_inputs_ptr, C]<br />Output: [D]<br />Cycles: 38<br /> |