From 59d0360f3d68433c31d95c0b82082eaac275cd5c Mon Sep 17 00:00:00 2001
From: Yang Liu <liuyang22@iscas.ac.cn>
Date: Thu, 27 Jun 2024 13:56:07 +0800
Subject: [PATCH] i#3544 RV64 vector part2: Add basic vector support to the
 core (#6848)

This is a follow-up patch of adding RISC-V vector (RVV) extension
support to the core, part1 in PR #6810 (f1ce1bca).

This patch:
1. fixes several issues in the codec introduced in part1, codec unit
tests will be submitted separately in follow-up PRs;
2. rename and reuse SVE vector length getter/setter functions to be more
concise on APIs for vector extensions;
3. adds RISC-V vector support to drdisas;
4. support code cache and clean call context switch;

For now, we support RISC-V vector lengths up to 256 bits, longer vector
lengths will exceed the limit of DynamoRIO stack size and 12-bit signed
immediate range.

Issue: #3544
---
 .github/workflows/ci-docs.yml                 |   2 +-
 .github/workflows/ci-package.yml              |  12 +-
 CMakeLists.txt                                |   2 +-
 api/docs/release.dox                          |   4 +
 clients/drcachesim/common/trace_entry.h       |   2 +-
 .../drcachesim/tools/invariant_checker.cpp    |   4 +-
 clients/drcachesim/tools/opcode_mix.cpp       |   4 +-
 clients/drcachesim/tracer/raw2trace.cpp       |   4 +-
 clients/drdisas/drdisas.cpp                   |  19 ++-
 core/CMakeLists.txt                           |   2 +-
 core/arch/aarch64/proc.c                      |   8 +-
 core/arch/arch.h                              |   5 +
 core/arch/proc.h                              |   4 +
 core/arch/proc_api.h                          |  12 +-
 core/arch/proc_shared.c                       |  12 +-
 core/arch/riscv64/emit_utils.c                | 109 +++++++++++-
 core/arch/riscv64/mangle.c                    | 161 ++++++++++++++++--
 core/arch/riscv64/proc.c                      |  22 +++
 core/arch/riscv64/riscv64.asm                 |   9 +-
 core/globals.h                                |  12 +-
 core/ir/aarch64/codec.c                       |  12 +-
 core/ir/decode_shared.c                       |  32 ++--
 core/ir/encode_api.h                          |  14 +-
 core/ir/opnd.h                                |  16 +-
 core/ir/opnd_api.h                            |  79 ++++++++-
 core/ir/opnd_shared.c                         |  78 +++++++--
 core/ir/riscv64/codec.c                       | 147 ++++++++++++++--
 core/ir/riscv64/codec.py                      |   5 +-
 core/ir/riscv64/encode.c                      |  44 ++++-
 core/lib/globals_api.h                        |  19 ++-
 core/lib/mcxtx_api.h                          |   4 +-
 core/unit_tests.c                             |   3 +
 core/unix/include/sigcontext.h                |  31 +++-
 suite/tests/api/ir_aarch64_legacy.c           |  10 +-
 suite/tests/api/opnd-a64.c                    |   8 +-
 35 files changed, 782 insertions(+), 129 deletions(-)

diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml
index 40e89b324b0..0d1d7f10678 100644
--- a/.github/workflows/ci-docs.yml
+++ b/.github/workflows/ci-docs.yml
@@ -90,7 +90,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml
index 64ba2b321db..6f18d34d589 100644
--- a/.github/workflows/ci-package.yml
+++ b/.github/workflows/ci-package.yml
@@ -103,7 +103,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -195,7 +195,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -283,7 +283,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -371,7 +371,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -451,7 +451,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -536,7 +536,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER="10.92.$((`git log -n 1 --format=%ct` / (60*60*24)))"
+          export VERSION_NUMBER="10.93.$((`git log -n 1 --format=%ct` / (60*60*24)))"
           export PREFIX="cronbuild-"
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50be3c81ed7..8ece8b790cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -575,7 +575,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn")
 
 # N.B.: When updating this, update all the default versions in ci-package.yml
 # and ci-docs.yml.  We should find a way to share (xref i#1565).
-set(VERSION_NUMBER_DEFAULT "10.92.${VERSION_NUMBER_PATCHLEVEL}")
+set(VERSION_NUMBER_DEFAULT "10.93.${VERSION_NUMBER_PATCHLEVEL}")
 # do not store the default VERSION_NUMBER in the cache to prevent a stale one
 # from preventing future version updates in a pre-existing build dir
 set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default")
diff --git a/api/docs/release.dox b/api/docs/release.dox
index 0eba0910da3..b5a8d48961b 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -157,6 +157,10 @@ changes:
  - Changed the way we cache the feature register values on AArch64. We now use an array of
    uint64 values rather than individual variables for each feature register. This
    allows the code to be more readable and easier to maintain.
+ - Renamed dr_set_sve_vector_length() to dr_set_vector_length() to share function
+   signature between AArch64 and RISC-V.
+ - Renamed dr_get_sve_vector_length() to dr_get_vector_length() to share function
+   signature between AArch64 and RISC-V.
 
 Further non-compatibility-affecting changes include:
  - Added DWARF-5 support to the drsyms library by linking in 4 static libraries
diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h
index 6b1a66e2b13..5928a6d840a 100644
--- a/clients/drcachesim/common/trace_entry.h
+++ b/clients/drcachesim/common/trace_entry.h
@@ -634,7 +634,7 @@ typedef enum {
      * length value is specific to the current thread.
      * The vector length affects how some SVE instructions are decoded so any tools which
      * decode instructions should clear any cached data and set the vector length used by
-     * the decoder using dr_set_sve_vector_length().
+     * the decoder using dr_set_vector_length().
      */
     TRACE_MARKER_TYPE_VECTOR_LENGTH,
 
diff --git a/clients/drcachesim/tools/invariant_checker.cpp b/clients/drcachesim/tools/invariant_checker.cpp
index df0214b37e3..2ba5227b0cb 100644
--- a/clients/drcachesim/tools/invariant_checker.cpp
+++ b/clients/drcachesim/tools/invariant_checker.cpp
@@ -438,8 +438,8 @@ invariant_checker_t::parallel_shard_memref(void *shard_data, const memref_t &mem
                         "Vector length marker has invalid size");
 
         const int new_vl_bits = memref.marker.marker_value * 8;
-        if (dr_get_sve_vector_length() != new_vl_bits) {
-            dr_set_sve_vector_length(new_vl_bits);
+        if (dr_get_vector_length() != new_vl_bits) {
+            dr_set_vector_length(new_vl_bits);
             // Changing the vector length can change the IR representation of some SVE
             // instructions but it doesn't effect any of the metadata that is stored
             // in decode_cache_ so we don't need to flush the cache.
diff --git a/clients/drcachesim/tools/opcode_mix.cpp b/clients/drcachesim/tools/opcode_mix.cpp
index 8b748ff0e90..07c712e09d0 100644
--- a/clients/drcachesim/tools/opcode_mix.cpp
+++ b/clients/drcachesim/tools/opcode_mix.cpp
@@ -186,8 +186,8 @@ opcode_mix_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
                memref.marker.marker_type == TRACE_MARKER_TYPE_VECTOR_LENGTH) {
 #ifdef AARCH64
         const int new_vl_bits = memref.marker.marker_value * 8;
-        if (dr_get_sve_vector_length() != new_vl_bits) {
-            dr_set_sve_vector_length(new_vl_bits);
+        if (dr_get_vector_length() != new_vl_bits) {
+            dr_set_vector_length(new_vl_bits);
             // Changing the vector length can change the IR representation of some SVE
             // instructions but it will never change the opcode so we don't need to
             // flush the opcode cache.
diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp
index 6eceb3194d2..4ddd8695206 100644
--- a/clients/drcachesim/tracer/raw2trace.cpp
+++ b/clients/drcachesim/tracer/raw2trace.cpp
@@ -910,8 +910,8 @@ raw2trace_t::process_marker(raw2trace_thread_data_t *tdata,
             tdata->tid, marker_val);
 
         const int new_vl_bits = marker_val * 8;
-        if (dr_get_sve_vector_length() != new_vl_bits) {
-            dr_set_sve_vector_length(new_vl_bits);
+        if (dr_get_vector_length() != new_vl_bits) {
+            dr_set_vector_length(new_vl_bits);
             // Some SVE load/store instructions have an offset which is scaled by a value
             // that depends on the vector length. These instructions will need to be
             // re-decoded after the vector length changes.
diff --git a/clients/drdisas/drdisas.cpp b/clients/drdisas/drdisas.cpp
index 77c96e7fcfb..86cc56a1762 100644
--- a/clients/drdisas/drdisas.cpp
+++ b/clients/drdisas/drdisas.cpp
@@ -61,11 +61,16 @@ droption_t<std::string> op_mode(DROPTION_SCOPE_FRONTEND, "mode", "arm",
                                 "Decodes using the specified mode: 'arm' or 'thumb'.");
 #elif defined(AARCH64)
 droption_t<unsigned int>
-    op_sve_vl(DROPTION_SCOPE_FRONTEND, "vl", 128,
-              "Sets the SVE vector length to one of: 128 256 384 512 640 768 896 1024 "
-              "1152 1280 1408 1536 1664 1792 1920 2048.",
-              "Sets the SVE vector length to one of: 128 256 384 512 640 768 896 1024 "
-              "1152 1280 1408 1536 1664 1792 1920 2048.");
+    op_vl(DROPTION_SCOPE_FRONTEND, "vl", 128,
+          "Sets the SVE vector length to one of: 128 256 384 512 640 768 896 1024 "
+          "1152 1280 1408 1536 1664 1792 1920 2048.",
+          "Sets the SVE vector length to one of: 128 256 384 512 640 768 896 1024 "
+          "1152 1280 1408 1536 1664 1792 1920 2048.");
+#elif defined(RISCV64)
+droption_t<unsigned int>
+    op_vl(DROPTION_SCOPE_FRONTEND, "vl", 128,
+          "Sets the RVV vector length from 64 to 65536 in the power of 2.",
+          "Sets the RVV vector length from 64 to 65536 in the power of 2.");
 #endif
 
 droption_t<bool> op_show_bytes(DROPTION_SCOPE_FRONTEND, "show_bytes", true,
@@ -147,8 +152,8 @@ main(int argc, const char *argv[])
     }
 #endif
 
-#ifdef AARCH64
-    dr_set_sve_vector_length(op_sve_vl.get_value());
+#if defined(AARCH64) || defined(RISCV64)
+    dr_set_vector_length(op_vl.get_value());
 #endif
 
     // XXX i#4021: arm not yet supported.
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index c0230239e72..55a12a82c84 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -1167,7 +1167,7 @@ if (BUILD_TESTS AND
   if (APPLE)
     set_tests_properties(unit_tests PROPERTIES LABELS OSX)
   endif ()
-  if (AARCHXX)
+  if (AARCHXX OR RISCV64)
     set_tests_properties(unit_tests PROPERTIES LABELS RUNS_ON_QEMU)
   endif ()
   copy_target_to_device(unit_tests "${location_suffix}")
diff --git a/core/arch/aarch64/proc.c b/core/arch/aarch64/proc.c
index 109ce3dfe5b..dea88fefcf0 100644
--- a/core/arch/aarch64/proc.c
+++ b/core/arch/aarch64/proc.c
@@ -131,14 +131,14 @@ get_processor_specific_info(void)
             :
             : "x0");
         cpu_info.sve_vector_length_bytes = vl;
-        dr_set_sve_vector_length(vl * 8);
+        dr_set_vector_length(vl * 8);
     } else {
         cpu_info.sve_vector_length_bytes = 32;
-        dr_set_sve_vector_length(256);
+        dr_set_vector_length(256);
     }
 #        else
     /* Set SVE vector length for unit testing the off-line decoder. */
-    dr_set_sve_vector_length(256);
+    dr_set_vector_length(256);
 #        endif
 }
 #    endif
@@ -290,7 +290,7 @@ enable_all_test_cpu_features()
     for (int i = 0; i < BUFFER_SIZE_ELEMENTS(features); ++i) {
         proc_set_feature(features[i], true);
     }
-    dr_set_sve_vector_length(256);
+    dr_set_vector_length(256);
 }
 
 #ifndef DR_HOST_NOT_TARGET
diff --git a/core/arch/arch.h b/core/arch/arch.h
index 5adcab0f0c4..d2c2eead38d 100644
--- a/core/arch/arch.h
+++ b/core/arch/arch.h
@@ -174,6 +174,8 @@ mixed_mode_enabled(void)
 #    define REG4_OFFSET ((MC_OFFS) + (offsetof(priv_mcontext_t, a4)))
 #    define REG5_OFFSET ((MC_OFFS) + (offsetof(priv_mcontext_t, a5)))
 #    define XFLAGS_OFFSET ((MC_OFFS) + (offsetof(priv_mcontext_t, fcsr)))
+#    define VSTART_OFFSET ((MC_OFFS) + (offsetof(priv_mcontext_t, vstart)))
+#    define VCSR_OFFSET ((MC_OFFS) + (offsetof(priv_mcontext_t, vcsr)))
 #    define SCRATCH_REG0 DR_REG_A0
 #    define SCRATCH_REG1 DR_REG_A1
 #    define SCRATCH_REG2 DR_REG_A2
@@ -188,6 +190,9 @@ mixed_mode_enabled(void)
 #    define SCRATCH_REG5_OFFS REG5_OFFSET
 #    define REG_OFFSET(reg) (X0_OFFSET + ((reg)-DR_REG_X0) * sizeof(reg_t))
 #    define FREG_OFFSET(reg) (F0_OFFSET + ((reg)-DR_REG_F0) * sizeof(reg_t))
+#    define VREG_OFFSET(reg) \
+        ((MC_OFFS) +         \
+         (offsetof(priv_mcontext_t, simd) + ((reg)-DR_REG_VR0) * sizeof(dr_simd_t)))
 #    define CALL_SCRATCH_REG DR_REG_T6
 #    define MC_IBL_REG a2
 #    define MC_RETVAL_REG a0
diff --git a/core/arch/proc.h b/core/arch/proc.h
index e4b204edd05..d537854dc1a 100644
--- a/core/arch/proc.h
+++ b/core/arch/proc.h
@@ -72,6 +72,10 @@ typedef struct _cpu_info_t {
 #ifdef AARCHXX
     uint architecture;
     uint sve_vector_length_bytes;
+#endif
+#ifdef RISCV64
+    /* Vector length in bytes. */
+    uint vlenb;
 #endif
     uint family;
     uint type;
diff --git a/core/arch/proc_api.h b/core/arch/proc_api.h
index f02da530a58..0b7373efb57 100644
--- a/core/arch/proc_api.h
+++ b/core/arch/proc_api.h
@@ -565,7 +565,7 @@ DR_API
 const char *
 proc_get_cache_size_str(cache_size_t size);
 
-#ifdef AARCHXX
+#if defined(AARCHXX)
 DR_API
 /**
  * Returns the size in bytes of the SVE registers' vector length set by the
@@ -576,6 +576,16 @@ DR_API
  */
 uint
 proc_get_vector_length_bytes(void);
+#elif defined(RISCV64)
+DR_API
+/**
+ * Returns the size in bytes of the RVV registers' vector length which is a design-time
+ * constant set by the hardware implementor. Length can be from 64 to 65536 bits
+ * in the power of 2.
+ * Currently DynamoRIO supports implementations of up to 256 bits.
+ */
+uint
+proc_get_vector_length_bytes(void);
 #endif
 
 DR_API
diff --git a/core/arch/proc_shared.c b/core/arch/proc_shared.c
index a9e8599ee45..6590ccfe02c 100644
--- a/core/arch/proc_shared.c
+++ b/core/arch/proc_shared.c
@@ -74,8 +74,10 @@ cpu_info_t cpu_info = {
 #else
     VENDOR_UNKNOWN,
 #endif
-#ifdef AARCHXX
+#if defined(AARCHXX)
+    0,
     0,
+#elif defined(RISCV64)
     0,
 #endif
     0,
@@ -200,7 +202,7 @@ proc_get_stepping(void)
     return cpu_info.stepping;
 }
 
-#ifdef AARCHXX
+#if defined(AARCHXX)
 uint
 proc_get_architecture(void)
 {
@@ -212,6 +214,12 @@ proc_get_vector_length_bytes(void)
 {
     return cpu_info.sve_vector_length_bytes;
 }
+#elif defined(RISCV64)
+uint
+proc_get_vector_length_bytes(void)
+{
+    return cpu_info.vlenb;
+}
 #endif
 
 features_t *
diff --git a/core/arch/riscv64/emit_utils.c b/core/arch/riscv64/emit_utils.c
index 0b423bdfedc..cf9ee479a4c 100644
--- a/core/arch/riscv64/emit_utils.c
+++ b/core/arch/riscv64/emit_utils.c
@@ -52,6 +52,8 @@
 /* TODO i#3544: Think of a better way to represent CSR in the IR, maybe as registers? */
 /* Number of the fcsr register. */
 #define FCSR 0x003
+#define VSTART 0x008
+#define VCSR 0x00F
 
 /* Instruction fixed bits constants. */
 
@@ -648,6 +650,18 @@ append_restore_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
         INSTR_CREATE_csrrw(dcontext, opnd_create_reg(DR_REG_X0),
                            opnd_create_reg(DR_REG_A0),
                            opnd_create_immed_int(FCSR, OPSZ_12b)));
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_A0, VSTART_OFFSET));
+        APP(ilist,
+            INSTR_CREATE_csrrw(dcontext, opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_reg(DR_REG_A0),
+                               opnd_create_immed_int(VSTART, OPSZ_12b)));
+        APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_A0, VCSR_OFFSET));
+        APP(ilist,
+            INSTR_CREATE_csrrw(dcontext, opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_reg(DR_REG_A0),
+                               opnd_create_immed_int(VCSR, OPSZ_12b)));
+    }
 }
 
 /* dcontext is in REG_DCXT; other registers can be used as scratch.
@@ -656,15 +670,53 @@ void
 append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
 {
     opnd_t memopnd;
+    uint vtypei;
 
     /* Floating-point register is not SIMD registers in RISC-V, but to be consistent with
      * other architectures, we handle them here.
      */
     for (int reg = DR_REG_F0; reg <= DR_REG_F31; reg++) {
         memopnd = opnd_create_dcontext_field_via_reg_sz(
-            dcontext, REG_NULL, REG_OFFSET(reg), reg_get_size(reg));
+            dcontext, REG_NULL, FREG_OFFSET(reg), reg_get_size(reg));
         APP(ilist, INSTR_CREATE_fld(dcontext, opnd_create_reg(reg), memopnd));
     }
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* ma:   mask agnostic
+         * ta:   tail agnostic
+         * sew:  selected element width
+         * lmul: vector register group multiplier
+         *
+         *           ma            ta         sew=8       lmul=8 */
+        vtypei = (0b1 << 7) | (0b1 << 6) | (0b000 << 3) | 0b011;
+        memopnd = opnd_create_dcontext_field_via_reg_sz(
+            dcontext, DR_REG_A1, 0, reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_8));
+        APP(ilist,
+            INSTR_CREATE_addi(dcontext, opnd_create_reg(DR_REG_A1),
+                              opnd_create_reg(REG_DCXT),
+                              opnd_create_immed_int(VREG_OFFSET(DR_REG_VR0), OPSZ_12b)));
+        /* For the following vector instructions, set the element width to 8b, and use 8
+         * registers as a group (lmul=8).
+         */
+        APP(ilist,
+            INSTR_CREATE_vsetvli(dcontext, opnd_create_reg(DR_REG_A0),
+                                 opnd_create_reg(DR_REG_ZERO),
+                                 opnd_create_immed_uint(vtypei, OPSZ_11b)));
+        /* Uses lmul=8 to copy 8 registers at a time. */
+        for (int reg = DR_REG_VR0; reg <= DR_REG_VR31; reg += 8) {
+            APP(ilist,
+                INSTR_CREATE_vle8_v(dcontext, opnd_create_reg(reg), memopnd,
+                                    opnd_create_immed_int(1, OPSZ_1b) /* mask disabled */,
+                                    opnd_create_immed_int(0, OPSZ_3b) /* nfields = 1 */));
+            /* If it's the last vector register group, no need to increase the offset. */
+            if (reg != DR_REG_VR24) {
+                APP(ilist,
+                    INSTR_CREATE_addi(
+                        dcontext, opnd_create_reg(DR_REG_A1), opnd_create_reg(DR_REG_A1),
+                        opnd_create_immed_int(8 * sizeof(dr_simd_t), OPSZ_12b)));
+            }
+        }
+    }
 }
 
 /* Append instructions to restore gpr on fcache enter, to be executed
@@ -748,22 +800,60 @@ append_save_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end, bool abs
     APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, REG_OFFSET(DR_REG_TP)));
 }
 
-/* dcontext base is held in REG_DCXT, and exit stub in X0.
+/* dcontext base is held in REG_DCXT, and exit stub in A0.
  * GPR's are already saved.
  */
 void
 append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
 {
     opnd_t memopnd;
+    uint vtypei;
 
     /* Floating-point register is not SIMD registers in RISC-V, but to be consistent with
      * other architectures, we handle them here.
      */
     for (int reg = DR_REG_F0; reg <= DR_REG_F31; reg++) {
         memopnd = opnd_create_dcontext_field_via_reg_sz(
-            dcontext, REG_NULL, REG_OFFSET(reg), reg_get_size(reg));
+            dcontext, REG_NULL, FREG_OFFSET(reg), reg_get_size(reg));
         APP(ilist, INSTR_CREATE_fsd(dcontext, memopnd, opnd_create_reg(reg)));
     }
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* ma:   mask agnostic
+         * ta:   tail agnostic
+         * sew:  selected element width
+         * lmul: vector register group multiplier
+         *
+         *           ma            ta         sew=8       lmul=8 */
+        vtypei = (0b1 << 7) | (0b1 << 6) | (0b000 << 3) | 0b011;
+        memopnd = opnd_create_dcontext_field_via_reg_sz(
+            dcontext, DR_REG_A1, 0, reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_8));
+        APP(ilist,
+            INSTR_CREATE_addi(dcontext, opnd_create_reg(DR_REG_A1),
+                              opnd_create_reg(REG_DCXT),
+                              opnd_create_immed_int(VREG_OFFSET(DR_REG_VR0), OPSZ_12b)));
+        /* For the following vector instructions, set the element width to 8b, and use 8
+         * registers as a group (lmul=8).
+         */
+        APP(ilist,
+            INSTR_CREATE_vsetvli(dcontext, opnd_create_reg(DR_REG_A2),
+                                 opnd_create_reg(DR_REG_ZERO),
+                                 opnd_create_immed_uint(vtypei, OPSZ_11b)));
+        /* Uses lmul=8 to copy 8 registers at a time. */
+        for (int reg = DR_REG_VR0; reg <= DR_REG_VR31; reg += 8) {
+            APP(ilist,
+                INSTR_CREATE_vse8_v(dcontext, memopnd, opnd_create_reg(reg),
+                                    opnd_create_immed_int(1, OPSZ_1b) /* mask disabled */,
+                                    opnd_create_immed_int(0, OPSZ_3b) /* nfields = 1 */));
+            /* If it's the last vector register group, no need to increase the offset. */
+            if (reg != DR_REG_VR24) {
+                APP(ilist,
+                    INSTR_CREATE_addi(
+                        dcontext, opnd_create_reg(DR_REG_A1), opnd_create_reg(DR_REG_A1),
+                        opnd_create_immed_int(8 * sizeof(dr_simd_t), OPSZ_12b)));
+            }
+        }
+    }
 }
 
 /* Scratch reg0 is holding exit stub. */
@@ -775,6 +865,19 @@ append_save_clear_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute
                            opnd_create_reg(DR_REG_X0),
                            opnd_create_immed_int(FCSR, OPSZ_12b)));
     APP(ilist, SAVE_TO_DC(dcontext, DR_REG_A1, XFLAGS_OFFSET));
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        APP(ilist,
+            INSTR_CREATE_csrrs(dcontext, opnd_create_reg(DR_REG_A1),
+                               opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_immed_int(VSTART, OPSZ_12b)));
+        APP(ilist, SAVE_TO_DC(dcontext, DR_REG_A1, VSTART_OFFSET));
+        APP(ilist,
+            INSTR_CREATE_csrrs(dcontext, opnd_create_reg(DR_REG_A1),
+                               opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_immed_int(VCSR, OPSZ_12b)));
+        APP(ilist, SAVE_TO_DC(dcontext, DR_REG_A1, VCSR_OFFSET));
+    }
 }
 
 bool
diff --git a/core/arch/riscv64/mangle.c b/core/arch/riscv64/mangle.c
index b67bf55c75d..9b103937f79 100644
--- a/core/arch/riscv64/mangle.c
+++ b/core/arch/riscv64/mangle.c
@@ -43,6 +43,8 @@
 /* TODO i#3544: Think of a better way to represent CSR in the IR, maybe as registers? */
 /* Number of the fcsr register. */
 #define FCSR 0x003
+#define VSTART 0x008
+#define VCSR 0x00F
 
 /* TODO i#3544: Think of a better way to represent these fields in the IR. */
 /* Volume I: RISC-V Unprivileged ISA V20191213.
@@ -73,13 +75,16 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
 {
     uint dstack_offs = 0;
     int dstack_middle_offs;
+    uint vtypei;
+    opnd_t memopnd;
 
     if (cci == NULL)
         cci = &default_clean_call_info;
     ASSERT(proc_num_simd_registers() == MCXT_NUM_SIMD_SLOTS);
 
-    /* a0 is used to save and restore the pc and csr registers. */
+    /* A0 and A1 are used as scratch registers. */
     cci->reg_skip[DR_REG_A0 - DR_REG_START_GPR] = false;
+    cci->reg_skip[DR_REG_A1 - DR_REG_START_GPR] = false;
 
     /* For out-of-line clean calls, the stack pointer is adjusted before jumping to this
      * code.
@@ -101,7 +106,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
         if (cci->reg_skip[i] || (out_of_line && DR_REG_START_GPR + i == DR_REG_RA))
             continue;
 
-        /* Uses c.sdsp to save space, see -max_bb_instrs option, same below. */
+        /* Uses c.[f]sdsp to save space, same below. */
         PRE(ilist, instr,
             INSTR_CREATE_c_sdsp(dcontext,
                                 opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0,
@@ -151,7 +156,6 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     PRE(ilist, instr,
         INSTR_CREATE_csrrs(dcontext, opnd_create_reg(DR_REG_A0),
                            opnd_create_reg(DR_REG_X0),
-                           /* FIXME i#3544: Use register. */
                            opnd_create_immed_int(FCSR, OPSZ_12b)));
 
     PRE(ilist, instr,
@@ -160,8 +164,72 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
 
     dstack_offs += XSP_SZ;
 
-    /* TODO i#3544: No support for SIMD on RISC-V so far, this is to keep the mcontext
-     * shape. */
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* csrr a0, vstart */
+        PRE(ilist, instr,
+            INSTR_CREATE_csrrs(dcontext, opnd_create_reg(DR_REG_A0),
+                               opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_immed_int(VSTART, OPSZ_12b)));
+
+        PRE(ilist, instr,
+            INSTR_CREATE_c_sdsp(dcontext, OPND_CREATE_MEM64(DR_REG_SP, dstack_offs),
+                                opnd_create_reg(DR_REG_A0)));
+    }
+
+    dstack_offs += XSP_SZ;
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* csrr a0, vcsr */
+        PRE(ilist, instr,
+            INSTR_CREATE_csrrs(dcontext, opnd_create_reg(DR_REG_A0),
+                               opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_immed_int(VCSR, OPSZ_12b)));
+
+        PRE(ilist, instr,
+            INSTR_CREATE_c_sdsp(dcontext, OPND_CREATE_MEM64(DR_REG_SP, dstack_offs),
+                                opnd_create_reg(DR_REG_A0)));
+    }
+
+    dstack_offs += XSP_SZ;
+
+    /* Push vector registers. */
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* ma:   mask agnostic
+         * ta:   tail agnostic
+         * sew:  selected element width
+         * lmul: vector register group multiplier
+         *
+         *           ma            ta         sew=8       lmul=8 */
+        vtypei = (0b1 << 7) | (0b1 << 6) | (0b000 << 3) | 0b011;
+        memopnd = opnd_create_dcontext_field_via_reg_sz(
+            dcontext, DR_REG_A0, 0, reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_8));
+        PRE(ilist, instr,
+            INSTR_CREATE_addi(dcontext, opnd_create_reg(DR_REG_A0),
+                              opnd_create_reg(DR_REG_SP),
+                              opnd_create_immed_int(dstack_offs, OPSZ_12b)));
+        /* For the following vector instructions, set the element width to 8b, and use 8
+         * registers as a group (lmul=8).
+         */
+        PRE(ilist, instr,
+            INSTR_CREATE_vsetvli(dcontext, opnd_create_reg(DR_REG_A1),
+                                 opnd_create_reg(DR_REG_ZERO),
+                                 opnd_create_immed_uint(vtypei, OPSZ_11b)));
+        /* Uses lmul=8 to copy 8 registers at a time. */
+        for (int reg = DR_REG_VR0; reg <= DR_REG_VR31; reg += 8) {
+            PRE(ilist, instr,
+                INSTR_CREATE_vse8_v(dcontext, memopnd, opnd_create_reg(reg),
+                                    opnd_create_immed_int(1, OPSZ_1b) /* mask disabled */,
+                                    opnd_create_immed_int(0, OPSZ_3b) /* nfields = 1 */));
+            /* If it's the last vector register group, no need to increase the offset. */
+            if (reg != DR_REG_VR24) {
+                PRE(ilist, instr,
+                    INSTR_CREATE_addi(
+                        dcontext, opnd_create_reg(DR_REG_A0), opnd_create_reg(DR_REG_A0),
+                        opnd_create_immed_int(8 * sizeof(dr_simd_t), OPSZ_12b)));
+            }
+        }
+    }
+
     dstack_offs += (proc_num_simd_registers() * sizeof(dr_simd_t));
 
     /* Restore sp. */
@@ -174,6 +242,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     PRE(ilist, instr,
         INSTR_CREATE_c_ldsp(dcontext, opnd_create_reg(DR_REG_A0),
                             OPND_CREATE_MEM64(DR_REG_SP, REG_OFFSET(DR_REG_A0))));
+    PRE(ilist, instr,
+        INSTR_CREATE_c_ldsp(dcontext, opnd_create_reg(DR_REG_A1),
+                            OPND_CREATE_MEM64(DR_REG_SP, REG_OFFSET(DR_REG_A1))));
 
     return dstack_offs + dstack_middle_offs;
 }
@@ -184,24 +255,90 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist
 {
     if (cci == NULL)
         cci = &default_clean_call_info;
-    uint current_offs;
-    current_offs = get_clean_call_switch_stack_size() -
-        proc_num_simd_registers() * sizeof(dr_simd_t);
+    uint current_offs, vtypei;
+    opnd_t memopnd;
+    current_offs = get_clean_call_switch_stack_size();
 
     /* sp is the stack pointer, which should not be poped. */
     cci->reg_skip[DR_REG_SP - DR_REG_START_GPR] = true;
 
-    /* XXX: c.sdsp/c.fsdsp has a zero-extended 9-bit offset, which is not enough for our
-     * usage.
+    /* Pop vector registers. */
+    current_offs -= proc_num_simd_registers() * sizeof(dr_simd_t);
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        /* ma:   mask agnostic
+         * ta:   tail agnostic
+         * sew:  selected element width
+         * lmul: vector register group multiplier
+         *
+         *           ma            ta         sew=8       lmul=8 */
+        vtypei = (0b1 << 7) | (0b1 << 6) | (0b000 << 3) | 0b011;
+        memopnd = opnd_create_dcontext_field_via_reg_sz(
+            dcontext, DR_REG_A0, 0, reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_8));
+        PRE(ilist, instr,
+            INSTR_CREATE_addi(dcontext, opnd_create_reg(DR_REG_A0),
+                              opnd_create_reg(DR_REG_SP),
+                              opnd_create_immed_int(current_offs, OPSZ_12b)));
+        /* For the following vector instructions, set the element width to 8b, and use 8
+         * registers as a group (lmul=8).
+         */
+        PRE(ilist, instr,
+            INSTR_CREATE_vsetvli(dcontext, opnd_create_reg(DR_REG_A1),
+                                 opnd_create_reg(DR_REG_ZERO),
+                                 opnd_create_immed_uint(vtypei, OPSZ_11b)));
+        /* Uses lmul=8 to copy 8 registers at a time. */
+        for (int reg = DR_REG_VR0; reg <= DR_REG_VR31; reg += 8) {
+            PRE(ilist, instr,
+                INSTR_CREATE_vle8_v(dcontext, opnd_create_reg(reg), memopnd,
+                                    opnd_create_immed_int(1, OPSZ_1b) /* mask disabled */,
+                                    opnd_create_immed_int(0, OPSZ_3b) /* nfields = 1 */));
+            /* If it's the last vector register group, no need to increase the offset. */
+            if (reg != DR_REG_VR24) {
+                PRE(ilist, instr,
+                    INSTR_CREATE_addi(
+                        dcontext, opnd_create_reg(DR_REG_A0), opnd_create_reg(DR_REG_A0),
+                        opnd_create_immed_int(8 * sizeof(dr_simd_t), OPSZ_12b)));
+            }
+        }
+    }
+
+    /* XXX: c.sdsp/c.fsdsp has a zero-extended 9-bit offset, which is not enough for
+     * our usage.
      */
-    ASSERT(current_offs >= DR_NUM_FPR_REGS * XSP_SZ);
     PRE(ilist, instr,
         INSTR_CREATE_addi(dcontext, opnd_create_reg(DR_REG_SP),
                           opnd_create_reg(DR_REG_SP),
                           opnd_create_immed_int(DR_NUM_FPR_REGS * XSP_SZ, OPSZ_12b)));
 
+    /* Uses c.[f]ldsp to save space, same below. */
+    current_offs -= XSP_SZ;
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        PRE(ilist, instr,
+            INSTR_CREATE_c_ldsp(
+                dcontext, opnd_create_reg(DR_REG_A0),
+                OPND_CREATE_MEM64(DR_REG_SP, current_offs - DR_NUM_FPR_REGS * XSP_SZ)));
+        /* csrw a0, vcsr */
+        PRE(ilist, instr,
+            INSTR_CREATE_csrrw(dcontext, opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_reg(DR_REG_A0),
+                               opnd_create_immed_int(VCSR, OPSZ_12b)));
+    }
+
+    current_offs -= XSP_SZ;
+
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        PRE(ilist, instr,
+            INSTR_CREATE_c_ldsp(
+                dcontext, opnd_create_reg(DR_REG_A0),
+                OPND_CREATE_MEM64(DR_REG_SP, current_offs - DR_NUM_FPR_REGS * XSP_SZ)));
+        /* csrw a0, vstart */
+        PRE(ilist, instr,
+            INSTR_CREATE_csrrw(dcontext, opnd_create_reg(DR_REG_ZERO),
+                               opnd_create_reg(DR_REG_A0),
+                               opnd_create_immed_int(VSTART, OPSZ_12b)));
+    }
+
     current_offs -= XSP_SZ;
-    /* Uses c.ldsp to save space, see -max_bb_instrs option, same below. */
     PRE(ilist, instr,
         INSTR_CREATE_c_ldsp(
             dcontext, opnd_create_reg(DR_REG_A0),
diff --git a/core/arch/riscv64/proc.c b/core/arch/riscv64/proc.c
index af31965f8e6..c7e214308ad 100644
--- a/core/arch/riscv64/proc.c
+++ b/core/arch/riscv64/proc.c
@@ -122,6 +122,24 @@ get_cache_line_size(DR_PARAM_OUT size_t *dcache_line_size,
     return false;
 }
 
+static void
+get_processor_specific_info(void)
+{
+#ifndef DR_HOST_NOT_TARGET
+    if (proc_has_feature(FEATURE_VECTOR)) {
+        uint64 vlenb = 0;
+        __asm__ __volatile__("csrr %0, 0xc22\n" : "=r"(vlenb));
+        cpu_info.vlenb = vlenb;
+        dr_set_vector_length(vlenb * 8);
+    } else {
+        cpu_info.vlenb = 32;
+        dr_set_vector_length(256);
+    }
+#else
+    dr_set_vector_length(256);
+#endif
+}
+
 void
 proc_init_arch(void)
 {
@@ -136,6 +154,10 @@ proc_init_arch(void)
                              /* icache_line_size= */ NULL)) {
         LOG(GLOBAL, LOG_TOP, 1, "Unable to obtain cache line size");
     }
+
+#ifndef DR_HOST_NOT_TARGET
+    get_processor_specific_info();
+#endif
 }
 
 bool
diff --git a/core/arch/riscv64/riscv64.asm b/core/arch/riscv64/riscv64.asm
index d64059bd1f8..19636fd8156 100644
--- a/core/arch/riscv64/riscv64.asm
+++ b/core/arch/riscv64/riscv64.asm
@@ -42,7 +42,7 @@ START_FILE
 
 /* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */
 /* The reserved space for SIMD is also included. */
-#define PRIV_MCONTEXT_SIZE 0x290
+#define PRIV_MCONTEXT_SIZE 0x620
 
 /* offset of priv_mcontext_t in dr_mcontext_t */
 #define PRIV_MCONTEXT_OFFSET 16
@@ -52,9 +52,9 @@ START_FILE
 #endif
 
 /* offsetof(dcontext_t, dstack) */
-#define dstack_OFFSET     0x2d8
+#define dstack_OFFSET 0x668
 /* offsetof(dcontext_t, is_exiting) */
-#define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ)
+#define is_exiting_OFFSET (dstack_OFFSET + 1 * ARG_SZ)
 
 #ifndef RISCV64
 # error RISCV64 must be defined
@@ -223,7 +223,8 @@ save_priv_mcontext_helper:
         fsd      f31, 64*ARG_SZ(ARG1)
         frcsr    x3
         sd       x3,  65*ARG_SZ(ARG1)
-        /* No need to save simd registers, at least for now. */
+        /* TODO i#3544: Save vector registers too? That would require runtime detection
+         * for vector support and vlenb. */
         ret
 
         DECLARE_EXPORTED_FUNC(dr_app_start)
diff --git a/core/globals.h b/core/globals.h
index e136313ae22..3d621898854 100644
--- a/core/globals.h
+++ b/core/globals.h
@@ -297,13 +297,15 @@ typedef struct _thread_record_t {
 #    define DYNAMORIO_EXPORT DR_APP_API
 #endif
 
-/* AArch64 Scalable Vector Extension's vector length in bits. This depends on
- * the hardware implementation and can be one of:
- * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048
- * See https://developer.arm.com/documentation/102476/0100/Introducing-SVE
+/* - AArch64 Scalable Vector Extension's vector length in bits. This depends on
+ *   the hardware implementation and can be one of:
+ *   128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048
+ *   See https://developer.arm.com/documentation/102476/0100/Introducing-SVE
+ * - RISC-V Vector's vector length in bits which is from 64 to 65536 in the
+ *   power of 2.
  * This variable stores the length for off-line decoding.
  */
-extern int sve_veclen;
+extern int vector_length;
 
 #include "heap.h"
 #include "options_struct.h"
diff --git a/core/ir/aarch64/codec.c b/core/ir/aarch64/codec.c
index 5460b31e9b7..1f5a5f7795c 100644
--- a/core/ir/aarch64/codec.c
+++ b/core/ir/aarch64/codec.c
@@ -5229,7 +5229,7 @@ decode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd
      * memory displacement. So when creating the address operand here, it should be
      * multiplied by the current vector register length in bytes.
      */
-    int vl_bytes = dr_get_sve_vector_length() / 8;
+    int vl_bytes = dr_get_vector_length() / 8;
     *opnd = opnd_create_base_disp(rn, DR_REG_NULL, 0, offset * vl_bytes, mem_transfer);
 
     return true;
@@ -5251,7 +5251,7 @@ encode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, opnd_t opnd,
      * vector length at the IR level, transformed to a vector index in the
      * encoding.
      */
-    int vl_bytes = dr_get_sve_vector_length() / 8;
+    int vl_bytes = dr_get_vector_length() / 8;
     if ((opnd_get_disp(opnd) % vl_bytes) != 0)
         return false;
     int disp = opnd_get_disp(opnd) / vl_bytes;
@@ -5381,7 +5381,7 @@ decode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd
      * address operand here, it should be multiplied by the current vector or
      * predicate register length in bytes.
      */
-    int vl_bytes = dr_get_sve_vector_length() / 8;
+    int vl_bytes = dr_get_vector_length() / 8;
     int pl_bytes = vl_bytes / 8;
     int mul_len = is_vector ? vl_bytes : pl_bytes;
     *opnd =
@@ -5410,7 +5410,7 @@ encode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, opnd_t opnd,
      * vector or predicate length at the IR level, transformed to a vector or
      * predicate index in the encoding.
      */
-    int vl_bytes = dr_get_sve_vector_length() / 8;
+    int vl_bytes = dr_get_vector_length() / 8;
     int pl_bytes = vl_bytes / 8;
     if (is_vector) {
         if ((opnd_get_disp(opnd) % vl_bytes) != 0)
@@ -8091,7 +8091,7 @@ decode_opnd_svemem_gpr_simm4_vl_xreg(uint enc, int opcode, byte *pc, OUT opnd_t
 
     /* The offset is scaled by the size of the vector in memory.*/
     const uint register_count = BITS(enc, 22, 21) + 1;
-    const uint scale = (register_count * dr_get_sve_vector_length()) / 8;
+    const uint scale = (register_count * dr_get_vector_length()) / 8;
 
     return decode_svemem_gpr_simm4(enc, element_size, scale, opnd);
 }
@@ -8104,7 +8104,7 @@ encode_opnd_svemem_gpr_simm4_vl_xreg(uint enc, int opcode, byte *pc, opnd_t opnd
 
     /* The offset is scaled by the size of the vector in memory.*/
     const uint register_count = BITS(enc, 22, 21) + 1;
-    const uint scale = (register_count * dr_get_sve_vector_length()) / 8;
+    const uint scale = (register_count * dr_get_vector_length()) / 8;
 
     return encode_svemem_gpr_simm4(enc, element_size, scale, opnd, enc_out);
 }
diff --git a/core/ir/decode_shared.c b/core/ir/decode_shared.c
index 8a190790e69..ac4892e4b57 100644
--- a/core/ir/decode_shared.c
+++ b/core/ir/decode_shared.c
@@ -151,6 +151,7 @@ const char *const size_names[] = {
     "OPSZ_4_of_32_evex64",
     "OPSZ_8_of_32_evex64",
     "OPSZ_8x16",
+    "OPSZ_256",
     "OPSZ_1_of_4",
     "OPSZ_2_of_4",
     "OPSZ_1_of_8",
@@ -174,29 +175,40 @@ const char *const size_names[] = {
     "OPSZ_eighth_16_vex32_evex64",
 };
 
-/* AArch64 Scalable Vector Extension's vector length in bits. */
-int sve_veclen;
-int sve_veclens[] = { 128,  256,  384,  512,  640,  768,  896,  1024,
-                      1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048 };
+/* AArch64 SVE or RISC-V Vector's vector length in bits. */
+int vector_length;
+
+/* AArch64 SVE valid vector lengths. */
+int sve_vector_lengths[] = { 128,  256,  384,  512,  640,  768,  896,  1024,
+                             1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048 };
 
 bool
-dr_set_sve_vector_length(int vl)
+dr_set_vector_length(int vl)
 {
-    for (int i = 0; i < sizeof(sve_veclens) / sizeof(sve_veclens[0]); i++) {
-        if (vl == sve_veclens[i]) {
-            sve_veclen = vl;
+#if defined(AARCH64)
+    for (int i = 0; i < sizeof(sve_vector_lengths) / sizeof(sve_vector_lengths[0]); i++) {
+        if (vl == sve_vector_lengths[i]) {
+            vector_length = vl;
             return true;
         }
     }
+#elif defined(RISCV64)
+    const int riscv_vlen_min = 64;
+    const int riscv_vlen_max = 65536;
+    if (vl >= riscv_vlen_min && vl <= riscv_vlen_max && IS_POWER_OF_2(vl)) {
+        vector_length = vl;
+        return true;
+    }
+#endif
     /* Make unusual values visible in case our internal uses mess up. */
     ASSERT_CURIOSITY(false);
     return false;
 }
 
 int
-dr_get_sve_vector_length(void)
+dr_get_vector_length(void)
 {
-    return sve_veclen;
+    return vector_length;
 }
 
 /* point at this when you need a canonical invalid instr
diff --git a/core/ir/encode_api.h b/core/ir/encode_api.h
index 179e9471da8..e7af6aef8bf 100644
--- a/core/ir/encode_api.h
+++ b/core/ir/encode_api.h
@@ -150,21 +150,23 @@ dr_get_isa_mode(void *drcontext);
 
 DR_API
 /**
- * AArch64 Scalable Vector Extension's vector length in bits is one of:
- * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048
+ * - AArch64 Scalable Vector Extension's vector length in bits is one of:
+ *   128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048
+ * - RISC-V Vector Extension's vector length in bit is from 64 to 65536 in the
+ *   power of 2.
  * Returns whether successful.
  * TODO i#3044: This function will only allow setting vector length if not
- * running on SVE.
+ * running on SVE or RVV.
  */
 bool
-dr_set_sve_vector_length(int vl);
+dr_set_vector_length(int vl);
 
 DR_API
 /**
- * Read AArch64 Scalable Vector Extension's vector length, in bits.
+ * Read AArch64 SVE or RISC-V Vector's vector length, in bits.
  */
 int
-dr_get_sve_vector_length(void);
+dr_get_vector_length(void);
 
 enum {
 #ifdef X86
diff --git a/core/ir/opnd.h b/core/ir/opnd.h
index 9c83d551550..09b4e0b4dc1 100644
--- a/core/ir/opnd.h
+++ b/core/ir/opnd.h
@@ -363,7 +363,7 @@ extern reg_id_t dr_reg_stolen;
 #        define OPSZ_SVE_PREDLEN_BYTES \
             opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)
 #    else
-/* SVE vector length for off-line decoder set using dr_set_sve_vector_length() or -vl
+/* SVE vector length for off-line decoder set using dr_set_vector_length() or -vl
  * option with drdisas,
  * e.g.
  * $ drdisas -vl 256 e58057a1 85865e6b
@@ -372,11 +372,21 @@ extern reg_id_t dr_reg_stolen;
  * $
  */
 /* Size of the SVE Z vector registers in bytes. */
-#        define OPSZ_SVE_VECLEN_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
+#        define OPSZ_SVE_VECLEN_BYTES opnd_size_from_bytes(dr_get_vector_length() / 8)
 /* Size of the SVE P predicate registers in bytes. */
 #        define OPSZ_SVE_PREDLEN_BYTES \
-            opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
+            opnd_size_from_bytes((dr_get_vector_length() / 8) / 8)
 #    endif
 #endif /*AARCH64*/
 
+#ifdef RISCV64
+#    if !defined(DR_HOST_NOT_TARGE) && !defined(STANDALONE_DECODER) && \
+        !defined(BUILD_TESTS)
+/* Size of the RVV registers in bytes. */
+#        define OPSZ_RVV_VECLEN_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes())
+#    else
+#        define OPSZ_RVV_VECLEN_BYTES opnd_size_from_bytes(dr_get_vector_length() / 8)
+#    endif
+#endif
+
 #endif /* _OPND_H_ */
diff --git a/core/ir/opnd_api.h b/core/ir/opnd_api.h
index d02fa6ee290..0bf49f0f391 100644
--- a/core/ir/opnd_api.h
+++ b/core/ir/opnd_api.h
@@ -222,6 +222,8 @@ enum {
     OPSZ_8x16, /**< 8 or 16 bytes, but not based on rex prefix, instead dependent
                 * on 32-bit/64-bit mode.
                 */
+
+    OPSZ_256, /**< 256 bytes. Needed for RISC-V vector extension with LMUL. */
     /* Add new size here.  Also update size_names[] in decode_shared.c along with
      * the size routines in opnd_shared.c.
      */
@@ -1260,6 +1262,43 @@ enum {
     DR_REG_F30,  /**< The f30(ft10) floating-point register. */
     DR_REG_F31,  /**< The f31(ft11) floating-point register. */
     DR_REG_FCSR, /**< The floating-point control and status register. */
+
+    /* Vector registers, we name the macros DR_REG_VR* to avoid conflict with virtual
+     * registers.
+     */
+    DR_REG_VR0,  /**< The v0 vector register. */
+    DR_REG_VR1,  /**< The v1 vector register. */
+    DR_REG_VR2,  /**< The v2 vector register. */
+    DR_REG_VR3,  /**< The v3 vector register. */
+    DR_REG_VR4,  /**< The v4 vector register. */
+    DR_REG_VR5,  /**< The v5 vector register. */
+    DR_REG_VR6,  /**< The v6 vector register. */
+    DR_REG_VR7,  /**< The v7 vector register. */
+    DR_REG_VR8,  /**< The v8 vector register. */
+    DR_REG_VR9,  /**< The v9 vector register. */
+    DR_REG_VR10, /**< The v10 vector register. */
+    DR_REG_VR11, /**< The v11 vector register. */
+    DR_REG_VR12, /**< The v12 vector register. */
+    DR_REG_VR13, /**< The v13 vector register. */
+    DR_REG_VR14, /**< The v14 vector register. */
+    DR_REG_VR15, /**< The v15 vector register. */
+    DR_REG_VR16, /**< The v16 vector register. */
+    DR_REG_VR17, /**< The v17 vector register. */
+    DR_REG_VR18, /**< The v18 vector register. */
+    DR_REG_VR19, /**< The v19 vector register. */
+    DR_REG_VR20, /**< The v20 vector register. */
+    DR_REG_VR21, /**< The v21 vector register. */
+    DR_REG_VR22, /**< The v22 vector register. */
+    DR_REG_VR23, /**< The v23 vector register. */
+    DR_REG_VR24, /**< The v24 vector register. */
+    DR_REG_VR25, /**< The v25 vector register. */
+    DR_REG_VR26, /**< The v26 vector register. */
+    DR_REG_VR27, /**< The v27 vector register. */
+    DR_REG_VR28, /**< The v28 vector register. */
+    DR_REG_VR29, /**< The v29 vector register. */
+    DR_REG_VR30, /**< The v30 vector register. */
+    DR_REG_VR31, /**< The v31 vector register. */
+
     /* FPR aliases */
     DR_REG_FT0 = DR_REG_F0, /**< The 1st temporary floating-point (f0) register. */
     DR_REG_FT1 = DR_REG_F1, /**< The 2nd temporary floating-point (f1) register. */
@@ -1298,8 +1337,8 @@ enum {
 
     /* FIXME i#3544: CCSRs */
 
-    DR_REG_LAST_VALID_ENUM = DR_REG_FCSR, /**< Last valid register enum. */
-    DR_REG_LAST_ENUM = DR_REG_FCSR,       /**< Last value of register enums. */
+    DR_REG_LAST_VALID_ENUM = DR_REG_VR31, /**< Last valid register enum. */
+    DR_REG_LAST_ENUM = DR_REG_VR31,       /**< Last value of register enums. */
 
     DR_REG_START_64 = DR_REG_X1,  /**< Start of 64-bit register enum values. */
     DR_REG_STOP_64 = DR_REG_F31,  /**< End of 64-bit register enum values. */
@@ -1309,11 +1348,14 @@ enum {
     DR_REG_STOP_GPR = DR_REG_X31, /**< End of general registers. */
     DR_REG_START_FPR = DR_REG_F0, /**< Start of floating-point registers. */
     DR_REG_STOP_FPR = DR_REG_F31, /**< End of floating-point registers. */
+    DR_REG_START_VR = DR_REG_VR0, /**< Start of vector registers. */
+    DR_REG_STOP_VR = DR_REG_VR31, /**< End of vector registers. */
     DR_REG_XSP = DR_REG_SP, /**< Platform-independent way to refer to stack pointer. */
 
     DR_NUM_GPR_REGS = DR_REG_STOP_GPR - DR_REG_START_GPR + 1, /**< Count of GPR regs. */
     DR_NUM_FPR_REGS = DR_REG_STOP_FPR - DR_REG_START_FPR + 1, /**< Count of FPR regs. */
-    DR_NUM_SIMD_VECTOR_REGS = 0,                              /**< Count of SIMD regs. */
+    DR_NUM_VR_REGS = DR_REG_STOP_VR - DR_REG_START_VR + 1, /**< Count of vector regs. */
+    DR_NUM_SIMD_VECTOR_REGS = 0,                           /**< Count of SIMD regs. */
 #else /* RISCV64 */
 #    error Register definitions missing for this platform.
 #endif
@@ -1591,6 +1633,25 @@ typedef ushort reg_id_t; /**< The type of a DR_REG_ enum value. */
  */
 typedef byte opnd_size_t; /**< The type of an OPSZ_ enum value. */
 
+#ifdef RISCV64
+/**
+ * The LMUL type for RISCV64 vector extension.
+ * We keep the encoding in sync with the specification, see page 12 of RISC-V "V" Vector
+ * Extension Version 1.0.
+ * We encode the lmul as signed number that fits into 3-bits, see reg_get_size_lmul() for
+ * the usage.
+ */
+typedef enum {
+    RV64_LMUL_1_8 = -3, /**< RISC-V vector extension LMUL 1/8. */
+    RV64_LMUL_1_4 = -2, /**< RISC-V vector extension LMUL 1/4. */
+    RV64_LMUL_1_2 = -1, /**< RISC-V vector extension LMUL 1/2. */
+    RV64_LMUL_1 = 0,    /**< RISC-V vector extension LMUL 1. */
+    RV64_LMUL_2 = 1,    /**< RISC-V vector extension LMUL 2. */
+    RV64_LMUL_4 = 2,    /**< RISC-V vector extension LMUL 4. */
+    RV64_LMUL_8 = 3,    /**< RISC-V vector extension LMUL 8. */
+} lmul_t;
+#endif
+
 #ifdef X86
 /* Platform-independent full-register specifiers */
 #    ifdef X64
@@ -3609,6 +3670,18 @@ DR_API
 opnd_size_t
 reg_get_size(reg_id_t reg);
 
+#ifdef RISCV64
+DR_API
+/**
+ * Assumes that \p reg is a DR_REG_VR constant.
+ * Returns the OPSZ_ constant corresponding to the vector register size and lmul.
+ * Returns OPSZ_NA if reg is not a DR_REG_VR constant.
+ * \note RISCV64-only.
+ */
+opnd_size_t
+reg_get_size_lmul(reg_id_t reg, lmul_t lmul);
+#endif
+
 DR_API
 /**
  * Assumes that \p reg is a DR_REG_ constant.
diff --git a/core/ir/opnd_shared.c b/core/ir/opnd_shared.c
index 815c5098f8d..a37511979f4 100644
--- a/core/ir/opnd_shared.c
+++ b/core/ir/opnd_shared.c
@@ -2000,6 +2000,7 @@ opnd_size_in_bytes(opnd_size_t size)
     case OPSZ_120: return 120;
     case OPSZ_124: return 124;
     case OPSZ_128: return 128;
+    case OPSZ_256: return 256;
     case OPSZ_512: return 512;
     case OPSZ_VAR_REGLIST: return 0; /* varies to match reglist operand */
     case OPSZ_xsave:
@@ -2076,6 +2077,7 @@ opnd_size_from_bytes(uint bytes)
     case 120: return OPSZ_120;
     case 124: return OPSZ_124;
     case 128: return OPSZ_128;
+    case 256: return OPSZ_256;
     case 512: return OPSZ_512;
     default: return OPSZ_NA;
     }
@@ -2437,10 +2439,8 @@ reg_32_to_16(reg_id_t reg)
     CLIENT_ASSERT(false, "reg_32_to_16 not supported on ARM");
     return REG_NULL;
 #elif defined(RISCV64)
-    /* FIXME i#3544: There is no separate addressing for half registers.
-     * Semantics are part of the opcode.
-     */
-    return reg;
+    CLIENT_ASSERT(false, "reg_32_to_16 not supported on RISCV64");
+    return REG_NULL;
 #endif
 }
 
@@ -2464,10 +2464,8 @@ reg_32_to_8(reg_id_t reg)
     CLIENT_ASSERT(false, "reg_32_to_8 not supported on ARM");
     return REG_NULL;
 #elif defined(RISCV64)
-    /* FIXME i#3544: There is no separate addressing for half registers.
-     * Semantics are part of the opcode.
-     */
-    return reg;
+    CLIENT_ASSERT(false, "reg_32_to_8 not supported on RISCV64");
+    return REG_NULL;
 #endif
 }
 
@@ -2537,12 +2535,12 @@ reg_32_to_opsz(reg_id_t reg, opnd_size_t sz)
     if (sz == OPSZ_4)
         return reg;
     else if (sz == OPSZ_2)
-        return IF_AARCHXX_ELSE(reg, reg_32_to_16(reg));
+        return IF_AARCHXX_OR_RISCV64_ELSE(reg, reg_32_to_16(reg));
     else if (sz == OPSZ_1)
-        return IF_AARCHXX_ELSE(reg, reg_32_to_8(reg));
+        return IF_AARCHXX_OR_RISCV64_ELSE(reg, reg_32_to_8(reg));
 #ifdef X64
     else if (sz == OPSZ_8)
-        return reg_32_to_64(reg);
+        return IF_RISCV64_ELSE(reg, reg_32_to_64(reg));
 #endif
     else
         CLIENT_ASSERT(false, "reg_32_to_opsz: invalid size parameter");
@@ -2790,6 +2788,8 @@ reg_get_size(reg_id_t reg)
 #elif defined(RISCV64)
     if (reg == DR_REG_X0)
         return OPSZ_8;
+    else if (reg >= DR_REG_VR0 && reg <= DR_REG_VR31)
+        return OPSZ_RVV_VECLEN_BYTES;
 #endif
     LOG(GLOBAL, LOG_ANNOTATIONS, 2, "reg=%d, %s, last reg=%d\n", reg,
         get_register_name(reg), DR_REG_LAST_ENUM);
@@ -2797,6 +2797,34 @@ reg_get_size(reg_id_t reg)
     return OPSZ_NA;
 }
 
+#ifdef RISCV64
+/* Returns the OPSZ_ constant corresponding to the vector register size and lmul.
+ * Page 12 of RISC-V "V" Vector Extension Version 1.0.
+ */
+opnd_size_t
+reg_get_size_lmul(reg_id_t reg, lmul_t lmul)
+{
+    if (reg >= DR_REG_VR0 && reg <= DR_REG_VR31) {
+        /* lmul is a 3-bit signed number encoded as the shift amount,
+         * so (vlen >> (3 - lmul)) converts the hardware vector length in bits to the
+         * effective vector length in bytes.
+         */
+        ASSERT(lmul <= 3 && lmul >= -3);
+        opnd_size_t opsz = opnd_size_from_bytes(dr_get_vector_length() >> (3 - lmul));
+
+        LOG(GLOBAL, LOG_ANNOTATIONS, 2, "reg=%d, %s, last reg=%d\n", reg,
+            get_register_name(reg), DR_REG_LAST_ENUM);
+        CLIENT_ASSERT(opsz != OPSZ_NA, "reg_get_size_lmul: invalid register");
+        return opsz;
+    }
+
+    LOG(GLOBAL, LOG_ANNOTATIONS, 2, "reg=%d, %s, last reg=%d\n", reg,
+        get_register_name(reg), DR_REG_LAST_ENUM);
+    CLIENT_ASSERT(false, "reg_get_size_lmul: invalid register");
+    return OPSZ_NA;
+}
+#endif
+
 #ifndef STANDALONE_DECODER
 /****************************************************************************/
 /* dcontext convenience routines */
@@ -2896,3 +2924,31 @@ opnd_create_tls_slot(int offs)
 
 #endif /* !STANDALONE_DECODER */
 /****************************************************************************/
+
+#ifdef STANDALONE_UNIT_TEST
+
+#    ifdef RISCV64
+void
+test_reg_get_size_lmul(void)
+{
+    dr_set_vector_length(256);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_1_8), OPSZ_4);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_1_4), OPSZ_8);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_1_2), OPSZ_16);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_1), OPSZ_32);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_2), OPSZ_64);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_4), OPSZ_128);
+    EXPECT(reg_get_size_lmul(DR_REG_VR0, RV64_LMUL_8), OPSZ_256);
+}
+#    endif /* RISCV64 */
+
+void
+unit_test_opnd_shared(void)
+{
+#    ifdef RISCV64
+    test_reg_get_size_lmul();
+#    endif
+    print_file(STDERR, "done testing opnd_shared\n");
+}
+
+#endif /* STANDALONE_UNIT_TEST */
diff --git a/core/ir/riscv64/codec.c b/core/ir/riscv64/codec.c
index 65ff0da12a7..6e542638f49 100644
--- a/core/ir/riscv64/codec.c
+++ b/core/ir/riscv64/codec.c
@@ -152,6 +152,21 @@ decode_rdfp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig_
     return true;
 }
 
+/* Decode the destination vector register field:
+ * |31 12|11   7|6      0|
+ * | ... |  vd  | opcode |
+ *        ^----^
+ */
+static bool
+decode_vd_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig_pc, int idx,
+               instr_t *out)
+{
+    reg_t reg = DR_REG_VR0 + GET_FIELD(inst, 11, 7);
+    opnd_t opnd = opnd_create_reg(reg);
+    instr_set_dst(out, idx, opnd);
+    return true;
+}
+
 /* Decode the 1st source fixed-point register field:
  * |31 20|19   15|14  7|6      0|
  * | ... |  rs1  | ... | opcode |
@@ -184,6 +199,21 @@ decode_rs1fp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig
     return true;
 }
 
+/* Decode the 1st source vector register field:
+ * |31 20|19   15|14  7|6      0|
+ * | ... |  vs1  | ... | opcode |
+ *        ^-----^
+ */
+static bool
+decode_vs1_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig_pc,
+                int idx, instr_t *out)
+{
+    reg_t reg = DR_REG_VR0 + GET_FIELD(inst, 19, 15);
+    opnd_t opnd = opnd_create_reg(reg);
+    instr_set_src(out, idx, opnd);
+    return true;
+}
+
 /* Decode the rs1 field as a base register:
  * |31 20|19    15|14  7|6      0|
  * | ... |  base  | ... | opcode |
@@ -232,6 +262,21 @@ decode_rs2fp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig
     return true;
 }
 
+/* Decode the 2nd source vector register field:
+ * |31 25|24   20|19  7|6      0|
+ * | ... |  vs2  | ... | opcode |
+ *        ^-----^
+ */
+static bool
+decode_vs2_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig_pc,
+                int idx, instr_t *out)
+{
+    reg_t reg = DR_REG_VR0 + GET_FIELD(inst, 24, 20);
+    opnd_t opnd = opnd_create_reg(reg);
+    instr_set_src(out, idx, opnd);
+    return true;
+}
+
 /* Decode the 3rd source fixed-point register field:
  * |31 27|26  7|6      0|
  * | rs3 | ... | opcode |
@@ -248,6 +293,21 @@ decode_rs3fp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig
     return true;
 }
 
+/* Decode the 3rd source vector register field:
+ * |31 12|11  7|6      0|
+ * | ... | vs3 | opcode |
+ *        ^----^
+ */
+static bool
+decode_vs3_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc, byte *orig_pc,
+                int idx, instr_t *out)
+{
+    reg_t reg = DR_REG_VR0 + GET_FIELD(inst, 11, 7);
+    opnd_t opnd = opnd_create_reg(reg);
+    instr_set_src(out, idx, opnd);
+    return true;
+}
+
 /* Decode the fence mode field of the "fence" instruction:
  * |31  28| 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 |19 15|14    12|11 7|6   0|
  * |  fm  | PI | PO | PR | PW | SI | SO | SR | SW | rs1 | funct3 | rd | 0xF |
@@ -1051,9 +1111,11 @@ decode_v_l_rs1_disp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc,
                          byte *orig_pc, int idx, instr_t *out)
 {
     reg_t reg = DR_REG_X0 + GET_FIELD(inst, 19, 15);
-    /* Immediate part of LR.W/D is always 0. */
-    int32_t imm =
-        GET_FIELD(inst, 6, 0) == 0b0101111 ? 0 : SIGN_EXTEND(GET_FIELD(inst, 31, 20), 12);
+    /* Immediate part of LR.W/D or vector load is always 0. */
+    bool is_vector_load = GET_FIELD(inst, 6, 0) == 0b0000111 &&
+        (GET_FIELD(inst, 14, 12) == 0 || GET_FIELD(inst, 14, 12) > 0b100);
+    bool is_lr = GET_FIELD(inst, 6, 0) == 0b0101111;
+    int32_t imm = is_vector_load || is_lr ? 0 : SIGN_EXTEND(GET_FIELD(inst, 31, 20), 12);
     opnd_t opnd = opnd_add_flags(opnd_create_base_disp(reg, DR_REG_NULL, 0, imm, op_sz),
                                  DR_OPND_IMM_PRINT_DECIMAL);
     instr_set_src(out, idx, opnd);
@@ -1076,8 +1138,11 @@ decode_v_s_rs1_disp_opnd(dcontext_t *dc, uint32_t inst, int op_sz, byte *pc,
                          byte *orig_pc, int idx, instr_t *out)
 {
     reg_t reg = DR_REG_X0 + GET_FIELD(inst, 19, 15);
-    /* Immediate part of SC.W/D is always 0. */
-    int32_t imm = GET_FIELD(inst, 6, 0) == 0b0101111
+    /* Immediate part of SC.W/D or vector store is always 0. */
+    bool is_vector_store = GET_FIELD(inst, 6, 0) == 0b0100111 &&
+        (GET_FIELD(inst, 14, 12) == 0 || GET_FIELD(inst, 14, 12) > 0b100);
+    bool is_sc = GET_FIELD(inst, 6, 0) == 0b0101111;
+    int32_t imm = is_vector_store || is_sc
         ? 0
         : (GET_FIELD(inst, 31, 25) << 5) | GET_FIELD(inst, 11, 7);
     imm = SIGN_EXTEND(imm, 12);
@@ -1362,10 +1427,10 @@ opnd_dec_func_t opnd_decoders[] = {
     [RISCV64_FLD_VM] = decode_vm_opnd,
     [RISCV64_FLD_NF] = decode_nf_opnd,
     [RISCV64_FLD_SIMM5] = decode_simm5_opnd,
-    [RISCV64_FLD_VD] = decode_rd_opnd,
-    [RISCV64_FLD_VS1] = decode_rs1_opnd,
-    [RISCV64_FLD_VS2] = decode_rs2_opnd,
-    [RISCV64_FLD_VS3] = decode_rd_opnd,
+    [RISCV64_FLD_VD] = decode_vd_opnd,
+    [RISCV64_FLD_VS1] = decode_vs1_opnd,
+    [RISCV64_FLD_VS2] = decode_vs2_opnd,
+    [RISCV64_FLD_VS3] = decode_vs3_opnd,
     [RISCV64_FLD_I_S_RS1_DISP] = decode_v_s_rs1_disp_opnd,
 };
 
@@ -1752,6 +1817,20 @@ encode_rdfp_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_t
     return true;
 }
 
+/* Encode the destination vector register field:
+ * |31 12|11   7|6      0|
+ * | ... |  vd  | opcode |
+ *        ^----^
+ */
+static bool
+encode_vd_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_t *di)
+{
+    opnd_t opnd = instr_get_dst(instr, idx);
+    uint32_t reg = opnd_get_reg(opnd) - DR_REG_VR0;
+    *out |= SET_FIELD(reg, 11, 7);
+    return true;
+}
+
 /* Encode the 1st source fixed-point register field:
  * |31 20|19   15|14  7|6      0|
  * | ... |  rs1  | ... | opcode |
@@ -1783,6 +1862,20 @@ encode_rs1fp_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_
     return true;
 }
 
+/* Encode the 1st source vector register field:
+ * |31 20|19   15|14  7|6      0|
+ * | ... |  vs1  | ... | opcode |
+ *        ^-----^
+ */
+static bool
+encode_vs1_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_t *di)
+{
+    opnd_t opnd = instr_get_src(instr, idx);
+    uint32_t reg = opnd_get_reg(opnd) - DR_REG_VR0;
+    *out |= SET_FIELD(reg, 19, 15);
+    return true;
+}
+
 /* Encode the rs1 field as a base register:
  * |31 20|19    15|14  7|6      0|
  * | ... |  base  | ... | opcode |
@@ -1829,6 +1922,20 @@ encode_rs2fp_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_
     return true;
 }
 
+/* Encode the 2nd source vector register field:
+ * |31 25|24   20|19  7|6      0|
+ * | ... |  vs2  | ... | opcode |
+ *        ^-----^
+ */
+static bool
+encode_vs2_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_t *di)
+{
+    opnd_t opnd = instr_get_src(instr, idx);
+    uint32_t reg = opnd_get_reg(opnd) - DR_REG_VR0;
+    *out |= SET_FIELD(reg, 24, 20);
+    return true;
+}
+
 /* Encode the 3rd source fixed-point register field:
  * |31 27|26  7|6      0|
  * | rs3 | ... | opcode |
@@ -1844,6 +1951,20 @@ encode_rs3fp_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_
     return true;
 }
 
+/* Encode the 3rd source vector register field:
+ * |31 12|11  7|6      0|
+ * | ... | vs3 | opcode |
+ *        ^----^
+ */
+static bool
+encode_vs3_opnd(instr_t *instr, byte *pc, int idx, uint32_t *out, decode_info_t *di)
+{
+    opnd_t opnd = instr_get_src(instr, idx);
+    uint32_t reg = opnd_get_reg(opnd) - DR_REG_VR0;
+    *out |= SET_FIELD(reg, 11, 7);
+    return true;
+}
+
 /* Encode the fence mode field of the "fence" instruction:
  * |31  28| 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 |19 15|14    12|11 7|6   0|
  * |  fm  | PI | PO | PR | PW | SI | SO | SR | SW | rs1 | funct3 | rd | 0xF |
@@ -2785,10 +2906,10 @@ opnd_enc_func_t opnd_encoders[] = {
     [RISCV64_FLD_VM] = encode_vm_opnd,
     [RISCV64_FLD_NF] = encode_nf_opnd,
     [RISCV64_FLD_SIMM5] = encode_simm5_opnd,
-    [RISCV64_FLD_VD] = encode_rd_opnd,
-    [RISCV64_FLD_VS1] = encode_rs1_opnd,
-    [RISCV64_FLD_VS2] = encode_rs2_opnd,
-    [RISCV64_FLD_VS3] = encode_rd_opnd,
+    [RISCV64_FLD_VD] = encode_vd_opnd,
+    [RISCV64_FLD_VS1] = encode_vs1_opnd,
+    [RISCV64_FLD_VS2] = encode_vs2_opnd,
+    [RISCV64_FLD_VS3] = encode_vs3_opnd,
     [RISCV64_FLD_I_S_RS1_DISP] = encode_implicit_opnd,
 };
 
diff --git a/core/ir/riscv64/codec.py b/core/ir/riscv64/codec.py
index d3520b2c255..6f8bc7b605d 100755
--- a/core/ir/riscv64/codec.py
+++ b/core/ir/riscv64/codec.py
@@ -957,8 +957,9 @@ def __fixup_uncompressed_inst(self, inst: Instruction):
         elif opc in [0b0100011, 0b0100111]:  # STORE instructions
             dbg(f'fixup: {inst.name} {[f.name for f in inst.flds]}')
             if opc == 0b0100111 and funct3 in [0b000, 0b101, 0b110, 0b111]:
-                # Vector store instructions have no imm part
-                inst.flds[-2] = Field.V_S_RS1_DISP
+                # Vector store instructions have no imm part. Also swap operands
+                # to be consistent with the scalar instruction encoding.
+                inst.flds[-1], inst.flds[-2] = Field.V_S_RS1_DISP, inst.flds[-1]
             else:
                 inst.flds[2] = Field.V_S_RS1_DISP
                 inst.flds.pop(0)
diff --git a/core/ir/riscv64/encode.c b/core/ir/riscv64/encode.c
index 5c3a99af264..dde1ba3aa52 100644
--- a/core/ir/riscv64/encode.c
+++ b/core/ir/riscv64/encode.c
@@ -47,7 +47,10 @@ const char *const reg_names[] = {
     "ft0",  "ft1",  "ft2", "ft3", "ft4", "ft5", "ft6",  "ft7",  "fs0", "fs1",
     "fa0",  "fa1",  "fa2", "fa3", "fa4", "fa5", "fa6",  "fa7",  "fs2", "fs3",
     "fs4",  "fs5",  "fs6", "fs7", "fs8", "fs9", "fs10", "fs11", "ft8", "ft9",
-    "ft10", "ft11", "fcsr",
+    "ft10", "ft11", "fcsr", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+    "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+    "v30", "v31"
 };
 
 
@@ -67,6 +70,12 @@ const reg_id_t dr_reg_fixer[] = { REG_NULL,
     DR_REG_F18, DR_REG_F19, DR_REG_F20, DR_REG_F21, DR_REG_F22, DR_REG_F23,
     DR_REG_F24, DR_REG_F25, DR_REG_F26, DR_REG_F27, DR_REG_F28, DR_REG_F29,
     DR_REG_F30, DR_REG_F31, DR_REG_FCSR,
+    DR_REG_VR0,  DR_REG_VR1,  DR_REG_VR2,  DR_REG_VR3,  DR_REG_VR4,  DR_REG_VR5,
+    DR_REG_VR6,  DR_REG_VR7,  DR_REG_VR8,  DR_REG_VR9,  DR_REG_VR10, DR_REG_VR11,
+    DR_REG_VR12, DR_REG_VR13, DR_REG_VR14, DR_REG_VR15, DR_REG_VR16, DR_REG_VR17,
+    DR_REG_VR18, DR_REG_VR19, DR_REG_VR20, DR_REG_VR21, DR_REG_VR22, DR_REG_VR23,
+    DR_REG_VR24, DR_REG_VR25, DR_REG_VR26, DR_REG_VR27, DR_REG_VR28, DR_REG_VR29,
+    DR_REG_VR30, DR_REG_VR31,
 };
 /* clang-format on */
 
@@ -144,6 +153,39 @@ const reg_id_t d_r_reg_id_to_virtual[] = {
     DR_REG_V63, /* DR_REG_F30 */
     DR_REG_V64, /* DR_REG_F31 */
     DR_REG_V65, /* DR_REG_FCSR */
+
+    DR_REG_V66, /* DR_REG_VR0 */
+    DR_REG_V67, /* DR_REG_VR1 */
+    DR_REG_V68, /* DR_REG_VR2 */
+    DR_REG_V69, /* DR_REG_VR3 */
+    DR_REG_V70, /* DR_REG_VR4 */
+    DR_REG_V71, /* DR_REG_VR5 */
+    DR_REG_V72, /* DR_REG_VR6 */
+    DR_REG_V73, /* DR_REG_VR7 */
+    DR_REG_V74, /* DR_REG_VR8 */
+    DR_REG_V75, /* DR_REG_VR9 */
+    DR_REG_V76, /* DR_REG_VR10 */
+    DR_REG_V77, /* DR_REG_VR11 */
+    DR_REG_V78, /* DR_REG_VR12 */
+    DR_REG_V79, /* DR_REG_VR13 */
+    DR_REG_V80, /* DR_REG_VR14 */
+    DR_REG_V81, /* DR_REG_VR15 */
+    DR_REG_V82, /* DR_REG_VR16 */
+    DR_REG_V83, /* DR_REG_VR17 */
+    DR_REG_V84, /* DR_REG_VR18 */
+    DR_REG_V85, /* DR_REG_VR19 */
+    DR_REG_V86, /* DR_REG_VR20 */
+    DR_REG_V87, /* DR_REG_VR21 */
+    DR_REG_V88, /* DR_REG_VR22 */
+    DR_REG_V89, /* DR_REG_VR23 */
+    DR_REG_V90, /* DR_REG_VR24 */
+    DR_REG_V91, /* DR_REG_VR25 */
+    DR_REG_V92, /* DR_REG_VR26 */
+    DR_REG_V93, /* DR_REG_VR27 */
+    DR_REG_V94, /* DR_REG_VR28 */
+    DR_REG_V95, /* DR_REG_VR29 */
+    DR_REG_V96, /* DR_REG_VR30 */
+    DR_REG_V97, /* DR_REG_VR31 */
 };
 
 #ifdef DEBUG
diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h
index 998d1575596..9d7664483c5 100644
--- a/core/lib/globals_api.h
+++ b/core/lib/globals_api.h
@@ -813,17 +813,18 @@ typedef union _dr_simd_t {
 #    define MCXT_NUM_OPMASK_SLOTS 8
 
 #elif defined(RISCV64)
-
-/* FIXME i#3544: Not implemented. Definitions just for compiling. */
+/**
+ * 256-bit RISC-V Vector extension registers.
+ * Vector register length can be from 64 to 65536 bits in the power of 2.
+ * Currently we support implementations of up to 256 bits due to limit of DR's
+ * stack size and 12-bit signed immediate range. Also, align to 16 bytes for
+ * better performance.
+ */
 typedef union ALIGN_VAR(16) _dr_simd_t {
-    byte b;      /**< Bottom  8 bits of Vn == Bn. */
-    ushort h;    /**< Bottom 16 bits of Vn == Hn. */
-    uint s;      /**< Bottom 32 bits of Vn == Sn. */
-    uint d[2];   /**< Bottom 64 bits of Vn == Dn as d[1]:d[0]. */
-    uint q[4];   /**< 128-bit Qn as q[3]:q[2]:q[1]:q[0]. */
-    uint u32[4]; /**< The full 128-bit register. */
+    uint u32[8];   /**< Representation as 8 32-bit elements. */
+    uint64 u64[4]; /**< The full 256-bit register. */
 } dr_simd_t;
-#    define MCXT_NUM_SIMD_SLOTS 8
+#    define MCXT_NUM_SIMD_SLOTS 32
 #    define MCXT_NUM_OPMASK_SLOTS 0
 #else
 #    error NYI
diff --git a/core/lib/mcxtx_api.h b/core/lib/mcxtx_api.h
index c74f5135aab..e112ca6cf1f 100644
--- a/core/lib/mcxtx_api.h
+++ b/core/lib/mcxtx_api.h
@@ -564,7 +564,9 @@
         reg_t ft11; /**< The 12th temporary floating-point register. */
     };  /**< The anonymous union of alternative names for the f31/ft11 register. */
     reg_t fcsr; /**< Floating-Point Control Register. */
-    /** The SIMD registers. No support for SIMD on RISC-V so far. */
+    reg_t vstart; /**< Vector Start Index CSR. */
+    reg_t vcsr; /**< Vector Control and Status Register. */
+    /** The Vector registers. */
     dr_simd_t simd[MCXT_NUM_SIMD_SLOTS];
 #else /* RISCV64 */
 #error Unsupported architecture
diff --git a/core/unit_tests.c b/core/unit_tests.c
index 292dba0d833..5e621030e91 100644
--- a/core/unit_tests.c
+++ b/core/unit_tests.c
@@ -52,6 +52,8 @@ void
 unit_test_vmareas(void);
 void
 unit_test_utils(void);
+void
+unit_test_opnd_shared(void);
 #ifdef WINDOWS
 void
 unit_test_drwinapi(void);
@@ -79,6 +81,7 @@ main(int argc, char **argv, char **envp)
     unit_test_memquery();
 #endif
     unit_test_utils();
+    unit_test_opnd_shared();
     unit_test_options();
     unit_test_vmareas();
 #ifdef WINDOWS
diff --git a/core/unix/include/sigcontext.h b/core/unix/include/sigcontext.h
index 82ad8f1c5cc..0ba7377721c 100644
--- a/core/unix/include/sigcontext.h
+++ b/core/unix/include/sigcontext.h
@@ -499,9 +499,38 @@ union __riscv_fp_state {
     struct __riscv_q_ext_state q;
 };
 
+#    define RISCV_V_MAGIC 0x53465457
+
+struct __riscv_ctx_hdr {
+    __u32 magic;
+    __u32 size;
+};
+
+struct __riscv_extra_ext_header {
+    __u32 __padding[129] __attribute__((aligned(16)));
+    __u32 reserved;
+    struct __riscv_ctx_hdr hdr;
+};
+
+struct __riscv_v_ext_state {
+    unsigned long vstart;
+    unsigned long vl;
+    unsigned long vtype;
+    unsigned long vcsr;
+    unsigned long vlenb;
+    unsigned char vdata[1024];
+};
+
+struct __sc_riscv_v_state {
+    struct __riscv_v_ext_state v_state;
+} __attribute__((aligned(16)));
+
 typedef struct _kernel_sigcontext_t {
     struct user_regs_struct sc_regs;
-    union __riscv_fp_state sc_fpregs;
+    union {
+        union __riscv_fp_state sc_fpregs;
+        struct __riscv_extra_ext_header sc_extdesc;
+    };
 } kernel_sigcontext_t;
 
 #endif /* RISCV64 */
diff --git a/suite/tests/api/ir_aarch64_legacy.c b/suite/tests/api/ir_aarch64_legacy.c
index e0982c311df..5bdb588caad 100644
--- a/suite/tests/api/ir_aarch64_legacy.c
+++ b/suite/tests/api/ir_aarch64_legacy.c
@@ -6932,13 +6932,13 @@ test_vector_length(void *dcontext)
     /* XXX: Make this test work when on actual SVE hardware where this API routine
      * is documented as failing.
      */
-    bool res = dr_set_sve_vector_length(new_len);
+    bool res = dr_set_vector_length(new_len);
     ASSERT(res);
-    ASSERT(dr_get_sve_vector_length() == new_len);
+    ASSERT(dr_get_vector_length() == new_len);
     /* Ensure invalid lengths return failure. */
-    ASSERT(!dr_set_sve_vector_length(0));
-    ASSERT(!dr_set_sve_vector_length(1));
-    ASSERT(!dr_set_sve_vector_length(4096));
+    ASSERT(!dr_set_vector_length(0));
+    ASSERT(!dr_set_vector_length(1));
+    ASSERT(!dr_set_vector_length(4096));
 }
 
 int
diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c
index aed49964759..5c966e8d9d1 100644
--- a/suite/tests/api/opnd-a64.c
+++ b/suite/tests/api/opnd-a64.c
@@ -86,7 +86,7 @@ test_get_size()
         opsz_predlen = opnd_size_from_bytes(vl / 8);
     } else {
         /* Set vector length to 256 bits for unit tests on non-SVE hardware. */
-        ASSERT(dr_get_sve_vector_length() == 256);
+        ASSERT(dr_get_vector_length() == 256);
         opsz_veclen = OPSZ_32;
         opsz_predlen = OPSZ_4;
     }
@@ -420,8 +420,8 @@ op_mem_size(int op)
 void
 test_compute_vector_address(void *drcontext)
 {
-    const int original_vector_length = dr_get_sve_vector_length();
-    ASSERT(dr_set_sve_vector_length(256));
+    const int original_vector_length = dr_get_vector_length();
+    ASSERT(dr_set_vector_length(256));
 
 #define SCALAR_BASE_REG 0
 
@@ -747,7 +747,7 @@ test_compute_vector_address(void *drcontext)
 #undef EXPECT
 #undef VEC_ADDR_TEST
 
-    ASSERT(dr_set_sve_vector_length(original_vector_length));
+    ASSERT(dr_set_vector_length(original_vector_length));
 }
 
 void