diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index b70c4b94c97a1..1388efce88956 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -156,7 +156,8 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
                                      size_m=a.shape[0],
                                      size_n=w_ref.shape[1],
                                      size_k=w_ref.shape[0],
-                                     is_k_full=True))))
+                                     is_k_full=True,
+                                     is_zp_float=False))))
 
     # machete
     timers.append(
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 536c133bb3341..8fb44e3a3dbd8 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
     results.append(
         benchmark.Timer(
             stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
     results.append(
         benchmark.Timer(
             stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 6dbf9594e8492..7bd246d73b1e4 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -54,9 +54,10 @@ template <typename scalar_t,  // compute dtype, half or nv_float16
           const int thread_k_blocks,  // same for k dimension (reduction)
           const int stages,  // number of stages for the async global->shared
                              // fetch pipeline
-          const bool has_act_order,    // whether act_order is enabled
-          const int group_blocks = -1  // number of consecutive 16x16 blocks
-                                       // with a separate quantization scale
+          const bool has_act_order,     // whether act_order is enabled
+          const int group_blocks = -1,  // number of consecutive 16x16 blocks
+                                        // with a separate quantization scale
+          const bool is_zp_float        // is zero point of float16 type?
           >
 __global__ void Marlin(
     const int4* __restrict__ A,  // fp16 input matrix of shape mxk
@@ -82,7 +83,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                torch::Tensor& workspace,
                                vllm::ScalarTypeId const b_q_type_id,
                                int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full, bool has_zp) {
+                               bool is_k_full, bool has_zp, bool is_zp_float) {
   TORCH_CHECK_NOT_IMPLEMENTED(false,
                               "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
   return torch::empty({1, 1});
@@ -390,6 +391,17 @@ __device__ inline void sub_zp(typename ScalarType<scalar_t>::FragB& frag_b,
   frag_b[1] = __hsub2(frag_b[1], zp);
 }
 
+template <typename scalar_t>
+__device__ inline void sub_zpf(typename ScalarType<scalar_t>::FragB& frag_b,
+                               typename ScalarType<scalar_t>::FragZPF& frag_zpf,
+                               int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 zp =
+      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_zpf)[i]);
+  frag_b[0] = __hsub2(frag_b[0], zp);
+  frag_b[1] = __hsub2(frag_b[1], zp);
+}
+
 // Same as above, but for act_order (each K is multiplied individually)
 template <typename scalar_t>
 __device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
@@ -420,6 +432,15 @@ __device__ inline void scale_float(float* c,
   c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
 }
 
+// Given 2 floats subtract by 2 zero points (halves)
+template <typename scalar_t>
+__device__ inline void sub_zpf_float(
+    float* c, typename ScalarType<scalar_t>::FragZPF& zp) {
+  scalar_t* zp_ptr = reinterpret_cast<scalar_t*>(&zp);
+  c[0] = __fsub_rn(c[0], ScalarType<scalar_t>::num2float(zp_ptr[0]));
+  c[1] = __fsub_rn(c[1], ScalarType<scalar_t>::num2float(zp_ptr[1]));
+}
+
 // Wait until barrier reaches `count`, then lock for current threadblock.
 __device__ inline void barrier_acquire(int* lock, int count) {
   if (threadIdx.x == 0) {
@@ -516,10 +537,11 @@ template <typename scalar_t,  // compute dtype, half or nv_float16
           const int thread_k_blocks,  // same for k dimension (reduction)
           const int stages,  // number of stages for the async global->shared
                              // fetch pipeline
-          const bool has_act_order,    // whether act_order is enabled
-          const bool has_zp,           // whether zero-points are enabled
-          const int group_blocks = -1  // number of consecutive 16x16 blocks
-                                       // with a separate quantization scale
+          const bool has_act_order,     // whether act_order is enabled
+          const bool has_zp,            // whether zero-points are enabled
+          const int group_blocks = -1,  // number of consecutive 16x16 blocks
+                                        // with a separate quantization scale
+          const bool is_zp_float        // is zero point of float16 type?
           >
 __global__ void Marlin(
     const int4* __restrict__ A,  // fp16 input matrix of shape mxk
@@ -556,6 +578,7 @@ __global__ void Marlin(
   using FragC = typename ScalarType<scalar_t>::FragC;
   using FragS = typename ScalarType<scalar_t>::FragS;
   using FragZP = typename ScalarType<scalar_t>::FragZP;
+  using FragZPF = typename ScalarType<scalar_t>::FragZPF;
 
   static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
 
@@ -692,8 +715,10 @@ __global__ void Marlin(
   int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
 
   // Zero-points sizes/strides
-  int zp_gl_stride = (prob_n / pack_factor) / 4;
-  constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4;
+  int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4;
+  constexpr int zp_sh_stride = is_zp_float
+                                   ? 16 * thread_n_blocks / 8
+                                   : ((16 * thread_n_blocks) / pack_factor) / 4;
   constexpr int zp_tb_groups = s_tb_groups;
   constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
   int zp_gl_rd_delta = zp_gl_stride;
@@ -768,9 +793,15 @@ __global__ void Marlin(
   constexpr int num_ints_per_thread = 8 / pack_factor;
   int zp_sh_rd;
   if constexpr (has_zp) {
-    zp_sh_rd = num_ints_per_thread * num_col_threads *
-                   ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-               num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
+    if constexpr (is_zp_float) {
+      if constexpr (group_blocks != -1)
+        zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                   (threadIdx.x % 32) / 4;
+    } else {
+      zp_sh_rd = num_ints_per_thread * num_col_threads *
+                     ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
+    }
   }
 
   // Precompute which thread should not read memory in which iterations; this is
@@ -832,6 +863,7 @@ __global__ void Marlin(
   FragS act_frag_s[2][4][4];             // For act-order
   int frag_qzp[2][num_ints_per_thread];  // Zero-points
   FragZP frag_zp;                        // Zero-points in fp16
+  FragZPF frag_zpf[2][4];                // float16 zero-points
 
   // Zero accumulators.
   auto zero_accums = [&]() {
@@ -1126,7 +1158,7 @@ __global__ void Marlin(
     // has_zp implies AWQ, which doesn't have act_order,
     static_assert(!has_zp || group_blocks != 0);
 
-    if constexpr (has_zp) {
+    if constexpr (has_zp && !is_zp_float) {
       int pipe = full_pipe % stages;
 
       if constexpr (group_blocks == -1) {
@@ -1170,11 +1202,40 @@ __global__ void Marlin(
         }
       }
     }
+
+    if constexpr (has_zp && is_zp_float) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          int4* sh_zp_stage =
+              sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) *
+                                     (pipe / (group_blocks / thread_k_blocks)));
+          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
+        } else {
+          int warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          int cur_group_id = k_blocks / group_blocks;
+
+          int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] =
+              sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride];
+        }
+      }
+    }
   };
 
   // Execute the actual tensor core matmul of a sub-tile.
   auto matmul = [&](int k) {
-    if constexpr (has_zp) {
+    if constexpr (has_zp && !is_zp_float) {
       FragB frag_zp_0;
       FragB frag_zp_1;
       int zp_quant_0, zp_quant_1;
@@ -1219,10 +1280,14 @@ __global__ void Marlin(
       frag_b1 = dequant<scalar_t, w_type_id>(b_quant_1);
 
       // Apply zero-point to frag_b0
-      if constexpr (has_zp) {
+      if constexpr (has_zp && !is_zp_float) {
         sub_zp<scalar_t>(frag_b0, frag_zp[j], 0);
       }
 
+      if constexpr (has_zp && is_zp_float && group_blocks != -1) {
+        sub_zpf<scalar_t>(frag_b0, frag_zpf[k % 2][j], 0);
+      }
+
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
         scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
@@ -1235,10 +1300,14 @@ __global__ void Marlin(
       }
 
       // Apply zero-point to frag_b1
-      if constexpr (has_zp) {
+      if constexpr (has_zp && !is_zp_float) {
         sub_zp<scalar_t>(frag_b1, frag_zp[j], 1);
       }
 
+      if constexpr (has_zp && is_zp_float && group_blocks != -1) {
+        sub_zpf<scalar_t>(frag_b1, frag_zpf[k % 2][j], 1);
+      }
+
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
@@ -1510,7 +1579,7 @@ __global__ void Marlin(
         fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
       }
 
-      if constexpr (has_zp && group_blocks == -1) {
+      if constexpr (has_zp && !is_zp_float && group_blocks == -1) {
         if (i == 0) {
           fetch_zp_to_shared();
         }
@@ -1697,23 +1766,27 @@ __global__ void Marlin(
 }
 
   #define __CALL_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-                    HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS)          \
+                    HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS,          \
+                    IS_ZP_FLOAT)                                               \
     else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&         \
              thread_n_blocks == THREAD_N_BLOCKS &&                             \
              thread_k_blocks == THREAD_K_BLOCKS &&                             \
              has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP &&             \
-             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) {     \
-      cudaFuncSetAttribute(                                                    \
-          Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,          \
-                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
-                 HAS_ZP, GROUP_BLOCKS>,                                        \
-          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
-      Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,              \
-             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,     \
-             HAS_ZP, GROUP_BLOCKS>                                             \
-          <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                   \
-              A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,        \
-              num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce);     \
+             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&     \
+             is_zp_float == IS_ZP_FLOAT) {                                     \
+      if constexpr (!IS_ZP_FLOAT || std::is_same<scalar_t, half>::value) {     \
+        cudaFuncSetAttribute(                                                  \
+            Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,        \
+                   THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages,              \
+                   HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT>,          \
+            cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);      \
+        Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,            \
+               THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,   \
+               HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT>                              \
+            <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
+                A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
+                num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce);   \
+      }                                                                        \
     }
 
 typedef struct {
@@ -1905,51 +1978,96 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
 }
 
   #define GPTQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)             \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS)   \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS)   \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS)   \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS)   \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS,   \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS,   \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS,   \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS,   \
+              false)                                                        \
                                                                             \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS,  \
+              false)                                                        \
                                                                             \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS,  \
+              false)                                                        \
                                                                             \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS,  \
+              false)                                                        \
                                                                             \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS,  \
+              false)                                                        \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS,  \
+              false)
 
   #define AWQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)             \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS,  \
+              false)                                                       \
                                                                            \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS,  \
+              false)                                                       \
                                                                            \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)  \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS,  \
+              false)                                                       \
                                                                            \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS)  \
-    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS,  \
+              false)                                                       \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, false)
+
+  // We currently have 4-bit models only with group_blocks == 4
+  #define HQQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)            \
+    __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \
+              true)                                                       \
+    __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \
+              true)                                                       \
+    __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \
+              true)                                                       \
+    __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, true)
 
 template <typename scalar_t>
 void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
@@ -1958,7 +2076,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
                vllm::ScalarType const& q_type, bool has_act_order,
                bool is_k_full, bool has_zp, int num_groups, int group_size,
                int dev, cudaStream_t stream, int thread_k, int thread_n,
-               int sms, int max_par, bool use_fp32_reduce) {
+               int sms, int max_par, bool use_fp32_reduce, bool is_zp_float) {
   if (has_zp) {
     TORCH_CHECK(
         q_type == vllm::kU4 || q_type == vllm::kU8,
@@ -2111,6 +2229,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
     AWQ_CALL_IF(vllm::kU8, 8, 8, 256)
     AWQ_CALL_IF(vllm::kU8, 8, 4, 128)
     AWQ_CALL_IF(vllm::kU8, 4, 8, 128)
+
+    HQQ_CALL_IF(vllm::kU4, 16, 4, 256)
+    HQQ_CALL_IF(vllm::kU4, 8, 8, 256)
+    HQQ_CALL_IF(vllm::kU4, 8, 4, 128)
+    HQQ_CALL_IF(vllm::kU4, 4, 8, 128)
+    HQQ_CALL_IF(vllm::kU8, 16, 4, 256)
+    HQQ_CALL_IF(vllm::kU8, 8, 8, 256)
+    HQQ_CALL_IF(vllm::kU8, 8, 4, 128)
+    HQQ_CALL_IF(vllm::kU8, 4, 8, 128)
     else {
       TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
                   ", ", prob_k, "]", ", has_act_order = ", has_act_order,
@@ -2135,7 +2262,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                vllm::ScalarTypeId const& b_q_type_id,
                                int64_t size_m, int64_t size_n, int64_t size_k,
                                bool is_k_full, bool has_zp,
-                               bool use_fp32_reduce) {
+                               bool use_fp32_reduce, bool is_zp_float) {
   vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
   if (has_zp) {
     TORCH_CHECK(
@@ -2148,6 +2275,12 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
         b_q_type.str());
   }
 
+  if (has_zp && is_zp_float) {
+    TORCH_CHECK(a.scalar_type() == at::ScalarType::Half,
+                "Computation type must be float16 (half) when using float zero "
+                "points.");
+  }
+
   int pack_factor = 32 / b_q_type.size_bits();
 
   // Verify A
@@ -2257,12 +2390,22 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
   if (has_zp) {
     int rank = b_zeros.sizes().size();
     TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2");
-    TORCH_CHECK(b_zeros.size(0) == num_groups,
-                "b_zeros dim 0 = ", b_zeros.size(0),
-                " is not num_groups = ", num_groups);
-    TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor,
-                "b_zeros dim 1 = ", b_zeros.size(1),
-                " is not size_n / pack_factor = ", size_n / pack_factor);
+    if (is_zp_float) {
+      TORCH_CHECK(b_zeros.size(1) == size_n,
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not size_n = ", size_n);
+      TORCH_CHECK(num_groups == b_zeros.size(0),
+                  "b_zeros dim 0 = ", b_zeros.size(0),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
+    } else {
+      TORCH_CHECK(b_zeros.size(0) == num_groups,
+                  "b_zeros dim 0 = ", b_zeros.size(0),
+                  " is not num_groups = ", num_groups);
+      TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor,
+                  "b_zeros dim 1 = ", b_zeros.size(1),
+                  " is not size_n / pack_factor = ", size_n / pack_factor);
+    }
   }
 
   // Verify workspace size
@@ -2282,7 +2425,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
         a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
         workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
         num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce);
+        thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float);
   } else if (a.scalar_type() == at::ScalarType::BFloat16) {
     marlin::marlin_mm<nv_bfloat16>(
         a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
@@ -2291,7 +2434,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
         perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
         workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
         num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce);
+        thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float);
   } else {
     TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
   }
diff --git a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
index be06c09bee331..5b0fe94e7971c 100644
--- a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
+++ b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
@@ -24,6 +24,7 @@ class ScalarType<half> {
   using FragC = Vec<float, 4>;
   using FragS = Vec<half2, 1>;
   using FragZP = Vec<half2, 4>;
+  using FragZPF = Vec<half2, 1>;
 
   static __device__ float inline num2float(const half x) {
     return __half2float(x);
@@ -53,6 +54,7 @@ class ScalarType<nv_bfloat16> {
   using FragC = Vec<float, 4>;
   using FragS = Vec<nv_bfloat162, 1>;
   using FragZP = Vec<nv_bfloat162, 4>;
+  using FragZPF = Vec<nv_bfloat162, 1>;
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
   static __device__ float inline num2float(const nv_bfloat16 x) {
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index b999028fe06a9..79e14355e6f61 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -197,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
       "int b_q_type, "
       "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
-      "bool has_zp, bool use_fp32_reduce) -> Tensor");
+      "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
   // conditionally compiled so impl registration is in source file
 
   // gptq_marlin repack from GPTQ.
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 5cfd4d6da7a86..7286438dfd76f 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -29,6 +29,7 @@
     marlin_qqq_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
+from vllm.scalar_type import scalar_types
 
 ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
@@ -226,7 +227,7 @@ def test_gptq_marlin_gemm(
         torch.ops._C.gptq_marlin_gemm,
         (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
          workspace.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1],
-         a_input.shape[1], is_k_full, False, use_fp32_reduce),
+         a_input.shape[1], is_k_full, False, use_fp32_reduce, False),
         test_utils=DEFAULT_OPCHECK_TEST_UTILS)
 
     output = ops.gptq_marlin_gemm(
@@ -244,6 +245,7 @@ def test_gptq_marlin_gemm(
         is_k_full=is_k_full,
         has_zp=False,
         use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
     )
     output_ref = torch.matmul(a_input, w_ref)
 
@@ -441,6 +443,7 @@ def test_awq_marlin_gemm(
         is_k_full=is_k_full,
         has_zp=has_zp,
         use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
     )
     output_ref = torch.matmul(a_input, w_ref)
 
@@ -451,6 +454,87 @@ def test_awq_marlin_gemm(
     assert max_diff < 0.04
 
 
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
+@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
+@pytest.mark.parametrize("group_size", [64])
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS)
+def test_hqq_marlin_gemm(
+    k_chunk,
+    n_chunk,
+    group_size,
+    mnk_factors,
+    use_fp32_reduce,
+):
+    m_factor, n_factor, k_factor = mnk_factors
+
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+
+    quant_type = scalar_types.uint4
+
+    a_input = rand_data((size_m, size_k))
+    dev = a_input.device
+
+    b_weight = torch.randint(0,
+                             10, (size_n, size_k),
+                             dtype=torch.uint8,
+                             device=dev)
+    scale = rand_data((size_n, size_k // group_size))
+    zero = rand_data((size_n, size_k // group_size))
+
+    gptq_w_q = gptq_pack(b_weight.transpose(1, 0), 4, size_k, size_n)
+
+    sort_indices = torch.empty(0, dtype=torch.int, device=dev)
+    marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n,
+                                        4).to(dev)
+    marlin_s = marlin_permute_scales(scale.transpose(1, 0), size_k, size_n,
+                                     group_size).to(dev)
+    marlin_zp = marlin_permute_scales(zero.transpose(1, 0), size_k, size_n,
+                                      group_size).to(dev)
+
+    g_idx = marlin_make_empty_g_idx(dev)
+    g_idx_sort_indices = marlin_make_empty_g_idx(dev)
+
+    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                GPTQ_MARLIN_MAX_PARALLEL)
+
+    output = ops.gptq_marlin_gemm(
+        a_input,
+        marlin_w_q,
+        marlin_s,
+        marlin_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace.scratch,
+        quant_type,
+        a_input.shape[0],
+        b_weight.shape[0],
+        a_input.shape[1],
+        is_k_full=True,
+        has_zp=True,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=True,
+    )
+
+    b_flat = b_weight.reshape(-1, group_size)
+    zp_flat = zero.reshape(-1, 1)
+    s_flat = scale.reshape(-1, 1)
+    dequant = (b_flat - zp_flat) * s_flat
+
+    output_ref = torch.matmul(a_input,
+                              dequant.reshape(b_weight.shape).transpose(1, 0))
+
+    torch.cuda.synchronize()
+
+    max_diff = compute_max_diff(output, output_ref)
+
+    assert max_diff < 0.04
+
+
 @pytest.mark.skipif(not is_quant_method_supported("qqq"),
                     reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index b2952bbfa917c..a2501f00e66a7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -335,7 +335,8 @@ def _gptq_marlin_gemm_fake(a: torch.Tensor,
                                size_k: torch.SymInt,
                                is_k_full: bool,
                                has_zp: bool = False,
-                               use_fp32_reduce: bool = False) -> torch.Tensor:
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
 
     @register_fake("_C::ggml_dequantize")
@@ -578,11 +579,12 @@ def gptq_marlin_gemm(a: torch.Tensor,
                      size_k: int,
                      is_k_full: bool,
                      has_zp: bool = False,
-                     use_fp32_reduce: bool = False) -> torch.Tensor:
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
     return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
                                          g_idx, perm, workspace, b_q_type.id,
                                          size_m, size_n, size_k, is_k_full,
-                                         has_zp, use_fp32_reduce)
+                                         has_zp, use_fp32_reduce, is_zp_float)
 
 
 # fp8 marlin
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index da841d052d728..ff342c4f9479e 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -21,6 +21,7 @@
     GPTQMarlinConfig)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQMarlin24Config)
+from vllm.model_executor.layers.quantization.hqq_marlin import HQQMarlinConfig
 from vllm.model_executor.layers.quantization.ipex_quant import IPEXConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config
@@ -48,6 +49,7 @@
     "compressed-tensors": CompressedTensorsConfig,
     "bitsandbytes": BitsAndBytesConfig,
     "qqq": QQQConfig,
+    "hqq": HQQMarlinConfig,
     "experts_int8": ExpertsInt8Config,
     "neuron_quant": NeuronQuantConfig,
     "ipex": IPEXConfig,
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
new file mode 100644
index 0000000000000..35c4cb00fb298
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -0,0 +1,218 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    marlin_make_empty_g_idx, marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace)
+from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack
+from vllm.model_executor.parameter import (GroupQuantScaleParameter,
+                                           ModelWeightParameter)
+from vllm.scalar_type import scalar_types
+
+logger = init_logger(__name__)
+
+
+class HQQMarlinConfig(QuantizationConfig):
+    """Config class for HQQ Marlin"""
+
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        4: scalar_types.uint4,
+        8: scalar_types.uint8,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+    ) -> None:
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.quant_type = self.TYPE_MAP[(weight_bits)]
+
+    def __repr__(self) -> str:
+        return (f"HQQMarlinConfig(quant_type={self.quant_type}, "
+                f"group_size={self.group_size})")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "hqq"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "HQQMarlinConfig":
+        weight_bits = cls.get_from_keys(config, ["nbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg,
+                                     user_quant) -> Optional[str]:
+        #TODO
+        return None
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["HQQMarlinMethod"]:
+        if isinstance(layer, LinearBase):
+            return HQQMarlinMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class HQQMarlinMethod(LinearMethodBase):
+    """Linear method for HQQ Marlin.
+    """
+
+    def __init__(
+        self,
+        quant_config: HQQMarlinConfig,
+    ):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        self.output_size_per_partition = sum(output_partition_sizes)
+
+        self.input_size_per_partition = input_size_per_partition
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        self.scales_and_zp_size = (input_size_per_partition //
+                                   self.quant_config.group_size)
+
+        # Quantized weights
+        qweight = ModelWeightParameter(data=torch.empty(
+            self.output_size_per_partition,
+            input_size_per_partition,
+            dtype=torch.uint8,
+        ),
+                                       input_dim=1,
+                                       output_dim=0,
+                                       weight_loader=weight_loader)
+
+        zeros = GroupQuantScaleParameter(data=torch.empty(
+            self.output_size_per_partition,
+            self.scales_and_zp_size,
+            dtype=params_dtype,
+        ),
+                                         input_dim=1,
+                                         output_dim=0,
+                                         weight_loader=weight_loader)
+
+        scales = GroupQuantScaleParameter(data=torch.empty(
+            self.output_size_per_partition,
+            self.scales_and_zp_size,
+            dtype=params_dtype,
+        ),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
+
+        layer.register_parameter("W_q", qweight)
+        layer.register_parameter("zero", zeros)
+        layer.register_parameter("scale", scales)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        dev = layer.W_q.device
+        qweight_t = layer.W_q.transpose(1, 0)
+
+        gptq_w_q = gptq_pack(qweight_t, 4, self.input_size_per_partition,
+                             self.output_size_per_partition)
+
+        sort_indices = torch.empty(0, dtype=torch.int, device=gptq_w_q.device)
+        marlin_w_q = ops.gptq_marlin_repack(
+            gptq_w_q,
+            sort_indices,
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            4,
+        ).to(dev)
+        marlin_s = marlin_permute_scales(layer.scale.transpose(1, 0),
+                                         self.input_size_per_partition,
+                                         self.output_size_per_partition,
+                                         self.quant_config.group_size).to(dev)
+        marlin_zp = marlin_permute_scales(layer.zero.transpose(1, 0),
+                                          self.input_size_per_partition,
+                                          self.output_size_per_partition,
+                                          self.quant_config.group_size).to(dev)
+
+        layer.g_idx = marlin_make_empty_g_idx(dev)
+        layer.g_idx_sort_indices = marlin_make_empty_g_idx(dev)
+
+        layer.marlin_qweight = marlin_w_q
+        layer.marlin_zeros = marlin_zp
+        layer.marlin_scales = marlin_s
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        workspace = MarlinWorkspace(self.output_size_per_partition,
+                                    GPTQ_MARLIN_MIN_THREAD_N,
+                                    GPTQ_MARLIN_MAX_PARALLEL)
+
+        scales = layer.marlin_scales
+        zeros = layer.marlin_zeros
+        orig_type = x.dtype
+
+        if orig_type != torch.float16:
+            x = x.to(torch.float16)
+            scales = scales.to(torch.float16)
+            zeros = zeros.to(torch.float16)
+
+        marlin_out = ops.gptq_marlin_gemm(
+            x,
+            layer.marlin_qweight,
+            scales,
+            zeros,
+            layer.g_idx,
+            layer.g_idx_sort_indices,
+            workspace.scratch,
+            scalar_types.uint4,
+            x.shape[0],
+            self.output_size_per_partition,
+            self.input_size_per_partition,
+            True,  # is_k_full
+            True,  # has_zp
+            False,  # use 32-bit reduce
+            True,  # use float zp
+        )
+
+        if bias is not None:
+            marlin_out.add_(bias)
+
+        if orig_type != torch.float16:
+            return marlin_out.to(orig_type)
+        else:
+            return marlin_out
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 9a1defa409714..c9366ca97d149 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -303,7 +303,8 @@ def apply_gptq_marlin_linear(
                                   size_k=input_size_per_partition,
                                   is_k_full=is_k_full,
                                   has_zp=False,
-                                  use_fp32_reduce=use_fp32_reduce)
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)
 
     if bias is not None:
         output.add_(bias)  # In-place add
@@ -340,7 +341,8 @@ def apply_awq_marlin_linear(
                                   size_k=input_size_per_partition,
                                   is_k_full=True,
                                   has_zp=True,
-                                  use_fp32_reduce=use_fp32_reduce)
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)
 
     if bias is not None:
         output.add_(bias)  # In-place add
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0c51314bc90df..7bc530a37df09 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -128,6 +128,14 @@ def get_quant_config(model_config: ModelConfig,
     if model_config.quantization == "gguf":
         return quant_cls.from_config({})
 
+    if model_config.quantization == "hqq":
+        wq_params = (model_config.hf_config.quantization_config["quant_config"]
+                     ["weight_quant_params"])
+        return quant_cls.from_config({
+            "nbits": wq_params["nbits"],
+            "group_size": wq_params["group_size"]
+        })
+
     # Read the quantization config from the HF model config, if available.
     hf_quant_config = getattr(model_config.hf_config, "quantization_config",
                               None)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index fd88ae8b50402..3b2fb6d8ad131 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -42,6 +42,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
+from vllm.model_executor.layers.quantization.hqq_marlin import HQQMarlinConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -315,6 +316,9 @@ def __init__(
         else:
             self.norm = PPMissingLayer()
 
+        self.is_hqq = (quant_config is not None
+                       and isinstance(quant_config, HQQMarlinConfig))
+
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
@@ -366,6 +370,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
         ]
+
+        # unpack function from https://github.com/mobiusml/hqq
+        def unpack_4bit_u8(
+                W_q: torch.Tensor,
+                dtype=torch.uint8) -> torch.Tensor:  # uint8/2 > uint8
+            step = W_q.shape[0]
+            tmp = torch.empty([2 * step, W_q.shape[1]],
+                              dtype=dtype,
+                              device=W_q.device)
+            tmp[:step] = (W_q & 0b11110000) >> 4
+            tmp[step:] = W_q & 0b00001111
+            return tmp
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
@@ -394,9 +411,29 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 if is_pp_missing_parameter(name, self):
                     continue
 
+                # TODO should input/output dim in hqq_marlin.py depend on this?
+                ignore_hqq = (".axis", ".channel_wise", ".compute_dtype",
+                              ".encoded_state_dict", ".group_size", ".nbits",
+                              ".offload_meta", ".optimize", ".packing",
+                              ".quant_scale", ".quant_zero", ".round_zero",
+                              ".shape", ".stores_quant_config",
+                              ".unpack_view_dtype", ".view_as_float")
+                if name.endswith(ignore_hqq) and name not in params_dict:
+                    continue
+
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
+                if self.is_hqq and name.endswith(".W_q"):
+                    weight_loader(
+                        param,
+                        unpack_4bit_u8(loaded_weight).reshape(
+                            -1, param.shape[1]), shard_id)
+                elif self.is_hqq and name.endswith((".scale", ".zero")):
+                    weight_loader(param,
+                                  loaded_weight.reshape(-1, param.shape[1]),
+                                  shard_id)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
 
                 break
             else:
@@ -411,10 +448,29 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 if is_pp_missing_parameter(name, self):
                     continue
 
+                # TODO should input/output dim in hqq_marlin.py depend on this?
+                ignore_hqq = (".axis", ".channel_wise", ".compute_dtype",
+                              ".encoded_state_dict", ".group_size", ".nbits",
+                              ".offload_meta", ".optimize", ".packing",
+                              ".quant_scale", ".quant_zero", ".round_zero",
+                              ".shape", ".stores_quant_config",
+                              ".unpack_view_dtype", ".view_as_float")
+                if name.endswith(ignore_hqq) and name not in params_dict:
+                    continue
+
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                weight_loader(param, loaded_weight)
+                if self.is_hqq and name.endswith(".W_q"):
+                    weight_loader(
+                        param,
+                        unpack_4bit_u8(loaded_weight).reshape(
+                            -1, param.shape[1]))
+                elif self.is_hqq and name.endswith((".scale", ".zero")):
+                    weight_loader(param,
+                                  loaded_weight.reshape(-1, param.shape[1]))
+                else:
+                    weight_loader(param, loaded_weight)
 
     # If this function is called, it should always initialize KV cache scale
     # factors (or else raise an exception). Thus, handled exceptions should