diff --git a/onnxruntime/contrib_ops/cpu/quantize_linear.cc b/onnxruntime/contrib_ops/cpu/quantize_linear.cc
index 48b90d847b02b..6f504c37aca78 100644
--- a/onnxruntime/contrib_ops/cpu/quantize_linear.cc
+++ b/onnxruntime/contrib_ops/cpu/quantize_linear.cc
@@ -4,6 +4,7 @@
 #include "contrib_ops/cpu/quantize_linear.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
 #include "core/providers/cpu/tensor/cast_op.h"
+#include "core/providers/common.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -40,51 +41,48 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
   auto& x_zero_point = *ctx->Input<Tensor>(2);
   auto& y = *ctx->Output(0, x.Shape());
 
-  TensorShape shape(0, 0);
-  std::unique_ptr<Tensor> reshaped_zero_point;
-  std::unique_ptr<Tensor> reshaped_scale;
+  const auto& x_shape = x.Shape();
+  const auto& scale_shape = x_scale.Shape();
+  const auto& zero_point_shape = x_zero_point.Shape();
+  const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
+
+  size_t stride = 0;
+  const auto& broadcastDim = x_shape[axis];
 
-  // if an axis was provided, build the shape necessary for broadcasting across that axis
   if (has_axis_) {
-    ONNXRUNTIME_ENFORCE(axis_ < static_cast<int64_t>(x.Shape().NumDimensions()), "axis greater than input data dimension!");
-    std::vector<int64_t> shape_;
-    shape_.push_back(x_zero_point.Size());
-    if (axis_ > 0) {
-      for (int64_t i = axis_ - 1; i >= 0; i--) {
-        shape_.push_back(1);
-      }
-    }
-    shape = TensorShape(shape_);
+    // if an axis was specified, ensure the scale and zero point are compatible
+    ONNXRUNTIME_ENFORCE(scale_shape.NumDimensions() == 1 && scale_shape.Size() == broadcastDim, "x_scale must be 1D tensor with size ", broadcastDim);
+    ONNXRUNTIME_ENFORCE(zero_point_shape.NumDimensions() == 1 && zero_point_shape.Size() == broadcastDim, "x_zero_point must be 1D tensor with size ", broadcastDim);
+    stride = 1;
+  } else {
+    // if no axis, enforce that scale and zero point are scalars
+    ONNXRUNTIME_ENFORCE(scale_shape.NumDimensions() == 0, "x_scale must be a scalar if no axis is provided");
+    ONNXRUNTIME_ENFORCE(zero_point_shape.NumDimensions() == 0, "x_zero_point must be a scalar if no axis is provided");
+  }
 
-    // reshape copies of the inputs for broadcasting.
-    TensorAllocator<T> tensorAllocatorUint8(*ctx);
-    reshaped_zero_point = tensorAllocatorUint8.Allocate(shape);
-    memcpy(reshaped_zero_point->MutableDataRaw(), x_zero_point.DataRaw(), sizeof(T) * x_zero_point.Size());
+  size_t N = x_shape.SizeToDimension(axis);
+  const T* zero_point = x_zero_point.template Data<T>();
+  const float* scale = x_scale.template Data<float>();
+  size_t block_size = x_shape.SizeFromDimension(axis + 1);
+  const T* input = x.template Data<T>();
+  float* output = y.template MutableData<float>();
 
-    TensorAllocator<float> tensorAllocatorFloat(*ctx);
-    reshaped_scale = tensorAllocatorFloat.Allocate(shape);
-    memcpy(reshaped_scale->MutableDataRaw(), x_scale.DataRaw(), sizeof(float) * x_scale.Size());
-  }
+  for (size_t n = 0; n < N; n++) {
+    const float* current_scale = scale;
+    const T* current_zero_point = zero_point;
 
-  TBroadcaster<T> bc(x, has_axis_ ? *reshaped_zero_point : x_zero_point);
-  TBroadcastOutput<float> output(bc.GetSpanSize(), y);
-  BroadcastLoop(bc, output,
-                [](EigenVectorMap<float> output, T input0, ConstEigenVectorMap<T> input1) {
-                  output = (int32_t(input0) - input1.template cast<int32_t>().array()).template cast<float>();
-                },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<T> input0, T input1) {
-                  output = (input0.template cast<int32_t>().array() - int32_t(input1)).template cast<float>();
-                },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<T> input0, ConstEigenVectorMap<T> input1) {
-                  output = (input0.template cast<int32_t>() - input1.template cast<int32_t>()).template cast<float>();
-                });
-
-  TBroadcaster<float> bc2(y, has_axis_ ? *reshaped_scale : x_scale);
-  TBroadcastOutput<float> output2(bc2.GetSpanSize(), y);
-  BroadcastLoop(bc2, output2,
-                [](EigenVectorMap<float> output, float input0, ConstEigenVectorMap<float> input1) { output = input0 * input1.array(); },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<float> input0, float input1) { output = input0.array() * input1; },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<float> input0, ConstEigenVectorMap<float> input1) { output = input0.array() * input1.array(); });
+    for (size_t bd = 0; bd < static_cast<size_t>(broadcastDim); bd++) {
+      auto zp = static_cast<const int>(*current_zero_point);
+      auto sc = *current_scale;
+
+      for (size_t bs = 0; bs < block_size; bs++) {
+        *output++ = static_cast<float>(static_cast<const int>(*input++) - zp) * sc;
+      }
+
+      current_scale += stride;
+      current_zero_point += stride;
+    }
+  }
 
   return Status::OK();
 }
@@ -117,60 +115,49 @@ Status QuantizeLinear<float>::Compute(OpKernelContext* ctx) const {
   auto& y_zero_point = *ctx->Input<Tensor>(2);
   auto& y = *ctx->Output(0, x.Shape());
 
-  TensorShape shape(0, 0);
-  std::unique_ptr<Tensor> reshaped_scale;
+  const auto& x_shape = x.Shape();
+  const auto& scale_shape = y_scale.Shape();
+  const auto& zero_point_shape = y_zero_point.Shape();
+  const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
 
-  TensorAllocator<float> tensorAllocator(*ctx);
+  size_t stride = 0;
+  const auto& broadcastDim = x_shape[axis];
 
-  // if an axis was provided, build the shape necessary for broadcasting across that axis
   if (has_axis_) {
-    ONNXRUNTIME_ENFORCE(axis_ < static_cast<int64_t>(x.Shape().NumDimensions()), "axis greater than input data dimension!");
-    std::vector<int64_t> shape_;
-    shape_.push_back(y_zero_point.Size());
-    if (axis_ > 0) {
-      for (int64_t i = axis_ - 1; i >= 0; i--) {
-        shape_.push_back(1);
+    // if an axis was specified, ensure the scale and zero point are compatible
+    ONNXRUNTIME_ENFORCE(scale_shape.NumDimensions() == 1 && scale_shape.Size() == broadcastDim, "x_scale must be 1D tensor with size ", broadcastDim);
+    ONNXRUNTIME_ENFORCE(zero_point_shape.NumDimensions() == 1 && zero_point_shape.Size() == broadcastDim, "x_zero_point must be 1D tensor with size ", broadcastDim);
+    stride = 1;
+  } else {
+    // if no axis, enforce that scale and zero point are scalars
+    ONNXRUNTIME_ENFORCE(scale_shape.NumDimensions() == 0, "x_scale must be a scalar if no axis is provided");
+    ONNXRUNTIME_ENFORCE(zero_point_shape.NumDimensions() == 0, "x_zero_point must be a scalar if no axis is provided");
+  }
+
+  size_t N = x_shape.SizeToDimension(axis);
+  const uint8_t* zero_point = y_zero_point.template Data<uint8_t>();
+  const float* scale = y_scale.template Data<float>();
+  size_t block_size = x_shape.SizeFromDimension(axis + 1);
+  const float* input = x.template Data<float>();
+  uint8_t* output = y.template MutableData<uint8_t>();
+
+  for (size_t n = 0; n < N; n++) {
+    const float* current_scale = scale;
+    const uint8_t* current_zero_point = zero_point;
+
+    for (size_t bd = 0; bd < static_cast<size_t>(broadcastDim); bd++) {
+      auto zp = static_cast<const float>(*current_zero_point);
+      auto sc = *current_scale;
+
+      for (size_t bs = 0; bs < block_size; bs++) {
+        *output++ = static_cast<uint8_t>(clamp(std::round(static_cast<float>(*input++) / sc) + zp, 0.0f, float(UINT8_MAX)));
       }
-    }
-    shape = TensorShape(shape_);
 
-    reshaped_scale = tensorAllocator.Allocate(shape);
-    memcpy(reshaped_scale->MutableDataRaw(), y_scale.DataRaw(), sizeof(float) * y_scale.Size());
+      current_scale += stride;
+      current_zero_point += stride;
+    }
   }
 
-  std::unique_ptr<Tensor> W = tensorAllocator.Allocate(x.Shape());
-  Tensor* pW = W.get();
-
-  TBroadcaster<float> bc(x, has_axis_ ? *reshaped_scale : y_scale);
-  TBroadcastOutput<float> output2(bc.GetSpanSize(), *pW);
-  BroadcastLoop(bc, output2,
-                [](EigenVectorMap<float> output, float input0, ConstEigenVectorMap<float> input1) { output = (input0 / input1.array()).round(); },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<float> input0, float input1) { output = (input0.array() / input1).round(); },
-                [](EigenVectorMap<float> output, ConstEigenVectorMap<float> input0, ConstEigenVectorMap<float> input1) { output = (input0.array() / input1.array()).round(); });
-
-  std::unique_ptr<Tensor> Zf = tensorAllocator.Allocate(has_axis_ ? shape : y_zero_point.Shape());
-  Tensor* pZf = Zf.get();
-  CastData<uint8_t, float>(&y_zero_point, pZf, has_axis_ ? shape : y_zero_point.Shape());
-
-  TBroadcaster<float> bc2(*pW, *pZf);
-  TBroadcastOutput<uint8_t> output(bc2.GetSpanSize(), y);
-  BroadcastLoop(bc2, output,
-                [](EigenVectorMap<uint8_t> output, float input0, ConstEigenVectorMap<float> input1) {
-                  for (std::ptrdiff_t i = 0; i < output.size(); i++) {
-                    output[i] = uint8_t(clamp(input0 + float(input1[i]), 0.0f, float(UINT8_MAX)));
-                  }
-                },
-                [](EigenVectorMap<uint8_t> output, ConstEigenVectorMap<float> input0, float input1) {
-                  for (std::ptrdiff_t i = 0; i < output.size(); i++) {
-                    output[i] = uint8_t(clamp(input0[i] + float(input1), 0.0f, float(UINT8_MAX)));
-                  }
-                },
-                [](EigenVectorMap<uint8_t> output, ConstEigenVectorMap<float> input0, ConstEigenVectorMap<float> input1) {
-                  for (std::ptrdiff_t i = 0; i < output.size(); i++) {
-                    output[i] = uint8_t(clamp(input0[i] + float(input1[i]), 0.0f, float(UINT8_MAX)));
-                  }
-                });
-
   return Status::OK();
 }
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cpu/quantize_linear.h b/onnxruntime/contrib_ops/cpu/quantize_linear.h
index 1175e88fcf045..cc113af44551a 100644
--- a/onnxruntime/contrib_ops/cpu/quantize_linear.h
+++ b/onnxruntime/contrib_ops/cpu/quantize_linear.h
@@ -20,7 +20,7 @@ class DequantizeLinear final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;
 
  private:
-  int64_t axis_;
+  int64_t axis_ = 0;
   bool has_axis_;
 };
 
@@ -34,7 +34,7 @@ class QuantizeLinear final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;
 
  private:
-  int64_t axis_;
+  int64_t axis_ = 0;
   bool has_axis_;
 };
 }  // namespace contrib
diff --git a/onnxruntime/test/contrib_ops/quantize_linear_test.cc b/onnxruntime/test/contrib_ops/quantize_linear_test.cc
index b81536aee80cd..952f3f08f4aab 100644
--- a/onnxruntime/test/contrib_ops/quantize_linear_test.cc
+++ b/onnxruntime/test/contrib_ops/quantize_linear_test.cc
@@ -7,6 +7,7 @@
 namespace onnxruntime {
 namespace test {
 
+// scalar zero & scale with uint8
 TEST(DequantizeLinearOpTest, DequantizeLinear_0) {
   OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
   std::vector<int64_t> dims{4};
@@ -17,6 +18,7 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_0) {
   test.Run();
 }
 
+// scalar zero & scale with int8
 TEST(DequantizeLinearOpTest, DequantizeLinear_1) {
   OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
   std::vector<int64_t> dims{4};
@@ -27,6 +29,7 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_1) {
   test.Run();
 }
 
+// 1d zero & scale with uint8 broadcast axis 0
 TEST(DequantizeLinearOpTest, DequantizeLinear_2) {
   OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
   std::vector<int64_t> dims{3, 4};
@@ -34,9 +37,103 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_2) {
                          {0, 1, 2, 3,
                           0, 1, 2, 3,
                           0, 10, 20, 30});
+  test.AddAttribute<int64_t>("axis", 0);
+  test.AddInput<float>("scale", {3},
+                       {1.0f,
+                        2.0f,
+                        4.0f});
+  test.AddInput<uint8_t>("zero_point", {3},
+                         {0,
+                          0,
+                          0});
+  test.AddOutput<float>("Y", dims,
+                        {0, 1, 2, 3,
+                         0, 2, 4, 6,
+                         0, 40, 80, 120});
+  test.Run();
+}
+
+// 1d zero & scale with int8 broadcast axis 1: column c of the {3, 4} input uses scale[c] and zero_point[c]
+TEST(DequantizeLinearOpTest, DequantizeLinear_3) {
+  OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
+  std::vector<int64_t> dims{3, 4};
+  test.AddInput<int8_t>("X", dims,
+                        {0, 1, 2, 3,
+                         0, 2, 4, 6,
+                         0, 10, 20, 30});
   test.AddAttribute<int64_t>("axis", 1);
-  test.AddInput<float>("scale", {3}, {1.0f, 2.0f, 4.0f});
-  test.AddInput<uint8_t>("zero_point", {3}, {0, 0, 0});
+  test.AddInput<float>("scale", {4}, {1, 2, 4, 8});
+  test.AddInput<int8_t>("zero_point", {4}, {0, -10, -20, -30});
+  test.AddOutput<float>("Y", dims,  // each expected value equals (X - zero_point[c]) * scale[c]
+                        {0, 22, 88, 264,
+                         0, 24, 96, 288,
+                         0, 40, 160, 480});
+  test.Run();
+}
+
+// 1d zero & scale with int8 broadcast axis 1 with 4d tensor input/output
+TEST(DequantizeLinearOpTest, DequantizeLinear_4) {
+  OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
+  std::vector<int64_t> dims{2, 3, 2, 4};
+  test.AddInput<int8_t>("X", dims,
+                        {7, 9, 10, 10,
+                         5, 8, 9, 1,
+
+                         8, 6, 7, 9,
+                         10, 0, 7, 10,
+
+                         8, 2, 6, 0,
+                         5, 9, 8, 1,
+
+                         2, 7, 5, 3,
+                         2, 4, 1, 3,
+
+                         8, 7, 4, 8,
+                         10, 1, 5, 5,
+
+                         7, 7, 0, 2,
+                         4, 4, 0, 5});
+  test.AddAttribute<int64_t>("axis", 1);  // dims[1] == 3 matches the 3-element scale and zero_point below
+  test.AddInput<float>("scale", {3}, {1, 10, 7});
+  test.AddInput<int8_t>("zero_point", {3}, {10, 2, 1});
+  test.AddOutput<float>("Y", dims,  // each 2x4 slice uses one (scale, zero_point) pair: (X - zero_point) * scale
+                        {-3, -1, 0, 0,
+                         -5, -2, -1, -9,
+
+                         60, 40, 50, 70,
+                         80, -20, 50, 80,
+
+                         49, 7, 35, -7,
+                         28, 56, 49, 0,
+
+                         -8, -3, -5, -7,
+                         -8, -6, -9, -7,
+
+                         60, 50, 20, 60,
+                         80, -10, 30, 30,
+
+                         42, 42, -7, 7,
+                         21, 21, -7, 28});
+  test.Run();
+}
+
+// 1d zero & scale with uint8 broadcast axis -2 (-2 resolves to axis 0)
+TEST(DequantizeLinearOpTest, DequantizeLinear_5) {
+  OpTester test("DequantizeLinear", 1, onnxruntime::kMSDomain);
+  std::vector<int64_t> dims{3, 4};
+  test.AddInput<uint8_t>("X", dims,
+                         {0, 1, 2, 3,
+                          0, 1, 2, 3,
+                          0, 10, 20, 30});
+  test.AddAttribute<int64_t>("axis", -2);
+  test.AddInput<float>("scale", {3},
+                       {1.0f,
+                        2.0f,
+                        4.0f});
+  test.AddInput<uint8_t>("zero_point", {3},
+                         {0,
+                          0,
+                          0});
   test.AddOutput<float>("Y", dims,
                         {0, 1, 2, 3,
                          0, 2, 4, 6,
@@ -44,6 +141,7 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_2) {
   test.Run();
 }
 
+// quantize with scalar zero point and scale
 TEST(QuantizeLinearOpTest, QuantizeLinear_0) {
   OpTester test("QuantizeLinear", 1, onnxruntime::kMSDomain);
   std::vector<int64_t> dims{6};
@@ -54,19 +152,15 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_0) {
   test.Run();
 }
 
-// TODO this test is failing for Mac. needs to be debugged.
-#if defined(__MACH__)
-TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_1) {
-#else
+// quantize with broadcasting
 TEST(QuantizeLinearOpTest, QuantizeLinear_1) {
-#endif
   OpTester test("QuantizeLinear", 1, onnxruntime::kMSDomain);
   std::vector<int64_t> dims{3, 4};
   test.AddInput<float>("X", dims,
                        {0, 2, 3, 1000,
                         0, 2, 3, 1000,
                         0, 2, 3, 1000});
-  test.AddAttribute<int64_t>("axis", 1);
+  test.AddAttribute<int64_t>("axis", 0);
   test.AddInput<float>("scale", {3}, {1, 2, 4});
   test.AddInput<uint8_t>("zero_point", {3}, {0, 0, 0});
   test.AddOutput<uint8_t>("Y", dims,
@@ -76,5 +170,22 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_1) {
   test.Run();
 }
 
+// quantize with broadcasting and negative axis (-2 resolves to axis 0)
+TEST(QuantizeLinearOpTest, QuantizeLinear_2) {
+  OpTester test("QuantizeLinear", 1, onnxruntime::kMSDomain);
+  std::vector<int64_t> dims{3, 4};
+  test.AddInput<float>("X", dims,
+                       {0, 2, 3, 1000,
+                        0, 2, 3, 1000,
+                        0, 2, 3, 1000});
+  test.AddAttribute<int64_t>("axis", -2);
+  test.AddInput<float>("scale", {3}, {1, 2, 4});  // one scale per row of the {3, 4} input
+  test.AddInput<uint8_t>("zero_point", {3}, {0, 0, 0});
+  test.AddOutput<uint8_t>("Y", dims,
+                          {0, 2, 3, 255,   // scale 1: 1000 saturates at 255
+                           0, 1, 2, 255,   // scale 2: 3 / 2 = 1.5 rounds to 2; 500 saturates at 255
+                           0, 1, 1, 250});  // scale 4: 0.5 and 0.75 both round to 1; 1000 / 4 = 250
+  test.Run();
+}
 }  // namespace test
 }  // namespace onnxruntime