diff --git a/tensorflow/lite/micro/kernels/xtensa/decompress.cc b/tensorflow/lite/micro/kernels/xtensa/decompress.cc
index 0fa917415c4..13d2ce2dec7 100644
--- a/tensorflow/lite/micro/kernels/xtensa/decompress.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/decompress.cc
@@ -385,14 +385,33 @@ void DecompressionStateXtensa::DecompressToBufferWidthAnyInt8_Xtensa(
     }
   } else {
     int elements_per_channel_t = elements_per_channel_;
+    uint32_t index_1, index_2;
+    uint32_t mask_bits = (1 << compressed_bit_width_) - 1;
 
     for (int i = 0; i < num_channels_t; i++) {
-      for (int j = 0; j < elements_per_channel_t; j++) {
+      elements_per_channel_t = elements_per_channel_;
+      /* if output pointer is not 2 byte aligned */
+      if ((unsigned int)p_out_tmp & 0x1) {
+        AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
+        ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
+        AE_S8_0_IP(d_tmp, p_out_tmp, 1);
+        elements_per_channel_t = elements_per_channel_t - 1;
+      }
+      for (int j = 0; j < (elements_per_channel_t >> 1); j++) {
+        AE_LB_DB_IP((unsigned short*)p_stream, index, 2 * bw);
+        index_1 = (index >> compressed_bit_width_) & mask_bits;
+        index_2 = (index)&mask_bits;
+        ae_int8x8 d_tmp1 = AE_L8_X((const ae_int8*)value_table, index_1);
+        ae_int8x8 d_tmp2 = AE_L8_X((const ae_int8*)value_table, index_2);
+        ae_int16x4 d_tmp =
+            AE_MOVINT16X4_FROMINT8X8(AE_SEL8X8I(d_tmp2, d_tmp1, 21));
+        AE_S16_0_IP(d_tmp, (ae_int16*)p_out_tmp, 2);
+      }
+      if (elements_per_channel_t & 0x1) {
         AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
         ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
         AE_S8_0_IP(d_tmp, p_out_tmp, 1);
       }
-
       value_table += stride;
     }
   }
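
For readers without the HiFi intrinsics, below is a minimal scalar sketch of the loop restructuring in this hunk: emit one element when the output pointer is odd, decode two packed table indices per iteration, then handle the odd remainder. BitReader::ReadBits is a hypothetical stand-in for AE_LB_DB_IP, and plain byte stores replace AE_S8_0_IP / AE_S16_0_IP; this illustrates the head/pair/tail split only and is not the patch itself.

// Scalar sketch of the restructured per-channel decompression loop.
// Assumptions: a hypothetical MSB-first BitReader in place of the
// AE_LB_DB_IP bit-stream intrinsic, plain stores in place of the
// AE_S8/AE_S16 stores.
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical MSB-first bit reader standing in for the compressed stream.
struct BitReader {
  const uint8_t* data;
  size_t bit_pos;
  uint32_t ReadBits(int count) {
    uint32_t value = 0;
    for (int i = 0; i < count; ++i) {
      value = (value << 1) | ((data[bit_pos >> 3] >> (7 - (bit_pos & 7))) & 1u);
      ++bit_pos;
    }
    return value;
  }
};

// One channel: optional head element when the output pointer is odd, then
// two table lookups per iteration, then the odd trailing element.
void DecompressChannel(BitReader& stream, const int8_t* value_table,
                       int bit_width, int8_t* out, int count) {
  const uint32_t mask_bits = (1u << bit_width) - 1u;
  if (reinterpret_cast<uintptr_t>(out) & 0x1) {  // unaligned head
    *out++ = value_table[stream.ReadBits(bit_width)];
    --count;
  }
  for (int j = 0; j < (count >> 1); ++j) {  // two indices per iteration
    uint32_t packed = stream.ReadBits(2 * bit_width);
    *out++ = value_table[(packed >> bit_width) & mask_bits];
    *out++ = value_table[packed & mask_bits];
  }
  if (count & 0x1) {  // odd tail
    *out++ = value_table[stream.ReadBits(bit_width)];
  }
}

int main() {
  // 2-bit indices 0,1,2,3,3,2 packed MSB-first: 00 01 10 11 11 10.
  const uint8_t compressed[] = {0x1B, 0xE0};
  const int8_t value_table[] = {-3, -1, 1, 3};
  int8_t out[6] = {};
  BitReader reader{compressed, 0};
  DecompressChannel(reader, value_table, /*bit_width=*/2, out, 6);
  for (int8_t v : out) std::printf("%d ", v);  // -3 -1 1 3 3 1
  std::printf("\n");
}

The pairwise body halves the number of bit-stream fetches and lets the Xtensa path combine the two looked-up bytes into a single 2-byte store, which is why the head element is peeled off first to make the output pointer 2-byte aligned.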