diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 99157a23fcb..5695e882a95 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -17,6 +17,7 @@
 #include "orc_common.h"
 #include "orc_gpu.h"
 
+#include <cudf/table/table_device_view.cuh>
 #include <io/utilities/block_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -46,14 +47,16 @@ struct dictinit_state_s {
 };
 
 /**
- * @brief Return a 12-bit hash from a byte sequence
+ * @brief Return a 12-bit hash from a string
  */
-static inline __device__ uint32_t nvstr_init_hash(char const *ptr, uint32_t len)
+static inline __device__ uint32_t hash_string(const string_view val)
 {
-  if (len != 0) {
-    return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
-  } else {
+  if (val.empty()) {
     return 0;
+  } else {
+    char const *ptr = val.data();
+    uint32_t len    = val.size_bytes();
+    return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
   }
 }
 
@@ -71,7 +74,8 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
 {
   if (t == 0) { s->nnz = 0; }
   for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) {
-    const uint32_t *valid_map = s->chunk.valid_map_base;
+    const uint32_t *valid_map = s->chunk.leaf_column->null_mask();
+    auto column_offset        = s->chunk.leaf_column->offset();
     uint32_t is_valid, nz_pos;
     if (t < block_size / 32) {
       if (!valid_map) {
@@ -80,10 +84,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
         uint32_t const row   = s->chunk.start_row + i + t * 32;
         auto const chunk_end = s->chunk.start_row + s->chunk.num_rows;
 
-        auto const valid_map_idx = (row + s->chunk.column_offset) / 32;
+        auto const valid_map_idx = (row + column_offset) / 32;
         uint32_t valid           = (row < chunk_end) ? valid_map[valid_map_idx] : 0;
 
-        auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f;
+        auto const rows_in_next_word = (row + column_offset) & 0x1f;
         if (rows_in_next_word != 0) {
           auto const rows_in_current_word = 32 - rows_in_next_word;
           // Read next word if any rows are within the chunk
@@ -111,12 +115,18 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
  * @brief Gather all non-NULL string rows and compute total character data size
  *
  * @param[in] chunks DictionaryChunk device array [rowgroup][column]
- * @param[in] num_columns Number of columns
+ * @param[in] num_columns Number of string columns
  */
 // blockDim {block_size,1,1}
 template <int block_size>
 __global__ void __launch_bounds__(block_size, 2)
-  gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
+  gpuInitDictionaryIndices(DictionaryChunk *chunks,
+                           const table_device_view view,
+                           uint32_t *dict_data,
+                           uint32_t *dict_index,
+                           size_t row_index_stride,
+                           size_type *str_col_ids,
+                           uint32_t num_columns)
 {
   __shared__ __align__(16) dictinit_state_s state_g;
 
@@ -131,12 +141,21 @@ __global__ void __launch_bounds__(block_size, 2)
   dictinit_state_s *const s = &state_g;
   uint32_t col_id           = blockIdx.x;
   uint32_t group_id         = blockIdx.y;
-  const nvstrdesc_s *ck_data;
-  uint32_t *dict_data;
   uint32_t nnz, start_row, dict_char_count;
   int t = threadIdx.x;
 
-  if (t == 0) s->chunk = chunks[group_id * num_columns + col_id];
+  if (t == 0) {
+    column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id];
+    s->chunk                             = chunks[group_id * num_columns + col_id];
+    s->chunk.leaf_column                 = leaf_column_view;
+    s->chunk.dict_data =
+      dict_data + col_id * leaf_column_view->size() + group_id * row_index_stride;
+    s->chunk.dict_index = dict_index + col_id * leaf_column_view->size();
+    s->chunk.start_row  = group_id * row_index_stride;
+    s->chunk.num_rows =
+      min(row_index_stride,
+          max(static_cast<size_t>(leaf_column_view->size() - s->chunk.start_row), size_t{0}));
+  }
   for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) {
     if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0;
   }
@@ -152,15 +171,15 @@ __global__ void __launch_bounds__(block_size, 2)
   nnz       = s->nnz;
   dict_data = s->chunk.dict_data;
   start_row = s->chunk.start_row;
-  ck_data   = static_cast<const nvstrdesc_s *>(s->chunk.column_data_base) + start_row;
   for (uint32_t i = 0; i < nnz; i += block_size) {
     uint32_t ck_row = 0;
     uint32_t hash   = 0;
     uint32_t len    = 0;
     if (i + t < nnz) {
-      ck_row = s->dict[i + t];
-      len    = static_cast<uint32_t>(ck_data[ck_row].count);
-      hash   = nvstr_init_hash(ck_data[ck_row].ptr, len);
+      ck_row                 = s->dict[i + t];
+      string_view string_val = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
+      len                    = static_cast<uint32_t>(string_val.size_bytes());
+      hash                   = hash_string(string_val);
     }
     len = block_reduce(temp_storage.reduce_storage).Sum(len);
     if (t == 0) s->chunk.string_char_count += len;
@@ -200,10 +219,11 @@ __global__ void __launch_bounds__(block_size, 2)
     uint32_t ck_row = 0, pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row;
     bool collision;
     if (i + t < nnz) {
-      ck_row  = dict_data[i + t] - start_row;
-      hash    = nvstr_init_hash(ck_data[ck_row].ptr, static_cast<uint32_t>(ck_data[ck_row].count));
-      sh      = (hash & 1) ? 16 : 0;
-      pos_old = s->map.u16[hash];
+      ck_row                 = dict_data[i + t] - start_row;
+      string_view string_val = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
+      hash                   = hash_string(string_val);
+      sh                     = (hash & 1) ? 16 : 0;
+      pos_old                = s->map.u16[hash];
     }
     // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic
     // behavior for the first row in the hash map that will be used for early duplicate detection
@@ -233,18 +253,16 @@ __global__ void __launch_bounds__(block_size, 2)
   for (uint32_t i = 0; i < nnz; i += block_size) {
     uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0;
     if (i + t < nnz) {
-      const char *str1, *str2;
-      uint32_t len1, len2, hash;
-      ck_row     = s->dict[i + t];
-      str1       = ck_data[ck_row].ptr;
-      len1       = static_cast<uint32_t>(ck_data[ck_row].count);
-      hash       = nvstr_init_hash(str1, len1);
-      ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0];
+      ck_row                   = s->dict[i + t];
+      string_view string_value = s->chunk.leaf_column->element<string_view>(ck_row + start_row);
+      auto const string_length = static_cast<uint32_t>(string_value.size_bytes());
+      auto const hash          = hash_string(string_value);
+      ck_row_ref               = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0];
       if (ck_row_ref != ck_row) {
-        str2    = ck_data[ck_row_ref].ptr;
-        len2    = static_cast<uint32_t>(ck_data[ck_row_ref].count);
-        is_dupe = nvstr_is_equal(str1, len1, str2, len2);
-        dict_char_count += (is_dupe) ? 0 : len1;
+        string_view reference_string =
+          s->chunk.leaf_column->element<string_view>(ck_row_ref + start_row);
+        is_dupe = (string_value == reference_string);
+        dict_char_count += (is_dupe) ? 0 : string_length;
       }
     }
     uint32_t dupes_in_block;
@@ -269,6 +287,12 @@ __global__ void __launch_bounds__(block_size, 2)
     chunks[group_id * num_columns + col_id].string_char_count = s->chunk.string_char_count;
     chunks[group_id * num_columns + col_id].num_dict_strings  = nnz - s->total_dupes;
     chunks[group_id * num_columns + col_id].dict_char_count   = dict_char_count;
+    chunks[group_id * num_columns + col_id].leaf_column       = s->chunk.leaf_column;
+
+    chunks[group_id * num_columns + col_id].dict_data  = s->chunk.dict_data;
+    chunks[group_id * num_columns + col_id].dict_index = s->chunk.dict_index;
+    chunks[group_id * num_columns + col_id].start_row  = s->chunk.start_row;
+    chunks[group_id * num_columns + col_id].num_rows   = s->chunk.num_rows;
   }
 }
 
@@ -357,7 +381,6 @@ __global__ void __launch_bounds__(block_size)
   uint32_t num_strings;
   uint32_t *dict_data, *dict_index;
   uint32_t dict_char_count;
-  const nvstrdesc_s *str_data;
   int t = threadIdx.x;
 
   if (t == 0) s->stripe = stripes[stripe_id * num_columns + col_id];
@@ -366,21 +389,17 @@ __global__ void __launch_bounds__(block_size)
   num_strings = s->stripe.num_strings;
   dict_data   = s->stripe.dict_data;
   if (!dict_data) return;
-  dict_index      = s->stripe.dict_index;
-  str_data        = static_cast<const nvstrdesc_s *>(s->stripe.column_data_base);
-  dict_char_count = 0;
+  dict_index                 = s->stripe.dict_index;
+  string_view current_string = string_view::min();
+  dict_char_count            = 0;
   for (uint32_t i = 0; i < num_strings; i += block_size) {
     uint32_t cur     = (i + t < num_strings) ? dict_data[i + t] : 0;
     uint32_t cur_len = 0;
-    const char *cur_ptr;
-    bool is_dupe = false;
-    if (i + t < num_strings) {
-      cur_ptr = str_data[cur].ptr;
-      cur_len = str_data[cur].count;
-    }
+    bool is_dupe     = false;
+    if (i + t < num_strings) { current_string = s->stripe.leaf_column->element<string_view>(cur); }
     if (i + t != 0 && i + t < num_strings) {
       uint32_t prev = dict_data[i + t - 1];
-      is_dupe       = nvstr_is_equal(cur_ptr, cur_len, str_data[prev].ptr, str_data[prev].count);
+      is_dupe       = (current_string == (s->stripe.leaf_column->element<string_view>(prev)));
     }
     dict_char_count += (is_dupe) ? 0 : cur_len;
     uint32_t dupes_in_block;
@@ -403,14 +422,14 @@ __global__ void __launch_bounds__(block_size)
 }
 
 /**
- * @brief Launches kernel for initializing dictionary chunks
- *
- * @param[in] chunks DictionaryChunk device array [rowgroup][column]
- * @param[in] num_columns Number of columns
- * @param[in] num_rowgroups Number of row groups
- * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
+ * @copydoc cudf::io::orc::gpu::InitDictionaryIndices
  */
-void InitDictionaryIndices(DictionaryChunk *chunks,
+void InitDictionaryIndices(const table_device_view &view,
+                           DictionaryChunk *chunks,
+                           uint32_t *dict_data,
+                           uint32_t *dict_index,
+                           size_t row_index_stride,
+                           size_type *str_col_ids,
                            uint32_t num_columns,
                            uint32_t num_rowgroups,
                            rmm::cuda_stream_view stream)
@@ -418,20 +437,12 @@ void InitDictionaryIndices(DictionaryChunk *chunks,
   static constexpr int block_size = 512;
   dim3 dim_block(block_size, 1);
   dim3 dim_grid(num_columns, num_rowgroups);
-  gpuInitDictionaryIndices<block_size>
-    <<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
+  gpuInitDictionaryIndices<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
+    chunks, view, dict_data, dict_index, row_index_stride, str_col_ids, num_columns);
 }
 
 /**
- * @brief Launches kernel for building stripe dictionaries
- *
- * @param[in] stripes StripeDictionary device array [stripe][column]
- * @param[in] stripes_host StripeDictionary host array [stripe][column]
- * @param[in] chunks DictionaryChunk device array [rowgroup][column]
- * @param[in] num_stripes Number of stripes
- * @param[in] num_rowgroups Number of row groups
- * @param[in] num_columns Number of columns
- * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
+ * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries
  */
 void BuildStripeDictionaries(StripeDictionary *stripes,
                              StripeDictionary *stripes_host,
@@ -447,18 +458,16 @@ void BuildStripeDictionaries(StripeDictionary *stripes,
     stripes, chunks, num_columns);
   for (uint32_t i = 0; i < num_stripes * num_columns; i++) {
     if (stripes_host[i].dict_data != nullptr) {
-      thrust::device_ptr<uint32_t> p = thrust::device_pointer_cast(stripes_host[i].dict_data);
-      const nvstrdesc_s *str_data =
-        static_cast<const nvstrdesc_s *>(stripes_host[i].column_data_base);
+      thrust::device_ptr<uint32_t> dict_data_ptr =
+        thrust::device_pointer_cast(stripes_host[i].dict_data);
+      column_device_view *string_column = stripes_host[i].leaf_column;
       // NOTE: Requires the --expt-extended-lambda nvcc flag
       thrust::sort(rmm::exec_policy(stream),
-                   p,
-                   p + stripes_host[i].num_strings,
-                   [str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) {
-                     return nvstr_is_lesser(str_data[lhs].ptr,
-                                            (uint32_t)str_data[lhs].count,
-                                            str_data[rhs].ptr,
-                                            (uint32_t)str_data[rhs].count);
+                   dict_data_ptr,
+                   dict_data_ptr + stripes_host[i].num_strings,
+                   [string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) {
+                     return string_column->element<string_view>(lhs) <
+                            string_column->element<string_view>(rhs);
                    });
     }
   }
diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
index 7ad92e40cb4..55df0adf95b 100644
--- a/cpp/src/io/orc/orc_gpu.h
+++ b/cpp/src/io/orc/orc_gpu.h
@@ -21,6 +21,7 @@
 #include <io/comp/gpuinflate.h>
 #include <io/orc/orc_common.h>
 #include <io/statistics/column_stats.h>
+#include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -124,16 +125,15 @@ struct RowGroup {
  * @brief Struct to describe an encoder data chunk
  */
 struct EncChunk {
-  const uint32_t *valid_map_base;  // base ptr of input valid bit map
-  size_type column_offset;         // index of the first element relative to the base memory
-  const void *column_data_base;    // base ptr of input column data
-  uint32_t start_row;              // start row of this chunk
-  uint32_t num_rows;               // number of rows in this chunk
-  uint32_t valid_rows;             // max number of valid rows
-  uint8_t encoding_kind;           // column encoding kind (orc::ColumnEncodingKind)
-  uint8_t type_kind;               // column data type (orc::TypeKind)
-  uint8_t dtype_len;               // data type length
-  uint8_t scale;                   // scale for decimals or timestamps
+  uint32_t start_row;     // start row of this chunk
+  uint32_t num_rows;      // number of rows in this chunk
+  uint8_t encoding_kind;  // column encoding kind (orc::ColumnEncodingKind)
+  uint8_t type_kind;      // column data type (orc::TypeKind)
+  uint8_t dtype_len;      // data type length
+  uint8_t scale;          // scale for decimals or timestamps
+
+  uint32_t *dict_index;  // dictionary index from row index
+  column_device_view *leaf_column;
 };
 
 /**
@@ -163,10 +163,7 @@ struct StripeStream {
  * @brief Struct to describe a dictionary chunk
  */
 struct DictionaryChunk {
-  const uint32_t *valid_map_base;  // base ptr of input valid bit map
-  size_type column_offset;         // index of the first element relative to the base memory
-  const void *column_data_base;    // base ptr of column data (ptr,len pair)
-  uint32_t *dict_data;             // dictionary data (index of non-null rows)
+  uint32_t *dict_data;   // dictionary data (index of non-null rows)
   uint32_t *dict_index;  // row indices of corresponding string (row from dictionary index)
   uint32_t start_row;    // start row of this chunk
   uint32_t num_rows;     // num rows in this chunk
@@ -175,20 +172,23 @@ struct DictionaryChunk {
     string_char_count;  // total size of string data (NOTE: assumes less than 4G bytes per chunk)
   uint32_t num_dict_strings;  // number of strings in dictionary
   uint32_t dict_char_count;   // size of dictionary string data for this chunk
+
+  column_device_view *leaf_column;  //!< Pointer to string column
 };
 
 /**
  * @brief Struct to describe a dictionary
  */
 struct StripeDictionary {
-  const void *column_data_base;  // base ptr of column data (ptr,len pair)
-  uint32_t *dict_data;           // row indices of corresponding string (row from dictionary index)
-  uint32_t *dict_index;          // dictionary index from row index
-  uint32_t column_id;            // real column id
-  uint32_t start_chunk;          // first chunk in stripe
-  uint32_t num_chunks;           // number of chunks in the stripe
-  uint32_t num_strings;          // number of unique strings in the dictionary
-  uint32_t dict_char_count;      // total size of dictionary string data
+  uint32_t *dict_data;       // row indices of corresponding string (row from dictionary index)
+  uint32_t *dict_index;      // dictionary index from row index
+  uint32_t column_id;        // real column id
+  uint32_t start_chunk;      // first chunk in stripe
+  uint32_t num_chunks;       // number of chunks in the stripe
+  uint32_t num_strings;      // number of unique strings in the dictionary
+  uint32_t dict_char_count;  // total size of dictionary string data
+
+  column_device_view *leaf_column;  //!< Pointer to string column
 };
 
 /**
@@ -313,6 +313,17 @@ void EncodeStripeDictionaries(StripeDictionary *stripes,
                               detail::device_2dspan<encoder_chunk_streams> enc_streams,
                               rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
+/**
+ * @brief Set leaf column element of EncChunk
+ *
+ * @param[in] view table device view representing input table
+ * @param[in,out] chunks encoder chunk device array [column][rowgroup]
+ * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
+ */
+void set_chunk_columns(const table_device_view &view,
+                       detail::device_2dspan<EncChunk> chunks,
+                       rmm::cuda_stream_view stream);
+
 /**
  * @brief Launches kernel for compacting chunked column data prior to compression
  *
@@ -350,15 +361,25 @@ void CompressOrcDataStreams(uint8_t *compressed_data,
 /**
  * @brief Launches kernel for initializing dictionary chunks
  *
+ * @param[in] view table device view representing input table
  * @param[in,out] chunks DictionaryChunk device array [rowgroup][column]
+ * @param[in] dict_data dictionary data (index of non-null rows)
+ * @param[in] dict_index row indices of corresponding string (row from dictionary index)
+ * @param[in] row_index_stride Rowgroup size in rows
+ * @param[in] str_col_ids List of columns that are strings type
  * @param[in] num_columns Number of columns
  * @param[in] num_rowgroups Number of row groups
  * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
  */
-void InitDictionaryIndices(DictionaryChunk *chunks,
+void InitDictionaryIndices(const table_device_view &view,
+                           DictionaryChunk *chunks,
+                           uint32_t *dict_data,
+                           uint32_t *dict_index,
+                           size_t row_index_stride,
+                           size_type *str_col_ids,
                            uint32_t num_columns,
                            uint32_t num_rowgroups,
-                           rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+                           rmm::cuda_stream_view stream);
 
 /**
  * @brief Launches kernel for building stripe dictionaries
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index aef32efaf6e..10932d36309 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -669,19 +669,20 @@ __global__ void __launch_bounds__(block_size)
       if (t * 8 < nrows) {
         uint32_t row  = s->chunk.start_row + present_rows + t * 8;
         uint8_t valid = 0;
-        if (row < s->chunk.valid_rows) {
-          if (s->chunk.valid_map_base) {
-            size_type current_valid_offset = row + s->chunk.column_offset;
-            size_type next_valid_offset    = current_valid_offset + min(32, s->chunk.valid_rows);
+        if (row < s->chunk.leaf_column->size()) {
+          if (s->chunk.leaf_column->nullable()) {
+            size_type current_valid_offset = row + s->chunk.leaf_column->offset();
+            size_type next_valid_offset =
+              current_valid_offset + min(32, s->chunk.leaf_column->size());
 
             bitmask_type mask = cudf::detail::get_mask_offset_word(
-              s->chunk.valid_map_base, 0, current_valid_offset, next_valid_offset);
+              s->chunk.leaf_column->null_mask(), 0, current_valid_offset, next_valid_offset);
             valid = 0xff & mask;
           } else {
             valid = 0xff;
           }
-          if (row + 7 > s->chunk.valid_rows) {
-            valid = valid & ((1 << (s->chunk.valid_rows & 7)) - 1);
+          if (row + 7 > s->chunk.leaf_column->size()) {
+            valid = valid & ((1 << (s->chunk.leaf_column->size() & 7)) - 1);
           }
         }
         s->valid_buf[(row >> 3) & 0x1ff] = valid;
@@ -729,19 +730,18 @@ __global__ void __launch_bounds__(block_size)
       lengths_to_positions(s->buf.u32, 512, t);
       __syncthreads();
       if (valid) {
-        int nz_idx       = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1);
-        void const *base = s->chunk.column_data_base;
+        int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1);
         switch (s->chunk.type_kind) {
           case INT:
           case DATE:
-          case FLOAT: s->vals.u32[nz_idx] = static_cast<const uint32_t *>(base)[row]; break;
+          case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element<uint32_t>(row); break;
           case DOUBLE:
-          case LONG: s->vals.u64[nz_idx] = static_cast<const uint64_t *>(base)[row]; break;
-          case SHORT: s->vals.u32[nz_idx] = static_cast<const uint16_t *>(base)[row]; break;
+          case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element<uint64_t>(row); break;
+          case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element<uint16_t>(row); break;
           case BOOLEAN:
-          case BYTE: s->vals.u8[nz_idx] = static_cast<const uint8_t *>(base)[row]; break;
+          case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element<uint8_t>(row); break;
           case TIMESTAMP: {
-            int64_t ts       = static_cast<const int64_t *>(base)[row];
+            int64_t ts       = s->chunk.leaf_column->element<int64_t>(row);
             int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)];
             int64_t seconds  = ts / ts_scale;
             int64_t nanos    = (ts - seconds * ts_scale);
@@ -772,16 +772,13 @@ __global__ void __launch_bounds__(block_size)
           }
           case STRING:
             if (s->chunk.encoding_kind == DICTIONARY_V2) {
-              uint32_t dict_idx = static_cast<const uint32_t *>(base)[row];
-              if (dict_idx > 0x7fffffffu)
-                dict_idx = static_cast<const uint32_t *>(base)[dict_idx & 0x7fffffffu];
+              uint32_t dict_idx = s->chunk.dict_index[row];
+              if (dict_idx > 0x7fffffffu) dict_idx = s->chunk.dict_index[dict_idx & 0x7fffffffu];
               s->vals.u32[nz_idx] = dict_idx;
             } else {
-              const nvstrdesc_s *str_desc = static_cast<const nvstrdesc_s *>(base) + row;
-              const char *ptr             = str_desc->ptr;
-              uint32_t count              = static_cast<uint32_t>(str_desc->count);
-              s->u.strenc.str_data[s->buf.u32[t] - 1] = ptr;
-              s->lengths.u32[nz_idx]                  = count;
+              string_view value = s->chunk.leaf_column->element<string_view>(row);
+              s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data();
+              s->lengths.u32[nz_idx]                  = value.size_bytes();
             }
             break;
           default: break;
@@ -899,8 +896,8 @@ __global__ void __launch_bounds__(block_size)
     streams[col_id][group_id].lengths[t] = s->strm_pos[t];
     if (!s->stream.data_ptrs[t]) {
       streams[col_id][group_id].data_ptrs[t] =
-        static_cast<uint8_t *>(const_cast<void *>(s->chunk.column_data_base)) +
-        s->chunk.start_row * s->chunk.dtype_len;
+        static_cast<uint8_t *>(const_cast<void *>(s->chunk.leaf_column->head())) +
+        (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len;
     }
   }
 }
@@ -939,8 +936,8 @@ __global__ void __launch_bounds__(block_size)
     s->nrows         = s->u.dict_stripe.num_strings;
     s->cur_row       = 0;
   }
-  auto const str_desc  = static_cast<const nvstrdesc_s *>(s->u.dict_stripe.column_data_base);
-  auto const dict_data = s->u.dict_stripe.dict_data;
+  column_device_view *string_column = s->u.dict_stripe.leaf_column;
+  auto const dict_data              = s->u.dict_stripe.dict_data;
   __syncthreads();
   if (s->chunk.encoding_kind != DICTIONARY_V2) {
     return;  // This column isn't using dictionary encoding -> bail out
@@ -951,8 +948,13 @@ __global__ void __launch_bounds__(block_size)
     uint32_t string_idx = (t < numvals) ? dict_data[s->cur_row + t] : 0;
     if (cid == CI_DICTIONARY) {
       // Encoding string contents
-      const char *ptr = (t < numvals) ? str_desc[string_idx].ptr : 0;
-      uint32_t count  = (t < numvals) ? static_cast<uint32_t>(str_desc[string_idx].count) : 0;
+      const char *ptr = 0;
+      uint32_t count  = 0;
+      if (t < numvals) {
+        auto string_val = string_column->element<string_view>(string_idx);
+        ptr             = string_val.data();
+        count           = string_val.size_bytes();
+      }
       s->u.strenc.str_data[t] = ptr;
       StoreStringData(s->stream.data_ptrs[CI_DICTIONARY] + s->strm_pos[CI_DICTIONARY],
                       &s->u.strenc,
@@ -961,7 +963,10 @@ __global__ void __launch_bounds__(block_size)
       if (!t) { s->strm_pos[CI_DICTIONARY] += s->u.strenc.char_count; }
     } else {
       // Encoding string lengths
-      uint32_t count  = (t < numvals) ? static_cast<uint32_t>(str_desc[string_idx].count) : 0;
+      uint32_t count =
+        (t < numvals)
+          ? static_cast<uint32_t>(string_column->element<string_view>(string_idx).size_bytes())
+          : 0;
       uint32_t nz_idx = (s->cur_row + t) & 0x3ff;
       if (t < numvals) s->lengths.u32[nz_idx] = count;
       __syncthreads();
@@ -982,6 +987,15 @@ __global__ void __launch_bounds__(block_size)
   if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; }
 }
 
+__global__ void __launch_bounds__(512)
+  gpu_set_chunk_columns(const table_device_view view, device_2dspan<EncChunk> chunks)
+{
+  // Set leaf_column member of EncChunk
+  for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) {
+    chunks[blockIdx.x][i].leaf_column = view.begin() + blockIdx.x;
+  }
+}
+
 /**
  * @brief Merge chunked column data into a single contiguous stream
  *
@@ -1189,6 +1203,16 @@ void EncodeStripeDictionaries(StripeDictionary *stripes,
     <<<dim_grid, dim_block, 0, stream.value()>>>(stripes, chunks, enc_streams);
 }
 
+void set_chunk_columns(const table_device_view &view,
+                       device_2dspan<EncChunk> chunks,
+                       rmm::cuda_stream_view stream)
+{
+  dim3 dim_block(512, 1);
+  dim3 dim_grid(chunks.size().first, 1);
+
+  gpu_set_chunk_columns<<<dim_grid, dim_block, 0, stream.value()>>>(view, chunks);
+}
+
 void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
                            device_2dspan<encoder_chunk_streams> enc_streams,
                            rmm::cuda_stream_view stream)
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index eb5e90bbeec..cb75698fd8d 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -42,7 +42,6 @@ namespace detail {
 namespace orc {
 using namespace cudf::io::orc;
 using namespace cudf::io;
-using cudf::io::orc::gpu::nvstrdesc_s;
 
 struct row_group_index_info {
   int32_t pos       = -1;  // Position
@@ -111,39 +110,6 @@ constexpr T to_clockscale(cudf::type_id timestamp_id)
 
 }  // namespace
 
-/**
- * @brief Helper kernel for converting string data/offsets into nvstrdesc
- * REMOVEME: Once we eliminate the legacy readers/writers, the kernels could be
- * made to use the native offset+data layout.
- */
-__global__ void stringdata_to_nvstrdesc(gpu::nvstrdesc_s *dst,
-                                        const size_type *offsets,
-                                        const char *strdata,
-                                        const uint32_t *nulls,
-                                        const size_type column_offset,
-                                        size_type column_size)
-{
-  size_type row = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row < column_size) {
-    uint32_t is_valid = (nulls != nullptr)
-                          ? (nulls[(row + column_offset) / 32] >> ((row + column_offset) % 32)) & 1
-                          : 1;
-    size_t count;
-    const char *ptr;
-    if (is_valid) {
-      size_type cur  = offsets[row];
-      size_type next = offsets[row + 1];
-      ptr            = strdata + cur;
-      count          = (next > cur) ? next - cur : 0;
-    } else {
-      ptr   = nullptr;
-      count = 0;
-    }
-    dst[row].ptr   = ptr;
-    dst[row].count = count;
-  }
-}
-
 /**
  * @brief Helper class that adds ORC-specific column info
  */
@@ -160,31 +126,14 @@ class orc_column_view {
                            rmm::cuda_stream_view stream)
     : _id(id),
       _str_id(str_id),
-      _string_type(col.type().id() == type_id::STRING),
-      _type_width(_string_type ? 0 : cudf::size_of(col.type())),
+      _is_string_type(col.type().id() == type_id::STRING),
+      _type_width(_is_string_type ? 0 : cudf::size_of(col.type())),
       _data_count(col.size()),
       _null_count(col.null_count()),
-      _data(col.head<uint8_t>() + col.offset() * _type_width),
       _nulls(col.null_mask()),
-      _column_offset(col.offset()),
       _clockscale(to_clockscale<uint8_t>(col.type().id())),
       _type_kind(to_orc_type(col.type().id()))
   {
-    if (_string_type && _data_count > 0) {
-      strings_column_view view{col};
-      _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream);
-
-      stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>(
-        static_cast<gpu::nvstrdesc_s *>(_indexes.data()),
-        view.offsets().data<size_type>() + view.offset(),
-        view.chars().data<char>(),
-        _nulls,
-        _column_offset,
-        _data_count);
-      _data = _indexes.data();
-
-      stream.synchronize();
-    }
     // Generating default name if name isn't present in metadata
     if (metadata && _id < metadata->column_names.size()) {
       _name = metadata->column_names[_id];
@@ -193,7 +142,7 @@ class orc_column_view {
     }
   }
 
-  auto is_string() const noexcept { return _string_type; }
+  auto is_string() const noexcept { return _is_string_type; }
   void set_dict_stride(size_t stride) noexcept { dict_stride = stride; }
   auto get_dict_stride() const noexcept { return dict_stride; }
 
@@ -207,7 +156,7 @@ class orc_column_view {
   }
   auto host_dict_chunk(size_t rowgroup) const
   {
-    assert(_string_type);
+    assert(_is_string_type);
     return &dict[rowgroup * dict_stride + _str_id];
   }
   auto device_dict_chunk() const { return d_dict; }
@@ -223,7 +172,7 @@ class orc_column_view {
   }
   auto host_stripe_dict(size_t stripe) const
   {
-    assert(_string_type);
+    assert(_is_string_type);
     return &stripe_dict[stripe * dict_stride + _str_id];
   }
   auto device_stripe_dict() const { return d_stripe_dict; }
@@ -233,9 +182,7 @@ class orc_column_view {
   size_t data_count() const noexcept { return _data_count; }
   size_t null_count() const noexcept { return _null_count; }
   bool nullable() const noexcept { return (_nulls != nullptr); }
-  void const *data() const noexcept { return _data; }
   uint32_t const *nulls() const noexcept { return _nulls; }
-  size_type column_offset() const noexcept { return _column_offset; }
   uint8_t clockscale() const noexcept { return _clockscale; }
 
   void set_orc_encoding(ColumnEncodingKind e) { _encoding_kind = e; }
@@ -245,17 +192,15 @@ class orc_column_view {
 
  private:
   // Identifier within set of columns and string columns, respectively
-  size_t _id        = 0;
-  size_t _str_id    = 0;
-  bool _string_type = false;
-
-  size_t _type_width       = 0;
-  size_t _data_count       = 0;
-  size_t _null_count       = 0;
-  void const *_data        = nullptr;
-  uint32_t const *_nulls   = nullptr;
-  size_type _column_offset = 0;
-  uint8_t _clockscale      = 0;
+  size_t _id           = 0;
+  size_t _str_id       = 0;
+  bool _is_string_type = false;
+
+  size_t _type_width     = 0;
+  size_t _data_count     = 0;
+  size_t _null_count     = 0;
+  uint32_t const *_nulls = nullptr;
+  uint8_t _clockscale    = 0;
 
   // ORC-related members
   std::string _name{};
@@ -263,7 +208,6 @@ class orc_column_view {
   ColumnEncodingKind _encoding_kind;
 
   // String dictionary-related members
-  rmm::device_buffer _indexes;
   size_t dict_stride                       = 0;
   gpu::DictionaryChunk const *dict         = nullptr;
   gpu::StripeDictionary const *stripe_dict = nullptr;
@@ -308,8 +252,10 @@ std::vector<stripe_rowgroups> writer::impl::gather_stripe_info(
   return infos;
 }
 
-void writer::impl::init_dictionaries(orc_column_view *columns,
+void writer::impl::init_dictionaries(const table_device_view &view,
+                                     orc_column_view *columns,
                                      std::vector<int> const &str_col_ids,
+                                     device_span<size_type> d_str_col_ids,
                                      uint32_t *dict_data,
                                      uint32_t *dict_index,
                                      hostdevice_vector<gpu::DictionaryChunk> *dict)
@@ -321,26 +267,17 @@ void writer::impl::init_dictionaries(orc_column_view *columns,
     auto &str_column = columns[str_col_ids[i]];
     str_column.set_dict_stride(str_col_ids.size());
     str_column.attach_dict_chunk(dict->host_ptr(), dict->device_ptr());
-
-    for (size_t g = 0; g < num_rowgroups; g++) {
-      auto *ck              = &(*dict)[g * str_col_ids.size() + i];
-      ck->valid_map_base    = str_column.nulls();
-      ck->column_offset     = str_column.column_offset();
-      ck->column_data_base  = str_column.data();
-      ck->dict_data         = dict_data + i * str_column.data_count() + g * row_index_stride_;
-      ck->dict_index        = dict_index + i * str_column.data_count();  // Indexed by abs row
-      ck->start_row         = g * row_index_stride_;
-      ck->num_rows          = std::min<uint32_t>(row_index_stride_,
-                                        std::max<int>(str_column.data_count() - ck->start_row, 0));
-      ck->num_strings       = 0;
-      ck->string_char_count = 0;
-      ck->num_dict_strings  = 0;
-      ck->dict_char_count   = 0;
-    }
   }
 
-  dict->host_to_device(stream);
-  gpu::InitDictionaryIndices(dict->device_ptr(), str_col_ids.size(), num_rowgroups, stream);
+  gpu::InitDictionaryIndices(view,
+                             dict->device_ptr(),
+                             dict_data,
+                             dict_index,
+                             row_index_stride_,
+                             d_str_col_ids.data(),
+                             d_str_col_ids.size(),
+                             num_rowgroups,
+                             stream);
   dict->device_to_host(stream, true);
 }
 
@@ -358,19 +295,19 @@ void writer::impl::build_dictionaries(orc_column_view *columns,
     str_column.attach_stripe_dict(stripe_dict.host_ptr(), stripe_dict.device_ptr());
 
     for (auto const &stripe : stripe_bounds) {
-      auto &sd            = stripe_dict[stripe.id * str_col_ids.size() + col_idx];
-      sd.column_data_base = str_column.host_dict_chunk(0)->column_data_base;
-      sd.dict_data        = str_column.host_dict_chunk(stripe.first)->dict_data;
-      sd.dict_index       = dict_index + col_idx * str_column.data_count();  // Indexed by abs row
-      sd.column_id        = str_col_ids[col_idx];
-      sd.start_chunk      = stripe.first;
-      sd.num_chunks       = stripe.size;
-      sd.dict_char_count  = 0;
+      auto &sd           = stripe_dict[stripe.id * str_col_ids.size() + col_idx];
+      sd.dict_data       = str_column.host_dict_chunk(stripe.first)->dict_data;
+      sd.dict_index      = dict_index + col_idx * str_column.data_count();  // Indexed by abs row
+      sd.column_id       = str_col_ids[col_idx];
+      sd.start_chunk     = stripe.first;
+      sd.num_chunks      = stripe.size;
+      sd.dict_char_count = 0;
       sd.num_strings =
         std::accumulate(stripe.cbegin(), stripe.cend(), 0, [&](auto dt_str_cnt, auto rg_idx) {
           const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx];
           return dt_str_cnt + dt.num_dict_strings;
         });
+      sd.leaf_column = dict[col_idx].leaf_column;
     }
 
     if (enable_dictionary_) {
@@ -593,15 +530,16 @@ struct segmented_valid_cnt_input {
   std::vector<size_type> indices;
 };
 
-encoded_data writer::impl::encode_columns(host_span<orc_column_view const> columns,
+encoded_data writer::impl::encode_columns(const table_device_view &view,
+                                          host_span<orc_column_view const> columns,
                                           std::vector<int> const &str_col_ids,
                                           host_span<stripe_rowgroups const> stripe_bounds,
                                           orc_streams const &streams)
 {
   auto const num_columns   = columns.size();
   auto const num_rowgroups = stripes_size(stripe_bounds);
-  hostdevice_2dvector<gpu::EncChunk> chunks(num_columns, num_rowgroups);
-  hostdevice_2dvector<gpu::encoder_chunk_streams> chunk_streams(num_columns, num_rowgroups);
+  hostdevice_2dvector<gpu::EncChunk> chunks(num_columns, num_rowgroups, stream);
+  hostdevice_2dvector<gpu::encoder_chunk_streams> chunk_streams(num_columns, num_rowgroups, stream);
   auto const stream_offsets = streams.compute_offsets(columns, num_rowgroups);
   rmm::device_uvector<uint8_t> encoded_data(stream_offsets.data_size(), stream);
 
@@ -614,23 +552,17 @@ encoded_data writer::impl::encode_columns(host_span<orc_column_view const> colum
         auto const rg_idx = *rg_idx_it;
         auto &ck          = chunks[column.id()][rg_idx];
 
-        ck.start_row  = (rg_idx * row_index_stride_);
-        ck.num_rows   = std::min<uint32_t>(row_index_stride_, column.data_count() - ck.start_row);
-        ck.valid_rows = column.data_count();
+        ck.start_row = (rg_idx * row_index_stride_);
+        ck.num_rows  = std::min<uint32_t>(row_index_stride_, column.data_count() - ck.start_row);
         ck.encoding_kind = column.orc_encoding();
         ck.type_kind     = column.orc_kind();
         if (ck.type_kind == TypeKind::STRING) {
-          ck.valid_map_base   = column.nulls();
-          ck.column_offset    = column.column_offset();
-          ck.column_data_base = (ck.encoding_kind == DICTIONARY_V2)
-                                  ? column.host_stripe_dict(stripe.id)->dict_index
-                                  : column.data();
+          ck.dict_index = (ck.encoding_kind == DICTIONARY_V2)
+                            ? column.host_stripe_dict(stripe.id)->dict_index
+                            : nullptr;
           ck.dtype_len = 1;
         } else {
-          ck.valid_map_base   = column.nulls();
-          ck.column_offset    = column.column_offset();
-          ck.column_data_base = column.data();
-          ck.dtype_len        = column.type_width();
+          ck.dtype_len = column.type_width();
         }
         ck.scale = column.clockscale();
         // Only need to check row groups that end within the stripe
@@ -730,6 +662,8 @@ encoded_data writer::impl::encode_columns(host_span<orc_column_view const> colum
   chunks.host_to_device(stream);
   chunk_streams.host_to_device(stream);
 
+  gpu::set_chunk_columns(view, chunks, stream);
+
   if (!str_col_ids.empty()) {
     auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict();
     gpu::EncodeStripeDictionaries(
@@ -791,8 +725,8 @@ std::vector<std::vector<uint8_t>> writer::impl::gather_statistic_blobs(
   size_t num_chunks        = num_rowgroups * columns.size();
 
   std::vector<std::vector<uint8_t>> stat_blobs(num_stat_blobs);
-  hostdevice_vector<stats_column_desc> stat_desc(columns.size());
-  hostdevice_vector<statistics_merge_group> stat_merge(num_stat_blobs);
+  hostdevice_vector<stats_column_desc> stat_desc(columns.size(), stream);
+  hostdevice_vector<statistics_merge_group> stat_merge(num_stat_blobs, stream);
   rmm::device_uvector<statistics_chunk> stat_chunks(num_chunks + num_stat_blobs, stream);
   rmm::device_uvector<statistics_group> stat_groups(num_chunks, stream);
 
@@ -811,11 +745,8 @@ std::vector<std::vector<uint8_t>> writer::impl::gather_statistic_blobs(
       case TypeKind::STRING: desc->stats_dtype = dtype_string; break;
       default: desc->stats_dtype = dtype_none; break;
     }
-    desc->num_rows         = column.data_count();
-    desc->num_values       = column.data_count();
-    desc->valid_map_base   = column.nulls();
-    desc->column_offset    = column.column_offset();
-    desc->column_data_base = column.data();
+    desc->num_rows   = column.data_count();
+    desc->num_values = column.data_count();
     if (desc->stats_dtype == dtype_timestamp64) {
       // Timestamp statistics are in milliseconds
       switch (column.clockscale()) {
@@ -869,8 +800,8 @@ std::vector<std::vector<uint8_t>> writer::impl::gather_statistic_blobs(
     stat_merge.device_ptr(), stat_chunks.data() + num_chunks, num_stat_blobs, stream);
   stat_merge.device_to_host(stream, true);
 
-  hostdevice_vector<uint8_t> blobs(stat_merge[num_stat_blobs - 1].start_chunk +
-                                   stat_merge[num_stat_blobs - 1].num_chunks);
+  hostdevice_vector<uint8_t> blobs(
+    stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream);
   gpu::orc_encode_statistics(blobs.device_ptr(),
                              stat_merge.device_ptr(),
                              stat_chunks.data() + num_chunks,
@@ -1061,6 +992,22 @@ void writer::impl::init_state()
   out_sink_->host_write(MAGIC, std::strlen(MAGIC));
 }
 
+rmm::device_uvector<size_type> get_string_column_ids(const table_device_view &view,
+                                                     rmm::cuda_stream_view stream)
+{
+  rmm::device_uvector<size_type> string_column_ids(view.num_columns(), stream);
+  auto iter     = thrust::make_counting_iterator<size_type>(0);
+  auto end_iter = thrust::copy_if(rmm::exec_policy(stream),
+                                  iter,
+                                  iter + view.num_columns(),
+                                  string_column_ids.begin(),
+                                  [view] __device__(size_type index) {
+                                    return (view.column(index).type().id() == type_id::STRING);
+                                  });
+  string_column_ids.resize(end_iter - string_column_ids.begin(), stream);
+  return string_column_ids;
+}
+
 void writer::impl::write(table_view const &table)
 {
   CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed");
@@ -1074,6 +1021,9 @@ void writer::impl::write(table_view const &table)
       "be specified");
   }
 
+  auto device_columns    = table_device_view::create(table, stream);
+  auto string_column_ids = get_string_column_ids(*device_columns, stream);
+
   // Wrapper around cudf columns to attach ORC-specific type info
   std::vector<orc_column_view> orc_columns;
   orc_columns.reserve(num_columns);
@@ -1093,9 +1043,15 @@ void writer::impl::write(table_view const &table)
   // Build per-column dictionary indices
   const auto num_rowgroups   = div_by_rowgroups<size_t>(num_rows);
   const auto num_dict_chunks = num_rowgroups * str_col_ids.size();
-  hostdevice_vector<gpu::DictionaryChunk> dict(num_dict_chunks);
+  hostdevice_vector<gpu::DictionaryChunk> dict(num_dict_chunks, stream);
   if (!str_col_ids.empty()) {
-    init_dictionaries(orc_columns.data(), str_col_ids, dict_data.data(), dict_index.data(), &dict);
+    init_dictionaries(*device_columns,
+                      orc_columns.data(),
+                      str_col_ids,
+                      string_column_ids,
+                      dict_data.data(),
+                      dict_index.data(),
+                      &dict);
   }
 
   // Decide stripe boundaries early on, based on uncompressed size
@@ -1103,23 +1059,22 @@ void writer::impl::write(table_view const &table)
 
   // Build stripe-level dictionaries
   const auto num_stripe_dict = stripe_bounds.size() * str_col_ids.size();
-  hostdevice_vector<gpu::StripeDictionary> stripe_dict(num_stripe_dict);
+  hostdevice_vector<gpu::StripeDictionary> stripe_dict(num_stripe_dict, stream);
   if (!str_col_ids.empty()) {
     build_dictionaries(
       orc_columns.data(), str_col_ids, stripe_bounds, dict, dict_index.data(), stripe_dict);
   }
 
   auto streams  = create_streams(orc_columns, stripe_bounds);
-  auto enc_data = encode_columns(orc_columns, str_col_ids, stripe_bounds, streams);
+  auto enc_data = encode_columns(*device_columns, orc_columns, str_col_ids, stripe_bounds, streams);
 
   // Assemble individual disparate column chunks into contiguous data streams
   const auto num_index_streams = (num_columns + 1);
   const auto num_data_streams  = streams.size() - num_index_streams;
-  hostdevice_2dvector<gpu::StripeStream> strm_descs(stripe_bounds.size(), num_data_streams);
+  hostdevice_2dvector<gpu::StripeStream> strm_descs(stripe_bounds.size(), num_data_streams, stream);
   auto stripes =
     gather_stripes(num_rows, num_index_streams, stripe_bounds, &enc_data.streams, &strm_descs);
 
-  auto device_columns = table_device_view::create(table);
   // Gather column statistics
   std::vector<std::vector<uint8_t>> column_stats;
   if (enable_statistics_ && num_columns > 0 && num_rows > 0) {
@@ -1160,8 +1115,8 @@ void writer::impl::write(table_view const &table)
 
   // Compress the data streams
   rmm::device_buffer compressed_data(compressed_bfr_size, stream);
-  hostdevice_vector<gpu_inflate_status_s> comp_out(num_compressed_blocks);
-  hostdevice_vector<gpu_inflate_input_s> comp_in(num_compressed_blocks);
+  hostdevice_vector<gpu_inflate_status_s> comp_out(num_compressed_blocks, stream);
+  hostdevice_vector<gpu_inflate_input_s> comp_in(num_compressed_blocks, stream);
   if (compression_kind_ != NONE) {
     strm_descs.host_to_device(stream);
     gpu::CompressOrcDataStreams(static_cast<uint8_t *>(compressed_data.data()),
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index f0ec3a70cec..352cb11440f 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -186,14 +186,18 @@ class writer::impl {
   /**
    * @brief Builds up column dictionaries indices
    *
+   * @param view Table device view representing input table
    * @param columns List of columns
    * @param str_col_ids List of columns that are strings type
+   * @param d_str_col_ids List of columns that are strings type in device memory
    * @param dict_data Dictionary data memory
    * @param dict_index Dictionary index memory
    * @param dict List of dictionary chunks
    */
-  void init_dictionaries(orc_column_view* columns,
+  void init_dictionaries(const table_device_view& view,
+                         orc_column_view* columns,
                          std::vector<int> const& str_col_ids,
+                         device_span<size_type> d_str_col_ids,
                          uint32_t* dict_data,
                          uint32_t* dict_index,
                          hostdevice_vector<gpu::DictionaryChunk>* dict);
@@ -238,13 +242,15 @@ class writer::impl {
   /**
    * @brief Encodes the input columns into streams.
    *
+   * @param view Table device view representing input table
    * @param columns List of columns
    * @param str_col_ids List of columns that are strings type
    * @param stripe_bounds List of stripe boundaries
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return Encoded data and per-chunk stream descriptors
    */
-  encoded_data encode_columns(host_span<orc_column_view const> columns,
+  encoded_data encode_columns(const table_device_view& view,
+                              host_span<orc_column_view const> columns,
                               std::vector<int> const& str_col_ids,
                               host_span<stripe_rowgroups const> stripe_bounds,
                               orc_streams const& streams);
diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu
index 46d471d5cf7..2676f30474d 100644
--- a/cpp/src/io/parquet/page_dict.cu
+++ b/cpp/src/io/parquet/page_dict.cu
@@ -52,8 +52,10 @@ inline __device__ uint32_t uint64_hash16(uint64_t v)
   return uint32_hash16((uint32_t)(v + (v >> 32)));
 }
 
-inline __device__ uint32_t nvstr_hash16(const uint8_t *p, uint32_t len)
+inline __device__ uint32_t hash_string(const string_view &val)
 {
+  const char *p = val.data();
+  uint32_t len  = val.size_bytes();
   uint32_t hash = len;
   if (len > 0) {
     uint32_t align_p    = 3 & reinterpret_cast<uintptr_t>(p);
@@ -181,7 +183,7 @@ __global__ void __launch_bounds__(block_size, 1)
   } else if (dtype == INT96) {
     dtype_len_in = 8;
   } else {
-    dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len;
+    dtype_len_in = dtype_len;
   }
   __syncthreads();
   while (s->row_cnt < s->ck.num_rows) {
@@ -206,7 +208,7 @@ __global__ void __launch_bounds__(block_size, 1)
         if (dtype == BYTE_ARRAY) {
           auto str1 = s->col.leaf_column->element<string_view>(row);
           len += str1.size_bytes();
-          hash = nvstr_hash16(reinterpret_cast<const uint8_t *>(str1.data()), str1.size_bytes());
+          hash = hash_string(str1);
           // Walk the list of rows with the same hash
           next_addr = &s->hashmap[hash];
           while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) {
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 3b29394686f..51ec0013f1a 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -79,8 +79,10 @@ struct page_enc_state_s {
 /**
  * @brief Return a 12-bit hash from a byte sequence
  */
-inline __device__ uint32_t nvstr_init_hash(const uint8_t *ptr, uint32_t len)
+inline __device__ uint32_t hash_string(const string_view &val)
 {
+  char const *ptr = val.data();
+  uint32_t len    = val.size_bytes();
   if (len != 0) {
     return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1);
   } else {
@@ -199,7 +201,7 @@ __global__ void __launch_bounds__(block_size)
     // dtype_len, which determines how much memory we need to allocate for the fragment.
     dtype_len_in = 8;
   } else {
-    dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len;
+    dtype_len_in = dtype_len;
   }
   __syncthreads();
 
@@ -218,7 +220,7 @@ __global__ void __launch_bounds__(block_size)
         if (dtype == BYTE_ARRAY) {
           auto str = s->col.leaf_column->element<string_view>(val_idx);
           len += str.size_bytes();
-          hash = nvstr_init_hash(reinterpret_cast<const uint8_t *>(str.data()), str.size_bytes());
+          hash = hash_string(str);
         } else if (dtype_len_in == 8) {
           hash = uint64_init_hash(s->col.leaf_column->element<uint64_t>(val_idx));
         } else {
@@ -1059,7 +1061,7 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
   } else if (dtype == INT96) {
     dtype_len_in = 8;
   } else {
-    dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len_out;
+    dtype_len_in = dtype_len_out;
   }
   dict_bits = (dtype == BOOLEAN) ? 1 : (s->page.dict_bits_plus1 - 1);
   if (t == 0) {
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 31baf419f45..1e8a6920ea4 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -969,7 +969,7 @@ void writer::impl::write(table_view const &table)
   }
   // Create table_device_view so that corresponding column_device_view data
   // can be written into col_desc members
-  auto parent_column_table_device_view = table_device_view::create(single_streams_table);
+  auto parent_column_table_device_view = table_device_view::create(single_streams_table, stream);
   rmm::device_uvector<column_device_view> leaf_column_views(0, stream);
 
   // Initialize column description
diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu
index 128bd905259..52f21f0a9ad 100644
--- a/cpp/src/io/statistics/column_stats.cu
+++ b/cpp/src/io/statistics/column_stats.cu
@@ -187,12 +187,6 @@ gatherFloatColumnStats(stats_state_s *s, statistics_dtype dtype, uint32_t t, Sto
   }
 }
 
-// FIXME: Use native libcudf string type
-struct nvstrdesc_s {
-  const char *ptr;
-  size_t count;
-};
-
 /**
  * @brief Gather statistics for string columns
  *
diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h
index d1d414aa7b4..d7895de50ce 100644
--- a/cpp/src/io/statistics/column_stats.h
+++ b/cpp/src/io/statistics/column_stats.h
@@ -45,10 +45,7 @@ struct stats_column_desc {
   uint32_t num_rows;             //!< number of rows in column
   uint32_t num_values;  //!< Number of data values in column. Different from num_rows in case of
                         //!< nested columns
-  const uint32_t *valid_map_base;  //!< base of valid bit map for this column (null if not present)
-  size_type column_offset;         //! < index of the first element relative to the base memory
-  const void *column_data_base;    //!< base ptr to column data
-  int32_t ts_scale;  //!< timestamp scale (>0: multiply by scale, <0: divide by -scale)
+  int32_t ts_scale;     //!< timestamp scale (>0: multiply by scale, <0: divide by -scale)
 
   column_device_view *leaf_column;    //!< Pointer to leaf column
   column_device_view *parent_column;  //!< Pointer to parent column. Is nullptr if not list type.