From 824fca3069e1d1d40e190c4b72f887e1d009df9c Mon Sep 17 00:00:00 2001
From: Janmey Shukla <68988130+Vortexx2@users.noreply.github.com>
Date: Fri, 29 Dec 2023 16:00:22 +0530
Subject: [PATCH] Fix issue with below limit strings in ngram calculation

When an input string is shorter than `n` characters, the ngram function returns an empty list for that row. Previously, the result was exploded without first filtering out these empty lists, which caused an erroneous <NA> token to appear in the output. The empty lists are now filtered out before exploding.
---
 python/cudf/cudf/core/column/string.py | 1 +
 1 file changed, 1 insertion(+)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 7bf81f3e2d3..781d63a0d48 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4842,6 +4842,7 @@ def character_ngrams(
         result = self._return_or_inplace(lc, retain_index=True)
 
         if isinstance(result, cudf.Series) and not as_list:
+            result = result[result.list.len() > 0] # before exploding, removes those lists which have 0 length
             return result.explode()
         return result