From 824fca3069e1d1d40e190c4b72f887e1d009df9c Mon Sep 17 00:00:00 2001 From: Janmey Shukla <68988130+Vortexx2@users.noreply.github.com> Date: Fri, 29 Dec 2023 16:00:22 +0530 Subject: [PATCH] Fix handling of strings shorter than `n` in ngram calculation When a provided string has fewer than `n` characters, the ngram function returns an empty list for it. Previously, the result was exploded without first filtering out these empty lists, causing a token to appear erroneously in the output. Now, the empty lists are filtered out before exploding. --- python/cudf/cudf/core/column/string.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7bf81f3e2d3..781d63a0d48 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4842,6 +4842,7 @@ def character_ngrams( result = self._return_or_inplace(lc, retain_index=True) if isinstance(result, cudf.Series) and not as_list: + result = result[result.list.len() > 0] # before exploding, removes those lists which have 0 length return result.explode() return result