From c30df48a3a379f6500c46012c25a233531d52b31 Mon Sep 17 00:00:00 2001 From: Benjamin Piwowarski Date: Sun, 10 Mar 2024 08:23:20 +0100 Subject: [PATCH] fix: padding with new numpy versions --- docs/source/text/index.rst | 1 + src/xpmir/neural/interaction/drmm.py | 8 +++++++- src/xpmir/utils/iter.py | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/text/index.rst b/docs/source/text/index.rst index 7c86aff..4b0ba02 100644 --- a/docs/source/text/index.rst +++ b/docs/source/text/index.rst @@ -56,3 +56,4 @@ Adapters ******** .. autoxpmconfig:: xpmir.text.adapters.MeanTextEncoder +.. autoxpmconfig:: xpmir.text.adapters.TopicTextConverter diff --git a/src/xpmir/neural/interaction/drmm.py b/src/xpmir/neural/interaction/drmm.py index 31592fd..7704cf0 100644 --- a/src/xpmir/neural/interaction/drmm.py +++ b/src/xpmir/neural/interaction/drmm.py @@ -122,11 +122,17 @@ def _encode( options: TokenizerOptions, ) -> SimilarityInputWithTokens: encoded = encoder(texts, options=options) + + max_len = max(encoded.tokenized.lens) + padded_tokens = [ + (t + [""] * (max_len - len(t))) for t in encoded.tokenized.tokens + ] + return self.similarity.preprocess( SimilarityInputWithTokens( encoded.value, encoded.tokenized.mask, - np.array(encoded.tokenized.tokens), + np.array(padded_tokens, dtype=str), ) ) diff --git a/src/xpmir/utils/iter.py b/src/xpmir/utils/iter.py index 2734f1e..62a29b9 100644 --- a/src/xpmir/utils/iter.py +++ b/src/xpmir/utils/iter.py @@ -406,13 +406,14 @@ def start(self): def close(self): if self.mp_iterator: + atexit.unregister(self.close) self.stop_process.set() try: # Try to remove an item from the queue just in case next(self.mp_iterator) finally: + self.mp_iterator = None logging.info("Signaled the mp_iterator to quit") - atexit.unregister(self.close) def detach(self): """Produces an iterator only based on the multiprocess queue (useful