Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove __getitem__ code duplication in gensim.models.phrases #2206

Merged
merged 5 commits into from
Oct 4, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 48 additions & 45 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,52 @@ def load(cls, *args, **kwargs):
return model


def _sentence2token(phrase_class, sentence):
    """Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter.

    This function is used by :meth:`~gensim.models.phrases.Phrases.__getitem__` and
    :meth:`~gensim.models.phrases.Phraser.__getitem__`.

    Parameters
    ----------
    phrase_class : :class:`~gensim.models.phrases.Phrases` or :class:`~gensim.models.phrases.Phraser`
        The model whose settings (delimiter, threshold, scoring, ...) drive phrase detection.
    sentence : {list of str, iterable of list of str}
        Sentence or text corpus.

    Returns
    -------
    {list of str, :class:`~gensim.interfaces.TransformedCorpus`}
        `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences
        if the input was a corpus.
    """
    is_single, sentence = _is_single(sentence)
    if not is_single:
        # if the input is an entire corpus (rather than a single sentence),
        # return an iterable stream.
        return phrase_class._apply(sentence)

    delimiter = phrase_class.delimiter
    if hasattr(phrase_class, 'vocab'):
        # Phrases instances carry raw vocabulary counts, so a full scorer can be built;
        # Phraser instances have no `vocab` and rely on their own score_item redefinition
        # (scorer=None in that case).
        scorer = ft.partial(
            phrase_class.scoring,
            len_vocab=float(len(phrase_class.vocab)),
            min_count=float(phrase_class.min_count),
            corpus_word_count=float(phrase_class.corpus_word_count))
    else:
        scorer = None
    bigrams = phrase_class.analyze_sentence(sentence, threshold=phrase_class.threshold,
        common_terms=phrase_class.common_terms, scorer=scorer)

    new_s = []
    for words, score in bigrams:
        if score is not None:
            # a detected phrase: join its component tokens with the delimiter
            words = delimiter.join(words)
        new_s.append(words)
    return [utils.to_unicode(w) for w in new_s]


class Phrases(SentenceAnalyzer, PhrasesTransformation):
"""Detect phrases based on collocation counts."""

Expand Down Expand Up @@ -597,33 +643,7 @@ def __getitem__(self, sentence):
"""
warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")

delimiter = self.delimiter # delimiter used for lookup

is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
return self._apply(sentence)

delimiter = self.delimiter
bigrams = self.analyze_sentence(
sentence,
threshold=self.threshold,
common_terms=self.common_terms,
scorer=ft.partial(
self.scoring,
len_vocab=float(len(self.vocab)),
min_count=float(self.min_count),
corpus_word_count=float(self.corpus_word_count),
),
)
new_s = []
for words, score in bigrams:
if score is not None:
words = delimiter.join(words)
new_s.append(words)

return [utils.to_unicode(w) for w in new_s]
return _sentence2token(self, sentence)


def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
Expand Down Expand Up @@ -855,24 +875,7 @@ def __getitem__(self, sentence):
[u'graph_minors']

"""
is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
return self._apply(sentence)

delimiter = self.delimiter
bigrams = self.analyze_sentence(
sentence,
threshold=self.threshold,
common_terms=self.common_terms,
scorer=None) # we will use our score_item function redefinition
new_s = []
for words, score in bigrams:
if score is not None:
words = delimiter.join(words)
new_s.append(words)
return [utils.to_unicode(w) for w in new_s]
return _sentence2token(self, sentence)


if __name__ == '__main__':
Expand Down