Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove pattern dependency #3012

Merged
merged 18 commits into from
Jan 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,18 @@ data
*.inv
*.js
docs/_images/

#
# Generated by Cython
#
gensim/_matutils.c
gensim/corpora/_mmreader.c
gensim/models/doc2vec_corpusfile.cpp
gensim/models/doc2vec_inner.cpp
gensim/models/fasttext_corpusfile.cpp
gensim/models/fasttext_inner.c
gensim/models/nmf_pgd.c
gensim/models/word2vec_corpusfile.cpp
gensim/models/word2vec_inner.c

.ipynb_checkpoints
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ Production stability is important to Gensim, so we're improving the process of *
* [#2926](https://github.com/RaRe-Technologies/gensim/pull/2926): Rename `num_words` to `topn` in dtm_coherence, by [@MeganStodel](https://github.com/MeganStodel)
* [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937): Remove Keras dependency, by [@piskvorky](https://github.com/piskvorky)
* Removed all code, methods, attributes and functions marked as deprecated in [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3).
* Removed pattern dependency (PR [#3012](https://github.com/RaRe-Technologies/gensim/pull/3012), [@mpenkov](https://github.com/mpenkov)). If you need to lemmatize, do it prior to passing the corpus to gensim.

---

Expand Down
1 change: 0 additions & 1 deletion docs/src/apiref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ Modules:
scripts/make_wikicorpus
scripts/word2vec_standalone
scripts/make_wiki_online
scripts/make_wiki_online_lemma
scripts/make_wiki_online_nodebug
scripts/word2vec2tensor
scripts/segment_wiki
Expand Down
9 changes: 0 additions & 9 deletions docs/src/scripts/make_wiki_online_lemma.rst

This file was deleted.

4 changes: 1 addition & 3 deletions gensim/corpora/_mmreader.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ from __future__ import with_statement

from gensim import utils

from six import string_types
from six.moves import range
import logging

cimport cython
Expand Down Expand Up @@ -187,7 +185,7 @@ cdef class MmReader():

if offset == -1:
return []
if isinstance(self.input, string_types):
if isinstance(self.input, str):
fin, close_fin = utils.open(self.input, 'rb'), True
else:
fin, close_fin = self.input, False
Expand Down
30 changes: 13 additions & 17 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,8 @@

Notes
-----
If you have the `pattern <https://github.com/clips/pattern>`_ package installed,
this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer).

See :mod:`gensim.scripts.make_wiki` for a canned (example) command-line script based on this module.

"""

import bz2
Expand Down Expand Up @@ -467,9 +464,8 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,

Parameters
----------
args : (str, bool, str, int)
Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
page identificator.
args : (str, str, int)
Article text, article title, page identifier.
tokenizer_func : function
Function for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Needs to have interface:
Expand All @@ -487,12 +483,9 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
List of tokens from article, title and page id.

"""
text, lemmatize, title, pageid = args
text, title, pageid = args
text = filter_wiki(text)
if lemmatize:
result = utils.lemmatize(text)
else:
result = tokenizer_func(text, token_min_len, token_max_len, lower)
result = tokenizer_func(text, token_min_len, token_max_len, lower)
return result, title, pageid


Expand Down Expand Up @@ -574,7 +567,7 @@ class WikiCorpus(TextCorpus):
>>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping

"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
def __init__(self, fname, processes=None, lemmatize=None, dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
"""Initialize the corpus.
Expand All @@ -588,9 +581,6 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
Path to the Wikipedia dump file.
processes : int, optional
Number of processes to run, defaults to `max(1, number of cpu - 1)`.
lemmatize : bool
Use lemmatization instead of simple regexp tokenization.
Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
Dictionary, if not provided, this scans the corpus once, to determine its vocabulary
**IMPORTANT: this needs a really long time**.
Expand Down Expand Up @@ -618,14 +608,20 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

"""
if lemmatize is not None:
raise NotImplementedError(
'The lemmatize parameter is no longer supported. '
'If you need to lemmatize, use e.g. <https://github.com/clips/pattern>. '
'Perform lemmatization as part of your tokenization function and '
'pass it as the tokenizer_func parameter to this initializer.'
)
self.fname = fname
self.filter_namespaces = filter_namespaces
self.filter_articles = filter_articles
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
self.tokenizer_func = tokenizer_func
self.article_min_tokens = article_min_tokens
self.token_min_len = token_min_len
Expand Down Expand Up @@ -677,7 +673,7 @@ def get_texts(self):

tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
texts = (
(text, self.lemmatize, title, pageid, tokenization_params)
(text, title, pageid, tokenization_params)
for title, text, pageid
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles)
)
Expand Down
1 change: 0 additions & 1 deletion gensim/models/word2vec_corpusfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import cython
import numpy as np

from gensim.utils import any2utf8
from six import iteritems

cimport numpy as np

Expand Down
1 change: 0 additions & 1 deletion gensim/scripts/make_wiki_lemma.py

This file was deleted.

112 changes: 0 additions & 112 deletions gensim/scripts/make_wiki_online_lemma.py

This file was deleted.

9 changes: 2 additions & 7 deletions gensim/scripts/make_wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,6 @@
removing tokens that appear in more than 10%% of all documents). Defaults to
100,000.

If you have the `pattern` package installed, this script will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .

Example:
python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
"""
Expand Down Expand Up @@ -74,13 +70,12 @@
else:
keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
dictionary = HashDictionary(id_range=keep_words, debug=debug)
dictionary.allow_update = True # start collecting document frequencies
wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
wiki = WikiCorpus(inp, dictionary=dictionary)
# ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
# with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
Expand All @@ -89,7 +84,7 @@
wiki.save(outp + '_corpus.pkl.bz2')
dictionary.allow_update = False
else:
wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
wiki = WikiCorpus(inp) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
# only keep the most frequent words (out of total ~8.2m unique tokens)
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
# save dictionary and bag-of-words (term-document frequency matrix)
Expand Down
13 changes: 8 additions & 5 deletions gensim/scripts/segment_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ class _WikiSectionsCorpus(WikiCorpus):
"""

def __init__(self, fileobj, min_article_character=200, processes=None,
lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False):
lemmatize=None, filter_namespaces=('0',), include_interlinks=False):
"""
Parameters
----------
Expand All @@ -278,22 +278,25 @@ def __init__(self, fileobj, min_article_character=200, processes=None,
Minimum number of characters per article (excluding titles and leading gaps).
processes : int, optional
Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
lemmatize : bool, optional
piskvorky marked this conversation as resolved.
Show resolved Hide resolved
If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
Otherwise, use simple regexp tokenization.
filter_namespaces : tuple of str, optional
Enumeration of namespaces that will be ignored.
include_interlinks: bool
Whether or not interlinks should be included in the output

"""
if lemmatize is not None:
raise NotImplementedError(
'The lemmatize parameter is no longer supported since Gensim 4.0.0. '
'If you need to lemmatize, use e.g. https://github.com/clips/pattern '
'to preprocess your corpus before submitting it to Gensim.'
)

self.fileobj = fileobj
self.filter_namespaces = filter_namespaces
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
self.min_article_character = min_article_character
self.include_interlinks = include_interlinks

Expand Down
Loading