From a72050522829c4643bb8ac4527e619825ccdaa6f Mon Sep 17 00:00:00 2001
From: Xinyi <14123142d@connect.polyu.hk>
Date: Sat, 12 Jan 2019 01:57:59 +0800
Subject: [PATCH] Set `metadata=True` for `make_wikicorpus` script by default (#2245)

* Add metadata for wiki examples (#1322)

* Add metadata for wiki examples (#1322)

* update output list of files

* upd docstring

* three -> several
---
 docs/src/wiki.rst                 | 14 ++++++++++++++
 gensim/scripts/make_wikicorpus.py | 18 ++++++++++--------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/docs/src/wiki.rst b/docs/src/wiki.rst
index bc148729d4..8bc21a897d 100644
--- a/docs/src/wiki.rst
+++ b/docs/src/wiki.rst
@@ -160,6 +160,20 @@ Unlike LSA, the topics coming from LDA are easier to interpret
     topic #17: 0.049*indonesia + 0.042*indonesian + 0.031*malaysia + 0.024*singapore + 0.022*greek + 0.021*jakarta + 0.016*greece + 0.015*dord + 0.014*athens + 0.011*malaysian
     topic #18: 0.031*stakes + 0.029*webs + 0.018*futsal + 0.014*whitish + 0.013*hyun + 0.012*thoroughbred + 0.012*dnf + 0.012*jockey + 0.011*medalists + 0.011*racehorse
     topic #19: 0.119*oblast + 0.034*uploaded + 0.034*uploads + 0.033*nordland + 0.025*selsoviet + 0.023*raion + 0.022*krai + 0.018*okrug + 0.015*hålogaland + 0.015*russiae + 0.020*manga + 0.017*dragon + 0.012*theme + 0.011*dvd + 0.011*super + 0.011*hunter + 0.009*ash + 0.009*dream + 0.009*angel
+    >>>
+    >>> import pickle  # noqa: E402
+    >>>
+    >>> # Get an article and its topic distribution
+    >>> with open("wiki_en_bow.mm.metadata.cpickle", 'rb') as meta_file:
+    ...     docno2metadata = pickle.load(meta_file)
+    >>>
+    >>> doc_num = 0
+    >>> print("Title: {}".format(docno2metadata[doc_num][1]))  # take the first article as an example
+    Title: Anarchism
+    >>>
+    >>> vec = mm[doc_num]  # get tf-idf vector
+    >>> lda.get_document_topics(vec)
+    [(1, 0.028828567), (10, 0.32766217), (36, 0.021675354), (55, 0.2521854), (57, 0.27154338)]

 Creating this LDA model of Wikipedia takes about 6 hours and 20 minutes on my laptop [1]_.
 If you need your results faster, consider running :doc:`dist_lda` on a cluster of
diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py
index 0ec9704724..49c249a707 100755
--- a/gensim/scripts/make_wikicorpus.py
+++ b/gensim/scripts/make_wikicorpus.py
@@ -12,13 +12,15 @@
 Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
 bz2-compressed dump of Wikipedia articles, in XML format.

-This actually creates three files:
+This actually creates several files:

-* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
-* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
-  Matrix Matrix format
-* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
-* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
+* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids
+* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation in Matrix Market format
+* `OUTPUT_PREFIX_bow.mm.index`: index for `OUTPUT_PREFIX_bow.mm`
+* `OUTPUT_PREFIX_bow.mm.metadata.cpickle`: titles of documents
+* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation in Matrix Market format
+* `OUTPUT_PREFIX_tfidf.mm.index`: index for `OUTPUT_PREFIX_tfidf.mm`
+* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model

 The output Matrix Market files can then be compressed (e.g., by bzip2) to save
 disk space; gensim's corpus iterators can work with compressed input, too.
@@ -80,7 +82,7 @@
         dictionary.allow_update = True  # start collecting document frequencies
         wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
         # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
-        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
+        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
         # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
         dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
         dictionary.save_as_text(outp + '_wordids.txt.bz2')
@@ -91,7 +93,7 @@
         # only keep the most frequent words (out of total ~8.2m unique tokens)
         wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
         # save dictionary and bag-of-words (term-document frequency matrix)
-        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
+        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)  # another ~9h
         wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
         # load back the id->word mapping directly from file
         # this seems to save more memory, compared to keeping the wiki.dictionary object from above
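
For reference, a minimal usage sketch of the metadata file this patch turns on, along the lines of the doctest added to docs/src/wiki.rst above. The `wiki_en` output prefix and the file paths are assumptions matching the docs example, not part of the patch:

    # Sketch: consume the docno -> (page id, title) mapping written by
    # MmCorpus.serialize(..., metadata=True) when make_wikicorpus.py runs.
    import pickle

    from gensim.corpora import MmCorpus

    # tf-idf corpus written by the script; random access works because the
    # matching .index file is written alongside it
    mm = MmCorpus('wiki_en_tfidf.mm')

    # metadata is pickled next to the bag-of-words corpus
    with open('wiki_en_bow.mm.metadata.cpickle', 'rb') as meta_file:
        docno2metadata = pickle.load(meta_file)

    doc_num = 0
    page_id, title = docno2metadata[doc_num]  # e.g. title == 'Anarchism' in the docs example
    print("doc #{}: Wikipedia page id {}, title {!r}".format(doc_num, page_id, title))
    print(mm[doc_num][:10])  # first few (token id, tf-idf weight) pairs for that document

The point of the default change is exactly this: without `metadata=True` the document numbers in the serialized corpus cannot be mapped back to article titles, so the `.metadata.cpickle` file is now produced by default.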