Set metadata=True for make_wikicorpus script by default (#2245)
* Add metadata for wiki examples (#1322)

* update output list of files

* update docstring

* three -> several
Xinyi2016 authored and menshikh-iv committed Jan 11, 2019
1 parent 13b52a2 commit a720505
Showing 2 changed files with 24 additions and 8 deletions.
14 changes: 14 additions & 0 deletions docs/src/wiki.rst
@@ -160,6 +160,20 @@ Unlike LSA, the topics coming from LDA are easier to interpret
topic #17: 0.049*indonesia + 0.042*indonesian + 0.031*malaysia + 0.024*singapore + 0.022*greek + 0.021*jakarta + 0.016*greece + 0.015*dord + 0.014*athens + 0.011*malaysian
topic #18: 0.031*stakes + 0.029*webs + 0.018*futsal + 0.014*whitish + 0.013*hyun + 0.012*thoroughbred + 0.012*dnf + 0.012*jockey + 0.011*medalists + 0.011*racehorse
topic #19: 0.119*oblast + 0.034*uploaded + 0.034*uploads + 0.033*nordland + 0.025*selsoviet + 0.023*raion + 0.022*krai + 0.018*okrug + 0.015*hålogaland + 0.015*russiae + 0.020*manga + 0.017*dragon + 0.012*theme + 0.011*dvd + 0.011*super + 0.011*hunter + 0.009*ash + 0.009*dream + 0.009*angel
>>>
>>> import pickle # noqa: E402
>>>
>>> # Get an article and its topic distribution
>>> with open("wiki_en_bow.mm.metadata.cpickle", 'rb') as meta_file:
... docno2metadata = pickle.load(meta_file)
>>>
>>> doc_num = 0
>>> print("Title: {}".format(docno2metadata[doc_num][1])) # take the first article as an example
Title: Anarchism
>>>
>>> vec = mm[doc_num] # get tf-idf vector
>>> lda.get_document_topics(vec)
[(1, 0.028828567), (10, 0.32766217), (36, 0.021675354), (55, 0.2521854), (57, 0.27154338)]
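
A possible follow-up, not part of this commit: the (topic_id, probability) pairs returned by get_document_topics can be mapped back to words with the model's print_topic. A minimal sketch, assuming the lda model built earlier on this page:

>>> top_topic, top_prob = max(lda.get_document_topics(vec), key=lambda item: item[1])
>>> print(lda.print_topic(top_topic, topn=5))  # top words of the article's dominant topic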

Creating this LDA model of Wikipedia takes about 6 hours and 20 minutes on my laptop [1]_.
If you need your results faster, consider running :doc:`dist_lda` on a cluster of
18 changes: 10 additions & 8 deletions gensim/scripts/make_wikicorpus.py
@@ -12,13 +12,15 @@
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.
This actually creates three files:
This actually creates several files:
* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
Matrix Matrix format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation in Matrix Market format
* `OUTPUT_PREFIX_bow.mm.index`: index for `OUTPUT_PREFIX_bow.mm`
* `OUTPUT_PREFIX_bow.mm.metadata.cpickle`: titles of documents
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation in Matrix Market format
* `OUTPUT_PREFIX_tfidf.mm.index`: index for `OUTPUT_PREFIX_tfidf.mm`
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model
The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.
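
For reference (not part of this diff), the files listed above can later be loaded back with gensim's standard loaders. A minimal sketch, assuming the `wiki_en` prefix used in the wiki.rst example above; exact handling of the bz2-compressed word-id file may vary with the gensim version:

import pickle

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

# word <-> id mapping saved via dictionary.save_as_text()
# (recent gensim versions read the .bz2 file directly; older ones may need it decompressed first)
id2word = Dictionary.load_from_text('wiki_en_wordids.txt.bz2')

# bag-of-words and TF-IDF corpora in Matrix Market format
bow_corpus = MmCorpus('wiki_en_bow.mm')
tfidf_corpus = MmCorpus('wiki_en_tfidf.mm')

# the fitted TF-IDF model
tfidf = TfidfModel.load('wiki_en.tfidf_model')

# docno -> metadata mapping written because of metadata=True;
# the article title is the second element, as in the wiki.rst example above
with open('wiki_en_bow.mm.metadata.cpickle', 'rb') as f:
    docno2metadata = pickle.load(f)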
@@ -80,7 +82,7 @@
dictionary.allow_update = True # start collecting document frequencies
wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
# ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
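# metadata=True also writes outp + '_bow.mm.metadata.cpickle', mapping document ids to article titles (see docstring above)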
# with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
dictionary.save_as_text(outp + '_wordids.txt.bz2')
@@ -91,7 +93,7 @@
# only keep the most frequent words (out of total ~8.2m unique tokens)
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
# save dictionary and bag-of-words (term-document frequency matrix)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True) # another ~9h
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
