piskvorky · tmylk · Jan 29, 2017 · Oct 10, 2016 · Oct 11, 2016 · Oct 16, 2016
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,7 +2,10 @@ Changes
 =======
 
 0.13.3, 2016-09-26
-
+* Added sklearn wrapper for LdaModel (Basic LDA Model) along with relevant test cases and ipynb draft. (@AadityaJ,
+[#932](https://github.com/RaRe-Technologies/gensim/pull/932))
+* Add online learning feature to word2vec. (@isohyt [#900](https://github.com/RaRe-Technologies/gensim/pull/900))
+* Tutorial: Reproducing Doc2vec paper result on wikipedia. (@isohyt, [#654](https://github.com/RaRe-Technologies/gensim/pull/654))
 * Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised, added test cases in test/test_word2vec.py(@pranay360, #883)
 * Added Save/Load interface to AnnoyIndexer for ondex persistence (@fortiema, [#845](https://github.com/RaRe-Technologies/gensim/pull/845))
 * Change export_phrases in Phrases model. Fix issue #794 (@AadityaJ,

diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using wrappers for Scikit learn API"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This tutorial is about using gensim models as a part of your scikit learn workflow with the help of wrappers found at ```gensim.sklearn_integration.base```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The wrappers available (as of now) are:\n",
+    "* LdaModel (```gensim.sklearn_integration.base.LdaModel```), which implements gensim's ```LdaModel``` in a scikit-learn interface"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### LdaModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To use LdaModel, begin by importing the LdaModel wrapper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from gensim.sklearn_integration.base import LdaModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we will create a dummy set of texts and convert it into a corpus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from gensim.corpora import mmcorpus, Dictionary\n",
+    "texts = [['human', 'interface', 'computer'],\n",
+    " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
+    " ['eps', 'user', 'interface', 'system'],\n",
+    " ['system', 'human', 'system', 'eps'],\n",
+    " ['user', 'response', 'time'],\n",
+    " ['trees'],\n",
+    " ['graph', 'trees'],\n",
+    " ['graph', 'minors', 'trees'],\n",
+    " ['graph', 'minors', 'survey']]\n",
+    "dictionary = Dictionary(texts)\n",
+    "corpus = [dictionary.doc2bow(text) for text in texts]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then run the LdaModel on the corpus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, u'0.271*system + 0.181*eps + 0.181*interface + 0.181*human + 0.091*computer + 0.091*user + 0.001*trees + 0.001*graph + 0.001*time + 0.001*minors'), (1, u'0.166*graph + 0.166*trees + 0.111*user + 0.111*survey + 0.111*response + 0.111*minors + 0.111*time + 0.056*computer + 0.056*system + 0.001*human')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model=LdaModel(n_topics=2,id2word=dictionary,n_iter=20, random_state=1)\n",
+    "model.fit(corpus)\n",
+    "print model.print_topics(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py
@@ -0,0 +1,6 @@
+"""scikit learn wrapper for gensim
+Contains various gensim-based implementations
+that follow scikit-learn conventions.
+See [1] for the complete set of conventions.
+[1] http://scikit-learn.org/stable/developers/
+"""
diff --git a/gensim/sklearn_integration/base.py b/gensim/sklearn_integration/base.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2011 Radim Rehurek <[email protected]>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+#
+"""
+scikit learn interface for gensim for easy use of gensim with scikit-learn
+"""
+import numpy as np
+import gensim.models.ldamodel
+
+
+class BaseClass(object):
+    """Placeholder parent class, intended to be inherited by future wrappers."""
+
+    def __init__(self):
+        """Nothing to initialise yet; reserved for future use."""
+
+    def run(self):
+        """Return a fixed array of three zeros (only used for testing)."""
+        zeros = [0, 0, 0]
+        return np.array(zeros)
+
+
+class LdaModel(object):
+    """
+    Scikit-learn style wrapper around gensim's LDA model.
+
+    The constructor only stores hyper-parameters. The gensim model is built
+    by `fit` and kept in `self.lda_model`; every other method delegates to
+    that object, so they require either a prior `fit` call or an
+    `lda_model` passed to the constructor.
+    """
+
+    def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=None,
+                 refresh=10, lda_model=None, id2word=None, passes=20, ex=None):
+        """
+        Store the hyper-parameters; no training happens here.
+
+        Wrapper argument -> keyword passed to `gensim.models.LdaModel` by `fit`:
+
+        n_topics     : num_topics
+        n_iter       : iterations
+        passes       : passes
+        refresh      : update_every
+        alpha        : alpha
+        eta          : eta
+        random_state : random_state
+        id2word      : id2word
+
+        lda_model : optional existing model object to delegate to; `fit` replaces it.
+        ex        : accepted for backward compatibility; not used by this class.
+
+        Raises ValueError if `alpha` or `eta` is not larger than zero.
+        """
+        # perform appropriate checks
+        if alpha <= 0:
+            raise ValueError("alpha value must be larger than zero")
+        if eta <= 0:
+            raise ValueError("eta value must be larger than zero")
+        self.n_topics = n_topics
+        self.n_iter = n_iter
+        self.alpha = alpha
+        self.eta = eta
+        self.random_state = random_state
+        self.refresh = refresh
+        self.id2word = id2word
+        self.passes = passes
+        # use lda_model variable as object
+        self.lda_model = lda_model
+
+    def get_params(self, deep=True):
+        """
+        Return the stored constructor parameters as a dict.
+
+        `deep` is accepted for scikit-learn API compatibility only; the same
+        dict is returned for both values. (Returning None for `deep=False`,
+        as before, breaks callers such as `sklearn.base.clone`, which asks
+        for the shallow parameters.) Every key is a valid constructor
+        keyword, so `LdaModel(**model.get_params())` rebuilds an equivalent
+        wrapper.
+        """
+        return {
+            "n_topics": self.n_topics,
+            "n_iter": self.n_iter,
+            "alpha": self.alpha,
+            "eta": self.eta,
+            "random_state": self.random_state,
+            "refresh": self.refresh,
+            "lda_model": self.lda_model,
+            "id2word": self.id2word,
+            "passes": self.passes,
+        }
+
+    def set_params(self, **parameters):
+        """
+        Set each given keyword as an attribute on this object and return `self`.
+
+        Values are not re-validated, and an already fitted `lda_model` is
+        not retrained; call `fit` again for new values to take effect.
+        """
+        for parameter, value in parameters.items():
+            # Built-in setattr: instances have no `setattr` method of their own.
+            setattr(self, parameter, value)
+        return self
+
+    def fit(self, X, y=None):
+        """
+        Train a `gensim.models.LdaModel` on corpus `X` and store it in `self.lda_model`.
+
+        `X` is handed to gensim unchanged as `corpus`, so it still has to be
+        preprocessed the gensim way (e.g. the bag-of-words lists built with
+        `Dictionary.doc2bow` in the accompanying notebook). `y` is ignored.
+
+        The call made is:
+        gensim.models.LdaModel(corpus=X, num_topics=n_topics, id2word=id2word, passes=passes,
+                               update_every=refresh, alpha=alpha, iterations=n_iter, eta=eta,
+                               random_state=random_state)
+
+        Raises AttributeError if `X` is None.
+        Returns the trained gensim model.
+        """
+        if X is None:
+            raise AttributeError("Corpus defined as none")
+        self.lda_model = gensim.models.LdaModel(
+            corpus=X, num_topics=self.n_topics, id2word=self.id2word, passes=self.passes,
+            update_every=self.refresh, alpha=self.alpha, iterations=self.n_iter,
+            eta=self.eta, random_state=self.random_state)
+        # NOTE(review): scikit-learn estimators conventionally return `self` from
+        # fit(); the gensim model is returned here to keep existing callers working.
+        return self.lda_model
+
+    def print_topics(self, n_topics=20, num_words=20, log=True):
+        """
+        Return the result of `show_topics` on the underlying model.
+
+        All three arguments are forwarded (`n_topics` as `num_topics`).
+        Nothing is printed by this wrapper itself.
+        """
+        return self.lda_model.show_topics(num_topics=n_topics, num_words=num_words, log=log)
+
+    def transform(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
+        """
+        Take a new document (`bow`) and return its topic distribution,
+        as a list of (topic_id, topic_probability) 2-tuples.
+
+        Delegates to `get_document_topics` on the underlying model, forwarding
+        all keyword arguments.
+        """
+        # TODO: might need to do more here
+        return self.lda_model.get_document_topics(
+            bow, minimum_probability=minimum_probability, minimum_phi_value=minimum_phi_value,
+            per_word_topics=per_word_topics)
+
+    def get_term_topics(self, wordid, minimum_probability=None):
+        """
+        Return the most likely topics associated with a particular word.
+
+        Delegates to `get_term_topics` on the underlying model; pass the
+        word id or simply the word itself.
+        """
+        return self.lda_model.get_term_topics(wordid, minimum_probability=minimum_probability)
+
+    def get_topic_terms(self, topicid, topn=10):
+        """
+        Return (wordid, probability) tuples for the given topic.
+
+        Delegates to `get_topic_terms` on the underlying model; `topn`
+        restricts how many terms are returned.
+        """
+        return self.lda_model.get_topic_terms(topicid=topicid, topn=topn)
diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py
@@ -0,0 +1,32 @@
+import six
+import unittest
+
+from gensim.sklearn_integration import base
+from gensim.corpora import Dictionary
+# Toy fixtures shared by the tests below: tokenised documents, the Dictionary
+# built from them, and the bag-of-words corpus passed to LdaModel.fit().
+texts = [['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']]
+dictionary = Dictionary(texts)
+corpus = [dictionary.doc2bow(text) for text in texts]
+
+
+class TestLdaModel(unittest.TestCase):
+    """Tests for the scikit-learn style LdaModel wrapper."""
+
+    def setUp(self):
+        # Built in setUp rather than __init__ so that unittest can construct the
+        # test case itself and each test starts from a freshly fitted model.
+        self.model = base.LdaModel(id2word=dictionary, n_topics=2, passes=100)
+        self.model.fit(corpus)
+
+    def testPrintTopic(self):
+        """print_topics yields (topic id, formatted topic string) pairs."""
+        # Local import: `numbers` is not among this module's top-level imports.
+        import numbers
+
+        topic = self.model.print_topics(2)
+
+        for k, v in topic:
+            # Integral rather than int, in case the ids come back as numpy integers.
+            self.assertTrue(isinstance(k, numbers.Integral))
+            self.assertTrue(isinstance(v, six.string_types))
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()