diff --git a/Text+Classification+using+python,+scikit+and+nltk.ipynb b/Text+Classification+using+python,+scikit+and+nltk.ipynb index 149df84..e8a4baf 100644 --- a/Text+Classification+using+python,+scikit+and+nltk.ipynb +++ b/Text+Classification+using+python,+scikit+and+nltk.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#Loading the data set - training data.\n", @@ -15,10 +13,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, + "execution_count": 2, + "metadata": {}, "outputs": [ { "data": { @@ -45,7 +41,7 @@ " 'talk.religion.misc']" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -57,10 +53,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -78,10 +72,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [ { "data": { @@ -89,7 +81,7 @@ "(11314, 130107)" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -104,10 +96,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { @@ -115,7 +105,7 @@ "(11314, 130107)" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -130,10 +120,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "# Machine Learning\n", @@ -144,10 +132,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ "# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:\n", @@ -162,10 +148,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { @@ -173,7 +157,7 @@ "0.7738980350504514" ] }, - "execution_count": 15, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -188,26 +172,16 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, + "execution_count": 12, + "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\javedsha\\AppData\\Local\\Continuum\\Anaconda2\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n", - " DeprecationWarning)\n" - ] - }, { "data": { "text/plain": [ - "0.82381837493361654" + "0.8238183749336165" ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -217,7 +191,7 @@ "\n", "from sklearn.linear_model import SGDClassifier\n", "text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),\n", - " ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])\n", + " ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, tol=0.21, random_state=42))])\n", "\n", "text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)\n", "predicted_svm = text_clf_svm.predict(twenty_test.data)\n", @@ -259,9 +233,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -287,9 +259,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -333,9 +303,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -407,7 +375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.13" + "version": "2.7.16" } }, "nbformat": 4, diff --git a/Text+Classification+using+python,+scikit+and+nltk.py b/Text+Classification+using+python,+scikit+and+nltk.py index 8c850bb..b2aef2b 100644 --- a/Text+Classification+using+python,+scikit+and+nltk.py +++ b/Text+Classification+using+python,+scikit+and+nltk.py @@ -72,7 +72,7 @@ from sklearn.linear_model import SGDClassifier text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), - ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))]) + ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, tol=0.21, random_state=42))]) text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target) predicted_svm = text_clf_svm.predict(twenty_test.data)