From 3bebf215239e2d2e4e4b0725079d1f611be9b315 Mon Sep 17 00:00:00 2001 From: Lucas Moura Date: Sat, 28 May 2016 20:29:58 -0300 Subject: [PATCH] Use sklearn stopwords instead of external file --- apprecommender/config.py | 1 - apprecommender/data.py | 19 --- apprecommender/decider.py | 4 +- apprecommender/initialize.py | 5 - .../tests/test_ml/test_pkg_classification.py | 18 ++- data/stopwords | 153 ------------------ 6 files changed, 12 insertions(+), 188 deletions(-) delete mode 100644 data/stopwords diff --git a/apprecommender/config.py b/apprecommender/config.py index 06a086f..73038e4 100644 --- a/apprecommender/config.py +++ b/apprecommender/config.py @@ -64,7 +64,6 @@ def __init__(self): self.axi_programs = os.path.join(self.base_dir, "axi_programs") self.axi_desktopapps = os.path.join(self.base_dir, "axi_desktopapps") - self.stopwords = os.path.join(self.filters_dir, 'stopwords') # popcon indexes self.index_mode = "old" # check if there are popcon indexes available diff --git a/apprecommender/data.py b/apprecommender/data.py index a70c7e5..8deeac7 100644 --- a/apprecommender/data.py +++ b/apprecommender/data.py @@ -41,7 +41,6 @@ from apprecommender.data_classification import time_weight from apprecommender.dissimilarity import JaccardDistance from apprecommender.error import Error -from apprecommender.singleton import Singleton def axi_get_pkgs(axi): @@ -214,24 +213,6 @@ def split_pkg_data(user_pkg, partition_size): return round_partition -class StopWords(Singleton): - - def __init__(self): - self._stopwords = set() - - @property - def stopwords(self): - if not self._stopwords: - stopwords_path = Config().stopwords - with open(stopwords_path, 'r') as stopwords: - for word in stopwords: - self._stopwords.add(word.strip()) - - return self._stopwords - else: - return self._stopwords - - class FilteredXapianIndex(xapian.WritableDatabase): """ diff --git a/apprecommender/decider.py b/apprecommender/decider.py index 9a1a7c2..3627e87 100644 --- a/apprecommender/decider.py +++ b/apprecommender/decider.py @@ -3,7 +3,7 @@ import re import xapian -from apprecommender.data import StopWords +from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS class PkgMatchDecider(xapian.MatchDecider): @@ -109,7 +109,7 @@ class FilterDescription(xapian.ExpandDecider): def __init__(self): xapian.ExpandDecider.__init__(self) - self.stop_words = StopWords().stopwords + self.stop_words = ENGLISH_STOP_WORDS def __call__(self, term): """ diff --git a/apprecommender/initialize.py b/apprecommender/initialize.py index eca39e9..f11a3dc 100644 --- a/apprecommender/initialize.py +++ b/apprecommender/initialize.py @@ -187,7 +187,6 @@ def prepare_data(self): self.save_list(pkgs, pkgs_path) self.indexer_axi('sample', pkgs_path) - self.move_stopwords() def get_role_program_pkgs(self): command = "cat /var/lib/debtags/package-tags | " \ @@ -200,7 +199,3 @@ def get_role_program_pkgs(self): def save_list(self, elements, path): with open(path, 'w') as text: text.write('\n'.join(elements) + '\n') - - def move_stopwords(self): - filters_dir = self.config.filters_dir - shutil.copyfile('../data/stopwords', filters_dir + '/stopwords') diff --git a/apprecommender/tests/test_ml/test_pkg_classification.py b/apprecommender/tests/test_ml/test_pkg_classification.py index e12c8a9..3ba0ba8 100644 --- a/apprecommender/tests/test_ml/test_pkg_classification.py +++ b/apprecommender/tests/test_ml/test_pkg_classification.py @@ -29,16 +29,18 @@ def test_get_pkg_debtags(self): self.assertTrue(debtag in vim_debtags_result) def test_get_pkg_terms(self): - vim_terms = [u'almost', u'compat', u'version', u'editor', u'new', - u'featur', u'ad', u'multi', u'level', u'undo', u'syntax', - u'highlight', u'command', u'line', u'histori', - u'help', u'filenam', u'complet', u'block', u'oper', - u'fold', u'support', u'etc', u'packag', u'contain', - u'version', u'vim', u'compil', u'rather', u'standard', - u'set', u'featur', u'packag', u'provid', u'version', - u'packag', u'need', u'less'] + vim_terms = [u'vim', u'compat', u'version', u'unix', u'editor', u'vi', + u'new', u'featur', u'ad', u'multi', u'level', u'undo', + u'syntax', u'highlight', u'command', u'line', u'histori', + u'line', u'help', u'filenam', u'complet', u'block', + u'oper', u'fold', u'unicod', u'support', u'packag', + u'contain', u'version', u'vim', u'compil', u'standard', + u'set', u'featur', u'packag', u'doe', u'provid', u'gui', + u'version', u'vim', u'vim', u'packag', u'need'] vim_terms_result = self.ml_data.get_pkg_terms(self.cache, 'vim') + print vim_terms_result + for term in vim_terms: self.assertTrue(term in vim_terms_result) diff --git a/data/stopwords b/data/stopwords deleted file mode 100644 index d075cc0..0000000 --- a/data/stopwords +++ /dev/null @@ -1,153 +0,0 @@ -i -me -my -myself -we -our -ours -ourselves -you -your -yours -yourself -yourselves -he -him -his -himself -she -her -hers -herself -it -its -itself -they -them -their -theirs -themselves -what -which -who -whom -this -that -these -those -am -is -are -was -were -be -been -being -have -has -had -having -do -does -did -doing -a -an -the -and -but -if -or -because -as -until -while -of -at -by -for -with -about -against -between -into -through -during -before -after -above -below -to -from -up -down -in -out -on -off -over -under -again -further -then -once -here -there -when -where -why -how -all -any -both -each -few -more -most -other -some -such -no -nor -not -only -own -same -so -than -too -very -s -t -can -will -just -don -should -now -d -ll -m -o -re -ve -y -ain -aren -couldn -didn -doesn -hadn -hasn -haven -isn -ma -mightn -mustn -needn -shan -shouldn -wasn -weren -won -wouldn