Merge pull request #4

version 0.0.1 release
Nandhini25S · Oct 16, 2022 · 8dcfe7f · 8dcfe7f
2 parents 842709f + 830beb7
commit 8dcfe7f
Show file tree

Hide file tree

Showing 16 changed files with 131 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,8 @@
 src/vocably/vocably.egg-info
 src/vocably/__pycache__
 build
+src/nandini_vocably.egg-info
 dist
+src/vocably.egg-info
+src/__init__.py
+.pytest_cache
diff --git a/Makefile b/Makefile
@@ -5,6 +5,7 @@ install:
 	python3 setup.py install
 
 config:
+	python3 -m pip install pylint==2.15.4 pytest==7.1.3
 	python3 -m spacy download en_core_web_sm
 	python3 -m "nltk.downloader" all
 
@@ -13,9 +14,6 @@ all:
 	make config
 	make list
 
-test:
-	python3 -m pytest -v
-
 delete:
 	python3 setup.py clean --all
 
@@ -26,8 +24,11 @@ uninstall:
 	pip3 uninstall vocably
 
 format:
-	flake8 --max-line-length=120 --ignore=E305,E402,W503,BLK100
+	flake8 --max-line-length=120 --ignore=BLK100 --exclude=build --format="%(path)s:%(row)d:%(col)d: %(code)s %(text)s" --show-source --statistics
 
+# Lint the package. Message IDs are passed to --disable; pylint's --ignore
+# option only accepts file/directory base names and would not silence them.
+lint:
+	pylint vocably --disable=C0114,C0115,C0116,C0117,C0118
 
-
-
+# Run the tests via the interpreter's pytest module ("python3 pytest" would
+# look for a script file literally named "pytest").
+test:
+	python3 -m pytest test/
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ $ pip install vocably
 $ git clone https://github.com/Nandhini25S/Vocably.git
 $ cd Vocably
 $ conda env create -f environment.yml
+$ export PYTHONPATH=./src
 $ python3 setup.py install
 ```
 
@@ -26,6 +27,7 @@ $ conda activate vocably
 to install and config
 
 ```bash
+$ export PYTHONPATH=./src
 $ make install
 $ make config
 ```

diff --git a/Scripts/config.bat b/Scripts/config.bat
@@ -1,2 +1,3 @@
+pip3 install -U pytest==7.1.3
 python3 -m spacy download en_core_web_sm
 python3 -m "nltk.downloader" all
diff --git a/Scripts/install.bat b/Scripts/install.bat
@@ -1 +1,2 @@
+set PYTHONPATH=./src
 python3 setup.py install
diff --git a/Scripts/pytest.bat b/Scripts/pytest.bat
@@ -0,0 +1 @@
+pytest test/text_preprocessor.py
diff --git a/requirements.txt b/requirements.txt
@@ -10,5 +10,7 @@ scikit-learn==1.1.2
 transformers==4.22.1
 tokenizers==0.12.1
 spacy==3.4.1
-flake8==5.4.0
-click==7.1.2
+flake8==5.0.4
+click==7.1.2
+rich~=12.6.0
+setuptools~=65.5.0
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,23 @@
+[metadata]
+name = vocably
+long_description = file: README.md
+long_description_content_type = text/markdown
+author = Nandhini
+author_email = [email protected]
+
+[isort]
+include_trailing_comma = True
+use_parentheses = True
+line_length = 119
+description-file = README.md
+
+[flake8]
+max-line-length = 119
+exclude = ./docs,
+    __pycache__,
+    .github
+    *.venv/*,
+    .venv,
+    .eggs
+    ./src/vocably/__pycache__,
+    src/vocably/vocably.egg-info
diff --git a/setup.py b/setup.py
@@ -1,23 +1,24 @@
 import setuptools
 from setuptools import setup
 from pathlib import Path
-from src.vocably import VERSION
+from src.vocably import __version__
 
 core_requirements = [
     'numpy~=1.23.3',
     'pandas~=1.5.0',
-    'Click~=7.1.2',
+    'click~=7.1.2',
     'torch~=1.12.1',
     'gensim~=4.2.0',
     'nltk~=3.7',
     'scipy==1.9.1',
     'scikit-learn==1.1.2',
     'transformers==4.22.1',
+    'rich~=12.6.0',
 ]
 
 setup(
     name='vocably',
-    version=VERSION,
+    version=__version__,
     py_modules=['command', 'core'],
     install_requires=core_requirements,
     description='Vocaly is a Natural Language Framework written in Python for Language based Tasks.',
@@ -26,10 +27,15 @@
     python_requires='>=3.7,<4',
     author="Nandhini",
     author_email="[email protected]",
-    url="",
+    url="https://github.com/Nandhini25S/Vocably",
     include_package_data=True,
     os_type=["linux", "Windows", "MacOS", "Unix"],
     license='MIT',
-    package_dir={'': 'src/vocably'},
-    packages=setuptools.find_packages(where="src/vocably"),
+    package_dir={'': 'src'},
+    packages=setuptools.find_packages(where="src"),
+    entry_points={
+        'console_scripts': [
+            'vocably = vocably.cli.main:main',
+        ],
+    },
 )
diff --git a/src/vocably/Preprocessing/__init__.py b/src/vocably/Preprocessing/__init__.py
diff --git a/src/vocably/Preprocessing/text.py b/src/vocably/Preprocessing/text.py
@@ -0,0 +1,55 @@
+import re
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+import spacy
+from vocably.constants import WHITELIST
+
+
+class Preprocess:
+    """Configurable text cleaner and tokenizer.
+
+    ``normalise`` lower-cases the text and optionally strips links,
+    punctuation and digits; ``tokenize`` optionally removes stop words and
+    then lemmatizes or stems every token.
+    """
+
+    # A link is a scheme or "www." prefix followed by everything up to the
+    # next whitespace, so ordinary words containing "com" are left alone.
+    _LINK_RE = re.compile(r'(?:https?://|www\.)\S+')
+    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
+    _WHITESPACE_RE = re.compile(r'\s+')
+    _DIGITS_RE = re.compile(r'\d+')
+
+    # Stop words are loaded on first use and shared by all instances, because
+    # loading the spaCy model on every call is expensive.
+    _stop_words = None
+
+    def __init__(self, remove_stopwords: bool = False,
+                 lemmatize: bool = True,
+                 remove_links: bool = True,
+                 remove_punctuation: bool = True,
+                 remove_numbers: bool = True, nltk_tokenize: bool = False):
+        """Store the preprocessing options.
+
+        Args:
+            remove_stopwords: drop stop words (except whitelisted ones) in
+                ``tokenize``.
+            lemmatize: lemmatize tokens as verbs; when False, tokens are
+                stemmed with the Porter stemmer instead.
+            remove_links: strip URLs in ``normalise``.
+            remove_punctuation: strip punctuation and collapse whitespace in
+                ``normalise``.
+            remove_numbers: replace digit runs with a space in ``normalise``.
+            nltk_tokenize: split with ``nltk.word_tokenize`` instead of
+                ``str.split``.
+        """
+        self.remove_stopwords = remove_stopwords
+        self.lemmatize = lemmatize
+        self.remove_links = remove_links
+        self.remove_punctuation = remove_punctuation
+        self.remove_numbers = remove_numbers
+        self.nltk_tokenize = nltk_tokenize
+
+    def normalise(self, text):
+        """Return ``text`` lower-cased and cleaned according to the options.
+
+        Newlines and tabs always become spaces. Links are removed before
+        punctuation so that the URL is still intact when it is matched.
+        """
+        text = text.lower().replace('\n', ' ').replace('\t', ' ')
+        if self.remove_links:
+            text = self._LINK_RE.sub('', text)
+        if self.remove_punctuation:
+            text = self._PUNCTUATION_RE.sub('', text)
+            text = self._WHITESPACE_RE.sub(' ', text)
+        if self.remove_numbers:
+            # Only digits are removed; letters (including non-ASCII ones) and
+            # any punctuation the caller chose to keep are preserved.
+            text = self._DIGITS_RE.sub(' ', text)
+        return text
+
+    def tokenize(self, text):
+        """Split ``text`` into tokens and lemmatize or stem each one.
+
+        Returns:
+            A list of processed tokens.
+        """
+        if self.remove_stopwords:
+            text = self.stopwords_remove(text)
+        words = word_tokenize(text) if self.nltk_tokenize else text.split()
+        if self.lemmatize:
+            lemmatizer = WordNetLemmatizer()
+            return [lemmatizer.lemmatize(word, pos='v') for word in words]
+        stemmer = PorterStemmer()
+        return [stemmer.stem(word) for word in words]
+
+    def stopwords_remove(self, text):
+        """Return ``text`` without stop words, keeping WHITELIST entries."""
+        if Preprocess._stop_words is None:
+            english = spacy.load('en_core_web_sm')
+            Preprocess._stop_words = frozenset(english.Defaults.stop_words)
+        stop_words = Preprocess._stop_words
+        return ' '.join(word for word in text.split()
+                        if word not in stop_words or word in WHITELIST)
diff --git a/src/vocably/__init__.py b/src/vocably/__init__.py
@@ -1 +1,2 @@
-VERSION="0.0.1"
+# Package metadata; setup.py imports __version__ from here.
+__version__: str = "0.0.1"
+__author__: str = "Nandhini"
diff --git a/src/vocably/cli/__init__.py b/src/vocably/cli/__init__.py
diff --git a/src/vocably/cli/main.py b/src/vocably/cli/main.py
@@ -0,0 +1,10 @@
+import rich
+import sys
+
+
+def main():
+    """Print the vocably welcome banner using rich markup."""
+    banner = "[bold purple]Welcome to vocably![/bold purple]"
+    rich.print(banner)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/vocably/constants.py b/src/vocably/constants.py
@@ -0,0 +1,3 @@
+from typing import List
+
+# Words that Preprocess.stopwords_remove keeps even when they are stop words.
+# typing.List is used because the annotation is evaluated at import time and
+# the builtin generic list[str] raises TypeError before Python 3.9.
+WHITELIST: List[str] = [
+    'not', "n't", "isnt", "isn't", "only", "about", "wouldn't",
+    "shouldn't", "couldn't", "weren't", "wasn't",
+    "hasn't", "werent", "hasnt",
+]
diff --git a/test/text_preprocessor.py b/test/text_preprocessor.py
@@ -0,0 +1,6 @@
+from vocably.Preprocessing.text import Preprocess
+
+
+def test_tokenize_keeps_whitelisted_words():
+    """Stop-word removal must keep whitelisted words such as 'not'."""
+    preprocess = Preprocess(remove_links=True, remove_punctuation=True, remove_stopwords=True,
+                            remove_numbers=False, nltk_tokenize=True)
+    text = 'Friendship is not only about caring for each other. Being with them in hard times'
+    tokens = preprocess.tokenize(preprocess.normalise(text))
+    assert isinstance(tokens, list)
+    # 'not' is a stop word, but it is listed in WHITELIST and so must survive.
+    assert 'not' in tokens