-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
version 0.0.1 release
- Loading branch information
Showing
16 changed files
with
131 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
pip3 install -U pytest==7.1.3 | ||
python3 -m spacy download en_core_web_sm | ||
python3 -m "nltk.downloader" all |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
set PYTHONPATH=./src | ||
python3 setup.py install |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pytest test/text_preprocessor.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[metadata] | ||
name = vocably | ||
long_description = file: README.md | ||
long_description_content_type = text/markdown | ||
author = Nandhini | ||
author_email = [email protected] | ||
|
||
[isort] | ||
include_trailing_comma = True | ||
use_parentheses = True | ||
line_length = 119 | ||
description-file = README.md | ||
|
||
[flake8] | ||
max-line-length = 119 | ||
exclude = ./docs, | ||
__pycache__, | ||
.github | ||
*.venv/*, | ||
.venv, | ||
.eggs | ||
./src/vocably/__pycache__, | ||
src/vocably/vocably.egg-info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,24 @@ | ||
import setuptools | ||
from setuptools import setup | ||
from pathlib import Path | ||
from src.vocably import VERSION | ||
from src.vocably import __version__ | ||
|
||
core_requirements = [ | ||
'numpy~=1.23.3', | ||
'pandas~=1.5.0', | ||
'Click~=7.1.2', | ||
'click~=7.1.2', | ||
'torch~=1.12.1', | ||
'gensim~=4.2.0', | ||
'nltk~=3.7', | ||
'scipy==1.9.1', | ||
'scikit-learn==1.1.2', | ||
'transformers==4.22.1', | ||
'rich~=12.6.0', | ||
] | ||
|
||
setup( | ||
name='vocably', | ||
version=VERSION, | ||
version=__version__, | ||
py_modules=['command', 'core'], | ||
install_requires=core_requirements, | ||
description='Vocaly is a Natural Language Framework written in Python for Language based Tasks.', | ||
|
@@ -26,10 +27,15 @@ | |
python_requires='>=3.7,<4', | ||
author="Nandhini", | ||
author_email="[email protected]", | ||
url="", | ||
url="https://github.com/Nandhini25S/Vocably", | ||
include_package_data=True, | ||
os_type=["linux", "Windows", "MacOS", "Unix"], | ||
license='MIT', | ||
package_dir={'': 'src/vocably'}, | ||
packages=setuptools.find_packages(where="src/vocably"), | ||
package_dir={'': 'src'}, | ||
packages=setuptools.find_packages(where="src"), | ||
entry_points={ | ||
'console_scripts': [ | ||
'vocably = vocably.cli.main:main', | ||
], | ||
}, | ||
) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import re | ||
from nltk.stem import WordNetLemmatizer | ||
from nltk.tokenize import word_tokenize | ||
from nltk.stem import PorterStemmer | ||
import spacy | ||
from vocably.constants import WHITELIST | ||
|
||
|
||
class Preprocess:
    """Configurable cleaner and tokenizer for English text.

    Typical use is ``tokenize(normalise(text))``: ``normalise`` lower-cases the
    text and strips unwanted characters, ``tokenize`` splits the result into
    lemmatized or stemmed tokens.

    Args:
        remove_stopwords: Drop spaCy English stop words in ``tokenize``
            (words listed in ``WHITELIST`` are always kept).
        lemmatize: Lemmatize tokens as verbs with WordNet; when false the
            tokens are Porter-stemmed instead.
        remove_links: Strip URLs in ``normalise``.
        remove_punctuation: Strip every character that is neither a word
            character nor whitespace in ``normalise``.
        remove_numbers: Replace runs of digits with a space in ``normalise``.
        nltk_tokenize: Split with NLTK's ``word_tokenize`` instead of
            ``str.split``.
    """

    # A URL is "http://", "https://" or a leading "www." followed by the rest
    # of the whitespace-delimited token. The input is already lower-cased when
    # this is applied, so no case-insensitive flag is needed.
    _URL_PATTERN = re.compile(r'(?:https?://|www\.)\S+')
    _PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
    _NUMBER_PATTERN = re.compile(r'\d+')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    # Lazily filled, shared cache of the spaCy stop-word set; loading the
    # model is expensive and the set never changes between calls.
    _stop_words = None

    def __init__(self, remove_stopwords: bool = False,
                 lemmatize: bool = True,
                 remove_links: bool = True,
                 remove_punctuation: bool = True,
                 remove_numbers: bool = True, nltk_tokenize: bool = False):
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.remove_links = remove_links
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.nltk_tokenize = nltk_tokenize

    def normalise(self, text):
        """Return ``text`` lower-cased and cleaned according to the flags.

        Newlines and tabs become spaces, then links, punctuation and digits
        are removed when the corresponding flag is set. Runs of whitespace
        are collapsed to a single space; leading/trailing space is not
        stripped.
        """
        text = text.lower().replace('\n', ' ').replace('\t', ' ')
        if self.remove_links:
            # Replace with a space so the words on either side stay separate.
            text = self._URL_PATTERN.sub(' ', text)
        if self.remove_punctuation:
            text = self._PUNCTUATION_PATTERN.sub('', text)
        if self.remove_numbers:
            # Only digits are touched here; punctuation is governed solely by
            # ``remove_punctuation``.
            text = self._NUMBER_PATTERN.sub(' ', text)
        return self._WHITESPACE_PATTERN.sub(' ', text)

    def tokenize(self, text):
        """Split ``text`` into tokens and reduce each to its base form.

        Stop words are removed first when ``remove_stopwords`` is set. Tokens
        come from ``word_tokenize`` when ``nltk_tokenize`` is set, otherwise
        from ``str.split``. Each token is lemmatized as a verb when
        ``lemmatize`` is set, otherwise Porter-stemmed.

        Returns:
            list[str]: The processed tokens, in input order.
        """
        if self.remove_stopwords:
            text = self.stopwords_remove(text)
        words = word_tokenize(text) if self.nltk_tokenize else text.split()
        if self.lemmatize:
            lemmatizer = WordNetLemmatizer()
            return [lemmatizer.lemmatize(word, pos='v') for word in words]
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in words]

    def stopwords_remove(self, text):
        """Return ``text`` without spaCy English stop words.

        Words are compared as whitespace-delimited tokens. A word listed in
        ``WHITELIST`` is kept even if it is a stop word. The
        ``en_core_web_sm`` model is loaded on first use only and its
        stop-word set is cached on the class.
        """
        if Preprocess._stop_words is None:
            english = spacy.load('en_core_web_sm')
            Preprocess._stop_words = frozenset(english.Defaults.stop_words)
        stop_words = Preprocess._stop_words
        white_list = WHITELIST
        return ' '.join(
            word for word in text.split()
            if word not in stop_words or word in white_list
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
VERSION="0.0.1" | ||
__version__ : str = """0.0.1""" | ||
__author__ : str = '''Nandhini''' |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import rich | ||
import sys | ||
|
||
|
||
def main():
    """Print the vocably welcome banner to the terminal via rich markup."""
    banner = "[bold purple]Welcome to vocably![/bold purple]"
    rich.print(banner)
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Words that stop-word removal must keep even when they appear in the
# stop-word list (negations and qualifiers, with and without apostrophes).
# The annotation is quoted so it is never evaluated at import time: a bare
# ``list[str]`` raises TypeError on Python 3.7/3.8.
WHITELIST: "list[str]" = [
    'not', "n't", "isnt", "isn't", "only", "about", "wouldn't",
    "shouldn't", "couldn't", "weren't", "wasn't",
    "hasn't", "werent", "hasnt",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from vocably.Preprocessing.text import Preprocess | ||
|
||
# Manual smoke run: normalise a sample sentence, tokenize it with stop-word
# removal and NLTK tokenization enabled, and print the resulting tokens.
preprocess = Preprocess(
    remove_stopwords=True,
    remove_links=True,
    remove_punctuation=True,
    remove_numbers=False,
    nltk_tokenize=True,
)
text = 'Friendship is not only about caring for each other. Being with them in hard times'
normalised = preprocess.normalise(text)
tokens = preprocess.tokenize(normalised)
print(f"{tokens}")