diff --git a/src/ner/agri_ner_akai/local/Dockerfile b/src/ner/agri_ner_akai/local/Dockerfile
index 97897b3..bb63430 100644
--- a/src/ner/agri_ner_akai/local/Dockerfile
+++ b/src/ner/agri_ner_akai/local/Dockerfile
@@ -3,11 +3,12 @@ FROM python:3.9-slim
 
 WORKDIR /app
 
-#install requirements
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
 
+RUN python -m spacy download en_core_web_sm
+
 # Copy the rest of the application code to the working directory
 COPY . /app/
 
 EXPOSE 8000
diff --git a/src/ner/agri_ner_akai/local/README.md b/src/ner/agri_ner_akai/local/README.md
index 5c5a066..da09159 100644
--- a/src/ner/agri_ner_akai/local/README.md
+++ b/src/ner/agri_ner_akai/local/README.md
@@ -1,21 +1,41 @@
 ## NER:
-
 ### Purpose :
+
 Model to detect
+
 - crops
 - pests
-- seed type
+- seed type
+- email
+- time
+- phone numbers
+- numbers with units
+- dates
+
+### Testing the model deployment:
 
-### Testing the model deployment :
-To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps :
+To test just the Hugging Face deployment for grievance recognition, follow these steps:
 
 - Git clone the repo
-- Go to current folder location i.e. ``` cd /src/ner/agri_ner_akai/local ```
-- Create docker image file and test the api:
+- Go to the current folder location, i.e. ``cd /src/ner/agri_ner_akai/local``
+- Build the Docker image and test the API:
+
 ```
 docker build -t testmodel .
 docker run -p 8000:8000 testmodel
-curl -X POST -H "Content-Type: application/json" -d '{"text": "What are tomatoes and potaotes that are being attacked by aphids? "}' http://localhost:8000/
+```
+
+### **Request**
+
+```
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "The tomatoes and potatoes that are being attacked by aphids will be treated next Monday.",
+"type": ["email", "CROP"]
+}' http://localhost:8000/
+```
+
+```
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "What are tomatoes and potatoes that are being attacked by aphids?"
+}' http://localhost:8000/
 ```
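For reference, the service responds with a dict keyed by entity group, using the entity shape assembled in `model.py` below. A sketch of what the first request above might return — the offsets follow the example text, but the scores are invented, not actual model output:

```python
# Hypothetical response to the first request: only the requested groups
# ("email", "CROP") survive filtering, and the text contains no email.
{
    "CROP": [
        {"name": "tomatoes", "start": 4.0, "end": 12.0, "score": 0.97},
        {"name": "potatoes", "start": 17.0, "end": 25.0, "score": 0.95}
    ]
}
```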
" +}' http://localhost:8000/ ``` diff --git a/src/ner/agri_ner_akai/local/bert_ner.py b/src/ner/agri_ner_akai/local/bert_ner.py new file mode 100644 index 0000000..75858fc --- /dev/null +++ b/src/ner/agri_ner_akai/local/bert_ner.py @@ -0,0 +1,68 @@ +from transformers import pipeline +from request import ModelRequest + +class BertNERModel(): + def __new__(cls): + if not hasattr(cls, 'instance'): + cls.instance = super(BertNERModel, cls).__new__(cls) + cls.nlp_ner = pipeline("ner", model="GautamR/akai_ner", tokenizer="GautamR/akai_ner") + return cls.instance + + def inference(self, sentence): + entities = self.nlp_ner(sentence) + return self.aggregate_entities(sentence, entities) + + @staticmethod + def aggregate_entities(sentence, entity_outputs): + aggregated_entities = [] + current_entity = None + + for entity in entity_outputs: + entity_type = entity["entity"].split("-")[-1] + + # Handle subwords + if entity["word"].startswith("##"): + # If we encounter an I-PEST or any other I- entity + if "I-" in entity["entity"]: + if current_entity: # Add previous entity + aggregated_entities.append(current_entity) + + word_start = sentence.rfind(" ", 0, entity["start"]) + 1 + word_end = sentence.find(" ", entity["end"]) + if word_end == -1: + word_end = len(sentence) + + current_entity = { + "entity_group": entity_type, + "score": float(entity["score"]), + "word": sentence[word_start:word_end].replace('.','').replace('?',''), + "start": float(word_start), + "end": float(word_end) + } + aggregated_entities.append(current_entity) + current_entity = None + + else: + if current_entity: + # If it's a subword but not an I- entity + current_entity["word"] += entity["word"][2:] + current_entity["end"] = entity["end"] + current_entity["score"] = float((current_entity["score"] + entity["score"]) / 2) # averaging scores + + # Handle full words + else: + if current_entity: + aggregated_entities.append(current_entity) + + current_entity = { + "entity_group": entity_type, + "score": float(entity["score"]), + "word": entity["word"], + "start": float(entity["start"]), + "end": float(entity["end"]) + } + + if current_entity: + aggregated_entities.append(current_entity) + + return aggregated_entities diff --git a/src/ner/agri_ner_akai/local/model.py b/src/ner/agri_ner_akai/local/model.py index f863e8c..c571545 100644 --- a/src/ner/agri_ner_akai/local/model.py +++ b/src/ner/agri_ner_akai/local/model.py @@ -1,69 +1,51 @@ from transformers import pipeline from request import ModelRequest +from regex_parse_ner import RegNERModel +from bert_ner import BertNERModel class Model(): - def __new__(cls, context): - cls.context = context - if not hasattr(cls, 'instance'): - cls.instance = super(Model, cls).__new__(cls) - cls.nlp_ner = pipeline("ner", model="GautamR/akai_ner", tokenizer="GautamR/akai_ner") - return cls.instance + def __init__(self, context): + self.context = context + print("Loading models...") + self.regex_model = RegNERModel() + print("Regex model loaded successfully") + self.bert_model = BertNERModel() + print("Bert model loaded successfully") - async def inference(self, request: ModelRequest): - entities = self.nlp_ner(request.text) - return self.aggregate_entities(request.text, entities) + def combine_entities(self, reg_entities, bert_entities): + combined_entities = reg_entities + + for entity in bert_entities: + if entity['entity_group'] not in combined_entities: + combined_entities[entity['entity_group']] = [] - @staticmethod - def aggregate_entities(sentence, entity_outputs): - aggregated_entities = [] - 
diff --git a/src/ner/agri_ner_akai/local/model.py b/src/ner/agri_ner_akai/local/model.py
index f863e8c..c571545 100644
--- a/src/ner/agri_ner_akai/local/model.py
+++ b/src/ner/agri_ner_akai/local/model.py
@@ -1,69 +1,51 @@
 from transformers import pipeline
 from request import ModelRequest
+from regex_parse_ner import RegNERModel
+from bert_ner import BertNERModel
 
 class Model():
-    def __new__(cls, context):
-        cls.context = context
-        if not hasattr(cls, 'instance'):
-            cls.instance = super(Model, cls).__new__(cls)
-            cls.nlp_ner = pipeline("ner", model="GautamR/akai_ner", tokenizer="GautamR/akai_ner")
-        return cls.instance
+    def __init__(self, context):
+        self.context = context
+        print("Loading models...")
+        self.regex_model = RegNERModel()
+        print("Regex model loaded successfully")
+        self.bert_model = BertNERModel()
+        print("Bert model loaded successfully")
 
-    async def inference(self, request: ModelRequest):
-        entities = self.nlp_ner(request.text)
-        return self.aggregate_entities(request.text, entities)
+    def combine_entities(self, reg_entities, bert_entities):
+        combined_entities = reg_entities
+
+        for entity in bert_entities:
+            if entity['entity_group'] not in combined_entities:
+                combined_entities[entity['entity_group']] = []
 
-    @staticmethod
-    def aggregate_entities(sentence, entity_outputs):
-        aggregated_entities = []
-        current_entity = None
+            entity_info = {
+                'name': entity['word'],
+                'start': entity['start'],
+                'end': entity['end'],
+                'score': entity['score']
+            }
 
-        for entity in entity_outputs:
-            entity_type = entity["entity"].split("-")[-1]
+            combined_entities[entity['entity_group']].append(entity_info)
 
-            # Handle subwords
-            if entity["word"].startswith("##"):
-                # If we encounter an I-PEST or any other I- entity
-                if "I-" in entity["entity"]:
-                    if current_entity:  # Add previous entity
-                        aggregated_entities.append(current_entity)
-
-                    word_start = sentence.rfind(" ", 0, entity["start"]) + 1
-                    word_end = sentence.find(" ", entity["end"])
-                    if word_end == -1:
-                        word_end = len(sentence)
+        return combined_entities
+
+    async def inference(self, request: ModelRequest):
+        sentence = request.text
+        types = request.type
 
-                    current_entity = {
-                        "entity_group": entity_type,
-                        "score": float(entity["score"]),
-                        "word": sentence[word_start:word_end].replace('.', '').replace('?', ''),
-                        "start": float(word_start),
-                        "end": float(word_end)
-                    }
-                    aggregated_entities.append(current_entity)
-                    current_entity = None
+        reg_entities = self.regex_model.inference(sentence)
+        bert_entities = self.bert_model.inference(sentence)
 
-                else:
-                    if current_entity:
-                        # If it's a subword but not an I- entity
-                        current_entity["word"] += entity["word"][2:]
-                        current_entity["end"] = entity["end"]
-                        current_entity["score"] = float((current_entity["score"] + entity["score"]) / 2)  # averaging scores
+        combined_entities = self.combine_entities(reg_entities, bert_entities)
 
-            # Handle full words
-            else:
-                if current_entity:
-                    aggregated_entities.append(current_entity)
+        final_entities = {}
 
-                current_entity = {
-                    "entity_group": entity_type,
-                    "score": float(entity["score"]),
-                    "word": entity["word"],
-                    "start": float(entity["start"]),
-                    "end": float(entity["end"])
-                }
+        if types is None:
+            return combined_entities
 
-        if current_entity:
-            aggregated_entities.append(current_entity)
+        for entity_group in combined_entities:
+            if entity_group in types:
+                final_entities[entity_group] = combined_entities[entity_group]
 
-        return aggregated_entities
+        return final_entities
\ No newline at end of file
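Two details worth noting in the new `Model`: `inference` only filters by entity group when `request.type` is provided, and `combine_entities` writes the BERT entities into the very dict it received (`combined_entities = reg_entities` aliases rather than copies, so the regex result is mutated in place). A standalone sketch of the merge, with hypothetical values:

```python
# Replicates the merge loop in Model.combine_entities (values made up).
reg_entities = {"email": [{"name": "x@y.com", "start": 0, "end": 7, "score": 1.0}]}
bert_entities = [{"entity_group": "CROP", "word": "tomatoes",
                  "start": 9.0, "end": 17.0, "score": 0.97}]

combined = reg_entities  # aliases: reg_entities is mutated in place
for e in bert_entities:
    combined.setdefault(e["entity_group"], []).append(
        {"name": e["word"], "start": e["start"], "end": e["end"], "score": e["score"]}
    )

print(combined)
# {'email': [{'name': 'x@y.com', 'start': 0, 'end': 7, 'score': 1.0}],
#  'CROP': [{'name': 'tomatoes', 'start': 9.0, 'end': 17.0, 'score': 0.97}]}
```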
diff --git a/src/ner/agri_ner_akai/local/regex_parse_ner.py b/src/ner/agri_ner_akai/local/regex_parse_ner.py
new file mode 100644
index 0000000..2a698e4
--- /dev/null
+++ b/src/ner/agri_ner_akai/local/regex_parse_ner.py
@@ -0,0 +1,143 @@
+import re
+import spacy
+from datetime import datetime, timedelta
+
+class RegNERModel():
+    def __init__(self):
+        self.nlp = spacy.load("en_core_web_sm")
+
+        print("Model loaded successfully")
+
+    def detect_email(self, sentence):
+        email_regex_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]+'
+        emails_matches = []
+
+        for match in re.finditer(email_regex_pattern, sentence):
+            emails_matches.append({"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0})
+
+        return emails_matches
+
+    def detect_time(self, sentence):
+        time_regex = r'\b(?:1[0-2]|0?[1-9])(?::[0-5][0-9])?(?:\s?[ap]m)?\b'
+        times = []
+
+        for match in re.finditer(time_regex, sentence, re.IGNORECASE):
+            times.append({"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0})
+
+        return times
+
+    def detect_phone_numbers(self, sentence):
+        phone_regex = r'(\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})'
+
+        phone_numbers = []
+        for match in re.finditer(phone_regex, sentence):
+            phone_numbers.append({"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0})
+
+        return phone_numbers
+
+    def detect_numbers_with_units(self, sentence, phone_numbers):
+        number_unit_regex = r'(?
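The regex detectors above can be exercised directly. A usage sketch (it assumes spaCy's `en_core_web_sm` is installed, which the Dockerfile change above takes care of; the example strings are arbitrary):

```python
from regex_parse_ner import RegNERModel

model = RegNERModel()  # loads spaCy's en_core_web_sm

print(model.detect_email("reach me at farmer@example.com"))
# [{'name': 'farmer@example.com', 'start': 12, 'end': 30, 'score': 1.0}]

print(model.detect_phone_numbers("call 555-123-4567"))
# [{'name': '555-123-4567', 'start': 5, 'end': 17, 'score': 1.0}]

print(model.detect_time("meet at 4:30 pm"))
# [{'name': '4:30 pm', 'start': 8, 'end': 15, 'score': 1.0}]
```

Note that `detect_time`'s bare-digit pattern can also fire on digits inside phone numbers, which is presumably why `detect_numbers_with_units` receives the already-detected `phone_numbers` as an argument.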