diff --git a/nextpy/ai/config.py b/nextpy/ai/config.py deleted file mode 100644 index 9704dd7a..00000000 --- a/nextpy/ai/config.py +++ /dev/null @@ -1,53 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Env values will be passed from from os.environ -Automatically loads environment variables from .env file. -""" - -import os -from pathlib import Path -from typing import Optional - -import yaml -from dotenv import load_dotenv -from pydantic import BaseSettings - -load_dotenv("./.env") - - -CONFIG_FILE = "config.yaml" -ROOT_DIR = os.path.dirname(Path(__file__).parent.parent) -config_path = ROOT_DIR + "/" + CONFIG_FILE - -if os.path.exists(config_path): - with open(config_path, "r") as file: - config_data = yaml.safe_load(file) - -else: - config_data = {"OPENAI_API_KEY": ""} - config_data["OPENAI_API_KEY"] = input("Enter OPENAI_API_KEY:") - config_data["OPENAI_ORG_ID"] = input("Enter OPENAI_ORG_ID:") - config_data["SERP_API_KEY"] = input("Enter SERP_API_KEY:") - config_data["GOOGLE_SEARCH_API_KEY"] = input("Enter GOOGLE_SEARCH_API_KEY:") - - -class Config: - OPENAI_API_KEY = config_data["OPENAI_API_KEY"] - OPENAI_ORG_ID = config_data["OPENAI_ORG_ID"] - SERP_API_KEY = config_data["SERP_API_KEY"] - GOOGLE_SEARCH_API_KEY = config_data["GOOGLE_SEARCH_API_KEY"] - - -class AgentBoxSettings(BaseSettings): - """AgentBox API Config.""" - - VERBOSE: bool = False - SHOW_INFO: bool = True - - AGENTBOX_API_KEY: Optional[str] = None - AGENTBOX_BASE_URL: str = "https://agentboxapi.com/api/v1" - AGENTBOX_TIMEOUT: int = 20 - - -settings = AgentBoxSettings() diff --git "a/nextpy/ai/config.\360\237\244\226" "b/nextpy/ai/config.\360\237\244\226" deleted file mode 100644 index 52d94419..00000000 --- "a/nextpy/ai/config.\360\237\244\226" +++ /dev/null @@ -1,41 +0,0 @@ -llm: - type: "OpenAI" - model: "gpt-3.5-turbo" -rag: - data_source: "./test_data/meteoric" - data_loader: "SimpleDirectoryReader" - data_transformer: - type: "CharacterTextSplitter" - chunk_overlap: 40 - chunk_size: 1024 - vector_store: - type: "Chroma" - embedding_function: "OpenAIEmbeddings" -agent: - type: "ChatAgent" - prompt_template: | - {{#user~}} - You will use this FORMAT only to answer user's QUERY - FORMAT: {{format}} - QUERY: {{input}} - - Use the below knowledge to answer QUERY in given FORMAT:- - {{RETRIEVED_KNOWLEDGE}} - {{~/user}} - - {{#assistant~}} - Yes, I will tell you about with that - {{~/assistant}} - - {{#user~}} - Yes, tell me - {{~/user}} - - {{#assistant~}} - {{gen 'response' temperature=0 max_tokens=300}} - {{~/assistant}} - input_variables: - knowledge_variable: "input" - extras: "format" - output_key: "response" - diff --git a/nextpy/ai/config/__init__.py b/nextpy/ai/config/__init__.py new file mode 100644 index 00000000..e6ee1211 --- /dev/null +++ b/nextpy/ai/config/__init__.py @@ -0,0 +1 @@ +# init file for config diff --git a/nextpy/ai/finetune/LLMFinetune.py b/nextpy/ai/finetune/LLMFinetune.py deleted file mode 100644 index 82839a08..00000000 --- a/nextpy/ai/finetune/LLMFinetune.py +++ /dev/null @@ -1,27 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -from abc import ABC, abstractmethod -from logging import Logger - -import openai - - -class LLMFinetune(ABC): - def __init__(self, logger: Logger, openai_key: str): - self.logger = logger - openai.api_key = openai_key - - @abstractmethod - def transform_data( - self, - train_csv_file: str, - val_csv_file: str, - train_output_file: str, - val_output_file: str, - ) -> str: - pass - - @abstractmethod - def finetune(self, **kwargs): - pass diff --git a/nextpy/ai/finetune/openai_finetune.py b/nextpy/ai/finetune/openai_finetune.py deleted file mode 100644 index e0f5c966..00000000 --- a/nextpy/ai/finetune/openai_finetune.py +++ /dev/null @@ -1,207 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import csv -import json -import logging -import time -from logging import Logger -from typing import List, Optional - -import openai - -from . import LLMFinetune - -# openai.organization = "YOUR_ORG_ID" -# APIKEY -# openai.Model.list() - - -class OpenaiFinetune(LLMFinetune): - def __init__(self, logger: Logger, openai_key: str): - self.logger = logger - openai.api_key = openai_key - - def transform_data( - self, - train_csv_file: str, - val_csv_file: str, - train_output_file: str, - val_output_file: str, - llm_model: str = "openai", - ) -> str: - """Transforms CSV files into JSONL and creates files for fine-tuning.""" - # Verify llm_model - if llm_model != "openai": - raise ValueError("Unsupported model:", llm_model) - - # Paths and Output files - paths = [train_csv_file, val_csv_file] - output_files = [train_output_file, val_output_file] - - # Extracting prompt-completion pairs - prompt_completion_pairs = [] - for csv_file in paths: - with open(csv_file, "r") as f: - reader = csv.reader(f) - for row in reader: - if len(row) >= 2: - prompt = row[0] - completion = row[1] - prompt_completion_pairs.append((prompt, completion)) - - # Writing to JSONL - for output_file, pairs in zip(output_files, prompt_completion_pairs): - with open(output_file, "w") as f: - for pair in pairs: - json_obj = {"prompt": pair[0], "completion": pair[1]} - json_str = json.dumps(json_obj) - f.write(json_str + "\n") - - # Creating Files - ids = [] - for output_file in output_files: - if not output_file.endswith(".jsonl"): - raise Exception( - "args `output_file` must be the **file** path to the .jsonl file" - ) - try: - _ = openai.File.create( - file=open(output_file, "rb"), purpose="fine-tune" - ) - ids.append(_) - except Exception as e: - self.logger.error(f"Error creating file: {e}") - raise e - - return output_files, ids - - # TODO: Specify use of the method - # def model( - # self, - # model_name: str, - # input: str, - # instruction: str, - # n: int, - # temperature: float, - # top_p: float, - # ): - # try: - # model = openai.Edit.create( - # model=model_name, - # temperature=temperature, - # top_p=top_p, - # input=input, - # instruction=instruction, - # n=n, - # ) - # return model - # except Exception as e: - # self.logger.error(f"Error creating model: {e}") - # raise e - - def finetune( - self, - training_file: str, - model_name: Optional[str] = "curie", - n_epoch: Optional[int] = 4, - validation_file: Optional[str] = None, - batch_size: Optional[int] = None, - learning_rate_multiplier: 
Optional[int] = None, - prompt_loss_weight: Optional[int] = 0.01, - compute_classification_metrics: Optional[bool] = False, - classification_n_classes: Optional[int] = None, - classification_positive_class: Optional[str] = None, - classification_betas: Optional[List[float]] = None, - suffix: Optional[str] = None, - ): - """_summary_. - - Args: - training_file (str): The ID of an uploaded file that contains training data. - model_name (Optional[str], optional): The name of the base model to fine-tune. You can select one of "ada", "babbage", "curie", "davinci", or a fine-tuned model created after 2022-04-21. Defaults to "curie". - n_epoch (Optional[int], optional): Number of epochs to train the model for. Defaults to 4. - validation_file (Optional[str], optional): The ID of an uploaded file that contains validation data. Defaults to None. - batch_size (Optional[int], optional): Batch size to use for training. Defaults to None. - learning_rate_multiplier (Optional[int], optional): Learning rate multiplier to use for training. Defaults to None. - prompt_loss_weight (Optional[int], optional): Weight to use for loss on the prompt tokens. Defaults to 0.01. - compute_classification_metrics (Optional[bool], optional): If True, classification metrics such as accuracy and f1-score are computed for validation set. Defaults to False. - classification_n_classes (Optional[int], optional): Number of classes in a classification task. Defaults to None. - classification_positive_class (Optional[str], optional): This parameter is needed to generate precision, recall, and F1 metrics when doing binary classification. Defaults to None. - classification_betas (Optional[List[float]], optional): If this is provided, we calculate F-beta scores at the specified beta values. Defaults to None. - suffix (Optional[str], optional): A string of up to 40 characters that will be added to your fine-tuned model name. Defaults to None. 
- - Raises: - e: Errors generated while creating fine-tune job - Exception: If fine-tuning job fails - - Returns: - _type_: _description_ - """ - # openai.FineTune.create(training_file="file-XGinujblHPwGLSztz8cPS8XY") - - job_id = None - try: - job_id = openai.FineTune.create( - training_file=training_file, - model=model_name, - n_epochs=n_epoch, - validation_file=validation_file, - batch_size=batch_size, - learning_rate_multiplier=learning_rate_multiplier, - prompt_loss_weight=prompt_loss_weight, - compute_classification_metrics=compute_classification_metrics, - classification_n_classes=classification_n_classes, - classification_positive_class=classification_positive_class, - classification_betas=classification_betas, - suffix=suffix, - ) - while openai.FineTune.retrieve(job_id.get("id")).get("status") == "pending": - time.sleep(1) - self.logger.info( - "Fine-tuning job status: %s", - openai.FineTune.retrieve(job_id.get("id")).get("status"), - ) - - if openai.FineTune.retrieve(job_id.get("id")).get("status") == "failed": - self.logger.error("Fine-tuning job failed") - raise Exception("Fine-tuning job failed") - - self.logger.info("Fine-tuning job completed successfully") - return job_id - - except Exception as e: - self.logger.error(f"Error creating fine-tune job: {e}") - raise e - - -if __name__ == "__main__": - from creds import OPENAI_KEY - - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - logger.addHandler(logging.StreamHandler()) - finetune = Finetune(logger, openai_key=OPENAI_KEY) - train_path, val_path = finetune.generate_jsonl_from_csv( - "sports_train.csv", "sports_val.csv", "sports_train.jsonl", "sports_val.jsonl" - ) - output_paths, ids = finetune.create_file(output_files=[train_path, val_path]) - train_file, val_file = output_paths - train_id, val_id = ids - job_id = finetune.finetune( - training_file=train_id.get("id"), - n_epoch=1, - validation_file=val_id.get("id"), - suffix="sports", - batch_size=4, - compute_classification_metrics=True, - classification_n_classes=2, - classification_positive_class="hockey", - classification_betas=[0.5, 1, 2], - prompt_loss_weight=0.01, - model_name="curie", - learning_rate_multiplier=1.0, - ) - print("#" * 5, end="\n\n") - print(type(openai.FineTune.retrieve(job_id.get("id")))) - print(openai.FineTune.retrieve(job_id.get("id"))) diff --git a/nextpy/ai/finetune/transformer_finetune.py b/nextpy/ai/finetune/transformer_finetune.py deleted file mode 100644 index dc7afa27..00000000 --- a/nextpy/ai/finetune/transformer_finetune.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from logging import Logger - -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments - -from . 
import LLMFinetune - - -class TransformersFinetune(LLMFinetune): - def __init__(self, logger: Logger, base_model: str): - super().__init__(logger, openai_key=None) - self.model = AutoModelForCausalLM.from_pretrained(base_model) - self.tokenizer = AutoTokenizer.from_pretrained(base_model) - - def transform_data( - self, - train_csv_file: str, - val_csv_file: str, - train_output_file: str, - val_output_file: str, - ) -> str: - # Implement logic to transform CSV files to desired JSON or other formats - # You can load, process, and save the CSV data here - # Return the path or message confirming the transformation - pass - - def finetune( - self, - data_path, - output_dir, - num_epochs=1, - batch_size=32, - learning_rate=5e-5, - val_set_size=0.1, - max_length=512, - ): - # Load dataset - data = load_dataset("json", data_files={"train": data_path}) - - # Split data into training and validation sets - train_val = data["train"].train_test_split( - test_size=val_set_size, shuffle=True, seed=42 - ) - train_data = train_val["train"] - valid_data = train_val["test"] - - # Tokenization function - def tokenize_function(examples): - return self.tokenizer( - examples["text"], - truncation=True, - max_length=max_length, - padding="max_length", - ) - - # Tokenize dataset - train_data = train_data.map(tokenize_function, batched=True) - valid_data = valid_data.map(tokenize_function, batched=True) - - # Training arguments - training_args = TrainingArguments( - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, - num_train_epochs=num_epochs, - learning_rate=learning_rate, - output_dir=output_dir, - evaluation_strategy="steps" if val_set_size > 0 else "no", - logging_dir="./logs", - ) - - # Trainer - trainer = Trainer( - model=self.model, - args=training_args, - train_dataset=train_data, - eval_dataset=valid_data, - ) - - # Training - trainer.train() - - # Save model - self.model.save_pretrained(output_dir) diff --git a/nextpy/ai/hooks/__init__.py b/nextpy/ai/hooks/__init__.py new file mode 100644 index 00000000..9cd84a05 --- /dev/null +++ b/nextpy/ai/hooks/__init__.py @@ -0,0 +1 @@ +# init file for hooks diff --git a/nextpy/ai/hooks/hook_base.py b/nextpy/ai/hooks/hook_base.py new file mode 100644 index 00000000..7ce95f18 --- /dev/null +++ b/nextpy/ai/hooks/hook_base.py @@ -0,0 +1 @@ +# base class for all hooks diff --git a/nextpy/ai/hooks/hook_manager.py b/nextpy/ai/hooks/hook_manager.py new file mode 100644 index 00000000..763640be --- /dev/null +++ b/nextpy/ai/hooks/hook_manager.py @@ -0,0 +1 @@ +# manager to retrieve and register hooks diff --git a/nextpy/ai/models/audio/README.md b/nextpy/ai/models/audio/README.md deleted file mode 100644 index 2709c840..00000000 --- a/nextpy/ai/models/audio/README.md +++ /dev/null @@ -1,61 +0,0 @@ -Source : https://github.com/Shaunwei/RealChar/tree/main ( RealChar. - Your Realtime AI Character) - -# ElevenLabs Voice Cloning Guide - - -This README serves as a guide on how to use ElevenLabs for voice cloning. Follow the steps below to clone a voice, test it, and fine-tune it for the best results. - -## Collecting Data - -Before you start, you'll need voice data. Download high quality vocal only audio clips. Check the [training_data](.ai-example/audio/training_data) folder for reference. - -If you're creating your own dataset, ensure the audio is high quality. It should have no background noise, clear pronunciation. - -The audio format must be mp3 and should be about 1 minute long in total. 
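As a rough illustration of the clip requirements above, here is a minimal sketch for checking and converting a sample before upload (assuming `pydub` and `ffmpeg` are available, as elsewhere in this package's audio code; the file names are placeholders, not part of the original guide):

```python
from pydub import AudioSegment

# Load a vocal-only sample (placeholder file name).
clip = AudioSegment.from_file("my_voice_sample.wav")

# pydub reports lengths in milliseconds; aim for roughly 60 s of audio in total.
print(f"Duration: {len(clip) / 1000:.1f} s")

# Export as mp3 (requires ffmpeg on the system).
clip.export("my_voice_sample.mp3", format="mp3")
```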
- -## Creating an ElevenLabs Account - -Visit [ElevenLabs](https://beta.elevenlabs.io/) to create an account. You'll need this to access the speech synthesis and voice cloning features. - -Get your `ELEVEN_LABS_API_KEY`: -1. Click profile icon and select 'profile'. -2. Copy API Key - -## Speech Synthesis/Voice Cloning - -Follow these steps to clone a voice: - -1. Go to the [speech synthesis page](https://beta.elevenlabs.io/speech-synthesis). -2. Click "Add Voice". -3. Click "Add Generative or Cloned Voice". -4. Click "Instant Voice Cloning". -5. Fill in all the required information and upload your audio samples. -6. Click "Add Voice". - -## Testing Your Voice - -To test the voice you've just created: - -1. Go back to the [speech synthesis page](https://beta.elevenlabs.io/speech-synthesis). -2. Choose the voice you just created in Settings. -4. Type some text and click "Generate". - -## Fine-tuning Your Voice - -You can make the voice read better by adjusting system and user prompts. -Here are some tips: - -- If the voice is too monotone, lower the Stability to make it more emotional. However, setting the Stability to zero can sometimes lead to a strange accent. -- Longer sentences tend to be spoken better because they provide more context for the AI speaker to understand. -- For shorter sentences that are spoken too quickly, replace "." with "...". Add "-" or a newline for a pause. -- Add emotion-related words or phrases, or use punctuation marks like “!”, “?” to add emotions to the voice. - -## Using Your Custom Voice in Our Project - -You need the voice id of cloned voice. Here's how: -1. go to https://api.elevenlabs.io/docs -2. choose Get Voices api -3. follow the instruction and find the specific voice_id in the Responses. -4. Do not forget to update your .env file with `ELEVEN_LABS_API_KEY` and voice ids. - - diff --git a/nextpy/ai/models/audio/__init__.py b/nextpy/ai/models/audio/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/models/audio/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/models/audio/speech_to_text/__init__.py b/nextpy/ai/models/audio/speech_to_text/__init__.py deleted file mode 100644 index 8922896f..00000000 --- a/nextpy/ai/models/audio/speech_to_text/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import os - -from nextpy.ai.models.audio.speech_to_text.base import SpeechToText - - -def get_speech_to_text() -> SpeechToText: - use = os.getenv("SPEECH_TO_TEXT_USE", "LOCAL_WHISPER") - if use == "GOOGLE": - from nextpy.ai.audio.speech_to_text.google import Google - - Google.initialize() - return Google.get_instance() - elif use == "LOCAL_WHISPER": - from nextpy.ai.audio.speech_to_text.whisper import Whisper - - Whisper.initialize(use="local") - return Whisper.get_instance() - elif use == "OPENAI_WHISPER": - from nextpy.ai.audio.speech_to_text.whisper import Whisper - - Whisper.initialize(use="api") - return Whisper.get_instance() - else: - raise NotImplementedError(f"Unknown speech to text engine: {use}") diff --git a/nextpy/ai/models/audio/speech_to_text/base.py b/nextpy/ai/models/audio/speech_to_text/base.py deleted file mode 100644 index d30cb846..00000000 --- a/nextpy/ai/models/audio/speech_to_text/base.py +++ /dev/null @@ -1,13 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from abc import ABC, abstractmethod - - -class SpeechToText(ABC): - @abstractmethod - def transcribe( - self, audio_bytes, platform="web", prompt="", language="en-US" - ) -> str: - # platform: 'web' | 'mobile' | 'terminal' - pass diff --git a/nextpy/ai/models/audio/speech_to_text/google.py b/nextpy/ai/models/audio/speech_to_text/google.py deleted file mode 100644 index 8f7d4df8..00000000 --- a/nextpy/ai/models/audio/speech_to_text/google.py +++ /dev/null @@ -1,55 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import types - -from google.cloud import speech - -from nextpy.ai.models.audio.speech_to_text.base import SpeechToText -from nextpy.utils.logger import get_logger -from nextpy.utils.singleton import Singleton - -logger = get_logger(__name__) -config = types.SimpleNamespace( - **{ - "web": { - "encoding": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS, - "sample_rate_hertz": 48000, - "language_code": "en-US", - "max_alternatives": 1, - }, - "terminal": { - "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16, - "sample_rate_hertz": 44100, - "language_code": "en-US", - "max_alternatives": 1, - }, - } -) - - -class Google(Singleton, SpeechToText): - def __init__(self): - super().__init__() - logger.info("Setting up [Google Speech to Text]...") - self.client = speech.SpeechClient() - - def transcribe(self, audio_bytes, platform, prompt="", language="en-US") -> str: - batch_config = speech.RecognitionConfig( - { - "speech_contexts": [speech.SpeechContext(phrases=prompt.split(","))], - **config.__dict__[platform], - } - ) - batch_config.language_code = language - if language != "en-US": - batch_config.alternative_language_codes = ["en-US"] - response = self.client.recognize( - config=batch_config, audio=speech.RecognitionAudio(content=audio_bytes) - ) - if not response.results: - return "" - result = response.results[0] - if not result.alternatives: - return "" - return result.alternatives[0].transcript diff --git a/nextpy/ai/models/audio/speech_to_text/whisper.py b/nextpy/ai/models/audio/speech_to_text/whisper.py deleted file mode 100644 index 9252a45e..00000000 --- a/nextpy/ai/models/audio/speech_to_text/whisper.py +++ /dev/null @@ -1,103 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import io -import os -import types -import wave - -import speech_recognition as sr -from faster_whisper import WhisperModel -from pydub import AudioSegment -from torch.cuda import is_available as is_cuda_available - -from nextpy.ai.models.audio.speech_to_text.base import SpeechToText -from nextpy.utils.logger import get_logger -from nextpy.utils.singleton import Singleton - -DEBUG = False -logger = get_logger(__name__) -config = types.SimpleNamespace( - **{ - "model": os.getenv("LOCAL_WHISPER_MODEL", "base"), - "language": "en", - "api_key": os.getenv("OPENAI_API_KEY"), - } -) - -# Whisper use a shorter version for language code. Provide a mapping to convert -# from the standard language code to the whisper language code. -WHISPER_LANGUAGE_CODE_MAPPING = { - "en-US": "en", - "es-ES": "es", - "fr-FR": "fr", - "de-DE": "de", - "it-IT": "it", - "pt-PT": "pt", - "hi-IN": "hi", - "pl-PL": "pl", -} - - -class Whisper(Singleton, SpeechToText): - def __init__(self, use="local"): - super().__init__() - if use == "local": - device = "cuda" if is_cuda_available() else "cpu" - logger.info( - f"Loading [Local Whisper] model: [{config.model}]({device}) ..." 
- ) - self.model = WhisperModel( - model_size_or_path=config.model, - device="auto", - download_root=None, - ) - self.recognizer = sr.Recognizer() - self.use = use - if DEBUG: - self.wf = wave.open("output.wav", "wb") - self.wf.setnchannels(1) # Assuming mono audio - self.wf.setsampwidth(2) # Assuming 16-bit audio - self.wf.setframerate(44100) # Assuming 44100Hz sample rate - - def transcribe(self, audio_bytes, platform, prompt="", language="en-US"): - logger.info("Transcribing audio...") - if platform == "web": - audio = self._convert_webm_to_wav(audio_bytes, self.use == "local") - else: - audio = self._convert_bytes_to_wav(audio_bytes, self.use == "local") - if self.use == "local": - return self._transcribe(audio, prompt) - elif self.use == "api": - return self._transcribe_api(audio, prompt) - - def _transcribe(self, audio, prompt="", language="en-US"): - language = WHISPER_LANGUAGE_CODE_MAPPING.get(language, config.language) - segs, _ = self.model.transcribe( - audio, language=language, vad_filter=True, initial_prompt=prompt - ) - text = " ".join([seg.text for seg in segs]) - return text - - def _transcribe_api(self, audio, prompt=""): - text = self.recognizer.recognize_whisper_api( - audio, - api_key=config.api_key, - ) - return text - - def _convert_webm_to_wav(self, webm_data, local=True): - webm_audio = AudioSegment.from_file(io.BytesIO(webm_data), format="webm") - wav_data = io.BytesIO() - webm_audio.export(wav_data, format="wav") - if local: - return wav_data - with sr.AudioFile(wav_data) as source: - audio = self.recognizer.record(source) - return audio - - def _convert_bytes_to_wav(self, audio_bytes, local=True): - if local: - audio = io.BytesIO(sr.AudioData(audio_bytes, 44100, 2).get_wav_data()) - return audio - return sr.AudioData(audio_bytes, 44100, 2) diff --git a/nextpy/ai/models/audio/text_to_speech/__init__.py b/nextpy/ai/models/audio/text_to_speech/__init__.py deleted file mode 100644 index 7d19b4f5..00000000 --- a/nextpy/ai/models/audio/text_to_speech/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import os - -from nextpy.ai.models.audio.text_to_speech.base import TextToSpeech - - -def get_text_to_speech(tts: str = None) -> TextToSpeech: - if not tts: - tts = os.getenv("TEXT_TO_SPEECH_USE", "ELEVEN_LABS") - if tts == "ELEVEN_LABS": - from nextpy.ai.audio.text_to_speech.elevenlabs import ElevenLabs - - ElevenLabs.initialize() - return ElevenLabs.get_instance() - elif tts == "GOOGLE_TTS": - from nextpy.ai.audio.text_to_speech.google_cloud_tts import GoogleCloudTTS - - GoogleCloudTTS.initialize() - return GoogleCloudTTS.get_instance() - elif tts == "UNREAL_SPEECH": - from nextpy.ai.audio.text_to_speech.unreal_speech import UnrealSpeech - - UnrealSpeech.initialize() - return UnrealSpeech.get_instance() - else: - raise NotImplementedError(f"Unknown text to speech engine: {tts}") diff --git a/nextpy/ai/models/audio/text_to_speech/base.py b/nextpy/ai/models/audio/text_to_speech/base.py deleted file mode 100644 index c972e822..00000000 --- a/nextpy/ai/models/audio/text_to_speech/base.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -from abc import ABC, abstractmethod - - -class TextToSpeech(ABC): - @abstractmethod - async def stream(self, *args, **kwargs): - pass diff --git a/nextpy/ai/models/audio/text_to_speech/elevenlabs.py b/nextpy/ai/models/audio/text_to_speech/elevenlabs.py deleted file mode 100644 index 434eebae..00000000 --- a/nextpy/ai/models/audio/text_to_speech/elevenlabs.py +++ /dev/null @@ -1,74 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import asyncio -import os -import types - -import httpx - -from nextpy.ai.models.audio.text_to_speech.base import TextToSpeech -from nextpy.utils.logger import get_logger -from nextpy.utils.singleton import Singleton - -logger = get_logger(__name__) -DEBUG = False - -config = types.SimpleNamespace( - **{ - "chunk_size": 1024, - "url": "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream", - "headers": { - "Accept": "audio/mpeg", - "Content-Type": "application/json", - "xi-api-key": os.environ["ELEVEN_LABS_API_KEY"], - }, - "data": { - "model_id": "eleven_monolingual_v1", - "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}, - }, - } -) - - -class ElevenLabs(Singleton, TextToSpeech): - def __init__(self): - super().__init__() - logger.info("Initializing [ElevenLabs Text To Speech] voices...") - - async def stream( - self, - text, - websocket, - tts_event: asyncio.Event, - voice_id="21m00Tcm4TlvDq8ikWAM", - first_sentence=False, - language="en-US", - ) -> None: - if DEBUG: - return - if voice_id == "": - logger.info( - f"voice_id is not found in .env file, using ElevenLabs default voice" - ) - voice_id = "21m00Tcm4TlvDq8ikWAM" - headers = config.headers - if language != "en-US": - config.data["model_id"] = "eleven_multilingual_v1" - data = { - "text": text, - **config.data, - } - url = config.url.format(voice_id=voice_id) - if first_sentence: - url = url + "?optimize_streaming_latency=4" - async with httpx.AsyncClient() as client: - response = await client.post(url, json=data, headers=headers) - if response.status_code != 200: - logger.error(f"ElevenLabs returns response {response.status_code}") - async for chunk in response.aiter_bytes(): - await asyncio.sleep(0.1) - if tts_event.is_set(): - # stop streaming audio - break - await websocket.send_bytes(chunk) diff --git a/nextpy/ai/models/audio/text_to_speech/google_cloud_tts.py b/nextpy/ai/models/audio/text_to_speech/google_cloud_tts.py deleted file mode 100644 index 0abc11a1..00000000 --- a/nextpy/ai/models/audio/text_to_speech/google_cloud_tts.py +++ /dev/null @@ -1,100 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import asyncio -import base64 -import os -import types - -import google.auth.transport.requests -import httpx -from google.oauth2 import service_account - -from nextpy.ai.models.audio.text_to_speech.base import TextToSpeech -from nextpy.utils.logger import get_logger -from nextpy.utils.singleton import Singleton - -logger = get_logger(__name__) -DEBUG = False - -config = types.SimpleNamespace( - **{ - "url": "https://texttospeech.googleapis.com/v1/text:synthesize", - "headers": { - "Content-Type": "application/json", - }, - "data": { - "voice": { - "languageCode": "en-US", - "name": "en-US-Studio-M", - "ssmlGender": "NEUTRAL", - }, - "audioConfig": {"audioEncoding": "MP3"}, - }, - "service_account_file": os.getenv( - "GOOGLE_APPLICATION_CREDENTIALS", "default/path.json" - ), - } -) - - -class GoogleCloudTTS(Singleton, TextToSpeech): - def __init__(self): - super().__init__() - logger.info("Initializing [Google Cloud Text To Speech] voices...") - - # Load the service account key - credentials = service_account.Credentials.from_service_account_file( - config.service_account_file, - scopes=["https://www.googleapis.com/auth/cloud-platform"], - ) - - # Request an access token - auth_req = google.auth.transport.requests.Request() - credentials.refresh(auth_req) - - # Now credentials.valid is True and credentials.token contains the access token - self.access_token = credentials.token - - # Set the Authorization header with the access token - config.headers["Authorization"] = f"Bearer {self.access_token}" - - async def stream( - self, - text, - websocket, - tts_event: asyncio.Event, - voice_id="en-US-Standard-C", - first_sentence=False, - language="en-US", - ) -> None: - if DEBUG: - return - if voice_id == "": - logger.info( - "voice_id is not found in .env file, using Google default voice" - ) - voice_id = "en-US-Standard-C" - headers = config.headers - # For customized voices - - # if language != 'en-US': - # config.data["voice"]["languageCode"] = language - # config.data["voice"]["name"] = voice_id - data = { - "input": {"text": text}, - **config.data, - } - url = config.url - async with httpx.AsyncClient() as client: - response = await client.post(url, json=data, headers=headers) - # Google Cloud TTS API does not support streaming, we send the whole content at once - if response.status_code != 200: - logger.error( - f"Google Cloud TTS returns response {response.status_code}" - ) - else: - audio_content = response.content - # Decode the base64-encoded audio content - audio_content = base64.b64decode(audio_content) - await websocket.send_bytes(audio_content) diff --git a/nextpy/ai/models/audio/text_to_speech/unreal_speech.py b/nextpy/ai/models/audio/text_to_speech/unreal_speech.py deleted file mode 100644 index 7f09a651..00000000 --- a/nextpy/ai/models/audio/text_to_speech/unreal_speech.py +++ /dev/null @@ -1,56 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import asyncio -import types - -import httpx - -from nextpy.ai.models.audio.text_to_speech.base import TextToSpeech -from nextpy.utils.logger import get_logger -from nextpy.utils.singleton import Singleton - -logger = get_logger(__name__) -DEBUG = False - -config = types.SimpleNamespace( - **{ - "chunk_size": 1024, - "url": "https://lab.api.unrealspeech.com/stream", - "headers": { - "Accept": "audio/mpeg", - "Content-Type": "application/json", - }, - "data": { - "speed": -0.2, - }, - } -) - - -class UnrealSpeech(Singleton, TextToSpeech): - def __init__(self): - super().__init__() - logger.info("Initializing [Unreal Speech] voices...") - - async def stream( - self, text, websocket, tts_event: asyncio.Event, voice_id=5, *args, **kwargs - ) -> None: - if DEBUG: - return - params = { - "text": text, - "speaker_index": voice_id, - **config.data, - } - - async with httpx.AsyncClient() as client: - response = await client.get(config.url, params=params) - if response.status_code != 200: - logger.error(f"Unreal Speech returns response {response.status_code}") - async for chunk in response.aiter_bytes(): - await asyncio.sleep(0.1) - if tts_event.is_set(): - # stop streaming audio - break - await websocket.send_bytes(chunk) diff --git a/nextpy/ai/models/embedding/__init__.py b/nextpy/ai/models/embedding/__init__.py deleted file mode 100644 index bf61f8a2..00000000 --- a/nextpy/ai/models/embedding/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrappers around embedding modules.""" -import logging -from typing import Any - -from nextpy.ai.models.embedding.aleph_alpha import ( - AlephAlphaAsymmetricSemanticEmbedding, - AlephAlphaSymmetricSemanticEmbedding, -) -from nextpy.ai.models.embedding.bedrock import BedrockEmbeddings -from nextpy.ai.models.embedding.cohere import CohereEmbeddings -from nextpy.ai.models.embedding.dashscope import DashScopeEmbeddings -from nextpy.ai.models.embedding.deepinfra import DeepInfraEmbeddings -from nextpy.ai.models.embedding.elasticsearch import ElasticsearchEmbeddings -from nextpy.ai.models.embedding.embaas import EmbaasEmbeddings -from nextpy.ai.models.embedding.fake import FakeEmbeddings -from nextpy.ai.models.embedding.google_palm import GooglePalmEmbeddings -from nextpy.ai.models.embedding.huggingface import ( - HuggingFaceHubEmbeddings, - HuggingFaceInstructEmbeddings, - HuggingFaceSetenceTransformersEmbeddings, -) -from nextpy.ai.models.embedding.jina import JinaEmbeddings -from nextpy.ai.models.embedding.llamacpp import LlamaCppEmbeddings -from nextpy.ai.models.embedding.minimax import MiniMaxEmbeddings -from nextpy.ai.models.embedding.modelscopehub import ModelScopeEmbeddings -from nextpy.ai.models.embedding.mosaicml import MosaicMLInstructorEmbeddings -from nextpy.ai.models.embedding.openai import OpenAIEmbeddings -from nextpy.ai.models.embedding.tensorflowhub import TensorflowHubEmbeddings - -logger = logging.getLogger(__name__) - -__all__ = [ - "AlephAlphaAsymmetricSemanticEmbedding", - "AlephAlphaSymmetricSemanticEmbedding", - "BedrockEmbeddings", - "CohereEmbeddings", - "DashScopeEmbeddings", - "DeepInfraEmbeddings", - "ElasticsearchEmbeddings", - "EmbaasEmbeddings", - "FakeEmbeddings", - "GooglePalmEmbeddings", - "HuggingFaceSetenceTransformersEmbeddings", - 
"HuggingFaceInstructEmbeddings", - "HuggingFaceHubEmbeddings", - "JinaEmbeddings", - "LlamaCppEmbeddings", - "MiniMaxEmbeddings", - "ModelScopeEmbeddings", - "MosaicMLInstructorEmbeddings", - "OpenAIEmbeddings", - "TensorflowHubEmbeddings", -] diff --git a/nextpy/ai/models/embedding/aleph_alpha.py b/nextpy/ai/models/embedding/aleph_alpha.py deleted file mode 100644 index 4015f459..00000000 --- a/nextpy/ai/models/embedding/aleph_alpha.py +++ /dev/null @@ -1,183 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import Any, Dict, List, Optional -from pydantic import BaseModel, root_validator -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - - -class AlephAlphaAsymmetricSemanticEmbedding(BaseModel, Embeddings): - - client: Any #: :meta private: - - model: Optional[str] = "luminous-base" - hosting: Optional[str] = "https://api.aleph-alpha.com" - normalize: Optional[bool] = True - compress_to_size: Optional[int] = 128 - contextual_control_threshold: Optional[int] = None - control_log_additive: Optional[bool] = True - aleph_alpha_api_key: Optional[str] = None - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - aleph_alpha_api_key = get_from_dict_or_env( - values, "aleph_alpha_api_key", "ALEPH_ALPHA_API_KEY" - ) - try: - from aleph_alpha_client import Client - except ImportError: - raise ValueError( - "Could not import aleph_alpha_client python package. " - "Please install it with `pip install aleph_alpha_client`." - ) - values["client"] = Client(token=aleph_alpha_api_key) - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to Aleph Alpha's asymmetric Document endpoint. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - try: - from aleph_alpha_client import ( - Prompt, - SemanticEmbeddingRequest, - SemanticRepresentation, - ) - except ImportError: - raise ValueError( - "Could not import aleph_alpha_client python package. " - "Please install it with `pip install aleph_alpha_client`." - ) - document_embeddings = [] - - for text in texts: - document_params = { - "prompt": Prompt.from_text(text), - "representation": SemanticRepresentation.Document, - "compress_to_size": self.compress_to_size, - "normalize": self.normalize, - "contextual_control_threshold": self.contextual_control_threshold, - "control_log_additive": self.control_log_additive, - } - - document_request = SemanticEmbeddingRequest(**document_params) - document_response = self.client.semantic_embed( - request=document_request, model=self.model - ) - - document_embeddings.append(document_response.embedding) - - return document_embeddings - - def embed_query(self, text: str) -> List[float]: - """Call out to Aleph Alpha's asymmetric, query embedding endpoint - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - try: - from aleph_alpha_client import ( - Prompt, - SemanticEmbeddingRequest, - SemanticRepresentation, - ) - except ImportError: - raise ValueError( - "Could not import aleph_alpha_client python package. " - "Please install it with `pip install aleph_alpha_client`." 
- ) - symmetric_params = { - "prompt": Prompt.from_text(text), - "representation": SemanticRepresentation.Query, - "compress_to_size": self.compress_to_size, - "normalize": self.normalize, - "contextual_control_threshold": self.contextual_control_threshold, - "control_log_additive": self.control_log_additive, - } - - symmetric_request = SemanticEmbeddingRequest(**symmetric_params) - symmetric_response = self.client.semantic_embed( - request=symmetric_request, model=self.model - ) - - return symmetric_response.embedding - - -class AlephAlphaSymmetricSemanticEmbedding(AlephAlphaAsymmetricSemanticEmbedding): - """The symmetric version of the Aleph Alpha's semantic embeddings. - - The main difference is that here, both the documents and - queries are embedded with a SemanticRepresentation.Symmetric - Example: - .. code-block:: python - - from aleph_alpha import AlephAlphaSymmetricSemanticEmbedding - - embeddings = AlephAlphaAsymmetricSemanticEmbedding() - text = "This is a test text" - - doc_result = embeddings.embed_documents([text]) - query_result = embeddings.embed_query(text) - """ - - def _embed(self, text: str) -> List[float]: - try: - from aleph_alpha_client import ( - Prompt, - SemanticEmbeddingRequest, - SemanticRepresentation, - ) - except ImportError: - raise ValueError( - "Could not import aleph_alpha_client python package. " - "Please install it with `pip install aleph_alpha_client`." - ) - query_params = { - "prompt": Prompt.from_text(text), - "representation": SemanticRepresentation.Symmetric, - "compress_to_size": self.compress_to_size, - "normalize": self.normalize, - "contextual_control_threshold": self.contextual_control_threshold, - "control_log_additive": self.control_log_additive, - } - - query_request = SemanticEmbeddingRequest(**query_params) - query_response = self.client.semantic_embed( - request=query_request, model=self.model - ) - - return query_response.embedding - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to Aleph Alpha's Document endpoint. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - document_embeddings = [] - - for text in texts: - document_embeddings.append(self._embed(text)) - return document_embeddings - - def embed_query(self, text: str) -> List[float]: - """Call out to Aleph Alpha's asymmetric, query embedding endpoint - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - return self._embed(text) diff --git a/nextpy/ai/models/embedding/base.py b/nextpy/ai/models/embedding/base.py deleted file mode 100644 index 662b1a2a..00000000 --- a/nextpy/ai/models/embedding/base.py +++ /dev/null @@ -1,18 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Interface for embedding models.""" -from abc import ABC, abstractmethod -from typing import List - - -class Embeddings(ABC): - """Interface for embedding models.""" - - @abstractmethod - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed search docs.""" - - @abstractmethod - def embed_query(self, text: str) -> List[float]: - """Embed query text.""" diff --git a/nextpy/ai/models/embedding/bedrock.py b/nextpy/ai/models/embedding/bedrock.py deleted file mode 100644 index 01bce089..00000000 --- a/nextpy/ai/models/embedding/bedrock.py +++ /dev/null @@ -1,163 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import json -import os -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Extra, root_validator - -from nextpy.ai.models.embedding.base import Embeddings - - -class BedrockEmbeddings(BaseModel, Embeddings): - """Embeddings provider to invoke Bedrock embedding models. - - To authenticate, the AWS client uses the following methods to - automatically load credentials: - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html - - If a specific credential profile should be used, you must pass - the name of the profile from the ~/.aws/credentials file that is to be used. - - Make sure the credentials / roles used have the required policies to - access the Bedrock service. - """ - - """ - Example: - .. code-block:: python - - from nextpy.ai.bedrock_embedding import BedrockEmbeddings - - region_name ="us-east-1" - credentials_profile_name = "default" - model_id = "amazon.titan-e1t-medium" - - be = BedrockEmbeddings( - credentials_profile_name=credentials_profile_name, - region_name=region_name, - model_id=model_id - ) - """ - - client: Any #: :meta private: - - region_name: Optional[str] = None - """The aws region e.g., `us-west-2`. Fallsback to AWS_DEFAULT_REGION env variable - or region specified in ~/.aws/config in case it is not provided here. - """ - - credentials_profile_name: Optional[str] = None - """The name of the profile in the ~/.aws/credentials or ~/.aws/config files, which - has either access keys or role information specified. - If not specified, the default credential profile or, if on an EC2 instance, - credentials from IMDS will be used. 
- See: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html - """ - - model_id: str = "amazon.titan-e1t-medium" - """Id of the model to call, e.g., amazon.titan-e1t-medium, this is - equivalent to the modelId property in the list-foundation-models api""" - - model_kwargs: Optional[Dict] = None - """Key word arguments to pass to the model.""" - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that AWS credentials to and python package exists in environment.""" - if values["client"] is not None: - return values - - try: - import boto3 - - if values["credentials_profile_name"] is not None: - session = boto3.Session(profile_name=values["credentials_profile_name"]) - else: - # use default credentials - session = boto3.Session() - - client_params = {} - if values["region_name"]: - client_params["region_name"] = values["region_name"] - - values["client"] = session.client("bedrock", **client_params) - - except ImportError: - raise ModuleNotFoundError( - "Could not import boto3 python package. " - "Please install it with `pip install boto3`." - ) - except Exception as e: - raise ValueError( - "Could not load credentials to authenticate with AWS client. " - "Please check that credentials in the specified " - "profile name are valid." - ) from e - - return values - - def _embedding_func(self, text: str) -> List[float]: - """Call out to Bedrock embedding endpoint.""" - # replace newlines, which can negatively affect performance. - text = text.replace(os.linesep, " ") - _model_kwargs = self.model_kwargs or {} - - input_body = {**_model_kwargs} - input_body["inputText"] = text - body = json.dumps(input_body) - content_type = "application/json" - accepts = "application/json" - - embeddings = [] - try: - response = self.client.invoke_model( - body=body, - modelId=self.model_id, - accept=accepts, - contentType=content_type, - ) - response_body = json.loads(response.get("body").read()) - embeddings = response_body.get("embedding") - except Exception as e: - raise ValueError(f"Error raised by inference endpoint: {e}") - - return embeddings - - def embed_documents( - self, texts: List[str], chunk_size: int = 1 - ) -> List[List[float]]: - """Compute doc embeddings using a Bedrock model. - - Args: - texts: The list of texts to embed. - chunk_size: Bedrock currently only allows single string - inputs, so chunk size is always 1. This input is here - only for compatibility with the embeddings interface. - - - Returns: - List of embeddings, one for each text. - """ - results = [] - for text in texts: - response = self._embedding_func(text) - results.append(response) - return results - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a Bedrock model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - return self._embedding_func(text) diff --git a/nextpy/ai/models/embedding/cohere.py b/nextpy/ai/models/embedding/cohere.py deleted file mode 100644 index 573a75c4..00000000 --- a/nextpy/ai/models/embedding/cohere.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Wrapper around Cohere embedding models.""" -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Extra, root_validator - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - - -class CohereEmbeddings(BaseModel, Embeddings): - """Wrapper around Cohere embedding models. - - To use, you should have the ``cohere`` python package installed, and the - environment variable ``COHERE_API_KEY`` set with your API key or pass it - as a named parameter to the constructor. - - Example: - .. code-block:: python - - from nextpy.ai.models.embedding import CohereEmbeddings - cohere = CohereEmbeddings( - model="embed-english-light-v2.0", cohere_api_key="my-api-key" - ) - """ - - client: Any #: :meta private: - model: str = "embed-english-v2.0" - """Model name to use.""" - - truncate: Optional[str] = None - """Truncate embeddings that are too long from start or end ("NONE"|"START"|"END")""" - - cohere_api_key: Optional[str] = None - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - cohere_api_key = get_from_dict_or_env( - values, "cohere_api_key", "COHERE_API_KEY" - ) - try: - import cohere - - values["client"] = cohere.Client(cohere_api_key) - except ImportError: - raise ValueError( - "Could not import cohere python package. " - "Please install it with `pip install cohere`." - ) - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to Cohere's embedding endpoint. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - embeddings = self.client.embed( - model=self.model, texts=texts, truncate=self.truncate - ).embeddings - return [list(map(float, e)) for e in embeddings] - - def embed_query(self, text: str) -> List[float]: - """Call out to Cohere's embedding endpoint. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - embedding = self.client.embed( - model=self.model, texts=[text], truncate=self.truncate - ).embeddings[0] - return list(map(float, embedding)) diff --git a/nextpy/ai/models/embedding/dashscope.py b/nextpy/ai/models/embedding/dashscope.py deleted file mode 100644 index 655ef80e..00000000 --- a/nextpy/ai/models/embedding/dashscope.py +++ /dev/null @@ -1,156 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around DashScope embedding models.""" -import logging -from typing import ( - Any, - Callable, - Dict, - List, - Optional, -) - -from pydantic import BaseModel, Extra, root_validator -from requests.exceptions import HTTPError -from tenacity import ( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -logger = logging.getLogger(__name__) - - -class DashScopeEmbeddings(BaseModel, Embeddings): - """Wrapper around DashScope embedding models. 
- - To use, you should have the ``dashscope`` python package installed, and the - environment variable ``DASHSCOPE_API_KEY`` set with your API key or pass it - as a named parameter to the constructor. - - Example: - .. code-block:: python - - from nextpy.ai.models.embedding import DashScopeEmbeddings - embeddings = DashScopeEmbeddings(dashscope_api_key="my-api-key") - - Example: - .. code-block:: python - - import os - os.environ["DASHSCOPE_API_KEY"] = "your DashScope API KEY" - - from nextpy.ai.models.embeddings.dashscope import DashScopeEmbeddings - embeddings = DashScopeEmbeddings( - model="text-embedding-v1", - ) - text = "This is a test query." - query_result = embeddings.embed_query(text) - - """ - - client: Any #: :meta private: - model: str = "text-embedding-v1" - dashscope_api_key: Optional[str] = None - """Maximum number of retries to make when generating.""" - max_retries: int = 5 - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - import dashscope - - """Validate that api key and python package exists in environment.""" - values["dashscope_api_key"] = get_from_dict_or_env( - values, "dashscope_api_key", "DASHSCOPE_API_KEY" - ) - dashscope.api_key = values["dashscope_api_key"] - try: - import dashscope - - values["client"] = dashscope.TextEmbedding - except ImportError: - raise ImportError( - "Could not import dashscope python package. " - "Please install it with `pip install dashscope`." - ) - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to DashScope's embedding endpoint for embedding search docs. - - Args: - texts: The list of texts to embed. - chunk_size: The chunk size of embeddings. If None, will use the chunk size - specified by the class. - - Returns: - List of embeddings, one for each text. - """ - embeddings = embed_with_retry( - self, input=texts, text_type="document", model=self.model - ) - embedding_list = [item["embedding"] for item in embeddings] - return embedding_list - - def embed_query(self, text: str) -> List[float]: - """Call out to DashScope's embedding endpoint for embedding query text. - - Args: - text: The text to embed. - - Returns: - Embedding for the text. 
- """ - embedding = embed_with_retry( - self, input=text, text_type="query", model=self.model - )[0]["embedding"] - return embedding - - -def _create_retry_decorator(embeddings: DashScopeEmbeddings) -> Callable[[Any], Any]: - multiplier = 1 - min_seconds = 1 - max_seconds = 4 - # Wait 2^x * 1 second between each retry starting with - # 1 seconds, then up to 4 seconds, then 4 seconds afterwards - return retry( - reraise=True, - stop=stop_after_attempt(embeddings.max_retries), - wait=wait_exponential(multiplier, min=min_seconds, max=max_seconds), - retry=(retry_if_exception_type(HTTPError)), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - - -def embed_with_retry(embeddings: DashScopeEmbeddings, **kwargs: Any) -> Any: - """Use tenacity to retry the embedding call.""" - retry_decorator = _create_retry_decorator(embeddings) - - @retry_decorator - def _embed_with_retry(**kwargs: Any) -> Any: - resp = embeddings.client.call(**kwargs) - if resp.status_code == 200: - return resp.output["embeddings"] - elif resp.status_code in [400, 401]: - raise ValueError( - f"status_code: {resp.status_code} \n " - f"code: {resp.code} \n message: {resp.message}" - ) - else: - raise HTTPError( - f"HTTP error occurred: status_code: {resp.status_code} \n " - f"code: {resp.code} \n message: {resp.message}" - ) - - return _embed_with_retry(**kwargs) diff --git a/nextpy/ai/models/embedding/deepinfra.py b/nextpy/ai/models/embedding/deepinfra.py deleted file mode 100644 index a51c3066..00000000 --- a/nextpy/ai/models/embedding/deepinfra.py +++ /dev/null @@ -1,132 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import Any, Dict, List, Mapping, Optional - -import requests -from pydantic import BaseModel, Extra, root_validator - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -DEFAULT_MODEL_ID = "sentence-transformers/clip-ViT-B-32" - - -class DeepInfraEmbeddings(BaseModel, Embeddings): - """Wrapper around Deep Infra's embedding inference service. - - To use, you should have the - environment variable ``DEEPINFRA_API_TOKEN`` set with your API token, or pass - it as a named parameter to the constructor. - There are multiple embedding models available, - see https://deepinfra.com/models?type=embeddings. - - Example: - .. 
code-block:: python - - from nextpy.ai.models.embeddings import DeepInfraEmbeddings - deepinfra_emb = DeepInfraEmbeddings( - model_id="sentence-transformers/clip-ViT-B-32", - deepinfra_api_token="my-api-key" - ) - r1 = deepinfra_emb.embed_documents( - [ - "Alpha is the first letter of Greek alphabet", - "Beta is the second letter of Greek alphabet", - ] - ) - r2 = deepinfra_emb.embed_query( - "What is the second letter of Greek alphabet" - ) - - """ - - model_id: str = DEFAULT_MODEL_ID - """Embeddings model to use.""" - normalize: bool = False - """whether to normalize the computed embeddings""" - embed_instruction: str = "passage: " - """Instruction used to embed documents.""" - query_instruction: str = "query: " - """Instruction used to embed the query.""" - model_kwargs: Optional[dict] = None - """Other model keyword args""" - - deepinfra_api_token: Optional[str] = None - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - deepinfra_api_token = get_from_dict_or_env( - values, "deepinfra_api_token", "DEEPINFRA_API_TOKEN" - ) - values["deepinfra_api_token"] = deepinfra_api_token - return values - - @property - def _identifying_params(self) -> Mapping[str, Any]: - """Get the identifying parameters.""" - return {"model_id": self.model_id} - - def _embed(self, input: List[str]) -> List[List[float]]: - _model_kwargs = self.model_kwargs or {} - # HTTP headers for authorization - headers = { - "Authorization": f"bearer {self.deepinfra_api_token}", - "Content-Type": "application/json", - } - # send request - try: - res = requests.post( - f"https://api.deepinfra.com/v1/inference/{self.model_id}", - headers=headers, - json={"inputs": input, "normalize": self.normalize, **_model_kwargs}, - ) - except requests.exceptions.RequestException as e: - raise ValueError(f"Error raised by inference endpoint: {e}") - - if res.status_code != 200: - raise ValueError( - "Error raised by inference API HTTP code: %s, %s" - % (res.status_code, res.text) - ) - try: - t = res.json() - embeddings = t["embeddings"] - except requests.exceptions.JSONDecodeError as e: - raise ValueError( - f"Error raised by inference API: {e}.\nResponse: {res.text}" - ) - - return embeddings - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed documents using a Deep Infra deployed embedding model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - instruction_pairs = [f"{self.query_instruction}{text}" for text in texts] - embeddings = self._embed(instruction_pairs) - return embeddings - - def embed_query(self, text: str) -> List[float]: - """Embed a query using a Deep Infra deployed embedding model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - instruction_pair = f"{self.query_instruction}{text}" - embedding = self._embed([instruction_pair])[0] - return embedding diff --git a/nextpy/ai/models/embedding/elasticsearch.py b/nextpy/ai/models/embedding/elasticsearch.py deleted file mode 100644 index 9f95a973..00000000 --- a/nextpy/ai/models/embedding/elasticsearch.py +++ /dev/null @@ -1,219 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, List, Optional - -from nextpy.utils.data_ops import get_from_dict_or_env - -if TYPE_CHECKING: - from elasticsearch import Elasticsearch - -from nextpy.ai.models.embedding.base import Embeddings - - -class ElasticsearchEmbeddings(Embeddings): - """Wrapper around Elasticsearch embedding models. - - This class provides an interface to generate embedding using a model deployed - in an Elasticsearch cluster. It requires an Elasticsearch connection object - and the model_id of the model deployed in the cluster. - - In Elasticsearch you need to have an embedding model loaded and deployed. - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-trained-model.html - - https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-deploy-models.html - """ # noqa: E501 - - def __init__( - self, - client: Any, - model_id: str, - *, - input_field: str = "text_field", - ): - """Initialize the ElasticsearchEmbeddings instance. - - Args: - client (MlClient): An Elasticsearch ML client object. - model_id (str): The model_id of the model deployed in the Elasticsearch - cluster. - input_field (str): The name of the key for the input text field in the - document. Defaults to 'text_field'. - """ - self.client = client - self.model_id = model_id - self.input_field = input_field - - @classmethod - def from_credentials( - cls, - model_id: str, - *, - es_cloud_id: Optional[str] = None, - es_user: Optional[str] = None, - es_password: Optional[str] = None, - input_field: str = "text_field", - ) -> ElasticsearchEmbeddings: - """Instantiate embeddings from Elasticsearch credentials. - - Args: - model_id (str): The model_id of the model deployed in the Elasticsearch - cluster. - input_field (str): The name of the key for the input text field in the - document. Defaults to 'text_field'. - es_cloud_id: (str, optional): The Elasticsearch cloud ID to connect to. - es_user: (str, optional): Elasticsearch username. - es_password: (str, optional): Elasticsearch password. - - Example: - .. code-block:: python - - from langchain.embeddings import ElasticsearchEmbeddings - - # Define the model ID and input field name (if different from default) - model_id = "your_model_id" - # Optional, only if different from 'text_field' - input_field = "your_input_field" - - # Credentials can be passed in two ways. Either set the env vars - # ES_CLOUD_ID, ES_USER, ES_PASSWORD and they will be automatically - # pulled in, or pass them in directly as kwargs. 
- embeddings = ElasticsearchEmbeddings.from_credentials( - model_id, - input_field=input_field, - # es_cloud_id="foo", - # es_user="bar", - # es_password="baz", - ) - - documents = [ - "This is an example document.", - "Another example document to generate embeddings for.", - ] - embeddings_generator.embed_documents(documents) - """ - try: - from elasticsearch import Elasticsearch - from elasticsearch.client import MlClient - except ImportError: - raise ImportError( - "elasticsearch package not found, please install with 'pip install " - "elasticsearch'" - ) - - es_cloud_id = es_cloud_id or get_from_dict_or_env("es_cloud_id", "ES_CLOUD_ID") - es_user = es_user or get_from_dict_or_env("es_user", "ES_USER") - es_password = es_password or get_from_dict_or_env("es_password", "ES_PASSWORD") - - # Connect to Elasticsearch - es_connection = Elasticsearch( - cloud_id=es_cloud_id, basic_auth=(es_user, es_password) - ) - client = MlClient(es_connection) - return cls(client, model_id, input_field=input_field) - - @classmethod - def from_es_connection( - cls, - model_id: str, - es_connection: Elasticsearch, - input_field: str = "text_field", - ) -> ElasticsearchEmbeddings: - """Instantiate embeddings from an existing Elasticsearch connection. - - This method provides a way to create an instance of the ElasticsearchEmbeddings - class using an existing Elasticsearch connection. The connection object is used - to create an MlClient, which is then used to initialize the - ElasticsearchEmbeddings instance. - - Args: - model_id (str): The model_id of the model deployed in the Elasticsearch cluster. - es_connection (elasticsearch.Elasticsearch): An existing Elasticsearch - connection object. input_field (str, optional): The name of the key for the - input text field in the document. Defaults to 'text_field'. - - Returns: - ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class. - - Example: - .. code-block:: python - - from elasticsearch import Elasticsearch - - from langchain.embeddings import ElasticsearchEmbeddings - - # Define the model ID and input field name (if different from default) - model_id = "your_model_id" - # Optional, only if different from 'text_field' - input_field = "your_input_field" - - # Create Elasticsearch connection - es_connection = Elasticsearch( - hosts=["localhost:9200"], http_auth=("user", "password") - ) - - # Instantiate ElasticsearchEmbeddings using the existing connection - embeddings = ElasticsearchEmbeddings.from_es_connection( - model_id, - es_connection, - input_field=input_field, - ) - - documents = [ - "This is an example document.", - "Another example document to generate embeddings for.", - ] - embeddings_generator.embed_documents(documents) - """ - # Importing MlClient from elasticsearch.client within the method to - # avoid unnecessary import if the method is not used - from elasticsearch.client import MlClient - - # Create an MlClient from the given Elasticsearch connection - client = MlClient(es_connection) - - # Return a new instance of the ElasticsearchEmbeddings class with - # the MlClient, model_id, and input_field - return cls(client, model_id, input_field=input_field) - - def _embedding_func(self, texts: List[str]) -> List[List[float]]: - """Generate embeddings for the given texts using the Elasticsearch model. - - Args: - texts (List[str]): A list of text strings to generate embeddings for. - - Returns: - List[List[float]]: A list of embeddings, one for each text in the input - list. 
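The body of `_embedding_func` (just below) pulls one `predicted_value` vector per input document out of the trained-model inference response. Roughly this shape is expected (illustrative values, not captured from a live cluster):

```python
# Hypothetical request/response pair for MlClient.infer_trained_model, matching what
# _embedding_func reads: one "predicted_value" vector per input document.
docs = [{"text_field": "This is an example document."}]
response = {
    "inference_results": [
        {"predicted_value": [0.12, -0.03, 0.48]},  # truncated vector, for illustration
    ]
}
embeddings = [doc["predicted_value"] for doc in response["inference_results"]]
```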
- """ - response = self.client.infer_trained_model( - model_id=self.model_id, docs=[{self.input_field: text} for text in texts] - ) - - embeddings = [doc["predicted_value"] for doc in response["inference_results"]] - return embeddings - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Generate embeddings for a list of documents. - - Args: - texts (List[str]): A list of document text strings to generate embeddings - for. - - Returns: - List[List[float]]: A list of embeddings, one for each document in the input - list. - """ - return self._embedding_func(texts) - - def embed_query(self, text: str) -> List[float]: - """Generate an embedding for a single query text. - - Args: - text (str): The query text to generate an embedding for. - - Returns: - List[float]: The embedding for the input query text. - """ - return self._embedding_func([text])[0] diff --git a/nextpy/ai/models/embedding/embaas.py b/nextpy/ai/models/embedding/embaas.py deleted file mode 100644 index c1d58346..00000000 --- a/nextpy/ai/models/embedding/embaas.py +++ /dev/null @@ -1,142 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around embaas embeddings API.""" -from typing import Any, Dict, List, Mapping, Optional - -import requests -from pydantic import BaseModel, Extra, root_validator -from typing_extensions import NotRequired, TypedDict - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -# Currently supported maximum batch size for embedding requests -MAX_BATCH_SIZE = 256 -EMBAAS_API_URL = "https://api.embaas.io/v1/embeddings/" - - -class EmbaasEmbeddingsPayload(TypedDict): - """Payload for the embaas embeddings API.""" - - model: str - texts: List[str] - instruction: NotRequired[str] - - -class EmbaasEmbeddings(BaseModel, Embeddings): - """Wrapper around embaas's embedding service. - - To use, you should have the - environment variable ``EMBAAS_API_KEY`` set with your API key, or pass - it as a named parameter to the constructor. - - Example: - .. 
code-block:: python - - # Initialise with default model and instruction - from nextpy.ai.models.embeddings import EmbaasEmbeddings - emb = EmbaasEmbeddings() - - # Initialise with custom model and instruction - from nextpy.ai.models.embeddings import EmbaasEmbeddings - emb_model = "instructor-large" - emb_inst = "Represent the Wikipedia document for retrieval" - emb = EmbaasEmbeddings( - model=emb_model, - instruction=emb_inst - ) - """ - - model: str = "e5-large-v2" - """The model used for embeddings.""" - instruction: Optional[str] = None - """Instruction used for domain-specific embeddings.""" - api_url: str = EMBAAS_API_URL - """The URL for the embaas embeddings API.""" - embaas_api_key: Optional[str] = None - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - embaas_api_key = get_from_dict_or_env( - values, "embaas_api_key", "EMBAAS_API_KEY" - ) - values["embaas_api_key"] = embaas_api_key - return values - - @property - def _identifying_params(self) -> Mapping[str, Any]: - """Get the identifying params.""" - return {"model": self.model, "instruction": self.instruction} - - def _generate_payload(self, texts: List[str]) -> EmbaasEmbeddingsPayload: - """Generates payload for the API request.""" - payload = EmbaasEmbeddingsPayload(texts=texts, model=self.model) - if self.instruction: - payload["instruction"] = self.instruction - return payload - - def _handle_request(self, payload: EmbaasEmbeddingsPayload) -> List[List[float]]: - """Sends a request to the Embaas API and handles the response.""" - headers = { - "Authorization": f"Bearer {self.embaas_api_key}", - "Content-Type": "application/json", - } - - response = requests.post(self.api_url, headers=headers, json=payload) - response.raise_for_status() - - parsed_response = response.json() - embeddings = [item["embedding"] for item in parsed_response["data"]] - - return embeddings - - def _generate_embeddings(self, texts: List[str]) -> List[List[float]]: - """Generate embeddings using the Embaas API.""" - payload = self._generate_payload(texts) - try: - return self._handle_request(payload) - except requests.exceptions.RequestException as e: - if e.response is None or not e.response.text: - raise ValueError(f"Error raised by embaas embeddings API: {e}") - - parsed_response = e.response.json() - if "message" in parsed_response: - raise ValueError( - "Validation Error raised by embaas embeddings API:" - f"{parsed_response['message']}" - ) - raise - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Get embeddings for a list of texts. - - Args: - texts: The list of texts to get embeddings for. - - Returns: - List of embeddings, one for each text. - """ - batches = [ - texts[i : i + MAX_BATCH_SIZE] for i in range(0, len(texts), MAX_BATCH_SIZE) - ] - embeddings = [self._generate_embeddings(batch) for batch in batches] - # flatten the list of lists into a single list - return [embedding for batch in embeddings for embedding in batch] - - def embed_query(self, text: str) -> List[float]: - """Get embeddings for a single text. - - Args: - text: The text to get embeddings for. - - Returns: - List of embeddings. 
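A sketch of the request and response shapes implied by `_generate_payload` and `_handle_request` above (illustrative values, derived from the code rather than the embaas documentation):

```python
# Payload built by _generate_payload for one batch (at most MAX_BATCH_SIZE = 256 texts).
payload = {
    "model": "e5-large-v2",
    "texts": ["first text", "second text"],
    "instruction": "Represent the Wikipedia document for retrieval",  # included only if set
}

# Response layout that _handle_request expects: one "embedding" entry per input text.
parsed_response = {"data": [{"embedding": [0.1, 0.2]}, {"embedding": [0.3, 0.4]}]}
embeddings = [item["embedding"] for item in parsed_response["data"]]
```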
- """ - return self.embed_documents([text])[0] diff --git a/nextpy/ai/models/embedding/fake.py b/nextpy/ai/models/embedding/fake.py deleted file mode 100644 index aa24f3f0..00000000 --- a/nextpy/ai/models/embedding/fake.py +++ /dev/null @@ -1,22 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import List - -import numpy as np -from pydantic import BaseModel - -from nextpy.ai.models.embedding.base import Embeddings - - -class FakeEmbeddings(Embeddings, BaseModel): - size: int - - def _get_embedding(self) -> List[float]: - return list(np.random.normal(size=self.size)) - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - return [self._get_embedding() for _ in texts] - - def embed_query(self, text: str) -> List[float]: - return self._get_embedding() diff --git a/nextpy/ai/models/embedding/google_palm.py b/nextpy/ai/models/embedding/google_palm.py deleted file mode 100644 index 0befcc13..00000000 --- a/nextpy/ai/models/embedding/google_palm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import logging -from typing import Any, Callable, Dict, List, Optional - -from pydantic import BaseModel, root_validator -from tenacity import ( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -logger = logging.getLogger(__name__) - - -class GooglePalmEmbeddings(BaseModel, Embeddings): - client: Any - google_api_key: Optional[str] - model_name: str = "models/embedding-gecko-001" - """Model name to use.""" - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate api key, python package exists.""" - google_api_key = get_from_dict_or_env( - values, "google_api_key", "GOOGLE_API_KEY" - ) - try: - import google.generativeai as genai - - genai.configure(api_key=google_api_key) - except ImportError: - raise ImportError("Could not import google.generativeai python package.") - - values["client"] = genai - - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - return [self.embed_query(text) for text in texts] - - def embed_query(self, text: str) -> List[float]: - """Embed query text.""" - embedding = embed_with_retry(self, self.model_name, text) - return embedding["embedding"] - - -def _create_retry_decorator() -> Callable[[Any], Any]: - """Returns a tenacity retry decorator, preconfigured to handle PaLM exceptions.""" - import google.api_core.exceptions - - multiplier = 2 - min_seconds = 1 - max_seconds = 60 - max_retries = 10 - - return retry( - reraise=True, - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=multiplier, min=min_seconds, max=max_seconds), - retry=( - retry_if_exception_type(google.api_core.exceptions.ResourceExhausted) - | retry_if_exception_type(google.api_core.exceptions.ServiceUnavailable) - | retry_if_exception_type(google.api_core.exceptions.GoogleAPIError) - ), - before_sleep=before_sleep_log(logger, 
logging.WARNING), - ) - - -def embed_with_retry( - embeddings: GooglePalmEmbeddings, *args: Any, **kwargs: Any -) -> Any: - """Use tenacity to retry the completion call.""" - retry_decorator = _create_retry_decorator() - - @retry_decorator - def _embed_with_retry(*args: Any, **kwargs: Any) -> Any: - return embeddings.client.generate_embeddings(*args, **kwargs) - - return _embed_with_retry(*args, **kwargs) diff --git a/nextpy/ai/models/embedding/huggingface.py b/nextpy/ai/models/embedding/huggingface.py deleted file mode 100644 index a270f7c1..00000000 --- a/nextpy/ai/models/embedding/huggingface.py +++ /dev/null @@ -1,274 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around HuggingFace embedding models: hub, sentence-transformers and instruct embeddings.""" -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Extra, Field, root_validator - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" -DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large" -DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: " -DEFAULT_QUERY_INSTRUCTION = ( - "Represent the question for retrieving supporting documents: " -) - -DEFAULT_REPO_ID = "sentence-transformers/all-mpnet-base-v2" -VALID_TASKS = ("feature-extraction",) - - -class HuggingFaceHubEmbeddings(BaseModel, Embeddings): - """Wrapper around HuggingFaceHub embedding models. - - To use, you should have the ``huggingface_hub`` python package installed, and the - environment variable ``HUGGINGFACEHUB_API_TOKEN`` set with your API token, or pass - it as a named parameter to the constructor. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import HuggingFaceHubEmbeddings - repo_id = "sentence-transformers/all-mpnet-base-v2" - hf = HuggingFaceHubEmbeddings( - repo_id=repo_id, - task="feature-extraction", - huggingfacehub_api_token="my-api-key", - ) - """ - - client: Any #: :meta private: - repo_id: str = DEFAULT_REPO_ID - """Model name to use.""" - task: Optional[str] = "feature-extraction" - """Task to call the model with.""" - model_kwargs: Optional[dict] = None - """Key word arguments to pass to the model.""" - - huggingfacehub_api_token: Optional[str] = None - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - huggingfacehub_api_token = get_from_dict_or_env( - values, "huggingfacehub_api_token", "HUGGINGFACEHUB_API_TOKEN" - ) - try: - from huggingface_hub.inference_api import InferenceApi - - repo_id = values["repo_id"] - if not repo_id.startswith("sentence-transformers"): - raise ValueError( - "Currently only 'sentence-transformers' embedding models " - f"are supported. Got invalid 'repo_id' {repo_id}." 
- ) - client = InferenceApi( - repo_id=repo_id, - token=huggingfacehub_api_token, - task=values.get("task"), - ) - if client.task not in VALID_TASKS: - raise ValueError( - f"Got invalid task {client.task}, " - f"currently only {VALID_TASKS} are supported" - ) - values["client"] = client - except ImportError: - raise ValueError( - "Could not import huggingface_hub python package. " - "Please install it with `pip install huggingface_hub`." - ) - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to HuggingFaceHub's embedding endpoint for embedding search docs. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - # replace newlines, which can negatively affect performance. - texts = [text.replace("\n", " ") for text in texts] - _model_kwargs = self.model_kwargs or {} - responses = self.client(inputs=texts, params=_model_kwargs) - return responses - - def embed_query(self, text: str) -> List[float]: - """Call out to HuggingFaceHub's embedding endpoint for embedding query text. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - response = self.embed_documents([text])[0] - return response - - -class HuggingFaceSetenceTransformersEmbeddings(BaseModel, Embeddings): - """Wrapper around sentence_transformers embedding models. - - To use, you should have the ``sentence_transformers`` python package installed. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import HuggingFaceEmbeddings - - model_name = "sentence-transformers/all-mpnet-base-v2" - model_kwargs = {'device': 'cpu'} - encode_kwargs = {'normalize_embeddings': False} - hf = HuggingFaceEmbeddings( - model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs - ) - """ - - client: Any #: :meta private: - model_name: str = DEFAULT_MODEL_NAME - """Model name to use.""" - cache_folder: Optional[str] = None - """Path to store models. - Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" - model_kwargs: Dict[str, Any] = Field(default_factory=dict) - """Key word arguments to pass to the model.""" - encode_kwargs: Dict[str, Any] = Field(default_factory=dict) - """Key word arguments to pass when calling the `encode` method of the model.""" - - def __init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - super().__init__(**kwargs) - try: - import sentence_transformers - - except ImportError as exc: - raise ImportError( - "Could not import sentence_transformers python package. " - "Please install it with `pip install sentence_transformers`." - ) from exc - - self.client = sentence_transformers.SentenceTransformer( - self.model_name, cache_folder=self.cache_folder, **self.model_kwargs - ) - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - texts = list(map(lambda x: x.replace("\n", " "), texts)) - embeddings = self.client.encode(texts, **self.encode_kwargs) - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a HuggingFace transformer model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. 
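Stripped of the pydantic plumbing, the sentence-transformers wrapper above reduces to direct calls into `sentence_transformers` — a sketch, assuming `pip install sentence-transformers`:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

texts = ["First document,\nwith a newline.", "Second document."]
doc_vecs = model.encode([t.replace("\n", " ") for t in texts]).tolist()  # what embed_documents returns
query_vec = model.encode("What is in the first document?").tolist()     # what embed_query returns
```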
- """ - text = text.replace("\n", " ") - embedding = self.client.encode(text, **self.encode_kwargs) - return embedding.tolist() - - -class HuggingFaceInstructEmbeddings(BaseModel, Embeddings): - """Wrapper around sentence_transformers embedding models. - - To use, you should have the ``sentence_transformers`` - and ``InstructorEmbedding`` python packages installed. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import HuggingFaceInstructEmbeddings - - model_name = "hkunlp/instructor-large" - model_kwargs = {'device': 'cpu'} - encode_kwargs = {'normalize_embeddings': True} - hf = HuggingFaceInstructEmbeddings( - model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs - ) - """ - - client: Any #: :meta private: - model_name: str = DEFAULT_INSTRUCT_MODEL - """Model name to use.""" - cache_folder: Optional[str] = None - """Path to store models. - Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" - model_kwargs: Dict[str, Any] = Field(default_factory=dict) - """Key word arguments to pass to the model.""" - encode_kwargs: Dict[str, Any] = Field(default_factory=dict) - """Key word arguments to pass when calling the `encode` method of the model.""" - embed_instruction: str = DEFAULT_EMBED_INSTRUCTION - """Instruction to use for embedding documents.""" - query_instruction: str = DEFAULT_QUERY_INSTRUCTION - """Instruction to use for embedding query.""" - - def __init__(self, **kwargs: Any): - """Initialize the sentence_transformer.""" - super().__init__(**kwargs) - try: - from InstructorEmbedding import INSTRUCTOR - - self.client = INSTRUCTOR( - self.model_name, cache_folder=self.cache_folder, **self.model_kwargs - ) - except ImportError as e: - raise ValueError("Dependencies for InstructorEmbedding not found.") from e - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace instruct model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - instruction_pairs = [[self.embed_instruction, text] for text in texts] - embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs) - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a HuggingFace instruct model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - instruction_pair = [self.query_instruction, text] - embedding = self.client.encode([instruction_pair], **self.encode_kwargs)[0] - return embedding.tolist() diff --git a/nextpy/ai/models/embedding/jina.py b/nextpy/ai/models/embedding/jina.py deleted file mode 100644 index 92779714..00000000 --- a/nextpy/ai/models/embedding/jina.py +++ /dev/null @@ -1,101 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import os -from typing import Any, Dict, List, Optional - -import requests -from pydantic import BaseModel, root_validator - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - - -class JinaEmbeddings(BaseModel, Embeddings): - client: Any #: :meta private: - - model_name: str = "ViT-B-32::openai" - """Model name to use.""" - - jina_auth_token: Optional[str] = None - jina_api_url: str = "https://api.clip.jina.ai/api/v1/models/" - request_headers: Optional[dict] = None - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that auth token exists in environment.""" - # Set Auth - jina_auth_token = get_from_dict_or_env( - values, "jina_auth_token", "JINA_AUTH_TOKEN" - ) - values["jina_auth_token"] = jina_auth_token - values["request_headers"] = (("authorization", jina_auth_token),) - - # Test that package is installed - try: - import jina - except ImportError: - raise ImportError( - "Could not import `jina` python package. " - "Please install it with `pip install jina`." - ) - - # Setup client - jina_api_url = os.environ.get("JINA_API_URL", values["jina_api_url"]) - model_name = values["model_name"] - try: - resp = requests.get( - jina_api_url + f"?model_name={model_name}", - headers={"Authorization": jina_auth_token}, - ) - - if resp.status_code == 401: - raise ValueError( - "The given Jina auth token is invalid. " - "Please check your Jina auth token." - ) - elif resp.status_code == 404: - raise ValueError( - f"The given model name `{model_name}` is not valid. " - f"Please go to https://cloud.jina.ai/user/inference " - f"and create a model with the given model name." - ) - resp.raise_for_status() - - endpoint = resp.json()["endpoints"]["grpc"] - values["client"] = jina.Client(host=endpoint) - except requests.exceptions.HTTPError as err: - raise ValueError(f"Error: {err!r}") - return values - - def _post(self, docs: List[Any], **kwargs: Any) -> Any: - payload = dict(inputs=docs, metadata=self.request_headers, **kwargs) - return self.client.post(on="/encode", **payload) - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Call out to Jina's embedding endpoint. - Args: - texts: The list of texts to embed. - - Returns: - List of embedding, one for each text. - """ - from docarray import Document, DocumentArray - - embeddings = self._post( - docs=DocumentArray([Document(text=t) for t in texts]) - ).embeddings - return [list(map(float, e)) for e in embeddings] - - def embed_query(self, text: str) -> List[float]: - """Call out to Jina's embedding endpoint. - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - from docarray import Document, DocumentArray - - embedding = self._post(docs=DocumentArray([Document(text=text)])).embeddings[0] - return list(map(float, embedding)) diff --git a/nextpy/ai/models/embedding/llamacpp.py b/nextpy/ai/models/embedding/llamacpp.py deleted file mode 100644 index 289834f3..00000000 --- a/nextpy/ai/models/embedding/llamacpp.py +++ /dev/null @@ -1,127 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
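Stepping back to `JinaEmbeddings` above, which ships without a usage example of its own — a minimal sketch (illustrative only; constructing the object performs a live model lookup against the Jina API and needs `pip install jina docarray` plus a valid auth token):

```python
from nextpy.ai.models.embedding.jina import JinaEmbeddings  # path of the deleted module

emb = JinaEmbeddings(model_name="ViT-B-32::openai", jina_auth_token="my-jina-token")
doc_vectors = emb.embed_documents(["first text", "second text"])
query_vector = emb.embed_query("first text")
```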
- -"""Wrapper around llama.cpp embedding models.""" -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Extra, Field, root_validator - -from nextpy.ai.models.embedding.base import Embeddings - - -class LlamaCppEmbeddings(BaseModel, Embeddings): - """Wrapper around llama.cpp embedding models. - - To use, you should have the llama-cpp-python library installed, and provide the - path to the Llama model as a named parameter to the constructor. - Check out: https://github.com/abetlen/llama-cpp-python - - Example: - .. code-block:: python - - from nextpy.ai.embedding import LlamaCppEmbeddings - llama = LlamaCppEmbeddings(model_path="/path/to/model.bin") - """ - - client: Any #: :meta private: - model_path: str - - n_ctx: int = Field(512, alias="n_ctx") - """Token context window.""" - - n_parts: int = Field(-1, alias="n_parts") - """Number of parts to split the model into. - If -1, the number of parts is automatically determined.""" - - seed: int = Field(-1, alias="seed") - """Seed. If -1, a random seed is used.""" - - f16_kv: bool = Field(False, alias="f16_kv") - """Use half-precision for key/value cache.""" - - logits_all: bool = Field(False, alias="logits_all") - """Return logits for all tokens, not just the last token.""" - - vocab_only: bool = Field(False, alias="vocab_only") - """Only load the vocabulary, no weights.""" - - use_mlock: bool = Field(False, alias="use_mlock") - """Force system to keep model in RAM.""" - - n_threads: Optional[int] = Field(None, alias="n_threads") - """Number of threads to use. If None, the number - of threads is automatically determined.""" - - n_batch: Optional[int] = Field(8, alias="n_batch") - """Number of tokens to process in parallel. - Should be a number between 1 and n_ctx.""" - - n_gpu_layers: Optional[int] = Field(None, alias="n_gpu_layers") - """Number of layers to be loaded into gpu memory. Default None.""" - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that llama-cpp-python library is installed.""" - model_path = values["model_path"] - model_param_names = [ - "n_ctx", - "n_parts", - "seed", - "f16_kv", - "logits_all", - "vocab_only", - "use_mlock", - "n_threads", - "n_batch", - ] - model_params = {k: values[k] for k in model_param_names} - # For backwards compatibility, only include if non-null. - if values["n_gpu_layers"] is not None: - model_params["n_gpu_layers"] = values["n_gpu_layers"] - - try: - from llama_cpp import Llama - - values["client"] = Llama(model_path, embedding=True, **model_params) - except ImportError: - raise ModuleNotFoundError( - "Could not import llama-cpp-python library. " - "Please install the llama-cpp-python library to " - "use this embedding model: pip install llama-cpp-python" - ) - except Exception as e: - raise ValueError( - f"Could not load Llama model from path: {model_path}. " - f"Received error {e}" - ) - - return values - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed a list of documents using the Llama model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - embeddings = [self.client.embed(text) for text in texts] - return [list(map(float, e)) for e in embeddings] - - def embed_query(self, text: str) -> List[float]: - """Embed a query using the Llama model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. 
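Beyond the minimal docstring example, the fields above suggest a configuration along these lines (a sketch; the model path is a placeholder and `pip install llama-cpp-python` is assumed):

```python
from nextpy.ai.models.embedding.llamacpp import LlamaCppEmbeddings  # path of the deleted module

llama = LlamaCppEmbeddings(
    model_path="/path/to/model.bin",  # placeholder
    n_ctx=2048,        # token context window
    n_batch=8,         # tokens processed in parallel; keep <= n_ctx
    n_gpu_layers=0,    # set > 0 to offload layers to GPU memory
)
doc_vectors = llama.embed_documents(["first document", "second document"])
query_vector = llama.embed_query("a test query")
```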
- """ - embedding = self.client.embed(text) - return list(map(float, embedding)) diff --git a/nextpy/ai/models/embedding/minimax.py b/nextpy/ai/models/embedding/minimax.py deleted file mode 100644 index e9a7d7a2..00000000 --- a/nextpy/ai/models/embedding/minimax.py +++ /dev/null @@ -1,164 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from __future__ import annotations - -import logging -from typing import Any, Callable, Dict, List, Optional - -import requests -from pydantic import BaseModel, Extra, root_validator -from tenacity import ( - before_sleep_log, - retry, - stop_after_attempt, - wait_exponential, -) - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -logger = logging.getLogger(__name__) - - -def _create_retry_decorator() -> Callable[[Any], Any]: - """Returns a tenacity retry decorator.""" - multiplier = 1 - min_seconds = 1 - max_seconds = 4 - max_retries = 6 - - return retry( - reraise=True, - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=multiplier, min=min_seconds, max=max_seconds), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - - -def embed_with_retry(embeddings: MiniMaxEmbeddings, *args: Any, **kwargs: Any) -> Any: - """Use tenacity to retry the completion call.""" - retry_decorator = _create_retry_decorator() - - @retry_decorator - def _embed_with_retry(*args: Any, **kwargs: Any) -> Any: - return embeddings.embed(*args, **kwargs) - - return _embed_with_retry(*args, **kwargs) - - -class MiniMaxEmbeddings(BaseModel, Embeddings): - """Wrapper around MiniMax's embedding inference service. - - To use, you should have the environment variable ``MINIMAX_GROUP_ID`` and - ``MINIMAX_API_KEY`` set with your API token, or pass it as a named parameter to - the constructor. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import MiniMaxEmbeddings - embeddings = MiniMaxEmbeddings() - - query_text = "This is a test query." - query_result = embeddings.embed_query(query_text) - - document_text = "This is a test document." 
- document_result = embeddings.embed_documents([document_text]) - - """ - - endpoint_url: str = "https://api.minimax.chat/v1/embeddings" - """Endpoint URL to use.""" - model: str = "embo-01" - """Embeddings model name to use.""" - embed_type_db: str = "db" - """For embed_documents""" - embed_type_query: str = "query" - """For embed_query""" - - minimax_group_id: Optional[str] = None - """Group ID for MiniMax API.""" - minimax_api_key: Optional[str] = None - """API Key for MiniMax API.""" - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that group id and api key exists in environment.""" - minimax_group_id = get_from_dict_or_env( - values, "minimax_group_id", "MINIMAX_GROUP_ID" - ) - minimax_api_key = get_from_dict_or_env( - values, "minimax_api_key", "MINIMAX_API_KEY" - ) - values["minimax_group_id"] = minimax_group_id - values["minimax_api_key"] = minimax_api_key - return values - - def embed( - self, - texts: List[str], - embed_type: str, - ) -> List[List[float]]: - payload = { - "model": self.model, - "type": embed_type, - "texts": texts, - } - - # HTTP headers for authorization - headers = { - "Authorization": f"Bearer {self.minimax_api_key}", - "Content-Type": "application/json", - } - - params = { - "GroupId": self.minimax_group_id, - } - - # send request - response = requests.post( - self.endpoint_url, params=params, headers=headers, json=payload - ) - parsed_response = response.json() - - # check for errors - if parsed_response["base_resp"]["status_code"] != 0: - raise ValueError( - f"MiniMax API returned an error: {parsed_response['base_resp']}" - ) - - embeddings = parsed_response["vectors"] - - return embeddings - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed documents using a MiniMax embedding endpoint. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - embeddings = embed_with_retry(self, texts=texts, embed_type=self.embed_type_db) - return embeddings - - def embed_query(self, text: str) -> List[float]: - """Embed a query using a MiniMax embedding endpoint. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - embeddings = embed_with_retry( - self, texts=[text], embed_type=self.embed_type_query - ) - return embeddings[0] diff --git a/nextpy/ai/models/embedding/modelscopehub.py b/nextpy/ai/models/embedding/modelscopehub.py deleted file mode 100644 index a676c2f8..00000000 --- a/nextpy/ai/models/embedding/modelscopehub.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around ModelScopeHub embedding models.""" -from typing import Any, List - -from pydantic import BaseModel, Extra - -from nextpy.ai.models.embedding.base import Embeddings - - -class ModelScopeEmbeddings(BaseModel, Embeddings): - """Wrapper around modelscope_hub embedding models. - - To use, you should have the ``modelscope`` python package installed. - - Example: - .. 
code-block:: python - - from nextpy.ai.models.embeddings import ModelScopeEmbeddings - model_id = "damo/nlp_corom_sentence-embedding_english-base" - embed = ModelScopeEmbeddings(model_id=model_id) - """ - - embed: Any - model_id: str = "damo/nlp_corom_sentence-embedding_english-base" - """Model name to use.""" - - def __init__(self, **kwargs: Any): - """Initialize the modelscope.""" - super().__init__(**kwargs) - try: - from modelscope.pipelines import pipeline - from modelscope.utils.constant import Tasks - - self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id) - - except ImportError as e: - raise ImportError( - "Could not import some python packages." - "Please install it with `pip install modelscope`." - ) from e - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a modelscope embedding model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - texts = list(map(lambda x: x.replace("\n", " "), texts)) - inputs = {"source_sentence": texts} - embeddings = self.embed(input=inputs)["text_embedding"] - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a modelscope embedding model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - text = text.replace("\n", " ") - inputs = {"source_sentence": [text]} - embedding = self.embed(input=inputs)["text_embedding"][0] - return embedding.tolist() diff --git a/nextpy/ai/models/embedding/mosaicml.py b/nextpy/ai/models/embedding/mosaicml.py deleted file mode 100644 index b7882992..00000000 --- a/nextpy/ai/models/embedding/mosaicml.py +++ /dev/null @@ -1,169 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around MosaicML APIs.""" -from __future__ import annotations - -from typing import Any, Dict, List, Mapping, Optional, Tuple - -import requests -from pydantic import BaseModel, Extra, root_validator - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - - -class MosaicMLInstructorEmbeddings(BaseModel, Embeddings): - """Wrapper around MosaicML's embedding inference service. - - To use, you should have the - environment variable ``MOSAICML_API_TOKEN`` set with your API token, or pass - it as a named parameter to the constructor. - - Example: - .. 
code-block:: python - - from nextpy.ai.endpoints import MosaicMLInstructorEmbeddings - endpoint_url = ( - "https://models.hosted-on.mosaicml.hosting/instructor-large/v1/predict" - ) - mosaic_llm = MosaicMLInstructorEmbeddings( - endpoint_url=endpoint_url, - mosaicml_api_token="my-api-key" - ) - """ - - endpoint_url: str = ( - "https://models.hosted-on.mosaicml.hosting/instructor-xl/v1/predict" - ) - """Endpoint URL to use.""" - embed_instruction: str = "Represent the document for retrieval: " - """Instruction used to embed documents.""" - query_instruction: str = ( - "Represent the question for retrieving supporting documents: " - ) - """Instruction used to embed the query.""" - retry_sleep: float = 1.0 - """How long to try sleeping for if a rate limit is encountered""" - - mosaicml_api_token: Optional[str] = None - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - @root_validator() - def validate_environment(cls, values: Dict) -> Dict: - """Validate that api key and python package exists in environment.""" - mosaicml_api_token = get_from_dict_or_env( - values, "mosaicml_api_token", "MOSAICML_API_TOKEN" - ) - values["mosaicml_api_token"] = mosaicml_api_token - return values - - @property - def _identifying_params(self) -> Mapping[str, Any]: - """Get the identifying parameters.""" - return {"endpoint_url": self.endpoint_url} - - def _embed( - self, input: List[Tuple[str, str]], is_retry: bool = False - ) -> List[List[float]]: - payload = {"input_strings": input} - - # HTTP headers for authorization - headers = { - "Authorization": f"{self.mosaicml_api_token}", - "Content-Type": "application/json", - } - - # send request - try: - response = requests.post(self.endpoint_url, headers=headers, json=payload) - except requests.exceptions.RequestException as e: - raise ValueError(f"Error raised by inference endpoint: {e}") - - try: - parsed_response = response.json() - - if "error" in parsed_response: - # if we get rate limited, try sleeping for 1 second - if ( - not is_retry - and "rate limit exceeded" in parsed_response["error"].lower() - ): - import time - - time.sleep(self.retry_sleep) - - return self._embed(input, is_retry=True) - - raise ValueError( - f"Error raised by inference API: {parsed_response['error']}" - ) - - # The inference API has changed a couple of times, so we add some handling - # to be robust to multiple response formats. 
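To make the branching below easier to follow, these are the response shapes it accepts, with made-up values (derived from the code, not from MosaicML documentation):

```python
# Each of these parsed_response values is normalised to a list of vectors by the branching below.
dict_with_data = {"data": [[0.1, 0.2], [0.3, 0.4]]}
dict_with_output = {"output": [0.1, 0.2]}          # single vector, wrapped into [vector]
list_of_vectors = [[0.1, 0.2], [0.3, 0.4]]
list_of_dicts = [{"output": [0.1, 0.2]}, {"output": [0.3, 0.4]}]
```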
- if isinstance(parsed_response, dict): - if "data" in parsed_response: - output_item = parsed_response["data"] - elif "output" in parsed_response: - output_item = parsed_response["output"] - else: - raise ValueError( - f"No key data or output in response: {parsed_response}" - ) - - if isinstance(output_item, list) and isinstance(output_item[0], list): - embeddings = output_item - else: - embeddings = [output_item] - elif isinstance(parsed_response, list): - first_item = parsed_response[0] - if isinstance(first_item, list): - embeddings = parsed_response - elif isinstance(first_item, dict): - if "output" in first_item: - embeddings = [item["output"] for item in parsed_response] - else: - raise ValueError( - f"No key data or output in response: {parsed_response}" - ) - else: - raise ValueError(f"Unexpected response format: {parsed_response}") - else: - raise ValueError(f"Unexpected response type: {parsed_response}") - - except requests.exceptions.JSONDecodeError as e: - raise ValueError( - f"Error raised by inference API: {e}.\nResponse: {response.text}" - ) - - return embeddings - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embed documents using a MosaicML deployed instructor embedding model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - instruction_pairs = [(self.embed_instruction, text) for text in texts] - embeddings = self._embed(instruction_pairs) - return embeddings - - def embed_query(self, text: str) -> List[float]: - """Embed a query using a MosaicML deployed instructor embedding model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - instruction_pair = (self.query_instruction, text) - embedding = self._embed([instruction_pair])[0] - return embedding diff --git a/nextpy/ai/models/embedding/openai.py b/nextpy/ai/models/embedding/openai.py deleted file mode 100644 index 9db23568..00000000 --- a/nextpy/ai/models/embedding/openai.py +++ /dev/null @@ -1,311 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from __future__ import annotations - -import logging -from typing import ( - Any, - Callable, - Dict, - List, - Literal, - Optional, - Sequence, - Set, - Tuple, - Union, -) - -import numpy as np -from pydantic import BaseModel, Extra, root_validator -from tenacity import ( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from nextpy.ai.models.embedding.base import Embeddings -from nextpy.utils.data_ops import get_from_dict_or_env - -logger = logging.getLogger(__name__) - - -class OpenAIEmbeddings(BaseModel, Embeddings): - """Wrapper around OpenAI embedding models. - - To use, you should have the ``openai`` python package installed, and the - environment variable ``OPENAI_API_KEY`` set with your API key or pass it - as a named parameter to the constructor. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import OpenAIEmbeddings - openai = OpenAIEmbeddings(openai_api_key="my-api-key") - - In order to use the library with Microsoft Azure endpoints, you need to set - the OPENAI_API_TYPE, OPENAI_API_BASE, OPENAI_API_KEY and OPENAI_API_VERSION. 
- The OPENAI_API_TYPE must be set to 'azure' and the others correspond to - the properties of your endpoint. - In addition, the deployment name must be passed as the model parameter. - - Example: - .. code-block:: python - - import os - os.environ["OPENAI_API_TYPE"] = "azure" - os.environ["OPENAI_API_BASE"] = "https:// Dict: - """Validate that api key and python package exists in environment.""" - values["openai_api_key"] = get_from_dict_or_env( - values, "openai_api_key", "OPENAI_API_KEY" - ) - values["openai_api_base"] = get_from_dict_or_env( - values, - "openai_api_base", - "OPENAI_API_BASE", - default="", - ) - values["openai_api_type"] = get_from_dict_or_env( - values, - "openai_api_type", - "OPENAI_API_TYPE", - default="", - ) - values["openai_proxy"] = get_from_dict_or_env( - values, - "openai_proxy", - "OPENAI_PROXY", - default="", - ) - if values["openai_api_type"] in ("azure", "azure_ad", "azuread"): - default_api_version = "2022-12-01" - else: - default_api_version = "" - values["openai_api_version"] = get_from_dict_or_env( - values, - "openai_api_version", - "OPENAI_API_VERSION", - default=default_api_version, - ) - values["openai_organization"] = get_from_dict_or_env( - values, - "openai_organization", - "OPENAI_ORGANIZATION", - default="", - ) - try: - import openai - - values["client"] = openai.Embedding - except ImportError: - raise ImportError( - "Could not import openai python package. " - "Please install it with `pip install openai`." - ) - return values - - @property - def _invocation_params(self) -> Dict: - openai_args = { - "engine": self.deployment, - "request_timeout": self.request_timeout, - "headers": self.headers, - "api_key": self.openai_api_key, - "organization": self.openai_organization, - "api_base": self.openai_api_base, - "api_type": self.openai_api_type, - "api_version": self.openai_api_version, - } - if self.openai_proxy: - import openai - - openai.proxy = { - "http": self.openai_proxy, - "https": self.openai_proxy, - } # type: ignore[assignment] # noqa: E501 - return openai_args - - # please refer to - # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb - def _get_len_safe_embeddings( - self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None - ) -> List[List[float]]: - embeddings: List[List[float]] = [[] for _ in range(len(texts))] - try: - import tiktoken - except ImportError: - raise ImportError( - "Could not import tiktoken python package. " - "This is needed in order to for OpenAIEmbeddings. " - "Please install it with `pip install tiktoken`." - ) - - tokens = [] - indices = [] - encoding = tiktoken.model.encoding_for_model(self.model) - for i, text in enumerate(texts): - if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - # replace newlines, which can negatively affect performance. 
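The remainder of `_get_len_safe_embeddings` (continued below) splits each text into `embedding_ctx_length`-sized token chunks, embeds the chunks, then combines them with a token-count-weighted average and L2-normalises the result. A standalone sketch of that combination step, using made-up chunk vectors:

```python
import numpy as np

# Two chunk embeddings for one long text, plus the token count of each chunk (made-up numbers).
chunk_embeddings = [[0.0, 1.0], [1.0, 0.0]]
chunk_token_counts = [300, 100]

average = np.average(chunk_embeddings, axis=0, weights=chunk_token_counts)  # -> [0.25, 0.75]
normalised = (average / np.linalg.norm(average)).tolist()                   # unit-length final embedding
```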
- text = text.replace("\n", " ") - token = encoding.encode( - text, - allowed_special=self.allowed_special, - disallowed_special=self.disallowed_special, - ) - for j in range(0, len(token), self.embedding_ctx_length): - tokens += [token[j : j + self.embedding_ctx_length]] - indices += [i] - - batched_embeddings = [] - _chunk_size = chunk_size or self.chunk_size - for i in range(0, len(tokens), _chunk_size): - response = embed_with_retry( - self, - input=tokens[i : i + _chunk_size], - **self._invocation_params, - ) - batched_embeddings += [r["embedding"] for r in response["data"]] - - results: List[List[List[float]]] = [[] for _ in range(len(texts))] - num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] - for i in range(len(indices)): - results[indices[i]].append(batched_embeddings[i]) - num_tokens_in_batch[indices[i]].append(len(tokens[i])) - - for i in range(len(texts)): - _result = results[i] - if len(_result) == 0: - average = embed_with_retry(self, input="", **self._invocation_params,)[ - "data" - ][0]["embedding"] - else: - average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) - embeddings[i] = (average / np.linalg.norm(average)).tolist() - - return embeddings - - def _embedding_func(self, text: str, *, engine: str) -> List[float]: - """Call out to OpenAI's embedding endpoint.""" - # handle large input text - if len(text) > self.embedding_ctx_length: - return self._get_len_safe_embeddings([text], engine=engine)[0] - else: - if self.model.endswith("001"): - # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 - # replace newlines, which can negatively affect performance. - text = text.replace("\n", " ") - return embed_with_retry(self, input=[text], **self._invocation_params,)[ - "data" - ][0]["embedding"] - - def embed_documents( - self, texts: List[str], chunk_size: Optional[int] = 0 - ) -> List[List[float]]: - """Call out to OpenAI's embedding endpoint for embedding search docs. - - Args: - texts: The list of texts to embed. - chunk_size: The chunk size of embeddings. If None, will use the chunk size - specified by the class. - - Returns: - List of embeddings, one for each text. - """ - # NOTE: to keep things simple, we assume the list may contain texts longer - # than the maximum context and use length-safe embedding function. - return self._get_len_safe_embeddings(texts, engine=self.deployment) - - def embed_query(self, text: str) -> List[float]: - """Call out to OpenAI's embedding endpoint for embedding query text. - - Args: - text: The text to embed. - - Returns: - Embedding for the text. 
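The class docstring above describes the Azure setup but its example is truncated in this diff. A sketch under the stated assumptions — endpoint, key, deployment name and API version are all placeholders, and the `deployment` keyword is inferred from `_invocation_params` above:

```python
import os

# Placeholders throughout; values must match your own Azure OpenAI resource.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://<your-endpoint>.openai.azure.com/"
os.environ["OPENAI_API_KEY"] = "your-azure-api-key"
os.environ["OPENAI_API_VERSION"] = "2022-12-01"  # default chosen by validate_environment for azure

from nextpy.ai.models.embedding.openai import OpenAIEmbeddings  # path of the deleted module

# The docstring phrases this as passing the deployment name as the model parameter;
# `deployment` here is an assumption based on the method bodies above.
embeddings = OpenAIEmbeddings(deployment="your-embeddings-deployment-name")
query_vector = embeddings.embed_query("a test query")
```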
- """ - embedding = self._embedding_func(text, engine=self.deployment) - return embedding - - -def _create_retry_decorator(embeddings: OpenAIEmbeddings) -> Callable[[Any], Any]: - import openai - - min_seconds = 4 - max_seconds = 10 - # Wait 2^x * 1 second between each retry starting with - # 4 seconds, then up to 10 seconds, then 10 seconds afterwards - return retry( - reraise=True, - stop=stop_after_attempt(embeddings.max_retries), - wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), - retry=( - retry_if_exception_type(openai.error.Timeout) - | retry_if_exception_type(openai.error.APIError) - | retry_if_exception_type(openai.error.APIConnectionError) - | retry_if_exception_type(openai.error.RateLimitError) - | retry_if_exception_type(openai.error.ServiceUnavailableError) - ), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - - -def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any: - """Use tenacity to retry the embedding call.""" - retry_decorator = _create_retry_decorator(embeddings) - - @retry_decorator - def _embed_with_retry(**kwargs: Any) -> Any: - return embeddings.client.create(**kwargs) - - return _embed_with_retry(**kwargs) diff --git a/nextpy/ai/models/embedding/tensorflowhub.py b/nextpy/ai/models/embedding/tensorflowhub.py deleted file mode 100644 index 3ae5665f..00000000 --- a/nextpy/ai/models/embedding/tensorflowhub.py +++ /dev/null @@ -1,80 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Wrapper around TensorflowHub embedding models.""" -from typing import Any, List - -from pydantic import BaseModel, Extra - -from nextpy.ai.models.embedding.base import Embeddings - -DEFAULT_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3" - - -class TensorflowHubEmbeddings(BaseModel, Embeddings): - """Wrapper around tensorflow_hub embedding models. - - To use, you should have the ``tensorflow_text`` python package installed. - - Example: - .. code-block:: python - - from nextpy.ai.models.embeddings import TensorflowHubEmbeddings - url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3" - tf = TensorflowHubEmbeddings(model_url=url) - """ - - embed: Any #: :meta private: - model_url: str = DEFAULT_MODEL_URL - """Model name to use.""" - - def __init__(self, **kwargs: Any): - """Initialize the tensorflow_hub and tensorflow_text.""" - super().__init__(**kwargs) - try: - import tensorflow_hub - except ImportError: - raise ImportError( - "Could not import tensorflow-hub python package. " - "Please install it with `pip install tensorflow-hub``." - ) - try: - import tensorflow_text # noqa - except ImportError: - raise ImportError( - "Could not import tensorflow_text python package. " - "Please install it with `pip install tensorflow_text``." - ) - - self.embed = tensorflow_hub.load(self.model_url) - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.forbid - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a TensorflowHub embedding model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. 
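Extending the construction-only docstring example above with the two embed calls (illustrative; the first call downloads the TF-Hub module and requires `tensorflow_hub` plus `tensorflow_text` to be installed):

```python
from nextpy.ai.models.embedding.tensorflowhub import TensorflowHubEmbeddings  # path of the deleted module

tf_embed = TensorflowHubEmbeddings()  # defaults to the multilingual universal-sentence-encoder
doc_vectors = tf_embed.embed_documents(["first document", "second document"])
query_vector = tf_embed.embed_query("a test query")
```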
- """ - texts = list(map(lambda x: x.replace("\n", " "), texts)) - embeddings = self.embed(texts).numpy() - return embeddings.tolist() - - def embed_query(self, text: str) -> List[float]: - """Compute query embeddings using a TensorflowHub embedding model. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - text = text.replace("\n", " ") - embedding = self.embed([text]).numpy()[0] - return embedding.tolist() diff --git a/nextpy/ai/models/image/Readme.md b/nextpy/ai/models/image/Readme.md deleted file mode 100644 index c678ac69..00000000 --- a/nextpy/ai/models/image/Readme.md +++ /dev/null @@ -1,63 +0,0 @@ -# OpenAI DALL-E Image Generation - -This is a simple Python interface for generating images using OpenAI's DALL-E model. - -## Prerequisites - -Ensure you have the `openai` Python library installed. If not, you can install it using pip: - -```bash -pip install openai -``` -# Usage Dalle - -```python - -# Define your API key and any other settings -api_key = 'your-api-key-here' -image_model = 'your-image-model-here' # Optional -number_of_results = 5 # Optional, default is 1 - -# Create an instance of the OpenAiDalle class -dalle = OpenAiDalle(api_key, image_model, number_of_results) - -# Define a prompt and image size -prompt = 'A beautiful sunset over the mountains' -size = 512 # Optional, default is 512 - -# Generate an image -response = dalle.generate_image(prompt, size) - -# Print the response -print(response) -``` -# Usage - -```python - -# Define your API key and any other settings -api_key = 'your-api-key-here' -image_model = 'your-image-model-here' # Optional -number_of_results = 5 # Optional, default is 1 -client_id = 'your-client-id-here' # Optional -client_version = 'your-client-version-here' # Optional - -# Create an instance of the StableDiffusion class -image_llm = StableDiffusion(api_key, image_model, number_of_results, client_id, client_version) - -# Define a prompt and image size -prompt = 'A beautiful sunset over the mountains' -size = 512 # Optional, default is 512 - -# Define other settings -style_preset = 'enhance' # Optional, default is 'enhance' -cfg_scale = 7 # Optional, default is 7 -steps = 50 # Optional, default is 50 -seed = 0 # Optional, default is 0 - -# Generate an image -response = image_llm.generate_image(prompt, size, style_preset, cfg_scale, steps, seed) - -# Print the response -print(response) -``` \ No newline at end of file diff --git a/nextpy/ai/models/image/_base.py b/nextpy/ai/models/image/_base.py deleted file mode 100644 index 9a92ac96..00000000 --- a/nextpy/ai/models/image/_base.py +++ /dev/null @@ -1,14 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from abc import ABC, abstractmethod - - -class BaseImageModel(ABC): - @abstractmethod - def get_image_model(self): - pass - - @abstractmethod - def generate_image(self, prompt: str, size: int = 512, num: int = 2): - pass diff --git a/nextpy/ai/models/image/openai_dalle.py b/nextpy/ai/models/image/openai_dalle.py deleted file mode 100644 index 6abed644..00000000 --- a/nextpy/ai/models/image/openai_dalle.py +++ /dev/null @@ -1,44 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -import os - -import openai - -from ._base import BaseImageModel - - -class OpenAiDalle(BaseImageModel): - def __init__(self, api_key, image_model=None, number_of_results=1): - """Args: - api_key (str): The OpenAI API key. - image_model (str): The image model. - number_of_results (int): The number of results. - """ - self.number_of_results = number_of_results - self.api_key = api_key - self.image_model = image_model - openai.api_key = api_key - openai.api_base = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1") - - def get_image_model(self): - """Returns: - str: The image model. - """ - return self.image_model - - def generate_image(self, prompt: str, size: int = 512): - """Call the OpenAI image API. - - Args: - prompt (str): The prompt. - size (int): The size. - num (int): The number of images. - - Returns: - dict: The response. - """ - response = openai.Image.create( - prompt=prompt, n=self.number_of_results, size=f"{size}x{size}" - ) - return response diff --git a/nextpy/ai/models/image/stable_diffusion.py b/nextpy/ai/models/image/stable_diffusion.py deleted file mode 100644 index c012da33..00000000 --- a/nextpy/ai/models/image/stable_diffusion.py +++ /dev/null @@ -1,101 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import base64 -import os - -import requests - -from ._base import BaseImageModel - - -class StableDiffusion(BaseImageModel): - def __init__( - self, - api_key, - image_model=None, - number_of_results=1, - client_id=None, - client_version=None, - ): - """Args: - api_key (str): The Stability API key. - image_model (str): The image model. - number_of_results (int): The number of results. - client_id (str): Client ID. - client_version (str): Client version. - """ - self.api_key = api_key - self.image_model = image_model or "stable-diffusion-xl-beta-v2-2-2" - self.number_of_results = number_of_results - self.api_host = os.getenv("API_HOST", "https://api.stability.ai") - self.url = f"{self.api_host}/v1/generation/{self.image_model}/text-to-image" - self.client_id = client_id - self.client_version = client_version - - def get_image_model(self): - """Returns: - str: The image model. - """ - return self.image_model - - def generate_image( - self, - prompt: str, - size: int = 512, - style_preset="enhance", - cfg_scale=7, - steps=50, - seed=0, - ): - """Call the Stability image API. - - Args: - prompt (str): The prompt. - size (int): The size. - style_preset (str): The style preset. - cfg_scale (int): The config scale. - steps (int): The number of diffusion steps. - seed (int): The seed for random noise. - - Returns: - dict: The response. 
- """ - body = { - "width": size, - "height": size, - "steps": steps, - "seed": seed, - "cfg_scale": cfg_scale, - "samples": self.number_of_results, - "style_preset": style_preset, - "text_prompts": [{"text": prompt, "weight": 1}], - } - headers = { - "Accept": "application/json", - "Content-Type": "application/json", - "Authorization": f"Bearer {self.api_key}", - } - - # Add client ID and version headers if provided - if self.client_id is not None: - headers["Stability-Client-ID"] = self.client_id - if self.client_version is not None: - headers["Stability-Client-Version"] = self.client_version - - response = requests.post( - self.url, - headers=headers, - json=body, - ) - - if response.status_code != 200: - raise Exception("Non-200 response: " + str(response.text)) - - data = response.json() - - for _i, image in enumerate(data["artifacts"]): - with open(f"./out/txt2img_{image['seed']}.png", "wb") as f: - f.write(base64.b64decode(image["base64"])) - - return data diff --git a/nextpy/ai/models/llm/__init__.py b/nextpy/ai/models/llm/__init__.py deleted file mode 100644 index 658f5989..00000000 --- a/nextpy/ai/models/llm/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from .llm_client import Azure, OpenAI diff --git a/nextpy/ai/models/llm/llm_client.py b/nextpy/ai/models/llm/llm_client.py deleted file mode 100644 index a73ab524..00000000 --- a/nextpy/ai/models/llm/llm_client.py +++ /dev/null @@ -1,59 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -import os -from abc import ABC, abstractmethod - -from litellm import completion - - -class LLMClient(ABC): - def __init__(self, api_key): - self.api_key = api_key - self.chat = self.Chat(self) - - class Chat(ABC): - def __init__(self, parent): - self.api_key = parent.api_key - self.completions = self.Completions(self) - - class Completions(ABC): - def __init__(self, parent): - self.api_key = parent.api_key - - @abstractmethod - def create(self, model, messages): - pass - - -class OpenAI(LLMClient): - class Chat(LLMClient.Chat): - class Completions(LLMClient.Chat.Completions): - def create(self, model, messages): - os.environ["OPENAI_API_KEY"] = self.api_key - response = completion(model=model, messages=messages) - return response - - -class Azure(LLMClient): - class Chat(LLMClient.Chat): - class Completions(LLMClient.Chat.Completions): - def create(self, model, messages): - os.environ["AZURE_API_KEY"] = self.api_key - os.environ["AZURE_API_BASE"] = "your-azure-api-base" - os.environ["AZURE_API_VERSION"] = "your-azure-api-version" - response = completion(model=model, messages=messages) - return response - - -# Usage for OpenAI -# openai_client = OpenAI(api_key="sk-") # Replace with your API key -# openai_response = openai_client.chat.completions.create( -# model="gpt-3.5-turbo", -# messages=[ -# {"role": "system", "content": "You are a helpful assistant."}, -# {"role": "user", "content": "Hello!"} -# ] -# ) -# # Print the OpenAI response -# print(openai_response["choices"][0]["message"]) diff --git "a/nextpy/ai/prompt_on_the_outside.\360\237\226\212\357\270\217" "b/nextpy/ai/prompt_on_the_outside.\360\237\226\212\357\270\217" deleted file mode 100644 index 615b0f0a..00000000 --- "a/nextpy/ai/prompt_on_the_outside.\360\237\226\212\357\270\217" +++ /dev/null @@ -1,13 +0,0 @@ -{{#system~}} -You are a helpful assistant -{{~/system}} - -{{~#geneach 'conversation' stop=False}} -{{#user~}} -{{set 'this.user_text' (await 'user_text') hidden=False}} -{{~/user}} - -{{#assistant~}} -{{gen 'this.ai_text' temperature=0 max_tokens=300}} -{{~/assistant}} -{{~/geneach}} diff --git a/nextpy/ai/rag/__init__.py b/nextpy/ai/rag/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/base.py b/nextpy/ai/rag/base.py deleted file mode 100644 index fd57d1a1..00000000 --- a/nextpy/ai/rag/base.py +++ /dev/null @@ -1,67 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import List - - -class SimpleRAG: - def __init__(self, raw_data=None, data_transformer=None, vector_store=None): - """Initialize the knowledge base. - - Args: - raw_data: The raw data to add to the knowledge base. Default is None. - data_transformer: An object with a `split_documents` method to apply to the raw data. Default is None. - vector_store: An object with `add_documents` and `similarity_search` methods to use for storing vectors. Default is None. 
- """ - self.data_transformer = data_transformer - self.vector_store = vector_store - self.references = [] - self.add_data(raw_data) - - def add_data(self, raw_data): - """Add raw data into the knowledge base. - - Args: - raw_data: The raw data to add. - """ - # Validate raw data - if not raw_data: - raise ValueError("Raw data cannot be empty.") - - # fetch and add references - for data in raw_data: - self.references.append(data.metadata) - - # Split raw data into chunks - split_data = self.data_transformer.split_documents(raw_data) - - # Add split data to vector store - try: - self.vector_store.add_documents(split_data) - except Exception as e: - print(f"Failed to add documents: {e}") - raise - - def retrieve_data(self, query, top_k=1) -> List[str]: - """Retrieve documents from the knowledge base. - - Args: - query: The query to use for the retrieval. - top_k: The number of documents to retrieve. Default is 1. - - Returns: - A list of the retrieved documents. - """ - try: - results = self.vector_store.similarity_search(query=query, top_k=top_k) - except Exception as e: - print(f"Failed to retrieve documents: {e}") - raise - - # Handle no results case - if not results: - return [] - - # Extract page content - docs = [result[0].page_content for result in results] - return docs diff --git a/nextpy/ai/rag/doc_loader.py b/nextpy/ai/rag/doc_loader.py deleted file mode 100644 index a239b353..00000000 --- a/nextpy/ai/rag/doc_loader.py +++ /dev/null @@ -1,128 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import importlib -from typing import Any - - -def import_class(class_path): - module_name, class_name = class_path.rsplit(".", 1) - module = importlib.import_module(module_name) - return getattr(module, class_name) - - -def document_loader(reader_type: str) -> Any: - mapping = { - "airtable": "nextpy.ai.rag.document_loaders.airtable.base.AirtableReader", - "apify_dataset": "nextpy.ai.rag.document_loaders.apify.dataset.base.ApifyDataset", - "asana": "nextpy.ai.rag.document_loaders.asana.base.AsanaReader", - "azcognitive_search": "nextpy.ai.rag.document_loaders.azcognitive_search.base.AzCognitiveSearchReader", - "bilibili": "nextpy.ai.rag.document_loaders.bilibili.base.BilibiliTranscriptReader", - "boarddocs": "nextpy.ai.rag.document_loaders.boarddocs.base.BoardDocsReader", - "chatgpt_plugin": "nextpy.ai.rag.document_loaders.chatgpt_plugin.base.ChatGPTRetrievalPluginReader", - "chroma": "nextpy.ai.rag.document_loaders.chroma.base.ChromaReader", - "confluence": "nextpy.ai.rag.document_loaders.confluence.base.ConfluenceReader", - "couchdb": "nextpy.ai.rag.document_loaders.couchdb.base.SimpleCouchDBReader", - "dad_jokes": "nextpy.ai.rag.document_loaders.dad_jokes.base.DadJokesReader", - "deep_lake": "nextpy.ai.rag.document_loaders.deeplake.base.DeepLakeReader", - "discord": "nextpy.ai.rag.document_loaders.discord.base.DiscordReader", - "docugami": "nextpy.ai.rag.document_loaders.docugami.base.DocugamiReader", - "elasticsearch": "nextpy.ai.rag.document_loaders.elasticsearch.base.ElasticsearchReader", - "faiss": "nextpy.ai.rag.document_loaders.faiss.base.FaissReader", - "feedly_rss": "nextpy.ai.rag.document_loaders.feedly_rss.base.FeedlyRssReader", - "feishu_docs": "nextpy.ai.rag.document_loaders.feishu_docs.base.FeishuDocsReader", - "file_directory": 
"nextpy.ai.rag.document_loaders.file.base.SimpleDirectoryReader", - "file_audio": "nextpy.ai.rag.document_loaders.file.audio.base.AudioTranscriber", - "gladia_audio": "nextpy.ai.rag.document_loaders.file.audio_gladia.base.GladiaAudioTranscriber", - "file_cjk_pdf": "nextpy.ai.rag.document_loaders.file.cjk_pdf.base.CJKPDFReader", - "deep_doctection": "nextpy.ai.rag.document_loaders.file.deepdoctection.base.DeepDoctectionReader", - "file_docx": "nextpy.ai.rag.document_loaders.file.docx.base.DocxReader", - "file_epub": "nextpy.ai.rag.document_loaders.file.epub.base.EpubReader", - "flat_pdf": "nextpy.ai.rag.document_loaders.file.flat_pdf.base.FlatPdfReader", - "image": "nextpy.ai.rag.document_loaders.file.image.base.ImageReader", - "image_caption": "nextpy.ai.rag.document_loaders.file.image_blip.base.ImageCaptionReader", - "image_vision": "nextpy.ai.rag.document_loaders.file.image_blip2.base.ImageVisionLLMReader", - "image_tabular_chart": "nextpy.ai.rag.document_loaders.file.image_deplot.base.ImageTabularChartReader", - "ipynb": "nextpy.ai.rag.document_loaders.file.ipynb.base.IPYNBReader", - "json": "nextpy.ai.rag.document_loaders.file.json.base.JSONReader", - "markdown": "nextpy.ai.rag.document_loaders.file.markdown.base.MarkdownReader", - "mbox": "nextpy.ai.rag.document_loaders.file.mbox.base.MboxReader", - "paged_csv": "nextpy.ai.rag.document_loaders.file.paged_csv.base.PagedCSVReader", - "pandas_csv": "nextpy.ai.rag.document_loaders.file.pandas_csv.base.PandasCSVReader", - "pandas_excel": "nextpy.ai.rag.document_loaders.file.pandas_excel.base.PandasExcelReader", - "pdf": "nextpy.ai.rag.document_loaders.file.pdf.base.PDFReader", - "pdf_miner": "nextpy.ai.rag.document_loaders.file.pdf_miner.base.PDFMinerReader", - "pptx": "nextpy.ai.rag.document_loaders.file.pptx.base.PptxReader", - "pymu_pdf": "nextpy.ai.rag.document_loaders.file.pymu_pdf.base.PyMuPDFReader", - "rdf": "nextpy.ai.rag.document_loaders.file.rdf.base.RDFReader", - "simple_csv": "nextpy.ai.rag.document_loaders.file.simple_csv.base.SimpleCSVReader", - "unstructured": "nextpy.ai.rag.document_loaders.file.unstructured.base.UnstructuredReader", - "firebase_realtimedb": "nextpy.ai.rag.document_loaders.firebase_realtimedb.base.FirebaseRealtimeDatabaseReader", - "firestore": "nextpy.ai.rag.document_loaders.firestore.base.FirestoreReader", - "github_repo_issues": "nextpy.ai.rag.document_loaders.github_repo_issues.base.GitHubRepositoryIssuesReader", - "gmail": "nextpy.ai.rag.document_loaders.gmail.base.GmailReader", - "google_calendar": "nextpy.ai.rag.document_loaders.google_calendar.base.GoogleCalendarReader", - "google_docs": "nextpy.ai.rag.document_loaders.google_docs.base.GoogleDocsReader", - "google_keep": "nextpy.ai.rag.document_loaders.google_keep.base.GoogleKeepReader", - "google_sheets": "nextpy.ai.rag.document_loaders.google_sheets.base.GoogleSheetsReader", - "gpt_repo": "nextpy.ai.rag.document_loaders.gpt_repo.base.GPTRepoReader", - "graphdb_cypher": "nextpy.ai.rag.document_loaders.graphdb_cypher.base.GraphDBCypherReader", - "graphql": "nextpy.ai.rag.document_loaders.graphql.base.GraphQLReader", - "hatena_blog": "nextpy.ai.rag.document_loaders.hatena_blog.base.HatenaBlogReader", - "hubspot": "nextpy.ai.rag.document_loaders.hubspot.base.HubspotReader", - "huggingface_fs": "nextpy.ai.rag.document_loaders.huggingface.fs.base.HuggingFaceFSReader", - "intercom": "nextpy.ai.rag.document_loaders.intercom.base.IntercomReader", - "jira": "nextpy.ai.rag.document_loaders.jira.base.JiraReader", - # "joplin": 
"nextpy.ai.rag.document_loaders.joplin.base.JoplinReader", - "jsondata": "nextpy.ai.rag.document_loaders.jsondata.base.JSONDataReader", - "kaltura_esearch": "nextpy.ai.rag.document_loaders.kaltura.esearch.base.KalturaESearchReader", - "kibela": "nextpy.ai.rag.document_loaders.kibela.base.KibelaReader", - # "make_com": "nextpy.ai.rag.document_loaders.make_com.base.MakeWrapper", - "mangoapps_guides": "nextpy.ai.rag.document_loaders.mangoapps_guides.base.MangoppsGuidesReader", - "maps": "nextpy.ai.rag.document_loaders.maps.base.OpenMap", - "memos": "nextpy.ai.rag.document_loaders.memos.base.MemosReader", - "metal": "nextpy.ai.rag.document_loaders.metal.base.MetalReader", - "milvus": "nextpy.ai.rag.document_loaders.milvus.base.MilvusReader", - "mondaydotcom": "nextpy.ai.rag.document_loaders.mondaydotcom.base.MondayReader", - "mongo": "nextpy.ai.rag.document_loaders.mongo.base.SimpleMongoReader", - "notion": "nextpy.ai.rag.document_loaders.notion.base.NotionPageReader", - "obsidian": "nextpy.ai.rag.document_loaders.obsidian.base.ObsidianReader", - "opendal": "nextpy.ai.rag.document_loaders.opendal_reader.base.OpendalReader", - "opendal_azblob": "nextpy.ai.rag.document_loaders.opendal_reader.azblob.base.OpendalAzblobReader", - "opendal_gcs": "nextpy.ai.rag.document_loaders.opendal_reader.gcs.base.OpendalGcsReader", - "opendal_s3": "nextpy.ai.rag.document_loaders.opendal_reader.s3.base.OpendalS3Reader", - "outlook_localcalendar": "nextpy.ai.rag.document_loaders.outlook_localcalendar.base.OutlookLocalCalendarReader", - "pubmed": "nextpy.ai.rag.document_loaders.papers.pubmed.base.PubmedReader", - "pinecone": "nextpy.ai.rag.document_loaders.pinecone.base.PineconeReader", - "qdrant": "nextpy.ai.rag.document_loaders.qdrant.base.QdrantReader", - "readwise": "nextpy.ai.rag.document_loaders.readwise.base.ReadwiseReader", - "reddit": "nextpy.ai.rag.document_loaders.reddit.base.RedditReader", - "slack": "nextpy.ai.rag.document_loaders.slack.base.SlackReader", - "snscrape_twitter": "nextpy.ai.rag.document_loaders.snscrape_twitter.base.SnscrapeTwitterReader", - "spotify": "nextpy.ai.rag.document_loaders.spotify.base.SpotifyReader", - "stackoverflow": "nextpy.ai.rag.document_loaders.stackoverflow.base.StackoverflowReader", - "steamship": "nextpy.ai.rag.document_loaders.steamship.base.SteamshipFileReader", - "string_iterable": "nextpy.ai.rag.document_loaders.string_iterable.base.StringIterableReader", - "trello": "nextpy.ai.rag.document_loaders.trello.base.TrelloReader", - "twitter": "nextpy.ai.rag.document_loaders.twitter.base.TwitterTweetReader", - "weather": "nextpy.ai.rag.document_loaders.weather.base.WeatherReader", - "weaviate": "nextpy.ai.rag.document_loaders.weaviate.base.WeaviateReader", - "async_web": "nextpy.ai.rag.document_loaders.web.async_web.base.AsyncWebPageReader", - "beautiful_soup_web": "nextpy.ai.rag.document_loaders.web.beautiful_soup_web.base.BeautifulSoupWebReader", - "knowledge_base_web": "nextpy.ai.rag.document_loaders.web.knowledge_base.base.RAGWebReader", - # "readability_web": "nextpy.ai.rag.document_loaders.web.readability_web.base.ReadabilityWebPageReader", - "rss": "nextpy.ai.rag.document_loaders.web.rss.base.RssReader", - "simple_web": "nextpy.ai.rag.document_loaders.web.simple_web.base.SimpleWebPageReader", - # "sitemap": "nextpy.ai.rag.document_loaders.web.sitemap.base.SitemapReader", - "trafilatura_web": "nextpy.ai.rag.document_loaders.web.trafilatura_web.base.TrafilaturaWebReader", - "unstructured_web": 
"nextpy.ai.rag.document_loaders.web.unstructured_web.base.UnstructuredURLLoader", - "whatsapp": "nextpy.ai.rag.document_loaders.whatsapp.base.WhatsappChatLoader", - "wikipedia": "nextpy.ai.rag.document_loaders.wikipedia.base.WikipediaReader", - "wordlift": "nextpy.ai.rag.document_loaders.wordlift.base.WordLiftLoader", - "wordpress": "nextpy.ai.rag.document_loaders.wordpress.base.WordpressReader", - "youtube_transcript": "nextpy.ai.rag.document_loaders.youtube_transcript.base.YoutubeTranscriptReader", - "zendesk": "nextpy.ai.rag.document_loaders.zendesk.base.ZendeskReader", - "zulip": "nextpy.ai.rag.document_loaders.zulip.base.ZulipReader", - } - reader_class = import_class(mapping[reader_type]) - return reader_class diff --git a/nextpy/ai/rag/document_loaders/README.md b/nextpy/ai/rag/document_loaders/README.md deleted file mode 100644 index a65da93f..00000000 --- a/nextpy/ai/rag/document_loaders/README.md +++ /dev/null @@ -1,3 +0,0 @@ -You can find the loaders from Llama Hub in this temp directory. Please submit any new or updated loaders to the Llama Hub repository: https://github.com/emptycrown/llama-hub/tree/main - -We are updating our library to use Llama Hub as a dependency, but the api will not change. You can continue to use the loaders apis without any issues. diff --git a/nextpy/ai/rag/document_loaders/__init__.py b/nextpy/ai/rag/document_loaders/__init__.py deleted file mode 100644 index 17ff1225..00000000 --- a/nextpy/ai/rag/document_loaders/__init__.py +++ /dev/null @@ -1,127 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -# """Init file.""" -# from nextpy.ai.rag.document_loaders.basereader import BaseReader -# from nextpy.ai.rag.document_loaders.utils import import_loader -# from nextpy.ai.rag.document_loaders.airtable.base import AirtableReader -# # from nextpy.ai.rag.document_loaders.apify.actor.base import ApifyActor -# from nextpy.ai.rag.document_loaders.apify.dataset.base import ApifyDataset -# from nextpy.ai.rag.document_loaders.asana.base import AsanaReader -# from nextpy.ai.rag.document_loaders.azcognitive_search.base import AzCognitiveSearchReader -# # from nextpy.ai.rag.document_loaders.azstorage_blob.base import AzStorageBlobReader -# from nextpy.ai.rag.document_loaders.bilibili.base import BilibiliTranscriptReader -# from nextpy.ai.rag.document_loaders.boarddocs.base import BoardDocsReader -# from nextpy.ai.rag.document_loaders.chatgpt_plugin.base import ChatGPTRetrievalPluginReader -# from nextpy.ai.rag.document_loaders.chroma.base import ChromaReader -# from nextpy.ai.rag.document_loaders.confluence.base import ConfluenceReader -# from nextpy.ai.rag.document_loaders.couchdb.base import SimpleCouchDBReader -# from nextpy.ai.rag.document_loaders.dad_jokes.base import DadJokesReader -# # from nextpy.ai.rag.document_loaders.database.base import DatabaseReader -# from nextpy.ai.rag.document_loaders.deeplake.base import DeepLakeReader -# from nextpy.ai.rag.document_loaders.discord.base import DiscordReader -# from nextpy.ai.rag.document_loaders.docugami.base import DocugamiReader -# from nextpy.ai.rag.document_loaders.elasticsearch.base import ElasticsearchReader -# from nextpy.ai.rag.document_loaders.faiss.base import FaissReader -# from nextpy.ai.rag.document_loaders.feedly_rss.base import FeedlyRssReader -# from nextpy.ai.rag.document_loaders.feishu_docs.base import FeishuDocsReader -# from nextpy.ai.rag.document_loaders.file.base import SimpleDirectoryReader -# from nextpy.ai.rag.document_loaders.file.audio.base import AudioTranscriber -# from nextpy.ai.rag.document_loaders.file.audio_gladia.base import GladiaAudioTranscriber -# from nextpy.ai.rag.document_loaders.file.cjk_pdf.base import CJKPDFReader -# from nextpy.ai.rag.document_loaders.file.deepdoctection.base import DeepDoctectionReader -# from nextpy.ai.rag.document_loaders.file.docx.base import DocxReader -# from nextpy.ai.rag.document_loaders.file.epub.base import EpubReader -# from nextpy.ai.rag.document_loaders.file.flat_pdf.base import FlatPdfReader -# # from nextpy.ai.rag.document_loaders.file.image.base import ImageReader -# # from nextpy.ai.rag.document_loaders.file.image_blip.base import ImageCaptionReader -# # from nextpy.ai.rag.document_loaders.file.image_blip2.base import ImageVisionLLMReader -# # from nextpy.ai.rag.document_loaders.file.image_deplot.base import ImageTabularChartReader -# from nextpy.ai.rag.document_loaders.file.ipynb.base import IPYNBReader -# from nextpy.ai.rag.document_loaders.file.json.base import JSONReader -# from nextpy.ai.rag.document_loaders.file.markdown.base import MarkdownReader -# from nextpy.ai.rag.document_loaders.file.mbox.base import MboxReader -# from nextpy.ai.rag.document_loaders.file.paged_csv.base import PagedCSVReader -# from nextpy.ai.rag.document_loaders.file.pandas_csv.base import PandasCSVReader -# from nextpy.ai.rag.document_loaders.file.pandas_excel.base import PandasExcelReader -# from nextpy.ai.rag.document_loaders.file.pdf.base import PDFReader -# from nextpy.ai.rag.document_loaders.file.pdf_miner.base import PDFMinerReader -# from nextpy.ai.rag.document_loaders.file.pptx.base 
import PptxReader -# from nextpy.ai.rag.document_loaders.file.pymu_pdf.base import PyMuPDFReader -# from nextpy.ai.rag.document_loaders.file.rdf.base import RDFReader -# from nextpy.ai.rag.document_loaders.file.simple_csv.base import SimpleCSVReader -# from nextpy.ai.rag.document_loaders.file.unstructured.base import UnstructuredReader -# from nextpy.ai.rag.document_loaders.firebase_realtimedb.base import FirebaseRealtimeDatabaseReader -# from nextpy.ai.rag.document_loaders.firestore.base import FirestoreReader -# # from nextpy.ai.rag.document_loaders.github_repo.base import GithubRepositoryReader -# from nextpy.ai.rag.document_loaders.github_repo_issues.base import GitHubRepositoryIssuesReader -# from nextpy.ai.rag.document_loaders.gmail.base import GmailReader -# from nextpy.ai.rag.document_loaders.google_calendar.base import GoogleCalendarReader -# from nextpy.ai.rag.document_loaders.google_docs.base import GoogleDocsReader -# # from nextpy.ai.rag.document_loaders.google_drive.base import GoogleDriveReader -# from nextpy.ai.rag.document_loaders.google_keep.base import GoogleKeepReader -# from nextpy.ai.rag.document_loaders.google_sheets.base import GoogleSheetsReader -# from nextpy.ai.rag.document_loaders.gpt_repo.base import GPTRepoReader -# from nextpy.ai.rag.document_loaders.graphdb_cypher.base import GraphDBCypherReader -# from nextpy.ai.rag.document_loaders.graphql.base import GraphQLReader -# from nextpy.ai.rag.document_loaders.hatena_blog.base import HatenaBlogReader -# from nextpy.ai.rag.document_loaders.hubspot.base import HubspotReader -# from nextpy.ai.rag.document_loaders.huggingface.fs.base import HuggingFaceFSReader -# from nextpy.ai.rag.document_loaders.intercom.base import IntercomReader -# from nextpy.ai.rag.document_loaders.jira.base import JiraReader -# # from nextpy.ai.rag.document_loaders.joplin.base import JoplinReader -# from nextpy.ai.rag.document_loaders.jsondata.base import JSONDataReader -# from nextpy.ai.rag.document_loaders.kaltura.esearch.base import KalturaESearchReader -# from nextpy.ai.rag.document_loaders.kibela.base import KibelaReader -# # from nextpy.ai.rag.document_loaders.make_com.base import MakeWrapper -# from nextpy.ai.rag.document_loaders.mangoapps_guides.base import MangoppsGuidesReader -# from nextpy.ai.rag.document_loaders.maps.base import OpenMap -# from nextpy.ai.rag.document_loaders.memos.base import MemosReader -# from nextpy.ai.rag.document_loaders.metal.base import MetalReader -# from nextpy.ai.rag.document_loaders.milvus.base import MilvusReader -# from nextpy.ai.rag.document_loaders.mondaydotcom.base import MondayReader -# from nextpy.ai.rag.document_loaders.mongo.base import SimpleMongoReader -# from nextpy.ai.rag.document_loaders.notion.base import NotionPageReader -# # from nextpy.ai.rag.document_loaders.obsidian.base import ObsidianReader -# # from nextpy.ai.rag.document_loaders.opendal_reader.base import OpendalReader -# # from nextpy.ai.rag.document_loaders.opendal_reader.azblob.base import OpendalAzblobReader -# # from nextpy.ai.rag.document_loaders.opendal_reader.gcs.base import OpendalGcsReader -# # from nextpy.ai.rag.document_loaders.opendal_reader.s3.base import OpendalS3Reader -# from nextpy.ai.rag.document_loaders.outlook_localcalendar.base import OutlookLocalCalendarReader -# # from nextpy.ai.rag.document_loaders.pandas_ai.base import PandasAIReader -# # from nextpy.ai.rag.document_loaders.papers.arxiv.base import ArxivReader -# from nextpy.ai.rag.document_loaders.papers.pubmed.base import PubmedReader -# from 
nextpy.ai.rag.document_loaders.pinecone.base import PineconeReader -# from nextpy.ai.rag.document_loaders.qdrant.base import QdrantReader -# from nextpy.ai.rag.document_loaders.readwise.base import ReadwiseReader -# from nextpy.ai.rag.document_loaders.reddit.base import RedditReader -# # from nextpy.ai.rag.document_loaders.remote.base import RemoteReader -# # from nextpy.ai.rag.document_loaders.remote_depth.base import RemoteDepthReader -# # from nextpy.ai.rag.document_loaders.s3.base import S3Reader -# # from nextpy.ai.rag.document_loaders.singlestore.base import SingleStoreReader -# from nextpy.ai.rag.document_loaders.slack.base import SlackReader -# from nextpy.ai.rag.document_loaders.snscrape_twitter.base import SnscrapeTwitterReader -# from nextpy.ai.rag.document_loaders.spotify.base import SpotifyReader -# from nextpy.ai.rag.document_loaders.stackoverflow.base import StackoverflowReader -# from nextpy.ai.rag.document_loaders.steamship.base import SteamshipFileReader -# from nextpy.ai.rag.document_loaders.string_iterable.base import StringIterableReader -# from nextpy.ai.rag.document_loaders.trello.base import TrelloReader -# from nextpy.ai.rag.document_loaders.twitter.base import TwitterTweetReader -# from nextpy.ai.rag.document_loaders.weather.base import WeatherReader -# from nextpy.ai.rag.document_loaders.weaviate.base import WeaviateReader -# from nextpy.ai.rag.document_loaders.web.async_web.base import AsyncWebPageReader -# from nextpy.ai.rag.document_loaders.web.beautiful_soup_web.base import BeautifulSoupWebReader -# from nextpy.ai.rag.document_loaders.web.knowledge_base.base import RAGWebReader -# # from nextpy.ai.rag.document_loaders.web.readability_web.base import ReadabilityWebPageReader -# from nextpy.ai.rag.document_loaders.web.rss.base import RssReader -# from nextpy.ai.rag.document_loaders.web.simple_web.base import SimpleWebPageReader -# # from nextpy.ai.rag.document_loaders.web.sitemap.base import SitemapReader -# from nextpy.ai.rag.document_loaders.web.trafilatura_web.base import TrafilaturaWebReader -# from nextpy.ai.rag.document_loaders.web.unstructured_web.base import UnstructuredURLLoader -# from nextpy.ai.rag.document_loaders.whatsapp.base import WhatsappChatLoader -# from nextpy.ai.rag.document_loaders.wikipedia.base import WikipediaReader -# from nextpy.ai.rag.document_loaders.wordlift.base import WordLiftLoader -# from nextpy.ai.rag.document_loaders.wordpress.base import WordpressReader -# from nextpy.ai.rag.document_loaders.youtube_transcript.base import YoutubeTranscriptReader -# from nextpy.ai.rag.document_loaders.zendesk.base import ZendeskReader -# from nextpy.ai.rag.document_loaders.zulip.base import ZulipReader diff --git a/nextpy/ai/rag/document_loaders/add_loader.sh b/nextpy/ai/rag/document_loaders/add_loader.sh deleted file mode 100644 index ddecec9e..00000000 --- a/nextpy/ai/rag/document_loaders/add_loader.sh +++ /dev/null @@ -1,5 +0,0 @@ -mkdir $1; -touch $1/base.py; -touch $1/README.md; -touch $1/__init__.py; -echo "\"\"\"Init file.\"\"\"" > $1/__init__.py; diff --git a/nextpy/ai/rag/document_loaders/airtable/README.md b/nextpy/ai/rag/document_loaders/airtable/README.md deleted file mode 100644 index 881d0b99..00000000 --- a/nextpy/ai/rag/document_loaders/airtable/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Airtable Loader - -This loader loads documents from Airtable. The user specifies an API token to initialize the AirtableReader. They then specify a `table_id` and a `base_id` to load in the corresponding DocumentNode objects. 
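Ahead of the Usage section below (whose snippet has lost its placeholder strings), here is a minimal sketch of the same Airtable call pattern. It imports the class directly from the module path in this diff rather than via `download_loader`, which is an assumption; the token and IDs are placeholders, and `pyairtable` must be installed:

```python
from nextpy.ai.rag.document_loaders.airtable.base import AirtableReader

# Placeholders only; substitute a real API token, base ID and table ID.
reader = AirtableReader(api_key="your-airtable-token")
documents = reader.load_data(base_id="your-base-id", table_id="your-table-id")
print(len(documents))
```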
- -## Usage - -Here's an example usage of the AirtableReader. - -```python -from nextpy.ai import download_loader -import os - -AirtableReader = download_loader('AirtableReader') - -reader = AirtableReader(") -documents = reader.load_data(table_id="",base_id="") - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/airtable/__init__.py b/nextpy/ai/rag/document_loaders/airtable/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/airtable/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/airtable/base.py b/nextpy/ai/rag/document_loaders/airtable/base.py deleted file mode 100644 index 0420b24b..00000000 --- a/nextpy/ai/rag/document_loaders/airtable/base.py +++ /dev/null @@ -1,38 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Airtable reader.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class AirtableReader(BaseReader): - """Airtable reader. Reads data from a table in a base. - - Args: - api_key (str): Airtable API key. - """ - - def __init__(self, api_key: str) -> None: - """Initialize Airtable reader.""" - self.api_key = api_key - - def load_data(self, base_id: str, table_id: str) -> List[DocumentNode]: - """Load data from a table in a base. - - Args: - table_id (str): Table ID. - base_id (str): Base ID. - - Returns: - List[DocumentNode]: List of LIDocuments. - """ - from pyairtable import Table - - metadata = {"base_id": base_id, "table_id": table_id} - - table = Table(self.api_key, base_id, table_id) - all_records = table.all() - return [DocumentNode(text=f"{all_records}", extra_info=metadata)] diff --git a/nextpy/ai/rag/document_loaders/airtable/requirements.txt b/nextpy/ai/rag/document_loaders/airtable/requirements.txt deleted file mode 100644 index 83c39582..00000000 --- a/nextpy/ai/rag/document_loaders/airtable/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pyairtable \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/apify/actor/README.md b/nextpy/ai/rag/document_loaders/apify/actor/README.md deleted file mode 100644 index d55ffb27..00000000 --- a/nextpy/ai/rag/document_loaders/apify/actor/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Apify Actor Loader - -[Apify](https://apify.com/) is a cloud platform for web scraping and data extraction, -which provides an [ecosystem](https://apify.com/store) of more than a thousand -ready-made apps called _Actors_ for various scraping, crawling, and extraction use cases. - -This loader runs a specific Actor and loads its results. 
- -## Usage - -In this example, we’ll use the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor, -which can deeply crawl websites such as documentation, knowledge bases, help centers, -or blogs, and extract text content from the web pages. -The extracted text then can be fed to a vector index or language model like GPT -in order to answer questions from it. - -To use this loader, you need to have a (free) Apify account -and set your [Apify API token](https://console.apify.com/account/integrations) in the code. - -```python -from nextpy.ai import download_loader -from nextpy.ai.schema import DocumentNode - -# Converts a single record from the Actor's resulting dataset to the LlamaIndex format -def tranform_dataset_item(item): - return DocumentNode( - text=item.get("text"), - extra_info={ - "url": item.get("url"), - }, - ) - -ApifyActor = download_loader("ApifyActor") - -reader = ApifyActor("") -documents = reader.load_data( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://gpt-index.readthedocs.io/en/latest"}]} - dataset_mapping_function=tranform_dataset_item, -) -``` - -This loader is designed to be used as a way to load data into -[LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently -used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. -See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/apify/actor/__init__.py b/nextpy/ai/rag/document_loaders/apify/actor/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/apify/actor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/apify/actor/base.py b/nextpy/ai/rag/document_loaders/apify/actor/base.py deleted file mode 100644 index 9fabb080..00000000 --- a/nextpy/ai/rag/document_loaders/apify/actor/base.py +++ /dev/null @@ -1,69 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Apify Actor reader.""" -from typing import Callable, Dict, List, Optional - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ApifyActor(BaseReader): - """Apify Actor reader. - Calls an Actor on the Apify platform and reads its resulting dataset when it finishes. - - Args: - apify_api_token (str): Apify API token. 
- """ - - def __init__(self, apify_api_token: str) -> None: - """Initialize the Apify Actor reader.""" - from apify_client import ApifyClient - - self.apify_api_token = apify_api_token - self.apify_client = ApifyClient(apify_api_token) - - def load_data( - self, - actor_id: str, - run_input: Dict, - dataset_mapping_function: Callable[[Dict], DocumentNode], - *, - build: Optional[str] = None, - memory_mbytes: Optional[int] = None, - timeout_secs: Optional[int] = None, - ) -> List[DocumentNode]: - """Call an Actor on the Apify platform, wait for it to finish, and return its resulting dataset. - Args: - actor_id (str): The ID or name of the Actor. - run_input (Dict): The input object of the Actor that you're trying to run. - dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the DocumentNode class. - build (str, optional): Optionally specifies the Actor build to run. It can be either a build tag or build number. - memory_mbytes (int, optional): Optional memory limit for the run, in megabytes. - timeout_secs (int, optional): Optional timeout for the run, in seconds. - - Returns: - List[DocumentNode]: List of documents. - """ - actor_call = self.apify_client.actor(actor_id).call( - run_input=run_input, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - ) - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - ApifyDataset = import_loader("ApifyDataset") - except ImportError: - ApifyDataset = download_loader("ApifyDataset") - - reader = ApifyDataset(self.apify_api_token) - documents = reader.load_data( - dataset_id=actor_call.get("defaultDatasetId"), - dataset_mapping_function=dataset_mapping_function, - ) - - return documents diff --git a/nextpy/ai/rag/document_loaders/apify/actor/requirements.txt b/nextpy/ai/rag/document_loaders/apify/actor/requirements.txt deleted file mode 100644 index 5a3a1cbf..00000000 --- a/nextpy/ai/rag/document_loaders/apify/actor/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -apify-client diff --git a/nextpy/ai/rag/document_loaders/apify/dataset/README.md b/nextpy/ai/rag/document_loaders/apify/dataset/README.md deleted file mode 100644 index 915e7dac..00000000 --- a/nextpy/ai/rag/document_loaders/apify/dataset/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Apify Dataset Loader - -[Apify](https://apify.com/) is a cloud platform for web scraping and data extraction, -which provides an [ecosystem](https://apify.com/store) of more than a thousand -ready-made apps called _Actors_ for various scraping, crawling, and extraction use cases. - -This loader loads documents from an existing [Apify dataset](https://docs.apify.com/platform/storage/dataset). - -## Usage - -In this example, we’ll load a dataset generated by -the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor, -which can deeply crawl websites such as documentation, knowledge bases, help centers, -or blogs, and extract text content from the web pages. -The extracted text then can be fed to a vector index or language model like GPT -in order to answer questions from it. - -To use this loader, you need to have a (free) Apify account -and set your [Apify API token](https://console.apify.com/account/integrations) in the code. 
- -```python -from nextpy.ai import download_loader -from nextpy.ai.schema import DocumentNode - -# Converts a single record from the Apify dataset to the LlamaIndex format -def tranform_dataset_item(item): - return DocumentNode( - text=item.get("text"), - extra_info={ - "url": item.get("url"), - }, - ) - -ApifyDataset = download_loader("ApifyDataset") - -reader = ApifyDataset("")) -documents = reader.load_data(dataset_id="", dataset_mapping_function=tranform_dataset_item) -``` diff --git a/nextpy/ai/rag/document_loaders/apify/dataset/__init__.py b/nextpy/ai/rag/document_loaders/apify/dataset/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/apify/dataset/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/apify/dataset/base.py b/nextpy/ai/rag/document_loaders/apify/dataset/base.py deleted file mode 100644 index fc4f8025..00000000 --- a/nextpy/ai/rag/document_loaders/apify/dataset/base.py +++ /dev/null @@ -1,45 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Apify dataset reader.""" -from typing import Callable, Dict, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ApifyDataset(BaseReader): - """Apify Dataset reader. - Reads a dataset on the Apify platform. - - Args: - apify_api_token (str): Apify API token. - """ - - def __init__(self, apify_api_token: str) -> None: - """Initialize Apify dataset reader.""" - from apify_client import ApifyClient - - self.apify_client = ApifyClient(apify_api_token) - - def load_data( - self, dataset_id: str, dataset_mapping_function: Callable[[Dict], DocumentNode] - ) -> List[DocumentNode]: - """Load data from the Apify dataset. - Args: - dataset_id (str): Dataset ID. - dataset_mapping_function (Callable[[Dict], DocumentNode]): Function to map dataset items to DocumentNode. - - Returns: - List[DocumentNode]: List of documents. - """ - items_list = self.apify_client.dataset(dataset_id).list_items(clean=True) - - document_list = [] - for item in items_list.items: - DocumentNode = dataset_mapping_function(item) - if not isinstance(DocumentNode, DocumentNode): - raise ValueError("Dataset_mapping_function must return a DocumentNode") - document_list.append(DocumentNode) - - return document_list diff --git a/nextpy/ai/rag/document_loaders/apify/dataset/requirements.txt b/nextpy/ai/rag/document_loaders/apify/dataset/requirements.txt deleted file mode 100644 index 5a3a1cbf..00000000 --- a/nextpy/ai/rag/document_loaders/apify/dataset/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -apify-client diff --git a/nextpy/ai/rag/document_loaders/asana/README.md b/nextpy/ai/rag/document_loaders/asana/README.md deleted file mode 100644 index 7f3e5b11..00000000 --- a/nextpy/ai/rag/document_loaders/asana/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Asana Loader - -This loader loads documents from Asana. The user specifies an API token to initialize the AsanaReader. 
They then specify a `workspace_id` to load in the corresponding DocumentNode objects. - -## Usage - -Here's an example usage of the AsanaReader. - -```python -from nextpy.ai import download_loader -import os - -AsanaReader = download_loader('AsanaReader') - -reader = AsanaReader(") -documents = reader.load_data(workspace_id=") - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/asana/__init__.py b/nextpy/ai/rag/document_loaders/asana/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/asana/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/asana/base.py b/nextpy/ai/rag/document_loaders/asana/base.py deleted file mode 100644 index dc9c5604..00000000 --- a/nextpy/ai/rag/document_loaders/asana/base.py +++ /dev/null @@ -1,63 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Asana reader.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class AsanaReader(BaseReader): - """Asana reader. Reads data from an Asana workspace. - - Args: - asana_token (str): Asana token. - asana_workspace (str): Asana workspace. - """ - - def __init__(self, asana_token: str) -> None: - """Initialize Asana reader.""" - import asana - - self.client = asana.Client.access_token(asana_token) - - def load_data(self, workspace_id: str) -> List[DocumentNode]: - """Load data from the workspace. - - Args: - workspace_id (str): Workspace ID. - - Returns: - List[DocumentNode]: List of documents. 
- """ - results = [] - - projects = self.client.projects.find_all({"workspace": workspace_id}) - - for project in projects: - tasks = self.client.tasks.find_all( - { - "project": project["gid"], - "opt_fields": "name,notes,completed,due_on,assignee", - } - ) - for task in tasks: - stories = self.client.tasks.stories(task["gid"], opt_fields="type,text") - comments = "\n".join( - [story["text"] for story in stories if story["type"] == "comment"] - ) - results.append( - DocumentNode( - text=task["name"] + " " + task["notes"] + " " + comments, - extra_info={ - "task_id": task["gid"], - "name": task["name"], - "assignee": task["assignee"], - "project": project["name"], - "workspace_id": workspace_id, - }, - ) - ) - - return results diff --git a/nextpy/ai/rag/document_loaders/asana/requirements.txt b/nextpy/ai/rag/document_loaders/asana/requirements.txt deleted file mode 100644 index d7cf09d4..00000000 --- a/nextpy/ai/rag/document_loaders/asana/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -asana diff --git a/nextpy/ai/rag/document_loaders/azcognitive_search/README.md b/nextpy/ai/rag/document_loaders/azcognitive_search/README.md deleted file mode 100644 index 1a5f4f20..00000000 --- a/nextpy/ai/rag/document_loaders/azcognitive_search/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# Azure Cognitive Search Loader - -The AzCognitiveSearchReader Loader returns a set of texts corresponding to documents retrieved from specific index of Azure Cognitive Search. -The user initializes the loader with credentials (service name and key) and the index name. - -## Usage - -Here's an example usage of the AzCognitiveSearchReader. - -```python -from nextpy.ai import download_loader - -AzCognitiveSearchReader = download_loader("AzCognitiveSearchReader") - -reader = AzCognitiveSearchReader( - "", - ", - " -) - - -query_sample = "" -documents = reader.load_data( - query="", content_field="", filter="" -) -``` - -## Usage in combination with langchain - -```python - - from nextpy.ai import GPTVectorDBIndex, download_loader - from langchain.chains.conversation.memory import ConversationBufferMemory - from langchain.agents import Tool, AgentExecutor, load_tools, initialize_agent - - AzCognitiveSearchReader = download_loader("AzCognitiveSearchReader") - - az_loader = AzCognitiveSearchReader( - COGNITIVE_SEARCH_SERVICE_NAME, - COGNITIVE_SEARCH_KEY, - INDEX_NAME) - - documents = az_loader.load_data(query, field_name) - - index = GPTVectorDBIndex.from_documents(documents, service_context=service_context) - - tools = [ - Tool( - name="Azure cognitive search index", - func=lambda q: index.query(q), - description=f"Useful when you want answer questions about the text on azure cognitive search.", - ), - ] - memory = ConversationBufferMemory(memory_key="chat_history") - agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory - ) - - result = agent_chain.run(input="How can I contact with my health insurance?") -``` - - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
\ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/azcognitive_search/__init__.py b/nextpy/ai/rag/document_loaders/azcognitive_search/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/azcognitive_search/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/azcognitive_search/base.py b/nextpy/ai/rag/document_loaders/azcognitive_search/base.py deleted file mode 100644 index cb33dd52..00000000 --- a/nextpy/ai/rag/document_loaders/azcognitive_search/base.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Azure Cognitive Search reader. -A loader that fetches documents from specific index. - -""" - -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class AzCognitiveSearchReader(BaseReader): - """General reader for any Azure Cognitive Search index reader. - - Args: - service_name (str): the name of azure cognitive search service. - search_key (str): provide azure search access key directly. - index (str): index name - - """ - - def __init__(self, service_name: str, searck_key: str, index: str) -> None: - """Initialize Azure cognitive search service using the search key.""" - import logging - - from azure.core.credentials import AzureKeyCredential - from azure.search.documents import SearchClient - - self.service_name = service_name - - logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy") - logger.setLevel(logging.WARNING) - - azure_credential = AzureKeyCredential(searck_key) - - self.search_client = SearchClient( - endpoint=f"https://{service_name}.search.windows.net", - index_name=index, - credential=azure_credential, - ) - - def load_data( - self, query: str, content_field: str, filter: Optional[str] = None - ) -> List[DocumentNode]: - """Read data from azure cognitive search index. - - Args: - query (str): search term in Azure Search index - content_field (str): field name of the DocumentNode content. - filter (str): Filter expression. For example : 'sourcepage eq - 'employee_handbook-3.pdf' and sourcefile eq 'employee_handbook.pdf'' - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - search_result = self.search_client.search(query, filter=filter) - - docs = [] - for result in search_result: - text = result[content_field] - metadata = { - "id": result["id"], - "score": result["@search.score"], - "service_name": self.service_name, - "query": query, - "content_field": content_field, - "filter": filter, - } - docs.append(DocumentNode(text=text, extra_info=metadata)) - - return docs diff --git a/nextpy/ai/rag/document_loaders/azcognitive_search/requirements.txt b/nextpy/ai/rag/document_loaders/azcognitive_search/requirements.txt deleted file mode 100644 index 9dbd6a12..00000000 --- a/nextpy/ai/rag/document_loaders/azcognitive_search/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -azure-search-documents -azure-identity \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/azstorage_blob/README.md b/nextpy/ai/rag/document_loaders/azstorage_blob/README.md deleted file mode 100644 index fdfb3274..00000000 --- a/nextpy/ai/rag/document_loaders/azstorage_blob/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Azure Storage Blob Loader - -This loader parses any file stored as an Azure Storage blob or the entire container (with an optional prefix / attribute filter) if no particular file is specified. When initializing `AzStorageBlobReader`, you may pass in your account url with a SAS token or crdentials to authenticate. - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -## Usage - -To use this loader, you need to pass in the name of your Azure Storage Container. After that, if you want to just parse a single file, pass in its blob name. Note that if the file is nested in a subdirectory, the blob name should contain the path such as `subdirectory/input.txt`. This loader is a thin wrapper over the [Azure Blob Storage Client for Python](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=managed-identity%2Croles-azure-portal%2Csign-in-azure-cli), see [ContainerClient](https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python) for detailed parameter usage options. - - -### Using a Storage Accout SAS URL -```python -from nextpy.ai import download_loader - -AzStorageBlobReader = download_loader("AzStorageBlobReader") - -loader = AzStorageBlobReader(container='scrabble-dictionary', blob='dictionary.txt', account_url='') - -documents = loader.load_data() -``` - -### Using Azure AD -Ensure the Azure Identity library is available ```pip install azure-identity``` - -The sample below downloads all files in the container using the default credential, alternative credential options are avaible such as a service principal ```ClientSecretCredential``` - -```python -from nextpy.ai import download_loader -from azure.identity import DefaultAzureCredential - -default_credential = DefaultAzureCredential() - -AzStorageBlobReader = download_loader("AzStorageBlobReader") - -loader = AzStorageBlobReader(container_name='scrabble-dictionary', account_url='https://.blob.core.windows.net', credential=default_credential) - -documents = loader.load_data() -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
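
The two snippets above cover a single blob via a SAS URL and a whole container via Azure AD. As a hedged complement, the sketch below uses the `name_starts_with` argument documented in `base.py` further down to load only blobs under a prefix; the account URL, container, and prefix are illustrative. Note that `base.py` declares the keyword `container_name`, so the `container=` keyword in the first snippet appears to be out of date.

```python
from azure.identity import DefaultAzureCredential
from nextpy.ai import download_loader

AzStorageBlobReader = download_loader("AzStorageBlobReader")

# Illustrative values -- account URL, container, and prefix are placeholders.
loader = AzStorageBlobReader(
    container_name="policies",
    name_starts_with="handbooks/2023/",  # only blobs under this prefix are downloaded
    account_url="https://<account>.blob.core.windows.net",
    credential=DefaultAzureCredential(),
)

documents = loader.load_data()
```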
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/azstorage_blob/__init__.py b/nextpy/ai/rag/document_loaders/azstorage_blob/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/azstorage_blob/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/azstorage_blob/base.py b/nextpy/ai/rag/document_loaders/azstorage_blob/base.py deleted file mode 100644 index abe2150c..00000000 --- a/nextpy/ai/rag/document_loaders/azstorage_blob/base.py +++ /dev/null @@ -1,133 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Azure Storage Blob file and directory reader. - -A loader that fetches a file or iterates through a directory from Azure Storage Blob. - -""" -import logging -import math -import tempfile -import time -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -class AzStorageBlobReader(BaseReader): - """General reader for any Azure Storage Blob file or directory. - - Args: - container_name (str): name of the container for the blob. - blob (Optional[str]): name of the file to download. If none specified - this loader will iterate through list of blobs in the container. - name_starts_with (Optional[str]): filter the list of blobs to download - to only those whose names begin with the specified string. - include: (Union[str, List[str], None]): Specifies one or more additional - datasets to include in the response. Options include: 'snapshots', - 'metadata', 'uncommittedblobs', 'copy', 'deleted', - 'deletedwithversions', 'tags', 'versions', 'immutabilitypolicy', - 'legalhold'. - file_extractor (Optional[Dict[str, Union[str, BaseReader]]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - account_url (str): URI to the storage account, may include SAS token. - credential (Union[str, Dict[str, str], AzureNamedKeyCredential, AzureSasCredential, TokenCredential, None] = None): - The credentials with which to authenticate. This is optional if the account URL already has a SAS token. 
- """ - - def __init__( - self, - *args: Any, - container_name: str, - blob: Optional[str] = None, - name_starts_with: Optional[str] = None, - include: Optional[Any] = None, - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - account_url: str, - credential: Optional[Any] = None, - **kwargs: Any, - ) -> None: - """Initializes Azure Storage Account.""" - super().__init__(*args, **kwargs) - - self.container_name = container_name - self.blob = blob - self.name_starts_with = name_starts_with - self.include = include - - self.file_extractor = file_extractor - - self.account_url = account_url - self.credential = credential - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from Azure Storage Blob.""" - # from azure.core.credentials import AzureNamedKeyCredential, AzureSasCredential, TokenCredential - # from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient - from azure.storage.blob import ContainerClient - - container_client = ContainerClient( - self.account_url, self.container_name, credential=self.credential - ) - total_download_start_time = time.time() - - with tempfile.TemporaryDirectory() as temp_dir: - if self.blob: - extension = Path(self.blob).suffix - download_file_path = ( - f"{temp_dir}/{next(tempfile._get_candidate_names())}{extension}" - ) - logger.info(f"Start download of {self.blob}") - start_time = time.time() - stream = container_client.download_blob(self.blob) - with open(file=download_file_path, mode="wb") as download_file: - stream.readinto(download_file) - end_time = time.time() - logger.info( - f"{self.blob} downloaded in {end_time - start_time} seconds." - ) - else: - logger.info("Listing blobs") - blobs_list = container_client.list_blobs( - self.name_starts_with, self.include - ) - for obj in blobs_list: - extension = Path(obj.name).suffix - download_file_path = ( - f"{temp_dir}/{next(tempfile._get_candidate_names())}{extension}" - ) - logger.info(f"Start download of {obj.name}") - start_time = time.time() - stream = container_client.download_blob(obj) - with open(file=download_file_path, mode="wb") as download_file: - stream.readinto(download_file) - end_time = time.time() - logger.info( - f"{obj.name} downloaded in {end_time - start_time} seconds." - ) - - total_download_end_time = time.time() - total_elapsed_time = math.ceil( - total_download_end_time - total_download_start_time - ) - logger.info( - f"Downloading completed in approximately {total_elapsed_time // 60}min {total_elapsed_time % 60}s." 
- ) - logger.info("DocumentNode creation starting") - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - SimpleDirectoryReader = import_loader("SimpleDirectoryReader") - except ImportError: - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - loader = SimpleDirectoryReader(temp_dir, file_extractor=self.file_extractor) - - return loader.load_data() diff --git a/nextpy/ai/rag/document_loaders/azstorage_blob/requirements.txt b/nextpy/ai/rag/document_loaders/azstorage_blob/requirements.txt deleted file mode 100644 index fa3619d2..00000000 --- a/nextpy/ai/rag/document_loaders/azstorage_blob/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -azure-storage-blob -azure-identity \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/basereader.py b/nextpy/ai/rag/document_loaders/basereader.py deleted file mode 100644 index c9aa434b..00000000 --- a/nextpy/ai/rag/document_loaders/basereader.py +++ /dev/null @@ -1,21 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Base reader class.""" -from abc import abstractmethod -from typing import Any, List - -from nextpy.ai.schema import DocumentNode - - -class BaseReader: - """Utilities for loading data from a directory.""" - - @abstractmethod - def load_data(self, *args: Any, **load_kwargs: Any) -> List[DocumentNode]: - """Load data from the input directory.""" - - def load_langchain_documents(self, **load_kwargs: Any) -> List[DocumentNode]: - """Load data in LangChain DocumentNode format.""" - docs = self.load_data(**load_kwargs) - return [d.to_langchain_format() for d in docs] diff --git a/nextpy/ai/rag/document_loaders/bilibili/README.md b/nextpy/ai/rag/document_loaders/bilibili/README.md deleted file mode 100644 index 1916024f..00000000 --- a/nextpy/ai/rag/document_loaders/bilibili/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Bilibili Transcript Loader - -This loader utilizes the `bilibili_api` to fetch the text transcript from Bilibili, one of the most beloved long-form video sites in China. - -With this BilibiliTranscriptReader, users can easily obtain the transcript of their desired video content on the platform. - -## Usage - -To use this loader, you need to pass in an array of Bilibili video links. - -```python -from nextpy.ai import download_loader - -BilibiliTranscriptReader= download_loader("BilibiliTranscriptReader") -loader = BilibiliTranscriptReader() -documents = loader.load_data(video_urls=['https://www.bilibili.com/video/BV1yx411L73B/']) -``` - -Note that there is no official API available for Bilibili Transcript, so changes to the official website can sometimes cause issues. - -This loader is designed to be used as a way to load data into [Llama Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
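
`basereader.py` above is the whole contract a loader has to satisfy: implement `load_data` and return `DocumentNode` objects, and `load_langchain_documents` comes for free. Here is a minimal, hedged sketch of a custom reader written against that interface; the class name, file handling, and metadata keys are illustrative, not part of the library.

```python
from pathlib import Path
from typing import Any, List

from nextpy.ai.rag.document_loaders.basereader import BaseReader
from nextpy.ai.schema import DocumentNode


class MarkdownDirectoryReader(BaseReader):
    """Toy reader: one DocumentNode per Markdown file in a directory."""

    def load_data(self, directory: str, **load_kwargs: Any) -> List[DocumentNode]:
        docs = []
        for path in sorted(Path(directory).glob("*.md")):
            docs.append(
                DocumentNode(
                    text=path.read_text(encoding="utf-8"),
                    extra_info={"file_path": str(path), "file_name": path.name},
                )
            )
        return docs


# Usage: the inherited load_langchain_documents() forwards its kwargs to
# load_data() and converts the results to LangChain's document format.
reader = MarkdownDirectoryReader()
documents = reader.load_data(directory="./docs")
langchain_docs = reader.load_langchain_documents(directory="./docs")
```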
diff --git a/nextpy/ai/rag/document_loaders/bilibili/__init__.py b/nextpy/ai/rag/document_loaders/bilibili/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/bilibili/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/bilibili/base.py b/nextpy/ai/rag/document_loaders/bilibili/base.py deleted file mode 100644 index 309c169b..00000000 --- a/nextpy/ai/rag/document_loaders/bilibili/base.py +++ /dev/null @@ -1,71 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple Reader that reads transcript and general infor of Bilibili video.""" -import warnings -from typing import Any, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class BilibiliTranscriptReader(BaseReader): - """Bilibili Transcript and video info reader.""" - - @staticmethod - def get_bilibili_info_and_subs(bili_url): - import json - import re - - import requests - from bilibili_api import sync, video - - bvid = re.search(r"BV\w+", bili_url).group() - # Create credential object - v = video.Video(bvid=bvid) - # Get video info and basic infor - video_info = sync(v.get_info()) - title = video_info["title"] - desc = video_info["desc"] - - # Get subtitle url - sub_list = video_info["subtitle"]["list"] - if sub_list: - sub_url = sub_list[0]["subtitle_url"] - result = requests.get(sub_url) - raw_sub_titles = json.loads(result.content)["body"] - raw_transcript = " ".join([c["content"] for c in raw_sub_titles]) - # Add basic video info to transcript - raw_transcript_with_meta_info = f"Video Title: {title}, description: {desc}\nTranscript: {raw_transcript}" - return raw_transcript_with_meta_info - else: - raw_transcript = "" - warnings.warn( - f"No subtitles found for video: {bili_url}. Return Empty transcript." - ) - return raw_transcript - - def load_data( - self, video_urls: List[str], **load_kwargs: Any - ) -> List[DocumentNode]: - """Load auto generated Video Transcripts from Bilibili, including additional metadata. - - Args: - video_urls (List[str]): List of Bilibili links for which transcripts are to be read. - - Returns: - List[DocumentNode]: A list of DocumentNode objects, each containing the transcript for a Bilibili video. - """ - results = [] - - metadata = {"video_urls": video_urls} - - for bili_url in video_urls: - try: - transcript = self.get_bilibili_info_and_subs(bili_url) - results.append(DocumentNode(text=transcript, extra_info=metadata)) - except Exception as e: - warnings.warn( - f"Error loading transcript for video {bili_url}: {str(e)}. Skipping video." 
- ) - return results diff --git a/nextpy/ai/rag/document_loaders/bilibili/requirements.txt b/nextpy/ai/rag/document_loaders/bilibili/requirements.txt deleted file mode 100644 index 376ce433..00000000 --- a/nextpy/ai/rag/document_loaders/bilibili/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -bilibili_api -requests \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/boarddocs/BoardDocsReader.ipynb b/nextpy/ai/rag/document_loaders/boarddocs/BoardDocsReader.ipynb deleted file mode 100644 index 288177b4..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/BoardDocsReader.ipynb +++ /dev/null @@ -1,81 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a8fda9ff", - "metadata": {}, - "source": [ - "# Bored Llama: BoardDocs in LLaMA Index!\n", - "\n", - "This is a fun experiment to see if we can crawl a BoardDocs site to index it for LangChain fun." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013bd7f3", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from nextpy.ai import download_loader\n", - "\n", - "# Use the temporary / staging location to exercise the loader before first checkin lands\n", - "BoardDocsReader = download_loader(\"BoardDocsReader\",\n", - " loader_hub_url=\"https://raw.githubusercontent.com/dweekly/llama-hub/boarddocs/llama_hub\",\n", - " refresh_cache=True)\n", - "loader = BoardDocsReader(site=\"ca/redwood\", committee_id=\"A4EP6J588C05\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27e1a431", - "metadata": {}, - "outputs": [], - "source": [ - "# now the data is loaded, query it\n", - "from nextpy.ai import GPTSimpleVectorIndex\n", - "\n", - "# load all meetings from this committee.\n", - "documents = loader.load_data(meeting_ids=[\"CPSNV9612DF1\"])\n", - "\n", - "# build an index\n", - "index = GPTSimpleVectorIndex.from_documents(documents)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1701638", - "metadata": {}, - "outputs": [], - "source": [ - "# Now we can start asking it questions!!\n", - "answer = index.query('When did Trustee Weekly start attending meetings?')\n", - "print(answer.response)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/nextpy/ai/rag/document_loaders/boarddocs/README.md b/nextpy/ai/rag/document_loaders/boarddocs/README.md deleted file mode 100644 index 32820403..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# BoardDocs Loader - -This loader retrieves an agenda and associated material from a BoardDocs site. - -This loader is not endorsed by, developed by, supported by, or in any way formally affiliated with Diligent Corporation. - -## Usage - -To use this loader, you'll need to specify which BoardDocs site you want to load, -as well as the committee on the site you want to scrape. 
- -```python -from nextpy.ai import download_loader - -BoardDocsReader = download_loader("BoardDocsReader") - -# For a site URL https://go.boarddocs.com/ca/redwood/Board.nsf/Public -# your site should be set to 'ca/redwood' -# You'll also need to specify which committee on the site you want to index, -# in this case A4EP6J588C05 is the Board of Trustees meeting. -loader = BoardDocsReader(site="ca/redwood", committee_id="A4EP6J588C05") - -# You can optionally specify to load a specific set of meetings; if you don't -# pass in meeting_ids, the loader will attempt to load *all* meeting content. -# Since we're actually scraping a site, this can take a little while. -documents = loader.load_data(meeting_ids=["CPSNV9612DF1"]) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/boarddocs/__init__.py b/nextpy/ai/rag/document_loaders/boarddocs/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/boarddocs/base.py b/nextpy/ai/rag/document_loaders/boarddocs/base.py deleted file mode 100644 index fa5adf4f..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/base.py +++ /dev/null @@ -1,130 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Reader that pulls in a BoardDocs site.""" -import json -from typing import Any, List, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class BoardDocsReader(BaseReader): - """BoardDocs doc reader. - - Read public agendas included on a BoardDocs site. - - Args: - site (str): The BoardDocs site you'd like to index, e.g. "ca/redwood" - committee_id (str): The committee on the site you want to index - """ - - def __init__( - self, - site: str, - committee_id: str, - ) -> None: - """Initialize with parameters.""" - self.site = site - self.committee_id = committee_id - self.base_url = "https://go.boarddocs.com/" + site + "/Board.nsf" - - # set up the headers required for the server to answer - self.headers = { - "accept": "application/json, text/javascript, */*; q=0.01", - "accept-language": "en-US,en;q=0.9", - "content-type": "application/x-www-form-urlencoded; charset=UTF-8", - "sec-ch-ua": '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"', - "sec-ch-ua-mobile": "?0", - "sec-ch-ua-platform": '"macOS"', - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", - "x-requested-with": "XMLHttpRequest", - } - super().__init__() - - def get_meeting_list(self) -> List[dict]: - """Returns a list of meetings for the committee. 
- - Args: - None - Returns: - List[dict]: A list of meetings, each with a meetingID, date, and unid - """ - meeting_list_url = self.base_url + "/BD-GetMeetingsList?open" - - data = "current_committee_id=" + self.committee_id - response = requests.post(meeting_list_url, headers=self.headers, data=data) - meetingsData = json.loads(response.text) - - meetings = [ - { - "meetingID": meeting.get("unique", None), - "date": meeting.get("numberdate", None), - "unid": meeting.get("unid", None), - } - for meeting in meetingsData - ] - return meetings - - def process_meeting( - self, meeting_id: str, index_pdfs: bool = True - ) -> List[DocumentNode]: - """Returns documents from the given meeting.""" - agenda_url = self.base_url + "/PRINT-AgendaDetailed" - - # set the meetingID & committee - data = "id=" + meeting_id + "&" + "current_committee_id=" + self.committee_id - - # POST the request! - response = requests.post(agenda_url, headers=self.headers, data=data) - - import html2text - from bs4 import BeautifulSoup - - # parse the returned HTML - soup = BeautifulSoup(response.content, "html.parser") - agenda_date = soup.find("div", {"class": "print-meeting-date"}).string - agenda_title = soup.find("div", {"class": "print-meeting-name"}).string - [fd.a.get("href") for fd in soup.find_all("div", {"class": "public-file"})] - agenda_data = html2text.html2text(response.text) - - # TODO: index the linked PDFs in agenda_files! - - metadata = { - "committee": self.committee_id, - "title": agenda_title, - "date": agenda_date, - "url": agenda_url, - } - docs = [] - agenda_doc = DocumentNode( - text=agenda_data, - doc_id=meeting_id, - extra_info=metadata, - ) - docs.append(agenda_doc) - return docs - - def load_data( - self, meeting_ids: Optional[List[str]] = None, **load_kwargs: Any - ) -> List[DocumentNode]: - """Load all meetings of the committee. - - Args: - meeting_ids (List[str]): A list of meeting IDs to load. If None, load all meetings. 
- """ - # if a list of meetings wasn't provided, enumerate them all - if not meeting_ids: - meeting_ids = [ - meeting.get("meetingID") for meeting in self.get_meeting_list() - ] - - # process all relevant meetings & return the documents - docs = [] - for meeting_id in meeting_ids: - docs.extend(self.process_meeting(meeting_id)) - return docs diff --git a/nextpy/ai/rag/document_loaders/boarddocs/crawl.ipynb b/nextpy/ai/rag/document_loaders/boarddocs/crawl.ipynb deleted file mode 100644 index c160250c..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/crawl.ipynb +++ /dev/null @@ -1,536 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d764323a", - "metadata": {}, - "source": [ - "# BoardDocs Crawl\n", - "\n", - "Let's figure out how to crawl BoardDocs!\n", - "\n", - "We'll try the Redwood City School District site using BeautifulSoup.\n", - "\n", - "https://go.boarddocs.com/ca/redwood/Board.nsf/Public" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "903d5cbf", - "metadata": {}, - "outputs": [], - "source": [ - "# Each site may contain multiple committees, we have to pick which we want to index\n", - "# For example, RCSD's Board of Trustees is commitee A4EP6J588C05 in ca/redwood\n", - "\n", - "site = \"ca/redwood\"\n", - "committeeID = \"A4EP6J588C05\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1499236d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Status returned by meetings list request: 200\n" - ] - } - ], - "source": [ - "# We'll use the requests module to fetch info here.\n", - "\n", - "import requests\n", - "\n", - "# set up the BoardDocs llms based on params we were passed.\n", - "baseURL = \"https://go.boarddocs.com/\" + site + \"/Board.nsf\"\n", - "publicURL = baseURL + \"/Public\"\n", - "meetingsListURL = baseURL + \"/BD-GetMeetingsList?open\"\n", - "\n", - "# set up the headers required for the server to answer\n", - "headers = {\n", - " \"accept\": \"application/json, text/javascript, */*; q=0.01\",\n", - " \"accept-language\": \"en-US,en;q=0.9\",\n", - " \"content-type\": \"application/x-www-form-urlencoded; charset=UTF-8\",\n", - " \"sec-ch-ua\": \"\\\"Google Chrome\\\";v=\\\"113\\\", \\\"Chromium\\\";v=\\\"113\\\", \\\"Not-A.Brand\\\";v=\\\"24\\\"\",\n", - " \"sec-ch-ua-mobile\": \"?0\",\n", - " \"sec-ch-ua-platform\": \"\\\"macOS\\\"\",\n", - " \"sec-fetch-dest\": \"empty\",\n", - " \"sec-fetch-mode\": \"cors\",\n", - " \"sec-fetch-site\": \"same-origin\",\n", - " \"x-requested-with\": \"XMLHttpRequest\"\n", - "}\n", - "\n", - "# set the committee\n", - "data = \"current_committee_id=\" + committeeID\n", - "\n", - "# POST the request!\n", - "response = requests.post(meetingsListURL, headers=headers, data=data)\n", - "\n", - "print(\"Status returned by meetings list request:\",response.status_code)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6c8ffbc4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "278 meetings found\n" - ] - } - ], - "source": [ - "# Now we're going to parse the JSON data.\n", - "\n", - "# Response is a JSON array of meetings, in this format:\n", - "# [{\"unique\": \"CPSNV9612DF1\",\n", - "# \"name\": \"Board of Trustees Regular Meeting - 7:00pm (Closed Session at 6:15 PM)\",\n", - "# \"current\": \"1\",\n", - "# \"preliveoak\": \"\",\n", - "# \"numberdate\": \"20230510\",\n", - "# \"unid\": \"BE4CAA121D6BFD458525896E00612DF1\"},\n", - "\n", - "# 
print(response.text)\n", - "\n", - "import json\n", - "\n", - "meetingsData = json.loads(response.text)\n", - "\n", - "meetings = [{\"meetingID\": meeting.get('unique', None), \n", - " \"date\": meeting.get('numberdate', None), \n", - " \"unid\": meeting.get('unid', None)} for meeting in meetingsData]\n", - "\n", - "print (str(len(meetings)) + \" meetings found\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e802fd0", - "metadata": {}, - "outputs": [], - "source": [ - "# Here's an alternate approach, there's apparently an XML feed..\n", - "\n", - "import xml.etree.ElementTree as ET\n", - "\n", - "xmlMeetingListURL = baseURL + \"/XML-ActiveMeetings\"\n", - "xmlMeetingListData = requests.get(xmlMeetingListURL)\n", - "xmlMeetingList = ET.fromstring(xmlMeetingListData)\n", - "\n", - "# The returned XML DocumentNode is in this form:\n", - "\n", - "# \n", - "# \n", - "# Board of Trustees Regular Meeting - 7:00pm\n", - "# \n", - "# 2021-08-11\n", - "# \n", - "# Wednesday\n", - "# August 11, 2021\n", - "# \n", - "# \n", - "# Please click the video link above to access the regular board meeting EDUCATING EVERY CHILD FOR SUCCESS REDWOOD CITY SCHOOL DISTRICT BOARD OF EDUCATION REGULAR MEETING WEDNESDAY, AUGUST 11, 2021 AT 7:00pm TELECONFERENCE MEETING https://rcsdk8-net.zoom.us/s/86849531859 (to participate in the Regular Board Meeting) US : +1 669 900 6833 or +1 346 248 7799 or +1 301 715 8592 or +1 312 626 6799 or +1 929 436 2866 or +1 253 215 8782 Webinar ID: 868 4953 1859 Password: rcsdbot Backup Password: 0863523 (to listen to the Regular Board Meeting) TELECONFERENCE NOTIFICATION for the REGULAR BOARD MEETING In light of the current Public Health Emergency and consistent with the Governor’s recent order suspending some of the Brown Act’s teleconferencing requirements, the Board will be holding its August 11th regular meeting by teleconference. The Board invites the public to join the open session portion of the meeting and offer public comment via Zoom. Additionally, the meeting will be recorded and staff will be available to receive real-time comments via the links below. Comments received during the open session of the meeting will be shared publicly during the meeting: ENGLISH https://docs.google.com/forms/d/e/1FAIpQLSexN3rAtNYJrhCjKT0s9AG__Eq0-_iAUFPI6ID3Mo0Jn8yeGA/viewform?usp=sf_link SPANISH https://docs.google.com/forms/d/e/1FAIpQLScMO3Wo8kjGmJF7KNhihQqanOLfzfoyQ7IT904jU9QtFFF28Q/viewform?usp=sf_link If you require Spanish interpretation please call: 978-990-5137 and press 8377041# for the password. Si requiere interpretación al español por favor llame al: 978-990-5137 y presione 8377041# para la contraseña. If you need special assistance or a modification due to a disability (including auxiliary aids or services) to participate in this meeting, please contact Eliana García at egarcia@rcsdk8.net at least 48 hours in advance of the meeting and we will make our best efforts to accommodate.\n", - "# http://go.boarddocs.com/ca/redwood/Board.nsf/goto?open&id=C55TDQ76E688\n", - "# \n", - "# 1. 
Call to Order\n", - "# \n", - "# \n", - "# 1.1 Roll Call\n", - "# http://go.boarddocs.com/ca/redwood/Board.nsf/goto?open&id=C55TDS76E68A\n", - "# Procedural\n", - "# \n", - "# \n", - "# \n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b292ff49", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Status returned by detailed agenda fetch request: 200\n", - "Agenda Title: Board of Trustees Regular Meeting - 7:00pm (Closed Session at 6:15 PM)\n", - "Agenda Date: Wednesday, May 10, 2023\n", - "Number of Files: 33\n", - "['/ca/redwood/Board.nsf/files/CRAQFV6923F8/$file/230510%20RCSD%20%2420k%20and%20Under%20Tracker%20FY%2022-23.pdf', '/ca/redwood/Board.nsf/files/CRASSK741766/$file/230510%20RCSD%20GA%20Bid%20Package%20D%20CO%20No.%2014%20Package.pdf', '/ca/redwood/Board.nsf/files/CRATNB7827AD/$file/230510%20RCSD%20GA%20Bid%20Package%20G%20CO%20No.%2016%20Package.pdf', '/ca/redwood/Board.nsf/files/CR9SWS74B531/$file/01-118012_Invoice_01-13356_2023-04-18.pdf', '/ca/redwood/Board.nsf/files/CRFNZ4615266/$file/3250%20BP_AR%20Transportation%20Fees.pdf', '/ca/redwood/Board.nsf/files/CRFP8N62304A/$file/3540%20BP%20Transportation.pdf', '/ca/redwood/Board.nsf/files/CRFPGE63E9A7/$file/3555%20BP_E%20Nutrition%20Program%20Compliance.pdf', '/ca/redwood/Board.nsf/files/CRFPM964FB8C/$file/4030%20BP_AR%20Nondiscrimination%20in%20Employment.pdf', '/ca/redwood/Board.nsf/files/CRFPVX66768F/$file/5142%20BP_AR%20Safety.pdf', '/ca/redwood/Board.nsf/files/CRFQDT68D3B9/$file/5142.2%20BP_AR%20Safe%20Routes%20to%20School%20Program.pdf', '/ca/redwood/Board.nsf/files/CRFR8D6B7403/$file/9320%20BB%20Meetings%20and%20Notices.pdf', '/ca/redwood/Board.nsf/files/CRJPQY62B0F7/$file/Board%20Minutes%2004.19.23%20DRAFT.Regular.pdf', '/ca/redwood/Board.nsf/files/CRJPQL62A3B4/$file/Board%20Minutes%2004.26.2023%20DRAFT%20-%20CLOSED.pdf', '/ca/redwood/Board.nsf/files/CRJPRM62D8F5/$file/Board%20Minutes%204.26.23%20DRAFT%20(Study%20Session).pdf', '/ca/redwood/Board.nsf/files/CRBTS978BA27/$file/Master%20Contract%202022-2023(final).pdf', '/ca/redwood/Board.nsf/files/CRBTSB78BBDB/$file/Approved%20Rate%20Sheets%204.19.pdf', '/ca/redwood/Board.nsf/files/CRETMP6C923E/$file/UC%20REGENTS%20RCSD%20CRLP.pdf', '/ca/redwood/Board.nsf/files/CRJVHK80D60D/$file/UC%20REGENTS%20RCSD%20CRLP%20Amendment.pdf', '/ca/redwood/Board.nsf/files/CRJVGC80A7F2/$file/SMCOE%2023-24%20Teacher%20Residency%20Agreement.pdf', '/ca/redwood/Board.nsf/files/CRJV5P7F1674/$file/2023.24%20RCSD%20Outdoor%20Education.pdf', '/ca/redwood/Board.nsf/files/CRFLZV581C06/$file/Warrant%20Register%20April%202023.pdf', '/ca/redwood/Board.nsf/files/CRHVKX812F21/$file/230510%20Connect%20AB841%20Resolution%2033.pdf', '/ca/redwood/Board.nsf/files/CRHVWC82B4EB/$file/230510%20KIPP%20Excelencia%20AB841%20Resolution%2034.pdf', '/ca/redwood/Board.nsf/files/CRHVYE82FE9B/$file/230510%20Redwood%20City%20School%20District%20AB841%20Resolution%2035.pdf', '/ca/redwood/Board.nsf/files/CRHVZR833219/$file/230510%20Rocketship%20AB841%20Resolution%2036.pdf', '/ca/redwood/Board.nsf/files/CRERDF6750EE/$file/KIPP%20Excelencia%2022.23%202nd%20Interim%20Report%20Review%20Letter.pdf', '/ca/redwood/Board.nsf/files/CRERPC6862FD/$file/KIPP%20Excelencia%20%2022.23%202nd%20Interim%20Report.pdf', '/ca/redwood/Board.nsf/files/CRERMM682F52/$file/Connect%2022.23%202nd%20Interim%20Report%20Review%20Letter.pdf', '/ca/redwood/Board.nsf/files/CRERNM68494F/$file/Connect%20%2022.23%202nd%20Interim%20Report.pdf', 
'/ca/redwood/Board.nsf/files/CRERSD68BED6/$file/Rocketship%20RC%2022.23%202nd%20Interim%20Report%20Review%20Letter.pdf', '/ca/redwood/Board.nsf/files/CRERS968BC64/$file/Rocketship%20RC%2022.23%202nd%20Interim%20Report.pdf', '/ca/redwood/Board.nsf/files/CRFNG75F3C1B/$file/5131.41%20AR%20Use%20Of%20Seclusion%20And%20Restraint.pdf', '/ca/redwood/Board.nsf/files/CRHQ3P673134/$file/22-23%20RCSD%20Board%20Meeting%20Calendar.Updated%204.19.23.pdf']\n" - ] - } - ], - "source": [ - "# Ah HA! The detailes \"print\" agenda has all the info we want - and links to the PDFs!\n", - "\n", - "detailedMeetingAgendaURL = baseURL + \"/PRINT-AgendaDetailed\"\n", - "\n", - "meetingID = \"CPSNV9612DF1\"\n", - "\n", - "# set the meetingID & committee\n", - "data = \"id=\" + meetingID + \"&\" + \"current_committee_id=\" + committeeID\n", - "\n", - "# POST the request!\n", - "response = requests.post(detailedMeetingAgendaURL, headers=headers, data=data)\n", - "\n", - "print(\"Status returned by detailed agenda fetch request:\",response.status_code)\n", - "\n", - "import html2text\n", - "from bs4 import BeautifulSoup\n", - "\n", - "# parse the returned HTML\n", - "soup = BeautifulSoup(response.content, \"html.parser\")\n", - "agendaDate = soup.find(\"div\", {\"class\":\"print-meeting-date\"}).string\n", - "agendaTitle = soup.find(\"div\", {\"class\":\"print-meeting-name\"}).string\n", - "agendaFiles = [fd.a.get('href') for fd in soup.find_all(\"div\", {\"class\":\"public-file\"})]\n", - "agendaData = html2text.html2text(response.text)\n", - "print(\"Agenda Title:\", agendaTitle)\n", - "print(\"Agenda Date:\", agendaDate)\n", - "print(\"Number of Files:\",len(agendaFiles))\n", - "\n", - "print(agendaFiles)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "81571996", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPSNV9612DF1\n", - "CPNUPZ7B7D09\n", - "CQ7TPZ78313B\n", - "CR2MCR59EE37\n", - "CNUN245B80D7\n", - "CNCQ2F663B8C\n", - "CPWNM5605E00\n", - "CNCPQY64EE36\n", - "CMSTNT783963\n", - "CMSTML77B689\n", - "CN9V837F7242\n", - "CMZR4H6C2928\n", - "CMBPD95DF6DB\n", - "CKYUYU7E62A8\n", - "CLLPZT5E8971\n", - "CKJKSG533AF1\n", - "CKHSER725DEA\n", - "CK4PBG638FA6\n", - "CJYTL8775FA8\n", - "CJANRA6126F9\n", - "CK6PAK62FF2D\n", - "CK6N565C9EB6\n", - "CJ2S33686A4D\n", - "CHKLWM588244\n", - "CHEM3K58E555\n", - "CHEMVQ5D1F0F\n", - "CH4UY57E3BD1\n", - "CFLT9N7492F3\n", - "CFFTMD7567B0\n", - "CF8Q7X66C51F\n", - "CETRFZ6DD9CE\n", - "CF7TF6771C58\n", - "CEPKKH523FEC\n", - "CEBNMZ5DAC30\n", - "CDWQH3694A8D\n", - "CDARDL6D82AB\n", - "CDFKEW510C6E\n", - "CCSN6X5E7859\n", - "CCMRJT6E4626\n", - "CC5UYY7E6893\n", - "CBJQLT6911AB\n", - "CBATCX765D01\n", - "CAYM47593BD6\n", - "CAFRFB6D7A83\n", - "CABM9357C659\n", - "CACUCV7B77BB\n", - "C9BVZ5831E3D\n", - "C8SP2G6169F1\n", - "C8FTNP72595E\n", - "C8MQ92681B5B\n", - "C87LTS552926\n", - "C7XVCJ801ABC\n", - "C7KUF87BCE71\n", - "C72NJ46017D1\n", - "C75M5L592D5D\n", - "C6GTZ9796118\n", - "C6DRX2700FAB\n", - "C63URL79A65D\n", - "C66PAR62DFB1\n", - "C5LNS66103E7\n", - "C55TDQ76E688\n", - "CRN7DG191DCC\n", - "CRN63A12EF28\n", - "CRP2ZC7DEDD9\n", - "CRM2R703650F\n", - "CRM2YY0488C9\n", - "CRJ2SA01B8F1\n", - "CRLUJK7C4CE2\n", - "CRJ2QE00512B\n", - "CRH24J005DC4\n", - "CRKVVW82A567\n", - "CRFVN48180D5\n", - "CRE4XS0DBC93\n", - "CRE4S90CEC88\n", - "CRDUU67DB46C\n", - "CQNLT957DAEE\n", - "CRAUSP7B7A9A\n", - "CR8TSZ78D926\n", - "CR72JE026707\n", - "CR6U2Q79FA31\n", - "CR62XM0455DD\n", - "CQZ75B17EB8C\n", - "CQXU6T7A9410\n", - 
"CQXU4L7A403C\n", - "CQXT7R7606A0\n", - "CQWT8A761B85\n", - "CQWSTR74456C\n", - "CQWPSF66018B\n", - "CQV3X908F7FA\n", - "CQS5N81105E8\n", - "CQR34Z052019\n", - "CQQ83K1C5A77\n", - "CQQ7BN18D917\n", - "CQP87H1CEE10\n", - "CQN2Y404680E\n", - "CQL2SY03A75A\n", - "CQKVEX8074FB\n", - "CQF3F5069B40\n", - "CQD2Z9049366\n", - "CQC4LQ0C1D32\n", - "CQB3CV064707\n", - "CQB34N05137F\n", - "CQ5VS9821D50\n", - "CQ3VGR80B8D8\n", - "CQ3VQF81D881\n", - "CQ3UQE7D2740\n", - "CQ2UQE7D27BF\n", - "CPYV2A7E99F1\n", - "CPY28V010165\n", - "CPW64G131BA5\n", - "CPN4FD0B53C7\n", - "CPU8MD1EF61A\n", - "CPP6ZA1753E4\n", - "CPN4AS0AA855\n", - "CPN4790A23E1\n", - "CPTVEK806706\n", - "CPT45Y09F4C5\n", - "CPN3ZS095791\n", - "CPS2TU7C428F\n", - "CPN3UA088940\n", - "CPL7AA18A582\n", - "CPR2X2043FEA\n", - "CPK46K0A0ADB\n", - "CPH3E20672F6\n", - "CPH3AF05EB4E\n", - "CPQ3A705E24F\n", - "CPEQSE6AB2FE\n", - "CPEQKY69C163\n", - "CPEQAJ685EFF\n", - "CPEQJN698FE0\n", - "CPE8N71F1438\n", - "CPC3TR08758C\n", - "CPB4FT0B658A\n", - "CP9L5W54ED29\n", - "CP93X508F31A\n", - "CP92V603F9AE\n", - "CP5VCP802000\n", - "CP5UNX7CF030\n", - "CP44MF0C354D\n", - "CP327T00D8ED\n", - "CNXTPN785AFD\n", - "CNV5480E625A\n", - "CNTBZL155845\n", - "CNU3VM08BBD4\n", - "CNS5EE0FE08E\n", - "CNS3MB0783F9\n", - "CNHVMU81772C\n", - "CNG26A005BE8\n", - "CNE2UT03EB26\n", - "CND7B218C26C\n", - "CND6WV16F9F8\n", - "CNM3VE08B384\n", - "CNL4FD0B5575\n", - "CNC4DV0B1C65\n", - "CNC3H406E589\n", - "CNB9D822744D\n", - "CNB95X216314\n", - "CNB8US200C24\n", - "CNB94P2133E5\n", - "CNB8BG1D8279\n", - "CN77C618EBDC\n", - "CN935C052CA0\n", - "CN7788185851\n", - "CN76VE16C3B6\n", - "CN85ZP12B3BE\n", - "CN3SBF71DFBD\n", - "CNK27Z00E06F\n", - "CNJUWP7E12B6\n", - "CMX79Z189B0F\n", - "CN657P0EE500\n", - "CMX6VG16C613\n", - "CMX6SK165849\n", - "CMX6M6158D53\n", - "CNJ28K00F603\n", - "CN3SED724EF6\n", - "CMV99R21F29E\n", - "CMW3GL06D288\n", - "CMV8YD20921A\n", - "CMV8C61D9C8C\n", - "CMV6R516227D\n", - "CMSW4582A51D\n", - "CMV266009B40\n", - "CMSUXJ7E32AC\n", - "CMR42J097328\n", - "CMPURD7D4BCD\n", - "CMPSJP72F16A\n", - "CMQ7GD198AD2\n", - "CMPS5U710FE0\n", - "CMPRMB6EA3EE\n", - "CMP8H61E5797\n", - "CMP7FW1978AB\n", - "CMJ3TE0867D8\n", - "CMJ3Q607EE18\n", - "CMP6GV14ED50\n", - "CMJ3KH073FE4\n", - "CMN3K50731E5\n", - "CMJ3EL06879A\n", - "CMM8DD1DC2BC\n", - "CMM6TQ168411\n", - "CMHU9N7AFDF2\n", - "CMHNZN626280\n", - "CMH8WH204C01\n", - "CMH8U31FF10B\n", - "CMH8NV1F2E0B\n", - "CMH2VQ040EC4\n", - "CMH2PG03245D\n", - "CMH228000800\n", - "CMD25M0087B4\n", - "CMCRBN6D3703\n", - "CMB2TF03B9B2\n", - "CMC4970A6D1B\n", - "CMB2R90366BD\n", - "CMB2FN01FF45\n", - "CMAVBT7FFF73\n", - "CMAUME7793B2\n", - "CMAUQH78BB41\n", - "CMA6QH160B8E\n", - "CM965U134FE6\n", - "CMA52X0E32D0\n", - "CLX75717E76B\n", - "CM63WT08E776\n", - "CLX6J21518B0\n", - "CLV6HH1504C3\n", - "CLM8ZW20CC65\n", - "CLK99T21F46E\n", - "CLM7X31BB117\n", - "CLK8CY1DBAE9\n", - "CLH2G3020E36\n", - "CLGV447EDE4F\n", - "CLF5FQ1011E2\n", - "CLD66J13695C\n", - "CLD4LF0C1288\n", - "CLC8G51E30C7\n", - "CLC7DV192CA4\n", - "CLC6YX174706\n", - "CLB87A1CE5C8\n", - "CLB3DH0653B9\n", - "CLA4CR0AF29A\n", - "CKYW3V8385D2\n", - "CKYV9X7FB91E\n", - "CKY3R708141A\n", - "CKWS5S710CF7\n", - "CKWPMF65483E\n", - "CKW6XR171B8E\n", - "CM46K7154372\n", - "CM56F314A917\n", - "CM467H138D7F\n", - "CM34KQ0BF7D4\n", - "CM23LK076765\n", - "CM22MW02E95B\n", - "CM2242004BF0\n", - "CLZVYG830594\n", - "CLZ79R189275\n", - "CLZVVN829C4B\n", - "CLX4YY0DE92B\n", - "CLW7PD1A9067\n", - "CLW7BG18D117\n", - "CJYN775E1AED\n", - "CK5NDA5F4AFD\n", - "CLW77L1840EF\n", - 
"CKW4YZ0DE9A9\n", - "CKW63X130795\n", - "CKVRW66FEDCB\n", - "CKVRME6EA743\n", - "CKTV7K7F5FAD\n", - "CKTUCJ7B2B51\n", - "CKTSMP71C7BE\n", - "CKRUTH7D9B57\n", - "CKS2BT783AEC\n", - "CKRVS770AD19\n", - "CKRQPD6A3A65\n", - "CKRPGX649F6E\n", - "CKR672137C3D\n", - "CKRM8259E4A5\n", - "CKPVDA803654\n", - "CKP85E1C9F16\n", - "CKNRDA6D762A\n", - "None\n" - ] - } - ], - "source": [ - "# Fetch meeting agenda for each meeting\n", - "\n", - "for meeting in meetings:\n", - " print(meeting['meetingID'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4827cdf4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/nextpy/ai/rag/document_loaders/boarddocs/requirements.txt b/nextpy/ai/rag/document_loaders/boarddocs/requirements.txt deleted file mode 100644 index af9477ef..00000000 --- a/nextpy/ai/rag/document_loaders/boarddocs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -bs4 -html2text -requests \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/chatgpt_plugin/README.md b/nextpy/ai/rag/document_loaders/chatgpt_plugin/README.md deleted file mode 100644 index 1899917e..00000000 --- a/nextpy/ai/rag/document_loaders/chatgpt_plugin/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# ChatGPT Plugin Loader - -The ChatGPT Plugin loader returns a set of documents from a server that implements that. -[ChatGPT Retrieval Plugin interface](https://github.com/openai/chatgpt-retrieval-plugin). - -## Usage - -Here's an example usage of the ChatGPTRetrievalPluginReader. - -```python -from nextpy.ai import download_loader - -ChatGPTRetrievalPluginReader = download_loader("ChatGPTRetrievalPluginReader") - -bearer_token = os.getenv("BEARER_TOKEN") -reader = ChatGPTRetrievalPluginReader( - endpoint_url="http://localhost:8000", - bearer_token=bearer_token -) - -documents = reader.load_data("text query") -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/chatgpt_plugin/__init__.py b/nextpy/ai/rag/document_loaders/chatgpt_plugin/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/chatgpt_plugin/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/chatgpt_plugin/base.py b/nextpy/ai/rag/document_loaders/chatgpt_plugin/base.py deleted file mode 100644 index 34ddffd2..00000000 --- a/nextpy/ai/rag/document_loaders/chatgpt_plugin/base.py +++ /dev/null @@ -1,77 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""ChatGPT Plugin.""" - -import os -from typing import Any, List, Optional - -import requests -from requests.adapters import HTTPAdapter, Retry - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ChatGPTRetrievalPluginReader(BaseReader): - """ChatGPT Retrieval Plugin reader.""" - - def __init__( - self, - endpoint_url: str, - bearer_token: Optional[str] = None, - retries: Optional[Retry] = None, - batch_size: int = 100, - ) -> None: - """Chatgpt Retrieval Plugin.""" - self._endpoint_url = endpoint_url - self._bearer_token = bearer_token or os.getenv("BEARER_TOKEN") - self._retries = retries - self._batch_size = batch_size - - self._s = requests.Session() - self._s.mount("http://", HTTPAdapter(max_retries=self._retries)) - - def load_data( - self, - query: str, - top_k: int = 10, - separate_documents: bool = True, - **kwargs: Any, - ) -> List[DocumentNode]: - """Load data from ChatGPT Retrieval Plugin.""" - headers = {"Authorization": f"Bearer {self._bearer_token}"} - queries = [{"query": query, "top_k": top_k}] - res = requests.post( - f"{self._endpoint_url}/query", headers=headers, json={"queries": queries} - ) - - metadata = { - "endpoint_url": self._endpoint_url, - "query": query, - "tok_k": top_k, - "separate_documents": separate_documents, - } - documents: List[DocumentNode] = [] - for query_result in res.json()["results"]: - for result in query_result["results"]: - result_id = result["id"] - result_txt = result["text"] - result_embedding = result["embedding"] - doc = DocumentNode( - text=result_txt, - doc_id=result_id, - embedding=result_embedding, - extra_info=metadata, - ) - documents.append(doc) - - # NOTE: there should only be one query - break - - if not separate_documents: - text_list = [doc.get_text() for doc in documents] - text = "\n\n".join(text_list) - documents = [DocumentNode(text=text, extra_info=metadata)] - - return documents diff --git a/nextpy/ai/rag/document_loaders/chatgpt_plugin/requirements.txt b/nextpy/ai/rag/document_loaders/chatgpt_plugin/requirements.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/nextpy/ai/rag/document_loaders/chroma/README.md b/nextpy/ai/rag/document_loaders/chroma/README.md deleted file mode 100644 index 9c0c3176..00000000 --- a/nextpy/ai/rag/document_loaders/chroma/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Chroma Loader - -The Chroma Loader returns a set of texts corresponding to embeddings retrieved from a Chroma Index. -The user initializes the loader with a Chroma index. They then pass in a query vector. - -## Usage - -Here's an example usage of the ChromaReader. - -```python -from nextpy.ai import download_loader - -ChromaReader = download_loader("ChromaReader") - -# The chroma reader loads data from a persisted Chroma collection. -# This requires a collection name and a persist directory. 
-reader = ChromaReader( - collection_name="chroma_collection", - persist_directory="examples/data_connectors/chroma_collection" -) - -query_vector=[n1, n2, n3, ...] - -documents = reader.load_data(collection_name="demo", query_vector=query_vector, limit=5) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/chroma/__init__.py b/nextpy/ai/rag/document_loaders/chroma/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/chroma/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/chroma/base.py b/nextpy/ai/rag/document_loaders/chroma/base.py deleted file mode 100644 index b4174274..00000000 --- a/nextpy/ai/rag/document_loaders/chroma/base.py +++ /dev/null @@ -1,73 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Chroma Reader.""" - -from typing import Any - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ChromaReader(BaseReader): - """Chroma reader. - - Retrieve documents from existing persisted Chroma collections. - - Args: - collection_name: Name of the peristed collection. - persist_directory: Directory where the collection is persisted. - - """ - - def __init__( - self, - collection_name: str, - persist_directory: str, - ) -> None: - """Initialize with parameters.""" - import chromadb # noqa: F401 - from chromadb.config import Settings - - self.collection_name = collection_name - - if (collection_name is None) or (persist_directory is None): - raise ValueError("Please provide a collection name and persist directory.") - - self._client = chromadb.Client( - Settings(is_persistent=True, persist_directory=persist_directory) - ) - self._collection = self._client.get_collection(collection_name) - - def load_data( - self, - query_vector: Any, - limit: int = 10, - ) -> Any: - """Load data from Chroma. - - Args: - query_vector (Any): Query - limit (int): Number of results to return. - - Returns: - List[DocumentNode]: A list of documents. 
- """ - results = self._collection.query(query_embeddings=query_vector, n_results=limit) - - metadata = { - "collection_name": self.collection_name, - "query_vector": query_vector, - "limit": limit, - } - documents = [] - for result in zip(results["ids"], results["documents"], results["embeddings"]): - doc = DocumentNode( - doc_id=result[0][0], - text=result[1][0], - embedding=result[2][0], - extra_info=metadata, - ) - documents.append(doc) - - return documents diff --git a/nextpy/ai/rag/document_loaders/chroma/requirements.txt b/nextpy/ai/rag/document_loaders/chroma/requirements.txt deleted file mode 100644 index 6dee1ba4..00000000 --- a/nextpy/ai/rag/document_loaders/chroma/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -chromadb \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/confluence/README.md b/nextpy/ai/rag/document_loaders/confluence/README.md deleted file mode 100644 index e5f2ef08..00000000 --- a/nextpy/ai/rag/document_loaders/confluence/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# Confluence Loader - -This loader loads pages from a given Confluence cloud instance. The user needs to specify the base URL for a Confluence -instance to initialize the ConfluenceReader - base URL needs to end with `/wiki`. The user can optionally specify -OAuth 2.0 credentials to authenticate with the Confluence instance. If no credentials are specified, the loader will -look for `CONFLUENCE_API_TOKEN` or `CONFLUENCE_USERNAME`/`CONFLUENCE_PASSWORD` environment variables to proceed with basic authentication. - -For more on authenticating using OAuth 2.0, checkout: - -- https://atlassian-python-api.readthedocs.io/index.html -- https://developer.atlassian.com/cloud/confluence/oauth-2-3lo-apps/ - -Confluence pages are obtained through one of 4 four mutually exclusive ways: - -1. `page_ids`: Load all pages from a list of page ids -2. `space_key`: Load all pages from a space -3. `label`: Load all pages with a given label -4. `cql`: Load all pages that match a given CQL query (Confluence Query Language https://developer.atlassian.com/cloud/confluence/advanced-searching-using-cql/ ). - -When `page_ids` is specified, `include_children` will cause the loader to also load all descendent pages. -When `space_key` is specified, `page_status` further specifies the status of pages to load: None, 'current', 'archived', 'draft'. - -limit (int): Deprecated, use `max_num_results` instead. - -max_num_results (int): Maximum number of results to return. If None, return all results. Requests are made in batches to achieve the desired number of results. - -User can also specify a boolean `include_attachments` to -include attachments, this is set to `False` by default, if set to `True` all attachments will be downloaded and -ConfluenceReader will extract the text from the attachments and add it to the DocumentNode object. -Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel. - -Hint: `space_key` and `page_id` can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces//pages/ - -## Usage - -Here's an example usage of the ConfluenceReader. 
- -```python - -from llama_hub.confluence.base import ConfluenceReader - -token = { - access_token: "", - token_type: "" -} -oauth2_dict = { - "client_id": "", - "token": token -} - -base_url = "https://yoursite.atlassian.com/wiki" - -page_ids = ["", "", " None: - if base_url is None: - raise ValueError("Must provide `base_url`") - - self.base_url = base_url - - try: - from atlassian import Confluence - except ImportError: - raise ImportError( - "`atlassian` package not found, please run `pip install atlassian-python-api`" - ) - self.confluence: Confluence = None - if oauth2: - self.confluence = Confluence(url=base_url, oauth2=oauth2, cloud=cloud) - else: - api_token = os.getenv(CONFLUENCE_API_TOKEN) - if api_token is not None: - self.confluence = Confluence(url=base_url, token=api_token, cloud=cloud) - else: - user_name = os.getenv(CONFLUENCE_USERNAME) - if user_name is None: - raise ValueError( - "Must set environment variable `CONFLUENCE_USERNAME` if oauth, oauth2, or `CONFLUENCE_API_TOKEN` are not provided." - ) - password = os.getenv(CONFLUENCE_PASSWORD) - if password is None: - raise ValueError( - "Must set environment variable `CONFLUENCE_PASSWORD` if oauth, oauth2, or `CONFLUENCE_API_TOKEN` are not provided." - ) - self.confluence = Confluence( - url=base_url, username=user_name, password=password, cloud=cloud - ) - - def load_data( - self, - space_key: Optional[str] = None, - page_ids: Optional[List[str]] = None, - page_status: Optional[str] = None, - label: Optional[str] = None, - cql: Optional[str] = None, - include_attachments=False, - include_children=False, - limit: Optional[int] = None, - max_num_results: Optional[int] = None, - ) -> List[DocumentNode]: - """Load Confluence pages from Confluence, specifying by one of four mutually exclusive methods: - `space_key`, `page_ids`, `label`, or `cql` - (Confluence Query Language https://developer.atlassian.com/cloud/confluence/advanced-searching-using-cql/ ). - - Args: - space_key (str): Confluence space key, eg 'DS' - page_ids (list): List of page ids, eg ['123456', '123457'] - page_status (str): Page status, one of None (all statuses), 'current', 'draft', 'archived'. Only compatible with space_key. - label (str): Confluence label, eg 'my-label' - cql (str): Confluence Query Language query, eg 'label="my-label"' - include_attachments (bool): If True, include attachments. - include_children (bool): If True, do a DFS of the descendants of each page_id in `page_ids`. Only compatible with `page_ids`. - limit (int): Deprecated, use `max_num_results` instead. - max_num_results (int): Maximum number of results to return. If None, return all results. Requests are made in batches to achieve the desired number of results. - """ - metadata = { - "base_url": self.base_url, - "space_key": space_key, - "page_ids": page_ids, - "page_status": page_status, - "label": label, - "cql": cql, - "include_attachments": include_attachments, - "include_children": include_children, - "limit": limit, - "max_num_results": max_num_results, - } - - num_space_key_parameter = 1 if space_key else 0 - num_page_ids_parameter = 1 if page_ids is not None else 0 - num_label_parameter = 1 if label else 0 - num_cql_parameter = 1 if cql else 0 - if ( - num_space_key_parameter - + num_page_ids_parameter - + num_label_parameter - + num_cql_parameter - != 1 - ): - raise ValueError( - "Must specify exactly one among `space_key`, `page_ids`, `label`, `cql` parameters." 
- ) - - if page_status and not space_key: - raise ValueError( - "Must specify `space_key` when `page_status` is specified." - ) - - if include_children and not page_ids: - raise ValueError( - "Must specify `page_ids` when `include_children` is specified." - ) - - if limit is not None: - max_num_results = limit - logger.warning( - "`limit` is deprecated and no longer relates to the Confluence server's API limits. If " - "you wish to limit the number of returned results please use `max_num_results` instead." - ) - - try: - import html2text # type: ignore - except ImportError: - raise ImportError( - "`html2text` package not found, please run `pip install html2text`" - ) - - text_maker = html2text.HTML2Text() - text_maker.ignore_links = True - text_maker.ignore_images = True - - pages: List = [] - if space_key: - pages.extend( - self._get_data_with_paging( - self.confluence.get_all_pages_from_space, - max_num_results=max_num_results, - space=space_key, - status=page_status, - expand="body.storage.value", - content_type="page", - ) - ) - elif label: - pages.extend( - self._get_cql_data_with_paging( - cql=f'type="page" AND label="{label}"', - max_num_results=max_num_results, - expand="body.storage.value", - ) - ) - elif cql: - pages.extend( - self._get_cql_data_with_paging( - cql=cql, - max_num_results=max_num_results, - expand="body.storage.value", - ) - ) - elif page_ids: - if include_children: - dfs_page_ids = [] - max_num_remaining = max_num_results - for page_id in page_ids: - current_dfs_page_ids = self._dfs_page_ids( - page_id, max_num_remaining - ) - dfs_page_ids.extend(current_dfs_page_ids) - if max_num_results is not None: - max_num_remaining -= len(current_dfs_page_ids) - if max_num_remaining <= 0: - break - page_ids = dfs_page_ids - for page_id in ( - page_ids[:max_num_results] if max_num_results is not None else page_ids - ): - pages.append( - self._get_data_with_retry( - self.confluence.get_page_by_id, - page_id=page_id, - expand="body.storage.value", - ) - ) - - docs = [] - for page in pages: - doc = self.process_page(page, include_attachments, text_maker, metadata) - docs.append(doc) - - return docs - - def _dfs_page_ids(self, page_id, max_num_results): - ret = [page_id] - max_num_remaining = ( - (max_num_results - 1) if max_num_results is not None else None - ) - if max_num_results is not None and max_num_remaining <= 0: - return ret - - child_page_ids = self._get_data_with_paging( - self.confluence.get_child_id_list, - page_id=page_id, - type="page", - max_num_results=max_num_remaining, - ) - for child_page_id in child_page_ids: - dfs_ids = self._dfs_page_ids(child_page_id, max_num_remaining) - ret.extend(dfs_ids) - if max_num_results is not None: - max_num_remaining -= len(dfs_ids) - if max_num_remaining <= 0: - break - return ret - - def _get_data_with_paging(self, paged_function, max_num_results=50, **kwargs): - start = 0 - max_num_remaining = max_num_results - ret = [] - while True: - results = self._get_data_with_retry( - paged_function, start=start, limit=max_num_remaining, **kwargs - ) - ret.extend(results) - if ( - len(results) == 0 - or max_num_results is not None - and len(results) >= max_num_remaining - ): - break - start += len(results) - if max_num_remaining is not None: - max_num_remaining -= len(results) - return ret - - def _get_cql_data_with_paging( - self, cql, max_num_results=50, expand="body.storage.value" - ): - max_num_remaining = max_num_results - ret = [] - params = {"cql": cql, "start": 0, "expand": expand} - if max_num_results is not None: - 
params["limit"] = max_num_remaining - while True: - results = self._get_data_with_retry( - self.confluence.get, path="rest/api/content/search", params=params - ) - ret.extend(results["results"]) - - params["start"] += len(results["results"]) - - if max_num_results is not None: - params["limit"] -= len(results["results"]) - if params["limit"] <= 0: - break - - next_url = ( - results["_links"]["next"] if "next" in results["_links"] else None - ) - if not next_url: - break - cursor = next_url.split("cursor=")[1].split("&")[0] - params["cursor"] = cursor - - return ret - - @retry(stop_max_attempt_number=4, wait_fixed=4000) - def _get_data_with_retry(self, function, **kwargs): - return function(**kwargs) - - def process_page(self, page, include_attachments, text_maker, metadata): - - if include_attachments: - attachment_texts = self.process_attachment(page["id"]) - else: - attachment_texts = [] - text = text_maker.handle(page["body"]["storage"]["value"]) + "".join( - attachment_texts - ) - - metadata["title"] = page["title"] - - return DocumentNode(text=text, doc_id=page["id"], extra_info=metadata) - - def process_attachment(self, page_id): - try: - pass - except ImportError: - raise ImportError( - "`pytesseract` or `pdf2image` or `Pillow` package not found, please run `pip install " - "pytesseract pdf2image Pillow`" - ) - - # depending on setup you may also need to set the correct path for poppler and tesseract - attachments = self.confluence.get_attachments_from_content(page_id)["results"] - texts = [] - for attachment in attachments: - media_type = attachment["metadata"]["mediaType"] - absolute_url = self.base_url + attachment["_links"]["download"] - title = attachment["title"] - if media_type == "application/pdf": - text = title + self.process_pdf(absolute_url) - elif ( - media_type == "image/png" - or media_type == "image/jpg" - or media_type == "image/jpeg" - ): - text = title + self.process_image(absolute_url) - elif ( - media_type - == "application/vnd.openxmlformats-officedocument.wordprocessingml.DocumentNode" - ): - text = title + self.process_doc(absolute_url) - elif media_type == "application/vnd.ms-excel": - text = title + self.process_xls(absolute_url) - elif media_type == "image/svg+xml": - text = title + self.process_svg(absolute_url) - else: - continue - texts.append(text) - - return texts - - def process_pdf(self, link): - try: - import pytesseract # type: ignore - from pdf2image import convert_from_bytes # type: ignore - except ImportError: - raise ImportError( - "`pytesseract` or `pdf2image` package not found, please run `pip install pytesseract pdf2image`" - ) - - import pytesseract # type: ignore - from pdf2image import convert_from_bytes # type: ignore - - response = self.confluence.request(path=link, absolute=True) - text = "" - - if ( - response.status_code != 200 - or response.content == b"" - or response.content is None - ): - return text - try: - images = convert_from_bytes(response.content) - except ValueError: - return text - - for i, image in enumerate(images): - image_text = pytesseract.image_to_string(image) - text += f"Page {i + 1}:\n{image_text}\n\n" - - return text - - def process_image(self, link): - try: - from io import BytesIO # type: ignore - - import pytesseract # type: ignore - from PIL import Image # type: ignore - except ImportError: - raise ImportError( - "`pytesseract` or `Pillow` package not found, please run `pip install pytesseract Pillow`" - ) - - response = self.confluence.request(path=link, absolute=True) - text = "" - - if ( - 
response.status_code != 200 - or response.content == b"" - or response.content is None - ): - return text - try: - image = Image.open(BytesIO(response.content)) - except OSError: - return text - - return pytesseract.image_to_string(image) - - def process_doc(self, link): - try: - from io import BytesIO # type: ignore - - import docx2txt # type: ignore - except ImportError: - raise ImportError( - "`docx2txt` package not found, please run `pip install docx2txt`" - ) - - response = self.confluence.request(path=link, absolute=True) - text = "" - - if ( - response.status_code != 200 - or response.content == b"" - or response.content is None - ): - return text - file_data = BytesIO(response.content) - - return docx2txt.process(file_data) - - def process_xls(self, link): - try: - import xlrd # type: ignore - except ImportError: - raise ImportError("`xlrd` package not found, please run `pip install xlrd`") - - response = self.confluence.request(path=link, absolute=True) - text = "" - - if ( - response.status_code != 200 - or response.content == b"" - or response.content is None - ): - return text - - workbook = xlrd.open_workbook(file_contents=response.content) - for sheet in workbook.sheets(): - text += f"{sheet.name}:\n" - for row in range(sheet.nrows): - for col in range(sheet.ncols): - text += f"{sheet.cell_value(row, col)}\t" - text += "\n" - text += "\n" - - return text - - def process_svg(self, link): - try: - from io import BytesIO # type: ignore - - import pytesseract # type: ignore - from PIL import Image # type: ignore - from reportlab.graphics import renderPM # type: ignore - from svglib.svglib import svg2rlg # type: ignore - except ImportError: - raise ImportError( - "`pytesseract`, `Pillow`, or `svglib` package not found, please run `pip install pytesseract Pillow svglib`" - ) - - response = self.confluence.request(path=link, absolute=True) - text = "" - - if ( - response.status_code != 200 - or response.content == b"" - or response.content is None - ): - return text - - drawing = svg2rlg(BytesIO(response.content)) - - img_data = BytesIO() - renderPM.drawToFile(drawing, img_data, fmt="PNG") - img_data.seek(0) - image = Image.open(img_data) - - return pytesseract.image_to_string(image) - - -if __name__ == "__main__": - reader = ConfluenceReader() diff --git a/nextpy/ai/rag/document_loaders/confluence/requirements.txt b/nextpy/ai/rag/document_loaders/confluence/requirements.txt deleted file mode 100644 index 4996a3e3..00000000 --- a/nextpy/ai/rag/document_loaders/confluence/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -atlassian-python-api -html2text -pytesseract -pdf2image -Pillow -docx2txt -xlrd -svglib -retrying \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/couchdb/README.md b/nextpy/ai/rag/document_loaders/couchdb/README.md deleted file mode 100644 index 27647045..00000000 --- a/nextpy/ai/rag/document_loaders/couchdb/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# CouchDB Loader - -This loader loads documents from CouchDB. The loader currently supports CouchDB 3.x -using the CouchDB3 python wrapper from https://github.com/n-vlahovic/couchdb3 -The user specifies a CouchDB instance to initialize the reader. They then specify -the database name and query params to fetch the relevant docs. - -## Usage - -Here's an example usage of the SimpleCouchDBReader. 
- -```python -from nextpy.ai import download_loader -import os - -SimpleCouchDBReader = download_loader('SimpleCouchDBReader') - -host = "" -port = "" -db_name = "" -# query is passed into db.find() -query_str = "{ couchdb_find_sytax_json }" -reader = SimpleCouchDBReader(host, port) -documents = reader.load_data(db_name, query=query_str) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/couchdb/__init__.py b/nextpy/ai/rag/document_loaders/couchdb/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/couchdb/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/couchdb/base.py b/nextpy/ai/rag/document_loaders/couchdb/base.py deleted file mode 100644 index 4ec907df..00000000 --- a/nextpy/ai/rag/document_loaders/couchdb/base.py +++ /dev/null @@ -1,100 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""CouchDB client.""" - -import json -import logging -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SimpleCouchDBReader(BaseReader): - """Simple CouchDB reader. - - Concatenates each CouchDB doc into DocumentNode used by LlamaIndex. - - Args: - couchdb_url (str): CouchDB Full URL. - max_docs (int): Maximum number of documents to load. - - """ - - def __init__( - self, - user: str, - pwd: str, - host: str, - port: int, - couchdb_url: Optional[Dict] = None, - max_docs: int = 1000, - ) -> None: - """Initialize with parameters.""" - self.user = user - - import couchdb3 - - if couchdb_url is not None: - self.client: CouchDBClient = couchdb3.Server(couchdb_url) - else: - self.client: CouchDBClient = couchdb3.Server( - f"http://{user}:{pwd}@{host}:{port}" - ) - self.max_docs = max_docs - - def load_data( - self, db_name: str, query: Optional[str] = None - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - db_name (str): name of the database. - query (Optional[str]): query to filter documents. - Defaults to None - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - metadata = {"user": self.user, "db_name": db_name, "query": query} - - documents = [] - db = self.client.get(db_name) - if query is None: - # if no query is specified, return all docs in database - logging.debug("showing all docs") - results = db.view("_all_docs", include_docs=True) - else: - logging.debug("executing query") - results = db.find(query) - - if type(results) is not dict: - logging.debug(results.rows) - else: - logging.debug(results) - - # check if more than one result - if type(results) is not dict and results.rows is not None: - for row in results.rows: - # check that the id field exists - if "id" not in row: - raise ValueError("`id` field not found in CouchDB DocumentNode.") - documents.append( - DocumentNode(text=json.dumps(row.doc), extra_info=metadata) - ) - else: - # only one result - if results.get("docs") is not None: - for item in results.get("docs"): - # check that the _id field exists - if "_id" not in item: - raise ValueError( - "`_id` field not found in CouchDB DocumentNode." - ) - documents.append( - DocumentNode(text=json.dumps(item), extra_info=metadata) - ) - - return documents diff --git a/nextpy/ai/rag/document_loaders/couchdb/requirements.txt b/nextpy/ai/rag/document_loaders/couchdb/requirements.txt deleted file mode 100644 index a9f1fb1f..00000000 --- a/nextpy/ai/rag/document_loaders/couchdb/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -couchdb3 diff --git a/nextpy/ai/rag/document_loaders/dad_jokes/README.md b/nextpy/ai/rag/document_loaders/dad_jokes/README.md deleted file mode 100644 index 267b672a..00000000 --- a/nextpy/ai/rag/document_loaders/dad_jokes/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# DadJoke Loader - -This loader fetches a joke from icanhazdadjoke. - -## Usage - -To use this loader, load it. - -```python -from nextpy.ai import download_loader - -DadJokesReader = download_loader("DadJokesReader") - -loader = DadJokesReader() -documents = loader.load_data() -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/dad_jokes/__init__.py b/nextpy/ai/rag/document_loaders/dad_jokes/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/dad_jokes/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/dad_jokes/base.py b/nextpy/ai/rag/document_loaders/dad_jokes/base.py deleted file mode 100644 index 3aff9e68..00000000 --- a/nextpy/ai/rag/document_loaders/dad_jokes/base.py +++ /dev/null @@ -1,36 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""dad_jokes reader.""" - -from typing import List - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class DadJokesReader(BaseReader): - """Dad jokes reader. - - Reads a random dad joke. - - """ - - def _get_random_dad_joke(self): - response = requests.get( - "https://icanhazdadjoke.com/", headers={"Accept": "application/json"} - ) - response.raise_for_status() - json_data = response.json() - return json_data["joke"] - - def load_data(self) -> List[DocumentNode]: - """Return a random dad joke. - - Args: - None. - - """ - return [DocumentNode(text=self._get_random_dad_joke())] diff --git a/nextpy/ai/rag/document_loaders/database/README.md b/nextpy/ai/rag/document_loaders/database/README.md deleted file mode 100644 index ca8dbb2d..00000000 --- a/nextpy/ai/rag/document_loaders/database/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Database Loader - -This loader connects to a database (using SQLAlchemy under the hood). The user specifies a query and extracts DocumentNode objects corresponding to the results. For instance, you can use this loader to easily connect to a database on AWS, Snowflake, etc. and pass the documents into a `GPTSQLStructStoreIndex` from LlamaIndex. - -## Usage - -Here's an example usage of the DatabaseReader. - -```python -from nextpy.ai import download_loader - -DatabaseReader = download_loader('DatabaseReader') - -reader = DatabaseReader( - scheme = "postgresql", # Database Scheme - host = "localhost", # Database Host - port = "5432", # Database Port - user = "postgres", # Database User - password = "FakeExamplePassword", # Database Password - dbname = "postgres", # Database Name -) - -query = f""" -SELECT - CONCAT(name, ' is ', age, ' years old.') AS text -FROM public.users -WHERE age >= 18 -""" - -documents = reader.load_data(query=query) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/database/__init__.py b/nextpy/ai/rag/document_loaders/database/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/database/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/database/base.py b/nextpy/ai/rag/document_loaders/database/base.py deleted file mode 100644 index 2276f963..00000000 --- a/nextpy/ai/rag/document_loaders/database/base.py +++ /dev/null @@ -1,102 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Database Reader.""" - -from typing import Any, List, Optional - -from sqlalchemy import text -from sqlalchemy.engine import Engine - -from nextpy.ai.langchain_helpers.sql_wrapper import SQLDatabase -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class DatabaseReader(BaseReader): - """Simple Database reader. - - Concatenates each row into DocumentNode used by LlamaIndex. - - Args: - sql_database (Optional[SQLDatabase]): SQL database to use, - including table names to specify. - See :ref:`Ref-Struct-Store` for more details. - - OR - - engine (Optional[Engine]): SQLAlchemy Engine object of the database connection. - - OR - - uri (Optional[str]): uri of the database connection. - - OR - - scheme (Optional[str]): scheme of the database connection. - host (Optional[str]): host of the database connection. - port (Optional[int]): port of the database connection. - user (Optional[str]): user of the database connection. - password (Optional[str]): password of the database connection. - dbname (Optional[str]): dbname of the database connection. - - Returns: - DatabaseReader: A DatabaseReader object. - """ - - def __init__( - self, - sql_database: Optional[SQLDatabase] = None, - engine: Optional[Engine] = None, - uri: Optional[str] = None, - scheme: Optional[str] = None, - host: Optional[str] = None, - port: Optional[str] = None, - user: Optional[str] = None, - password: Optional[str] = None, - dbname: Optional[str] = None, - *args: Optional[Any], - **kwargs: Optional[Any], - ) -> None: - """Initialize with parameters.""" - if sql_database: - self.sql_database = sql_database - elif engine: - self.sql_database = SQLDatabase(engine, *args, **kwargs) - elif uri: - self.uri = uri - self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs) - elif scheme and host and port and user and password and dbname: - uri = f"{scheme}://{user}:{password}@{host}:{port}/{dbname}" - self.uri = uri - self.sql_database = SQLDatabase.from_uri(uri, *args, **kwargs) - else: - raise ValueError( - "You must provide either a SQLDatabase, " - "a SQL Alchemy Engine, a valid connection URI, or a valid " - "set of credentials." - ) - - def load_data(self, query: str) -> List[DocumentNode]: - """Query and load data from the Database, returning a list of Documents. - - Args: - query (str): Query parameter to filter tables and rows. - - Returns: - List[DocumentNode]: A list of DocumentNode objects. - """ - metadata = {"sql_database": self.sql_database, "uri": self.uri, "query": query} - - documents = [] - with self.sql_database.engine.connect() as connection: - if query is None: - raise ValueError("A query parameter is necessary to filter the data") - else: - result = connection.execute(text(query)) - - for item in result.fetchall(): - # fetch each item - doc_str = ", ".join([str(entry) for entry in item]) - documents.append(DocumentNode(text=doc_str, extra_info=metadata)) - return documents diff --git a/nextpy/ai/rag/document_loaders/deeplake/README.md b/nextpy/ai/rag/document_loaders/deeplake/README.md deleted file mode 100644 index 48268f9f..00000000 --- a/nextpy/ai/rag/document_loaders/deeplake/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# DeepLake Reader - -The DeepLake loader returns a set of texts corresponding to embeddings retrieved from a DeepLake vector store. -The user initializes the loader with an auth token. They then pass in a query vector. - -## Usage - -Here's an example usage of the DeepLake reader. 
- -```python -from nextpy.ai import download_loader -import os - -DeepLakeReader = download_loader("DeepLakeReader") - -reader = DeepLakeReader(token="") -# the query_vector is an embedding representation of your query -# Example query vector: -# query_vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] - -query_vector=[n1, n2, n3, ...] - -# NOTE: Required args are query_vector, dataset_path. -documents = reader.load_data( - query_vector=query_vector, - dataset_path="", - limit=5 -) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/deeplake/__init__.py b/nextpy/ai/rag/document_loaders/deeplake/__init__.py deleted file mode 100644 index 1c233aca..00000000 --- a/nextpy/ai/rag/document_loaders/deeplake/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init params.""" diff --git a/nextpy/ai/rag/document_loaders/deeplake/base.py b/nextpy/ai/rag/document_loaders/deeplake/base.py deleted file mode 100644 index 6013a5a0..00000000 --- a/nextpy/ai/rag/document_loaders/deeplake/base.py +++ /dev/null @@ -1,126 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""DeepLake reader.""" -from typing import List, Optional, Union - -import numpy as np - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -distance_metric_map = { - "l2": lambda a, b: np.linalg.norm(a - b, axis=1, ord=2), - "l1": lambda a, b: np.linalg.norm(a - b, axis=1, ord=1), - "max": lambda a, b: np.linalg.norm(a - b, axis=1, ord=np.inf), - "cos": lambda a, b: np.dot(a, b.T) - / (np.linalg.norm(a) * np.linalg.norm(b, axis=1)), - "dot": lambda a, b: np.dot(a, b.T), -} - - -def vector_search( - query_vector: Union[List, np.ndarray], - data_vectors: np.ndarray, - distance_metric: str = "l2", - limit: Optional[int] = 4, -) -> List: - """Naive search for nearest neighbors - args: - query_vector: Union[List, np.ndarray] - data_vectors: np.ndarray - limit (int): number of nearest neighbors - distance_metric: distance function 'L2' for Euclidean, 'L1' for Nuclear, 'Max' - l-infinity distance, 'cos' for cosine similarity, 'dot' for dot product - returns: - nearest_indices: List, indices of nearest neighbors.
- """ - # Calculate the distance between the query_vector and all data_vectors - if isinstance(query_vector, list): - query_vector = np.array(query_vector) - query_vector = query_vector.reshape(1, -1) - - distances = distance_metric_map[distance_metric](query_vector, data_vectors) - nearest_indices = np.argsort(distances) - - nearest_indices = ( - nearest_indices[::-1][:limit] - if distance_metric in ["cos"] - else nearest_indices[:limit] - ) - - return nearest_indices.tolist() - - -class DeepLakeReader(BaseReader): - """DeepLake reader. - - Retrieve documents from existing DeepLake datasets. - - Args: - token: Optional token to authenticate with DeepLake. - """ - - def __init__( - self, - token: Optional[str] = None, - ): - """Initialize the DeepLake reader.""" - import_err_msg = ( - "`deeplake` package not found, please run `pip install deeplake`" - ) - try: - import deeplake # noqa: F401 - except ImportError: - raise ImportError(import_err_msg) - self.token = token - - def load_data( - self, - query_vector: List[float], - dataset_path: str, - limit: int = 4, - distance_metric: str = "l2", - ) -> List[DocumentNode]: - """Load data from DeepLake. - - Args: - dataset_path (str): Path to the DeepLake dataset. - query_vector (List[float]): Query vector. - limit (int): Number of results to return. - distance_metric (str): Distance metric to use. Defaults to "l2". - - Returns: - List[DocumentNode]: A list of documents. - """ - import deeplake - from deeplake.util.exceptions import TensorDoesNotExistError - - dataset = deeplake.load(dataset_path, token=self.token) - - try: - embeddings = dataset.embedding.numpy(fetch_chunks=True) - except Exception: - raise TensorDoesNotExistError("embedding") - - indices = vector_search( - query_vector, embeddings, distance_metric=distance_metric, limit=limit - ) - - metadata = { - "query_vector": query_vector, - "dataset_path": dataset_path, - "limit": limit, - "distance_metric": distance_metric, - } - - documents = [] - for idx in indices: - doc = DocumentNode( - doc_id=dataset[idx].ids.numpy().tolist()[0], - text=str(dataset[idx].text.numpy().tolist()[0]), - extra_info=metadata, - ) - - documents.append(doc) - - return documents diff --git a/nextpy/ai/rag/document_loaders/deeplake/requirements.txt b/nextpy/ai/rag/document_loaders/deeplake/requirements.txt deleted file mode 100644 index bd1ea014..00000000 --- a/nextpy/ai/rag/document_loaders/deeplake/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -deeplake \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/discord/README.md b/nextpy/ai/rag/document_loaders/discord/README.md deleted file mode 100644 index b8076249..00000000 --- a/nextpy/ai/rag/document_loaders/discord/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Discord Loader - -This loader loads conversations from Discord. The user specifies `channel_ids` and we fetch conversations from -those `channel_ids`. - -## Usage - -Here's an example usage of the DiscordReader. - -```python -from nextpy.ai import download_loader -import os - -DiscordReader = download_loader('DiscordReader') - -discord_token = os.getenv("DISCORD_TOKEN") -channel_ids = [1057178784895348746] # Replace with your channel_id -reader = DiscordReader(discord_token=discord_token) -documents = reader.load_data(channel_ids=channel_ids) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/discord/__init__.py b/nextpy/ai/rag/document_loaders/discord/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/discord/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/discord/base.py b/nextpy/ai/rag/document_loaders/discord/base.py deleted file mode 100644 index 62f7336b..00000000 --- a/nextpy/ai/rag/document_loaders/discord/base.py +++ /dev/null @@ -1,144 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Discord reader.""" - -import asyncio -import logging -import os -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -async def read_channel( - discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool -) -> str: - """Async read channel. - - Note: This is our hack to create a synchronous interface to the - async discord.py API. We use the `asyncio` module to run - this function with `asyncio.get_event_loop().run_until_complete`. - - """ - import discord # noqa: F401 - - messages: List[discord.Message] = [] - - class CustomClient(discord.Client): - async def on_ready(self) -> None: - try: - print(f"{self.user} has connected to Discord!") - channel = client.get_channel(channel_id) - # only work for text channels for now - if not isinstance(channel, discord.TextChannel): - raise ValueError( - f"Channel {channel_id} is not a text channel. " - "Only text channels are supported for now." - ) - # thread_dict maps thread_id to thread - thread_dict = {} - for thread in channel.threads: - thread_dict[thread.id] = thread - - async for msg in channel.history( - limit=limit, oldest_first=oldest_first - ): - messages.append(msg) - if msg.id in thread_dict: - thread = thread_dict[msg.id] - async for thread_msg in thread.history( - limit=limit, oldest_first=oldest_first - ): - messages.append(thread_msg) - except Exception as e: - print("Encountered error: " + str(e)) - finally: - await self.close() - - intents = discord.Intents.default() - intents.message_content = True - client = CustomClient(intents=intents) - await client.start(discord_token) - - msg_txt_list = [m.content for m in messages] - - return "\n\n".join(msg_txt_list) - - -class DiscordReader(BaseReader): - """Discord reader. - - Reads conversations from channels. - - Args: - discord_token (Optional[str]): Discord token. If not provided, we - assume the environment variable `DISCORD_TOKEN` is set. - - """ - - def __init__(self, discord_token: Optional[str] = None) -> None: - """Initialize with parameters.""" - if discord_token is None: - discord_token = os.environ["DISCORD_TOKEN"] - if discord_token is None: - raise ValueError( - "Must specify `discord_token` or set environment " - "variable `DISCORD_TOKEN`." 
- ) - - self.discord_token = discord_token - - def _read_channel( - self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True - ) -> str: - """Read channel.""" - result = asyncio.get_event_loop().run_until_complete( - read_channel( - self.discord_token, channel_id, limit=limit, oldest_first=oldest_first - ) - ) - return result - - def load_data( - self, - channel_ids: List[int], - limit: Optional[int] = None, - oldest_first: bool = True, - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - channel_ids (List[int]): List of channel ids to read. - limit (Optional[int]): Maximum number of messages to read. - oldest_first (bool): Whether to read oldest messages first. - Defaults to `True`. - - Returns: - List[DocumentNode]: List of documents. - - """ - metadata = {"channel": channel_id, "limit": limit, "oldest_first": oldest_first} - - results: List[DocumentNode] = [] - for channel_id in channel_ids: - if not isinstance(channel_id, int): - raise ValueError( - f"Channel id {channel_id} must be an integer, " - f"not {type(channel_id)}." - ) - channel_content = self._read_channel( - channel_id, limit=limit, oldest_first=oldest_first - ) - results.append(DocumentNode(text=channel_content, extra_info=metadata)) - return results - - -if __name__ == "__main__": - reader = DiscordReader() - print("initialized reader") - output = reader.load_data(channel_ids=[1057178784895348746], limit=10) - print(output) diff --git a/nextpy/ai/rag/document_loaders/discord/requirements.txt b/nextpy/ai/rag/document_loaders/discord/requirements.txt deleted file mode 100644 index 503dba90..00000000 --- a/nextpy/ai/rag/document_loaders/discord/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -discord.py \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/docugami/README.md b/nextpy/ai/rag/document_loaders/docugami/README.md deleted file mode 100644 index 2a1b637f..00000000 --- a/nextpy/ai/rag/document_loaders/docugami/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Docugami Loader - -This loader takes in IDs of PDF, DOCX or DOC files processed by [Docugami](https://docugami.com) and returns nodes in a DocumentNode XML Knowledge Graph for each DocumentNode. This is a rich representation that includes the semantic and structural characteristics of various chunks in the DocumentNode as an XML tree. Entire sets of documents are processed, resulting in forests of XML semantic trees. - -## Pre-requisites - -1. Create a Docugami workspace: [http://www.docugami.com](http://www.docugami.com) (free trials available) -2. Add your documents (PDF, DOCX or DOC) and allow Docugami to ingest and cluster them into sets of similar documents, e.g. NDAs, Lease Agreements, and Service Agreements. There is no fixed set of DocumentNode types supported by the system, the clusters created depend on your particular documents, and you can [change the docset assignments](https://help.docugami.com/home/working-with-the-doc-sets-view) later. -3. Create an access token via the Developer Playground for your workspace. Detailed instructions: [https://help.docugami.com/home/docugami-api](https://help.docugami.com/home/docugami-api) -4. Explore the Docugami API at [https://api-docs.docugami.com](https://api-docs.docugami.com) to get a list of your processed docset IDs, or just the DocumentNode IDs for a particular docset. 
- -## Usage - -To use this loader, you simply need to pass in a Docugami Doc Set ID, and optionally an array of DocumentNode IDs (by default, all documents in the Doc Set are loaded). - -```python -from nextpy.ai import download_loader - -DocugamiReader = download_loader('DocugamiReader') - -docset_id="ecxqpipcoe2p" -document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"] - -loader = DocugamiReader() -documents = loader.load_data(docset_id=docset_id, document_ids=document_ids) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - -See more information about how to use Docugami with LangChain in the [LangChain docs](https://python.langchain.com/docs/ecosystem/integrations/docugami). - -# Advantages vs Other Chunking Techniques - -Appropriate chunking of your documents is critical for retrieval from documents. Many chunking techniques exist, including simple ones that rely on whitespace and recursive chunk splitting based on character length. Docugami offers a different approach: - -1. **Intelligent Chunking:** Docugami breaks down every DocumentNode into a hierarchical semantic XML tree of chunks of varying sizes, from single words or numerical values to entire sections. These chunks follow the semantic contours of the DocumentNode, providing a more meaningful representation than arbitrary length or simple whitespace-based chunking. -2. **Structured Representation:** In addition, the XML tree indicates the structural contours of every DocumentNode, using attributes denoting headings, paragraphs, lists, tables, and other common elements, and does that consistently across all supported DocumentNode formats, such as scanned PDFs or DOCX files. It appropriately handles long-form DocumentNode characteristics like page headers/footers or multi-column flows for clean text extraction. -3. **Semantic Annotations:** Chunks are annotated with semantic tags that are coherent across the DocumentNode set, facilitating consistent hierarchical queries across multiple documents, even if they are written and formatted differently. For example, in set of lease agreements, you can easily identify key provisions like the Landlord, Tenant, or Renewal Date, as well as more complex information such as the wording of any sub-lease provision or whether a specific jurisdiction has an exception section within a Termination Clause. -4. **Additional Metadata:** Chunks are also annotated with additional metadata, if a user has been using Docugami. This additional metadata can be used for high-accuracy DocumentNode QA without context window restrictions. See detailed code walk-through in [this notebook](https://github.com/docugami/llama-hub/blob/main/llama_hub/docugami/docugami.ipynb). diff --git a/nextpy/ai/rag/document_loaders/docugami/__init__.py b/nextpy/ai/rag/document_loaders/docugami/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/docugami/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- diff --git a/nextpy/ai/rag/document_loaders/docugami/base.py b/nextpy/ai/rag/document_loaders/docugami/base.py deleted file mode 100644 index bf808f76..00000000 --- a/nextpy/ai/rag/document_loaders/docugami/base.py +++ /dev/null @@ -1,344 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Docugami reader.""" - -import io -import os -import re -from typing import Any, Dict, List, Mapping, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -TD_NAME = "{http://www.w3.org/1999/xhtml}td" -TABLE_NAME = "{http://www.w3.org/1999/xhtml}table" - -XPATH_KEY = "xpath" -DOCUMENT_ID_KEY = "id" -DOCUMENT_NAME_KEY = "name" -STRUCTURE_KEY = "structure" -TAG_KEY = "tag" -PROJECTS_KEY = "projects" - -DEFAULT_API_ENDPOINT = "https://api.docugami.com/v1preview1" - - -class DocugamiReader(BaseReader): - """Docugami reader. - - Reads Documents as nodes in a DocumentNode XML Knowledge Graph, from Docugami. - - """ - - api: str = DEFAULT_API_ENDPOINT - access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY") - min_chunk_size: int = 32 # appended to next chunk to avoid over-chunking - - def _parse_dgml( - self, - DocumentNode: Mapping, - content: bytes, - doc_metadata: Optional[Mapping] = None, - ) -> List[DocumentNode]: - """Parse a single DGML DocumentNode into a list of Documents.""" - try: - from lxml import etree - except ImportError: - raise ValueError( - "Could not import lxml python package. " - "Please install it with `pip install lxml`." 
- ) - - # helpers - def _xpath_qname_for_chunk(chunk: Any) -> str: - """Get the xpath qname for a chunk.""" - qname = f"{chunk.prefix}:{chunk.tag.split('}')[-1]}" - - parent = chunk.getparent() - if parent is not None: - doppelgangers = [x for x in parent if x.tag == chunk.tag] - if len(doppelgangers) > 1: - idx_of_self = doppelgangers.index(chunk) - qname = f"{qname}[{idx_of_self + 1}]" - - return qname - - def _xpath_for_chunk(chunk: Any) -> str: - """Get the xpath for a chunk.""" - ancestor_chain = chunk.xpath("ancestor-or-self::*") - return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain) - - def _structure_value(node: Any) -> Optional[str]: - """Get the structure value for a node.""" - structure = ( - "table" - if node.tag == TABLE_NAME - else node.attrib["structure"] - if "structure" in node.attrib - else None - ) - return structure - - def _is_structural(node: Any) -> bool: - """Check if a node is structural.""" - return _structure_value(node) is not None - - def _is_heading(node: Any) -> bool: - """Check if a node is a heading.""" - structure = _structure_value(node) - return structure is not None and structure.lower().startswith("h") - - def _get_text(node: Any) -> str: - """Get the text of a node.""" - return " ".join(node.itertext()).strip() - - def _has_structural_descendant(node: Any) -> bool: - """Check if a node has a structural descendant.""" - for child in node: - if _is_structural(child) or _has_structural_descendant(child): - return True - return False - - def _leaf_structural_nodes(node: Any) -> List: - """Get the leaf structural nodes of a node.""" - if _is_structural(node) and not _has_structural_descendant(node): - return [node] - else: - leaf_nodes = [] - for child in node: - leaf_nodes.extend(_leaf_structural_nodes(child)) - return leaf_nodes - - def _create_doc(node: Any, text: str) -> DocumentNode: - """Create a DocumentNode from a node and text.""" - metadata = { - XPATH_KEY: _xpath_for_chunk(node), - DOCUMENT_ID_KEY: DocumentNode["id"], - DOCUMENT_NAME_KEY: DocumentNode["name"], - STRUCTURE_KEY: node.attrib.get("structure", ""), - TAG_KEY: re.sub(r"\{.*\}", "", node.tag), - } - - if doc_metadata: - metadata.update(doc_metadata) - - return DocumentNode( - text=text, - metadata=metadata, - excluded_llm_metadata_keys=[XPATH_KEY, DOCUMENT_ID_KEY, STRUCTURE_KEY], - ) - - # parse the tree and return chunks - tree = etree.parse(io.BytesIO(content)) - root = tree.getroot() - - chunks: List[DocumentNode] = [] - prev_small_chunk_text = None - for node in _leaf_structural_nodes(root): - text = _get_text(node) - if prev_small_chunk_text: - text = prev_small_chunk_text + " " + text - prev_small_chunk_text = None - - if _is_heading(node) or len(text) < self.min_chunk_size: - # Save headings or other small chunks to be appended to the next chunk - prev_small_chunk_text = text - else: - chunks.append(_create_doc(node, text)) - - if prev_small_chunk_text and len(chunks) > 0: - # small chunk at the end left over, just append to last chunk - if not chunks[-1].text: - chunks[-1].text = prev_small_chunk_text - else: - chunks[-1].text += " " + prev_small_chunk_text - - return chunks - - def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]: - """Gets all DocumentNode details for the given docset ID.""" - url = f"{self.api}/docsets/{docset_id}/documents" - all_documents = [] - - while url: - response = requests.get( - url, - headers={"Authorization": f"Bearer {self.access_token}"}, - ) - if response.ok: - data = response.json() - 
all_documents.extend(data["documents"]) - url = data.get("next", None) - else: - raise Exception( - f"Failed to download {url} (status: {response.status_code})" - ) - - return all_documents - - def _project_details_for_docset_id(self, docset_id: str) -> List[Dict]: - """Gets all project details for the given docset ID.""" - url = f"{self.api}/projects?docset.id={docset_id}" - all_projects = [] - - while url: - response = requests.request( - "GET", - url, - headers={"Authorization": f"Bearer {self.access_token}"}, - data={}, - ) - if response.ok: - data = response.json() - all_projects.extend(data["projects"]) - url = data.get("next", None) - else: - raise Exception( - f"Failed to download {url} (status: {response.status_code})" - ) - - return all_projects - - def _metadata_for_project(self, project: Dict) -> Dict: - """Gets project metadata for all files.""" - project_id = project.get("id") - - url = f"{self.api}/projects/{project_id}/artifacts/latest" - all_artifacts = [] - - while url: - response = requests.request( - "GET", - url, - headers={"Authorization": f"Bearer {self.access_token}"}, - data={}, - ) - if response.ok: - data = response.json() - all_artifacts.extend(data["artifacts"]) - url = data.get("next", None) - else: - raise Exception( - f"Failed to download {url} (status: {response.status_code})" - ) - - per_file_metadata = {} - for artifact in all_artifacts: - artifact_name = artifact.get("name") - artifact_url = artifact.get("url") - artifact_doc = artifact.get("DocumentNode") - - if artifact_name == "report-values.xml" and artifact_url and artifact_doc: - doc_id = artifact_doc["id"] - metadata: Dict = {} - - # the evaluated XML for each DocumentNode is named after the project - response = requests.request( - "GET", - f"{artifact_url}/content", - headers={"Authorization": f"Bearer {self.access_token}"}, - data={}, - ) - - if response.ok: - try: - from lxml import etree - except ImportError: - raise ValueError( - "Could not import lxml python package. " - "Please install it with `pip install lxml`." - ) - artifact_tree = etree.parse(io.BytesIO(response.content)) - artifact_root = artifact_tree.getroot() - ns = artifact_root.nsmap - entries = artifact_root.xpath("//pr:Entry", namespaces=ns) - for entry in entries: - heading = entry.xpath("./pr:Heading", namespaces=ns)[0].text - value = " ".join( - entry.xpath("./pr:Value", namespaces=ns)[0].itertext() - ).strip() - metadata[heading] = value - per_file_metadata[doc_id] = metadata - else: - raise Exception( - f"Failed to download {artifact_url}/content " - + "(status: {response.status_code})" - ) - - return per_file_metadata - - def _load_chunks_for_document( - self, docset_id: str, DocumentNode: Dict, doc_metadata: Optional[Dict] = None - ) -> List[DocumentNode]: - """Load chunks for a DocumentNode.""" - document_id = DocumentNode["id"] - url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml" - - response = requests.request( - "GET", - url, - headers={"Authorization": f"Bearer {self.access_token}"}, - data={}, - ) - - if response.ok: - return self._parse_dgml(DocumentNode, response.content, doc_metadata) - else: - raise Exception( - f"Failed to download {url} (status: {response.status_code})" - ) - - def load_data( - self, - docset_id: str, - document_ids: Optional[List[str]] = None, - access_token: Optional[str] = None, - ) -> List[DocumentNode]: - """Load data the given docset_id in Docugami. - - Args: - docset_id (str): DocumentNode set ID to load data for. 
- document_ids (Optional[List[str]]): Optional list of DocumentNode ids to load data for. - If not specified, all documents from docset_id are loaded. - """ - chunks: List[DocumentNode] = [] - - if access_token: - self.access_token = access_token - - if not self.access_token: - raise Exception( - "Please specify access token as argument or set the DOCUGAMI_API_KEY env var." - ) - - _document_details = self._document_details_for_docset_id(docset_id) - if document_ids: - _document_details = [ - d for d in _document_details if d["id"] in document_ids - ] - - _project_details = self._project_details_for_docset_id(docset_id) - combined_project_metadata = {} - if _project_details: - # if there are any projects for this docset, load project metadata - for project in _project_details: - metadata = self._metadata_for_project(project) - combined_project_metadata.update(metadata) - - for doc in _document_details: - doc_metadata = combined_project_metadata.get(doc["id"]) - chunks += self._load_chunks_for_document(docset_id, doc, doc_metadata) - - return chunks - - -if __name__ == "__main__": - reader = DocugamiReader() - print( - reader.load_data( - docset_id="ecxqpipcoe2p", document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"] - ) - ) diff --git a/nextpy/ai/rag/document_loaders/docugami/docugami.ipynb b/nextpy/ai/rag/document_loaders/docugami/docugami.ipynb deleted file mode 100644 index 9a11cc4e..00000000 --- a/nextpy/ai/rag/document_loaders/docugami/docugami.ipynb +++ /dev/null @@ -1,367 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Docugami\n", - "This notebook covers how to load documents from `Docugami`. See [README](./README.md) for more details, and the advantages of using this system over alternative data loaders.\n", - "\n", - "## Prerequisites\n", - "1. Follow the Quick Start section in [README](./README.md)\n", - "2. Grab an access token for your workspace, and make sure it is set as the DOCUGAMI_API_KEY environment variable\n", - "3. Grab some docset and DocumentNode IDs for your processed documents, as described here: https://help.docugami.com/home/docugami-api" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Documents\n", - "\n", - "If the DOCUGAMI_API_KEY environment variable is set, there is no need to pass it in to the loader explicitly otherwise you can pass it in as the `access_token` parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DocumentNode(id_='c1adad58-13c4-4455-b286-68ade1aa23ef', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='368d8592f11eea5a4d5283bea95d58615ecb5c26d0ff334589530154567ba1c7', text='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “ Agreement ”) is entered into and made effective as of April 4 , 2018 between Docugami Inc. , a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Caleb Divine , an individual, whose address is 1201 Rt 300 , Newburgh NY 12550 .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='71d42249-72f6-4b9f-a867-0006ab8cdd7f', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Discussions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Discussions'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='82d619fcda012945be1f03fe6695214a4ca4d2cca1762b3bb7de49c9b3e6fc7f', text='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “Purpose”). 
In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='c6f7e876-bc98-464c-a077-603e050b5e5b', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Consideration', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'Consideration'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='56c557f48bcb2f6f1d9543f5ebaf8403f7560855fc4fd56db8ce2d49956b04ae', text='In consideration of the foregoing, the parties agree as follows:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='0b1d21d9-e5d1-4bf8-9817-5c58abc7c798', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Purposes/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ConfidentialInformation'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='2b897e1e8b630de4f0955b6401a88096c4bc65bcab5525e6986de49117581dbd', text='1. Confidential Information . For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='8b1ec620-e76a-47a5-9a47-93bb51b2cffa', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Obligations/docset:ObligationsAndRestrictions-section/docset:ObligationsAndRestrictions', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ObligationsAndRestrictions'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='4863e312bc2c4c138558e37529e0ac109f18d4791495efb9f123bf36b0c73ef7', text=\"2. Obligations and Restrictions . 
Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and (iii) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other party’s Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other party’s Confidential Information as those set forth in this Agreement .\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='ab98027e-b9ae-4270-8cd7-55ab32c136da', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Exceptions/docset:Exceptions-section/docset:Exceptions[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Exceptions'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='9e8f83441e0ac68bc629fcfcd9a5b185b8dde0c2eb7d7209c12283fe2e42369f', text='3. Exceptions. The obligations and restrictions in Section 2 will not apply to any information or materials that:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='2e45a618-bbc3-4a83-a5ee-c2bfdf833f7f', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheDate/docset:TheDate', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheDate'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='8c232813973ffefbc77c3ac3a89c7e3d4cdd78540c62700b2be74bb392f688d1', text='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='e6c988b8-3c7e-47d6-a4a2-d81cc18a495a', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='cf659be008f33074f113194b8e69fd7c91ae5c48d4a9ee4514b573525d666443', text='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', start_char_idx=None, end_char_idx=None, 
text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='f048e75f-693b-4d1f-8486-1160f008e862', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheDate/docset:TheReceivingParty/docset:TheReceivingParty', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheReceivingParty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='917fcdd86af937d5616920f555349580287c71de5f0b7ceef01b2bb2ed7ba85b', text='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party;', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='3446f26e-57a6-4d7e-952a-f0da49f4645d', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'CompelledDisclosure'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='aa6792c7c7b06bc8369669d9f9396d1130cf43b46bab873995ea9e4baefac99b', text='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='8723c63b-2909-498f-ad3b-eeabac75296c', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='96228459e30933cfb4caef7bf622a8d69d1e2ad81a5bab80b437d476064d180e', text='5. Return of Confidential Information . 
Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='3ce7cce5-5406-4ad0-8a46-c7b60ae05bba', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoObligations'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='ec8a72aec1f9b3c79b75320d0791b57cc4ad6477b5736f8dd7d412601a045de0', text='6. No Obligations . Each party retains the right to determine whether to disclose any Confidential Information to the other party.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='d07975c2-fc28-48be-b08f-026376045c0e', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'NoWarranty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='6ec95c44359ab768933cd504cef2995a968fd0b2c492ec9e86feca828bada420', text='7. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='7a8c5987-c32a-44ce-a0be-1e4fed2f622a', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:ThisAgreement/docset:Term-section/docset:Term', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Term'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0433e3ad32d54390ef1a56f71737ec1022ea503f69154fea86f7412ab06be4e4', text='8. Term. 
This Agreement will remain in effect for a period of seven ( 7 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='0067a629-2a96-4685-b4e6-96638e35853c', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'EquitableRelief'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='644d87541f4a44c2aa5fa8507178e85198fcaec1649e43202b00f5309322909a', text='9. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='113dc7ff-9cc2-4727-89ca-cc571892ceff', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:TheMaximumExtent/docset:Non-compete-section/docset:Non-compete', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Non-compete'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='fbe53faf86b169b8eff8493aa195dfc93bdb23b49b634852d61a713ea70b89c5', text='10. Non-compete. To the maximum extent permitted by applicable law, during the Term of this Agreement and for a period of one ( 1 ) year thereafter, Caleb Divine may not market software products or do business that directly or indirectly competes with Docugami software products .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='0d7c960a-4354-47d3-b771-d4544eb5c002', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:MutualNon-disclosure/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:Consideration/docset:Purposes/docset:Accordance/docset:Miscellaneous-section/docset:Miscellaneous', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'div', 'tag': 'Miscellaneous'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='3a77702016956a88bdb283d44959e69ffac34b85aa9517f0690daf7f66ad23c0', text='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. 
This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='274e8599-41c3-4bd7-bc8d-ff1c7028688e', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:TheParties', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': 'p', 'tag': 'TheParties'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='39fd4861c450f4aa99db25846744bb1c85524dc093e8bf2f9c9e872c1040594c', text='[SIGNATURE PAGE FOLLOWS] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='a846786d-d48f-4f8a-b6d0-db72b41aff93', embedding=None, metadata={'xpath': '/docset:MutualNon-disclosure/docset:Witness/docset:TheParties/docset:DocugamiInc/docset:DocugamiInc/xhtml:table', 'id': '43rj0ds7s0ur', 'name': 'NDA simple layout.docx', 'structure': '', 'tag': 'table'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='e311d3c0a8be4ae9f3543e2586bad04cb321ab2613a025422e4b320e3771232b', text='DOCUGAMI INC . : \\n\\n Caleb Divine : \\n\\n Signature: Signature: Name: \\n\\n Jean Paoli Name: Title: \\n\\n CEO Title:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='772cb699-5da6-40b3-b8a7-ad4e27f2d6df', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement/docset:ThisMutualNon-disclosureAgreement', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'ThisMutualNon-disclosureAgreement'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='348c40a6fef0b79ee94c35d1ea6722717afb473dbf9fe97cae7ea73ad9a9f6f2', text='MUTUAL NON-DISCLOSURE AGREEMENT This Mutual Non-Disclosure Agreement (this “Agreement’) is entered into and made effective as of 2/4/2018 between Docugami Inc. 
, a Delaware corporation , whose address is 150 Lake Street South , Suite 221 , Kirkland , Washington 98033 , and Leonarda Hosler , an individual, whose address is 374 William S Canning Blvd , Fall River MA 2721 .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='d071de3e-e3d3-43b4-a5b0-5bd476e20397', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement/docset:Discussions', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'Discussions'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='bf0d4bf957e57f052949cae510d3a6a012a908edc9e83fe9186c98e5b8229f53', text='The above named parties desire to engage in discussions regarding a potential agreement or other transaction between the parties (the “ Purpose’). In connection with such discussions, it may be necessary for the parties to disclose to each other certain confidential information or materials to enable them to evaluate whether to enter into such agreement or transaction.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='4b0c411e-4a2e-46f0-9ce6-1a954af0c43b', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement/docset:Consideration/docset:Consideration', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'Consideration'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='e96bfc5a92ebedb78c5ead071be8a1c94cd54fc3ad8a6c3fc9359ceeec7ca5e2', text='In consideration of the foregoing, the parties agree as follows:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='08070725-492d-4d6c-99f9-959c4c4b41b5', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ThisMutualNon-disclosureAgreement/docset:Consideration/dg:chunk/docset:IlConfidentialInformation/docset:ConfidentialInformation-section/docset:ConfidentialInformation[2]', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'ConfidentialInformation'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='d234322b083398877c5fde4e8d8e208d2f8853041d8bb36c285d3f7fd922984b', text='iL. Confidential Information . 
For purposes of this Agreement , “ Confidential Information ” means any information or materials disclosed by one party to the other party that: (i) if disclosed in writing or in the form of tangible materials, is marked “confidential” or “proprietary” at the time of such disclosure; (ii) if disclosed orally or by visual presentation, is identified as “confidential” or “proprietary” at the time of such disclosure, and is summarized in a writing sent by the disclosing party to the receiving party within thirty ( 30 ) days after any such disclosure; or (iii) due to its nature or the circumstances of its disclosure, a person exercising reasonable business judgment would understand to be confidential or proprietary.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='709b93bc-9506-429a-9dba-010f23c545a9', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:StrictConfidence/docset:StrictConfidence', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'StrictConfidence'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='72cb89c8632ae4c6e6a70a744b4b80c6c654dcbcc19fa6685b3cce76621d0ac5', text=\"Ze Obligations and Restrictions . Each party agrees: (i) to maintain the other party's Confidential Information in strict confidence; (ii) not to disclose such Confidential Information to any third party; and ( iii ) not to use such Confidential Information for any purpose except for the Purpose. Each party may disclose the other party’s Confidential Information to its employees and consultants who have a bona fide need to know such Confidential Information for the Purpose, but solely to the extent necessary to pursue the Purpose and for no other purpose; provided, that each such employee and consultant first executes a written agreement (or is otherwise already bound by a written agreement) that contains use and nondisclosure restrictions at least as protective of the other party’s Confidential Information as those set forth in this Agreement .\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='11295e87-7b9c-418e-a624-77255cb83995', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheObligations', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheObligations'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='209d94e5c657f32d408683f633ae6365e64933a5f573da42ac00aa5f28a4e8ed', text='is Exceptions. 
The obligations and restrictions in Section 2 will not apply to any information or materials that:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='5a18d3ec-40fd-42f2-9a82-9ed10595131f', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:TheDate/docset:TheDate', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheDate'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='6874cd19a59835e3088539c2f030a7a48e161144f3027aa998e9a1e4e6d97e55', text='(i) were, at the date of disclosure, or have subsequently become, generally known or available to the public through no act or failure to act by the receiving party;', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='2194b494-89b6-4314-9a55-c48d6bebd8f9', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:SuchInformation/docset:TheReceivingParty', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheReceivingParty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='45b2b8b3c690f1740cfb9d107a7aac93957558657f23fb33de1d5c1a3d9766d5', text='(ii) were rightfully known by the receiving party prior to receiving such information or materials from the disclosing party;', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='bfc61523-08cb-45b7-b993-5cf5005b9cf2', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:TheReceivingParty[1]/docset:TheReceivingParty', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheReceivingParty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='c5619393062d7e158772d63dc65e69cc1e0307001e94e7fa95c8ddef0af995ae', text='(iii) are rightfully acquired by the receiving party from a third party who has the right to disclose such information or materials without breach of any confidentiality obligation to the disclosing party; or', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='11fa54db-a95a-4444-b4f0-0d084f911987', embedding=None, metadata={'xpath': 
'/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:TheReceivingParty[2]/docset:TheReceivingParty', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheReceivingParty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='ae9f256f6d6c0eced35325f4581324e5d7c62d015b399dc6d53c422a1f7299f6', text='(iv) are independently developed by the receiving party without access to any Confidential Information of the disclosing party.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='26e13edc-e722-49a0-8911-aeee735655b1', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:Disclosure/docset:CompelledDisclosure-section/docset:CompelledDisclosure', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'CompelledDisclosure'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='05a777dce696dde4b471bb89e39c811d431b0094678a1aa43d54375e883971b2', text='4. Compelled Disclosure . Nothing in this Agreement will be deemed to restrict a party from disclosing the other party’s Confidential Information to the extent required by any order, subpoena, law, statute or regulation; provided, that the party required to make such a disclosure uses reasonable efforts to give the other party reasonable advance notice of such required disclosure in order to enable the other party to prevent or limit such disclosure.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='ed00c799-026b-4477-93b2-6a4ee5bfc9e5', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:TheCompletion/docset:ReturnofConfidentialInformation-section/docset:ReturnofConfidentialInformation', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'ReturnofConfidentialInformation'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='362a60349e7398655df172684ddd398718b40111ec44f3a4b3766286277398ec', text='5. Return of Confidential Information . 
Upon the completion or abandonment of the Purpose, and in any event upon the disclosing party’s request, the receiving party will promptly return to the disclosing party all tangible items and embodiments containing or consisting of the disclosing party’s Confidential Information and all copies thereof (including electronic copies), and any notes, analyses, compilations, studies, interpretations, memoranda or other documents (regardless of the form thereof) prepared by or on behalf of the receiving party that contain or are based upon the disclosing party’s Confidential Information .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='8db0ab30-5bfb-4627-9da5-a6101a35b6d9', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:Exceptions-section/docset:Exceptions/docset:TheDate/docset:NoObligations/docset:NoObligations-section/docset:NoObligations[2]', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'NoObligations'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='3ba55f7b677f0eb25b628b31fa943f62ee192afe8b34c3ef76712f67c7cf9489', text='6. No Obligations . Each party retains the right, in its sole discretion, to determine whether to disclose any Confidential Information to the other party. Neither party will be required to negotiate nor enter into any other agreements or arrangements with the other party, whether or not related to the Purpose.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='c7490616-7bf0-47fc-ac29-16ee90a99d92', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:TheSoleAndExclusiveProperty/docset:TheSoleAndExclusiveProperty', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheSoleAndExclusiveProperty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='137345886cee3712d74ff75fba0e2143d33b82f7b3cf1b70883719b412a37e1c', text='ie No License . All Confidential Information remains the sole and exclusive property of the disclosing party. 
Each party acknowledges and agrees that nothing in this Agreement will be construed as granting any rights to the receiving party, by license or otherwise, in or to any Confidential Information of the disclosing party, or any patent, copyright or other intellectual property or proprietary rights of the disclosing party, except as specified in this Agreement .', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='e8572e09-ace8-4ce7-9b9d-dc34e2b67009', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:TheSoleAndExclusiveProperty/docset:NoWarranty/docset:NoWarranty/docset:NoWarranty-section/docset:NoWarranty[2]', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'NoWarranty'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='1e12c1c70bca5303929648afd4bf2240fb0540572f8c1de37668e5f8d4928667', text='8. No Warranty. ALL CONFIDENTIAL INFORMATION IS PROVIDED BY THE DISCLOSING PARTY “AS IS ”.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='ce9cd379-27ab-4f96-900b-60013bae4594', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:TheSoleAndExclusiveProperty/docset:NoWarranty/docset:ThisAgreement/docset:Term-section/docset:Term', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'Term'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='5ac55d08549f7d427f14fc7c2e35ad192b84a86784cafe120e139ad8fd4ad216', text='9. Term. This Agreement will remain in effect for a period of five ( 5 ) years from the date of last disclosure of Confidential Information by either party, at which time it will terminate.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='d0cb17da-b553-4d26-901f-eee4e880fa6e', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:TheSoleAndExclusiveProperty/docset:NoWarranty/docset:EquitableRelief/docset:EquitableRelief-section/docset:EquitableRelief[2]', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'EquitableRelief'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='f5c840f17e99e16816b1b1263b4062b382001a9a8467cca43c3624da4cb357c5', text='10. Equitable Relief . Each party acknowledges that the unauthorized use or disclosure of the disclosing party’s Confidential Information may cause the disclosing party to incur irreparable harm and significant damages, the degree of which may be difficult to ascertain. 
Accordingly, each party agrees that the disclosing party will have the right to seek immediate equitable relief to enjoin any unauthorized use or disclosure of its Confidential Information , in addition to any other rights and remedies that it may have at law or otherwise.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='ed6fa7e8-f0c2-4b1e-af9b-d89ce650ce79', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:TheSoleAndExclusiveProperty/docset:NoWarranty/docset:Accordance/docset:MiscellaneousThisAgreementWillBeGovernedAndConstruedinAccordancewithT/docset:MiscellaneousThisAgreementWillBeGovernedAndConstruedinAccordancewithT', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'div', 'tag': 'MiscellaneousThisAgreementWillBeGovernedAndConstruedinAccordancewithT'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='cdb200445df5b1577492f4c03e1f643d6a6195e7bdf794e0a77d6eb63c99ccad', text='11. Miscellaneous. This Agreement will be governed and construed in accordance with the laws of the State of Washington , excluding its body of law controlling conflict of laws. This Agreement is the complete and exclusive understanding and agreement between the parties regarding the subject matter of this Agreement and supersedes all prior agreements, understandings and communications, oral or written, between the parties regarding the subject matter of this Agreement . If any provision of this Agreement is held invalid or unenforceable by a court of competent jurisdiction, that provision of this Agreement will be enforced to the maximum extent permissible and the other provisions of this Agreement will remain in full force and effect. Neither party may assign this Agreement , in whole or in part, by operation of law or otherwise, without the other party’s prior written consent, and any attempted assignment without such consent will be void. This Agreement may be executed in counterparts, each of which will be deemed an original, but all of which together will constitute one and the same instrument.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", - " DocumentNode(id_='83a9cc7c-3e89-43c1-a351-2dcb09573d65', embedding=None, metadata={'xpath': '/docset:MUTUALNON-DISCLOSUREAGREEMENT-section/docset:MUTUALNON-DISCLOSUREAGREEMENT/docset:ZeObligationsAndRestrictions-section/docset:ZeObligationsAndRestrictions/docset:Exceptions/docset:IeNoLicense-section/docset:IeNoLicense[2]/docset:SIGNATUREPAGEFOLLOWS-section/docset:SIGNATUREPAGEFOLLOWS/docset:INWITNESSWHEREOF/docset:TheParties', 'id': 'bpc1vibyeke2', 'name': 'NDA simple layout nicely scanned.pdf', 'structure': 'p', 'tag': 'TheParties'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='2c9caed694c0786e86562840dbd946d23c3e5c36c30718204d0d7e0986d84d9d', text='[SIGNATURE PAGE FOLLows] IN WITNESS WHEREOF, the parties hereto have executed this Mutual Non-Disclosure Agreement by their duly authorized officers or representatives as of the date first set forth above. DOCUGAMI INC . 
INC .: Leonarda Hosler : Name: Name: Title: Title:', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from nextpy.ai import download_loader\n", - "\n", - "DocugamiReader = download_loader('DocugamiReader')\n", - "\n", - "docset_id=\"ecxqpipcoe2p\"\n", - "document_ids=[\"43rj0ds7s0ur\", \"bpc1vibyeke2\"]\n", - "\n", - "loader = DocugamiReader()\n", - "documents = loader.load_data(docset_id=docset_id, document_ids=document_ids)\n", - "documents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `metadata` for each `DocumentNode` (really, a chunk of an actual PDF, DOC or DOCX) contains some useful additional information:\n", - "\n", - "1. **id and name:** ID and Name of the file (PDF, DOC or DOCX) the chunk is sourced from within Docugami.\n", - "2. **xpath:** XPath inside the XML representation of the DocumentNode, for the chunk. Useful for source citations directly to the actual chunk inside the DocumentNode XML.\n", - "3. **structure:** Structural attributes of the chunk, e.g. h1, h2, div, table, td, etc. Useful to filter out certain kinds of chunks if needed by the caller.\n", - "4. **tag:** Semantic tag for the chunk, using various generative and extractive techniques. More details here: https://github.com/docugami/DFM-benchmarks" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Use: Docugami Loader for DocumentNode QA\n", - "\n", - "You can use the Docugami Loader like a standard loader for DocumentNode QA over multiple docs, albeit with much better chunks that follow the natural contours of the DocumentNode. There are many great tutorials on how to do this, e.g. [this one](https://gpt-index.readthedocs.io/en/latest/getting_started/starter_example.html). We can just use the same code, but use the `DocugamiLoader` for better chunking, instead of loading text or PDF files directly with basic splitting techniques." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from nextpy.ai import GPTVectorDBIndex\n", - "\n", - "DocugamiReader = download_loader('DocugamiReader')\n", - "\n", - "# For this example, we already have a processed docset for a set of lease documents\n", - "docset_id=\"wh2kned25uqm\"\n", - "documents = loader.load_data(docset_id=docset_id)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The documents returned by the loader are already split into chunks. Optionally, we can use the metadata on each chunk, for example the structure or tag attributes, to do any post-processing we want.\n", - "\n", - "We will just use the output of the `DocugamiLoader` as-is to set up a query engine the usual way." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTVectorDBIndex.from_documents(documents)\n", - "query_engine = index.as_query_engine(similarity_top_k=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Tenants can place or attach signs (digital or otherwise) or other forms of identification to their properties after receiving written permission from the landlord. 
Any signs or other forms of identification must conform to all applicable laws, ordinances, etc. governing the same. Tenants must also have any window or glass identification completely removed and cleaned at their expense promptly upon vacating the premises.\n", - "NodeWithScore(node=Node(text='Signage. Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises. \\n\\n ARTICLE VII UTILITIES 7.01', doc_id='1e89f5bf-0cb6-491a-acf6-8be9e6dc6ffb', embedding=None, doc_hash='50e3892892d18199d6b6db4d6205beb327f09b031539afc9e9b239548639a89d', extra_info={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOFFICELEASEAGREEMENTThis/docset:ArticleIBasic/docset:ArticleIiiUseAndCareOf/docset:ARTICLEIIIUSEANDCAREOFPREMISES-section/docset:ARTICLEIIIUSEANDCAREOFPREMISES/docset:NoOtherPurposes/docset:TenantsResponsibility/dg:chunk', 'id': 'g2fvhekmltza', 'name': 'TruTone Lane 6.pdf', 'structure': 'lim', 'tag': 'chunk'}, node_info={'start': 0, 'end': 747}, relationships={: '84779dc3-a104-4bff-bced-f7e2dde58cc1'}), score=0.8617797232715348)\n", - "NodeWithScore(node=Node(text=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", doc_id='ac44b4fe-551d-4b17-9100-0889c4842f5f', embedding=None, doc_hash='d383b8792e586979e3082ebd4f9e06121f663a53ffd6a712c5622f5cec65bba5', extra_info={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Period/docset:ApplicableSalesTax/docset:PercentageRent/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS', 'id': 'qkn9cyqsiuch', 'name': 'Shorebucks LLC_AZ.pdf', 'structure': 'div', 'tag': 'SIGNS'}, node_info={'start': 0, 'end': 597}, relationships={: 'eccd7773-5fcf-4064-8f62-67f45c724ecd'}), score=0.8508437736864953)\n", - "NodeWithScore(node=Node(text='ARTICLE VI SIGNAGE 6.01 Signage . Tenant may place or attach to the Premises signs (digital or otherwise) or other such identification as needed after receiving written permission from the Landlord , which permission shall not be unreasonably withheld. Any damage caused to the Premises by the Tenant ’s erecting or removing such signs shall be repaired promptly by the Tenant at the Tenant ’s expense . 
Any signs or other form of identification allowed must conform to all applicable laws, ordinances, etc. governing the same. Tenant also agrees to have any window or glass identification completely removed and cleaned at its expense promptly upon vacating the Premises.', doc_id='7aa86f41-d711-42bd-94ed-fc99f7c90443', embedding=None, doc_hash='9cf87806118da7fa99be843c9f926302b5ccf1716ceec2fa2352b5f8726182c1', extra_info={'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:Article/docset:ARTICLEVISIGNAGE-section/docset:_601Signage-section/docset:_601Signage', 'id': 'v1bvgaozfkak', 'name': 'TruTone Lane 2.docx', 'structure': 'div', 'tag': '_601Signage'}, node_info={'start': 0, 'end': 684}, relationships={: '1afd38c9-900b-4e5d-902a-020f0b824751'}), score=0.8491465492763234)\n", - "NodeWithScore(node=Node(text=\"44 . Signs And Exterior Appearance Tenant agrees that all signs, awnings, protective gates, security devices and other installations visible from the exterior of the Premises shall be subject to Landlord 's prior written approval , shall be subject to the prior approval of the Landmarks Preservation Commission of the City of New York , if required, and shall not interfere with or block either of the adjacent stores, provided, however, that Landlord shall not unreasonably withhold consent for signs that Tenant desires to install. Tenant agrees that any permitted signs, awnings, protective gates, security devices, and other installations shall be installed at Tenant ’s sole cost and expense professionally prepared and dignified and subject to Landlord 's prior written approval , which shall not be unreasonably withheld, delayed or conditioned, and subject to such reasonable rules and restrictions as Landlord from time to time may impose. Tenant shall submit to Landlord drawings of the proposed signs and other installations, showing the size, color, illumination and general appearance thereof, together with a statement of the manner in which the same are to be affixed to the Premises. Tenant shall not commence the installation of the proposed signs and other installations unless and until Landlord shall have approved the same in writing. . Tenant shall not install any neon sign. The aforesaid signs shall be used solely for the purpose of identifying Tenant 's business . No changes shall be made in the signs and other installations without first obtaining Landlord 's prior written consent thereto, which consent shall not be unreasonably withheld, delayed or conditioned. 
Tenant shall, at its own cost and expense, obtain and exhibit to Landlord such permits or certificates of approval as Tenant may be required to obtain from any and all City , State and other authorities having jurisdiction covering the erection, installation, maintenance or use of said signs or other installations, and Tenant shall maintain the said signs and other installations together with any appurtenances thereto in good order and\", doc_id='df1def90-2c7e-449b-96f1-4c8b62b44e74', embedding=None, doc_hash='b5b03c69d554cba1efa555a76d44ebc099877484f788d748b1892a9622a1de1a', extra_info={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42hSmokingProhibitedTenant/docset:TenantsEmployees/docset:TheArea/docset:_44SignsAndExteriorAppearance-section/docset:_44SignsAndExteriorAppearance/docset:TheExterior/docset:TheExterior', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'TheExterior'}, node_info={'start': 0, 'end': 2181}, relationships={: '063cb174-4593-461a-8afe-1bec0190cecd'}), score=0.8484529479796804)\n", - " NodeWithScore(node=Node(text=\"24. SIGNS . No signage shall be placed by Tenant on any portion of the Project . However, Tenant shall be permitted to place a sign bearing its name in a location approved by Landlord near the entrance to the Premises (at Tenant's cost ) and will be furnished a single listing of its name in the Building's directory (at Landlord 's cost ), all in accordance with the criteria adopted from time to time by Landlord for the Project . Any changes or additional listings in the directory shall be furnished (subject to availability of space) for the then Building Standard charge .\", doc_id='87672346-8373-4c19-a1e3-5fe55410c561', embedding=None, doc_hash='6f90f6b2ac80947c072d4fbfcab6824f68af7b74ab3b284b6e65d30ce3ed6f4c', extra_info={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:Guaranty-section/docset:Guaranty[2]/docset:TheTransfer/docset:TheTerms/docset:Indemnification/docset:INDEMNIFICATION-section/docset:INDEMNIFICATION/docset:Waiver/docset:Waiver/docset:Signs/docset:SIGNS-section/docset:SIGNS', 'id': 'md8rieecquyv', 'name': 'Shorebucks LLC_NJ.pdf', 'structure': 'div', 'tag': 'SIGNS'}, node_info={'start': 0, 'end': 597}, relationships={: '942fd7ed-4303-4b8e-8877-b198e8bb80bb'}), score=0.8460398975408094)\n" - ] - } - ], - "source": [ - "# Try out the query engine with example query\n", - "response = query_engine.query(\"What can tenants do with signage on their properties?\")\n", - "print(response.response)\n", - "for node in response.source_nodes:\n", - " print(node)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Docugami to Add Metadata to Chunks for High Accuracy DocumentNode QA\n", - "\n", - "One issue with large documents is that the correct answer to your question may depend on chunks that are far apart in the DocumentNode. Typical chunking techniques, even with overlap, will struggle with providing the LLM sufficient context to answer such questions. 
With upcoming very large context LLMs, it may be possible to stuff a lot of tokens, perhaps even entire documents, inside the context but this will still hit limits at some point with very long documents, or a lot of documents.\n", - "\n", - "For example, if we ask a more complex question that requires the LLM to draw on chunks from different parts of the DocumentNode, even OpenAI's powerful LLM is unable to answer correctly." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "The security deposit for the property owned by Birch Street is not specified in the context information provided.\n", - "Shorebucks LLC_CO.pdf\n", - "1.12 Security Deposit . As of the Date of this Lease , there is no Security Deposit .\n", - "Shorebucks LLC_AZ.pdf\n", - "22. SECURITY DEPOSIT . The Security Deposit shall be held by Landlord as security for Tenant 's full and faithful performance of this Lease including the payment of Rent . Tenant grants Landlord a security interest in the Security Deposit . The Security Deposit may be commingled with other funds of Landlord and Landlord shall have no liability for payment of any interest on the Security Deposit . Landlord may apply the Security Deposit to the extent required to cure any default by Tenant . If Landlord so applies the Security Deposit , Tenant shall deliver to Landlord the amount necessary to replenish the Security Deposit to its original sum within five days after notice from Landlord . The Security Deposit shall not be deemed an advance payment of Rent or a measure of damages for any default by Tenant , nor shall it be a defense to any action that Landlord may bring against Tenant .\n", - "Shorebucks LLC_NJ.pdf\n", - "22. SECURITY DEPOSIT . The Security Deposit shall be held by Landlord as security for Tenant 's full and faithful performance of this Lease including the payment of Rent . Tenant grants Landlord a security interest in the Security Deposit . The Security Deposit may be commingled with other funds of Landlord and Landlord shall have no liability for payment of any interest on the Security Deposit . Landlord may apply the Security Deposit to the extent required to cure any default by Tenant . If Landlord so applies the Security Deposit , Tenant shall deliver to Landlord the amount necessary to replenish the Security Deposit to its original sum within five days after notice from Landlord . The Security Deposit shall not be deemed an advance payment of Rent or a measure of damages for any default by Tenant , nor shall it be a defense to any action that Landlord may bring against Tenant .\n", - "Shorebucks LLC_CO.pdf\n", - "22. SECURITY DEPOSIT . The Security Deposit shall be held by Landlord as security for Tenant 's full and faithful performance of this Lease including the payment of Rent . Tenant grants Landlord a security interest in the Security Deposit . The Security Deposit may be commingled with other funds of Landlord and Landlord shall have no liability for payment of any interest on the Security Deposit . Landlord may apply the Security Deposit to the extent required to cure any default by Tenant . If Landlord so applies the Security Deposit , Tenant shall deliver to Landlord the amount necessary to replenish the Security Deposit to its original sum within five days after notice from Landlord . 
The Security Deposit shall not be deemed an advance payment of Rent or a measure of damages for any default by Tenant , nor shall it be a defense to any action that Landlord may bring against Tenant .\n", - "Shorebucks LLC_NJ.pdf\n", - "1.12 Security Deposit . As of the Date of this Lease , there is no Security Deposit .\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/openams/data_structs/node.py:181: UserWarning: .extra_info is deprecated, use .node.extra_info instead\n", - " warnings.warn(\".extra_info is deprecated, use .node.extra_info instead\")\n" - ] - } - ], - "source": [ - "response = query_engine.query(\"What is the security deposit for the property owned by Birch Street?\")\n", - "print(response.response) # the correct answer should be $78,000\n", - "for node in response.source_nodes:\n", - " print(node.metadata[\"name\"])\n", - " print(node.node.text)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At first glance the answer may seem reasonable, but if you review the source chunks carefully for this answer, you will see that the chunking of the DocumentNode did not end up putting the Landlord name and the security deposit amount in the same context, since they are far apart in the DocumentNode. The query engine therefore ends up finding unrelated chunks from other documents not even related to the **Birch Street** landlord. That landlord happens to be mentioned on the first page of the **TruTone Lane 1.docx** file, and none of the source chunks used by the query engine contain the correct answer (**$78,000**), so the answer is incorrect." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Docugami can help here. If a user has been [using Docugami](https://help.docugami.com/home/reports), chunks are annotated with additional metadata created using different techniques. More technical approaches will be added later.\n", - "\n", - "Specifically, let's look at the additional metadata that Docugami returns on the documents after some additional use, in the form of some simple key/value pairs on all the text chunks:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'xpath': '/docset:OFFICELEASEAGREEMENT-section/docset:OFFICELEASEAGREEMENT/docset:ThisOfficeLeaseAgreement',\n", - " 'id': 'v1bvgaozfkak',\n", - " 'name': 'TruTone Lane 2.docx',\n", - " 'structure': 'p',\n", - " 'tag': 'ThisOfficeLeaseAgreement',\n", - " 'Landlord': 'BUBBA CENTER PARTNERSHIP',\n", - " 'Tenant': 'Truetone Lane LLC'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docset_id=\"wh2kned25uqm\"\n", - "documents = loader.load_data(docset_id=docset_id)\n", - "documents[0].metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTVectorDBIndex.from_documents(documents)\n", - "query_engine = index.as_query_engine(similarity_top_k=5)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's run the same question again. It returns the correct result since all the chunks have metadata key/value pairs on them carrying key information about the DocumentNode, even if this information is physically very far away from the source chunk used to generate the answer." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "The security deposit for the property owned by Birch Street is $78,000.\n", - "TruTone Lane 1.docx\n", - "NodeWithScore(node=Node(text='$ 20,023.78 of the Security to the Tenant and the Security obligation shall be $ 31,976.72 and remain until the expiration or earlier termination of this Lease .', doc_id='d34995dc-cbe2-4f70-a248-ca0e8c937d7b', embedding=None, doc_hash='84ec2102e9e9cc07487556772b8f97aa14e01d6f763ba1315e0ae2132d67691c', extra_info={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42hSmokingProhibitedTenant/docset:TenantsEmployees/docset:TheArea/docset:_56SecurityDeposit-section/docset:_56SecurityDeposit/docset:TheForegoing/docset:TheSecurity', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'TheSecurity', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}, node_info={'start': 0, 'end': 171}, relationships={: '659e354f-b749-4938-967f-638fea177fa0'}), score=0.8289222268861388)\n", - "TruTone Lane 1.docx\n", - "NodeWithScore(node=Node(text='The Security being held pursuant to this Article shall at all times be an amount equal to \\n\\n\\n\\n\\n\\n three ( 3 ) times the monthly fixed rent then reserved under Article 40 of this Lease . On the first day of the month following each anniversary of the Rent Commencement Date of this Lease , Tenant shall pay to Landlord funds sufficient so that the un-applied Security held by Landlord shall at all times equal three times the monthly fixed rent then reserved under Article 40 of this Lease .', doc_id='f0d27e80-90b8-4436-85eb-f0deaa485b77', embedding=None, doc_hash='a0fcdc9cd2dc6dc9f9f97423f8d76494af80b500c5c7bdbefc2c05aea9085d89', extra_info={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42hSmokingProhibitedTenant/docset:TenantsEmployees/docset:TheArea/docset:_56SecurityDeposit-section/docset:_56SecurityDeposit/docset:TheEvent/docset:TheSecurity', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'TheSecurity', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}, node_info={'start': 0, 'end': 517}, relationships={: 'a97b9f2a-2e01-4d65-bfd3-89aa18fca942'}), score=0.8227364343224219)\n", - "TruTone Lane 1.docx\n", - "NodeWithScore(node=Node(text=\"56 . Security Deposit Upon execution of this Lease , Tenant has deposited with Landlord the sum of $ 78,000.00 in good funds as security for the full and faithful performance and observance by Tenant of the terms, covenants and conditions of this Lease (the “Security”). If Tenant defaults in the performance or observance of any term, covenant or condition of this Lease , including without limitation the obligation of Tenant to pay any rent or other sum required hereunder, Landlord may use, after 10 days written notice to Tenant ,apply, or retain, without any application to any court or tribunal, the whole or any part of the Security so deposited to the extent required for the payment of any rent or any other sum as to which Tenant is in default or for any sum which Landlord may expend or may be required to expend by reason of Tenant 's default , including without limitation any damages or deficiency accrued before or after summary proceedings or other re-entry by Landlord . 
Such use, application, or retention by the Landlord shall be without prejudice to Landlord ’s rights to seek any and all additional rent and/or damages that may have accrued. If Tenant shall fully and faithfully observe and perform all of the terms, covenants, and conditions of this Lease , the Security , shall be returned to Tenant after the end of the term of this Lease or at permissible early termination as provided herein and the delivery of possession of the demised Premises to Landlord .\", doc_id='5456d727-13b5-4197-9070-b6acad549f58', embedding=None, doc_hash='3ae3541e4750e005e58bd6a9c8379f548309eadc5559b6fd9d0636fea6909fc0', extra_info={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42hSmokingProhibitedTenant/docset:TenantsEmployees/docset:TheArea/docset:_56SecurityDeposit-section/docset:_56SecurityDeposit/docset:Execution', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'Execution', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}, node_info={'start': 0, 'end': 1533}, relationships={: '9a9d71ca-c0a3-4ab4-ab58-cf5cd611a53c'}), score=0.8225535679622072)\n", - "Shorebucks LLC_CO.pdf\n", - "NodeWithScore(node=Node(text='1.12 Security Deposit . As of the Date of this Lease , there is no Security Deposit .', doc_id='418f110b-c0fd-4813-9649-2003a0c47504', embedding=None, doc_hash='6344b5840d282172b1bcb82b4e29a74e524b011c1f73dfd26d5563dfc796193b', extra_info={'xpath': '/docset:OFFICELEASE-section/docset:OFFICELEASE/docset:THISOFFICELEASE/docset:WITNESSETH-section/docset:WITNESSETH/docset:GrossRentCreditTheRentCredit-section/docset:GrossRentCreditTheRentCredit/docset:First/docset:ApplicableSalesTax/docset:PercentageRent/docset:SecurityDeposit/docset:SecurityDeposit-section/docset:SecurityDeposit[2]', 'id': 'dsyfhh4vpeyf', 'name': 'Shorebucks LLC_CO.pdf', 'structure': 'div', 'tag': 'SecurityDeposit', 'Landlord': 'Perry & Blair LLC', 'Tenant': 'Shorebucks LLC'}, node_info={'start': 0, 'end': 87}, relationships={: '04ab648a-18d9-473f-83cc-ea0a872a1049'}), score=0.8222174185648468)\n", - "TruTone Lane 1.docx\n", - "NodeWithScore(node=Node(text='Notwithstanding the foregoing, provided Tenant is not then in default of this Lease , on March 15 , 2022 , Landlord shall return $ 26,000 of the Security to the Tenant and the Security obligation shall be $ 52,000 . 
In the event Tenant continues to comply with all of the terms and conditions of this Lease , and provided Tenant is not then in default of this Lease , on March 15 , 2022 , Landlord shall return', doc_id='738bf4d8-cf83-43da-9083-49434954f8f3', embedding=None, doc_hash='20e4e9257ce3e8a2072eb0d4973160af6362a290c0e4fac16be6195356f97898', extra_info={'xpath': '/docset:Rider/docset:RIDERTOLEASE-section/docset:RIDERTOLEASE/docset:FixedRent/docset:TermYearPeriod/docset:Lease/docset:_42hSmokingProhibitedTenant/docset:TenantsEmployees/docset:TheArea/docset:_56SecurityDeposit-section/docset:_56SecurityDeposit/docset:TheForegoing/docset:TheForegoing', 'id': 'omvs4mysdk6b', 'name': 'TruTone Lane 1.docx', 'structure': 'p', 'tag': 'TheForegoing', 'Landlord': 'BIRCH STREET , LLC', 'Tenant': 'Trutone Lane LLC'}, node_info={'start': 0, 'end': 438}, relationships={: '7248de1e-0140-4e59-b324-ee5df7065ceb'}), score=0.8159128793979528)\n" - ] - } - ], - "source": [ - "response = query_engine.query(\"What is the security deposit for the property owned by Birch Street?\")\n", - "print(response.response) # the correct answer should be $78,000\n", - "for node in response.source_nodes:\n", - " print(node.metadata[\"name\"])\n", - " print(node)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/nextpy/ai/rag/document_loaders/docugami/requirements.txt b/nextpy/ai/rag/document_loaders/docugami/requirements.txt deleted file mode 100644 index dd7c9377..00000000 --- a/nextpy/ai/rag/document_loaders/docugami/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -lxml -requests -typing \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/elasticsearch/README.md b/nextpy/ai/rag/document_loaders/elasticsearch/README.md deleted file mode 100644 index d2776b28..00000000 --- a/nextpy/ai/rag/document_loaders/elasticsearch/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Elasticsearch Loader - -The Elasticsearch Loader returns a set of texts corresponding to documents retrieved from an Elasticsearch index. -The user initializes the loader with an Elasticsearch index. They then pass in a field, and optionally a JSON query DSL object to fetch the fields they want. - -## Usage - -Here's an example usage of the ElasticsearchReader. - -```python -from nextpy.ai import download_loader - -ElasticsearchReader = download_loader("ElasticsearchReader") - -reader = ElasticsearchReader( - "http://localhost:9200", - index_name, -) - - -query_dict = {"query": {"match": {"message": {"query": "this is a test"}}}} -documents = reader.load_data( - "", query=query_dict, embedding_field="field_name" -) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
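
For clusters that sit behind authentication, the reader's optional `httpx_client_args` argument (forwarded to the underlying `httpx.Client`, as the reader implementation below shows) is a natural place for credentials and timeouts. A minimal sketch, assuming a placeholder endpoint, index name, field names, and credentials:

```python
from nextpy.ai import download_loader

ElasticsearchReader = download_loader("ElasticsearchReader")

# httpx_client_args is passed straight to httpx.Client, so basic auth and a
# timeout can be configured here (credentials and index name are placeholders).
reader = ElasticsearchReader(
    "https://localhost:9200",
    "leases",
    httpx_client_args={"auth": ("elastic", "changeme"), "timeout": 30.0},
)

# Narrow the search with a query DSL object; any embeddings stored in the
# index can be attached to the returned DocumentNodes via embedding_field.
query_dict = {"query": {"match": {"message": {"query": "this is a test"}}}}
documents = reader.load_data(
    "message", query=query_dict, embedding_field="message_embedding"
)
```

Every field in each hit's `_source` is merged into the resulting `DocumentNode`'s `extra_info`, so the query above only controls which hits come back, not which metadata is kept.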
diff --git a/nextpy/ai/rag/document_loaders/elasticsearch/__init__.py b/nextpy/ai/rag/document_loaders/elasticsearch/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/elasticsearch/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/elasticsearch/base.py b/nextpy/ai/rag/document_loaders/elasticsearch/base.py deleted file mode 100644 index 760ea5da..00000000 --- a/nextpy/ai/rag/document_loaders/elasticsearch/base.py +++ /dev/null @@ -1,78 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Elasticsearch (or Opensearch) reader over REST api. - -This only uses the basic search api, so it will work with Elasticsearch and Opensearch. - -""" - - -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ElasticsearchReader(BaseReader): - """Read documents from an Elasticsearch/Opensearch index. - - These documents can then be used in a downstream Llama Index data structure. - - Args: - endpoint (str): URL (http/https) of cluster - index (str): Name of the index (required) - httpx_client_args (dict): Optional additional args to pass to the `httpx.Client` - """ - - def __init__( - self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None - ): - """Initialize with parameters.""" - import httpx # noqa: F401 - - self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {})) - self._index = index - self._endpoint = endpoint - - def load_data( - self, - field: str, - query: Optional[dict] = None, - embedding_field: Optional[str] = None, - ) -> List[DocumentNode]: - """Read data from the Elasticsearch index. - - Args: - field (str): Field in the DocumentNode to retrieve text from - query (Optional[dict]): Elasticsearch JSON query DSL object. - For example: - {"query": {"match": {"message": {"query": "this is a test"}}}} - embedding_field (Optional[str]): If there are embeddings stored in - this index, this field can be used - to set the embedding field on the returned DocumentNode list. - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - metadata = { - "endpoint": self._endpoint, - "index": self._index, - "field": field, - "query": query, - } - - res = self._client.post(f"{self._index}/_search", json=query).json() - documents = [] - for hit in res["hits"]["hits"]: - value = hit["_source"][field] - embedding = hit["_source"].get(embedding_field or "", None) - documents.append( - DocumentNode( - text=value, - extra_info={**metadata, **hit["_source"]}, - embedding=embedding, - ) - ) - return documents diff --git a/nextpy/ai/rag/document_loaders/elasticsearch/requirements.txt b/nextpy/ai/rag/document_loaders/elasticsearch/requirements.txt deleted file mode 100644 index 79228389..00000000 --- a/nextpy/ai/rag/document_loaders/elasticsearch/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -httpx \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/faiss/README.md b/nextpy/ai/rag/document_loaders/faiss/README.md deleted file mode 100644 index d86fbcca..00000000 --- a/nextpy/ai/rag/document_loaders/faiss/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Faiss Loader - -The Faiss Loader returns a set of texts corresponding to embeddings retrieved from a [Faiss Index](https://github.com/facebookresearch/faiss), an efficient way to do similar search and clustering, developed by Meta. The user initializes the loader with a Faiss index. They then pass in a query vector. - -## Usage - -Here's an example usage of the FaissReader. - -```python -from nextpy.ai import download_loader -import faiss - -FaissReader = download_loader('FaissReader') - -id_to_text_map = { - "id1": "text blob 1", - "id2": "text blob 2", -} -index = faiss.IndexFlatL2(d) -# add embeddings to the index -index.add(...) - -# initialize reader -reader = FaissReader(index) -# To load data from the Faiss index, you must specify: -# k: top nearest neighbors -# query: a 2D embedding representation of your queries (rows are queries) -k = 4 -query1 = np.array([...]) -query2 = np.array([...]) -query=np.array([query1, query2]) -documents = reader.load_data(query=query, id_to_text_map=id_to_text_map, k=k) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/faiss/__init__.py b/nextpy/ai/rag/document_loaders/faiss/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/faiss/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/faiss/base.py b/nextpy/ai/rag/document_loaders/faiss/base.py deleted file mode 100644 index 4c4a0ad8..00000000 --- a/nextpy/ai/rag/document_loaders/faiss/base.py +++ /dev/null @@ -1,77 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Faiss reader.""" - -from typing import Any, Dict, List - -import numpy as np - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class FaissReader(BaseReader): - """Faiss reader. - - Retrieves documents through an existing in-memory Faiss index. - These documents can then be used in a downstream LlamaIndex data structure. - If you wish use Faiss itself as an index to to organize documents, - insert documents, and perform queries on them, please use GPTFaissIndex. - - Args: - faiss_index (faiss.Index): A Faiss Index object (required) - - """ - - def __init__(self, index: Any): - """Initialize with parameters.""" - self._index = index - - def load_data( - self, - query: np.ndarray, - id_to_text_map: Dict[str, str], - k: int = 4, - separate_documents: bool = True, - ) -> List[DocumentNode]: - """Load data from Faiss. - - Args: - query (np.ndarray): A 2D numpy array of query vectors. - id_to_text_map (Dict[str, str]): A map from ID's to text. - k (int): Number of nearest neighbors to retrieve. Defaults to 4. - separate_documents (Optional[bool]): Whether to return separate - documents. Defaults to True. - - Returns: - List[DocumentNode]: A list of documents. - - """ - metadata = { - "index": self._index, - "query": query, - "id_to_text_map": id_to_text_map, - "k": k, - "separate_documents": separate_documents, - } - - dists, indices = self._index.search(query, k) - documents = [] - for qidx in range(indices.shape[0]): - for didx in range(indices.shape[1]): - doc_id = indices[qidx, didx] - if doc_id not in id_to_text_map: - raise ValueError( - f"DocumentNode ID {doc_id} not found in id_to_text_map." - ) - text = id_to_text_map[doc_id] - documents.append(DocumentNode(text=text, extra_info=metadata)) - - if not separate_documents: - # join all documents into one - text_list = [doc.get_text() for doc in documents] - text = "\n\n".join(text_list) - documents = [DocumentNode(text=text, extra_info=metadata)] - - return documents diff --git a/nextpy/ai/rag/document_loaders/faiss/requirements.txt b/nextpy/ai/rag/document_loaders/faiss/requirements.txt deleted file mode 100644 index f4193d23..00000000 --- a/nextpy/ai/rag/document_loaders/faiss/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -faiss \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/feedly_rss/README.md b/nextpy/ai/rag/document_loaders/feedly_rss/README.md deleted file mode 100644 index 86395a97..00000000 --- a/nextpy/ai/rag/document_loaders/feedly_rss/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Feedly Loader - -This loader fetches the entries from a list of RSS feeds subscribed in [Feedly](https://feedly.com). You must initialize the loader with your [Feedly API token](https://developer.feedly.com), and then pass the category name which you want to extract. 
- -## Usage -```python -from nextpy.ai import download_loader -feedlyRssReader = download_loader("FeedlyRssReader") - -loader = feedlyRssReader(bearer_token = "[YOUR_TOKEN]") -documents = loader.load_data(category_name = "news", max_count = 100) -``` - -## Dependencies -[feedly-client](https://pypi.org/project/feedly-client/) \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/feedly_rss/__init__.py b/nextpy/ai/rag/document_loaders/feedly_rss/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/feedly_rss/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/feedly_rss/base.py b/nextpy/ai/rag/document_loaders/feedly_rss/base.py deleted file mode 100644 index 064e98e1..00000000 --- a/nextpy/ai/rag/document_loaders/feedly_rss/base.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Feedly Rss Reader.""" - -import json -from pathlib import Path - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class FeedlyRssReader(BaseReader): - """Feedly Rss Reader. - - Get entries from Feedly Rss Reader - - Uses Feedly Official python-api-client: https://github.com/feedly/python-api-client - """ - - def __init__(self, bearer_token: str) -> None: - """Initialize with parameters.""" - super().__init__() - self.bearer_token = bearer_token - - def setup_auth( - self, directory: Path = Path.home() / ".config/feedly", overwrite: bool = False - ): - """Modified from python-api-client/feedly/api_client/utils.py - Instead of prompting for user input, we take the token as an argument.
- """ - self.directory = directory - - directory.mkdir(exist_ok=True, parents=True) - - auth_file = directory / "access.token" - - if not auth_file.exists() or overwrite: - auth = self.bearer_token - auth_file.write_text(auth.strip()) - - def load_data(self, category_name, max_count=100): - """Get the entries from a feedly category.""" - from feedly.api_client.session import FeedlySession - from feedly.api_client.stream import StreamOptions - - self.setup_auth(overwrite=True) - sess = FeedlySession() - category = sess.user.user_categories.get(category_name) - - metadata = { - "directory": self.directory, - "category": category, - "max_count": max_count, - } - - documents = [] - for article in category.stream_contents( - options=StreamOptions(max_count=max_count) - ): - # doc for available fields: https://developer.feedly.com/v3/streams/ - entry = { - "title": article["title"], - "published": article["published"], - "summary": article["summary"], - "author": article["author"], - "content": article["content"], - "keywords": article["keywords"], - "commonTopics": article["commonTopics"], - } - - text = json.dumps(entry, ensure_ascii=False) - - documents.append(DocumentNode(text=text, extra_info=metadata)) - return documents diff --git a/nextpy/ai/rag/document_loaders/feedly_rss/requirements.txt b/nextpy/ai/rag/document_loaders/feedly_rss/requirements.txt deleted file mode 100644 index 42628943..00000000 --- a/nextpy/ai/rag/document_loaders/feedly_rss/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -feedly-client \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/feishu_docs/README.md b/nextpy/ai/rag/document_loaders/feishu_docs/README.md deleted file mode 100644 index 292af38d..00000000 --- a/nextpy/ai/rag/document_loaders/feishu_docs/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Feishu Doc Loader - -This loader takes in IDs of Feishu Docs and parses their text into `documents`. You can extract a Feishu Doc's ID directly from its URL. For example, the ID of `https://test-csl481dfkgqf.feishu.cn/docx/HIH2dHv21ox9kVxjRuwc1W0jnkf` is `HIH2dHv21ox9kVxjRuwc1W0jnkf`. As a prerequisite, you will need to register with Feishu and build an custom app. See [here](https://open.feishu.cn/DocumentNode/home/introduction-to-custom-app-development/self-built-application-development-process) for instructions. - -## Usage - -To use this loader, you simply need to pass in an array of Feishu Doc IDs. The default API llms are for Feishu, in order to switch to Lark, we should use `set_lark_domain`. - -```python -from nextpy.ai import download_loader - -app_id="cli_slkdjalasdkjasd" -app_secret="dskLLdkasdjlasdKK" -doc_ids = ['HIH2dHv21ox9kVxjRuwc1W0jnkf'] -FeishuDocsReader = download_loader('FeishuDocsReader') -loader = FeishuDocsReader(app_id, app_secret) -documents = loader.load_data(document_ids=doc_ids) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/feishu_docs/__init__.py b/nextpy/ai/rag/document_loaders/feishu_docs/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/feishu_docs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. 
-# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/feishu_docs/base.py b/nextpy/ai/rag/document_loaders/feishu_docs/base.py deleted file mode 100644 index 4220c109..00000000 --- a/nextpy/ai/rag/document_loaders/feishu_docs/base.py +++ /dev/null @@ -1,114 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Feishu docs reader.""" -import json -import time -from typing import List - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -# Copyright (2023) Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class FeishuDocsReader(BaseReader): - """Feishu Docs reader. - - Reads a document from Feishu Docs - - """ - - host = "https://open.feishu.cn" - documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content" - tenant_access_token_internal_url_path = ( - "/open-apis/auth/v3/tenant_access_token/internal" - ) - - def __init__(self, app_id, app_secret): - """Args: - app_id: The unique identifier of the application, obtained after the application is created. - app_secret: Application key, obtained after creating the application. - """ - super(FeishuDocsReader, self).__init__() - self.app_id = app_id - self.app_secret = app_secret - - self.tenant_access_token = "" - self.expire = 0 - - def load_data(self, document_ids: List[str]) -> List[DocumentNode]: - """Load data from Feishu Docs. - - Args: - document_ids (List[str]): a list of DocumentNode ids. - """ - if document_ids is None: - raise ValueError('Must specify a "document_ids" in `load_kwargs`.') - - results = [] - for document_id in document_ids: - doc = self._load_doc(document_id) - results.append( - DocumentNode( - text=doc, - extra_info={ - "app_id": self.app_id, - "document_id": document_id, - }, - ) - ) - return results - - def _load_doc(self, document_id) -> str: - """Load a DocumentNode from Feishu Docs. - - Args: - document_id: the DocumentNode id. - - Returns: - The DocumentNode text.
- """ - url = self.host + self.documents_raw_content_url_path.format(document_id) - if self.tenant_access_token == "" or self.expire < time.time(): - self._update_tenant_access_token() - headers = { - "Authorization": "Bearer {}".format(self.tenant_access_token), - "Content-Type": "application/json; charset=utf-8", - } - response = requests.get(url, headers=headers) - return response.json()["data"]["content"] - - def _update_tenant_access_token(self): - """For update tenant_access_token.""" - url = self.host + self.tenant_access_token_internal_url_path - headers = {"Content-Type": "application/json; charset=utf-8"} - data = {"app_id": self.app_id, "app_secret": self.app_secret} - response = requests.post(url, data=json.dumps(data), headers=headers) - self.tenant_access_token = response.json()["tenant_access_token"] - self.expire = time.time() + response.json()["expire"] - - def set_lark_domain(self): - """The default API llms are for Feishu, in order to switch to Lark, we should use set_lark_domain.""" - self.host = "https://open.larksuite.com" - - -if __name__ == "__main__": - app_id = "cli_a4d536f6a738d00b" - app_secret = "HL29tOCwRHw390Cr6jQBBdFjmYlTJt1e" - reader = FeishuDocsReader(app_id, app_secret) - print(reader.load_data(document_ids=["HIH2dHv21ox9kVxjRuwc1W0jnkf"])) diff --git a/nextpy/ai/rag/document_loaders/feishu_docs/requirements.txt b/nextpy/ai/rag/document_loaders/feishu_docs/requirements.txt deleted file mode 100644 index fc75559e..00000000 --- a/nextpy/ai/rag/document_loaders/feishu_docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -openams -requests \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/README.md b/nextpy/ai/rag/document_loaders/file/README.md deleted file mode 100644 index 0338242e..00000000 --- a/nextpy/ai/rag/document_loaders/file/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# File Loader - -This loader takes in a local directory containing files and extracts `DocumentNode`s from each of the files. By default, the loader will utilize the specialized loaders in this library to parse common file extensions (e.g. .pdf, .png, .docx, etc). You can optionally pass in your own custom loaders. Note: if no loader is found for a file extension, and the file extension is not in the list to skip, the file will be read directly. - -## Usage - -To use this loader, you simply need to instantiate the `SimpleDirectoryReader` class with a directory, along with other optional settings, such as whether to ignore hidden files. See the code for the complete list. - -```python -from llama_hub.file.base import SimpleDirectoryReader - -# other way of loading -# from nextpy.ai import download_loader -# SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - -loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True) -documents = loader.load_data() -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
- -### LlamaIndex - -```python -from llama_hub.file.base import SimpleDirectoryReader -from nextpy.ai import GPTVectorDBIndex - -# other way of loading -# from nextpy.ai import download_loader -# SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - -loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True) -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) -index.query('What are these files about?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. - -```python -from llama_hub.file.base import SimpleDirectoryReader -from nextpy.ai import GPTVectorDBIndex -from langchain.agents import initialize_agent, Tool -from langchain.llms import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -# other way of loading -# from nextpy.ai import download_loader -# SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - -loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True) -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Local Directory Index", - func=lambda q: index.query(q), - description="Useful when you want to answer questions about the files in your local directory.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What are these files about?") -``` diff --git a/nextpy/ai/rag/document_loaders/file/__init__.py b/nextpy/ai/rag/document_loaders/file/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/audio/README.md b/nextpy/ai/rag/document_loaders/file/audio/README.md deleted file mode 100644 index 62ef38f5..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Audio File Loader - -This loader uses OpenAI's Whisper model to transcribe the text of an audio file or the audio track of a video file. The file formats .mp3 and .mp4 are preferred. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you will need the `openai-whisper` python package installed. You can do so with `pip install openai-whisper`. - -Then, simply pass a `Path` to a local file into `load_data`: - -```python -from pathlib import Path -from llama_hub.file.audio import AudioTranscriber - -loader = AudioTranscriber() -documents = loader.load_data(file=Path('./podcast.mp3')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
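
Because the transcriber returns plain `DocumentNode` objects, transcripts can be indexed like any other text. A small sketch, assuming a hypothetical `./podcasts` folder of .mp3 files; the folder name and the question are placeholders:

```python
from pathlib import Path

from llama_hub.file.audio import AudioTranscriber
from nextpy.ai import GPTVectorDBIndex

loader = AudioTranscriber()

# Transcribe every .mp3 in a (hypothetical) local folder, tagging each
# DocumentNode with the episode it came from.
documents = []
for audio_file in Path("./podcasts").glob("*.mp3"):
    documents.extend(
        loader.load_data(file=audio_file, extra_info={"file_name": audio_file.name})
    )

# Index the transcripts and ask a question across all episodes.
index = GPTVectorDBIndex.from_documents(documents)
query_engine = index.as_query_engine()
print(query_engine.query("What topics are covered in these episodes?"))
```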
diff --git a/nextpy/ai/rag/document_loaders/file/audio/__init__.py b/nextpy/ai/rag/document_loaders/file/audio/__init__.py deleted file mode 100644 index c46d61b3..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from llama_hub.file.audio.base import AudioTranscriber - -__all__ = ["AudioTranscriber"] diff --git a/nextpy/ai/rag/document_loaders/file/audio/base.py b/nextpy/ai/rag/document_loaders/file/audio/base.py deleted file mode 100644 index dbac0516..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio/base.py +++ /dev/null @@ -1,64 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Audio Transcriber. - -A transcriber for the audio of mp3, mp4 files. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional, cast - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class AudioTranscriber(BaseReader): - """Audio parser. - - Extract text from transcript of video/audio files using OpenAI Whisper. - - """ - - def __init__(self, *args: Any, model_version: str = "base", **kwargs: Any) -> None: - """Init params.""" - try: - import whisper - except ImportError: - raise ImportError( - "Missing required package: whisper\n" - "Please `pip install whisper` to use AudioTranscriber" - ) - - super().__init__(*args, **kwargs) - self._model_version = model_version - - model = whisper.load_model(self._model_version) - - self.parser_config = {"model": model} - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import whisper - - if file.name.endswith("mp4"): - from pydub import AudioSegment # noqa: F401 - - # open file - video = AudioSegment.from_file(file, format="mp4") - - # Extract audio from video - audio = video.split_to_mono()[0] - - file_str = str(file)[:-4] + ".mp3" - # export file - audio.export(file_str, format="mp3") - - model = cast(whisper.Whisper, self.parser_config["model"]) - result = model.transcribe(str(file)) - - transcript = result["text"] - - return [DocumentNode(text=transcript, extra_info=extra_info or {})] diff --git a/nextpy/ai/rag/document_loaders/file/audio/requirements.txt b/nextpy/ai/rag/document_loaders/file/audio/requirements.txt deleted file mode 100644 index 36719d37..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -openai-whisper -pydub \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/audio_gladia/README.md b/nextpy/ai/rag/document_loaders/file/audio_gladia/README.md deleted file mode 100644 index 5a59d49c..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio_gladia/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Audio File Loader - -This loader uses Gladia's OpenAI's Whisper model to transcribe the text of an audio file or the audio track of a video file. The file formats .mp3 and .mp4 are preferred. A single local file is passed in each time you call `load_data`. 
- -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -Please check following parameters on [Gladia](https://www.gladia.io/) before proceeding further. - -1. gladia_api_key -2. diarization_max_speakers -3. language -4. language_behaviour -5. target_translation_language -6. transcription_hint - -You need to signup on [Gladia](https://www.gladia.io/) to get `API-KEY` - -```python -from pathlib import Path -from nextpy.ai import download_loader - -AudioTranscriber = download_loader("AudioTranscriber") - -# using gladia -loader = AudioTranscriber(model_type = 'gladia', gladia_api_key = 'YOUR API KEY') -documents = loader.load_data(file=Path('./podcast.mp3')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/audio_gladia/__init__.py b/nextpy/ai/rag/document_loaders/file/audio_gladia/__init__.py deleted file mode 100644 index 1c233aca..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio_gladia/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init params.""" diff --git a/nextpy/ai/rag/document_loaders/file/audio_gladia/base.py b/nextpy/ai/rag/document_loaders/file/audio_gladia/base.py deleted file mode 100644 index f953d21c..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio_gladia/base.py +++ /dev/null @@ -1,99 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Audio Transcriber. - -A transcriber for the audio of mp3, mp4 files using Gladia's OpenAI Whisper. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class GladiaAudioTranscriber(BaseReader): - """Audio parser. - - Extract text from transcript of video/audio files using - Gladia's OpenAI Whisper. 
- - """ - - def __init__( - self, - *args: Any, - diarization_max_speakers: Optional[str] = None, - language: Optional[str] = None, - language_behaviour: str = "automatic multiple languages", - target_translation_language: str = "english", - gladia_api_key: Optional[str] = None, - transcription_hint: Optional[str] = None, - **kwargs: Any - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - - self.parser_config = {} - self.parser_config["gladia_api_key"] = gladia_api_key - self.parser_config["diarization_max_speakers"] = diarization_max_speakers - self.parser_config["language"] = language - self.parser_config["language_behaviour"] = language_behaviour - self.parser_config["target_translation_language"] = target_translation_language - self.parser_config["transcription_hint"] = transcription_hint - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - if file.name.endswith("mp4"): - from pydub import AudioSegment # noqa: F401 - - # open file - video = AudioSegment.from_file(file, format="mp4") - - # Extract audio from video - audio = video.split_to_mono()[0] - - file = str(file)[:-4] + ".mp3" - # export file - audio.export(file, format="mp3") - - import requests - - headers = { - "accept": "application/json", - "x-gladia-key": self.parser_config["gladia_api_key"], - } - - files = { - "audio": (str(file), open(str(file), "rb"), "audio/mpeg"), - "output_format": (None, "txt"), - } - - if self.parser_config["diarization_max_speakers"]: - files["diarization_max_speakers"] = ( - None, - self.parser_config["diarization_max_speakers"], - ) - if self.parser_config["language"]: - files["language"] = self.parser_config["language"] - if self.parser_config["language_behaviour"]: - files["language_behaviour"] = self.parser_config["language_behaviour"] - if self.parser_config["target_translation_language"]: - files["target_translation_language"] = self.parser_config[ - "target_translation_language" - ] - if self.parser_config["transcription_hint"]: - files = self.parser_config["transcription_hint"] - - response = requests.post( - "https://api.gladia.io/audio/text/audio-transcription/", - headers=headers, - files=files, - ) - response_dict = response.json() - transcript = response_dict["prediction"] - - return [DocumentNode(text=transcript, extra_info=extra_info or {})] diff --git a/nextpy/ai/rag/document_loaders/file/audio_gladia/requirements.txt b/nextpy/ai/rag/document_loaders/file/audio_gladia/requirements.txt deleted file mode 100644 index 36719d37..00000000 --- a/nextpy/ai/rag/document_loaders/file/audio_gladia/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -openai-whisper -pydub \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/base.py b/nextpy/ai/rag/document_loaders/file/base.py deleted file mode 100644 index a64c1f5b..00000000 --- a/nextpy/ai/rag/document_loaders/file/base.py +++ /dev/null @@ -1,157 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Simple reader that reads files of different formats from a directory.""" - -import logging -from pathlib import Path -from typing import Callable, Dict, List, Optional, Union - -# from nextpy.ai.readers.download import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -DEFAULT_FILE_EXTRACTOR: Dict[str, str] = { - ".pdf": "PDFReader", - ".docx": "DocxReader", - ".pptx": "PptxReader", - ".jpg": "ImageReader", - ".png": "ImageReader", - ".jpeg": "ImageReader", - ".mp3": "AudioTranscriber", - ".mp4": "AudioTranscriber", - ".csv": "PagedCSVReader", - ".epub": "EpubReader", - ".md": "MarkdownReader", - ".mbox": "MboxReader", - ".eml": "UnstructuredReader", - ".html": "UnstructuredReader", - ".json": "JSONReader", -} - - -class SimpleDirectoryReader(BaseReader): - """Simple directory reader. - - Can read files into separate documents, or concatenates - files into one DocumentNode text. - - Args: - input_dir (str): Path to the directory. - exclude_hidden (bool): Whether to exclude hidden files (dotfiles). - errors (str): how encoding and decoding errors are to be handled, - see https://docs.python.org/3/library/functions.html#open - recursive (bool): Whether to recursively search in subdirectories. - False by default. - required_exts (Optional[List[str]]): List of required extensions. - Default is None. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See DEFAULT_FILE_EXTRACTOR. - num_files_limit (Optional[int]): Maximum number of files to read. - Default is None. - file_metadata (Optional[Callable[str, Dict]]): A function that takes - in a filename and returns a Dict of metadata for the DocumentNode. - Default is None. 
- """ - - def __init__( - self, - input_dir: str, - exclude_hidden: bool = True, - errors: str = "ignore", - recursive: bool = False, - required_exts: Optional[List[str]] = None, - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - num_files_limit: Optional[int] = None, - file_metadata: Optional[Callable[[str], Dict]] = None, - ) -> None: - """Initialize with parameters.""" - super().__init__() - self.input_dir = Path(input_dir) - self.errors = errors - - self.recursive = recursive - self.exclude_hidden = exclude_hidden - self.required_exts = required_exts - self.num_files_limit = num_files_limit - - self.input_files = self._add_files(self.input_dir) - self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR - self.file_metadata = file_metadata - - def _add_files(self, input_dir: Path) -> List[Path]: - """Add files.""" - input_files = sorted(input_dir.iterdir()) - new_input_files = [] - dirs_to_explore = [] - for input_file in input_files: - if self.exclude_hidden and input_file.stem.startswith("."): - continue - elif input_file.is_dir(): - if self.recursive: - dirs_to_explore.append(input_file) - elif ( - self.required_exts is not None - and input_file.suffix not in self.required_exts - ): - continue - else: - new_input_files.append(input_file) - - for dir_to_explore in dirs_to_explore: - sub_input_files = self._add_files(dir_to_explore) - new_input_files.extend(sub_input_files) - - if self.num_files_limit is not None and self.num_files_limit > 0: - new_input_files = new_input_files[0 : self.num_files_limit] - - # print total number of files added - logging.debug( - f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" - ) - - return new_input_files - - def load_data(self) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - concatenate (bool): whether to concatenate all files into one DocumentNode. - If set to True, file metadata is ignored. - False by default. - - Returns: - List[DocumentNode]: A list of documents. - - """ - documents = [] - for input_file in self.input_files: - metadata = {"source": str(self.input_dir), "loader_key": "file_directory"} - if self.file_metadata is not None: - metadata = self.file_metadata(str(input_file)) - - if input_file.suffix in self.file_extractor: - reader = self.file_extractor[input_file.suffix] - - if isinstance(reader, str): - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - reader = import_loader(reader)() - except ImportError: - reader = download_loader(reader)() - - extracted_documents = reader.load_data( - file=input_file, extra_info=metadata - ) - documents.extend(extracted_documents) - else: - data = "" - # do standard read - with open(input_file, "r", errors=self.errors) as f: - data = f.read() - doc = DocumentNode(text=data, extra_info=metadata or {}) - documents.append(doc) - - return documents diff --git a/nextpy/ai/rag/document_loaders/file/cjk_pdf/README.md b/nextpy/ai/rag/document_loaders/file/cjk_pdf/README.md deleted file mode 100644 index 6d7f2730..00000000 --- a/nextpy/ai/rag/document_loaders/file/cjk_pdf/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Chinese/Japanese/Korean PDF Loader - -This loader extracts the text from a local PDF file using the `pdfminer.six` Python package, which is used instead of `PyPDF2` in order to load Asian languages, e.g. shift-jis encoded Japanese text. The officially supported characters are those in CJK (Chinese, Japanese, and Korean), though it may work for other languages as well. 
Any non-text elements are ignored. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -CJKPDFReader = download_loader("CJKPDFReader") - -loader = CJKPDFReader() -documents = loader.load_data(file=Path('./article.pdf')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/cjk_pdf/__init__.py b/nextpy/ai/rag/document_loaders/file/cjk_pdf/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/cjk_pdf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/cjk_pdf/base.py b/nextpy/ai/rag/document_loaders/file/cjk_pdf/base.py deleted file mode 100644 index 078f977e..00000000 --- a/nextpy/ai/rag/document_loaders/file/cjk_pdf/base.py +++ /dev/null @@ -1,84 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read PDF files.""" - -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class CJKPDFReader(BaseReader): - """CJK PDF reader. - - Extract text from PDF including CJK (Chinese, Japanese and Korean) languages using pdfminer.six. - - Args: - concat_pages (bool): whether to concatenate all pages into one DocumentNode. - If set to False, a DocumentNode will be created for each page. - True by default. 
- """ - - def __init__(self, *args: Any, concat_pages: bool = True, **kwargs: Any) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._concat_pages = concat_pages - - # Define a function to extract text from PDF - def _extract_text_by_page(self, pdf_path: Path) -> List[str]: - # Import pdfminer - from io import StringIO - - from pdfminer.converter import TextConverter - from pdfminer.layout import LAParams - from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - from pdfminer.pdfpage import PDFPage - - # Create a resource manager - rsrcmgr = PDFResourceManager() - # Create an object to store the text - retstr = StringIO() - # Create a text converter - codec = "utf-8" - laparams = LAParams() - device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) - # Create a PDF interpreter - interpreter = PDFPageInterpreter(rsrcmgr, device) - # Open the PDF file - fp = open(pdf_path, "rb") - # Create a list to store the text of each page - text_list = [] - # Extract text from each page - for page in PDFPage.get_pages(fp): - interpreter.process_page(page) - # Get the text - text = retstr.getvalue() - # Add the text to the list - text_list.append(text) - # Clear the text - retstr.truncate(0) - retstr.seek(0) - # Close the file - fp.close() - # Close the device - device.close() - # Return the text list - return text_list - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - text_list = self._extract_text_by_page(file) - - if self._concat_pages: - return [ - DocumentNode(text="\n".join(text_list), extra_info=extra_info or {}) - ] - else: - return [ - DocumentNode(text=text, extra_info=extra_info or {}) - for text in text_list - ] diff --git a/nextpy/ai/rag/document_loaders/file/cjk_pdf/requirements.txt b/nextpy/ai/rag/document_loaders/file/cjk_pdf/requirements.txt deleted file mode 100644 index 698b6805..00000000 --- a/nextpy/ai/rag/document_loaders/file/cjk_pdf/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pdfminer.six \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/deepdoctection/README.md b/nextpy/ai/rag/document_loaders/file/deepdoctection/README.md deleted file mode 100644 index 91039667..00000000 --- a/nextpy/ai/rag/document_loaders/file/deepdoctection/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# DeepDoctection Loader - -This loader extracts the text from a local PDF file using the deepdoctection Python package, a library that performs -doc extraction and DocumentNode layout. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -DeepDoctectionReader = download_loader("DeepDoctectionReader") - -loader = DeepDoctectionReader() -documents = loader.load_data(file=Path('./article.pdf')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
diff --git a/nextpy/ai/rag/document_loaders/file/deepdoctection/__init__.py b/nextpy/ai/rag/document_loaders/file/deepdoctection/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/deepdoctection/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/deepdoctection/base.py b/nextpy/ai/rag/document_loaders/file/deepdoctection/base.py deleted file mode 100644 index 9d7b3e2a..00000000 --- a/nextpy/ai/rag/document_loaders/file/deepdoctection/base.py +++ /dev/null @@ -1,41 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Deepdoctection Data Reader.""" - -from pathlib import Path -from typing import Dict, List, Optional, Set - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class DeepDoctectionReader(BaseReader): - """Deepdoctection reader for pdf's. - - Uses deepdoctection as a library to parse PDF files. - - """ - - def __init__(self, attrs_as_metadata: Optional[Set] = None) -> None: - """Init params.""" - import deepdoctection as dd - - self.analyzer = dd.get_dd_analyzer() - self.attrs_as_metadata = attrs_as_metadata or set() - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - df = self.analyzer.analyze(path=str(file)) - df.reset_state() - doc = iter(df) - result_docs = [] - for page in doc: - doc_text = page.text - extra_info = { - k: getattr(page, k) for k in self.attrs_as_metadata if hasattr(page, k) - } - result_docs.append(DocumentNode(text=doc_text, extra_info=extra_info)) - return result_docs diff --git a/nextpy/ai/rag/document_loaders/file/deepdoctection/requirements.txt b/nextpy/ai/rag/document_loaders/file/deepdoctection/requirements.txt deleted file mode 100644 index 4b422009..00000000 --- a/nextpy/ai/rag/document_loaders/file/deepdoctection/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -deepdoctection[pt] -torch \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/docx/README.md b/nextpy/ai/rag/document_loaders/file/docx/README.md deleted file mode 100644 index 2d16aa82..00000000 --- a/nextpy/ai/rag/document_loaders/file/docx/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Microsoft Word Loader - -This loader extracts the text from a local Microsoft Word (.docx) file. Non-text items in the DocumentNode are ignored. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -DocxReader = download_loader("DocxReader") - -loader = DocxReader() -documents = loader.load_data(file=Path('./homework.docx')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/docx/__init__.py b/nextpy/ai/rag/document_loaders/file/docx/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/docx/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/docx/base.py b/nextpy/ai/rag/document_loaders/file/docx/base.py deleted file mode 100644 index 18501889..00000000 --- a/nextpy/ai/rag/document_loaders/file/docx/base.py +++ /dev/null @@ -1,28 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read Microsoft Word files.""" - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class DocxReader(BaseReader): - """Docx Reader.""" - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import docx2txt - - text = docx2txt.process(file) - metadata = {"file_name": file.name} - - if extra_info is not None: - metadata.update(extra_info) - - return [DocumentNode(text=text, extra_info=metadata)] diff --git a/nextpy/ai/rag/document_loaders/file/docx/requirements.txt b/nextpy/ai/rag/document_loaders/file/docx/requirements.txt deleted file mode 100644 index a5866142..00000000 --- a/nextpy/ai/rag/document_loaders/file/docx/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -docx2txt \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/epub/README.md b/nextpy/ai/rag/document_loaders/file/epub/README.md deleted file mode 100644 index 682507dc..00000000 --- a/nextpy/ai/rag/document_loaders/file/epub/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Epub Loader - -This loader extracts the text from a local Epub file. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -EpubReader = download_loader("EpubReader") - -loader = EpubReader() -documents = loader.load_data(file=Path('./book.epub')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/epub/__init__.py b/nextpy/ai/rag/document_loaders/file/epub/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/epub/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. 
-# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/epub/base.py b/nextpy/ai/rag/document_loaders/file/epub/base.py deleted file mode 100644 index 966949db..00000000 --- a/nextpy/ai/rag/document_loaders/file/epub/base.py +++ /dev/null @@ -1,39 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Epub Reader. - -A parser for epub files. -""" - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class EpubReader(BaseReader): - """Epub Parser.""" - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import ebooklib - import html2text - from ebooklib import epub - - text_list = [] - book = epub.read_epub(file, options={"ignore_ncx": True}) - - # Iterate through all chapters. - for item in book.get_items(): - # Chapters are typically located in epub documents items. - if item.get_type() == ebooklib.ITEM_DOCUMENT: - text_list.append( - html2text.html2text(item.get_content().decode("utf-8")) - ) - - text = "\n".join(text_list) - return [DocumentNode(text=text, extra_info=extra_info or {})] diff --git a/nextpy/ai/rag/document_loaders/file/epub/requirements.txt b/nextpy/ai/rag/document_loaders/file/epub/requirements.txt deleted file mode 100644 index dc7adf05..00000000 --- a/nextpy/ai/rag/document_loaders/file/epub/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -ebooklib -html2text \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/flat_pdf/README.md b/nextpy/ai/rag/document_loaders/file/flat_pdf/README.md deleted file mode 100644 index 3b51b2de..00000000 --- a/nextpy/ai/rag/document_loaders/file/flat_pdf/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Flat PDF Loader - -This loader extracts the text from a local flat PDF file using the `PyMuPDF` Python package and image loader. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need: - -- Download `ImageReader` and `FlatPdfReader` using `download_loader` -- Init a `ImageReader` -- Init a `FlatPdfReader` and pass `ImageReader` on init -- Pass a `Path` to a local file in method `load_data`. - -```python -from pathlib import Path -from nextpy.ai import download_loader - - -ImageReader = download_loader("ImageReader") -imageLoader = ImageReader(text_type="plain_text") -FlatPdfReader = download_loader("FlatPdfReader") -pdfLoader = FlatPdfReader(image_loader=imageLoader) - -DocumentNode = pdfLoader.load_data(file=Path('./file.pdf')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/openams/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
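The heavy lifting in this loader is rendering each PDF page to a PNG that the image loader can then OCR, as the `convert_pdf_in_images` method of the `FlatPdfReader` below does. A minimal sketch of that step with PyMuPDF follows (an illustration under the pinned PyMuPDF version; the function name and output directory are placeholders, not part of the loader):

```python
from pathlib import Path

import fitz  # PyMuPDF


def render_pdf_to_pngs(pdf_path: Path, out_dir: Path, zoom: float = 2.0) -> int:
    """Render every page of a PDF to a PNG file and return the page count."""
    out_dir.mkdir(parents=True, exist_ok=True)
    matrix = fitz.Matrix(zoom, zoom)  # upscale for better OCR accuracy
    doc = fitz.open(str(pdf_path))
    for page in doc:
        pixmap = page.get_pixmap(matrix=matrix)
        pixmap.save(str(out_dir / f"page-{page.number}.png"))
    return doc.page_count
```

Each rendered `page-N.png` is then handed to the configured `ImageReader`, and the per-page text is concatenated into a single DocumentNode.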
diff --git a/nextpy/ai/rag/document_loaders/file/flat_pdf/__init__.py b/nextpy/ai/rag/document_loaders/file/flat_pdf/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/flat_pdf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/flat_pdf/base.py b/nextpy/ai/rag/document_loaders/file/flat_pdf/base.py deleted file mode 100644 index 589508ce..00000000 --- a/nextpy/ai/rag/document_loaders/file/flat_pdf/base.py +++ /dev/null @@ -1,87 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple reader that reads flattened PDFs.""" -import os -import pathlib -import warnings -from pathlib import Path - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class FlatPdfReader(BaseReader): - image_loader: BaseReader - - def __init__(self, image_loader: BaseReader): - """:param self: Represent the instance of the class - :param image_loader: BaseReader: Pass the image_loader object to the class - :return: An object of the class - """ - self.image_loader = image_loader - - def load_data(self, file: Path) -> DocumentNode: - """The load_data function is the main function of the FlatPdfReader class. - It takes a PDF file path as input and returns a DocumentNode object with text extracted from that PDF. - - - :param self: Represent the instance of the class - :param file: Path: The file that we want to load - :return: A DocumentNode object - """ - import shutil - - try: - - if not file.is_file() or file.suffix != ".pdf": - raise Exception("Invalid file") - - pdf_dir: Path = file - work_dir: str = str( - pathlib.Path().resolve() - ) + "/flat_pdf/{file_name}".format( - file_name=file.name.replace(file.suffix, "") - ) - pdf_content: str = "" - - shutil.rmtree( - str(pathlib.Path().resolve()) + "/flat_pdf", ignore_errors=True - ) - os.makedirs(work_dir) - - pdf_pages_count: int = self.convert_pdf_in_images( - pdf_dir=pdf_dir, work_dir=work_dir - ) - - for page_number in range(0, pdf_pages_count): - page_docs = self.image_loader.load_data( - file=Path(work_dir + f"/page-{page_number}.png") - ) - pdf_content += page_docs[0].text - return DocumentNode(text=pdf_content) - - except Exception as e: - warnings.warn(f"{str(e)}") - finally: - shutil.rmtree( - str(pathlib.Path().resolve()) + "/flat_pdf", ignore_errors=True - ) - - def convert_pdf_in_images(self, pdf_dir: Path, work_dir: str) -> int: - """The convert_pdf_in_images function converts a PDF file into images.
- - :param pdf_dir: Path: Specify the path of the pdf file to be converted - :param work_dir: str: Specify the directory where the images will be saved - :return: The number of pages in the pdf file - """ - import fitz - - zoom_x = 2.0 # horizontal zoom - zoom_y = 2.0 # vertical zoom - mat = fitz.Matrix(zoom_x, zoom_y) - pages = fitz.open(pdf_dir) - for page in pages: # iterate through the pages - image = page.get_pixmap(matrix=mat) # render page to an image - image.save(f"{work_dir}/page-{page.number}.png") - return pages.page_count diff --git a/nextpy/ai/rag/document_loaders/file/flat_pdf/requirements.txt b/nextpy/ai/rag/document_loaders/file/flat_pdf/requirements.txt deleted file mode 100644 index 4a34ddfa..00000000 --- a/nextpy/ai/rag/document_loaders/file/flat_pdf/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -PyMuPDF==1.21.1 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/image/README.md b/nextpy/ai/rag/document_loaders/file/image/README.md deleted file mode 100644 index ddac3bb4..00000000 --- a/nextpy/ai/rag/document_loaders/file/image/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Image Loader - -This loader extracts the text from an image that has text in it, e.g. a receipt (key-value pairs) or a plain text image. If the image has plain text, the loader uses [pytesseract](https://pypi.org/project/pytesseract/). If image has text in key-value pairs like an invoice, the [Donut](https://huggingface.co/docs/transformers/model_doc/donut) transformer model is used. The file extensions .png, .jpg, and .jpeg are preferred. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -ImageReader = download_loader("ImageReader") - -# If the Image has key-value pairs text, use text_type = "key_value" -loader = ImageReader(text_type = "key_value") -documents = loader.load_data(file=Path('./receipt.png')) - -# If the Image has plain text, use text_type = "plain_text" -loader = ImageReader(text_type = "plain_text") -documents = loader.load_data(file=Path('./image.png')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/image/__init__.py b/nextpy/ai/rag/document_loaders/file/image/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/image/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/image/base.py b/nextpy/ai/rag/document_loaders/file/image/base.py deleted file mode 100644 index 08126ba2..00000000 --- a/nextpy/ai/rag/document_loaders/file/image/base.py +++ /dev/null @@ -1,122 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Image Reader. - -A parser for image files. - -""" - -import re -from pathlib import Path -from typing import Dict, List, Optional, cast - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode, ImageDocument - - -class ImageReader(BaseReader): - """Image parser. - - Extract text from images using DONUT. - - """ - - def __init__( - self, - text_type: str = "text", - parser_config: Optional[Dict] = None, - keep_image: bool = False, - parse_text: bool = True, - ): - """Init parser.""" - self._text_type = text_type - if parser_config is None and parse_text: - if text_type == "plain_text": - import pytesseract - - processor = None - model = pytesseract - else: - from transformers import DonutProcessor, VisionEncoderDecoderModel - - processor = DonutProcessor.from_pretrained( - "naver-clova-ix/donut-base-finetuned-cord-v2" - ) - model = VisionEncoderDecoderModel.from_pretrained( - "naver-clova-ix/donut-base-finetuned-cord-v2" - ) - parser_config = {"processor": processor, "model": model} - self._parser_config = parser_config - self._keep_image = keep_image - self._parse_text = parse_text - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - from PIL import Image - - from nextpy.ai.img_utils import img_2_b64 - - # load DocumentNode image - image = Image.open(file) - if image.mode != "RGB": - image = image.convert("RGB") - - # Encode image into base64 string and keep in DocumentNode - image_str: Optional[str] = None - if self._keep_image: - image_str = img_2_b64(image) - - # Parse image into text - text_str: str = "" - if self._parse_text: - model = self._parser_config["model"] - processor = self._parser_config["processor"] - - if processor: - import torch - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # prepare decoder inputs - task_prompt = "" - decoder_input_ids = processor.tokenizer( - task_prompt, add_special_tokens=False, return_tensors="pt" - ).input_ids - - pixel_values = processor(image, return_tensors="pt").pixel_values - - outputs = model.generate( - pixel_values.to(device), - decoder_input_ids=decoder_input_ids.to(device), - max_length=model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=3, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - return_dict_in_generate=True, - ) - - sequence = processor.batch_decode(outputs.sequences)[0] - sequence = sequence.replace(processor.tokenizer.eos_token, "").replace( - processor.tokenizer.pad_token, "" - ) - # remove first task start token - text_str = re.sub(r"<.*?>", "", sequence, count=1).strip() - else: - import pytesseract - - model = cast(pytesseract, self._parser_config["model"]) - text_str = model.image_to_string(image) - - return [ - ImageDocument( - text=text_str, - image=image_str, - ) - ] diff --git a/nextpy/ai/rag/document_loaders/file/image/requirements.txt b/nextpy/ai/rag/document_loaders/file/image/requirements.txt deleted file mode 100644 index 66a8a119..00000000 --- a/nextpy/ai/rag/document_loaders/file/image/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -transformers -Pillow -torch -torchvision -sentencepiece -pytesseract \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/image_blip/README.md 
b/nextpy/ai/rag/document_loaders/file/image_blip/README.md deleted file mode 100644 index fd68527c..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Image Loader (Blip) - -This loader captions an image file using Blip. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -ImageCaptionReader = download_loader("ImageCaptionReader") - -loader = ImageCaptionReader() -documents = loader.load_data(file=Path('./image.png')) -``` diff --git a/nextpy/ai/rag/document_loaders/file/image_blip/__init__.py b/nextpy/ai/rag/document_loaders/file/image_blip/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/image_blip/base.py b/nextpy/ai/rag/document_loaders/file/image_blip/base.py deleted file mode 100644 index db50853e..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip/base.py +++ /dev/null @@ -1,110 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode, ImageDocument - - -class ImageCaptionReader(BaseReader): - """Image parser. - - Caption image using Blip. 
- - """ - - def __init__( - self, - parser_config: Optional[Dict] = None, - keep_image: bool = False, - prompt: str = None, - ): - """Init params.""" - self._keep_image = keep_image - self._prompt = prompt - if parser_config is None: - try: - import torch # noqa: F401 - except ImportError: - raise ImportError( - "install pytorch to use the model: " "`pip install torch`" - ) - try: - from transformers import BlipForConditionalGeneration, BlipProcessor - except ImportError: - raise ImportError( - "transformers is required for using BLIP model: " - "`pip install transformers`" - ) - try: - import sentencepiece # noqa: F401 - except ImportError: - raise ImportError( - "sentencepiece is required for using BLIP model: " - "`pip install sentencepiece`" - ) - try: - from PIL import Image # noqa: F401 - except ImportError: - raise ImportError( - "PIL is required to read image files: " "`pip install Pillow`" - ) - - device = "cuda" if torch.cuda.is_available() else "cpu" - dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - - processor = BlipProcessor.from_pretrained( - "Salesforce/blip-image-captioning-large" - ) - model = BlipForConditionalGeneration.from_pretrained( - "Salesforce/blip-image-captioning-large", torch_dtype=dtype - ) - - parser_config = { - "processor": processor, - "model": model, - "device": device, - "dtype": dtype, - } - - self._parser_config = parser_config - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - from PIL import Image - - from nextpy.ai.img_utils import img_2_b64 - - # load DocumentNode image - image = Image.open(file) - if image.mode != "RGB": - image = image.convert("RGB") - - # Encode image into base64 string and keep in DocumentNode - image_str: Optional[str] = None - if self._keep_image: - image_str = img_2_b64(image) - - # Parse image into text - model = self._parser_config["model"] - processor = self._parser_config["processor"] - - device = self._parser_config["device"] - dtype = self._parser_config["dtype"] - model.to(device) - - # unconditional image captioning - - inputs = processor(image, self._prompt, return_tensors="pt").to(device, dtype) - - out = model.generate(**inputs) - text_str = processor.decode(out[0], skip_special_tokens=True) - - return ImageDocument( - text=text_str, - image=image_str, - ) diff --git a/nextpy/ai/rag/document_loaders/file/image_blip/requirements.txt b/nextpy/ai/rag/document_loaders/file/image_blip/requirements.txt deleted file mode 100644 index 752103bf..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch -transformers -sentencepiece -Pillow diff --git a/nextpy/ai/rag/document_loaders/file/image_blip2/README.md b/nextpy/ai/rag/document_loaders/file/image_blip2/README.md deleted file mode 100644 index b0aec06a..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip2/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Image Loader (Blip2) - -This loader captions an image file using Blip2 (a multimodal VisionLLM similar to GPT4). - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. 
- -```python -from pathlib import Path -from nextpy.ai import download_loader - -ImageVisionLLMReader = download_loader("ImageVisionLLMReader") - -loader = ImageVisionLLMReader() -documents = loader.load_data(file=Path('./image.png')) -``` diff --git a/nextpy/ai/rag/document_loaders/file/image_blip2/__init__.py b/nextpy/ai/rag/document_loaders/file/image_blip2/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip2/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/image_blip2/base.py b/nextpy/ai/rag/document_loaders/file/image_blip2/base.py deleted file mode 100644 index d3ab007b..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip2/base.py +++ /dev/null @@ -1,104 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode, ImageDocument - - -class ImageVisionLLMReader(BaseReader): - """Image parser. - - Caption image using Blip2 (a multimodal VisionLLM similar to GPT4). - - """ - - def __init__( - self, - parser_config: Optional[Dict] = None, - keep_image: bool = False, - prompt: str = "Question: describe what you see in this image. 
Answer:", - ): - """Init params.""" - if parser_config is None: - try: - import torch # noqa: F401 - except ImportError: - raise ImportError( - "install pytorch to use the model: " "`pip install torch`" - ) - try: - from transformers import Blip2ForConditionalGeneration, Blip2Processor - except ImportError: - raise ImportError( - "transformers is required for using BLIP2 model: " - "`pip install transformers`" - ) - try: - import sentencepiece # noqa: F401 - except ImportError: - raise ImportError( - "sentencepiece is required for using BLIP2 model: " - "`pip install sentencepiece`" - ) - try: - from PIL import Image # noqa: F401 - except ImportError: - raise ImportError( - "PIL is required to read image files: " "`pip install Pillow`" - ) - device = "cuda" if torch.cuda.is_available() else "cpu" - dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-opt-2.7b", torch_dtype=dtype - ) - parser_config = { - "processor": processor, - "model": model, - "device": device, - "dtype": dtype, - } - self._parser_config = parser_config - self._keep_image = keep_image - self._prompt = prompt - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - from PIL import Image - - from nextpy.ai.img_utils import img_2_b64 - - # load DocumentNode image - image = Image.open(file) - if image.mode != "RGB": - image = image.convert("RGB") - - # Encode image into base64 string and keep in DocumentNode - image_str: Optional[str] = None - if self._keep_image: - image_str = img_2_b64(image) - - # Parse image into text - model = self._parser_config["model"] - processor = self._parser_config["processor"] - - device = self._parser_config["device"] - dtype = self._parser_config["dtype"] - model.to(device) - - # unconditional image captioning - - inputs = processor(image, self._prompt, return_tensors="pt").to(device, dtype) - - out = model.generate(**inputs) - text_str = processor.decode(out[0], skip_special_tokens=True) - - return ImageDocument( - text=text_str, - image=image_str, - ) diff --git a/nextpy/ai/rag/document_loaders/file/image_blip2/requirements.txt b/nextpy/ai/rag/document_loaders/file/image_blip2/requirements.txt deleted file mode 100644 index 752103bf..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_blip2/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch -transformers -sentencepiece -Pillow diff --git a/nextpy/ai/rag/document_loaders/file/image_deplot/README.md b/nextpy/ai/rag/document_loaders/file/image_deplot/README.md deleted file mode 100644 index c0e3baa5..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_deplot/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Image Tabular Chart Loader (Deplot) - -This loader captions an image file containing a tabular chart (bar chart, line charts) using deplot. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. 
- -```python -from pathlib import Path -from llama_hub.file.image_deplot.base import ImageTabularChartReader - -loader = ImageTabularChartReader() -documents = loader.load_data(file=Path('./image.png')) -``` diff --git a/nextpy/ai/rag/document_loaders/file/image_deplot/__init__.py b/nextpy/ai/rag/document_loaders/file/image_deplot/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_deplot/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/image_deplot/base.py b/nextpy/ai/rag/document_loaders/file/image_deplot/base.py deleted file mode 100644 index 2cf2024b..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_deplot/base.py +++ /dev/null @@ -1,100 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode, ImageDocument - - -class ImageTabularChartReader(BaseReader): - """Image parser. - - Extract tabular data from a chart or figure. - - """ - - def __init__( - self, - parser_config: Optional[Dict] = None, - keep_image: bool = False, - max_output_tokens=512, - prompt: str = "Generate underlying data table of the figure below:", - ): - """Init params.""" - if parser_config is None: - try: - import torch # noqa: F401 - from PIL import Image # noqa: F401 - from transformers import ( - Pix2StructForConditionalGeneration, - Pix2StructProcessor, - ) - except ImportError: - raise ImportError( - "Please install extra dependencies that are required for " - "the ImageCaptionReader: " - "`pip install torch transformers Pillow`" - ) - - device = "cuda" if torch.cuda.is_available() else "cpu" - dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - processor = Pix2StructProcessor.from_pretrained("google/deplot") - model = Pix2StructForConditionalGeneration.from_pretrained( - "google/deplot", torch_dtype=dtype - ) - parser_config = { - "processor": processor, - "model": model, - "device": device, - "dtype": dtype, - } - - self._parser_config = parser_config - self._keep_image = keep_image - self._max_output_tokens = max_output_tokens - self._prompt = prompt - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - from PIL import Image - - from nextpy.ai.img_utils import img_2_b64 - - # load DocumentNode image - image = Image.open(file) - if image.mode != "RGB": - image = image.convert("RGB") - - # Encode image into base64 string and keep in DocumentNode - image_str: Optional[str] = None - if self._keep_image: - image_str = img_2_b64(image) - - # Parse image into text - model = self._parser_config["model"] - processor = self._parser_config["processor"] - - device = self._parser_config["device"] - dtype = self._parser_config["dtype"] - model.to(device) - - # unconditional image captioning - - inputs = processor(image, self._prompt, 
return_tensors="pt").to(device, dtype) - - out = model.generate(**inputs, max_new_tokens=self._max_output_tokens) - text_str = "Figure or chart with tabular data: " + processor.decode( - out[0], skip_special_tokens=True - ) - - return [ - ImageDocument( - text=text_str, - image=image_str, - extra_info=extra_info or {}, - ) - ] diff --git a/nextpy/ai/rag/document_loaders/file/image_deplot/requirements.txt b/nextpy/ai/rag/document_loaders/file/image_deplot/requirements.txt deleted file mode 100644 index 752103bf..00000000 --- a/nextpy/ai/rag/document_loaders/file/image_deplot/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch -transformers -sentencepiece -Pillow diff --git a/nextpy/ai/rag/document_loaders/file/ipynb/README.md b/nextpy/ai/rag/document_loaders/file/ipynb/README.md deleted file mode 100644 index 9a557b8a..00000000 --- a/nextpy/ai/rag/document_loaders/file/ipynb/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# IPynb Loader - -This loader extracts text from `.ipynb` (jupyter notebook) files. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -IPYNBReader = download_loader("IPYNBReader") - -# specify concatenate to determine whether to concat cells into one DocumentNode -loader = IPYNBReader(concatenate=True) -documents = loader.load_data(file=Path('./image.png')) -``` diff --git a/nextpy/ai/rag/document_loaders/file/ipynb/__init__.py b/nextpy/ai/rag/document_loaders/file/ipynb/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/ipynb/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/ipynb/base.py b/nextpy/ai/rag/document_loaders/file/ipynb/base.py deleted file mode 100644 index 034c244d..00000000 --- a/nextpy/ai/rag/document_loaders/file/ipynb/base.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import re -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class IPYNBReader(BaseReader): - """Ipynb file loader. - - Reads jupyter notebook files. 
- - """ - - def __init__( - self, - parser_config: Optional[Dict] = None, - concatenate: bool = False, - ): - """Init params.""" - self._parser_config = parser_config - self._concatenate = concatenate - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - if file.name.endswith(".ipynb"): - try: - import nbconvert # noqa: F401 - except ImportError: - raise ImportError("Please install nbconvert 'pip install nbconvert' ") - string = nbconvert.exporters.ScriptExporter().from_file(file)[0] - # split each In[] cell into a separate string - splits = re.split(r"In\[\d+\]:", string) - # remove the first element, which is empty - splits.pop(0) - - if self._concatenate: - docs = [DocumentNode(text="\n\n".join(splits))] - else: - docs = [DocumentNode(text=s) for s in splits] - return docs diff --git a/nextpy/ai/rag/document_loaders/file/ipynb/requirements.txt b/nextpy/ai/rag/document_loaders/file/ipynb/requirements.txt deleted file mode 100644 index b8b380fe..00000000 --- a/nextpy/ai/rag/document_loaders/file/ipynb/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -nbconvert \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/json/README.md b/nextpy/ai/rag/document_loaders/file/json/README.md deleted file mode 100644 index 0a221b53..00000000 --- a/nextpy/ai/rag/document_loaders/file/json/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# JSON Loader - -This loader extracts the text in a formatted manner from a JSON file. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -JSONReader = download_loader("JSONReader") - -loader = JSONReader() -documents = loader.load_data(Path('./data.json')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/json/__init__.py b/nextpy/ai/rag/document_loaders/file/json/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/json/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/json/base.py b/nextpy/ai/rag/document_loaders/file/json/base.py deleted file mode 100644 index 299aef9e..00000000 --- a/nextpy/ai/rag/document_loaders/file/json/base.py +++ /dev/null @@ -1,84 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""JSON Reader.""" - -import json -import re -from pathlib import Path -from typing import Dict, Generator, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -def _depth_first_yield( - json_data: Dict, levels_back: int, path: List[str] -) -> Generator[str, None, None]: - """Do depth first yield of all of the leaf nodes of a JSON. - - Combines keys in the JSON tree using spaces. - - If levels_back is set to 0, prints all levels. - - """ - if isinstance(json_data, dict): - for key, value in json_data.items(): - new_path = path[:] - new_path.append(key) - yield from _depth_first_yield(value, levels_back, new_path) - elif isinstance(json_data, list): - for _, value in enumerate(json_data): - yield from _depth_first_yield(value, levels_back, path) - else: - new_path = path[-levels_back:] - new_path.append(str(json_data)) - yield " ".join(new_path) - - -class JSONReader(BaseReader): - """JSON reader. - - Reads JSON documents with options to help suss out relationships between nodes. - - Args: - levels_back (int): the number of levels to go back in the JSON tree, 0 - if you want all levels. If levels_back is None, then we just format the - JSON and make each line an embedding - - """ - - def __init__(self, levels_back: Optional[int] = None) -> None: - """Initialize with arguments.""" - super().__init__() - self.levels_back = levels_back - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Load data from the input file.""" - # TODO: change Path typing for file in all load_data calls - if not isinstance(file, Path): - file = Path(file) - with open(file, "r") as f: - data = json.load(f) - if self.levels_back is None: - # If levels_back isn't set, we just format and make each - # line an embedding - json_output = json.dumps(data, indent=0) - lines = json_output.split("\n") - useful_lines = [ - line for line in lines if not re.match(r"^[{}\[\],]*$", line) - ] - return [ - DocumentNode( - text="\n".join(useful_lines), extra_info=extra_info or {} - ) - ] - elif self.levels_back is not None: - # If levels_back is set, we make the embeddings contain the labels - # from further up the JSON tree - lines = [*_depth_first_yield(data, self.levels_back, [])] - return [ - DocumentNode(text="\n".join(lines), extra_info=extra_info or {}) - ] diff --git a/nextpy/ai/rag/document_loaders/file/json/requirements.txt b/nextpy/ai/rag/document_loaders/file/json/requirements.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/nextpy/ai/rag/document_loaders/file/markdown/README.md b/nextpy/ai/rag/document_loaders/file/markdown/README.md deleted file mode 100644 index d9916ec8..00000000 --- a/nextpy/ai/rag/document_loaders/file/markdown/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Markdown Loader - -This loader extracts the text from a local Markdown file. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -MarkdownReader = download_loader("MarkdownReader") - -loader = MarkdownReader() -documents = loader.load_data(file=Path('./README.md')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/markdown/__init__.py b/nextpy/ai/rag/document_loaders/file/markdown/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/markdown/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/markdown/base.py b/nextpy/ai/rag/document_loaders/file/markdown/base.py deleted file mode 100644 index 320eb981..00000000 --- a/nextpy/ai/rag/document_loaders/file/markdown/base.py +++ /dev/null @@ -1,117 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Markdown Reader. - -A parser for md files. - -""" -import re -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MarkdownReader(BaseReader): - """Markdown parser. - - Extract text from markdown files. - Returns dictionary with keys as headers and values as the text between headers. - - """ - - def __init__( - self, - *args: Any, - remove_hyperlinks: bool = True, - remove_images: bool = True, - **kwargs: Any, - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._remove_hyperlinks = remove_hyperlinks - self._remove_images = remove_images - - def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: - """Convert a markdown file to a dictionary. - - The keys are the headers and the values are the text under each header. - - """ - markdown_tups: List[Tuple[Optional[str], str]] = [] - lines = markdown_text.split("\n") - - current_header = None - current_text = "" - - for line in lines: - header_match = re.match(r"^#+\s", line) - if header_match: - if current_header is not None: - if current_text == "" or None: - continue - markdown_tups.append((current_header, current_text)) - - current_header = line - current_text = "" - else: - current_text += line + "\n" - markdown_tups.append((current_header, current_text)) - - if current_header is not None: - # pass linting, assert keys are defined - markdown_tups = [ - (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) - for key, value in markdown_tups - ] - else: - markdown_tups = [ - (key, re.sub("<.*?>", "", value)) for key, value in markdown_tups - ] - - return markdown_tups - - def remove_images(self, content: str) -> str: - """Get a dictionary of a markdown file from its path.""" - pattern = r"!{1}\[\[(.*)\]\]" - content = re.sub(pattern, "", content) - return content - - def remove_hyperlinks(self, content: str) -> str: - """Get a dictionary of a markdown file from its path.""" - pattern = r"\[(.*?)\]\((.*?)\)" - content = re.sub(pattern, r"\1", content) - return content - - def parse_tups( - self, filepath: Path, content: Optional[str] = None, errors: str = "ignore" - ) -> List[Tuple[Optional[str], str]]: - """Parse file into tuples. 
- If content is provided, use that instead of reading from file. - """ - if content is None: - with open(filepath, "r") as f: - content = f.read() - if self._remove_hyperlinks: - content = self.remove_hyperlinks(content) - if self._remove_images: - content = self.remove_images(content) - markdown_tups = self.markdown_to_tups(content) - return markdown_tups - - def load_data( - self, - file: Path, - extra_info: Optional[Dict] = None, - content: Optional[str] = None, - ) -> List[DocumentNode]: - """Parse file into string. - If content is provided, use that instead of reading from file. - """ - tups = self.parse_tups(file, content=content) - # TODO: don't include headers right now - return [ - DocumentNode(text=value, extra_info=extra_info or {}) for _, value in tups - ] diff --git a/nextpy/ai/rag/document_loaders/file/mbox/README.md b/nextpy/ai/rag/document_loaders/file/mbox/README.md deleted file mode 100644 index abd1a679..00000000 --- a/nextpy/ai/rag/document_loaders/file/mbox/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Mailbox Loader - -This loader extracts the text from a local .mbox dump of emails. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -MboxReader = download_loader("MboxReader") -documents = MboxReader().load_data(file='./email.mbox') # Returns list of documents - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/mbox/__init__.py b/nextpy/ai/rag/document_loaders/file/mbox/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/mbox/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/mbox/base.py b/nextpy/ai/rag/document_loaders/file/mbox/base.py deleted file mode 100644 index 3571ef5f..00000000 --- a/nextpy/ai/rag/document_loaders/file/mbox/base.py +++ /dev/null @@ -1,116 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Mbox parser. - -Contains simple parser for mbox files. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MboxReader(BaseReader): - """Mbox reader. - - Extract messages from mailbox files. - Returns string including date, subject, sender, receiver and - content for each message. 
- - """ - - DEFAULT_MESSAGE_FORMAT: str = ( - "Date: {_date}\n" - "From: {_from}\n" - "To: {_to}\n" - "Subject: {_subject}\n" - "Content: {_content}" - ) - - def __init__( - self, - *args: Any, - max_count: int = 0, - message_format: str = DEFAULT_MESSAGE_FORMAT, - **kwargs: Any - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self.max_count = max_count - self.message_format = message_format - - def parse_file(self, filepath: Path, errors: str = "ignore") -> List[str]: - """Parse file into string.""" - # Import required libraries - import mailbox - from email.parser import BytesParser - from email.policy import default - - from bs4 import BeautifulSoup - - i = 0 - results: List[str] = [] - # Load file using mailbox - bytes_parser = BytesParser(policy=default).parse - mbox = mailbox.mbox(filepath, factory=bytes_parser) # type: ignore - - # Iterate through all messages - for _, _msg in enumerate(mbox): - msg: mailbox.mboxMessage = _msg - # Parse multipart messages - - content = None - - if msg.is_multipart(): - for part in msg.walk(): - ctype = part.get_content_type() - cdispo = str(part.get("Content-Disposition")) - if ctype == "text/plain" and "attachment" not in cdispo: - content = part.get_payload(decode=True) # decode - break - # Get plain message payload for non-multipart messages - else: - content = msg.get_payload(decode=True) - - if not content: - print( - "WARNING llama_hub.file.mbox found messages with content that stayed None. Skipping entry..." - ) - continue - - # Parse message HTML content and remove unneeded whitespace - soup = BeautifulSoup(content) - stripped_content = " ".join(soup.get_text().split()) - # Format message to include date, sender, receiver and subject - msg_string = self.message_format.format( - _date=msg["date"], - _from=msg["from"], - _to=msg["to"], - _subject=msg["subject"], - _content=stripped_content, - ) - # Add message string to results - results.append(msg_string) - # Increment counter and return if max count is met - i += 1 - if self.max_count > 0 and i >= self.max_count: - break - return results - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Load data from the input directory. - - load_kwargs: - max_count (int): Maximum amount of messages to read. - message_format (str): Message format overriding default. - """ - docs: List[DocumentNode] = [] - content = self.parse_file(file) - for msg in content: - docs.append(DocumentNode(text=msg, extra_info=extra_info or {})) - return docs diff --git a/nextpy/ai/rag/document_loaders/file/mbox/requirements.txt b/nextpy/ai/rag/document_loaders/file/mbox/requirements.txt deleted file mode 100644 index 041f722c..00000000 --- a/nextpy/ai/rag/document_loaders/file/mbox/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -beautifulsoup4 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/paged_csv/README.md b/nextpy/ai/rag/document_loaders/file/paged_csv/README.md deleted file mode 100644 index 470d4d7f..00000000 --- a/nextpy/ai/rag/document_loaders/file/paged_csv/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Paged CSV Loader - -This loader extracts the text from a local .csv file by formatting each row in an LLM-friendly way and inserting it into a separate DocumentNode. A single local file is passed in each time you call `load_data`. 
For example, a DocumentNode might look like: - -``` -First Name: Bruce -Last Name: Wayne -Age: 28 -Occupation: Unknown -``` - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PagedCSVReader = download_loader("PagedCSVReader") - -loader = PagedCSVReader(encoding="utf-8") -documents = loader.load_data(file=Path('./transactions.csv')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/paged_csv/__init__.py b/nextpy/ai/rag/document_loaders/file/paged_csv/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/file/paged_csv/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/file/paged_csv/base.py b/nextpy/ai/rag/document_loaders/file/paged_csv/base.py deleted file mode 100644 index cbd98155..00000000 --- a/nextpy/ai/rag/document_loaders/file/paged_csv/base.py +++ /dev/null @@ -1,49 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Paged CSV reader. - -A parser for tabular data files. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PagedCSVReader(BaseReader): - """Paged CSV parser. - - Displayed each row in an LLM-friendly format on a separate DocumentNode. - - Args: - encoding (str): Encoding used to open the file. - utf-8 by default. - """ - - def __init__(self, *args: Any, encoding: str = "utf-8", **kwargs: Any) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._encoding = encoding - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import csv - - docs = [] - with open(file, "r", encoding=self._encoding) as fp: - csv_reader = csv.DictReader(fp) # type: ignore - for row in csv_reader: - docs.append( - DocumentNode( - text="\n".join( - f"{k.strip()}: {v.strip()}" for k, v in row.items() - ), - extra_info=extra_info or {}, - ) - ) - return docs diff --git a/nextpy/ai/rag/document_loaders/file/pandas_csv/README.md b/nextpy/ai/rag/document_loaders/file/pandas_csv/README.md deleted file mode 100644 index 20d6a816..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_csv/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Pandas CSV Loader - -This loader extracts the text from a local .csv file using the `pandas` Python package. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. 
- -```python -from pathlib import Path -from nextpy.ai import download_loader - -PandasCSVReader = download_loader("PandasCSVReader") - -loader = PandasCSVReader() -documents = loader.load_data(file=Path('./transactions.csv')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/pandas_csv/__init__.py b/nextpy/ai/rag/document_loaders/file/pandas_csv/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_csv/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pandas_csv/base.py b/nextpy/ai/rag/document_loaders/file/pandas_csv/base.py deleted file mode 100644 index 5acb687d..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_csv/base.py +++ /dev/null @@ -1,80 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Pandas CSV reader. - -A parser for tabular data files using pandas. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PandasCSVReader(BaseReader): - r"""Pandas-based CSV parser. - - Parses CSVs using the separator detection from Pandas `read_csv`function. - If special parameters are required, use the `pandas_config` dict. - - Args: - concat_rows (bool): whether to concatenate all rows into one DocumentNode. - If set to False, a DocumentNode will be created for each row. - True by default. - - col_joiner (str): Separator to use for joining cols per row. - Set to ", " by default. - - row_joiner (str): Separator to use for joining each row. - Only used when `concat_rows=True`. - Set to "\n" by default. - - pandas_config (dict): Options for the `pandas.read_csv` function call. - Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html - for more information. - Set to empty dict by default, this means pandas will try to figure - out the separators, table head, etc. on its own. 
- - """ - - def __init__( - self, - *args: Any, - concat_rows: bool = True, - col_joiner: str = ", ", - row_joiner: str = "\n", - pandas_config: dict = {}, - **kwargs: Any - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._concat_rows = concat_rows - self._col_joiner = col_joiner - self._row_joiner = row_joiner - self._pandas_config = pandas_config - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import pandas as pd - - df = pd.read_csv(file, **self._pandas_config) - - text_list = df.apply( - lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 - ).tolist() - - if self._concat_rows: - return [ - DocumentNode( - text=self._row_joiner.join(text_list), extra_info=extra_info or {} - ) - ] - else: - return [ - DocumentNode(text=text, extra_info=extra_info or {}) - for text in text_list - ] diff --git a/nextpy/ai/rag/document_loaders/file/pandas_csv/requirements.txt b/nextpy/ai/rag/document_loaders/file/pandas_csv/requirements.txt deleted file mode 100644 index 1411a4a0..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_csv/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/pandas_excel/README.md b/nextpy/ai/rag/document_loaders/file/pandas_excel/README.md deleted file mode 100644 index f4ded9a1..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_excel/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Pandas Excel Loader - -This loader extracts the text from a column of a local .xlsx file using the `pandas` Python package. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file, along with a `sheet_name` from which sheet to extract data. The default `sheet_name=None`, which means it will load all the sheets in the excel file. You can set `sheet_name="Data1` to load only the sheet named "Data1". Or you can set `sheet_name=0` to load the first sheet in the excel file. You can pass any additional pandas configuration options to the `pandas_config` parameter, please see the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html). - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PandasExcelReader = download_loader("PandasExcelReader") - -loader = PandasExcelReader(pandas_config={"header": 0}) -documents = loader.load_data(file=Path('./data.xlsx')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/pandas_excel/__init__.py b/nextpy/ai/rag/document_loaders/file/pandas_excel/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_excel/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pandas_excel/base.py b/nextpy/ai/rag/document_loaders/file/pandas_excel/base.py deleted file mode 100644 index e5297742..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_excel/base.py +++ /dev/null @@ -1,93 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Pandas Excel reader. - -Pandas parser for .xlsx files. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PandasExcelReader(BaseReader): - r"""Pandas-based CSV parser. - - Parses CSVs using the separator detection from Pandas `read_csv`function. - If special parameters are required, use the `pandas_config` dict. - - Args: - pandas_config (dict): Options for the `pandas.read_excel` function call. - Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html - for more information. Set to empty dict by default, this means defaults will be used. - - """ - - def __init__( - self, - *args: Any, - pandas_config: Optional[dict] = None, - concat_rows: bool = True, - row_joiner: str = "\n", - **kwargs: Any - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._pandas_config = pandas_config or {} - self._concat_rows = concat_rows - self._row_joiner = row_joiner if row_joiner else "\n" - - def load_data( - self, - file: Path, - include_sheetname: bool = False, - sheet_name: Optional[Union[str, int]] = None, - extra_info: Optional[Dict] = None, - ) -> List[DocumentNode]: - """Parse file and extract values from a specific column. - - Args: - file (Path): The path to the Excel file to read. - column_name (str): The name of the column to use when creating the DocumentNode objects. - - Returns: - List[DocumentNode]: A list of`DocumentNode objects containing the values from the specified column in the Excel file. 
- """ - import itertools - - import pandas as pd - - df = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config) - - keys = df.keys() - - df_sheets = [] - - for key in keys: - sheet = [] - if include_sheetname: - sheet.append([key]) - sheet.extend(df[key].values.astype(str).tolist()) - df_sheets.append(sheet) - - text_list = list( - itertools.chain.from_iterable(df_sheets) - ) # flatten list of lists - - if self._concat_rows: - return [ - DocumentNode( - text=(self._row_joiner).join( - self._row_joiner.join(sublist) for sublist in text_list - ), - extra_info=extra_info or {}, - ) - ] - else: - return [ - DocumentNode(text=text, extra_info=extra_info or {}) - for text in text_list - ] diff --git a/nextpy/ai/rag/document_loaders/file/pandas_excel/requirements.txt b/nextpy/ai/rag/document_loaders/file/pandas_excel/requirements.txt deleted file mode 100644 index 1411a4a0..00000000 --- a/nextpy/ai/rag/document_loaders/file/pandas_excel/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/pdf/README.md b/nextpy/ai/rag/document_loaders/file/pdf/README.md deleted file mode 100644 index 2b1ac19f..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# PDF Loader - -This loader extracts the text from a local PDF file using the `PyPDF2` Python package. Any non-text elements are ignored. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PDFReader = download_loader("PDFReader") - -loader = PDFReader() -documents = loader.load_data(file=Path('./article.pdf')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/pdf/__init__.py b/nextpy/ai/rag/document_loaders/file/pdf/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pdf/base.py b/nextpy/ai/rag/document_loaders/file/pdf/base.py deleted file mode 100644 index cb121c31..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf/base.py +++ /dev/null @@ -1,41 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Read PDF files.""" - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PDFReader(BaseReader): - """PDF reader.""" - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import pypdf - - with open(file, "rb") as fp: - # Create a PDF object - pdf = pypdf.PdfReader(fp) - - # Get the number of pages in the PDF DocumentNode - num_pages = len(pdf.pages) - - # Iterate over every page - docs = [] - for page in range(num_pages): - # Extract the text from the page - page_text = pdf.pages[page].extract_text() - page_label = pdf.page_labels[page] - metadata = {"page_label": page_label, "file_name": file.name} - - if extra_info is not None: - metadata.update(extra_info) - - docs.append(DocumentNode(text=page_text, extra_info=metadata)) - return docs diff --git a/nextpy/ai/rag/document_loaders/file/pdf/requirements.txt b/nextpy/ai/rag/document_loaders/file/pdf/requirements.txt deleted file mode 100644 index 1a69c480..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pypdf diff --git a/nextpy/ai/rag/document_loaders/file/pdf_miner/README.md b/nextpy/ai/rag/document_loaders/file/pdf_miner/README.md deleted file mode 100644 index db9fd13e..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf_miner/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# PDF Loader - -This loader extracts the text from a local PDF file using the `pdfminer.six` Python package. Any non-text elements are ignored. A single local file is passed in each time you call `load_data`. -This package often performs better than the builtin pdf parser based on the `pypdf` package. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PDFMinerReader = download_loader("PDFMinerReader") - -loader = PDFMinerReader() -documents = loader.load_data(file=Path('./article.pdf')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/pdf_miner/__init__.py b/nextpy/ai/rag/document_loaders/file/pdf_miner/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf_miner/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pdf_miner/base.py b/nextpy/ai/rag/document_loaders/file/pdf_miner/base.py deleted file mode 100644 index 86f0b3ba..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf_miner/base.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read PDF files.""" - -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PDFMinerReader(BaseReader): - """PDF parser based on pdfminer.six.""" - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - try: - from io import StringIO - - from pdfminer.converter import TextConverter - from pdfminer.layout import LAParams - from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - from pdfminer.pdfpage import PDFPage as PDF_Page - - def _extract_text_from_page(page): - resource_manager = PDFResourceManager() - output_string = StringIO() - codec = "utf-8" - laparams = LAParams() - device = TextConverter( - resource_manager, output_string, codec=codec, laparams=laparams - ) - interpreter = PDFPageInterpreter(resource_manager, device) - interpreter.process_page(page) - text = output_string.getvalue() - device.close() - output_string.close() - return text - - except ImportError: - raise ImportError( - "pdfminer.six is required to read PDF files: `pip install pypdf`" - ) - with open(file, "rb") as fp: - reader = PDF_Page.get_pages(fp) - - # Iterate over every page - docs = [] - for i, page in enumerate(reader): - # Extract the text from the page - page_text = _extract_text_from_page(page) - - metadata = {"page_label": i, "file_name": file.name} - if extra_info is not None: - metadata.update(extra_info) - - docs.append(DocumentNode(text=page_text, extra_info=metadata)) - return docs diff --git a/nextpy/ai/rag/document_loaders/file/pdf_miner/requirements.txt b/nextpy/ai/rag/document_loaders/file/pdf_miner/requirements.txt deleted file mode 100644 index 48060604..00000000 --- a/nextpy/ai/rag/document_loaders/file/pdf_miner/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pdfminer.six diff --git a/nextpy/ai/rag/document_loaders/file/pptx/README.md b/nextpy/ai/rag/document_loaders/file/pptx/README.md deleted file mode 100644 index c9eed4af..00000000 --- a/nextpy/ai/rag/document_loaders/file/pptx/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Microsoft PowerPoint Loader - -This loader extracts the text from a local Microsoft PowerPoint (.pptx) file. Image elements are optionally captioned and inserted as text into the final `DocumentNode` using [GPT2 Image Captioning model](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning). For example, a team photo might be converted into "three people smiling in front of skyscrapers". To use this feature, initialize the loader with `caption_images = True`. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PptxReader = download_loader("PptxReader") - -loader = PptxReader() -documents = loader.load_data(file=Path('./deck.pptx')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
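If you want image captions inserted into the extracted text, the reader can be constructed with `caption_images=True`, which (per this loader's requirements) additionally relies on the `transformers`, `torch`, `torchvision`, and `Pillow` packages. A minimal sketch of that variant, reusing the same deck path as above:

```python
from pathlib import Path
from nextpy.ai import download_loader

PptxReader = download_loader("PptxReader")

# caption_images=True loads the nlpconnect/vit-gpt2-image-captioning model and
# appends a short text caption for every image found on a slide.
loader = PptxReader(caption_images=True)
documents = loader.load_data(file=Path('./deck.pptx'))
```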
diff --git a/nextpy/ai/rag/document_loaders/file/pptx/__init__.py b/nextpy/ai/rag/document_loaders/file/pptx/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pptx/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pptx/base.py b/nextpy/ai/rag/document_loaders/file/pptx/base.py deleted file mode 100644 index 8868ead6..00000000 --- a/nextpy/ai/rag/document_loaders/file/pptx/base.py +++ /dev/null @@ -1,109 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read Microsoft PowerPoint files.""" - -import os -from pathlib import Path -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PptxReader(BaseReader): - """Powerpoint reader. - - Extract text, caption images, and specify slides. - - """ - - def __init__(self, caption_images: Optional[bool] = False) -> None: - """Init reader.""" - self.caption_images = caption_images - if caption_images: - from transformers import ( - AutoTokenizer, - VisionEncoderDecoderModel, - ViTFeatureExtractor, - ) - - model = VisionEncoderDecoderModel.from_pretrained( - "nlpconnect/vit-gpt2-image-captioning" - ) - feature_extractor = ViTFeatureExtractor.from_pretrained( - "nlpconnect/vit-gpt2-image-captioning" - ) - tokenizer = AutoTokenizer.from_pretrained( - "nlpconnect/vit-gpt2-image-captioning" - ) - - self.parser_config = { - "feature_extractor": feature_extractor, - "model": model, - "tokenizer": tokenizer, - } - - def generate_image_caption(self, tmp_image_file: str) -> str: - """Generate text caption of image.""" - if not self.caption_images: - return "" - - import torch - from PIL import Image - - model = self.parser_config["model"] - feature_extractor = self.parser_config["feature_extractor"] - tokenizer = self.parser_config["tokenizer"] - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - max_length = 16 - num_beams = 4 - gen_kwargs = {"max_length": max_length, "num_beams": num_beams} - - i_image = Image.open(tmp_image_file) - if i_image.mode != "RGB": - i_image = i_image.convert(mode="RGB") - - pixel_values = feature_extractor( - images=[i_image], return_tensors="pt" - ).pixel_values - pixel_values = pixel_values.to(device) - - output_ids = model.generate(pixel_values, **gen_kwargs) - - preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - return preds[0].strip() - - def load_data( - self, - file: Path, - extra_info: Optional[Dict] = None, - ) -> List[DocumentNode]: - """Parse file.""" - from pptx import Presentation - - presentation = Presentation(file) - result = "" - for i, slide in enumerate(presentation.slides): - result += f"\n\nSlide #{i}: \n" - for shape in slide.shapes: - if self.caption_images and hasattr(shape, "image"): - image = shape.image - # get image "file" contents - image_bytes = image.blob - # temporarily save the image to feed into model - 
image_filename = f"tmp_image.{image.ext}" - with open(image_filename, "wb") as f: - f.write(image_bytes) - result += ( - f"\n Image: {self.generate_image_caption(image_filename)}\n\n" - ) - - os.remove(image_filename) - if hasattr(shape, "text"): - result += f"{shape.text}\n" - - return [DocumentNode(text=result, extra_info=extra_info or {})] diff --git a/nextpy/ai/rag/document_loaders/file/pptx/requirements.txt b/nextpy/ai/rag/document_loaders/file/pptx/requirements.txt deleted file mode 100644 index f2834fb2..00000000 --- a/nextpy/ai/rag/document_loaders/file/pptx/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -transformers -Pillow -torch -torchvision -python-pptx \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/pymu_pdf/README.md b/nextpy/ai/rag/document_loaders/file/pymu_pdf/README.md deleted file mode 100644 index 7a99ac50..00000000 --- a/nextpy/ai/rag/document_loaders/file/pymu_pdf/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# PyMuPDF Loader - -This loader extracts text from a local PDF file using the `PyMuPDF` Python library. This is the fastest among all other PDF parsing options available in `llama_hub`. If `metadata` is passed as True while calling `load` function; extracted documents will include basic metadata such as page numbers, file path and total number of pages in pdf. - -## Usage - -To use this loader, you need to pass file path of the local file as string or `Path` when you call `load` function. By default, including metadata is set to True. You can also pass extra information in a `dict` format when you call `load` function. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -PyMuPDFReader = download_loader("PyMuPDFReader") - -loader = PyMuPDFReader() -documents = loader.load(file_path=Path('./article.pdf'), metadata=True) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/pymu_pdf/__init__.py b/nextpy/ai/rag/document_loaders/file/pymu_pdf/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/pymu_pdf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/pymu_pdf/base.py b/nextpy/ai/rag/document_loaders/file/pymu_pdf/base.py deleted file mode 100644 index b3142772..00000000 --- a/nextpy/ai/rag/document_loaders/file/pymu_pdf/base.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Read PDF files using PyMuPDF library.""" -from pathlib import Path -from typing import Dict, List, Optional, Union - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PyMuPDFReader(BaseReader): - """Read PDF files using PyMuPDF library.""" - - def load( - self, - file_path: Union[Path, str], - metadata: bool = True, - extra_info: Optional[Dict] = None, - ) -> List[DocumentNode]: - """Loads list of documents from PDF file and also accepts extra information in dict format. - - Args: - file_path (Union[Path, str]): file path of PDF file (accepts string or Path). - metadata (bool, optional): if metadata to be included or not. Defaults to True. - extra_info (Optional[Dict], optional): extra information related to each DocumentNode in dict format. Defaults to None. - - Raises: - TypeError: if extra_info is not a dictionary. - TypeError: if file_path is not a string or Path. - - Returns: - List[DocumentNode]: list of documents. - """ - import fitz - - # check if file_path is a string or Path - if not isinstance(file_path, str) and not isinstance(file_path, Path): - raise TypeError("file_path must be a string or Path.") - - # open PDF file - doc = fitz.open(file_path) - - # if extra_info is not None, check if it is a dictionary - if extra_info and not isinstance(extra_info, dict): - raise TypeError("extra_info must be a dictionary.") - - # if metadata is True, add metadata to each DocumentNode - if metadata: - if not extra_info: - extra_info = {} - extra_info["total_pages"] = len(doc) - extra_info["file_path"] = file_path - - # return list of documents - return [ - DocumentNode( - text=page.get_text().encode("utf-8"), - extra_info=dict( - extra_info, - **{ - "source": f"{page.number+1}", - }, - ), - ) - for page in doc - ] - - else: - return [ - DocumentNode( - text=page.get_text().encode("utf-8"), extra_info=extra_info or {} - ) - for page in doc - ] diff --git a/nextpy/ai/rag/document_loaders/file/pymu_pdf/requirements.txt b/nextpy/ai/rag/document_loaders/file/pymu_pdf/requirements.txt deleted file mode 100644 index 2d431b0f..00000000 --- a/nextpy/ai/rag/document_loaders/file/pymu_pdf/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -PyMuPDF \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/rdf/README.md b/nextpy/ai/rag/document_loaders/file/rdf/README.md deleted file mode 100644 index e9f77871..00000000 --- a/nextpy/ai/rag/document_loaders/file/rdf/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# RDF Loader - -This loader extracts triples from a local [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) file using the `rdflib` Python package. The loader currently supports the RDF and RDF Schema namespaces. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -RDFReader = download_loader("RDFReader") - -loader = RDFReader() -documents = loader.load_data(file=Path('./knowledge-graph.nt')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
diff --git a/nextpy/ai/rag/document_loaders/file/rdf/__init__.py b/nextpy/ai/rag/document_loaders/file/rdf/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/rdf/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/rdf/base.py b/nextpy/ai/rag/document_loaders/file/rdf/base.py deleted file mode 100644 index 900ac44e..00000000 --- a/nextpy/ai/rag/document_loaders/file/rdf/base.py +++ /dev/null @@ -1,79 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read RDF files.""" - -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class RDFReader(BaseReader): - """RDF reader.""" - - def __init__( - self, - *args: Any, - **kwargs: Any, - ) -> None: - """Initialize loader.""" - super().__init__(*args, **kwargs) - - from rdflib import Graph - from rdflib.namespace import RDF, RDFS - - self.Graph = Graph - self.RDF = RDF - self.RDFS = RDFS - - def fetch_labels(self, uri: Any, graph: Any, lang: str): - """Fetch all labels of a URI by language.""" - return list( - filter( - lambda x: x.language in [lang, None], - graph.objects(uri, self.RDFS.label), - ) - ) - - def fetch_label_in_graphs(self, uri: Any, lang: str = "en"): - """Fetch one label of a URI by language from the local or global graph.""" - labels = self.fetch_labels(uri, self.g_local, lang) - if len(labels) > 0: - return labels[0].value - - labels = self.fetch_labels(uri, self.g_global, lang) - if len(labels) > 0: - return labels[0].value - - raise Exception(f"Label not found for: {uri}") - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - lang = extra_info["lang"] if extra_info is not None else "en" - - self.g_local = self.Graph() - self.g_local.parse(file) - - self.g_global = self.Graph() - self.g_global.parse(str(self.RDF)) - self.g_global.parse(str(self.RDFS)) - - text_list = [] - - for s, p, o in self.g_local: - if p == self.RDFS.label: - continue - triple = ( - f"<{self.fetch_label_in_graphs(s, lang=lang)}> " - f"<{self.fetch_label_in_graphs(p, lang=lang)}> " - f"<{self.fetch_label_in_graphs(o, lang=lang)}>" - ) - text_list.append(triple) - - text = "\n".join(text_list) - - return [DocumentNode(text=text, extra_info=extra_info or {})] diff --git a/nextpy/ai/rag/document_loaders/file/rdf/requirements.txt b/nextpy/ai/rag/document_loaders/file/rdf/requirements.txt deleted file mode 100644 index fad8467e..00000000 --- a/nextpy/ai/rag/document_loaders/file/rdf/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -rdflib~=6.2.0 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/file/simple_csv/README.md b/nextpy/ai/rag/document_loaders/file/simple_csv/README.md deleted file mode 100644 index cf5077d9..00000000 --- a/nextpy/ai/rag/document_loaders/file/simple_csv/README.md +++ /dev/null @@ -1,19 +0,0 @@ 
-# Simple CSV Loader - -This loader extracts the text from a local .csv file by directly reading the file row by row. A single local file is passed in each time you call `load_data`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -SimpleCSVReader = download_loader("SimpleCSVReader") - -loader = SimpleCSVReader(encoding="utf-8") -documents = loader.load_data(file=Path('./transactions.csv')) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/file/simple_csv/__init__.py b/nextpy/ai/rag/document_loaders/file/simple_csv/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/simple_csv/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/simple_csv/base.py b/nextpy/ai/rag/document_loaders/file/simple_csv/base.py deleted file mode 100644 index dcb7e1d9..00000000 --- a/nextpy/ai/rag/document_loaders/file/simple_csv/base.py +++ /dev/null @@ -1,59 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple CSV reader. - -A parser for tabular data files. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SimpleCSVReader(BaseReader): - """CSV parser. - - Args: - encoding (str): Encoding used to open the file. - utf-8 by default. - concat_rows (bool): whether to concatenate all rows into one DocumentNode. - If set to False, a DocumentNode will be created for each row. - True by default. 
- - """ - - def __init__( - self, - *args: Any, - concat_rows: bool = True, - encoding: str = "utf-8", - **kwargs: Any - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self._concat_rows = concat_rows - self._encoding = encoding - - def load_data( - self, file: Path, extra_info: Optional[Dict] = None - ) -> List[DocumentNode]: - """Parse file.""" - import csv - - text_list = [] - with open(file, "r", encoding=self._encoding) as fp: - csv_reader = csv.reader(fp) - for row in csv_reader: - text_list.append(", ".join(row)) - if self._concat_rows: - return [ - DocumentNode(text="\n".join(text_list), extra_info=extra_info or {}) - ] - else: - return [ - DocumentNode(text=text, extra_info=extra_info or {}) - for text in text_list - ] diff --git a/nextpy/ai/rag/document_loaders/file/unstructured/README.md b/nextpy/ai/rag/document_loaders/file/unstructured/README.md deleted file mode 100644 index d8ac282e..00000000 --- a/nextpy/ai/rag/document_loaders/file/unstructured/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Unstructured.io File Loader - -This loader extracts the text from a variety of unstructured text files using [Unstructured.io](https://github.com/Unstructured-IO/unstructured). Currently, the file extensions that are supported are `.txt`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents. A single local file is passed in each time you call `load_data`. - -Check out their documentation to see more details, but notably, this enables you to parse the unstructured data of many use-cases. For example, you can download the 10-K SEC filings of public companies (e.g. [Coinbase](https://www.sec.gov/ix?doc=/Archives/edgar/data/0001679788/000167978822000031/coin-20211231.htm)), and feed it directly into this loader without worrying about cleaning up the formatting or HTML tags. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. Optionally, you may specify `split_documents` if you want each `element` generated by Unstructured.io to be placed in a separate DocumentNode. This will guarantee that those elements will be split when an index is created in LlamaIndex, which, depending on your use-case, could be a smarter form of text-splitting. By default this is `False`. - -```python -from pathlib import Path -from llama_hub.file.unstructured.base import UnstructuredReader - -loader = UnstructuredReader() -documents = loader.load_data(file=Path('./10k_filing.html')) -``` - -You can also easily use this loader in conjunction with `SimpleDirectoryReader` if you want to parse certain files throughout a directory with Unstructured.io. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - -loader = SimpleDirectoryReader('./data', file_extractor={ - ".pdf": UnstructuredReader(), - ".html": UnstructuredReader(), - ".eml": UnstructuredReader(), -}) -documents = loader.load_data() -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - -## Troubleshooting - -**"failed to find libmagic" error**: Try `pip install python-magic-bin==0.4.14`. Solution documented [here](https://github.com/Yelp/elastalert/issues/1927#issuecomment-425040424). 
On MacOS, you may also try `brew install libmagic`. diff --git a/nextpy/ai/rag/document_loaders/file/unstructured/__init__.py b/nextpy/ai/rag/document_loaders/file/unstructured/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/file/unstructured/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/file/unstructured/base.py b/nextpy/ai/rag/document_loaders/file/unstructured/base.py deleted file mode 100644 index 5a124716..00000000 --- a/nextpy/ai/rag/document_loaders/file/unstructured/base.py +++ /dev/null @@ -1,50 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Unstructured file reader. - -A parser for unstructured text files using Unstructured.io. -Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. - -""" -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class UnstructuredReader(BaseReader): - """General unstructured text reader for a variety of files.""" - - def __init__(self, *args: Any, **kwargs: Any) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - - # Prerequisite for Unstructured.io to work - import nltk - - nltk.download("punkt") - nltk.download("averaged_perceptron_tagger") - - def load_data( - self, - file: Path, - extra_info: Optional[Dict] = None, - split_documents: Optional[bool] = False, - ) -> List[DocumentNode]: - """Parse file.""" - from unstructured.partition.auto import partition - - elements = partition(str(file)) - text_chunks = [" ".join(str(el).split()) for el in elements] - - if split_documents: - return [ - DocumentNode(text=chunk, extra_info=extra_info or {}) - for chunk in text_chunks - ] - else: - return [ - DocumentNode(text="\n\n".join(text_chunks), extra_info=extra_info or {}) - ] diff --git a/nextpy/ai/rag/document_loaders/file/unstructured/requirements.txt b/nextpy/ai/rag/document_loaders/file/unstructured/requirements.txt deleted file mode 100644 index 9e290371..00000000 --- a/nextpy/ai/rag/document_loaders/file/unstructured/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -unstructured -nltk \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/firebase_realtimedb/README.md b/nextpy/ai/rag/document_loaders/firebase_realtimedb/README.md deleted file mode 100644 index 11d0beac..00000000 --- a/nextpy/ai/rag/document_loaders/firebase_realtimedb/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Firebase Realtime Database Loader - -This loader retrieves documents from Firebase Realtime Database. The user specifies the Firebase Realtime Database URL and, optionally, the path to a service account key file for authentication. - -## Usage - -Here's an example usage of the FirebaseRealtimeDatabaseReader. 
- -```python -from nextpy.ai import download_loader - -FirebaseRealtimeDatabaseReader = download_loader('FirebaseRealtimeDatabaseReader') - -database_url = "" -service_account_key_path = "" -path = "" -reader = FirebaseRealtimeDatabaseReader(database_url, service_account_key_path) -documents = reader.load_data(path) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/firebase_realtimedb/__init__.py b/nextpy/ai/rag/document_loaders/firebase_realtimedb/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/firebase_realtimedb/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/firebase_realtimedb/base.py b/nextpy/ai/rag/document_loaders/firebase_realtimedb/base.py deleted file mode 100644 index 8cbecc78..00000000 --- a/nextpy/ai/rag/document_loaders/firebase_realtimedb/base.py +++ /dev/null @@ -1,90 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Firebase Realtime Database Loader.""" - -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class FirebaseRealtimeDatabaseReader(BaseReader): - """Firebase Realtime Database reader. - - Retrieves data from Firebase Realtime Database and converts it into the DocumentNode used by LlamaIndex. - - Args: - database_url (str): Firebase Realtime Database URL. - service_account_key_path (Optional[str]): Path to the service account key file. - - """ - - def __init__( - self, - database_url: str, - service_account_key_path: Optional[str] = None, - ) -> None: - """Initialize with parameters.""" - self.database_url = database_url - - try: - import firebase_admin - from firebase_admin import credentials - except ImportError: - raise ImportError( - "`firebase_admin` package not found, please run `pip install firebase-admin`" - ) - - if not firebase_admin._apps: - if service_account_key_path: - cred = credentials.Certificate(service_account_key_path) - firebase_admin.initialize_app( - cred, options={"databaseURL": database_url} - ) - else: - firebase_admin.initialize_app(options={"databaseURL": database_url}) - - def load_data(self, path: str, field: Optional[str] = None) -> List[DocumentNode]: - """Load data from Firebase Realtime Database and convert it into documents. - - Args: - path (str): Path to the data in the Firebase Realtime Database. - field (str, Optional): Key to pick data from - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - try: - from firebase_admin import db - except ImportError: - raise ImportError( - "`firebase_admin` package not found, please run `pip install firebase-admin`" - ) - - ref = db.reference(path) - data = ref.get() - - documents = [] - - if isinstance(data, Dict): - for key in data: - entry = data[key] - metadata = { - "document_id": key, - "databaseURL": self.database_url, - "path": path, - "field": field, - } - if type(entry) is Dict and field in entry: - text = entry[field] - else: - text = str(entry) - - DocumentNode = DocumentNode(text=text, extra_info=metadata) - documents.append(DocumentNode) - elif isinstance(data, str): - documents.append(DocumentNode(text=data)) - - return documents diff --git a/nextpy/ai/rag/document_loaders/firebase_realtimedb/requirements.txt b/nextpy/ai/rag/document_loaders/firebase_realtimedb/requirements.txt deleted file mode 100644 index 4720fc6f..00000000 --- a/nextpy/ai/rag/document_loaders/firebase_realtimedb/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -firebase-admin \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/firestore/README.md b/nextpy/ai/rag/document_loaders/firestore/README.md deleted file mode 100644 index b583570e..00000000 --- a/nextpy/ai/rag/document_loaders/firestore/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Firestore Loader - -This loader loads from a Firestore collection or a specific DocumentNode from Firestore. The loader assumes your project already has the google cloud credentials loaded. To find out how to set up credentials, [see here](https://cloud.google.com/docs/authentication/provide-credentials-adc). - -## Usage - -To initialize the loader, provide the project-id of the google cloud project. - -## Initializing the reader - -```python -from nextpy.ai import download_loader - -FirestoreReader = download_loader('FirestoreReader') -reader = FirestoreReader(project_id='') -``` - -## Loading Data from a Firestore Collection - -Load data from a Firestore collection with the load_data method: -The collection path should include all previous documents and collections if it is a nested collection. - -```python -documents = reader.load_data(collection='foo/bar/abc/') -``` - -## Loading a Single DocumentNode from Firestore - -Load a single DocumentNode from Firestore with the load_document method: - -```python -DocumentNode = reader.load_document(document_url='foo/bar/abc/MY_DOCUMENT') -``` - -Note: load_data returns a list of DocumentNode objects, whereas load_document returns a single DocumentNode object. - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/openams/tree/main/openams) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/firestore/__init__.py b/nextpy/ai/rag/document_loaders/firestore/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/firestore/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/firestore/base.py b/nextpy/ai/rag/document_loaders/firestore/base.py deleted file mode 100644 index 8a2231d6..00000000 --- a/nextpy/ai/rag/document_loaders/firestore/base.py +++ /dev/null @@ -1,78 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Firestore Reader.""" - -from typing import Any, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class FirestoreReader(BaseReader): - """Simple Firestore reader. - - Args: - project_id (str): The Google Cloud Project ID. - *args (Optional[Any]): Additional arguments. - **kwargs (Optional[Any]): Additional keyword arguments. - - Returns: - FirestoreReader: A FirestoreReader object. - """ - - def __init__( - self, - project_id: str, - *args: Optional[Any], - **kwargs: Optional[Any], - ) -> None: - """Initialize with parameters.""" - from google.cloud import firestore - - self.project_id = project_id - - self.db = firestore.Client(project=project_id) - - def load_data(self, collection: str) -> List[DocumentNode]: - """Load data from a Firestore collection, returning a list of Documents. - - Args: - collection (str): The name of the Firestore collection to read from. - - Returns: - List[DocumentNode]: A list of DocumentNode objects. - """ - metadata = {"project_id": self.project_id, "collection": collection} - - documents = [] - col_ref = self.db.collection(collection) - for doc in col_ref.stream(): - doc_str = ", ".join([f"{k}: {v}" for k, v in doc.to_dict().items()]) - documents.append(DocumentNode(text=doc_str, extra_info=metadata)) - return documents - - def load_document(self, document_url: str) -> DocumentNode: - """Load a single DocumentNode from Firestore. - - Args: - document_url (str): The absolute path to the Firestore DocumentNode to read. - - Returns: - DocumentNode: A DocumentNode object. - """ - metadata = {"project_id": self.project_id, "document_url": document_url} - - parts = document_url.split("/") - if len(parts) % 2 != 0: - raise ValueError(f"Invalid DocumentNode URL: {document_url}") - - ref = self.db.collection(parts[0]) - for i in range(1, len(parts)): - ref = ref.collection(parts[i]) if i % 2 == 0 else ref.DocumentNode(parts[i]) - - doc = ref.get() - if not doc.exists: - raise ValueError(f"No such DocumentNode: {document_url}") - doc_str = ", ".join([f"{k}: {v}" for k, v in doc.to_dict().items()]) - return DocumentNode(text=doc_str, extra_info=metadata) diff --git a/nextpy/ai/rag/document_loaders/firestore/requirements.txt b/nextpy/ai/rag/document_loaders/firestore/requirements.txt deleted file mode 100644 index aacb83a9..00000000 --- a/nextpy/ai/rag/document_loaders/firestore/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -google-cloud-firestore diff --git a/nextpy/ai/rag/document_loaders/github_repo/README.md b/nextpy/ai/rag/document_loaders/github_repo/README.md deleted file mode 100644 index 0df2b2dc..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/README.md +++ /dev/null @@ -1,89 +0,0 @@ -# Github Repository Loader - -This loader takes in `owner`, `repo`, `branch`, `commit_sha` and other optional parameters such as for filtering dicrectories or only allowing some files with given extensions etc. 
It then fetches all the contents of the GitHub repository. - -As a prerequisite, you will need to generate a "classic" personal access token with the `repo` and `read:org` scopes. See [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for instructions. - -## Usage - -To use this loader, you simply need to pass in the `owner` and `repo` and either `branch` or `commit_sha` for example, you can `owner = jerryjliu` and `repo = llama_index` and also either branch or commit `branch = main` or `commit_sha = a6c89159bf8e7086bea2f4305cff3f0a4102e370`. - -```shell -export GITHUB_TOKEN='...' -``` - -```python -import os - -from nextpy.ai import download_loader -download_loader("GithubRepositoryReader") - -from llama_hub.github_repo import GithubRepositoryReader, GithubClient - -github_client = GithubClient(os.getenv("GITHUB_TOKEN")) -loader = GithubRepositoryReader( - github_client, - owner = "jerryjliu", - repo = "llama_index", - filter_directories = (["gpt_index", "docs"], GithubRepositoryReader.FilterType.INCLUDE), - filter_file_extensions = ([".py"], GithubRepositoryReader.FilterType.INCLUDE), - verbose = True, - concurrent_requests = 10, -) - -docs = loader.load_data(branch="main") -# alternatively, load from a specific commit: -# docs = loader.load_data(commit_sha="a6c89159bf8e7086bea2f4305cff3f0a4102e370") - -for doc in docs: - print(doc.extra_info) -``` - -## Examples - -This loader designed to be used as a way to load data into [Llama Index](https://github.com/jerryjliu/llama_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### Llama Index - -```shell -export OPENAI_API_KEY='...' -export GITHUB_TOKEN='...' -``` - -```python -import pickle -import os - -from nextpy.ai import download_loader, GPTVectorDBIndex -download_loader("GithubRepositoryReader") - -from llama_hub.github_repo import GithubClient, GithubRepositoryReader - -docs = None -if os.path.exists("docs.pkl"): - with open("docs.pkl", "rb") as f: - docs = pickle.load(f) - -if docs is None: - github_client = GithubClient(os.getenv("GITHUB_TOKEN")) - loader = GithubRepositoryReader( - github_client, - owner = "jerryjliu", - repo = "llama_index", - filter_directories = (["gpt_index", "docs"], GithubRepositoryReader.FilterType.INCLUDE), - filter_file_extensions = ([".py"], GithubRepositoryReader.FilterType.INCLUDE), - verbose = True, - concurrent_requests = 10, - ) - - docs = loader.load_data(branch="main") - - with open("docs.pkl", "wb") as f: - pickle.dump(docs, f) - -index = GPTVectorDBIndex.from_documents(docs) - -query_engine = index.as_query_engine() -response = query_engine.query("Explain each LlamaIndex class?") -print(response) -``` diff --git a/nextpy/ai/rag/document_loaders/github_repo/__init__.py b/nextpy/ai/rag/document_loaders/github_repo/__init__.py deleted file mode 100644 index 394f7bd0..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" - -from .base import GithubRepositoryReader -from .github_client import GithubClient - -__all__ = ["GithubRepositoryReader", "GithubClient"] diff --git a/nextpy/ai/rag/document_loaders/github_repo/base.py b/nextpy/ai/rag/document_loaders/github_repo/base.py deleted file mode 100644 index 23bc1538..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/base.py +++ /dev/null @@ -1,593 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Github repository reader. - -Retrieves the contents of a Github repository and returns a list of documents. -The documents are either the contents of the files in the repository or -the text extracted from the files using the parser. -""" -import asyncio -import base64 -import binascii -import enum -import logging -import os -import pathlib -import sys -import tempfile -from typing import Any, Callable, Dict, List, Optional, Tuple - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.readers.file.base import DEFAULT_FILE_READER_CLS -from nextpy.ai.schema import DocumentNode - -if "pytest" in sys.modules: - from llama_hub.github_repo.github_client import ( - BaseGithubClient, - GitBranchResponseModel, - GitCommitResponseModel, - GithubClient, - GitTreeResponseModel, - ) - from llama_hub.github_repo.utils import ( - BufferedGitBlobDataIterator, - get_file_extension, - print_if_verbose, - ) -else: - from llama_hub.github_repo.github_client import ( - BaseGithubClient, - GitBranchResponseModel, - GitCommitResponseModel, - GithubClient, - GitTreeResponseModel, - ) - from llama_hub.github_repo.utils import ( - BufferedGitBlobDataIterator, - get_file_extension, - print_if_verbose, - ) - -logger = logging.getLogger(__name__) - - -class GithubRepositoryReader(BaseReader): - """Github repository reader. - - Retrieves the contents of a Github repository and returns a list of documents. - The documents are either the contents of the files in the repository or the text - extracted from the files using the parser. - - Examples: - >>> reader = GithubRepositoryReader("owner", "repo") - >>> branch_documents = reader.load_data(branch="branch") - >>> commit_documents = reader.load_data(commit_sha="commit_sha") - - """ - - class FilterType(enum.Enum): - """Filter type. - - Used to determine whether the filter is inclusive or exclusive. - - Attributes: - - EXCLUDE: Exclude the files in the directories or with the extensions. - - INCLUDE: Include only the files in the directories or with the extensions. - """ - - EXCLUDE = enum.auto() - INCLUDE = enum.auto() - - def __init__( - self, - github_client: BaseGithubClient, - owner: str, - repo: str, - use_parser: bool = False, - verbose: bool = False, - concurrent_requests: int = 5, - filter_directories: Optional[Tuple[List[str], FilterType]] = None, - filter_file_extensions: Optional[Tuple[List[str], FilterType]] = None, - ): - """Initialize params. - - Args: - - github_client (BaseGithubClient): Github client. - - owner (str): Owner of the repository. - - repo (str): Name of the repository. - - use_parser (bool): Whether to use the parser to extract - the text from the files. - - verbose (bool): Whether to print verbose messages. - - concurrent_requests (int): Number of concurrent requests to - make to the Github API. 
- - filter_directories (Optional[Tuple[List[str], FilterType]]): Tuple - containing a list of directories and a FilterType. If the FilterType - is INCLUDE, only the files in the directories in the list will be - included. If the FilterType is EXCLUDE, the files in the directories - in the list will be excluded. - - filter_file_extensions (Optional[Tuple[List[str], FilterType]]): Tuple - containing a list of file extensions and a FilterType. If the - FilterType is INCLUDE, only the files with the extensions in the list - will be included. If the FilterType is EXCLUDE, the files with the - extensions in the list will be excluded. - - Raises: - - `ValueError`: If the github_token is not provided and - the GITHUB_TOKEN environment variable is not set. - """ - super().__init__() - - self._owner = owner - self._repo = repo - self._use_parser = use_parser - self._verbose = verbose - self._concurrent_requests = concurrent_requests - self._filter_directories = filter_directories - self._filter_file_extensions = filter_file_extensions - - # Set up the event loop - try: - self._loop = asyncio.get_running_loop() - except RuntimeError: - # If there is no running loop, create a new one - self._loop = asyncio.new_event_loop() - asyncio.set_event_loop(self._loop) - - self._github_client = github_client - - self._file_readers: Dict[str, BaseReader] = {} - self._supported_suffix = list(DEFAULT_FILE_READER_CLS.keys()) - - def _check_filter_directories(self, tree_obj_path: str) -> bool: - """Check if a tree object should be allowed based on the directories. - - :param `tree_obj_path`: path of the tree object i.e. 'gpt_index/readers' - - :return: True if the tree object should be allowed, False otherwise - """ - if self._filter_directories is None: - return True - filter_directories, filter_type = self._filter_directories - print_if_verbose( - self._verbose, - f"Checking {tree_obj_path} whether to {filter_type} it" - + f" based on the filter directories: {filter_directories}", - ) - - if filter_type == self.FilterType.EXCLUDE: - print_if_verbose( - self._verbose, - f"Checking if {tree_obj_path} is not a subdirectory of any of the filter directories", - ) - return not any( - tree_obj_path.startswith(directory) for directory in filter_directories - ) - if filter_type == self.FilterType.INCLUDE: - print_if_verbose( - self._verbose, - f"Checking if {tree_obj_path} is a subdirectory of any of the filter directories", - ) - return any( - tree_obj_path.startswith(directory) - or directory.startswith(tree_obj_path) - for directory in filter_directories - ) - raise ValueError( - f"Unknown filter type: {filter_type}. " - "Please use either 'INCLUDE' or 'EXCLUDE'." - ) - - def _check_filter_file_extensions(self, tree_obj_path: str) -> bool: - """Check if a tree object should be allowed based on the file extensions. - - :param `tree_obj_path`: path of the tree object i.e. 
'gpt_index/indices' - - :return: True if the tree object should be allowed, False otherwise - """ - if self._filter_file_extensions is None: - return True - filter_file_extensions, filter_type = self._filter_file_extensions - print_if_verbose( - self._verbose, - f"Checking {tree_obj_path} whether to {filter_type} it" - + f" based on the filter file extensions: {filter_file_extensions}", - ) - - if filter_type == self.FilterType.EXCLUDE: - return get_file_extension(tree_obj_path) not in filter_file_extensions - if filter_type == self.FilterType.INCLUDE: - return get_file_extension(tree_obj_path) in filter_file_extensions - raise ValueError( - f"Unknown filter type: {filter_type}. " - "Please use either 'INCLUDE' or 'EXCLUDE'." - ) - - def _allow_tree_obj(self, tree_obj_path: str, tree_obj_type: str) -> bool: - """Check if a tree object should be allowed. - - :param `tree_obj_path`: path of the tree object - - :return: True if the tree object should be allowed, False otherwise - - """ - if self._filter_directories is not None and tree_obj_type == "tree": - return self._check_filter_directories(tree_obj_path) - - if self._filter_file_extensions is not None and tree_obj_type == "blob": - return self._check_filter_directories( - tree_obj_path - ) and self._check_filter_file_extensions(tree_obj_path) - - return True - - def _load_data_from_commit(self, commit_sha: str) -> List[DocumentNode]: - """Load data from a commit. - - Loads github repository data from a specific commit sha. - - :param `commit`: commit sha - - :return: list of documents - """ - commit_response: GitCommitResponseModel = self._loop.run_until_complete( - self._github_client.get_commit(self._owner, self._repo, commit_sha) - ) - - tree_sha = commit_response.commit.tree.sha - blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha)) - - print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs") - - return self._loop.run_until_complete( - self._generate_documents(blobs_and_paths=blobs_and_paths) - ) - - def _load_data_from_branch(self, branch: str) -> List[DocumentNode]: - """Load data from a branch. - - Loads github repository data from a specific branch. - - :param `branch`: branch name - - :return: list of documents - """ - branch_data: GitBranchResponseModel = self._loop.run_until_complete( - self._github_client.get_branch(self._owner, self._repo, branch) - ) - - tree_sha = branch_data.commit.commit.tree.sha - blobs_and_paths = self._loop.run_until_complete(self._recurse_tree(tree_sha)) - - print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs") - - return self._loop.run_until_complete( - self._generate_documents(blobs_and_paths=blobs_and_paths) - ) - - def load_data( - self, - commit_sha: Optional[str] = None, - branch: Optional[str] = None, - ) -> List[DocumentNode]: - """Load data from a commit or a branch. - - Loads github repository data from a specific commit sha or a branch. 
- - :param `commit`: commit sha - :param `branch`: branch name - - :return: list of documents - """ - self.commit_sha = (commit_sha,) - self.branch = branch - - if commit_sha is not None and branch is not None: - raise ValueError("You can only specify one of commit or branch.") - - if commit_sha is None and branch is None: - raise ValueError("You must specify one of commit or branch.") - - if commit_sha is not None: - return self._load_data_from_commit(commit_sha) - - if branch is not None: - return self._load_data_from_branch(branch) - - raise ValueError("You must specify one of commit or branch.") - - async def _recurse_tree( - self, - tree_sha: str, - current_path: str = "", - current_depth: int = 0, - max_depth: int = -1, - ) -> Any: - """Recursively get all blob tree objects in a tree. - - And construct their full path relative to the root of the repository. - (see GitTreeResponseModel.GitTreeObject in - github_api_client.py for more information) - - :param `tree_sha`: sha of the tree to recurse - :param `current_path`: current path of the tree - :param `current_depth`: current depth of the tree - :return: list of tuples of - (tree object, file's full path realtive to the root of the repo) - """ - if max_depth != -1 and current_depth > max_depth: - return [] - - blobs_and_full_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]] = [] - print_if_verbose( - self._verbose, - "\t" * current_depth + f"current path: {current_path}", - ) - - tree_data: GitTreeResponseModel = await self._github_client.get_tree( - self._owner, self._repo, tree_sha - ) - print_if_verbose( - self._verbose, "\t" * current_depth + f"tree data: {tree_data}" - ) - print_if_verbose( - self._verbose, "\t" * current_depth + f"processing tree {tree_sha}" - ) - for tree_obj in tree_data.tree: - file_path = os.path.join(current_path, tree_obj.path) - if not self._allow_tree_obj(file_path, tree_obj.type): - print_if_verbose( - self._verbose, - "\t" * current_depth + f"ignoring {tree_obj.path} due to filter", - ) - continue - - print_if_verbose( - self._verbose, - "\t" * current_depth + f"tree object: {tree_obj}", - ) - - if tree_obj.type == "tree": - print_if_verbose( - self._verbose, - "\t" * current_depth + f"recursing into {tree_obj.path}", - ) - - blobs_and_full_paths.extend( - await self._recurse_tree( - tree_obj.sha, file_path, current_depth + 1, max_depth - ) - ) - elif tree_obj.type == "blob": - print_if_verbose( - self._verbose, - "\t" * current_depth + f"found blob {tree_obj.path}", - ) - - blobs_and_full_paths.append((tree_obj, file_path)) - - print_if_verbose( - self._verbose, - "\t" * current_depth + f"blob and full paths: {blobs_and_full_paths}", - ) - return blobs_and_full_paths - - async def _generate_documents( - self, - blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]], - ) -> List[DocumentNode]: - """Generate documents from a list of blobs and their full paths. 
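
The `_recurse_tree` coroutine above is a plain depth-first walk over the Git trees API, accumulating `(blob, full_path)` pairs. A minimal sketch of the same traversal against a hand-built, in-memory tree; the `FAKE_TREES` dict and function name are illustrative stand-ins, not part of the reader:

```python
import asyncio
import os
from typing import Dict, List, Tuple

# Hypothetical in-memory "repository" used only to exercise the traversal;
# the real reader asks the GitHub trees API for each level instead.
FAKE_TREES: Dict[str, List[dict]] = {
    "root": [
        {"path": "docs", "type": "tree", "sha": "docs_sha"},
        {"path": "README.md", "type": "blob", "sha": "readme_sha"},
    ],
    "docs_sha": [
        {"path": "guide.md", "type": "blob", "sha": "guide_sha"},
    ],
}


async def recurse_tree(tree_sha: str, current_path: str = "") -> List[Tuple[dict, str]]:
    """Depth-first walk returning (blob, full_path) pairs, mirroring _recurse_tree
    but without filtering or depth limits."""
    blobs: List[Tuple[dict, str]] = []
    for obj in FAKE_TREES.get(tree_sha, []):
        full_path = os.path.join(current_path, obj["path"])
        if obj["type"] == "tree":
            blobs.extend(await recurse_tree(obj["sha"], full_path))
        elif obj["type"] == "blob":
            blobs.append((obj, full_path))
    return blobs


if __name__ == "__main__":
    for blob, path in asyncio.run(recurse_tree("root")):
        print(path, blob["sha"])
    # docs/guide.md guide_sha
    # README.md readme_sha
```
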
- - :param `blobs_and_paths`: list of tuples of - (tree object, file's full path in the repo realtive to the root of the repo) - :return: list of documents - """ - buffered_iterator = BufferedGitBlobDataIterator( - blobs_and_paths=blobs_and_paths, - github_client=self._github_client, - owner=self._owner, - repo=self._repo, - loop=self._loop, - buffer_size=self._concurrent_requests, # TODO: make this configurable - verbose=self._verbose, - ) - - documents = [] - async for blob_data, full_path in buffered_iterator: - print_if_verbose(self._verbose, f"generating DocumentNode for {full_path}") - assert ( - blob_data.encoding == "base64" - ), f"blob encoding {blob_data.encoding} not supported" - decoded_bytes = None - try: - decoded_bytes = base64.b64decode(blob_data.content) - del blob_data.content - except binascii.Error: - print_if_verbose( - self._verbose, f"could not decode {full_path} as base64" - ) - continue - - metadata = { - "owner": self._owner, - "repo": self._repo, - "commit_sha": self.commit_sha, - "branch": self.branch, - "file_path": full_path, - "file_name": full_path.split("/")[-1], - } - - if self._use_parser: - DocumentNode = self._parse_supported_file( - file_path=full_path, - file_content=decoded_bytes, - tree_sha=blob_data.sha, - tree_path=full_path, - metadata=metadata, - ) - if DocumentNode is not None: - documents.append(DocumentNode) - continue - print_if_verbose( - self._verbose, - f"could not parse {full_path} as a supported file type" - + " - falling back to decoding as utf-8 raw text", - ) - - try: - if decoded_bytes is None: - raise ValueError("decoded_bytes is None") - decoded_text = decoded_bytes.decode("utf-8") - except UnicodeDecodeError: - print_if_verbose( - self._verbose, f"could not decode {full_path} as utf-8" - ) - continue - print_if_verbose( - self._verbose, - f"got {len(decoded_text)} characters" - + f"- adding to documents - {full_path}", - ) - DocumentNode = DocumentNode( - text=decoded_text, - doc_id=blob_data.sha, - extra_info=metadata, - ) - documents.append(DocumentNode) - return documents - - def _parse_supported_file( - self, - file_path: str, - file_content: bytes, - tree_sha: str, - tree_path: str, - metadata: dict, - ) -> Optional[DocumentNode]: - """Parse a file if it is supported by a parser. 
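
Blob content comes back base64-encoded, and the DocumentNode generation above degrades gracefully when a blob is not valid base64 or not valid UTF-8. A compact sketch of that decode path (the function name is illustrative):

```python
import base64
import binascii
from typing import Optional


def decode_blob(content_b64: str) -> Optional[str]:
    """Mirror the reader's two-step decoding: base64 -> bytes, then UTF-8 -> text.
    Returns None when either step fails, which is how unparseable blobs get skipped."""
    try:
        decoded_bytes = base64.b64decode(content_b64)
    except binascii.Error:
        return None  # not valid base64
    try:
        return decoded_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return None  # binary blob, e.g. an image


if __name__ == "__main__":
    print(decode_blob(base64.b64encode("hello".encode("utf-8")).decode("ascii")))  # hello
    print(decode_blob(base64.b64encode(b"\xff\xfe\x00").decode("ascii")))          # None
```
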
- - :param `file_path`: path of the file in the repo - :param `file_content`: content of the file - :return: DocumentNode if the file is supported by a parser, None otherwise - """ - metadata["file_path"] = file_path - metadata["file_name"] = tree_path - - file_extension = get_file_extension(file_path) - if file_extension not in self._supported_suffix: - # skip - return None - - if file_extension not in self._file_readers: - # initialize reader - cls_ = DEFAULT_FILE_READER_CLS[file_extension] - self._file_readers[file_extension] = cls_() - - reader = self._file_readers[file_extension] - - print_if_verbose( - self._verbose, - f"parsing {file_path}" - + f"as {file_extension} with " - + f"{reader.__class__.__name__}", - ) - with tempfile.TemporaryDirectory() as tmpdirname, tempfile.NamedTemporaryFile( - dir=tmpdirname, - suffix=f".{file_extension}", - mode="w+b", - delete=False, - ) as tmpfile: - print_if_verbose( - self._verbose, - "created a temporary file" + f"{tmpfile.name} for parsing {file_path}", - ) - tmpfile.write(file_content) - tmpfile.flush() - tmpfile.close() - try: - docs = reader.load_data(pathlib.Path(tmpfile.name)) - parsed_file = "\n\n".join([doc.get_text() for doc in docs]) - except Exception as e: - print_if_verbose(self._verbose, f"error while parsing {file_path}") - logger.error( - "Error while parsing " - + f"{file_path} with " - + f"{reader.__class__.__name__}:\n{e}" - ) - parsed_file = None - finally: - os.remove(tmpfile.name) - if parsed_file is None: - return None - return DocumentNode( - text=parsed_file, - doc_id=tree_sha, - extra_info=metadata, - ) - - -if __name__ == "__main__": - import time - - def timeit(func: Callable) -> Callable: - """Time a function.""" - - def wrapper(*args: Any, **kwargs: Any) -> None: - """Callcuate time taken to run a function.""" - start = time.time() - func(*args, **kwargs) - end = time.time() - print(f"Time taken: {end - start} seconds for {func.__name__}") - - return wrapper - - github_client = GithubClient(github_token=os.environ["GITHUB_TOKEN"], verbose=True) - - reader1 = GithubRepositoryReader( - github_client=github_client, - owner="jerryjliu", - repo="gpt_index", - use_parser=False, - verbose=True, - filter_directories=( - ["docs"], - GithubRepositoryReader.FilterType.INCLUDE, - ), - filter_file_extensions=( - [ - ".png", - ".jpg", - ".jpeg", - ".gif", - ".svg", - ".ico", - "json", - ".ipynb", - ], - GithubRepositoryReader.FilterType.EXCLUDE, - ), - ) - - @timeit - def load_data_from_commit() -> None: - """Load data from a commit.""" - documents = reader1.load_data( - commit_sha="22e198b3b166b5facd2843d6a62ac0db07894a13" - ) - for DocumentNode in documents: - print(DocumentNode.extra_info) - - @timeit - def load_data_from_branch() -> None: - """Load data from a branch.""" - documents = reader1.load_data(branch="main") - for DocumentNode in documents: - print(DocumentNode.extra_info) - - input("Press enter to load github repository from branch name...") - - load_data_from_branch() - - # input("Press enter to load github repository from commit sha...") - - # load_data_from_commit() diff --git a/nextpy/ai/rag/document_loaders/github_repo/github_client.py b/nextpy/ai/rag/document_loaders/github_repo/github_client.py deleted file mode 100644 index 01a8c809..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/github_client.py +++ /dev/null @@ -1,432 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. 
-# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Github API client for the GPT-Index library. - -This module contains the Github API client for the GPT-Index library. -It is used by the Github readers to retrieve the data from Github. -""" - -import os -from dataclasses import dataclass -from typing import Any, Dict, List, Optional - -from dataclasses_json import DataClassJsonMixin - - -@dataclass -class GitTreeResponseModel(DataClassJsonMixin): - """Dataclass for the response from the Github API's getTree endpoint. - - Attributes: - - sha (str): SHA1 checksum ID of the tree. - - url (str): URL for the tree. - - tree (List[GitTreeObject]): List of objects in the tree. - - truncated (bool): Whether the tree is truncated. - - Examples: - >>> tree = client.get_tree("owner", "repo", "branch") - >>> tree.sha - """ - - @dataclass - class GitTreeObject(DataClassJsonMixin): - """Dataclass for the objects in the tree. - - Attributes: - - path (str): Path to the object. - - mode (str): Mode of the object. - - type (str): Type of the object. - - sha (str): SHA1 checksum ID of the object. - - url (str): URL for the object. - - size (Optional[int]): Size of the object (only for blobs). - """ - - path: str - mode: str - type: str - sha: str - url: str - size: Optional[int] = None - - sha: str - url: str - tree: List[GitTreeObject] - truncated: bool - - -@dataclass -class GitBlobResponseModel(DataClassJsonMixin): - """Dataclass for the response from the Github API's getBlob endpoint. - - Attributes: - - content (str): Content of the blob. - - encoding (str): Encoding of the blob. - - url (str): URL for the blob. - - sha (str): SHA1 checksum ID of the blob. - - size (int): Size of the blob. - - node_id (str): Node ID of the blob. - """ - - content: str - encoding: str - url: str - sha: str - size: int - node_id: str - - -@dataclass -class GitCommitResponseModel(DataClassJsonMixin): - """Dataclass for the response from the Github API's getCommit endpoint. - - Attributes: - - tree (Tree): Tree object for the commit. - """ - - @dataclass - class Commit(DataClassJsonMixin): - """Dataclass for the commit object in the commit. (commit.commit).""" - - @dataclass - class Tree(DataClassJsonMixin): - """Dataclass for the tree object in the commit. - - Attributes: - - sha (str): SHA for the commit - """ - - sha: str - - tree: Tree - - commit: Commit - url: str - sha: str - - -@dataclass -class GitBranchResponseModel(DataClassJsonMixin): - """Dataclass for the response from the Github API's getBranch endpoint. - - Attributes: - - commit (Commit): Commit object for the branch. - """ - - @dataclass - class Commit(DataClassJsonMixin): - """Dataclass for the commit object in the branch. (commit.commit).""" - - @dataclass - class Commit(DataClassJsonMixin): - """Dataclass for the commit object in the commit. (commit.commit.tree).""" - - @dataclass - class Tree(DataClassJsonMixin): - """Dataclass for the tree object in the commit. - - Usage: commit.commit.tree.sha - """ - - sha: str - - tree: Tree - - commit: Commit - - @dataclass - class Links(DataClassJsonMixin): - self: str - html: str - - commit: Commit - name: str - _links: Links - - -from typing import Protocol - - -class BaseGithubClient(Protocol): - def get_all_endpoints(self) -> Dict[str, str]: - ... 
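
The response models above lean on dataclasses_json's `DataClassJsonMixin`, which gives each dataclass a `from_json` classmethod that also populates nested dataclasses. A trimmed, illustrative pair of models (not the full field set above) showing that round trip:

```python
from dataclasses import dataclass
from typing import List, Optional

from dataclasses_json import DataClassJsonMixin


@dataclass
class TreeObject(DataClassJsonMixin):
    path: str
    type: str
    sha: str
    size: Optional[int] = None


@dataclass
class TreeResponse(DataClassJsonMixin):
    sha: str
    tree: List[TreeObject]
    truncated: bool


if __name__ == "__main__":
    # from_json/to_json come from DataClassJsonMixin and handle the nesting.
    payload = (
        '{"sha": "abc123", "truncated": false, '
        '"tree": [{"path": "README.md", "type": "blob", "sha": "def456", "size": 120}]}'
    )
    response = TreeResponse.from_json(payload)
    print(response.tree[0].path)  # README.md
```
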
- - async def request( - self, - endpoint: str, - method: str, - headers: Dict[str, Any] = {}, - **kwargs: Any, - ) -> Any: - ... - - async def get_tree( - self, - owner: str, - repo: str, - tree_sha: str, - ) -> GitTreeResponseModel: - ... - - async def get_blob( - self, - owner: str, - repo: str, - file_sha: str, - ) -> GitBlobResponseModel: - ... - - async def get_commit( - self, - owner: str, - repo: str, - commit_sha: str, - ) -> GitCommitResponseModel: - ... - - async def get_branch( - self, - owner: str, - repo: str, - branch_name: str, - ) -> GitBranchResponseModel: - ... - - -class GithubClient: - """An asynchronous client for interacting with the Github API. - - This client is used for making API requests to Github. - It provides methods for accessing the Github API endpoints. - The client requires a Github token for authentication, - which can be passed as an argument or set as an environment variable. - If no Github token is provided, the client will raise a ValueError. - - Examples: - >>> client = GithubClient("my_github_token") - >>> branch_info = client.get_branch("owner", "repo", "branch") - """ - - DEFAULT_BASE_URL = "https://api.github.com" - DEFAULT_API_VERSION = "2022-11-28" - - def __init__( - self, - github_token: Optional[str] = None, - base_url: str = DEFAULT_BASE_URL, - api_version: str = DEFAULT_API_VERSION, - verbose: bool = False, - ) -> None: - """Initialize the GithubClient. - - Args: - - github_token (str): Github token for authentication. - If not provided, the client will try to get it from - the GITHUB_TOKEN environment variable. - - base_url (str): Base URL for the Github API - (defaults to "https://api.github.com"). - - api_version (str): Github API version (defaults to "2022-11-28"). - - Raises: - ValueError: If no Github token is provided. - """ - if github_token is None: - github_token = os.getenv("GITHUB_TOKEN") - if github_token is None: - raise ValueError( - "Please provide a Github token. " - + "You can do so by passing it as an argument to the GithubReader," - + "or by setting the GITHUB_TOKEN environment variable." - ) - - self._base_url = base_url - self._api_version = api_version - self._verbose = verbose - - self._endpoints = { - "getTree": "/repos/{owner}/{repo}/git/trees/{tree_sha}", - "getBranch": "/repos/{owner}/{repo}/branches/{branch}", - "getBlob": "/repos/{owner}/{repo}/git/blobs/{file_sha}", - "getCommit": "/repos/{owner}/{repo}/commits/{commit_sha}", - } - - self._headers = { - "Accept": "application/vnd.github+json", - "Authorization": f"Bearer {github_token}", - "X-GitHub-Api-Version": f"{self._api_version}", - } - - def get_all_endpoints(self) -> Dict[str, str]: - """Get all available endpoints.""" - return {**self._endpoints} - - async def request( - self, - endpoint: str, - method: str, - headers: Dict[str, Any] = {}, - **kwargs: Any, - ) -> Any: - """Make an API request to the Github API. - - This method is used for making API requests to the Github API. - It is used internally by the other methods in the client. - - Args: - - `endpoint (str)`: Name of the endpoint to make the request to. - - `method (str)`: HTTP method to use for the request. - - `headers (dict)`: HTTP headers to include in the request. - - `**kwargs`: Keyword arguments to pass to the endpoint URL. - - Returns: - - `response (httpx.Response)`: Response from the API request. - - Raises: - - ImportError: If the `httpx` library is not installed. - - httpx.HTTPError: If the API request fails. 
- - Examples: - >>> response = client.request("getTree", "GET", - owner="owner", repo="repo", - tree_sha="tree_sha") - """ - try: - import httpx - except ImportError: - raise ImportError( - "Please install httpx to use the GithubRepositoryReader. " - "You can do so by running `pip install httpx`." - ) - - _headers = {**self._headers, **headers} - - _client: httpx.AsyncClient - async with httpx.AsyncClient( - headers=_headers, base_url=self._base_url - ) as _client: - try: - response = await _client.request( - method, url=self._endpoints[endpoint].format(**kwargs) - ) - except httpx.HTTPError as excp: - print(f"HTTP Exception for {excp.request.url} - {excp}") - raise excp - return response - - async def get_branch( - self, owner: str, repo: str, branch: str - ) -> GitBranchResponseModel: - """Get information about a branch. (Github API endpoint: getBranch). - - Args: - - `owner (str)`: Owner of the repository. - - `repo (str)`: Name of the repository. - - `branch (str)`: Name of the branch. - - Returns: - - `branch_info (GitBranchResponseModel)`: Information about the branch. - - Examples: - >>> branch_info = client.get_branch("owner", "repo", "branch") - """ - return GitBranchResponseModel.from_json( - ( - await self.request( - "getBranch", "GET", owner=owner, repo=repo, branch=branch - ) - ).text - ) - - async def get_tree( - self, owner: str, repo: str, tree_sha: str - ) -> GitTreeResponseModel: - """Get information about a tree. (Github API endpoint: getTree). - - Args: - - `owner (str)`: Owner of the repository. - - `repo (str)`: Name of the repository. - - `tree_sha (str)`: SHA of the tree. - - Returns: - - `tree_info (GitTreeResponseModel)`: Information about the tree. - - Examples: - >>> tree_info = client.get_tree("owner", "repo", "tree_sha") - """ - return GitTreeResponseModel.from_json( - ( - await self.request( - "getTree", "GET", owner=owner, repo=repo, tree_sha=tree_sha - ) - ).text - ) - - async def get_blob( - self, owner: str, repo: str, file_sha: str - ) -> GitBlobResponseModel: - """Get information about a blob. (Github API endpoint: getBlob). - - Args: - - `owner (str)`: Owner of the repository. - - `repo (str)`: Name of the repository. - - `file_sha (str)`: SHA of the file. - - Returns: - - `blob_info (GitBlobResponseModel)`: Information about the blob. - - Examples: - >>> blob_info = client.get_blob("owner", "repo", "file_sha") - """ - return GitBlobResponseModel.from_json( - ( - await self.request( - "getBlob", "GET", owner=owner, repo=repo, file_sha=file_sha - ) - ).text - ) - - async def get_commit( - self, owner: str, repo: str, commit_sha: str - ) -> GitCommitResponseModel: - """Get information about a commit. (Github API endpoint: getCommit). - - Args: - - `owner (str)`: Owner of the repository. - - `repo (str)`: Name of the repository. - - `commit_sha (str)`: SHA of the commit. - - Returns: - - `commit_info (GitCommitResponseModel)`: Information about the commit. 
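
The `request()` helper above is a thin wrapper over httpx: it formats a named endpoint template with keyword arguments and sends it with the standard GitHub headers. A minimal sketch of the same pattern; the token and repository values below are placeholders, and a real token plus network access are assumed:

```python
import asyncio
from typing import Any, Dict

import httpx

BASE_URL = "https://api.github.com"
ENDPOINTS = {"getTree": "/repos/{owner}/{repo}/git/trees/{tree_sha}"}


async def github_request(endpoint: str, token: str, **kwargs: Any) -> httpx.Response:
    """Format the endpoint template with kwargs and issue the request with the
    standard GitHub headers, as the client's request() method does."""
    headers: Dict[str, str] = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    async with httpx.AsyncClient(headers=headers, base_url=BASE_URL) as client:
        response = await client.request("GET", url=ENDPOINTS[endpoint].format(**kwargs))
        response.raise_for_status()
        return response


if __name__ == "__main__":
    # Placeholder token; substitute a real one (e.g. from GITHUB_TOKEN).
    resp = asyncio.run(
        github_request("getTree", token="ghp_example", owner="octocat",
                       repo="Hello-World", tree_sha="master")
    )
    print(resp.json()["sha"])
```
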
- - Examples: - >>> commit_info = client.get_commit("owner", "repo", "commit_sha") - """ - return GitCommitResponseModel.from_json( - ( - await self.request( - "getCommit", "GET", owner=owner, repo=repo, commit_sha=commit_sha - ) - ).text - ) - - -if __name__ == "__main__": - import asyncio - - async def main() -> None: - """Test the GithubClient.""" - client = GithubClient() - response = await client.get_tree( - owner="ahmetkca", repo="CommitAI", tree_sha="with-body" - ) - - for obj in response.tree: - if obj.type == "blob": - print(obj.path) - print(obj.sha) - blob_response = await client.get_blob( - owner="ahmetkca", repo="CommitAI", file_sha=obj.sha - ) - print(blob_response.content) - - asyncio.run(main()) diff --git a/nextpy/ai/rag/document_loaders/github_repo/requirements.txt b/nextpy/ai/rag/document_loaders/github_repo/requirements.txt deleted file mode 100644 index 79228389..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -httpx \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/github_repo/utils.py b/nextpy/ai/rag/document_loaders/github_repo/utils.py deleted file mode 100644 index 29637175..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo/utils.py +++ /dev/null @@ -1,174 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Github readers utils. - -This module contains utility functions for the Github readers. -""" -import asyncio -import os -import sys -import time -from abc import ABC, abstractmethod -from typing import List, Tuple - -if "pytest" in sys.modules: - from llama_hub.github_repo.github_client import ( - GitBlobResponseModel, - GithubClient, - GitTreeResponseModel, - ) -else: - from llama_hub.github_repo.github_client import ( - GitBlobResponseModel, - GithubClient, - GitTreeResponseModel, - ) - - -def print_if_verbose(verbose: bool, message: str) -> None: - """Log message if verbose is True.""" - if verbose: - print(message) - - -def get_file_extension(filename: str) -> str: - """Get file extension.""" - return f".{os.path.splitext(filename)[1][1:].lower()}" - - -class BufferedAsyncIterator(ABC): - """Base class for buffered async iterators. - - This class is to be used as a base class for async iterators - that need to buffer the results of an async operation. - The async operation is defined in the _fill_buffer method. - The _fill_buffer method is called when the buffer is empty. - """ - - def __init__(self, buffer_size: int): - """Initialize params. - - Args: - - `buffer_size (int)`: Size of the buffer. - It is also the number of items that will - be retrieved from the async operation at once. - see _fill_buffer. Defaults to 2. Setting it to 1 - will result in the same behavior as a synchronous iterator. - """ - self._buffer_size = buffer_size - self._buffer: List[Tuple[GitBlobResponseModel, str]] = [] - self._index = 0 - - @abstractmethod - async def _fill_buffer(self) -> None: - raise NotImplementedError - - def __aiter__(self) -> "BufferedAsyncIterator": - """Return the iterator object.""" - return self - - async def __anext__(self) -> Tuple[GitBlobResponseModel, str]: - """Get next item. - - Returns: - - `item (Tuple[GitBlobResponseModel, str])`: Next item. - - Raises: - - `StopAsyncIteration`: If there are no more items. 
- """ - if not self._buffer: - await self._fill_buffer() - - if not self._buffer: - raise StopAsyncIteration - - item = self._buffer.pop(0) - self._index += 1 - return item - - -class BufferedGitBlobDataIterator(BufferedAsyncIterator): - """Buffered async iterator for Git blobs. - - This class is an async iterator that buffers the results of the get_blob operation. - It is used to retrieve the contents of the files in a Github repository. - getBlob endpoint supports up to 100 megabytes of content for blobs. - This concrete implementation of BufferedAsyncIterator allows you to lazily retrieve - the contents of the files in a Github repository. - Otherwise you would have to retrieve all the contents of - the files in the repository at once, which would - be problematic if the repository is large. - """ - - def __init__( - self, - blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]], - github_client: GithubClient, - owner: str, - repo: str, - loop: asyncio.AbstractEventLoop, - buffer_size: int, - verbose: bool = False, - ): - """Initialize params. - - Args: - - blobs_and_paths (List[Tuple[GitTreeResponseModel.GitTreeObject, str]]): - List of tuples containing the blob and the path of the file. - - github_client (GithubClient): Github client. - - owner (str): Owner of the repository. - - repo (str): Name of the repository. - - loop (asyncio.AbstractEventLoop): Event loop. - - buffer_size (int): Size of the buffer. - """ - super().__init__(buffer_size) - self._blobs_and_paths = blobs_and_paths - self._github_client = github_client - self._owner = owner - self._repo = repo - self._verbose = verbose - if loop is None: - loop = asyncio.get_event_loop() - if loop is None: - raise ValueError("No event loop found") - - async def _fill_buffer(self) -> None: - """Fill the buffer with the results of the get_blob operation. - - The get_blob operation is called for each blob in the blobs_and_paths list. - The blobs are retrieved in batches of size buffer_size. - """ - del self._buffer[:] - self._buffer = [] - start = self._index - end = min(start + self._buffer_size, len(self._blobs_and_paths)) - - if start >= end: - return - - if self._verbose: - start_t = time.time() - results: List[GitBlobResponseModel] = await asyncio.gather( - *[ - self._github_client.get_blob(self._owner, self._repo, blob.sha) - for blob, _ in self._blobs_and_paths[ - start:end - ] # TODO: use batch_size instead of buffer_size for concurrent requests - ] - ) - if self._verbose: - end_t = time.time() - blob_names_and_sizes = [ - (blob.path, blob.size) for blob, _ in self._blobs_and_paths[start:end] - ] - print( - "Time to get blobs (" - + f"{blob_names_and_sizes}" - + f"): {end_t - start_t:.2f} seconds" - ) - - self._buffer = [ - (result, path) - for result, (_, path) in zip(results, self._blobs_and_paths[start:end]) - ] diff --git a/nextpy/ai/rag/document_loaders/github_repo_issues/README.md b/nextpy/ai/rag/document_loaders/github_repo_issues/README.md deleted file mode 100644 index a2d3c419..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo_issues/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# GitHub Repository Issues Loader - -A loader that fetches issues of a GitHub repository. It expects an `owner` and `repo` as parameters. - -To use it, a "classic" personal access token with the `read:org` and `read:project` scopes is required for public repos, for private repos you also need `repo`. 
-See [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for instructions. - -## Usage - -To use this loader, pass an `owner` and `repo` for which the GitHub token has permissions. -```shell -export GITHUB_TOKEN='...' -``` - -```python -import os - -from llama_hub.github_repo_issues import GitHubRepositoryIssuesReader, GitHubIssuesClient - -github_client = GitHubIssuesClient() -loader = GitHubRepositoryIssuesReader( - github_client, - owner = "jerryjliu", - repo = "llama_index", - verbose = True, -) - -docs = loader.load_data() - -for doc in docs: - print(doc.extra_info) -``` - -## Examples - -This loader designed to be used as a way to load data into [Llama Index](https://github.com/jerryjliu/llama_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. -### Llama Index - -```shell -export OPENAI_API_KEY='...' -export GITHUB_TOKEN='...' -``` - -```python -import pickle -import os - -from nextpy.ai import download_loader, VectorDBIndex -from llama_hub.github_repo_issues import GitHubIssuesClient, GitHubRepositoryIssuesReader - -docs = None -if os.path.exists("docs.pkl"): - with open("docs.pkl", "rb") as f: - docs = pickle.load(f) - -if docs is None: - loader = GitHubRepositoryIssuesReader( - GitHubIssuesClient(), - owner = "jerryjliu", - repo = "llama_index", - verbose = True, - ) - - docs = loader.load_data() - - with open("docs.pkl", "wb") as f: - pickle.dump(docs, f) - -index = VectorDBIndex.from_documents(docs) - -query_engine = index.as_query_engine() -response = query_engine.query("Summarize issues that mention stream") -print(response) -``` diff --git a/nextpy/ai/rag/document_loaders/github_repo_issues/__init__.py b/nextpy/ai/rag/document_loaders/github_repo_issues/__init__.py deleted file mode 100644 index 53df1a9c..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo_issues/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" - -from .base import GitHubRepositoryIssuesReader -from .github_client import GitHubIssuesClient - -__all__ = ["GitHubRepositoryIssuesReader", "GitHubIssuesClient"] diff --git a/nextpy/ai/rag/document_loaders/github_repo_issues/base.py b/nextpy/ai/rag/document_loaders/github_repo_issues/base.py deleted file mode 100644 index 393dda5d..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo_issues/base.py +++ /dev/null @@ -1,234 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""GitHub repository issues reader. - -Retrieves the list of issues of a GitHub repository and converts them to documents. - -Each issue is converted to a DocumentNode by doing the following: - - - The text of the DocumentNode is the concatenation of the title and the body of the issue. - - The title of the DocumentNode is the title of the issue. - - The doc_id of the DocumentNode is the issue number. 
- - The extra_info of the DocumentNode is a dictionary with the following keys: - - state: State of the issue. - - created_at: Date when the issue was created. - - closed_at: Date when the issue was closed. Only present if the issue is closed. - - url: URL of the issue. - - assignee: Login of the user assigned to the issue. Only present if the issue is assigned. - - The embedding of the DocumentNode is not set. - - The doc_hash of the DocumentNode is not set. - -""" -import asyncio -import enum -import logging -import sys -from typing import Dict, List, Optional, Tuple - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -if "pytest" in sys.modules: - from llama_hub.github_repo_issues.github_client import ( - BaseGitHubIssuesClient, - GitHubIssuesClient, - ) -else: - from llama_hub.github_repo_issues.github_client import ( - BaseGitHubIssuesClient, - GitHubIssuesClient, - ) - - -logger = logging.getLogger(__name__) - - -def print_if_verbose(verbose: bool, message: str) -> None: - """Log message if verbose is True.""" - if verbose: - print(message) - - -class GitHubRepositoryIssuesReader(BaseReader): - """GitHub repository issues reader. - - Retrieves the list of issues of a GitHub repository and returns a list of documents. - - Examples: - >>> reader = GitHubRepositoryIssuesReader("owner", "repo") - >>> issues = reader.load_data() - >>> print(issues) - - """ - - class IssueState(enum.Enum): - """Issue type. - - Used to decide what issues to retrieve. - - Attributes: - - OPEN: Just open issues. This is the default. - - CLOSED: Just closed issues. - - ALL: All issues, open and closed. - """ - - OPEN = "open" - CLOSED = "closed" - ALL = "all" - - class FilterType(enum.Enum): - """Filter type. - - Used to determine whether the filter is inclusive or exclusive. - """ - - EXCLUDE = enum.auto() - INCLUDE = enum.auto() - - def __init__( - self, - github_client: BaseGitHubIssuesClient, - owner: str, - repo: str, - verbose: bool = False, - ): - """Initialize params. - - Args: - - github_client (BaseGitHubIssuesClient): GitHub client. - - owner (str): Owner of the repository. - - repo (str): Name of the repository. - - verbose (bool): Whether to print verbose messages. - - Raises: - - `ValueError`: If the github_token is not provided and - the GITHUB_TOKEN environment variable is not set. - """ - super().__init__() - - self._owner = owner - self._repo = repo - self._verbose = verbose - - # Set up the event loop - try: - self._loop = asyncio.get_running_loop() - except RuntimeError: - # If there is no running loop, create a new one - self._loop = asyncio.new_event_loop() - asyncio.set_event_loop(self._loop) - - self._github_client = github_client - - def load_data( - self, - state: Optional[IssueState] = IssueState.OPEN, - labelFilters: Optional[List[Tuple[str, FilterType]]] = None, - ) -> List[DocumentNode]: - """Load issues from a repository and converts them to documents. - - Each issue is converted to a DocumentNode by doing the following: - - - The text of the DocumentNode is the concatenation of the title and the body of the issue. - - The title of the DocumentNode is the title of the issue. - - The doc_id of the DocumentNode is the issue number. - - The extra_info of the DocumentNode is a dictionary with the following keys: - - state: State of the issue. - - created_at: Date when the issue was created. - - closed_at: Date when the issue was closed. Only present if the issue is closed. - - url: URL of the issue. 
- - assignee: Login of the user assigned to the issue. Only present if the issue is assigned. - - The embedding of the DocumentNode is None. - - The doc_hash of the DocumentNode is None. - - Args: - - state (IssueState): State of the issues to retrieve. Default is IssueState.OPEN. - - labelFilters: an optional list of filters to apply to the issue list based on labels. - - :return: list of documents - """ - documents = [] - page = 1 - # Loop until there are no more issues - while True: - issues: Dict = self._loop.run_until_complete( - self._github_client.get_issues( - self._owner, self._repo, state=state.value, page=page - ) - ) - - if len(issues) == 0: - print_if_verbose(self._verbose, "No more issues found, stopping") - - break - print_if_verbose( - self._verbose, f"Found {len(issues)} issues in the repo page {page}" - ) - page += 1 - filterCount = 0 - for issue in issues: - if not self._must_include(labelFilters, issue): - filterCount += 1 - continue - title = issue["title"] - body = issue["body"] - DocumentNode = DocumentNode( - doc_id=str(issue["number"]), - text=f"{title}\n{body}", - ) - metadata = { - "owner": self._owner, - "repo": self._repo, - "state": issue["state"], - "created_at": issue["created_at"], - # url is the API URL - "url": issue["url"], - # source is the HTML URL, more conveninent for humans - "source": issue["html_url"], - } - if issue["closed_at"] is not None: - metadata["closed_at"] = issue["closed_at"] - if issue["assignee"] is not None: - metadata["assignee"] = issue["assignee"]["login"] - DocumentNode.extra_info = metadata - documents.append(DocumentNode) - - print_if_verbose(self._verbose, f"Resulted in {len(documents)} documents") - if labelFilters is not None: - print_if_verbose(self._verbose, f"Filtered out {filterCount} issues") - - return documents - - def _must_include(self, labelFilters, issue): - if labelFilters is None: - return True - labels = [label["name"] for label in issue["labels"]] - for labelFilter in labelFilters: - label = labelFilter[0] - filterType = labelFilter[1] - # Only include issues with the label and value - if filterType == self.FilterType.INCLUDE: - return label in labels - elif filterType == self.FilterType.EXCLUDE: - return label not in labels - - return True - - -if __name__ == "__main__": - """Load all issues in the repo labeled as bug.""" - github_client = GitHubIssuesClient(verbose=True) - - reader = GitHubRepositoryIssuesReader( - github_client=github_client, - owner="moncho", - repo="dry", - verbose=True, - ) - - documents = reader.load_data( - state=GitHubRepositoryIssuesReader.IssueState.ALL, - labelFilters=[("bug", GitHubRepositoryIssuesReader.FilterType.INCLUDE)], - ) - print(f"Got {len(documents)} documents") diff --git a/nextpy/ai/rag/document_loaders/github_repo_issues/github_client.py b/nextpy/ai/rag/document_loaders/github_repo_issues/github_client.py deleted file mode 100644 index 3a6881ce..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo_issues/github_client.py +++ /dev/null @@ -1,203 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""GitHub API client for issues.""" - -import os -from typing import Any, Dict, Optional, Protocol - - -class BaseGitHubIssuesClient(Protocol): - def get_all_endpoints(self) -> Dict[str, str]: - ... 
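
Note that `_must_include` above returns as soon as it has looked at the first entry in `labelFilters`, so only one (label, FilterType) pair ever takes effect. A standalone sketch of a variant that evaluates every pair; the names are illustrative, not the reader's API:

```python
import enum
from typing import Dict, List, Tuple


class FilterType(enum.Enum):
    EXCLUDE = enum.auto()
    INCLUDE = enum.auto()


def must_include(label_filters: List[Tuple[str, FilterType]], issue: Dict) -> bool:
    """Return True only if the issue satisfies every (label, FilterType) pair."""
    labels = [label["name"] for label in issue.get("labels", [])]
    for label, filter_type in label_filters:
        if filter_type == FilterType.INCLUDE and label not in labels:
            return False
        if filter_type == FilterType.EXCLUDE and label in labels:
            return False
    return True


if __name__ == "__main__":
    issue = {"labels": [{"name": "bug"}, {"name": "backend"}]}
    print(must_include([("bug", FilterType.INCLUDE)], issue))                                   # True
    print(must_include([("bug", FilterType.INCLUDE), ("wontfix", FilterType.EXCLUDE)], issue))  # True
    print(must_include([("wontfix", FilterType.INCLUDE)], issue))                               # False
```
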
- - async def request( - self, - endpoint: str, - method: str, - headers: Dict[str, Any] = {}, - params: Dict[str, Any] = {}, - **kwargs: Any, - ) -> Any: - ... - - async def get_issues( - self, - owner: str, - repo: str, - state: str = "open", - page: int = 1, - ) -> Dict: - ... - - -class GitHubIssuesClient: - """An asynchronous client for interacting with the GitHub API for issues. - - The client requires a GitHub token for authentication, which can be passed as an argument - or set as an environment variable. - If no GitHub token is provided, the client will raise a ValueError. - - Examples: - >>> client = GitHubIssuesClient("my_github_token") - >>> issues = client.get_issues("owner", "repo") - """ - - DEFAULT_BASE_URL = "https://api.github.com" - DEFAULT_API_VERSION = "2022-11-28" - - def __init__( - self, - github_token: Optional[str] = None, - base_url: str = DEFAULT_BASE_URL, - api_version: str = DEFAULT_API_VERSION, - verbose: bool = False, - ) -> None: - """Initialize the GitHubIssuesClient. - - Args: - - github_token (str): GitHub token for authentication. - If not provided, the client will try to get it from - the GITHUB_TOKEN environment variable. - - base_url (str): Base URL for the GitHub API - (defaults to "https://api.github.com"). - - api_version (str): GitHub API version (defaults to "2022-11-28"). - - Raises: - ValueError: If no GitHub token is provided. - """ - if github_token is None: - github_token = os.getenv("GITHUB_TOKEN") - if github_token is None: - raise ValueError( - "Please provide a GitHub token. " - + "You can do so by passing it as an argument to the GitHubReader," - + "or by setting the GITHUB_TOKEN environment variable." - ) - - self._base_url = base_url - self._api_version = api_version - self._verbose = verbose - - self._endpoints = { - "getIssues": "/repos/{owner}/{repo}/issues", - } - - self._headers = { - "Accept": "application/vnd.github+json", - "Authorization": f"Bearer {github_token}", - "X-GitHub-Api-Version": f"{self._api_version}", - } - - def get_all_endpoints(self) -> Dict[str, str]: - """Get all available endpoints.""" - return {**self._endpoints} - - async def request( - self, - endpoint: str, - method: str, - headers: Dict[str, Any] = {}, - params: Dict[str, Any] = {}, - **kwargs: Any, - ) -> Any: - """Makes an API request to the GitHub API. - - Args: - - `endpoint (str)`: Name of the endpoint to make the request to. - - `method (str)`: HTTP method to use for the request. - - `headers (dict)`: HTTP headers to include in the request. - - `**kwargs`: Keyword arguments to pass to the endpoint URL. - - Returns: - - `response (httpx.Response)`: Response from the API request. - - Raises: - - ImportError: If the `httpx` library is not installed. - - httpx.HTTPError: If the API request fails. 
- - Examples: - >>> response = client.request("getIssues", "GET", - owner="owner", repo="repo", state="all") - """ - try: - import httpx - except ImportError: - raise ImportError( - "`https` package not found, please run `pip install httpx`" - ) - - _headers = {**self._headers, **headers} - - _client: httpx.AsyncClient - async with httpx.AsyncClient( - headers=_headers, base_url=self._base_url, params=params - ) as _client: - try: - response = await _client.request( - method, url=self._endpoints[endpoint].format(**kwargs) - ) - response.raise_for_status() - except httpx.HTTPError as excp: - print(f"HTTP Exception for {excp.request.url} - {excp}") - raise excp - return response - - async def get_issues( - self, - owner: str, - repo: str, - state: str = "open", - page: int = 1, - ) -> Dict: - """List issues in a repository. - - Note: GitHub's REST API considers every pull request an issue, but not every issue is a pull request. - For this reason, "Issues" endpoints may return both issues and pull requests in the response. - You can identify pull requests by the pull_request key. - Be aware that the id of a pull request returned from "Issues" endpoints will be an issue id. - To find out the pull request id, use the "List pull requests" endpoint. - - Args: - - `owner (str)`: Owner of the repository. - - `repo (str)`: Name of the repository. - - `state (str)`: Indicates the state of the issues to return. - Default: open - Can be one of: open, closed, all. - - Returns: - - See https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues - - Examples: - >>> repo_issues = client.get_issues("owner", "repo") - """ - return ( - await self.request( - endpoint="getIssues", - method="GET", - params={ - "state": state, - "per_page": 100, - "sort": "updated", - "direction": "desc", - "page": page, - }, - owner=owner, - repo=repo, - ) - ).json() - - -if __name__ == "__main__": - import asyncio - - async def main() -> None: - """Test the GitHubIssuesClient.""" - client = GitHubIssuesClient() - issues = await client.get_issues(owner="moncho", repo="dry", state="all") - - for issue in issues: - print(issue["title"]) - print(issue["body"]) - - asyncio.run(main()) diff --git a/nextpy/ai/rag/document_loaders/github_repo_issues/requirements.txt b/nextpy/ai/rag/document_loaders/github_repo_issues/requirements.txt deleted file mode 100644 index 79228389..00000000 --- a/nextpy/ai/rag/document_loaders/github_repo_issues/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -httpx \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/gmail/README.md b/nextpy/ai/rag/document_loaders/gmail/README.md deleted file mode 100644 index 1997ce74..00000000 --- a/nextpy/ai/rag/document_loaders/gmail/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Gmail Loader - -This loader seaches your Gmail account and parses the resulting emails into `DocumentNode`s. The search query can include normal query params, like `from: email@example.com label:inbox`. - -As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions. - -## Usage - -To use this loader, you simply need to pass in a search query string. 
- -```python -from nextpy.ai import download_loader - -GmailReader = download_loader('GmailReader') -loader = GmailReader(query="from: me label:inbox") -documents = loader.load_data() -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. diff --git a/nextpy/ai/rag/document_loaders/gmail/__init__.py b/nextpy/ai/rag/document_loaders/gmail/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/gmail/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/gmail/base.py b/nextpy/ai/rag/document_loaders/gmail/base.py deleted file mode 100644 index 8d42ea13..00000000 --- a/nextpy/ai/rag/document_loaders/gmail/base.py +++ /dev/null @@ -1,201 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Google Mail reader.""" -import base64 -import email -from typing import Any, List, Optional - -from pydantic import BaseModel - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] - - -class GmailReader(BaseReader, BaseModel): - """Gmail reader. - - Reads emails - - Args: - max_results (int): Defaults to 10. - query (str): Gmail query. Defaults to None. - service (Any): Gmail service. Defaults to None. - results_per_page (Optional[int]): Max number of results per page. Defaults to 10. - use_iterative_parser (bool): Use iterative parser. Defaults to False. - """ - - query: str = None - use_iterative_parser: bool = False - max_results: int = 10 - service: Any - results_per_page: Optional[int] - - def load_data(self) -> List[DocumentNode]: - """Load emails from the user's account.""" - from googleapiclient.discovery import build - - credentials = self._get_credentials() - if not self.service: - self.service = build("gmail", "v1", credentials=credentials) - - messsages = self.search_messages() - - metadata = { - "query": self.query, - } - - results = [] - for message in messsages: - text = message.pop("body") - metadata["message"] = message - results.append(DocumentNode(text=text, extra_info=metadata or {})) - - return results - - def _get_credentials(self) -> Any: - """Get valid user credentials from storage. - - The file token.json stores the user's access and refresh tokens, and is - created automatically when the authorization flow completes for the first - time. - - Returns: - Credentials, the obtained credential. 
- """ - import os - - from google.auth.transport.requests import Request - from google.oauth2.credentials import Credentials - from google_auth_oauthlib.flow import InstalledAppFlow - - creds = None - if os.path.exists("token.json"): - creds = Credentials.from_authorized_user_file("token.json", SCOPES) - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - "credentials.json", SCOPES - ) - creds = flow.run_local_server(port=8080) - # Save the credentials for the next run - with open("token.json", "w") as token: - token.write(creds.to_json()) - - return creds - - def search_messages(self): - query = self.query - - max_results = self.max_results - if self.results_per_page: - max_results = self.results_per_page - - results = ( - self.service.users() - .messages() - .list(userId="me", q=query, maxResults=int(max_results)) - .execute() - ) - messages = results.get("messages", []) - - if len(messages) < self.max_results: - # paginate if there are more results - while "nextPageToken" in results: - page_token = results["nextPageToken"] - results = ( - self.service.users() - .messages() - .list( - userId="me", - q=query, - pageToken=page_token, - maxResults=int(max_results), - ) - .execute() - ) - messages.extend(results["messages"]) - if len(messages) >= self.max_results: - break - - result = [] - try: - for message in messages: - message_data = self.get_message_data(message) - if not message_data: - continue - result.append(message_data) - except Exception as e: - raise Exception("Can't get message data" + str(e)) - - return result - - def get_message_data(self, message): - message_id = message["id"] - message_data = ( - self.service.users() - .messages() - .get(format="raw", userId="me", id=message_id) - .execute() - ) - if self.use_iterative_parser: - body = self.extract_message_body_iterative(message_data) - else: - body = self.extract_message_body(message_data) - - if not body: - return None - - # https://developers.google.com/gmail/api/reference/rest/v1/users.messages - return { - "id": message_data["id"], - "threadId": message_data["threadId"], - "snippet": message_data["snippet"], - "internalDate": message_data["internalDate"], - "body": body, - } - - def extract_message_body_iterative(self, message: dict): - if message["raw"]: - body = base64.urlsafe_b64decode(message["raw"].encode("utf-8")) - mime_msg = email.message_from_bytes(body) - else: - mime_msg = message - - body_text = "" - if mime_msg.get_content_type() == "text/plain": - plain_text = mime_msg.get_payload(decode=True) - charset = mime_msg.get_content_charset("utf-8") - body_text = plain_text.decode(charset).encode("utf-8").decode("utf-8") - - elif mime_msg.get_content_maintype() == "multipart": - msg_parts = mime_msg.get_payload() - for msg_part in msg_parts: - body_text += self.extract_message_body_iterative(msg_part) - - return body_text - - def extract_message_body(self, message: dict): - from bs4 import BeautifulSoup - - try: - body = base64.urlsafe_b64decode(message["raw"].encode("utf-8")) - mime_msg = email.message_from_bytes(body) - - # If the message body contains HTML, parse it with BeautifulSoup - if "text/html" in mime_msg: - soup = BeautifulSoup(body, "html.parser") - body = soup.get_text() - return body.decode("utf-8") - except Exception as e: - raise Exception("Can't parse message body" + str(e)) - - -if __name__ == 
"__main__": - reader = GmailReader(query="from:me after:2023-01-01") - print(reader.load_data()) diff --git a/nextpy/ai/rag/document_loaders/gmail/requirements.txt b/nextpy/ai/rag/document_loaders/gmail/requirements.txt deleted file mode 100644 index fcf4511e..00000000 --- a/nextpy/ai/rag/document_loaders/gmail/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib -beautifulsoup4 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/google_calendar/README.md b/nextpy/ai/rag/document_loaders/google_calendar/README.md deleted file mode 100644 index 8d27de50..00000000 --- a/nextpy/ai/rag/document_loaders/google_calendar/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Google Calendar Loader - -This loader reads your upcoming Google Calendar events and parses the relevant info into `Documents`. - -As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions. - -## Usage - -Here's an example usage of the GoogleCalendar. It will retrieve up to 100 future events, unless an optional `number_of_results` argument is passed. It will also retrieve only future events, unless an optional `start_date` argument is passed. - -```python -from nextpy.ai import download_loader - -GoogleCalendarReader = download_loader('GoogleCalendarReader') - -loader = GoogleCalendarReader() -documents = loader.load_data() -``` - -## Example - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -GoogleCalendarReader = download_loader('GoogleCalendarReader') - -loader = GoogleCalendarReader() -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) -index.query('When am I meeting Gordon?') -``` diff --git a/nextpy/ai/rag/document_loaders/google_calendar/__init__.py b/nextpy/ai/rag/document_loaders/google_calendar/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/google_calendar/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/google_calendar/base.py b/nextpy/ai/rag/document_loaders/google_calendar/base.py deleted file mode 100644 index fe244df4..00000000 --- a/nextpy/ai/rag/document_loaders/google_calendar/base.py +++ /dev/null @@ -1,144 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Google Calendar reader.""" - -import datetime -import os -from typing import Any, List, Optional, Union - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -SCOPES = ["https://www.googleapis.com/auth/calendar.readonly"] - -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class GoogleCalendarReader(BaseReader): - """Google Calendar reader. - - Reads events from Google Calendar - - """ - - def load_data( - self, - number_of_results: Optional[int] = 100, - start_date: Optional[Union[str, datetime.date]] = None, - ) -> List[DocumentNode]: - """Load data from user's calendar. - - Args: - number_of_results (Optional[int]): the number of events to return. Defaults to 100. - start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today. - """ - from googleapiclient.discovery import build - - credentials = self._get_credentials() - service = build("calendar", "v3", credentials=credentials) - - if start_date is None: - start_date = datetime.date.today() - elif isinstance(start_date, str): - start_date = datetime.date.fromisoformat(start_date) - - start_datetime = datetime.datetime.combine(start_date, datetime.time.min) - start_datetime_utc = start_datetime.strftime("%Y-%m-%dT%H:%M:%S.%fZ") - - events_result = ( - service.events() - .list( - calendarId="primary", - timeMin=start_datetime_utc, - maxResults=number_of_results, - singleEvents=True, - orderBy="startTime", - ) - .execute() - ) - - metadata = {"number_of_results": number_of_results, "start_date": start_date} - - events = events_result.get("items", []) - - if not events: - return [] - - results = [] - for event in events: - if "dateTime" in event["start"]: - start_time = event["start"]["dateTime"] - else: - start_time = event["start"]["date"] - - if "dateTime" in event["end"]: - end_time = event["end"]["dateTime"] - else: - end_time = event["end"]["date"] - - event_string = f"Status: {event['status']}, " - event_string += f"Summary: {event['summary']}, " - event_string += f"Start time: {start_time}, " - event_string += f"End time: {end_time}, " - - organizer = event.get("organizer", {}) - display_name = organizer.get("displayName", "N/A") - email = organizer.get("email", "N/A") - if display_name != "N/A": - event_string += f"Organizer: {display_name} ({email})" - else: - event_string += f"Organizer: {email}" - - results.append(DocumentNode(text=event_string, extra_info=metadata)) - - return results - - def _get_credentials(self) -> Any: - """Get valid user credentials from storage. - - The file token.json stores the user's access and refresh tokens, and is - created automatically when the authorization flow completes for the first - time. - - Returns: - Credentials, the obtained credential. 
- """ - from google.auth.transport.requests import Request - from google.oauth2.credentials import Credentials - from google_auth_oauthlib.flow import InstalledAppFlow - - creds = None - if os.path.exists("token.json"): - creds = Credentials.from_authorized_user_file("token.json", SCOPES) - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - "credentials.json", SCOPES - ) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open("token.json", "w") as token: - token.write(creds.to_json()) - - return creds - - -if __name__ == "__main__": - reader = GoogleCalendarReader() - print(reader.load_data()) diff --git a/nextpy/ai/rag/document_loaders/google_calendar/requirements.txt b/nextpy/ai/rag/document_loaders/google_calendar/requirements.txt deleted file mode 100644 index ee8b5257..00000000 --- a/nextpy/ai/rag/document_loaders/google_calendar/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/google_docs/README.md b/nextpy/ai/rag/document_loaders/google_docs/README.md deleted file mode 100644 index 47941445..00000000 --- a/nextpy/ai/rag/document_loaders/google_docs/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Google Doc Loader - -This loader takes in IDs of Google Docs and parses their text into `DocumentNode`s. You can extract a Google Doc's ID directly from its URL. For example, the ID of `https://docs.google.com/DocumentNode/d/1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec/edit` is `1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec`. - -As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions. - -## Usage - -To use this loader, you simply need to pass in an array of Google Doc IDs. - -```python -from nextpy.ai import download_loader - -GoogleDocsReader = download_loader('GoogleDocsReader') - -gdoc_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleDocsReader() -documents = loader.load_data(document_ids=gdoc_ids) -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -GoogleDocsReader = download_loader('GoogleDocsReader') - -gdoc_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleDocsReader() -documents = loader.load_data(document_ids=gdoc_ids) -index = GPTVectorDBIndex.from_documents(documents) -index.query('Where did the author go to school?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. 
- -```python -from nextpy.ai import GPTVectorDBIndex, download_loader -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -GoogleDocsReader = download_loader('GoogleDocsReader') - -gdoc_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleDocsReader() -documents = loader.load_data(document_ids=gdoc_ids) -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Google Doc Index", - func=lambda q: index.query(q), - description=f"Useful when you want answer questions about the Google Documents.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="Where did the author go to school?") -``` diff --git a/nextpy/ai/rag/document_loaders/google_docs/__init__.py b/nextpy/ai/rag/document_loaders/google_docs/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/google_docs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/google_docs/base.py b/nextpy/ai/rag/document_loaders/google_docs/base.py deleted file mode 100644 index fe2ad1b5..00000000 --- a/nextpy/ai/rag/document_loaders/google_docs/base.py +++ /dev/null @@ -1,153 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Google docs reader.""" - -import os -from typing import Any, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -SCOPES = ["https://www.googleapis.com/auth/documents.readonly"] - - -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class GoogleDocsReader(BaseReader): - """Google Docs reader. - - Reads a page from Google Docs - - """ - - def load_data(self, document_ids: List[str]) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - document_ids (List[str]): a list of DocumentNode ids. 
- """ - if document_ids is None: - raise ValueError('Must specify a "document_ids" in `load_kwargs`.') - - results = [] - for document_id in document_ids: - doc = self._load_doc(document_id) - results.append( - DocumentNode(text=doc, extra_info={"document_id": document_id}) - ) - return results - - def _load_doc(self, document_id: str) -> str: - """Load a DocumentNode from Google Docs. - - Args: - document_id: the DocumentNode id. - - Returns: - The DocumentNode text. - """ - import googleapiclient.discovery as discovery - - credentials = self._get_credentials() - docs_service = discovery.build("docs", "v1", credentials=credentials) - doc = docs_service.documents().get(documentId=document_id).execute() - doc_content = doc.get("body").get("content") - return self._read_structural_elements(doc_content) - - def _get_credentials(self) -> Any: - """Get valid user credentials from storage. - - The file token.json stores the user's access and refresh tokens, and is - created automatically when the authorization flow completes for the first - time. - - Returns: - Credentials, the obtained credential. - """ - from google.auth.transport.requests import Request - from google.oauth2 import service_account - from google.oauth2.credentials import Credentials - from google_auth_oauthlib.flow import InstalledAppFlow - - creds = None - if os.path.exists("token.json"): - creds = Credentials.from_authorized_user_file("token.json", SCOPES) - elif os.path.exists("service_account.json"): - creds = service_account.Credentials.from_service_account_file( - "service_account.json", scopes=SCOPES - ) - return creds - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - "credentials.json", SCOPES - ) - creds = flow.run_local_server(port=8080) - # Save the credentials for the next run - with open("token.json", "w") as token: - token.write(creds.to_json()) - - return creds - - def _read_paragraph_element(self, element: Any) -> Any: - """Return the text in the given ParagraphElement. - - Args: - element: a ParagraphElement from a Google Doc. - """ - text_run = element.get("textRun") - if not text_run: - return "" - return text_run.get("content") - - def _read_structural_elements(self, elements: List[Any]) -> Any: - """Recurse through a list of Structural Elements. - - Read a DocumentNode's text where text may be in nested elements. - - Args: - elements: a list of Structural Elements. - """ - text = "" - for value in elements: - if "paragraph" in value: - elements = value.get("paragraph").get("elements") - for elem in elements: - text += self._read_paragraph_element(elem) - elif "table" in value: - # The text in table cells are in nested Structural Elements - # and tables may be nested. - table = value.get("table") - for row in table.get("tableRows"): - cells = row.get("tableCells") - for cell in cells: - text += self._read_structural_elements(cell.get("content")) - elif "tableOfContents" in value: - # The text in the TOC is also in a Structural Element. 
- toc = value.get("tableOfContents") - text += self._read_structural_elements(toc.get("content")) - return text - - -if __name__ == "__main__": - reader = GoogleDocsReader() - print( - reader.load_data(document_ids=["11ctUj_tEf5S8vs_dk8_BNi-Zk8wW5YFhXkKqtmU_4B8"]) - ) diff --git a/nextpy/ai/rag/document_loaders/google_docs/requirements.txt b/nextpy/ai/rag/document_loaders/google_docs/requirements.txt deleted file mode 100644 index ee8b5257..00000000 --- a/nextpy/ai/rag/document_loaders/google_docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/google_drive/README.md b/nextpy/ai/rag/document_loaders/google_drive/README.md deleted file mode 100644 index dff404cf..00000000 --- a/nextpy/ai/rag/document_loaders/google_drive/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Google Drive Loader - -This loader reads files from Google Drive using folder or file ids. To use this loader, you need to pass in a list of file id's or folder id. - -### folder_id - -You can extract a folder_id directly from its drive URL. - -For example, the folder_id of `https://drive.google.com/drive/folders/1w7XryYu6mL9VLmfyqUkA4_fRnDbsCqV-` is `1w7XryYu6mL9VLmfyqUkA4_fRnDbsCqV-`. - -### file_id - -You can extract a file_id directly from its sharable drive URL. - -For example, the file_id of `https://drive.google.com/file/d/1LEqD_zQiOizKrBKZYKJtER_h6i49wE-y/view?usp=sharing` is `1LEqD_zQiOizKrBKZYKJtER_h6i49wE-y`. - -### mime_types - -You can also filter the files by the mimeType e.g.: `mime_types=["application/vnd.google-apps.DocumentNode"]` - -## Usage - -We need `credentials.json` and `client_secrets.json` files to use this reader. - -1. You need to get your `credentials.json` file by following the steps mentioned [here](https://developers.google.com/drive/api/v3/quickstart/python) -2. Create duplicate file of `credentials.json` with name `client_secrets.json` which will be used by pydrive for downloading files. - -Finally, make sure you enable "Google Drive API" in the console of your Google App. - -```python -from nextpy.ai import download_loader - -GoogleDriveReader = download_loader("GoogleDriveReader") - -loader = GoogleDriveReader() - -#### Using folder id -documents = loader.load_data(folder_id="folderid") - -#### Using file ids -documents = loader.load_data(file_ids=["fileid1", "fileid2"]) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/google_drive/__init__.py b/nextpy/ai/rag/document_loaders/google_drive/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/google_drive/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/google_drive/base.py b/nextpy/ai/rag/document_loaders/google_drive/base.py deleted file mode 100644 index 1d4d2a8e..00000000 --- a/nextpy/ai/rag/document_loaders/google_drive/base.py +++ /dev/null @@ -1,368 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Google Drive files reader.""" - -import logging -import os -import tempfile -from pathlib import Path -from typing import Any, List, Optional - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - -# Scope for reading and downloading google drive files -SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] - - -class GoogleDriveReader(BaseReader): - """Google drive reader.""" - - def __init__( - self, - credentials_path: str = "credentials.json", - token_path: str = "token.json", - pydrive_creds_path: str = "creds.txt", - ) -> None: - """Initialize with parameters.""" - self.credentials_path = credentials_path - self.token_path = token_path - self.pydrive_creds_path = pydrive_creds_path - - self._creds = None - self._drive = None - - # Download Google Docs/Slides/Sheets as actual files - # See https://developers.google.com/drive/v3/web/mime-types - self._mimetypes = { - "application/vnd.google-apps.DocumentNode": { - "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.DocumentNode", - "extension": ".docx", - }, - "application/vnd.google-apps.spreadsheet": { - "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "extension": ".xlsx", - }, - "application/vnd.google-apps.presentation": { - "mimetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "extension": ".pptx", - }, - } - - def _get_credentials(self) -> Any: - """Authenticate with Google and save credentials. - Download the credentials.json file with these instructions: https://developers.google.com/drive/api/v3/quickstart/python. - Copy credentials.json file and rename it to client_secrets.json file which will be used by pydrive for downloading files. - So, we need two files: - 1. credentials.json - 2. client_secrets.json - Both 1, 2 are esentially same but needed with two different names according to google-api-python-client, google-auth-httplib2, google-auth-oauthlib and pydrive libraries. - - Returns: - credentials, pydrive object. - """ - from google.auth.transport.requests import Request - from google.oauth2.credentials import Credentials - from google_auth_oauthlib.flow import InstalledAppFlow - from pydrive.auth import GoogleAuth - from pydrive.drive import GoogleDrive - - # First, we need the Google API credentials for the app - creds = None - if os.path.exists(self.token_path): - creds = Credentials.from_authorized_user_file(self.token_path, SCOPES) - # If there are no (valid) credentials available, let the user log in. 
- if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - self.credentials_path, SCOPES - ) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open(self.token_path, "w") as token: - token.write(creds.to_json()) - - # Next, we need user authentication to download files (via pydrive) - # Uses client_secrets.json file for authorization. - gauth = GoogleAuth() - # Try to load saved client credentials - gauth.LoadCredentialsFile(self.pydrive_creds_path) - if gauth.credentials is None: - # Authenticate if they're not there - gauth.LocalWebserverAuth() - elif gauth.access_token_expired: - # Refresh them if expired - gauth.Refresh() - else: - # Initialize the saved creds - gauth.Authorize() - # Save the current credentials to a file so user doesn't have to auth every time - gauth.SaveCredentialsFile(self.pydrive_creds_path) - - drive = GoogleDrive(gauth) - - return creds, drive - - def _get_fileids_meta( - self, - folder_id: Optional[str] = None, - file_id: Optional[str] = None, - mime_types: Optional[list] = None, - ) -> List[List[str]]: - """Get file ids present in folder/ file id - Args: - folder_id: folder id of the folder in google drive. - file_id: file id of the file in google drive - mime_types: the mimeTypes you want to allow e.g.: "application/vnd.google-apps.DocumentNode" - Returns: - metadata: List of metadata of filde ids. - """ - from googleapiclient.discovery import build - - try: - service = build("drive", "v3", credentials=self._creds) - fileids_meta = [] - if folder_id: - folder_mime_type = "application/vnd.google-apps.folder" - query = "'" + folder_id + "' in parents" - - # Add mimeType filter to query - if mime_types: - if folder_mime_type not in mime_types: - mime_types.append(folder_mime_type) # keep the recursiveness - mime_query = " or ".join( - [f"mimeType='{mime_type}'" for mime_type in mime_types] - ) - query += f" and ({mime_query})" - - results = ( - service.files() - .list( - q=query, - includeItemsFromAllDrives=True, - supportsAllDrives=True, - fields="*", - ) - .execute() - ) - items = results.get("files", []) - for item in items: - if item["mimeType"] == folder_mime_type: - fileids_meta.extend( - self._get_fileids_meta( - folder_id=item["id"], mime_types=mime_types - ) - ) - else: - # Check if file doesn't belong to a Shared Drive. "owners" doesn't exist in a Shared Drive - is_shared_drive = "driveId" in item - author = ( - item["owners"][0]["displayName"] - if not is_shared_drive - else "Shared Drive" - ) - - fileids_meta.append( - ( - item["id"], - author, - item["name"], - item["createdTime"], - item["modifiedTime"], - ) - ) - - else: - # Get the file details - file = ( - service.files() - .get(fileId=file_id, supportsAllDrives=True, fields="*") - .execute() - ) - # Get metadata of the file - # Check if file doesn't belong to a Shared Drive. 
"owners" doesn't exist in a Shared Drive - is_shared_drive = "driveId" in file - author = ( - file["owners"][0]["displayName"] - if not is_shared_drive - else "Shared Drive" - ) - - fileids_meta.append( - ( - file["id"], - author, - file["name"], - file["createdTime"], - file["modifiedTime"], - ) - ) - return fileids_meta - - except Exception as e: - logger.error( - "An error occurred while getting fileids metadata: {}".format(e) - ) - - def _download_file(self, fileid: str, filename: str) -> str: - """Download the file with fileid and filename - Args: - fileid: file id of the file in google drive - filename: filename with which it will be downloaded - Returns: - The downloaded filename, which which may have a new extension. - """ - from io import BytesIO - - from googleapiclient.discovery import build - from googleapiclient.http import MediaIoBaseDownload - - try: - # Get file details - service = build("drive", "v3", credentials=self._creds) - file = service.files().get(fileId=fileid, supportsAllDrives=True).execute() - - if file["mimeType"] in self._mimetypes: - download_mimetype = self._mimetypes[file["mimeType"]]["mimetype"] - download_extension = self._mimetypes[file["mimeType"]]["extension"] - new_file_name = filename + download_extension - - # Download and convert file - request = service.files().export_media( - fileId=fileid, mimeType=download_mimetype - ) - else: - new_file_name = filename - - # Download file without conversion - request = service.files().get_media(fileId=fileid) - - # Download file data - file_data = BytesIO() - downloader = MediaIoBaseDownload(file_data, request) - done = False - - while not done: - status, done = downloader.next_chunk() - - # Save the downloaded file - with open(new_file_name, "wb") as f: - f.write(file_data.getvalue()) - - return new_file_name - except Exception as e: - logger.error("An error occurred while downloading file: {}".format(e)) - - def _load_data_fileids_meta( - self, fileids_meta: List[List[str]] - ) -> List[DocumentNode]: - """Load data from fileids metadata - Args: - fileids_meta: metadata of fileids in google drive. - - Returns: - Lis[DocumentNode]: List of DocumentNode of data present in fileids. - """ - try: - with tempfile.TemporaryDirectory() as temp_dir: - - def get_metadata(filename): - return metadata[filename] - - temp_dir = Path(temp_dir) - metadata = {} - - for fileid_meta in fileids_meta: - filename = next(tempfile._get_candidate_names()) - filepath = os.path.join(temp_dir, filename) - fileid = fileid_meta[0] - final_filepath = self._download_file(fileid, filepath) - - metadata[final_filepath] = { - "file id": fileid_meta[0], - "author": fileid_meta[1], - "file name": fileid_meta[2], - "created at": fileid_meta[3], - "modified at": fileid_meta[4], - } - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - SimpleDirectoryReader = import_loader("SimpleDirectoryReader") - except ImportError: - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - loader = SimpleDirectoryReader(temp_dir, file_metadata=get_metadata) - documents = loader.load_data() - - return documents - except Exception as e: - logger.error( - "An error occurred while loading data from fileids meta: {}".format(e) - ) - - def _load_from_file_ids( - self, file_ids: List[str], mime_types: list - ) -> List[DocumentNode]: - """Load data from file ids - Args: - file_ids: file ids of the files in google drive. - - Returns: - DocumentNode: List of Documents of text. 
- """ - try: - fileids_meta = [] - for file_id in file_ids: - fileids_meta.extend( - self._get_fileids_meta(file_id=file_id, mime_types=mime_types) - ) - documents = self._load_data_fileids_meta(fileids_meta) - - return documents - except Exception as e: - logger.error("An error occurred while loading with fileid: {}".format(e)) - - def _load_from_folder(self, folder_id: str, mime_types: list) -> List[DocumentNode]: - """Load data from folder_id - Args: - folder_id: folder id of the folder in google drive. - mime_types: the mimeTypes you want to allow e.g.: "application/vnd.google-apps.DocumentNode" - Returns: - DocumentNode: List of Documents of text. - """ - try: - fileids_meta = self._get_fileids_meta( - folder_id=folder_id, mime_types=mime_types - ) - documents = self._load_data_fileids_meta(fileids_meta) - return documents - except Exception as e: - logger.error("An error occurred while loading from folder: {}".format(e)) - - def load_data( - self, - folder_id: str = None, - file_ids: List[str] = None, - mime_types: List[str] = None, - ) -> List[DocumentNode]: - """Load data from the folder id and file ids. - - Args: - folder_id: folder id of the folder in google drive. - file_ids: file ids of the files in google drive. - mime_types: the mimeTypes you want to allow e.g.: "application/vnd.google-apps.DocumentNode" - Returns: - List[DocumentNode]: A list of documents. - """ - self._creds, self._drive = self._get_credentials() - - if folder_id: - return self._load_from_folder(folder_id, mime_types) - else: - return self._load_from_file_ids(file_ids, mime_types) diff --git a/nextpy/ai/rag/document_loaders/google_drive/requirements.txt b/nextpy/ai/rag/document_loaders/google_drive/requirements.txt deleted file mode 100644 index ba868485..00000000 --- a/nextpy/ai/rag/document_loaders/google_drive/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib -PyDrive \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/google_keep/README.md b/nextpy/ai/rag/document_loaders/google_keep/README.md deleted file mode 100644 index 71efa756..00000000 --- a/nextpy/ai/rag/document_loaders/google_keep/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Google Keep Loader - -This loader takes in IDs of Google Keep and parses their text into `DocumentNode`s. You can extract a Google Keep's ID directly from its URL. For example, the ID of `https://keep.google.com/u/6/#NOTE/1OySsaIrx_pvQaJJk3VPQfYQvSuxTQuPndEEGl7qvrhFaN8VnO4K8Bti0SL2YklU` is `1OySsaIrx_pvQaJJk3VPQfYQvSuxTQuPndEEGl7qvrhFaN8VnO4K8Bti0SL2YklU`. - -This loader uses the (unofficial) gkeepapi library. Google Keep does provide an official API, however in order to use it, (1) your account has to be an Enterprise (Google Workspace) account (2) you will need to generate a service account to autheticate with Google Keep API (3) you will need to enable Domain-wide Delegation to enable the service account with Google Read API scopes. See [here](https://issuetracker.google.com/issues/210500028) for details. Thus I believe gkeepapi is actually more practical and useful for the majority of the users. - -To use gkeepapi, you will need to login with username and a password. I highly recommend using a (one-off) App Password over using your own password. You can find how to generate App Password at [here](https://support.google.com/accounts/answer/185833?hl=en). The username and password should be saved at a `keep_credentials.json` file, with `username` and `password` being keys. 
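As a minimal sketch of that credentials file (the values below are placeholders, not real credentials), the `GoogleKeepReader` in `base.py` below simply reads a JSON object with `username` and `password` keys:

```python
import json

# Placeholder values — use a one-off Google App Password, never your account password.
credentials = {"username": "your-google-username", "password": "your-app-password"}
with open("keep_credentials.json", "w") as f:
    json.dump(credentials, f)
```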
It's recommended you delete the App Password once you no longer need it. - -## Usage - -To use this loader, you simply need to pass in an array of Google Keep IDs. - -```python -from llama_hub.google_keep.base import GoogleKeepReader - -gkeep_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleKeepReader() -documents = loader.load_data(document_ids=gkeep_ids) -``` - - - -### LlamaIndex - -```python -from nextpy.ai import VectorDBIndex -from llama_hub.google_keep.base import GoogleKeepReader - -gkeep_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleKeepReader() -notes = loader.load_data(document_ids=gkeep_ids) -index = VectorDBIndex.from_documents(notes) -query_engine = index.as_query_engine() -query_engine.query('What are my current TODOs?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. - -```python -from nextpy.ai import VectorDBIndex -from llama_hub.google_keep.base import GoogleKeepReader -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - - -gkeep_ids = ['1wf-y2pd9C878Oh-FmLH7Q_BQkljdm6TQal-c1pUfrec'] -loader = GoogleKeepReader() -notes = loader.load_data(document_ids=gkeep_ids) -index = VectorDBIndex.from_documents(notes) -query_engine = index.as_query_engine() - -tools = [ - Tool( - name="Google Keep Index", - func=lambda q: query_engine.query(q), - description=f"Useful when you want answer questions about the Google Keep Notes.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What are my current TODOs?") -``` diff --git a/nextpy/ai/rag/document_loaders/google_keep/__init__.py b/nextpy/ai/rag/document_loaders/google_keep/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/google_keep/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/google_keep/base.py b/nextpy/ai/rag/document_loaders/google_keep/base.py deleted file mode 100644 index cab1acf8..00000000 --- a/nextpy/ai/rag/document_loaders/google_keep/base.py +++ /dev/null @@ -1,80 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""(Unofficial) Google Keep reader using gkeepapi.""" - -import json -import os -from typing import Any, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class GoogleKeepReader(BaseReader): - """Google Keep reader. - - Reads notes from Google Keep - - """ - - def load_data(self, document_ids: List[str]) -> List[DocumentNode]: - """Load data from the document_ids. - - Args: - document_ids (List[str]): a list of note ids. 
- """ - keep = self._get_keep() - - if document_ids is None: - raise ValueError('Must specify a "document_ids" in `load_kwargs`.') - - results = [] - for note_id in document_ids: - note = keep.get(note_id) - if note is None: - raise ValueError(f"Note with id {note_id} not found.") - text = f"Title: {note.title}\nContent: {note.text}" - results.append(DocumentNode(text=text, extra_info={"note_id": note_id})) - return results - - def load_all_notes(self) -> List[DocumentNode]: - """Load all notes from Google Keep.""" - keep = self._get_keep() - - notes = keep.all() - results = [] - for note in notes: - text = f"Title: {note.title}\nContent: {note.text}" - results.append(DocumentNode(text=text, extra_info={"note_id": note.id})) - return results - - def _get_keep(self) -> Any: - import gkeepapi - - """Get a Google Keep object with login.""" - # Read username and password from keep_credentials.json - if os.path.exists("keep_credentials.json"): - with open("keep_credentials.json", "r") as f: - credentials = json.load(f) - else: - raise RuntimeError("Failed to load keep_credentials.json.") - - keep = gkeepapi.Keep() - - success = keep.login(credentials["username"], credentials["password"]) - if not success: - raise RuntimeError("Failed to login to Google Keep.") - - return keep - - -if __name__ == "__main__": - reader = GoogleKeepReader() - print( - reader.load_data( - document_ids=[ - "1eKU7kGn8eJCErZ52OC7vCzHDSQaspFYGHHCiTX_IvhFOc7ZQZVJhTIDFMdTJOPiejOk" - ] - ) - ) diff --git a/nextpy/ai/rag/document_loaders/google_keep/requirements.txt b/nextpy/ai/rag/document_loaders/google_keep/requirements.txt deleted file mode 100644 index f5436632..00000000 --- a/nextpy/ai/rag/document_loaders/google_keep/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gkeepapi diff --git a/nextpy/ai/rag/document_loaders/google_sheets/README.md b/nextpy/ai/rag/document_loaders/google_sheets/README.md deleted file mode 100644 index a62068bd..00000000 --- a/nextpy/ai/rag/document_loaders/google_sheets/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Google Sheets Loader - -This loader reads your upcoming Google Sheets and parses the relevant info into `Documents`. - -As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions. - -## Usage - -Here's an example usage of the GoogleSheetsReader. - -```python -from nextpy.ai import download_loader - -GoogleSheetsReader = download_loader('GoogleSheetsReader') - -loader = GoogleSheetsReader() -documents = loader.load_data() -``` - -## Example - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
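Note that `load_data` in the `base.py` shown below expects a list of spreadsheet IDs (calling it with no arguments would raise an error), so a working call would presumably look more like the following sketch; the ID used here is the one from the module's own `__main__` example:

```python
from nextpy.ai import download_loader

GoogleSheetsReader = download_loader('GoogleSheetsReader')

loader = GoogleSheetsReader()
# load_data takes the spreadsheet IDs to read; this ID is borrowed from base.py's __main__ block
documents = loader.load_data(
    spreadsheet_ids=['1VkuitKIyNmkoCJJDmEUmkS_VupSkDcztpRhbUzAU5L8']
)
```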
- -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -GoogleSheetsReader = download_loader('GoogleSheetsReader') - -loader = GoogleSheetsReader() -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) -index.query('When am I meeting Gordon?') -``` diff --git a/nextpy/ai/rag/document_loaders/google_sheets/__init__.py b/nextpy/ai/rag/document_loaders/google_sheets/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/google_sheets/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/google_sheets/base.py b/nextpy/ai/rag/document_loaders/google_sheets/base.py deleted file mode 100644 index 9ab9b559..00000000 --- a/nextpy/ai/rag/document_loaders/google_sheets/base.py +++ /dev/null @@ -1,148 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Google sheets reader.""" - -import logging -import os -from typing import Any, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"] - -logger = logging.getLogger(__name__) - -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class GoogleSheetsReader(BaseReader): - """Google Sheets reader. - - Reads a sheet as TSV from Google Sheets - - """ - - def __init__(self) -> None: - """Initialize with parameters.""" - try: - import google # noqa: F401 - import google_auth_oauthlib # noqa: F401 - import googleapiclient # noqa: F401 - except ImportError: - raise ImportError( - "`google_auth_oauthlib`, `googleapiclient` and `google` " - "must be installed to use the GoogleSheetsReader.\n" - "Please run `pip install --upgrade google-api-python-client " - "google-auth-httplib2 google-auth-oauthlib`." - ) - - def load_data(self, spreadsheet_ids: List[str]) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - spreadsheet_ids (List[str]): a list of DocumentNode ids. - """ - if spreadsheet_ids is None: - raise ValueError('Must specify a "spreadsheet_ids" in `load_kwargs`.') - - results = [] - for spreadsheet_id in spreadsheet_ids: - sheet = self._load_sheet(spreadsheet_id) - results.append( - DocumentNode(text=sheet, extra_info={"spreadsheet_id": spreadsheet_id}) - ) - return results - - def _load_sheet(self, spreadsheet_id: str) -> str: - """Load a sheet from Google Sheets. - - Args: - spreadsheet_id: the sheet id. 
- - Returns: - The sheet data. - """ - import googleapiclient.discovery as discovery - - credentials = self._get_credentials() - sheets_service = discovery.build("sheets", "v4", credentials=credentials) - spreadsheet_data = ( - sheets_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() - ) - sheets = spreadsheet_data.get("sheets") - sheet_text = "" - - for sheet in sheets: - properties = sheet.get("properties") - title = properties.get("title") - sheet_text += title + "\n" - grid_props = properties.get("gridProperties") - rows = grid_props.get("rowCount") - cols = grid_props.get("columnCount") - range_pattern = f"R1C1:R{rows}C{cols}" - response = ( - sheets_service.spreadsheets() - .values() - .get(spreadsheetId=spreadsheet_id, range=range_pattern) - .execute() - ) - sheet_text += ( - "\n".join(map(lambda row: "\t".join(row), response.get("values", []))) - + "\n" - ) - return sheet_text - - def _get_credentials(self) -> Any: - """Get valid user credentials from storage. - - The file token.json stores the user's access and refresh tokens, and is - created automatically when the authorization flow completes for the first - time. - - Returns: - Credentials, the obtained credential. - """ - from google.auth.transport.requests import Request - from google.oauth2.credentials import Credentials - from google_auth_oauthlib.flow import InstalledAppFlow - - creds = None - if os.path.exists("token.json"): - creds = Credentials.from_authorized_user_file("token.json", SCOPES) - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - "credentials.json", SCOPES - ) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open("token.json", "w") as token: - token.write(creds.to_json()) - - return creds - - -if __name__ == "__main__": - reader = GoogleSheetsReader() - logger.info( - reader.load_data( - spreadsheet_ids=["1VkuitKIyNmkoCJJDmEUmkS_VupSkDcztpRhbUzAU5L8"] - ) - ) diff --git a/nextpy/ai/rag/document_loaders/google_sheets/requirements.txt b/nextpy/ai/rag/document_loaders/google_sheets/requirements.txt deleted file mode 100644 index ee8b5257..00000000 --- a/nextpy/ai/rag/document_loaders/google_sheets/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/gpt_repo/README.md b/nextpy/ai/rag/document_loaders/gpt_repo/README.md deleted file mode 100644 index 38d1a836..00000000 --- a/nextpy/ai/rag/document_loaders/gpt_repo/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# GPT Repository Loader - -This loader is an adaptation of https://github.com/mpoon/gpt-repository-loader -to LlamaHub. Full credit goes to mpoon for coming up with this! - -## Usage - -To use this loader, you need to pass in a path to a local Git repository - -```python -from nextpy.ai import download_loader - -GPTRepoReader = download_loader("GPTRepoReader") - -loader = GPTRepoReader() -documents = loader.load_data(repo_path="/path/to/git/repo", preamble_str="") -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
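As a sketch of the optional parameters exposed by the reader in `base.py` below (the path and extension list here are illustrative), you can restrict which files are read and concatenate the whole repository into a single `DocumentNode`:

```python
from nextpy.ai import download_loader

GPTRepoReader = download_loader("GPTRepoReader")

# concatenate=True returns the whole repository as one DocumentNode instead of one per file
loader = GPTRepoReader(concatenate=True)
documents = loader.load_data(
    repo_path="/path/to/git/repo",
    extensions=[".py", ".md"],  # only include files with these extensions
)
```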
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/gpt_repo/__init__.py b/nextpy/ai/rag/document_loaders/gpt_repo/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/gpt_repo/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/gpt_repo/base.py b/nextpy/ai/rag/document_loaders/gpt_repo/base.py deleted file mode 100644 index f2a4a669..00000000 --- a/nextpy/ai/rag/document_loaders/gpt_repo/base.py +++ /dev/null @@ -1,163 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Reader that uses a Github Repo. - -Repo taken from: https://github.com/mpoon/gpt-repository-loader - -License attached: - -MIT License - -Copyright (c) 2023 mpoon - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -""" - -#!/usr/bin/env python3 - -import fnmatch -import os -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -def get_ignore_list(ignore_file_path) -> List[str]: - ignore_list = [] - with open(ignore_file_path, "r") as ignore_file: - for line in ignore_file: - ignore_list.append(line.strip()) - return ignore_list - - -def should_ignore(file_path, ignore_list) -> bool: - return any(fnmatch.fnmatch(file_path, pattern) for pattern in ignore_list) - - -def process_repository( - repo_path, - ignore_list, - concatenate: bool = False, - extensions: Optional[List[str]] = None, -) -> List[str]: - """Process repository.""" - result_texts = [] - result_text = "" - for root, _, files in os.walk(repo_path): - for file in files: - file_path = os.path.join(root, file) - relative_file_path = os.path.relpath(file_path, repo_path) - - _, file_ext = os.path.splitext(file_path) - is_correct_extension = extensions is None or file_ext in extensions - - if ( - not should_ignore(relative_file_path, ignore_list) - and is_correct_extension - ): - with open(file_path, "r", errors="ignore") as file: - contents = file.read() - result_text += "-" * 4 + "\n" - result_text += f"{relative_file_path}\n" - result_text += f"{contents}\n" - if not concatenate: - result_texts.append(result_text) - result_text = "" - - if concatenate: - result_texts.append(result_text) - - return result_texts - - -class GPTRepoReader(BaseReader): - """GPTRepoReader. - - Reads a github repo in a prompt-friendly format. - - """ - - def __init__(self, concatenate: bool = False) -> None: - """Initialize.""" - self.concatenate = concatenate - - def load_data( - self, - repo_path: str, - preamble_str: Optional[str] = None, - extensions: Optional[List[str]] = None, - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - pages (List[str]): List of pages to read. - - """ - metadata = { - "concatenate": self.concatenate, - "repo_path": repo_path, - "preamble_str": preamble_str, - "extensions": extensions, - } - - ignore_file_path = os.path.join(repo_path, ".gptignore") - - if os.path.exists(ignore_file_path): - ignore_list = get_ignore_list(ignore_file_path) - else: - ignore_list = [] - - output_text = "" - if preamble_str: - output_text += f"{preamble_str}\n" - elif self.concatenate: - output_text += ( - "The following text is a Git repository with code. " - "The structure of the text are sections that begin with ----, " - "followed by a single line containing the file path and file " - "name, followed by a variable amount of lines containing the " - "file contents. The text representing the Git repository ends " - "when the symbols --END-- are encounted. Any further text beyond " - "--END-- are meant to be interpreted as instructions using the " - "aforementioned Git repository as context.\n" - ) - else: - # self.concatenate is False - output_text += ( - "The following text is a file in a Git repository. " - "The structure of the text are sections that begin with ----, " - "followed by a single line containing the file path and file " - "name, followed by a variable amount of lines containing the " - "file contents. The text representing the file ends " - "when the symbols --END-- are encounted. 
Any further text beyond " - "--END-- are meant to be interpreted as instructions using the " - "aforementioned file as context.\n" - ) - text_list = process_repository( - repo_path, ignore_list, concatenate=self.concatenate, extensions=extensions - ) - docs = [] - for text in text_list: - doc_text = output_text + text + "\n--END--\n" - docs.append(DocumentNode(text=doc_text, extra_info=metadata)) - - return docs diff --git a/nextpy/ai/rag/document_loaders/graphdb_cypher/README.md b/nextpy/ai/rag/document_loaders/graphdb_cypher/README.md deleted file mode 100644 index c33ec1f8..00000000 --- a/nextpy/ai/rag/document_loaders/graphdb_cypher/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Graph Database Cypher Loader - -This loader populates documents from results of Cypher queries from a Graph database endpoint. -The user specifies a GraphDB endpoint URL with optional credentials to initialize the reader. -By declaring the Cypher query and optional parameters the loader can fetch the nested result docs. -The results will be turned into a yaml representation to be turned into a string for the DocumentNode. - -The approach should work for Neo4j, AWS Neptune and Memgraph. - -## Usage - -Here's an example usage of the `GraphDBCypherReader`. - -You can test out queries directly with the Neo4j labs demo server: demo.neo4jlabs.com or with a free instance https://neo4j.com/aura - -```python -from nextpy.ai import download_loader -import os - -GraphDBCypherReader = download_loader('GraphDBCypherReader') - -uri = "neo4j+s://demo.neo4jlabs.com" -username = "stackoverflow" -password = "stackoverflow" -database = "stackoverflow" - -query = """ - MATCH (q:Question)-[:TAGGED]->(:Tag {name:$tag}) - RETURN q.title as title - ORDER BY q.createdAt DESC LIMIT 10 -""" -reader = GraphDBCypherReader(uri, username, password, database) -documents = reader.load_data(query, parameters = {"tag":"lua"}) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) -and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. -See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - -It uses the [Neo4j Graph Database](https://neo4j.com/developer) for the Cypher queries. diff --git a/nextpy/ai/rag/document_loaders/graphdb_cypher/__init__.py b/nextpy/ai/rag/document_loaders/graphdb_cypher/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/graphdb_cypher/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/graphdb_cypher/base.py b/nextpy/ai/rag/document_loaders/graphdb_cypher/base.py deleted file mode 100644 index 7279d5fd..00000000 --- a/nextpy/ai/rag/document_loaders/graphdb_cypher/base.py +++ /dev/null @@ -1,70 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Graph Database Cypher Reader.""" - -from typing import Dict, List, Optional - -import yaml - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class GraphDBCypherReader(BaseReader): - """Graph database Cypher reader. - - Combines all Cypher query results into the DocumentNode type used by LlamaIndex. - - Args: - uri (str): Graph Database URI - username (str): Username - password (str): Password - - """ - - def __init__(self, uri: str, username: str, password: str, database: str) -> None: - """Initialize with parameters.""" - try: - from neo4j import GraphDatabase, basic_auth - - except ImportError: - raise ImportError( - "`neo4j` package not found, please run `pip install neo4j`" - ) - if uri: - if uri is None: - raise ValueError("`uri` must be provided.") - self.client = GraphDatabase.driver( - uri=uri, auth=basic_auth(username, password) - ) - self.database = database - - def load_data( - self, query: str, parameters: Optional[Dict] = None - ) -> List[DocumentNode]: - """Run the Cypher with optional parameters and turn results into documents. - - Args: - query (str): Graph Cypher query string. - parameters (Optional[Dict]): optional query parameters. - - Returns: - List[DocumentNode]: A list of documents. - - """ - metadata = {"query": query, "parameters": parameters} - - if parameters is None: - parameters = {} - - records, summary, keys = self.client.execute_query( - query, parameters, database_=self.database - ) - - documents = [ - DocumentNode(text=yaml.dump(entry.data()), extra_info=metadata) - for entry in records - ] - - return documents diff --git a/nextpy/ai/rag/document_loaders/graphdb_cypher/requirements.txt b/nextpy/ai/rag/document_loaders/graphdb_cypher/requirements.txt deleted file mode 100644 index 68fec45c..00000000 --- a/nextpy/ai/rag/document_loaders/graphdb_cypher/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -neo4j diff --git a/nextpy/ai/rag/document_loaders/graphql/README.md b/nextpy/ai/rag/document_loaders/graphql/README.md deleted file mode 100644 index fc6ef3e8..00000000 --- a/nextpy/ai/rag/document_loaders/graphql/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# GraphQL Loader - -This loader loads documents via GraphQL queries from a GraphQL endpoint. -The user specifies a GraphQL endpoint URL with optional credentials to initialize the reader. -By declaring the GraphQL query and optional variables (parameters) the loader can fetch the nested result docs. - -## Usage - -Here's an example usage of the GraphQLReader. -You can test out queries directly [on the site](https://countries.trevorblades.com/) - -```python -from nextpy.ai import download_loader -import os - -GraphQLReader = download_loader('GraphQLReader') - -uri = "https://countries.trevorblades.com/" -headers = {} -query = """ - query getContinents { - continents { - code - name - } - } -""" -reader = GraphQLReader(uri, headers) -documents = reader.query(query, variables = {}) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) -and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. -See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - -It uses the [gql GraphQL library](https://pypi.org/project/gql/) for the GraphQL queries. 
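Note: the `GraphQLReader` implementation in `base.py` below only defines `load_data`, so if `reader.query(...)` is not available in your version, the equivalent call would presumably be:

```python
documents = reader.load_data(query, variables={})
```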
\ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/graphql/__init__.py b/nextpy/ai/rag/document_loaders/graphql/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/graphql/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/graphql/base.py b/nextpy/ai/rag/document_loaders/graphql/base.py deleted file mode 100644 index b5eed52b..00000000 --- a/nextpy/ai/rag/document_loaders/graphql/base.py +++ /dev/null @@ -1,88 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""GraphQL Reader.""" - -from typing import Dict, List, Optional - -import yaml - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class GraphQLReader(BaseReader): - """GraphQL reader. - - Combines all GraphQL results into the DocumentNode used by LlamaIndex. - - Args: - uri (str): GraphQL uri. - headers (Optional[Dict]): Optional http headers. - - """ - - def __init__( - self, - uri: Optional[str] = None, - headers: Optional[Dict] = None, - ) -> None: - """Initialize with parameters.""" - self.uri = uri - - try: - from gql import Client - from gql.transport.requests import RequestsHTTPTransport - - except ImportError: - raise ImportError("`gql` package not found, please run `pip install gql`") - if uri: - if uri is None: - raise ValueError("`uri` must be provided.") - if headers is None: - headers = {} - transport = RequestsHTTPTransport(url=uri, headers=headers) - self.client = Client(transport=transport, fetch_schema_from_transport=True) - - def load_data( - self, query: str, variables: Optional[Dict] = None - ) -> List[DocumentNode]: - """Run query with optional variables and turn results into documents. - - Args: - query (str): GraphQL query string. - variables (Optional[Dict]): optional query parameters. - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - metadata = {"uri": self.uri, "query": query, "variables": variables} - - try: - from gql import gql - - except ImportError: - raise ImportError("`gql` package not found, please run `pip install gql`") - if variables is None: - variables = {} - - documents = [] - - result = self.client.execute(gql(query), variable_values=variables) - - for key in result: - entry = result[key] - if type(entry) == list: - documents.extend( - [ - DocumentNode(text=yaml.dump(v), extra_info=metadata) - for v in entry - ] - ) - else: - documents.append( - DocumentNode(text=yaml.dump(entry), extra_info=metadata) - ) - - return documents diff --git a/nextpy/ai/rag/document_loaders/graphql/requirements.txt b/nextpy/ai/rag/document_loaders/graphql/requirements.txt deleted file mode 100644 index 21fdd175..00000000 --- a/nextpy/ai/rag/document_loaders/graphql/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -gql -requests_toolbelt \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/hatena_blog/README.md b/nextpy/ai/rag/document_loaders/hatena_blog/README.md deleted file mode 100644 index c48fbd22..00000000 --- a/nextpy/ai/rag/document_loaders/hatena_blog/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Hatena Blog Loader - -This loader fetches article from your own [Hatena Blog](https://hatenablog.com/) blog posts using the AtomPub API. - -You can get AtomPub info from the admin page after logging into Hatena Blog. - -## Usage - -Here's an example usage of the HatenaBlogReader. - -```python -from nextpy.ai import download_loader -import os - -HatenaBlogReader = download_loader('HatenaBlogReader') - -root_endpoint = os.getenv('ATOM_PUB_ROOT_ENDPOINT') -api_key = os.getenv('ATOM_PUB_API_KEY') -username = os.getenv('HATENA_BLOG_USERNAME') - -reader = HatenaBlogReader(root_endpoint=root_endpoint, api_key=api_key, username=username) -documents = reader.load_data() - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/hatena_blog/__init__.py b/nextpy/ai/rag/document_loaders/hatena_blog/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/hatena_blog/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/hatena_blog/base.py b/nextpy/ai/rag/document_loaders/hatena_blog/base.py deleted file mode 100644 index 55493f6f..00000000 --- a/nextpy/ai/rag/document_loaders/hatena_blog/base.py +++ /dev/null @@ -1,97 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Hatena Blog reader.""" - -from typing import Dict, List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -ATOM_PUB_ENTRY_URL = "{root_endpoint}/entry" - - -class Article: - def __init__(self) -> None: - self.title = "" - self.content = "" - self.published = "" - - -class HatenaBlogReader(BaseReader): - """Hatena Blog reader. - - Args: - root_endpoint (str): AtomPub root endpoint. - api_key (str): AtomPub API Key - username (str): Hatena ID - """ - - def __init__(self, root_endpoint: str, api_key: str, username: str) -> None: - """Initialize Hatena Blog reader.""" - self.root_endpoint = root_endpoint - self.api_key = api_key - self.username = username - - def load_data(self) -> List[DocumentNode]: - results = [] - articles = self.get_all_articles() - for a in articles: - results.append( - DocumentNode( - text=a.content, - extra_info={ - "title": a.title, - "published": a.published, - "root_endpoint": self.root_endpoint, - }, - ) - ) - - return results - - def get_all_articles(self) -> List[Article]: - articles: List[Article] = [] - page_url = ATOM_PUB_ENTRY_URL.format(root_endpoint=self.root_endpoint) - - while True: - res = self.get_articles(page_url) - articles += res.get("articles") - page_url = res.get("next_page") - if page_url is None: - break - - return articles - - def get_articles(self, url: str) -> Dict: - import requests - from bs4 import BeautifulSoup - from requests.auth import HTTPBasicAuth - - articles: List[Article] = [] - next_page = None - - res = requests.get(url, auth=HTTPBasicAuth(self.username, self.api_key)) - soup = BeautifulSoup(res.text, "xml") - for entry in soup.find_all("entry"): - if entry.find("app:control").find("app:draft").string == "yes": - continue - article = Article() - article.title = entry.find("title").string - article.published = entry.find("published").string - content = entry.find("content") - if content.get("type") == "text/html": - article.content = ( - BeautifulSoup(entry.find("content").string, "html.parser") - .get_text() - .strip() - ) - else: - article.content = entry.find("content").string.strip() - articles.append(article) - - next = soup.find("link", attrs={"rel": "next"}) - if next: - next_page = next.get("href") - - return {"articles": articles, "next_page": next_page} diff --git a/nextpy/ai/rag/document_loaders/hatena_blog/requirements.txt b/nextpy/ai/rag/document_loaders/hatena_blog/requirements.txt deleted file mode 100644 index da1564b3..00000000 --- a/nextpy/ai/rag/document_loaders/hatena_blog/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -beautifulsoup4 -lxml \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/hubspot/README.md b/nextpy/ai/rag/document_loaders/hubspot/README.md deleted file mode 100644 index 5c5f9db7..00000000 --- a/nextpy/ai/rag/document_loaders/hubspot/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Hubspot Loader - -This loader loads documents from Hubspot. The user specifies an access token to initialize the HubspotReader. - -At the moment, this loader only supports access token authentication. To obtain an access token, you will need to create a private app by following instructions [here](https://developers.hubspot.com/docs/api/private-apps). - -## Usage - -Here's an example usage of the HubspotReader. 
- -```python -from nextpy.ai import download_loader -import os -HubspotReader = download_loader('HubspotReader') - -reader = HubspotReader("") -documents = reader.load_data() - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/hubspot/__init__.py b/nextpy/ai/rag/document_loaders/hubspot/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/hubspot/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/hubspot/base.py b/nextpy/ai/rag/document_loaders/hubspot/base.py deleted file mode 100644 index b18f7eb7..00000000 --- a/nextpy/ai/rag/document_loaders/hubspot/base.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Hubspot reader.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class HubspotReader(BaseReader): - """Hubspot reader. Reads data from a Hubspot account. - - Args: - access_token(str): Hubspot API key. - """ - - def __init__(self, access_token: str) -> None: - """Initialize Hubspot reader.""" - self.access_token = access_token - - def load_data(self) -> List[DocumentNode]: - """Load deals, contacts and companies data from Hubspot. - - Returns: - List[DocumentNode]: List of documents, where each DocumentNode represensts a list of Hubspot objects - """ - from hubspot import HubSpot - - api_client = HubSpot(access_token=self.access_token) - all_deals = api_client.crm.deals.get_all() - all_contacts = api_client.crm.contacts.get_all() - all_companies = api_client.crm.companies.get_all() - results = [ - DocumentNode( - text=f"{all_deals}".replace("\n", ""), extra_info={"type": "deals"} - ), - DocumentNode( - text=f"{all_contacts}".replace("\n", ""), - extra_info={"type": "contacts"}, - ), - DocumentNode( - text=f"{all_companies}".replace("\n", ""), - extra_info={"type": "companies"}, - ), - ] - return results diff --git a/nextpy/ai/rag/document_loaders/hubspot/requirements.txt b/nextpy/ai/rag/document_loaders/hubspot/requirements.txt deleted file mode 100644 index ef8e3ebc..00000000 --- a/nextpy/ai/rag/document_loaders/hubspot/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -hubspot-api-client \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/huggingface/fs/README.md b/nextpy/ai/rag/document_loaders/huggingface/fs/README.md deleted file mode 100644 index 2083024b..00000000 --- a/nextpy/ai/rag/document_loaders/huggingface/fs/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Hugging Face FS Loader - -This loader uses Hugging Face Hub's Filesystem API (> 0.14) to -load datasets. 
- -Besides the existing `load_data` function, you may also choose to use -`load_dicts` and `load_df`. - -## Usage - -To use this loader, you need to pass in a path to a Hugging Face dataset. - -```python -from pathlib import Path -from nextpy.ai import download_loader - -HuggingFaceFSReader = download_loader("HuggingFaceFSReader") - -# load documents -loader = HuggingFaceFSReader() -documents = loader.load_data('datasets/dair-ai/emotion/data/data.jsonl.gz') - -# load dicts -dicts = loader.load_dicts('datasets/dair-ai/emotion/data/data.jsonl.gz') - -# load df -df = loader.load_df('datasets/dair-ai/emotion/data/data.jsonl.gz') - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - - diff --git a/nextpy/ai/rag/document_loaders/huggingface/fs/__init__.py b/nextpy/ai/rag/document_loaders/huggingface/fs/__init__.py deleted file mode 100644 index 1c233aca..00000000 --- a/nextpy/ai/rag/document_loaders/huggingface/fs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init params.""" diff --git a/nextpy/ai/rag/document_loaders/huggingface/fs/base.py b/nextpy/ai/rag/document_loaders/huggingface/fs/base.py deleted file mode 100644 index cce66c52..00000000 --- a/nextpy/ai/rag/document_loaders/huggingface/fs/base.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Hugging Face file reader. - -A parser for HF files. - -""" -import json -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Dict, List - -import pandas as pd - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class HuggingFaceFSReader(BaseReader): - r"""Hugging Face File System reader. - - Uses the new Filesystem API from the Hugging Face Hub client library. 
- - Args: - - - """ - - def __init__(self) -> None: - from huggingface_hub import HfFileSystem - - self.fs = HfFileSystem() - - def load_dicts(self, path: str) -> List[Dict]: - """Parse file.""" - test_data = self.fs.read_bytes(path) - - path = Path(path) - if ".gz" in path.suffixes: - import gzip - - with TemporaryDirectory() as tmp: - tmp = Path(tmp) - with open(tmp / "tmp.jsonl.gz", "wb") as fp: - fp.write(test_data) - - f = gzip.open(tmp / "tmp.jsonl.gz", "rb") - raw = f.read() - data = raw.decode() - else: - data = test_data.decode() - - text_lines = data.split("\n") - json_dicts = [] - for t in text_lines: - try: - json_dict = json.loads(t) - except json.decoder.JSONDecodeError: - continue - json_dicts.append(json_dict) - return json_dicts - - def load_df(self, path: str) -> pd.DataFrame: - """Load pandas dataframe.""" - return pd.DataFrame(self.load_dicts(path)) - - def load_data(self, path: str) -> List[DocumentNode]: - """Load data.""" - metadata = {"path": path} - json_dicts = self.load_dicts(path) - docs = [] - for d in json_dicts: - docs.append(DocumentNode(text=str(d), extra_info=metadata)) - return docs diff --git a/nextpy/ai/rag/document_loaders/huggingface/fs/requirements.txt b/nextpy/ai/rag/document_loaders/huggingface/fs/requirements.txt deleted file mode 100644 index 29e43968..00000000 --- a/nextpy/ai/rag/document_loaders/huggingface/fs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -huggingface-hub \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/intercom/README.md b/nextpy/ai/rag/document_loaders/intercom/README.md deleted file mode 100644 index 87432f83..00000000 --- a/nextpy/ai/rag/document_loaders/intercom/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Intercom Loader - -This loader fetches the text from Intercom help articles using the Intercom API. It also uses the BeautifulSoup library to parse the HTML and extract the text from the articles. - -## Usage - -To use this loader, you need to pass in an Intercom account access token. - -```python -from nextpy.ai import download_loader - -IntercomReader = download_loader("IntercomReader") - -loader = IntercomReader(intercom_access_token="my_access_token") -documents = loader.load_data() -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/intercom/__init__.py b/nextpy/ai/rag/document_loaders/intercom/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/intercom/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/intercom/base.py b/nextpy/ai/rag/document_loaders/intercom/base.py deleted file mode 100644 index fbbf9615..00000000 --- a/nextpy/ai/rag/document_loaders/intercom/base.py +++ /dev/null @@ -1,93 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Intercom reader.""" -import json -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class IntercomReader(BaseReader): - """Intercom reader. Reads data from a Intercom workspace. - - Args: - personal_access_token (str): Intercom token. - """ - - def __init__(self, intercom_access_token: str) -> None: - """Initialize Intercom reader.""" - self.intercom_access_token = intercom_access_token - - def load_data(self) -> List[DocumentNode]: - """Load data from the workspace. - - Args: - workspace_id (str): Workspace ID. - - Returns: - List[DocumentNode]: List of documents. - """ - from bs4 import BeautifulSoup - - results = [] - - articles = self.get_all_articles() - - for article in articles: - - body = article["body"] - soup = BeautifulSoup(body, "html.parser") - body = soup.get_text() - - extra_info = { - "id": article["id"], - "title": article["title"], - "url": article["url"], - "updated_at": article["updated_at"], - } - - results.append( - DocumentNode( - text=body, - extra_info=extra_info or {}, - ) - ) - - return results - - def get_all_articles(self): - articles = [] - next_page = None - - while True: - response = self.get_articles_page(next_page) - articles.extend(response["articles"]) - next_page = response["next_page"] - - if next_page is None: - break - - return articles - - def get_articles_page(self, next_page: str = None): - import requests - - url = "https://api.intercom.io/articles" if next_page is None else next_page - - headers = { - "accept": "application/json", - "Intercom-Version": "2.8", - "authorization": f"Bearer {self.intercom_access_token}", - } - - response = requests.get(url, headers=headers) - - response_json = json.loads(response.text) - - next_page = response_json.get("pages", {}).get("next", None) - - articles = response_json.get("data", []) - - return {"articles": articles, "next_page": next_page} diff --git a/nextpy/ai/rag/document_loaders/intercom/requirements.txt b/nextpy/ai/rag/document_loaders/intercom/requirements.txt deleted file mode 100644 index 2f1f891a..00000000 --- a/nextpy/ai/rag/document_loaders/intercom/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -beautifulsoup4==4.11.1 -requests==2.28.1 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/jira/README.md b/nextpy/ai/rag/document_loaders/jira/README.md deleted file mode 100644 index 6f6459a3..00000000 --- a/nextpy/ai/rag/document_loaders/jira/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# JIRA Reader - -The Jira loader returns a set of issues based on the query provided to the dataloader. The user intializes the reader with an email, API token and the URL of the server they wish to fetch issues from. 
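Because `JiraReader.__init__` builds the connection as `https://{server_url}`, the `server_url` value is best given as a bare host rather than a full URL. A minimal sketch under that assumption (the host, the environment-variable names, and the JQL filter with the `MYPROJ` project key are illustrative placeholders, not values from this repository):

```python
import os

from nextpy.ai import download_loader

JiraReader = download_loader('JiraReader')

# Bare host only; the reader prepends "https://" itself.
reader = JiraReader(
    email=os.environ["JIRA_EMAIL"],
    api_token=os.environ["JIRA_API_TOKEN"],
    server_url="your-domain.atlassian.net",
)

# Any valid JQL string works here; this particular filter is hypothetical.
documents = reader.load_data(query='project = MYPROJ AND status = "In Progress"')
```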
- -## Usage - -Here's an example of how to use it - -```python - -from llama_hub.jira.base import JiraReader - -reader = JiraReader(email=email, api_token=api_token, server_url="https://your-jira-server.com") -documents = reader.load_data(query='project = ') - -``` - -Alternately, you can also use download_loader from nextpy.ai - -```python - -from nextpy.ai import download_loader -JiraReader = download_loader('JiraReader') - -reader = JiraReader(email=email, api_token=api_token, server_url="https://your-jira-server.com") -documents = reader.load_data(query='project = ') - -``` diff --git a/nextpy/ai/rag/document_loaders/jira/__init__.py b/nextpy/ai/rag/document_loaders/jira/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/jira/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/jira/base.py b/nextpy/ai/rag/document_loaders/jira/base.py deleted file mode 100644 index 8aef73db..00000000 --- a/nextpy/ai/rag/document_loaders/jira/base.py +++ /dev/null @@ -1,98 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -def safe_value_dict(dict_obj): - for key, value in dict_obj.items(): - if isinstance(value, (str, int, float)): - dict_obj[key] = value - elif isinstance(value, list): - # Convert lists to strings - dict_obj[key] = ", ".join(map(str, value)) - elif value is None: - # Replace None with a default string - dict_obj[key] = "" - else: - # Convert other types to strings - dict_obj[key] = str(value) - return dict_obj - - -class JiraReader(BaseReader): - """Jira reader. Reads data from Jira issues from passed query. - - Args: - email (str): Jira email. - api_token (str): Jira API token. - server_url (str): Jira server url. 
- """ - - def __init__(self, email: str, api_token: str, server_url: str) -> None: - - from jira import JIRA - - self.jira = JIRA(basic_auth=(email, api_token), server=f"https://{server_url}") - - def load_data(self, query: str) -> List[DocumentNode]: - relevant_issues = self.jira.search_issues(query) - - issues = [] - - for issue in relevant_issues: - # Iterates through only issues and not epics - if "parent" in (issue.raw["fields"]): - assignee = "" - reporter = "" - epic_key = "" - epic_summary = "" - epic_descripton = "" - - if issue.fields.assignee: - assignee = issue.fields.assignee.displayName - - if issue.fields.reporter: - reporter = issue.fields.reporter.displayName - - if issue.raw["fields"]["parent"]["key"]: - epic_key = issue.raw["fields"]["parent"]["key"] - - if issue.raw["fields"]["parent"]["fields"]["summary"]: - epic_summary = issue.raw["fields"]["parent"]["fields"]["summary"] - - if issue.raw["fields"]["parent"]["fields"]["status"]["description"]: - epic_descripton = issue.raw["fields"]["parent"]["fields"]["status"][ - "description" - ] - - issues.append( - DocumentNode( - text=f"{issue.fields.summary} \n {issue.fields.description}", - extra_info=safe_value_dict( - { - "id": issue.id, - "title": issue.fields.summary, - "url": issue.permalink(), - "query": query, - "created_at": issue.fields.created, - "updated_at": issue.fields.updated, - "labels": issue.fields.labels, - "status": issue.fields.status.name, - "assignee": assignee, - "reporter": reporter, - "project": issue.fields.project.name, - "issue_type": issue.fields.issuetype.name, - "priority": issue.fields.priority.name, - "epic_key": epic_key, - "epic_summary": epic_summary, - "epic_description": epic_descripton, - } - ), - ) - ) - - return issues diff --git a/nextpy/ai/rag/document_loaders/jira/requirements.txt b/nextpy/ai/rag/document_loaders/jira/requirements.txt deleted file mode 100644 index 9cf40eaa..00000000 --- a/nextpy/ai/rag/document_loaders/jira/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -jira diff --git a/nextpy/ai/rag/document_loaders/joplin/README.md b/nextpy/ai/rag/document_loaders/joplin/README.md deleted file mode 100644 index b4bf8dea..00000000 --- a/nextpy/ai/rag/document_loaders/joplin/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Joplin (Markdown) Loader - ->[Joplin](https://joplinapp.org/) is an open source note-taking app. Capture your thoughts and securely access them from any device. - -This readme covers how to load documents from a `Joplin` database. - -`Joplin` has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. This reader uses the API to retrieve all notes in the database and their metadata. This requires an access token that can be obtained from the app by following these steps: - -1. Open the `Joplin` app. The app must stay open while the documents are being loaded. -2. Go to settings / options and select "Web Clipper". -3. Make sure that the Web Clipper service is enabled. -4. Under "Advanced Options", copy the authorization token. - -You may either initialize the reader directly with the access token, or store it in the environment variable JOPLIN_ACCESS_TOKEN. - -An alternative to this approach is to export the `Joplin`'s note database to Markdown files (optionally, with Front Matter metadata) and use a Markdown reader, such as ObsidianReader, to load them. - -## Usage - -Here's an example usage of the JoplinReader. 
- -```python -from nextpy.ai import download_loader -import os - -JoplinReader = download_loader('JoplinReader') -documents = JoplinReader(access_token='').load_data() # Returns list of documents -``` diff --git a/nextpy/ai/rag/document_loaders/joplin/__init__.py b/nextpy/ai/rag/document_loaders/joplin/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/joplin/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/joplin/base.py b/nextpy/ai/rag/document_loaders/joplin/base.py deleted file mode 100644 index ef235ee3..00000000 --- a/nextpy/ai/rag/document_loaders/joplin/base.py +++ /dev/null @@ -1,129 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Joplin reader class. - -When Joplin is installed and running it will parse all markdown -files into a List of Documents. - -""" -import json -import os -import urllib -from datetime import datetime -from typing import Iterator, List, Optional - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}" - - -class JoplinReader(BaseReader): - """Reader that fetches notes from Joplin. - - In order to use this reader, you need to have Joplin running with the - Web Clipper enabled (look for "Web Clipper" in the app settings). - - To get the access token, you need to go to the Web Clipper options and - under "Advanced Options" you will find the access token. You may provide - it as an argument or set the JOPLIN_ACCESS_TOKEN environment variable. - - You can find more information about the Web Clipper service here: - https://joplinapp.org/clipper/ - """ - - def __init__( - self, - access_token: Optional[str] = None, - parse_markdown: bool = True, - port: int = 41184, - host: str = "localhost", - ) -> None: - """Initialize a new instance of JoplinReader. - - Args: - access_token (Optional[str]): The access token for Joplin's Web Clipper service. - If not provided, the JOPLIN_ACCESS_TOKEN environment variable is used. Default is None. - parse_markdown (bool): Whether to parse the markdown content of the notes using MarkdownReader. Default is True. - port (int): The port on which Joplin's Web Clipper service is running. Default is 41184. - host (str): The host on which Joplin's Web Clipper service is running. Default is "localhost". 
- """ - self.parse_markdown = parse_markdown - if parse_markdown: - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - mr = import_loader("MarkdownReader") - except: - mr = download_loader("MarkdownReader") - self.parser = mr() - - access_token = access_token or self._get_token_from_env() - base_url = f"http://{host}:{port}" - self._get_note_url = ( - f"{base_url}/notes?token={access_token}" - f"&fields=id,parent_id,title,body,created_time,updated_time&page={{page}}" - ) - self._get_folder_url = ( - f"{base_url}/folders/{{id}}?token={access_token}&fields=title" - ) - self._get_tag_url = ( - f"{base_url}/notes/{{id}}/tags?token={access_token}&fields=title" - ) - - def _get_token_from_env(self) -> str: - if "JOPLIN_ACCESS_TOKEN" in os.environ: - return os.environ["JOPLIN_ACCESS_TOKEN"] - else: - raise ValueError( - "You need to provide an access token to use the Joplin reader. You may provide it as an argument or set the JOPLIN_ACCESS_TOKEN environment variable." - ) - - def _get_notes(self) -> Iterator[DocumentNode]: - has_more = True - page = 1 - while has_more: - req_note = urllib.request.Request(self._get_note_url.format(page=page)) - with urllib.request.urlopen(req_note) as response: - json_data = json.loads(response.read().decode()) - for note in json_data["items"]: - metadata = { - "source": LINK_NOTE_TEMPLATE.format(id=note["id"]), - "folder": self._get_folder(note["parent_id"]), - "tags": self._get_tags(note["id"]), - "title": note["title"], - "created_time": self._convert_date(note["created_time"]), - "updated_time": self._convert_date(note["updated_time"]), - } - if self.parse_markdown: - yield from self.parser.load_data( - None, content=note["body"], extra_info=metadata - ) - else: - yield DocumentNode(text=note["body"], extra_info=metadata) - - has_more = json_data["has_more"] - page += 1 - - def _get_folder(self, folder_id: str) -> str: - req_folder = urllib.request.Request(self._get_folder_url.format(id=folder_id)) - with urllib.request.urlopen(req_folder) as response: - json_data = json.loads(response.read().decode()) - return json_data["title"] - - def _get_tags(self, note_id: str) -> List[str]: - req_tag = urllib.request.Request(self._get_tag_url.format(id=note_id)) - with urllib.request.urlopen(req_tag) as response: - json_data = json.loads(response.read().decode()) - return ",".join([tag["title"] for tag in json_data["items"]]) - - def _convert_date(self, date: int) -> str: - return datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S") - - def lazy_load(self) -> Iterator[DocumentNode]: - yield from self._get_notes() - - def load_data(self) -> List[DocumentNode]: - return list(self.lazy_load()) diff --git a/nextpy/ai/rag/document_loaders/jsondata/README.md b/nextpy/ai/rag/document_loaders/jsondata/README.md deleted file mode 100644 index 34b8cf00..00000000 --- a/nextpy/ai/rag/document_loaders/jsondata/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Json Data Loader - -This loader extracts the text in a formatted manner from Json data in a Python dictionary. The `data` is passed to `load_data`. Ideal use case is for consuming REST API JSON data. - -## Usage - -To use this loader, you need to pass in Json data in a Python dictionary. 
- -```python -import requests -from nextpy.ai import GPTVectorDBIndex, download_loader -headers = { - "Authorization": "your_api_token" -} -data = requests.get("your-api-url", headers=headers).json() - -JsonDataReader = download_loader("JsonDataReader") -loader = JsonDataReader() -documents = loader.load_data(data) -index = GPTVectorDBIndex.from_documents(documents) -index.query("Question about your data") -``` - diff --git a/nextpy/ai/rag/document_loaders/jsondata/__init__.py b/nextpy/ai/rag/document_loaders/jsondata/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/jsondata/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/jsondata/base.py b/nextpy/ai/rag/document_loaders/jsondata/base.py deleted file mode 100644 index acfcd04d..00000000 --- a/nextpy/ai/rag/document_loaders/jsondata/base.py +++ /dev/null @@ -1,55 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Json Data Reader.""" - -import json -import re -from typing import Dict, Generator, List, Union - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -def _depth_first_yield(json_data: Dict, path: List[str]) -> Generator[str, None, None]: - """Do depth first yield of all of the leaf nodes of a JSON. - - Combines keys in the JSON tree using spaces. - - """ - if isinstance(json_data, dict): - for key, value in json_data.items(): - new_path = path[:] - new_path.append(key) - yield from _depth_first_yield(value, new_path) - elif isinstance(json_data, list): - for _, value in enumerate(json_data): - yield from _depth_first_yield(value, path) - else: - path.append(str(json_data)) - yield " ".join(path) - - -class JSONDataReader(BaseReader): - """Json Data reader. - - Reads in Json Data. - - Args: - data(Union[str, Dict]): Json data to read. Can be either a JSON - string or dictionary. 
- - """ - - def __init__(self) -> None: - """Initialize with arguments.""" - super().__init__() - - def load_data(self, input_data: Union[str, Dict]) -> List[DocumentNode]: - """Load data from the input file.""" - metadata = {"input_data": input_data} - data = json.loads(input_data) if isinstance(input_data, str) else input_data - json_output = json.dumps(data, indent=0) - lines = json_output.split("\n") - useful_lines = [line for line in lines if not re.match(r"^[{}\[\],]*$", line)] - return [DocumentNode(text="\n".join(useful_lines), extra_info=metadata)] diff --git a/nextpy/ai/rag/document_loaders/jsondata/requirements.txt b/nextpy/ai/rag/document_loaders/jsondata/requirements.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/nextpy/ai/rag/document_loaders/kaltura/esearch/README.md b/nextpy/ai/rag/document_loaders/kaltura/esearch/README.md deleted file mode 100644 index 4f90b81f..00000000 --- a/nextpy/ai/rag/document_loaders/kaltura/esearch/README.md +++ /dev/null @@ -1,119 +0,0 @@ -# Kaltura eSearch Loader - -This loader reads Kaltura Entries from [Kaltura](https://corp.kaltura.com) based on a Kaltura eSearch API call. -Search queries can be passed as a pre-defined object of KalturaESearchEntryParams, or through a simple free text query. -The result is a list of documents containing the Kaltura Entries and Captions json. - -## Parameters - -### `KalturaESearchEntryParams` - -This is a Kaltura class used for performing search operations in Kaltura. You can use this class to define various search criteria, such as search phrases, operators, and objects to be searched. - -For example, you can search for entries with specific tags, created within a specific time frame, or containing specific metadata. - -### Kaltura Configuration - -To use the Kaltura eSearch Loader, you need to provide the following configuration credentials: - -| Parameter | Description | Default Value | -|----------------------|-------------------------------------------------------------------------------|--------------------------------------------------| -| partnerId | Your Kaltura partner ID. | Mandatory (no default) | -| apiSecret | Your Kaltura API secret key (aka Admin Secret). | Mandatory (no default) | -| userId | Your Kaltura user ID. | Mandatory (no default) | -| ksType | The Kaltura session type. | KalturaSessionType.ADMIN | -| ksExpiry | The Kaltura session expiry time. | 86400 seconds | -| ksPrivileges | The Kaltura session privileges. | "disableentitlement" | -| kalturaApiEndpoint | The Kaltura API endpoint URL. | "[https://cdnapi-ev.kaltura.com/](https://cdnapi-ev.kaltura.com/)" | -| requestTimeout | The request timeout duration in seconds. | 500 seconds | -| shouldLogApiCalls | If passed True, all the Kaltura API calls will also be printed to log (only use during debug). | False | - -### load_data - -This method run the search in Kaltura and load Kaltura entries in a list of dictionaries. - -#### Method inputs - -* search_params: search parameters of type KalturaESearchEntryParams with pre-set search queries. If not provided, the other parameters will be used to construct the search query. -* search_operator_and: if True, the constructed search query will have AND operator between query filters, if False, the operator will be OR. -* free_text: if provided, will be used as the free text query of the search in Kaltura. -* category_ids: if provided, will only search for entries that are found inside these category ids. 
-* withCaptions: determines whether or not to also download captions/transcript contents from Kaltura. -* maxEntries: sets the maximum number of entries to pull from Kaltura, between 0 to 500 (max pageSize in Kaltura). - -#### Method output - -Each dictionary in the response represents a Kaltura media entry, where the keys are strings (field names) and the values can be of any type: - -| Column Name | Data Type | Description | -|---------------------|-----------|-----------------------------------| -| entry_id | str | Unique identifier of the entry | -| entry_name | str | Name of the entry | -| entry_description | str | Description of the entry | -| entry_captions | JSON | Captions of the entry | -| entry_media_type | int | Type of the media (KalturaMediaType) | -| entry_media_date | int | Date of the media Unix timestamp | -| entry_ms_duration | int | Duration of the entry in ms | -| entry_last_played_at| int | Last played date of the entry Unix timestamp | -| entry_application | str | The app that created this entry (KalturaEntryApplication) | -| entry_tags | str | Tags of the entry (comma separated) | -| entry_reference_id | str | Reference ID of the entry | - -## Usage - -First, instantiate the KalturaReader (aka Kaltura Loader) with your Kaltura configuration credentials: - -```python -from nextpy.ai import download_loader - -KalturaESearchReader = download_loader("KalturaESearchReader") - -loader = KalturaESearchReader( - partnerId="INSERT_YOUR_PARTNER_ID", - apiSecret="INSERT_YOUR_ADMIN_SECRET", - userId="INSERT_YOUR_USER_ID" -) -``` - -### Using an instance of KalturaESearchEntryParams - -Then, create an instance of `KalturaESearchEntryParams` and set your desired search parameters: - -```python -from KalturaClient.Plugins.ElasticSearch import KalturaESearchEntryParams, KalturaESearchEntryOperator, KalturaESearchOperatorType, KalturaESearchUnifiedItem - -# instantiate the params object -search_params = KalturaESearchEntryParams() - -# define search parameters (for example, search for entries with a certain tag) -search_params.searchOperator = KalturaESearchEntryOperator() -search_params.searchOperator.operator = KalturaESearchOperatorType.AND_OP -search_params.searchOperator.searchItems = [KalturaESearchUnifiedItem()] -search_params.searchOperator.searchItems[0].searchTerm = "my_tag" -``` - -Once you have your `KalturaESearchEntryParams` ready, you can pass it to the Kaltura Loader: - -```python -# Using search params -entry_docs = loader.load_data(search_params) -``` - -### Using Free Text Search - -```python -# Simple pass the search params into the load_data method without setting search_params -entry_docs = loader.load_data(search_operator_and=True, - free_text="education", - category_ids=None, - with_captions=True, - max_entries=5) -``` - -For a more elaborate example, see: [llamaindex_kaltura_esearch_reader_example.py](https://gist.github.com/zoharbabin/07febcfe52b64116c9e3ba1a392b59a0) - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. - -## About Kaltura - -Kaltura Video Cloud is a Digital Experience Platform enabling streamlined creation, management, and distribution of media content (video, audio, image, doc, live stream, real-time video). 
It powers many applications across industries with collaboration, interactivity, virtual events, and deep video analytics capabilities. diff --git a/nextpy/ai/rag/document_loaders/kaltura/esearch/__init__.py b/nextpy/ai/rag/document_loaders/kaltura/esearch/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/kaltura/esearch/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/kaltura/esearch/base.py b/nextpy/ai/rag/document_loaders/kaltura/esearch/base.py deleted file mode 100644 index 2c52e998..00000000 --- a/nextpy/ai/rag/document_loaders/kaltura/esearch/base.py +++ /dev/null @@ -1,262 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Kaltura eSearch API Reader.""" -import json -import logging -from typing import Any, Dict, List, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -class KalturaESearchReader(BaseReader): - """Kaltura eSearch API Reader.""" - - def __init__( - self, - partner_id: int = 0, - api_secret: str = "INSERT_YOUR_ADMIN_SECRET", - user_id: str = "INSERT_YOUR_USER_ID", - ks_type: int = 2, - ks_expiry: int = 86400, - ks_privileges: str = "disableentitlement", - kaltura_api_endpoint: str = "https://cdnapi-ev.kaltura.com/", - request_timeout: int = 500, - should_log_api_calls: bool = False, - ) -> None: - """Initialize a new instance of KalturaESearchReader. - - Args: - partner_id (int): The Kaltura Account ID. Default is 0. - api_secret (str): The Kaltura API Admin Secret. Default is "INSERT_YOUR_ADMIN_SECRET". - user_id (str): User ID for executing and logging all API actions under. Default is "INSERT_YOUR_USER_ID". - ks_type (int): Type of Kaltura Session. Default is 2. - ks_expiry (int): Validity of the Kaltura session in seconds. Default is 86400. - ks_privileges (str): Kaltura session privileges. Default is "disableentitlement". - kaltura_api_endpoint (str): The Kaltura API endpoint. Default is "https://cdnapi-ev.kaltura.com/". - request_timeout (int): API request timeout in seconds. Default is 500. - should_log_api_calls (bool): Boolean value determining whether to log Kaltura requests. Default is False. 
- """ - self.partner_id = partner_id - self.api_secret = api_secret - self.user_id = user_id - self.ks_type = ks_type - self.ks_expiry = ks_expiry - self.ks_privileges = ks_privileges - self.kaltura_api_endpoint = kaltura_api_endpoint - self.request_timeout = request_timeout - self.should_log_api_calls = should_log_api_calls - # Kaltura libraries will be loaded when they are needed - self._kaltura_loaded = False - - def _load_kaltura(self): - """Load Kaltura libraries and initialize the Kaltura client.""" - from KalturaClient import KalturaClient - from KalturaClient.Base import IKalturaLogger, KalturaConfiguration - from KalturaClient.Plugins.Core import KalturaSessionType - - class KalturaLogger(IKalturaLogger): - def log(self, msg): - logging.info(msg) - - try: - self.config = KalturaConfiguration() - self.config.requestTimeout = self.request_timeout - self.config.serviceUrl = self.kaltura_api_endpoint - if self.should_log_api_calls: - self.config.setLogger(KalturaLogger()) - self.client = KalturaClient(self.config) - if self.ks_type is None: - self.ks_type = KalturaSessionType.ADMIN - self.ks = self.client.generateSessionV2( - self.api_secret, - self.user_id, - self.ks_type, - self.partner_id, - self.ks_expiry, - self.ks_privileges, - ) - self.client.setKs(self.ks) - self._kaltura_loaded = True - except Exception: - logger.error("Kaltura Auth failed, check your credentials") - - def _load_from_search_params( - self, search_params, with_captions: bool = True, max_entries: int = 10 - ) -> List[Dict[str, Any]]: - """Load search parameters and returns a list of entries. - - Args: - search_params: Search parameters for Kaltura eSearch. - with_captions (bool): If True, the entries will include captions. - max_entries (int): Maximum number of entries to return. - - Returns: - list: A list of entries as dictionaries, - if captions required entry_info will include all metadata and text will include transcript, - otherwise info is just entry_id and text is all metadata. 
- """ - from KalturaClient.Plugins.Core import KalturaPager - - try: - entries = [] - pager = KalturaPager() - pager.pageIndex = 1 - pager.pageSize = max_entries - response = self.client.elasticSearch.eSearch.searchEntry( - search_params, pager - ) - - for search_result in response.objects: - entry = search_result.object - items_data = search_result.itemsData - - entry_info = { - "entry_id": str(entry.id), - "entry_name": str(entry.name), - "entry_description": str(entry.description or ""), - "entry_media_type": int(entry.mediaType.value or 0), - "entry_media_date": int(entry.createdAt or 0), - "entry_ms_duration": int(entry.msDuration or 0), - "entry_last_played_at": int(entry.lastPlayedAt or 0), - "entry_application": str(entry.application or ""), - "entry_tags": str(entry.tags or ""), - "entry_reference_id": str(entry.referenceId or ""), - } - - if with_captions: - caption_search_result = items_data[0].items[0] - if hasattr(caption_search_result, "captionAssetId"): - # TODO: change this to fetch captions per language, or as for a specific language code - caption_asset_id = caption_search_result.captionAssetId - entry_dict = { - "video_transcript": self._get_json_transcript( - caption_asset_id - ) - } - else: - entry_dict = entry_info.copy() - entry_info = {"entry_id": str(entry.id)} - else: - entry_dict = entry_info.copy() - entry_info = {"entry_id": str(entry.id)} - - entry_doc = DocumentNode( - text=json.dumps(entry_dict), extra_info=entry_info - ) - entries.append(entry_doc) - - return entries - - except Exception as e: - if e.code == "INVALID_KS": - raise ValueError(f"Kaltura Auth failed, check your credentials: {e}") - logger.error(f"An error occurred while loading with search params: {e}") - return [] - - def _get_json_transcript(self, caption_asset_id): - """Fetch json transcript/captions from a given caption_asset_id. - - Args: - caption_asset_id: The ID of the caption asset that includes the captions to fetch json transcript for - - Returns: - A JSON transcript of the captions, or an empty dictionary if none found or an error occurred. - """ - # TODO: change this to fetch captions per language, or as for a specific language code - try: - cap_json_url = self.client.caption.captionAsset.serveAsJson( - caption_asset_id - ) - cap_json = requests.get(cap_json_url).json() - return cap_json - except Exception as e: - logger.error(f"An error occurred while getting captions: {e}") - return {} - - def load_data( - self, - search_params: Any = None, - search_operator_and: bool = True, - free_text: Optional[str] = None, - category_ids: Optional[str] = None, - with_captions: bool = True, - max_entries: int = 5, - ) -> List[Dict[str, Any]]: - """Load data from the Kaltura based on search parameters. - The function returns a list of dictionaries. - Each dictionary represents a media entry, where the keys are strings (field names) and the values can be of any type. - - Args: - search_params: search parameters of type KalturaESearchEntryParams with pre-set search queries. If not provided, the other parameters will be used to construct the search query. - search_operator_and: if True, the constructed search query will have AND operator between query filters, if False, the operator will be OR. - free_text: if provided, will be used as the free text query of the search in Kaltura. - category_ids: if provided, will only search for entries that are found inside these category ids. - withCaptions: determines whether or not to also download captions/transcript contents from Kaltura. 
- maxEntries: sets the maximum number of entries to pull from Kaltura, between 0 to 500 (max pageSize in Kaltura). - - Returns: - List[Dict[str, Any]]: A list of dictionaries representing Kaltura Media Entries with the following fields: - entry_id:str, entry_name:str, entry_description:str, entry_captions:JSON, - entry_media_type:int, entry_media_date:int, entry_ms_duration:int, entry_last_played_at:int, - entry_application:str, entry_tags:str, entry_reference_id:str. - If with_captions is False, it sets entry_info to only include the entry_id and entry_dict to include all other entry information. - If with_captions is True, it sets entry_info to include all entry information and entry_dict to only include the entry transcript fetched via self._get_captions(items_data). - """ - from KalturaClient.Plugins.ElasticSearch import ( - KalturaCategoryEntryStatus, - KalturaESearchCaptionFieldName, - KalturaESearchCaptionItem, - KalturaESearchCategoryEntryFieldName, - KalturaESearchCategoryEntryItem, - KalturaESearchEntryOperator, - KalturaESearchEntryParams, - KalturaESearchItemType, - KalturaESearchOperatorType, - KalturaESearchUnifiedItem, - ) - - # Load and initialize the Kaltura client - if not self._kaltura_loaded: - self._load_kaltura() - - # Validate input parameters: - if search_params is None: - search_params = KalturaESearchEntryParams() - # Create an AND/OR relationship between the following search queries - - search_params.searchOperator = KalturaESearchEntryOperator() - if search_operator_and: - search_params.searchOperator.operator = ( - KalturaESearchOperatorType.AND_OP - ) - else: - search_params.searchOperator.operator = KalturaESearchOperatorType.OR_OP - search_params.searchOperator.searchItems = [] - # Find only entries that have captions - - if with_captions: - caption_item = KalturaESearchCaptionItem() - caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT - caption_item.itemType = KalturaESearchItemType.EXISTS - search_params.searchOperator.searchItems.append(caption_item) - # Find only entries that are inside these category IDs - - if category_ids is not None: - category_item = KalturaESearchCategoryEntryItem() - category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE - category_item.fieldName = KalturaESearchCategoryEntryFieldName.FULL_IDS - category_item.addHighlight = False - category_item.itemType = KalturaESearchItemType.EXACT_MATCH - category_item.searchTerm = category_ids - search_params.searchOperator.searchItems.append(category_item) - # Find only entries that has this freeText found in them - - if free_text is not None: - unified_item = KalturaESearchUnifiedItem() - unified_item.searchTerm = free_text - unified_item.itemType = KalturaESearchItemType.PARTIAL - search_params.searchOperator.searchItems.append(unified_item) - - return self._load_from_search_params(search_params, with_captions, max_entries) diff --git a/nextpy/ai/rag/document_loaders/kaltura/esearch/requirements.txt b/nextpy/ai/rag/document_loaders/kaltura/esearch/requirements.txt deleted file mode 100644 index 40818090..00000000 --- a/nextpy/ai/rag/document_loaders/kaltura/esearch/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -KalturaApiClient~=19.3.0 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/kibela/README.md b/nextpy/ai/rag/document_loaders/kibela/README.md deleted file mode 100644 index e36e3f21..00000000 --- a/nextpy/ai/rag/document_loaders/kibela/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Kibela Reader - -This reader fetches article from your 
[Kibela](https://kibe.la/) notes using the GraphQL API. - -# Usage - -Here's an example of how to use it. You can get your access token from [here](https://my.kibe.la/settings/access_tokens). - -```python -import os -from llama_hub.kibela.base import KibelaReader - -team = os.environ["KIBELA_TEAM"] -token = os.environ["KIBELA_TOKEN"] - -reader = KibelaReader(team=team, token=token) -documents = reader.load_data() -``` - -Alternately, you can also use download_loader from nextpy.ai - -```python -import os -from nextpy.ai import download_loader -KibelaReader = download_loader('KibelaReader') - -team = os.environ["KIBELA_TEAM"] -token = os.environ["KIBELA_TOKEN"] - -reader = KibelaReader(team=team, token=token) -documents = reader.load_data() -``` diff --git a/nextpy/ai/rag/document_loaders/kibela/__init__.py b/nextpy/ai/rag/document_loaders/kibela/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/kibela/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/kibela/base.py b/nextpy/ai/rag/document_loaders/kibela/base.py deleted file mode 100644 index 87039299..00000000 --- a/nextpy/ai/rag/document_loaders/kibela/base.py +++ /dev/null @@ -1,112 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""LLama Kibela Reader.""" -from typing import Dict, Generic, List, Optional, TypeVar - -from pydantic import BaseModel, parse_obj_as -from pydantic.generics import GenericModel - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -NodeType = TypeVar("NodeType") - - -class Edge(GenericModel, Generic[NodeType]): - node: Optional[NodeType] - cursor: Optional[str] - - -class PageInfo(BaseModel): - startCursor: Optional[str] - endCursor: Optional[str] - hasNextPage: Optional[bool] - - -class Connection(GenericModel, Generic[NodeType]): - nodes: Optional[List[NodeType]] - edges: Optional[List[Edge[NodeType]]] - pageInfo: Optional[PageInfo] - totalCount: Optional[int] - - -class Note(BaseModel): - content: Optional[str] - id: Optional[str] - title: Optional[str] - url: Optional[str] - - -class KibelaReader(BaseReader): - """Kibela reader. - - Reads pages from Kibela. - - Args: - team (str): Kibela team. - token (str): Kibela API token. - """ - - def __init__(self, team: str, token: str) -> None: - """Initialize with parameters.""" - from gql import Client - from gql.transport.aiohttp import AIOHTTPTransport - - self.team = team - self.url = f"https://{team}.kibe.la/api/v1" - self.headers = {"Authorization": f"Bearer {token}"} - transport = AIOHTTPTransport(url=self.url, headers=self.headers) - self.client = Client(transport=transport, fetch_schema_from_transport=True) - - def request(self, query: str, params: dict) -> Dict: - from gql import gql - - q = gql(query) - return self.client.execute(q, variable_values=params) - - def load_data(self) -> List[DocumentNode]: - """Load data from Kibela. 
- - Returns: - List[DocumentNode]: List of documents. - - """ - query = """ - query getNotes($after: String) { - notes(first: 100, after: $after) { - totalCount - pageInfo { - endCursor - startCursor - hasNextPage - } - edges { - cursor - node { - id - url - title - content - } - } - } - } - """ - metadata = {"team": self.team, "url": self.url} - - params = {"after": ""} - has_next = True - documents = [] - # Due to the request limit of 10 requests per second on the Kibela API, we do not process in parallel. - # See https://github.com/kibela/kibela-api-v1-DocumentNode#1%E7%A7%92%E3%81%82%E3%81%9F%E3%82%8A%E3%81%AE%E3%83%AA%E3%82%AF%E3%82%A8%E3%82%B9%E3%83%88%E6%95%B0 - while has_next: - res = self.request(query, params) - note_conn = parse_obj_as(Connection[Note], res["notes"]) - for note in note_conn.edges: - doc = f"---\nurl: {note.node.url}\ntitle: {note.node.title}\n---\ncontent:\n{note.node.content}\n" - documents.append(DocumentNode(text=doc, extra_info=metadata)) - has_next = note_conn.pageInfo.hasNextPage - params = {"after": note_conn.pageInfo.endCursor} - - return documents diff --git a/nextpy/ai/rag/document_loaders/kibela/requirements.txt b/nextpy/ai/rag/document_loaders/kibela/requirements.txt deleted file mode 100644 index 11388188..00000000 --- a/nextpy/ai/rag/document_loaders/kibela/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gql diff --git a/nextpy/ai/rag/document_loaders/library.json b/nextpy/ai/rag/document_loaders/library.json deleted file mode 100644 index ebfb881e..00000000 --- a/nextpy/ai/rag/document_loaders/library.json +++ /dev/null @@ -1,594 +0,0 @@ -{ - "AsanaReader": { - "id": "asana", - "author": "daveey" - }, - "AzStorageBlobReader": { - "id": "azstorage_blob", - "author": "rivms", - "keywords": [ - "azure storage", - "blob", - "container", - "azure" - ] - }, - "GoogleDocsReader": { - "id": "google_docs", - "author": "jerryjliu" - }, - "GoogleDriveReader": { - "id": "google_drive", - "author": "ravi03071991" - }, - "GoogleCalendarReader": { - "id": "google_calendar", - "author": "ong" - }, - "SimpleDirectoryReader": { - "id": "file", - "author": "jerryjliu" - }, - "PDFReader": { - "id": "file/pdf", - "author": "ravi03071991" - }, - "CJKPDFReader": { - "id": "file/cjk_pdf", - "author": "JiroShimaya", - "keywords": ["Japanese", "Chinese", "Korean"] - }, - "DocxReader": { - "id": "file/docx", - "author": "thejessezhang" - }, - "PptxReader": { - "id": "file/pptx", - "author": "thejessezhang" - }, - "ImageReader": { - "id": "file/image", - "author": "ravi03071991", - "keywords": ["invoice", "receipt"] - }, - "HubspotReader": { - "id": "hubspot", - "author": "ykhli", - "keywords": ["hubspot"] - }, - "EpubReader": { - "id": "file/epub", - "author": "Haowjy" - }, - "JSONReader": { - "id": "file/json", - "author": "yisding" - }, - "MarkdownReader": { - "id": "file/markdown", - "author": "hursh-desai" - }, - "AudioTranscriber": { - "id": "file/audio", - "author": "ravi03071991" - }, - "SimpleCSVReader": { - "id": "file/simple_csv", - "author": "vguillet" - }, - "PagedCSVReader": { - "id": "file/paged_csv", - "author": "thejessezhang" - }, - "PandasCSVReader": { - "id": "file/pandas_csv", - "author": "ephe-meral" - }, - "SimpleWebPageReader": { - "id": "web/simple_web", - "author": "thejessezhang" - }, - "AsyncWebPageReader": { - "id": "web/async_web", - "author": "Hironsan" - }, - "ReadabilityWebPageReader": { - "id": "web/readability_web", - "author": "pandazki", - "extra_files": ["Readability.js"] - }, - "BeautifulSoupWebReader": { - "id": 
"web/beautiful_soup_web", - "author": "thejessezhang", - "keywords": ["substack", "readthedocs", "documentation"] - }, - "RssReader": { - "id": "web/rss", - "author": "bborn", - "keywords": ["feed", "rss", "atom"] - }, - "SitemapReader": { - "id": "web/sitemap", - "author": "selamanse", - "keywords": ["sitemap", "website", "seo"] - }, - "DatabaseReader": { - "id": "database", - "author": "kevinqz", - "keywords": ["sql", "postgres", "snowflake", "aws rds"] - }, - "GraphQLReader": { - "id": "graphql", - "author": "jexp", - "keywords": [ - "graphql", - "gql", - "apollo" - ] - }, - "GraphDBCypherReader": { - "id": "graphdb_cypher", - "author": "jexp", - "keywords": [ - "graph", - "neo4j", - "cypher" - ] - }, - "DiscordReader": { - "id": "discord", - "author": "jerryjliu" - }, - "FaissReader": { - "id": "faiss", - "author": "jerryjliu" - }, - "SimpleMongoReader": { - "id": "mongo", - "author": "jerryjliu" - }, - "SimpleCouchDBReader": { - "id": "couchdb", - "author": "technosophy" - }, - "NotionPageReader": { - "id": "notion", - "author": "jerryjliu" - }, - "JoplinReader": { - "id": "joplin", - "author": "alondmnt" - }, - "ObsidianReader": { - "id": "obsidian", - "author": "hursh-desai" - }, - "PineconeReader": { - "id": "pinecone", - "author": "jerryjliu" - }, - "QdrantReader": { - "id": "qdrant", - "author": "kacperlukawski" - }, - "ChromaReader": { - "id": "chroma", - "author": "atroyn" - }, - "ElasticsearchReader": { - "id": "elasticsearch", - "author": "jaylmiller" - }, - "SlackReader": { - "id": "slack", - "author": "jerryjliu" - }, - "StringIterableReader": { - "id": "string_iterable", - "author": "teoh" - }, - "TwitterTweetReader": { - "id": "twitter", - "author": "ravi03071991" - }, - "SnscrapeTwitterReader": { - "id": "snscrape_twitter", - "author": "smyja" - }, - "WeaviateReader": { - "id": "weaviate", - "author": "jerryjliu" - }, - "WikipediaReader": { - "id": "wikipedia", - "author": "jerryjliu" - }, - "YoutubeTranscriptReader": { - "id": "youtube_transcript", - "author": "ravi03071991" - }, - "MakeWrapper": { - "id": "make_com" - }, - "ArxivReader": { - "id": "papers/arxiv", - "author": "thejessezhang" - }, - "PubmedReader": { - "id": "papers/pubmed", - "author": "thejessezhang" - }, - "MboxReader": { - "id": "file/mbox", - "author": "minosvasilias" - }, - "UnstructuredReader": { - "id": "file/unstructured", - "author": "thejessezhang", - "keywords": ["sec", "html", "eml", "10k", "10q", "unstructured.io"] - }, - "RAGWebReader": { - "id": "web/rag", - "author": "jasonwcfan", - "keywords": ["documentation"] - }, - "S3Reader": { - "id": "s3", - "author": "thejessezhang", - "keywords": ["aws s3", "bucket", "amazon web services"] - }, - "RemoteReader": { - "id": "remote", - "author": "thejessezhang", - "keywords": ["hosted", "url", "gutenberg"] - }, - "RemoteDepthReader": { - "id": "remote_depth", - "author": "simonMoisselin", - "keywords": ["hosted", "url", "multiple"] - }, - "DadJokesReader": { - "id": "dad_jokes", - "author": "sidu", - "keywords": ["jokes", "dad jokes"] - }, - "WordLiftLoader": { - "id": "wordlift", - "author": "msftwarelab", - "keywords": ["wordlift", "knowledge graph", "graphql", "structured data", "seo"] - }, - "WhatsappChatLoader": { - "id": "whatsapp", - "author": "batmanscode", - "keywords": ["whatsapp", "chat"] - }, - "BilibiliTranscriptReader": { - "id": "bilibili", - "author": "alexzhangji" - }, - "RedditReader": { - "id": "reddit", - "author": "vanessahlyan", - "keywords": ["reddit", "subreddit", "search", "comments"] - }, - "MemosReader": { - "id": 
"memos", - "author": "bubu", - "keywords": ["memos", "note"] - }, - "SpotifyReader": { - "id": "spotify", - "author": "ong", - "keywords": ["spotify", "music"] - }, - "GithubRepositoryReader": { - "id": "github_repo", - "author": "ahmetkca", - "keywords": [ - "github", - "repository", - "git", - "code", - "source code", - "placeholder" - ], - "extra_files": ["github_client.py", "utils.py", "__init__.py"] - }, - "RDFReader": { - "id": "file/rdf", - "author": "mommi84", - "keywords": ["rdf", "n-triples", "graph", "knowledge graph"] - }, - "ReadwiseReader": { - "id": "readwise", - "author": "alexbowe", - "keywords": ["readwise", "highlights", "reading", "pkm"] - }, - "PandasExcelReader": { - "id": "file/pandas_excel", - "author": "maccarini" - }, - "ZendeskReader": { - "id": "zendesk", - "author": "bbornsztein", - "keywords": ["zendesk", "knowledge base", "help center"] - }, - "IntercomReader": { - "id": "intercom", - "author": "bbornsztein", - "keywords": ["intercom", "knowledge base", "help center"] - }, - "WordpressReader": { - "id": "wordpress", - "author": "bbornsztein", - "keywords": ["wordpress", "blog"] - }, - "GmailReader": { - "id": "gmail", - "author": "bbornsztein", - "keywords": ["gmail", "email"] - }, - "SteamshipFileReader": { - "id": "steamship", - "author": "douglas-reid", - "keywords": ["steamship"] - }, - "GPTRepoReader": { - "id": "gpt_repo", - "author": "mpoon" - }, - "AirtableReader": { - "id": "airtable", - "author": "smyja" - }, - "HatenaBlogReader": { - "id": "hatena_blog", - "author": "Shoya SHIRAKI", - "keywords": ["hatena", "blog"] - }, - "OpendalReader": { - "id": "opendal_reader", - "author": "OpenDAL Contributors", - "keywords": ["storage"] - }, - "OpendalS3Reader": { - "id": "opendal_reader/s3", - "author": "OpenDAL Contributors", - "keywords": ["storage", "s3"] - }, - "OpendalAzblobReader": { - "id": "opendal_reader/azblob", - "author": "OpenDAL Contributors", - "keywords": ["storage", "azblob"] - }, - "OpendalGcsReader": { - "id": "opendal_reader/gcs", - "author": "OpenDAL Contributors", - "keywords": ["storage", "gcs"] - }, - "ConfluenceReader": { - "id": "confluence", - "author": "zywilliamli" - }, - "ChatGPTRetrievalPluginReader": { - "id": "chatgpt_plugin", - "author": "jerryjliu" - }, - "JiraReader": { - "id": "jira", - "author": "bearguy", - "keywords": ["jira"] - }, - "UnstructuredURLLoader": { - "id": "web/unstructured_web", - "author": "kravetsmic", - "keywords": ["unstructured.io", "url"] - }, - "GoogleSheetsReader": { - "id": "google_sheets", - "author": "piroz" - }, - "FeedlyRssReader": { - "id": "feedly_rss", - "author": "kychanbp", - "keywords": ["feedly", "rss"] - }, - "FlatPdfReader": { - "id": "file/flat_pdf", - "author": "emmanuel-oliveira", - "keywords": ["pdf", "flat", "flattened"] - }, - "PDFMinerReader": { - "id": "file/pdf_miner", - "author": "thunderbug1", - "keywords": ["pdf"] - }, - "MilvusReader": { - "id": "milvus", - "author": "filip-halt" - }, - "StackoverflowReader": { - "id": "stackoverflow", - "author": "allen-munsch", - "keywords": ["posts", "questions", "answers"] - }, - "ZulipReader": { - "id": "zulip", - "author": "plurigrid" - }, - "OutlookLocalCalendarReader": { - "id": "outlook_localcalendar", - "author": "tevslin", - "keywords": ["calendar", "outlook"] - }, - "ApifyActor": { - "id": "apify/actor", - "author": "drobnikj", - "keywords": ["apify", "scraper", "scraping", "crawler"] - }, - "ApifyDataset": { - "id": "apify/dataset", - "author": "drobnikj", - "keywords": ["apify", "scraper", "scraping", "crawler"] - }, - 
"TrelloReader": { - "id": "trello", - "author": "bluzir", - "keywords": ["trello"] - }, - "DeepLakeReader": { - "id": "deeplake", - "author": "adolkhan", - "keywords": ["deeplake"] - }, - "ImageCaptionReader": { - "id": "file/image_blip", - "author": "FarisHijazi", - "keywords": ["image"] - }, - "ImageVisionLLMReader": { - "id": "file/image_blip2", - "author": "FarisHijazi", - "keywords": ["image"] - }, - "ImageTabularChartReader": { - "id": "file/image_deplot", - "author": "jon-chuang", - "keywords": ["image", "chart", "tabular", "figure"] - }, - "IPYNBReader": { - "id": "file/ipynb", - "author": "FarisHijazi", - "keywords": ["jupyter", "notebook", "ipynb"] - }, - "HuggingFaceFSReader": { - "id": "huggingface/fs", - "author": "jerryjliu", - "keywords": ["hugging", "face", "huggingface", "filesystem", "fs"] - }, - "DeepDoctectionReader": { - "id": "file/deepdoctection", - "author": "jerryjliu", - "keywords": ["doctection", "doc"] - }, - "PandasAIReader": { - "id": "pandas_ai", - "author": "jerryjliu", - "keywords": ["pandas", "ai"] - }, - "MetalReader": { - "id": "metal", - "author": "getmetal", - "keywords": ["metal", "retriever", "storage"] - }, - "BoardDocsReader": { - "id": "boarddocs", - "author": "dweekly", - "keywords": [ - "board", - "boarddocs" - ] - }, - "PyMuPDFReader": { - "id": "file/pymu_pdf", - "author": "iamarunbrahma", - "keywords": ["pymupdf", "pdf"] - }, - "MondayReader": { - "id": "mondaydotcom", - "author": "nadavgr", - "keywords": ["monday", "mondaydotcom"] - }, - "MangoppsGuidesReader": { - "id": "mangoapps_guides", - "author": "mangoapps", - "keywords": [ - "mangoapps" - ] - }, - "DocugamiReader": { - "id": "docugami", - "author": "tjaffri", - "keywords": [ - "docugami", - "docx", - "doc", - "pdf", - "xml" - ] - }, - "WeatherReader": { - "id": "weather", - "author": "iamadhee", - "keywords": ["weather","openweather"] - }, - "OpenMap": { - "id": "maps", - "author": "carrotpy", - "keywords": ["open maps","maps","open street maps","overpass api","geo"] - }, - "KalturaESearchReader": { - "id": "kaltura/esearch", - "author": "kaltura", - "keywords": [ - "kaltura", - "video", - "media", - "image", - "audio", - "search", - "library", - "portal", - "events" - ] - }, - "FirestoreReader": { - "id": "firestore", - "author": "rayzhudev", - "keywords": ["firestore", "datastore"] - }, - "KibelaReader": { - "id": "kibela", - "author": "higebu" - }, - "GitHubRepositoryIssuesReader": { - "id": "github_repo_issues", - "author": "moncho", - "keywords": [ - "github", - "repository", - "issues" - ], - "extra_files": ["github_client.py", "__init__.py"] - }, - "FirebaseRealtimeDatabaseReader": { - "id": "firebase_realtimedb", - "author": "ajay", - "keywords": [ - "firebase", - "realtimedb", - "database" - ] - }, - "FeishuDocsReader": { - "id": "feishu_docs", - "author": "ma-chengcheng" - }, - "GoogleKeepReader": { - "id": "google_keep", - "author": "pycui", - "keywords": [ - "google keep", - "google notes" - ] - }, - "SingleStoreReader": { - "id": "singlestore", - "author": "singlestore", - "keywords": [ - "singlestore", - "memsql" - ] - } - -} diff --git a/nextpy/ai/rag/document_loaders/make_com/README.md b/nextpy/ai/rag/document_loaders/make_com/README.md deleted file mode 100644 index 09bfad54..00000000 --- a/nextpy/ai/rag/document_loaders/make_com/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Make Loader - -The Make Loader offers a webhook wrapper that can take in a query response as an input. -**NOTE**: The Make Loader does not offer the ability to load in Documents. 
Currently, -it is designed so that you can plug in LlamaIndex Response objects into downstream Make workflows. - -## Usage - -Here's an example usage of the `MakeWrapper`. - -```python -from nextpy.ai import download_loader -import os - -MakeWrapper = download_loader('MakeWrapper') - -# load index from disk -index = GPTVectorDBIndex.load_from_disk('../vector_indices/index_simple.json') - -# query index -query_str = "What did the author do growing up?" -response = index.query(query_str) - -# Send response to Make.com webhook -wrapper = MakeWrapper() -wrapper.pass_response_to_webhook( - ", - response, - query_str -) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/make_com/__init__.py b/nextpy/ai/rag/document_loaders/make_com/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/make_com/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/make_com/base.py b/nextpy/ai/rag/document_loaders/make_com/base.py deleted file mode 100644 index 5a5d8f6f..00000000 --- a/nextpy/ai/rag/document_loaders/make_com/base.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Make.com API wrapper. - -Currently cannot load documents. - -""" - -from typing import Any, List, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.response.schema import Response -from nextpy.ai.schema import DocumentNode, NodeWithScore, TextNode - - -class MakeWrapper(BaseReader): - """Make reader.""" - - def load_data(self, *args: Any, **load_kwargs: Any) -> List[DocumentNode]: - """Load data from the input directory. - - NOTE: This is not implemented. - - """ - raise NotImplementedError("Cannot load documents from Make.com API.") - - def pass_response_to_webhook( - self, webhook_url: str, response: Response, query: Optional[str] = None - ) -> None: - """Pass response object to webhook. - - Args: - webhook_url (str): Webhook URL. - response (Response): Response object. - query (Optional[str]): Query. Defaults to None. 
- - """ - response_text = response.response - source_nodes = [n.to_dict() for n in response.source_nodes] - json_dict = { - "response": response_text, - "source_nodes": source_nodes, - "query": query, - } - r = requests.post(webhook_url, json=json_dict) - r.raise_for_status() - - -if __name__ == "__main__": - wrapper = MakeWrapper() - test_response = Response( - response="test response", - source_nodes=[NodeWithScore(node=TextNode(text="test source", id_="test id"))], - ) - wrapper.pass_response_to_webhook( - "https://hook.us1.make.com/asdfadsfasdfasdfd", - test_response, - "Test query", - ) diff --git a/nextpy/ai/rag/document_loaders/mangoapps_guides/README.md b/nextpy/ai/rag/document_loaders/mangoapps_guides/README.md deleted file mode 100644 index e26c2f95..00000000 --- a/nextpy/ai/rag/document_loaders/mangoapps_guides/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# MangoppsGuides Loader - -This loader fetches the text from Mangopps Guides. - -## Usage - -To use this loader, you need to pass base url of the MangoppsGuides installation (e.g. `https://guides.mangoapps.com/`) and the limit , i.e. max number of links it should crawl - -```python -from nextpy.ai import download_loader - -MangoppsGuidesReader = download_loader("MangoppsGuidesReader") - -loader = MangoppsGuidesReader() -documents = loader.load_data( domain_url="https://guides.mangoapps.com", limit=1 ) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/mangoapps_guides/__init__.py b/nextpy/ai/rag/document_loaders/mangoapps_guides/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/mangoapps_guides/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/mangoapps_guides/base.py b/nextpy/ai/rag/document_loaders/mangoapps_guides/base.py deleted file mode 100644 index e3f42442..00000000 --- a/nextpy/ai/rag/document_loaders/mangoapps_guides/base.py +++ /dev/null @@ -1,150 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""MangoppsGuides reader.""" -import re -from typing import List -from urllib.parse import urlparse - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MangoppsGuidesReader(BaseReader): - """MangoppsGuides reader. Reads data from a MangoppsGuides workspace. - - Args: - domain_url (str): MangoppsGuides domain url - limir (int): depth to crawl - """ - - def __init__(self) -> None: - """Initialize MangoppsGuides reader.""" - - def load_data(self, domain_url: str, limit: int) -> List[DocumentNode]: - """Load data from the workspace. - - Returns: - List[DocumentNode]: List of documents. 
- """ - import requests - from bs4 import BeautifulSoup - - self.domain_url = domain_url - self.limit = limit - self.start_url = f"{self.domain_url}/home/" - - fetched_urls = self.crawl_urls()[: self.limit] - - results = [] - - guides_pages = {} - for url in fetched_urls: - try: - response = requests.get(url) - soup = BeautifulSoup(response.content, "html.parser") - - page_title = soup.find("title").text - - # Remove the div with aria-label="Table of contents" - table_of_contents_div = soup.find( - "div", {"aria-label": "Table of contents"} - ) - if table_of_contents_div: - table_of_contents_div.decompose() - - # Remove header and footer - header = soup.find("header") - if header: - header.decompose() - footer = soup.find("footer") - if footer: - footer.decompose() - - # Exclude links and their text content from the main content - for link in soup.find_all("a"): - link.decompose() - - # Remove empty elements from the main content - for element in soup.find_all(): - if element.get_text(strip=True) == "": - element.decompose() - - # Find the main element containing the desired content - main_element = soup.find( - "main" - ) # Replace "main" with the appropriate element tag or CSS class - - # Extract the text content from the main element - if main_element: - text_content = main_element.get_text("\n") - # Remove multiple consecutive newlines and keep only one newline - text_content = re.sub(r"\n+", "\n", text_content) - else: - text_content = "" - - page_text = text_content - - guides_page = {} - guides_page["title"] = page_title - guides_page["text"] = page_text - guides_pages[url] = guides_page - except Exception as e: - print(f"Failed for {url} => {e}") - - for k, v in guides_pages.items(): - metadata = {"url": k, "title": v["title"]} - results.append( - DocumentNode( - text=v["text"], - extra_info=metadata, - ) - ) - - return results - - def crawl_urls(self) -> List[str]: - """Crawls all the urls from given domain.""" - self.visited = [] - - fetched_urls = self.fetch_url(self.start_url) - fetched_urls = list(set(fetched_urls)) - - return fetched_urls - - def fetch_url(self, url): - """Fetch the urls from given domain.""" - import requests - from bs4 import BeautifulSoup - - response = requests.get(url) - soup = BeautifulSoup(response.content, "html.parser") - - self.visited.append(url) - - newurls = [] - for link in soup.find_all("a"): - href: str = link.get("href") - if href and urlparse(href).netloc == self.domain_url: - newurls.append(href) - elif href and href.startswith("/"): - newurls.append(f"{self.domain_url}{href}") - - for newurl in newurls: - if ( - newurl not in self.visited - and not newurl.startswith("#") - and f"https://{urlparse(newurl).netloc}" == self.domain_url - and len(self.visited) <= self.limit - ): - newurls = newurls + self.fetch_url(newurl) - - newurls = list(set(newurls)) - return newurls - - -if __name__ == "__main__": - reader = MangoppsGuidesReader() - print("Initialized MangoppsGuidesReader") - output = reader.load_data(domain_url="https://guides.mangoapps.com", limit=5) - print(output) diff --git a/nextpy/ai/rag/document_loaders/mangoapps_guides/requirements.txt b/nextpy/ai/rag/document_loaders/mangoapps_guides/requirements.txt deleted file mode 100644 index 6ddd8a01..00000000 --- a/nextpy/ai/rag/document_loaders/mangoapps_guides/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -beautifulsoup4>=4.11.1 -requests>=2.28.1 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/maps/README.md b/nextpy/ai/rag/document_loaders/maps/README.md 
deleted file mode 100644 index 2c106c18..00000000 --- a/nextpy/ai/rag/document_loaders/maps/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# ***Osmmap Loader*** - -The Osmmap Loader will fetch map data from the [Overpass](https://wiki.openstreetmap.org/wiki/Main_Page) api for a certain place or area. Version **Overpass API 0.7.60** is used by this loader. - -The api will provide you with all the **nodes, relations, and ways** for the particular region when you request data for a region or location. -## **Functions of the loader** - -* To start, it first filters out those nodes that are already tagged, leaving just those nodes that are within 2 kilometres of the target location. The following keys are removed during filtering:["nodes," "geometry," "members"] from each node. The response we received is based on the tags and values we provided, so be sure to do that. The actions are covered below. - -## **Steps to find the suitable tag and values** - -1. Visit [Taginfo](taginfo.openstreetmap.org/tags). In essence, this website has all conceivable tags and values. -2. Perform a search for the feature you're looking for, for instance, "hospital" will return three results: "hospital" as an amenity, "hospital" as a structure, and "hospital" as a healthcare facility. -3. We may infer from the outcome that tag=amenity and value=hospital. -4. Leave the values parameter to their default value if you do not need to filter. - - - -## **Usage** - -The use case is here. - -Let's meet **Jayasree**, who is extracting map features from her neighbourhood using the OSM map loader. -She requires all the nodes, routes, and relations within a five-kilometer radius of her locale (Guduvanchery). - -* She must use the following arguments in order to accomplish the aforementioned. Localarea = "Guduvanchery" (the location she wants to seek), local_area_buffer = 5000 (5 km). - -### And the code snippet looks like - -```python -from nextpy.ai import download_loader - -MapReader = download_loader("OpenMap") - -loader = MapReader() -documents = loader.load_data(localarea='Guduvanchery',search_tag='',tag_only=True,local_area_buffer=5000,tag_values=['']) - -``` - -### Now she wants only the list hospitals around the location - -* so she search for hospital tag in the [Taginfo](https://taginfo.openstreetmap.org/tags) and she got - -```python -from nextpy.ai import download_loader - -MapReader = download_loader("OpenMap") - -loader = MapReader() -documents = loader.load_data(localarea='Guduvanchery',search_tag='amenity',tag_only=True,local_area_buffer=5000,tag_values=['hospital','clinic']) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/maps/__init__.py b/nextpy/ai/rag/document_loaders/maps/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/maps/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
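The `OpenMap` reader whose implementation follows reduces to a geocode-then-Overpass flow built on `geopy` and `osmxtract`. A minimal sketch of that flow, assuming both packages are installed and reusing the hospital example from the README above:

```python
from geopy.geocoders import Nominatim
from osmxtract import location, overpass

# Geocode the locality, then build a bounding box around it (2 km buffer).
geolocator = Nominatim(user_agent="osm-map-loader-example")  # user_agent is illustrative
place = geolocator.geocode("Guduvanchery")
bounds = location.from_buffer(place.latitude, place.longitude, buffer_size=2000)

# Ask Overpass for amenity=hospital/clinic elements inside those bounds.
query = overpass.ql_query(bounds, tag="amenity", values=["hospital", "clinic"], timeout=500)
response = overpass.request(query)
elements = response["elements"]  # raw OSM nodes/ways/relations
```

The deleted `load_data` below wraps exactly this sequence and then flattens the tagged elements into a single `DocumentNode`.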
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/maps/base.py b/nextpy/ai/rag/document_loaders/maps/base.py deleted file mode 100644 index ca3edf7b..00000000 --- a/nextpy/ai/rag/document_loaders/maps/base.py +++ /dev/null @@ -1,131 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple reader that reads OSMmap data from overpass API.""" - -import random -import string -import warnings -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -warnings.filterwarnings("ignore") - - -class OpenMap(BaseReader): - """OpenMap Reader. - - Get the map Features from the overpass api(osm) for the given location/area - - - Args: - localarea(str) - Area or location you are seaching for - tag_values(str) - filter for the give area - search_tag(str) - Tag that you are looking for - - if you not sure about the search_tag and tag_values visit https://taginfo.openstreetmap.org/tags - - remove_keys(list) - list of keys that need to be removed from the response - by default following keys will be removed ['nodes','geometry','members'] - - """ - - def __init__(self) -> None: - """Initialize with parameters.""" - super().__init__() - - @staticmethod - def _get_user() -> str: - # choose from all lowercase letter - letters = string.ascii_lowercase - result_str = "".join(random.choice(letters) for i in range(10)) - return result_str - - @staticmethod - def _get_latlon(locarea: str, user_agent: str) -> tuple: - try: - from geopy.geocoders import Nominatim - except: - raise ImportError("install geopy using `pip3 install geopy`") - - geolocator = Nominatim(user_agent=user_agent) - location = geolocator.geocode(locarea) - return (location.latitude, location.longitude) if location else (None, None) - - def load_data( - self, - localarea: str, - search_tag: Optional[str] = "amenity", - remove_keys: Optional[List] = ["nodes", "geometry", "members"], - tag_only: Optional[bool] = True, - tag_values: Optional[List] = [""], - local_area_buffer: Optional[int] = 2000, - ) -> List[DocumentNode]: - """This loader will bring you the all the node values from the open street maps for the given location. 
- - Args: - localarea(str) - Area or location you are seaching for - search_tag(str) - Tag that you are looking for - if you not sure about the search_tag and tag_values visit https://taginfo.openstreetmap.org/tags - - remove_keys(list) - list of keys that need to be removed from the response - by default it those keys will be removed ['nodes','geometry','members'] - - tag_only(bool) - if True it return the nodes which has tags if False returns all the nodes - tag_values(str) - filter for the give area - local_area_buffer(int) - range that you wish to cover (Default 2000(2km)) - """ - try: - from osmxtract import location, overpass - from osmxtract.errors import OverpassBadRequest - except: - raise ImportError("install osmxtract using `pip3 install osmxtract`") - - null_list = ["", "null", "none", None] - metadata = {} - local_area = localarea - - if local_area.lower().strip() in null_list: - raise Exception("The Area should not be null") - - user = self._get_user() - lat, lon = self._get_latlon(local_area, user) - try: - bounds = location.from_buffer(lat, lon, buffer_size=int(local_area_buffer)) - except TypeError: - raise TypeError("Please give valid location name or check for spelling") - - # overpass query generation and execution - tag_values = [str(i).lower().strip() for i in tag_values] - query = overpass.ql_query( - bounds, tag=search_tag.lower(), values=tag_values, timeout=500 - ) - - metadata["overpass_query"] = query - try: - response = overpass.request(query) - - except OverpassBadRequest: - raise TypeError( - f"Error while executing the Query {query} please check the Args" - ) - - res = response["elements"] - - _meta = response.copy() - del _meta["elements"] - metadata["overpass_meta"] = str(_meta) - metadata["lat"] = lat - metadata["lon"] = lon - metadata["localarea"] = localarea - # filtering for only the tag values - filtered = [i for i in res if "tags" in i] if tag_only else res - - for key in remove_keys: - [i.pop(key, None) for i in filtered] - if filtered: - return DocumentNode(text=str(filtered), extra_info=metadata) - else: - return DocumentNode(text=str(res), extra_info=metadata) diff --git a/nextpy/ai/rag/document_loaders/maps/requirements.txt b/nextpy/ai/rag/document_loaders/maps/requirements.txt deleted file mode 100644 index 721f6444..00000000 --- a/nextpy/ai/rag/document_loaders/maps/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -osmxtract -geopy \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/memos/README.md b/nextpy/ai/rag/document_loaders/memos/README.md deleted file mode 100644 index e9031cb4..00000000 --- a/nextpy/ai/rag/document_loaders/memos/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Memos Loader - -This loader fetchs text from self-hosted [memos](https://github.com/usememos/memos). - -## Usage - -To use this loader, you need to specify the host where memos is deployed. If you need to filter, pass the [corresponding parameter](https://github.com/usememos/memos/blob/4fe8476169ecd2fc4b164a25611aae6861e36812/api/memo.go#L76) in `load_data`. - -```python -from nextpy.ai import download_loader - -MemosReader = download_loader("MemosReader") -loader = MemosReader("https://demo.usememos.com/") -documents = loader.load_data({"creatorId": 101}) -``` - - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/memos/__init__.py b/nextpy/ai/rag/document_loaders/memos/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/memos/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/memos/base.py b/nextpy/ai/rag/document_loaders/memos/base.py deleted file mode 100644 index 211e6b96..00000000 --- a/nextpy/ai/rag/document_loaders/memos/base.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple Reader for Memos.""" - -from typing import Dict, List -from urllib.parse import urljoin - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MemosReader(BaseReader): - """Memos reader. - - Reads content from an Memos. - - """ - - def __init__(self, host: str = "https://demo.usememos.com/") -> None: - """Init params.""" - self._memoUrl = urljoin(host, "api/memo") - - def load_data(self, params: Dict = {}) -> List[DocumentNode]: - """Load data from RSS feeds. - - Args: - params (Dict): Filtering parameters. - - Returns: - List[DocumentNode]: List of documents. - - """ - import requests - - documents = [] - realUrl = self._memoUrl - - if not params: - realUrl = urljoin(self._memoUrl, "all", False) - - try: - req = requests.get(realUrl, params) - res = req.json() - except: - raise ValueError("Your Memo URL is not valid") - - if "data" not in res: - raise ValueError("Invalid Memo response") - - memos = res["data"] - for memo in memos: - content = memo["content"] - metadata = { - "memoUrl": self._memoUrl, - "creator": memo["creator"], - "resource_list": memo["resourceList"], - id: memo["id"], - } - documents.append(DocumentNode(text=content, extra_info=metadata)) - - return documents diff --git a/nextpy/ai/rag/document_loaders/metal/README.md b/nextpy/ai/rag/document_loaders/metal/README.md deleted file mode 100644 index 10277777..00000000 --- a/nextpy/ai/rag/document_loaders/metal/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Metal Loader -[Metal](https://getmetal.io) - - -The Metal Loader returns a set of texts corresponding to embeddings retrieved from a Metal Index. - -The user initializes the loader with a Metal index. They then pass in a text query. - -## Usage - -Here's an example usage of the MetalReader. - -```python -from nextpy.ai import download_loader -import os - - -MetalReader = download_loader('MetalReader') - -query_embedding = [n1, n2, n3, ...] 
# embedding of the search query - -reader = MetalReader( - api_key=api_key, - client_id=client_id, - index_id=index_id -) - -documents = reader.load_data( - top_k=3, - query_embedding=query_embedding, -) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/metal/__init__.py b/nextpy/ai/rag/document_loaders/metal/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/metal/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/metal/base.py b/nextpy/ai/rag/document_loaders/metal/base.py deleted file mode 100644 index 1c5bd76d..00000000 --- a/nextpy/ai/rag/document_loaders/metal/base.py +++ /dev/null @@ -1,80 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Metal Reader.""" -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MetalReader(BaseReader): - """Metal reader. - - Args: - api_key (str): Metal API key. - client_id (str): Metal client ID. - index_id (str): Metal index ID. - """ - - def __init__(self, api_key: str, client_id: str, index_id: str): - import_err_msg = ( - "`metal_sdk` package not found, please run `pip install metal_sdk`" - ) - try: - import metal_sdk # noqa: F401 - except ImportError: - raise ImportError(import_err_msg) - from metal_sdk.metal import Metal - - """Initialize with parameters.""" - self._api_key = api_key - self._client_id = client_id - self._index_id = index_id - self.metal_client = Metal(api_key, client_id, index_id) - - def load_data( - self, - limit: int, - query_embedding: Optional[List[float]] = None, - filters: Optional[Dict[str, Any]] = None, - separate_documents: bool = True, - **query_kwargs: Any - ) -> List[DocumentNode]: - """Load data from Metal. - - Args: - query_embedding (Optional[List[float]]): Query embedding for search. - limit (int): Number of results to return. - filters (Optional[Dict[str, Any]]): Filters to apply to the search. - separate_documents (Optional[bool]): Whether to return separate - documents per retrieved entry. Defaults to True. - **query_kwargs: Keyword arguments to pass to the search. - - Returns: - List[DocumentNode]: A list of documents. 
- """ - metadata = { - "limit": limit, - "query_embedding": query_embedding, - "filters": filters, - "separate_documents": separate_documents, - } - - payload = { - "embedding": query_embedding, - "filters": filters, - } - response = self.metal_client.search(payload, limit=limit, **query_kwargs) - - documents = [] - for item in response["data"]: - text = item["text"] or (item["metadata"] and item["metadata"]["text"]) - documents.append(DocumentNode(text=text, extra_info=metadata)) - - if not separate_documents: - text_list = [doc.get_text() for doc in documents] - text = "\n\n".join(text_list) - documents = [DocumentNode(text=text, extra_info=metadata)] - - return documents diff --git a/nextpy/ai/rag/document_loaders/metal/requirements.txt b/nextpy/ai/rag/document_loaders/metal/requirements.txt deleted file mode 100644 index 66b852b0..00000000 --- a/nextpy/ai/rag/document_loaders/metal/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -metal_sdk diff --git a/nextpy/ai/rag/document_loaders/milvus/README.md b/nextpy/ai/rag/document_loaders/milvus/README.md deleted file mode 100644 index fa643530..00000000 --- a/nextpy/ai/rag/document_loaders/milvus/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Milvus Loader - -The Milvus Loader returns a set of texts corresponding to embeddings retrieved from a Milvus collection. -The user initializes the loader with parameters like host/port. - -During query-time, the user passes in the collection name, query vector, and a few other parameters. - -## Usage - -Here's an example usage of the MilvusReader. - -```python -from nextpy.ai import download_loader -import os - -MilvusReader = download_loader("MilvusReader") - -reader = MilvusReader( - host="localhost", port=19530, user="", password="", use_secure=False -) -# the query_vector is an embedding representation of your query_vector -# Example query vector: -# query_vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] - -query_vector=[n1, n2, n3, ...] - -documents = reader.load_data( - query_vector=query_vector, - collection_name="demo", - limit=5 -) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/milvus/__init__.py b/nextpy/ai/rag/document_loaders/milvus/__init__.py deleted file mode 100644 index 1c233aca..00000000 --- a/nextpy/ai/rag/document_loaders/milvus/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init params.""" diff --git a/nextpy/ai/rag/document_loaders/milvus/base.py b/nextpy/ai/rag/document_loaders/milvus/base.py deleted file mode 100644 index c7851cfd..00000000 --- a/nextpy/ai/rag/document_loaders/milvus/base.py +++ /dev/null @@ -1,155 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Milvus reader.""" - -from typing import Any, Dict, List, Optional -from uuid import uuid4 - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MilvusReader(BaseReader): - """Milvus reader.""" - - def __init__( - self, - host: str = "localhost", - port: int = 19530, - user: str = "", - password: str = "", - use_secure: bool = False, - ): - """Initialize with parameters.""" - import_err_msg = ( - "`pymilvus` package not found, please run `pip install pymilvus`" - ) - try: - import pymilvus # noqa: F401 - except ImportError: - raise ImportError(import_err_msg) - - from pymilvus import MilvusException - - self.host = host - self.port = port - self.user = user - self.password = password - self.use_secure = use_secure - self.collection = None - - self.default_search_params = { - "IVF_FLAT": {"metric_type": "IP", "params": {"nprobe": 10}}, - "IVF_SQ8": {"metric_type": "IP", "params": {"nprobe": 10}}, - "IVF_PQ": {"metric_type": "IP", "params": {"nprobe": 10}}, - "HNSW": {"metric_type": "IP", "params": {"ef": 10}}, - "RHNSW_FLAT": {"metric_type": "IP", "params": {"ef": 10}}, - "RHNSW_SQ": {"metric_type": "IP", "params": {"ef": 10}}, - "RHNSW_PQ": {"metric_type": "IP", "params": {"ef": 10}}, - "IVF_HNSW": {"metric_type": "IP", "params": {"nprobe": 10, "ef": 10}}, - "ANNOY": {"metric_type": "IP", "params": {"search_k": 10}}, - "AUTOINDEX": {"metric_type": "IP", "params": {}}, - } - try: - self._create_connection_alias() - except MilvusException as e: - raise e - - def load_data( - self, - query_vector: List[float], - collection_name: str, - expr: Any = None, - search_params: Optional[dict] = None, - limit: int = 10, - ) -> List[DocumentNode]: - """Load data from Milvus. - - Args: - collection_name (str): Name of the Milvus collection. - query_vector (List[float]): Query vector. - limit (int): Number of results to return. - - Returns: - List[DocumentNode]: A list of documents. 
- """ - metadata = { - "host": self.host, - "query_vector": query_vector, - "collection_name": collection_name, - "expr": expr, - "search_params": search_params, - "limit": limit, - } - - from pymilvus import Collection, MilvusException - - try: - self.collection = Collection(collection_name, using=self.alias) - except MilvusException as e: - raise e - - assert self.collection is not None - try: - self.collection.load() - except MilvusException as e: - raise e - if search_params is None: - search_params = self._create_search_params() - - res = self.collection.search( - [query_vector], - "embedding", - param=search_params, - expr=expr, - output_fields=["doc_id", "text"], - limit=limit, - ) - - documents = [] - # TODO: In future append embedding when more efficient - for hit in res[0]: - doc = DocumentNode( - doc_id=hit.entity.get("doc_id"), - text=hit.entity.get("text"), - extra_info=metadata, - ) - - documents.append(doc) - - return documents - - def _create_connection_alias(self) -> None: - from pymilvus import connections - - self.alias = None - # Attempt to reuse an open connection - for x in connections.list_connections(): - addr = connections.get_connection_addr(x[0]) - if ( - x[1] - and ("address" in addr) - and (addr["address"] == "{}:{}".format(self.host, self.port)) - ): - self.alias = x[0] - break - - # Connect to the Milvus instance using the passed in Environment variables - if self.alias is None: - self.alias = uuid4().hex - connections.connect( - alias=self.alias, - host=self.host, - port=self.port, - user=self.user, # type: ignore - password=self.password, # type: ignore - secure=self.use_secure, - ) - - def _create_search_params(self) -> Dict[str, Any]: - assert self.collection is not None - index = self.collection.indexes[0]._index_params - search_params = self.default_search_params[index["index_type"]] - search_params["metric_type"] = index["metric_type"] - return search_params diff --git a/nextpy/ai/rag/document_loaders/milvus/requirements.txt b/nextpy/ai/rag/document_loaders/milvus/requirements.txt deleted file mode 100644 index de2c40e3..00000000 --- a/nextpy/ai/rag/document_loaders/milvus/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pymilvus \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/mondaydotcom/README.md b/nextpy/ai/rag/document_loaders/mondaydotcom/README.md deleted file mode 100644 index ed94a5fd..00000000 --- a/nextpy/ai/rag/document_loaders/mondaydotcom/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Monday Loader - -This loader loads data from monday.com. The user specifies an API token to initialize the MondayReader. They then specify a monday.com board id to load in the corresponding DocumentNode objects. - -## Usage - -Here's an example usage of the MondayReader. - -```python -from nextpy.ai import download_loader - -MondayReader = download_loader('MondayReader') - -reader = MondayReader("") -documents = reader.load_data("") - -``` - -Check out monday.com API docs - [here](https://developer.monday.com/apps/docs/mondayapi) - - diff --git a/nextpy/ai/rag/document_loaders/mondaydotcom/__init__.py b/nextpy/ai/rag/document_loaders/mondaydotcom/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/mondaydotcom/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/mondaydotcom/base.py b/nextpy/ai/rag/document_loaders/mondaydotcom/base.py deleted file mode 100644 index 5a110504..00000000 --- a/nextpy/ai/rag/document_loaders/mondaydotcom/base.py +++ /dev/null @@ -1,96 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""monday.com reader.""" -from typing import Dict, List - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class MondayReader(BaseReader): - """monday.com reader. Reads board's data by a GraphQL query. - - Args: - api_key (str): monday.com API key. - """ - - def __init__(self, api_key: str) -> None: - """Initialize monday.com reader.""" - self.api_key = api_key - self.api_url = "https://api.monday.com/v2" - - def _parse_item_values(self, cv) -> Dict[str, str]: - data = {} - data["title"] = cv["title"] - data["value"] = cv["text"] - - return data - - def _parse_data(self, item) -> Dict[str, str]: - data = {} - data["id"] = item["id"] - data["name"] = item["name"] - data["values"] = list(map(self._parse_item_values, list(item["column_values"]))) - - return data - - def _perform_request(self, board_id) -> Dict[str, str]: - headers = {"Authorization": self.api_key} - query = """ - query{ - boards(ids: [%d]){ - name, - items{ - id, - name, - column_values{ - title, - text - } - } - } - } """ % ( - board_id - ) - data = {"query": query} - - response = requests.post(url=self.api_url, json=data, headers=headers) - return response.json() - - def load_data(self, board_id: int) -> List[DocumentNode]: - """Load board data by board_id. - - Args: - board_id (int): monday.com board id. - - Returns: - List[DocumentNode]: List of items as documents. - [{id, name, values: [{title, value}]}] - """ - json_response = self._perform_request(board_id) - board_data = json_response["data"]["boards"][0] - - board_data["name"] - items_array = list(board_data["items"]) - parsed_items = list(map(self._parse_data, list(items_array))) - result = [] - for item in parsed_items: - text = f"name: {item['name']}" - for item_value in item["values"]: - if item_value["value"]: - text += f", {item_value['title']}: {item_value['value']}" - result.append( - DocumentNode( - text=text, extra_info={"board_id": board_id, "item_id": item["id"]} - ) - ) - - return result - - -if __name__ == "__main__": - reader = MondayReader("api_key") - print(reader.load_data(12345)) diff --git a/nextpy/ai/rag/document_loaders/mondaydotcom/requirements.txt b/nextpy/ai/rag/document_loaders/mondaydotcom/requirements.txt deleted file mode 100644 index 663bd1f6..00000000 --- a/nextpy/ai/rag/document_loaders/mondaydotcom/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -requests \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/mongo/README.md b/nextpy/ai/rag/document_loaders/mongo/README.md deleted file mode 100644 index b4539658..00000000 --- a/nextpy/ai/rag/document_loaders/mongo/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Mongo Loader - -This loader loads documents from MongoDB. The user specifies a Mongo instance to -initialize the reader. 
They then specify the collection name and query params to -fetch the relevant docs. - -## Usage - -Here's an example usage of the SimpleMongoReader. - -```python -from nextpy.ai import download_loader -import os - -SimpleMongoReader = download_loader('SimpleMongoReader') - -host = "" -port = "" -db_name = "" -collection_name = "" -# query_dict is passed into db.collection.find() -query_dict = {} -reader = SimpleMongoReader(host, port) -documents = reader.load_data(db_name, collection_name, query_dict=query_dict) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/mongo/__init__.py b/nextpy/ai/rag/document_loaders/mongo/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/mongo/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/mongo/base.py b/nextpy/ai/rag/document_loaders/mongo/base.py deleted file mode 100644 index 12bd6207..00000000 --- a/nextpy/ai/rag/document_loaders/mongo/base.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Mongo client.""" - -from typing import Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SimpleMongoReader(BaseReader): - """Simple mongo reader. - - Concatenates each Mongo doc into DocumentNode used by LlamaIndex. - - Args: - host (str): Mongo host. - port (int): Mongo port. - max_docs (int): Maximum number of documents to load. - - """ - - def __init__( - self, - host: Optional[str] = None, - port: Optional[int] = None, - uri: Optional[str] = None, - max_docs: int = 1000, - ) -> None: - """Initialize with parameters.""" - self.host = host - self.port = port - self.uri = uri - try: - import pymongo # noqa: F401 - from pymongo import MongoClient # noqa: F401 - except ImportError: - raise ImportError( - "`pymongo` package not found, please run `pip install pymongo`" - ) - if uri: - if uri is None: - raise ValueError("Either `host` and `port` or `uri` must be provided.") - self.client: MongoClient = MongoClient(uri) - else: - if host is None or port is None: - raise ValueError("Either `host` and `port` or `uri` must be provided.") - self.client = MongoClient(host, port) - self.max_docs = max_docs - - def load_data( - self, db_name: str, collection_name: str, query_dict: Optional[Dict] = None - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - db_name (str): name of the database. - collection_name (str): name of the collection. - query_dict (Optional[Dict]): query to filter documents. - Defaults to None - - Returns: - List[DocumentNode]: A list of documents. 
- - """ - metadata = { - "host": self.host, - "port": self.port, - "uri": self.uri, - "db_name": db_name, - "collection_name": collection_name, - "query_dict": query_dict, - } - documents = [] - db = self.client[db_name] - if query_dict is None: - cursor = db[collection_name].find() - else: - cursor = db[collection_name].find(query_dict) - - for item in cursor: - if "text" not in item: - raise ValueError("`text` field not found in Mongo DocumentNode.") - documents.append(DocumentNode(text=item["text"], extra_info=metadata)) - return documents diff --git a/nextpy/ai/rag/document_loaders/mongo/requirements.txt b/nextpy/ai/rag/document_loaders/mongo/requirements.txt deleted file mode 100644 index 8c7d698b..00000000 --- a/nextpy/ai/rag/document_loaders/mongo/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pymongo \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/notion/README.md b/nextpy/ai/rag/document_loaders/notion/README.md deleted file mode 100644 index 714ea0b1..00000000 --- a/nextpy/ai/rag/document_loaders/notion/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Notion Loader - -This loader loads documents from Notion. The user specifies an API token to initialize -the NotionPageReader. They then specify a set of `page_ids` or `database_id` to load in -the corresponding DocumentNode objects. - -## Usage - -Here's an example usage of the NotionPageReader. - -```python -from nextpy.ai import download_loader -import os - -NotionPageReader = download_loader('NotionPageReader') - -integration_token = os.getenv("NOTION_INTEGRATION_TOKEN") -page_ids = [""] -reader = NotionPageReader(integration_token=integration_token) -documents = reader.load_data(page_ids=page_ids) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/notion/__init__.py b/nextpy/ai/rag/document_loaders/notion/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/notion/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/notion/base.py b/nextpy/ai/rag/document_loaders/notion/base.py deleted file mode 100644 index 89d05867..00000000 --- a/nextpy/ai/rag/document_loaders/notion/base.py +++ /dev/null @@ -1,193 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Notion reader.""" -import os -from typing import Any, Dict, List, Optional - -import requests # type: ignore - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -INTEGRATION_TOKEN_NAME = "NOTION_INTEGRATION_TOKEN" -BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children" -DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query" -SEARCH_URL = "https://api.notion.com/v1/search" - - -# TODO: Notion DB reader coming soon! -class NotionPageReader(BaseReader): - """Notion Page reader. - - Reads a set of Notion pages. - - Args: - integration_token (str): Notion integration token. - - """ - - def __init__(self, integration_token: Optional[str] = None) -> None: - """Initialize with parameters.""" - if integration_token is None: - integration_token = os.getenv(INTEGRATION_TOKEN_NAME) - if integration_token is None: - raise ValueError( - "Must specify `integration_token` or set environment " - "variable `NOTION_INTEGRATION_TOKEN`." - ) - self.token = integration_token - self.headers = { - "Authorization": "Bearer " + self.token, - "Content-Type": "application/json", - "Notion-Version": "2022-06-28", - } - - def _read_block(self, block_id: str, num_tabs: int = 0) -> str: - """Read a block.""" - done = False - result_lines_arr = [] - cur_block_id = block_id - while not done: - block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id) - query_dict: Dict[str, Any] = {} - - res = requests.request( - "GET", block_url, headers=self.headers, json=query_dict - ) - data = res.json() - - for result in data["results"]: - result_type = result["type"] - result_obj = result[result_type] - - cur_result_text_arr = [] - if "rich_text" in result_obj: - for rich_text in result_obj["rich_text"]: - # skip if doesn't have text object - if "text" in rich_text: - text = rich_text["text"]["content"] - prefix = "\t" * num_tabs - cur_result_text_arr.append(prefix + text) - - result_block_id = result["id"] - has_children = result["has_children"] - if has_children: - children_text = self._read_block( - result_block_id, num_tabs=num_tabs + 1 - ) - cur_result_text_arr.append(children_text) - - cur_result_text = "\n".join(cur_result_text_arr) - result_lines_arr.append(cur_result_text) - - if data["next_cursor"] is None: - done = True - break - else: - cur_block_id = data["next_cursor"] - - result_lines = "\n".join(result_lines_arr) - return result_lines - - def read_page(self, page_id: str) -> str: - """Read a page.""" - return self._read_block(page_id) - - def query_database( - self, database_id: str, query_dict: Dict[str, Any] = {"page_size": 100} - ) -> List[str]: - """Get all the pages from a Notion database.""" - pages = [] - - res = requests.post( - DATABASE_URL_TMPL.format(database_id=database_id), - headers=self.headers, - json=query_dict, - ) - res.raise_for_status() - data = res.json() - - pages.extend(data.get("results")) - - while data.get("has_more"): - query_dict["start_cursor"] = data.get("next_cursor") - res = requests.post( - DATABASE_URL_TMPL.format(database_id=database_id), - headers=self.headers, - json=query_dict, - ) - res.raise_for_status() - data = res.json() - pages.extend(data.get("results")) - - page_ids = [page["id"] for page in pages] - return page_ids - - def search(self, query: str) -> List[str]: - """Search Notion page given a text query.""" - done = False - next_cursor: Optional[str] = None - page_ids = [] - while not done: - query_dict = { - "query": query, - } - if next_cursor is not 
None: - query_dict["start_cursor"] = next_cursor - res = requests.post(SEARCH_URL, headers=self.headers, json=query_dict) - data = res.json() - for result in data["results"]: - page_id = result["id"] - page_ids.append(page_id) - - if data["next_cursor"] is None: - done = True - break - else: - next_cursor = data["next_cursor"] - return page_ids - - def load_data( - self, page_ids: List[str] = [], database_id: Optional[str] = None - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - page_ids (List[str]): List of page ids to load. - database_id (str): Database_id from which to load page ids. - - Returns: - List[DocumentNode]: List of documents. - - """ - if not page_ids and not database_id: - raise ValueError("Must specify either `page_ids` or `database_id`.") - docs = [] - if database_id is not None: - # get all the pages in the database - page_ids = self.query_database(database_id) - for page_id in page_ids: - page_text = self.read_page(page_id) - docs.append( - DocumentNode( - text=page_text, - extra_info={"page_id": page_id, "database_id": database_id}, - ) - ) - else: - for page_id in page_ids: - page_text = self.read_page(page_id) - docs.append( - DocumentNode( - text=page_text, - extra_info={"page_id": page_id, "database_id": database_id}, - ) - ) - - return docs - - -if __name__ == "__main__": - reader = NotionPageReader() - print(reader.search("What I")) diff --git a/nextpy/ai/rag/document_loaders/obsidian/README.md b/nextpy/ai/rag/document_loaders/obsidian/README.md deleted file mode 100644 index 0575a6d6..00000000 --- a/nextpy/ai/rag/document_loaders/obsidian/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Obsidian (Markdown) Loader - -This loader loads documents from a markdown directory (for instance, an Obsidian vault). - -## Usage - -Here's an example usage of the ObsidianReader. - -```python -from nextpy.ai import download_loader -import os - -ObsidianReader = download_loader('ObsidianReader') -documents = ObsidianReader('/path/to/dir').load_data() # Returns list of documents -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/obsidian/__init__.py b/nextpy/ai/rag/document_loaders/obsidian/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/obsidian/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/obsidian/base.py b/nextpy/ai/rag/document_loaders/obsidian/base.py deleted file mode 100644 index 3619a61a..00000000 --- a/nextpy/ai/rag/document_loaders/obsidian/base.py +++ /dev/null @@ -1,55 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Obsidian reader class. 
- -Pass in the path to an Obsidian vault and it will parse all markdown -files into a List of Documents, -with each DocumentNode containing text from under an Obsidian header. - -""" -import os -from pathlib import Path -from typing import Any, List - -from langchain.docstore.DocumentNode import DocumentNode as LCDocument - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.readers.file.markdown_reader import MarkdownReader -from nextpy.ai.schema import DocumentNode - - -class ObsidianReader(BaseReader): - """Utilities for loading data from an Obsidian Vault. - - Args: - input_dir (str): Path to the vault. - - """ - - def __init__(self, input_dir: str): - """Init params.""" - self.input_dir = Path(input_dir) - - def load_data(self, *args: Any, **load_kwargs: Any) -> List[DocumentNode]: - """Load data from the input directory.""" - docs: List[DocumentNode] = [] - for (dirpath, dirnames, filenames) in os.walk(self.input_dir): - dirnames[:] = [d for d in dirnames if not d.startswith(".")] - for filename in filenames: - if filename.endswith(".md"): - filepath = os.path.join(dirpath, filename) - content = MarkdownReader().load_data(Path(filepath)) - - metadata = {"input_dir": self.input_dir} - - for doc in content: - doc.extra_info = metadata - - docs.extend(content) - return docs - - def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: - """Load data in LangChain DocumentNode format.""" - docs = self.load_data(**load_kwargs) - return [d.to_langchain_format() for d in docs] diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/README.md b/nextpy/ai/rag/document_loaders/opendal_reader/README.md deleted file mode 100644 index 90358d02..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# OpenDAL Loader - -This loader parses any file via [OpenDAL](https://github.com/apache/incubator-opendal). - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -## Usage - -`OpendalReader` can read data from any supported storage services including `s3`, `azblob`, `gcs` and so on. - -```python -from nextpy.ai import download_loader - -OpendalReader = download_loader("OpendalReader") - -loader = OpendalReader( - scheme="s3", - bucket='bucket', - path='path/to/data/', -) -documents = loader.load_data() -``` - -We also provide `Opendal[S3|Gcs|Azblob]Reader` for convenience. - ---- - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/__init__.py b/nextpy/ai/rag/document_loaders/opendal_reader/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
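The OpenDAL README above notes that fetched files are parsed with `SimpleDirectoryReader` and that a custom `file_extractor` may be supplied. A sketch of that option, assuming the extension-to-reader mapping is honored as the docstrings describe and reusing the `MarkdownReader` imported by the Obsidian reader earlier; the bucket and path are illustrative:

```python
from nextpy.ai import download_loader
from nextpy.ai.readers.file.markdown_reader import MarkdownReader

OpendalReader = download_loader("OpendalReader")

# Route .md files through MarkdownReader; other extensions fall back to the defaults.
loader = OpendalReader(
    scheme="s3",
    bucket="my-bucket",   # illustrative bucket name
    path="docs/",         # trailing "/" means: iterate the whole prefix
    file_extractor={".md": MarkdownReader()},
)
documents = loader.load_data()
```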
- diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/README.md b/nextpy/ai/rag/document_loaders/opendal_reader/azblob/README.md deleted file mode 100644 index 2dda71c7..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Azblob Loader - -This loader parses any file stored on Azblob. - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -> Azblob loader is based on `OpendalReader`. - -## Usage - -```python -from nextpy.ai import download_loader - -OpendalAzblobReader = download_loader("OpendalAzblobReader") - -loader = OpendalAzblobReader( - container='container', - path='path/to/data/', - endpoint='[endpoint]', - account_name='[account_name]', - account_key='[account_key]', -) -documents = loader.load_data() -``` - ---- - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/__init__.py b/nextpy/ai/rag/document_loaders/opendal_reader/azblob/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/base.py b/nextpy/ai/rag/document_loaders/opendal_reader/azblob/base.py deleted file mode 100644 index 08449ad4..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/base.py +++ /dev/null @@ -1,74 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Azblob file and directory reader. - -A loader that fetches a file or iterates through a directory on Azblob or. - -""" - -from typing import Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class OpendalAzblobReader(BaseReader): - """General reader for any Azblob file or directory.""" - - def __init__( - self, - container: str, - path: str = "/", - endpoint: str = "", - account_name: str = "", - account_key: str = "", - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - ) -> None: - """Initialize Azblob container, along with credentials if needed. - - If key is not set, the entire bucket (filtered by prefix) is parsed. - - Args: - container (str): the name of your azblob bucket - path (str): the path of the data. If none is provided, - this loader will iterate through the entire bucket. If path is endswith `/`, this loader will iterate through the entire dir. Otherwise, this loeader will load the file. 
- endpoint Optional[str]: the endpoint of the azblob service. - account_name (Optional[str]): provide azblob access key directly. - account_key (Optional[str]): provide azblob access key directly. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - - """ - super().__init__() - - self.path = path - self.file_extractor = file_extractor - - # opendal service related config. - self.options = { - "container": container, - "endpoint": endpoint, - "account_name": account_name, - "account_key": account_key, - } - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from OpenDAL.""" - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - OpendalReader = import_loader("OpendalReader") - except ImportError: - OpendalReader = download_loader("OpendalReader") - - loader = OpendalReader( - scheme="azblob", - path=self.path, - file_extractor=self.file_extractor, - **self.options, - ) - - return loader.load_data() diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/requirements.txt b/nextpy/ai/rag/document_loaders/opendal_reader/azblob/requirements.txt deleted file mode 100644 index e55fe80f..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/azblob/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opendal==0.30.3 diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/base.py b/nextpy/ai/rag/document_loaders/opendal_reader/base.py deleted file mode 100644 index 6969b1ff..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/base.py +++ /dev/null @@ -1,90 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Opendal file and directory reader. - -A loader that fetches a file or iterates through a directory on AWS S3 or other compatible service. - -""" -import asyncio -import tempfile -from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class OpendalReader(BaseReader): - """General reader for any opendal operator.""" - - def __init__( - self, - scheme: str, - path: str = "/", - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - **kwargs, - ) -> None: - """Initialize opendal operator, along with credentials if needed. - - - Args: - scheme (str): the scheme of the service - path (str): the path of the data. If none is provided, - this loader will iterate through the entire bucket. If path is endswith `/`, this loader will iterate through the entire dir. Otherwise, this loeader will load the file. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. 
- """ - import opendal - - super().__init__() - - self.path = path - self.file_extractor = file_extractor - - self.op = opendal.AsyncOperator(scheme, **kwargs) - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from OpenDAL.""" - with tempfile.TemporaryDirectory() as temp_dir: - if not self.path.endswith("/"): - asyncio.run(download_file_from_opendal(self.op, temp_dir, self.path)) - else: - asyncio.run(download_dir_from_opendal(self.op, temp_dir, self.path)) - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - SimpleDirectoryReader = import_loader("SimpleDirectoryReader") - except ImportError: - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - loader = SimpleDirectoryReader(temp_dir, file_extractor=self.file_extractor) - - return loader.load_data() - - -async def download_file_from_opendal(op: Any, temp_dir: str, path: str) -> str: - """Download file from OpenDAL.""" - import opendal - - op = cast(opendal.AsyncOperator, op) - - suffix = Path(path).suffix - filepath = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" - - async with op.open_reader(path) as r: - with open(filepath, "wb") as w: - w.write(await r.read()) - - return filepath - - -async def download_dir_from_opendal(op: Any, temp_dir: str, dir: str) -> str: - """Download directory from opendal.""" - import opendal - - op = cast(opendal.AsyncOperator, op) - async for obj in await op.scan(dir): - await download_file_from_opendal(op, temp_dir, obj.path) diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/README.md b/nextpy/ai/rag/document_loaders/opendal_reader/gcs/README.md deleted file mode 100644 index 9e175171..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Gcs Loader - -This loader parses any file stored on Gcs. - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -> Gcs loader is based on `OpendalReader`. - -## Usage - -```python -from nextpy.ai import download_loader - -OpendalGcsReader = download_loader("OpendalGcsReader") - -loader = OpendalGcsReader( - bucket='bucket', - path='path/to/data/', - endpoint='[endpoint]', - credentials='[credentials]', -) -documents = loader.load_data() -``` - -Note: if `credentials` is not provided, this loader to try to load from env. - ---- - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/__init__.py b/nextpy/ai/rag/document_loaders/opendal_reader/gcs/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/base.py b/nextpy/ai/rag/document_loaders/opendal_reader/gcs/base.py deleted file mode 100644 index 74e08a24..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/base.py +++ /dev/null @@ -1,70 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Gcs file and directory reader. - -A loader that fetches a file or iterates through a directory on Gcs. - -""" - -from typing import Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class OpendalGcsReader(BaseReader): - """General reader for any Gcs file or directory.""" - - def __init__( - self, - bucket: str, - path: str = "/", - endpoint: str = "", - credentials: str = "", - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - ) -> None: - """Initialize Gcs container, along with credentials if needed. - - If key is not set, the entire bucket (filtered by prefix) is parsed. - - Args: - bucket (str): the name of your gcs bucket - path (str): the path of the data. If none is provided, - this loader will iterate through the entire bucket. If path is endswith `/`, this loader will iterate through the entire dir. Otherwise, this loeader will load the file. - endpoint Optional[str]: the endpoint of the azblob service. - credentials (Optional[str]): provide credential string for GCS OAuth2 directly. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - - """ - super().__init__() - - self.path = path - self.file_extractor = file_extractor - - # opendal service related config. 
- self.options = { - "bucket": bucket, - "endpoint": endpoint, - "credentials": credentials, - } - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from OpenDAL.""" - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - OpendalReader = import_loader("OpendalReader") - except ImportError: - OpendalReader = download_loader("OpendalReader") - loader = OpendalReader( - scheme="gcs", - path=self.path, - file_extractor=self.file_extractor, - **self.options, - ) - - return loader.load_data() diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/requirements.txt b/nextpy/ai/rag/document_loaders/opendal_reader/gcs/requirements.txt deleted file mode 100644 index e55fe80f..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/gcs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opendal==0.30.3 diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/requirements.txt b/nextpy/ai/rag/document_loaders/opendal_reader/requirements.txt deleted file mode 100644 index e55fe80f..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opendal==0.30.3 diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/s3/README.md b/nextpy/ai/rag/document_loaders/opendal_reader/s3/README.md deleted file mode 100644 index 427afcc9..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/s3/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# S3 Loader - -This loader parses any file stored on S3. When initializing `OpendalS3Reader`, you may pass in your [AWS Access Key](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). If none are found, the loader assumes they are stored in `~/.aws/credentials`. - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -> S3 loader is based on `OpendalReader`. - -## Usage - -```python -from nextpy.ai import download_loader - -OpendalS3Reader = download_loader("OpendalS3Reader") - -loader = OpendalS3Reader( - bucket='bucket', - path='path/to/data/', - access_key_id='[ACCESS_KEY_ID]', - secret_access_key='[ACCESS_KEY_SECRET]', -) -documents = loader.load_data() -``` - -Note: if `access_key_id` or `secret_access_key` is not provided, this loader will try to load them from the environment. - -Other possible arguments include: - -- `endpoint`: Specify the endpoint of the S3 service. -- `region`: Specify the region of the S3 service. - ---- - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/s3/__init__.py b/nextpy/ai/rag/document_loaders/opendal_reader/s3/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/s3/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes.
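The S3 README above lists `endpoint` and `region` as extra arguments without showing them in use. A sketch of pointing the loader at an S3-compatible service follows; the endpoint URL and region are placeholders, and the extra keyword arguments are forwarded to the underlying OpenDAL operator as in `OpendalReader.__init__`.

```python
from nextpy.ai import download_loader

OpendalS3Reader = download_loader("OpendalS3Reader")

loader = OpendalS3Reader(
    bucket="bucket",
    path="path/to/data/",
    access_key_id="[ACCESS_KEY_ID]",
    secret_access_key="[ACCESS_KEY_SECRET]",
    endpoint="http://localhost:9000",  # placeholder: e.g. a self-hosted S3-compatible endpoint
    region="us-east-1",                # placeholder region
)
documents = loader.load_data()
```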
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/s3/base.py b/nextpy/ai/rag/document_loaders/opendal_reader/s3/base.py deleted file mode 100644 index 3adbfa52..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/s3/base.py +++ /dev/null @@ -1,75 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""S3 file and directory reader. - -A loader that fetches a file or iterates through a directory on AWS S3 or other compatible service. - -""" - -from typing import Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class OpendalS3Reader(BaseReader): - """General reader for any S3 file or directory.""" - - def __init__( - self, - bucket: str, - path: str = "/", - endpoint: str = "", - region: str = "", - access_key_id: str = "", - secret_access_key: str = "", - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - ) -> None: - """Initialize S3 bucket and key, along with credentials if needed. - - If key is not set, the entire bucket (filtered by prefix) is parsed. - - Args: - bucket (str): the name of your S3 bucket - path (str): the path of the data. If none is provided, - this loader will iterate through the entire bucket. If path is endswith `/`, this loader will iterate through the entire dir. Otherwise, this loeader will load the file. - endpoint Optional[str]: the endpoint of the S3 service. - region: Optional[str]: the region of the S3 service. - access_key_id (Optional[str]): provide AWS access key directly. - secret_access_key (Optional[str]): provide AWS access key directly. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - """ - super().__init__() - - self.path = path - self.file_extractor = file_extractor - - # opendal service related config. 
- self.options = { - "access_key": access_key_id, - "secret_key": secret_access_key, - "endpoint": endpoint, - "region": region, - "bucket": bucket, - } - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from OpenDAL.""" - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - OpendalReader = import_loader("OpendalReader") - except ImportError: - OpendalReader = download_loader("OpendalReader") - loader = OpendalReader( - scheme="s3", - path=self.path, - file_extractor=self.file_extractor, - **self.options, - ) - - return loader.load_data() diff --git a/nextpy/ai/rag/document_loaders/opendal_reader/s3/requirements.txt b/nextpy/ai/rag/document_loaders/opendal_reader/s3/requirements.txt deleted file mode 100644 index e55fe80f..00000000 --- a/nextpy/ai/rag/document_loaders/opendal_reader/s3/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opendal==0.30.3 diff --git a/nextpy/ai/rag/document_loaders/outlook_localcalendar/README.md b/nextpy/ai/rag/document_loaders/outlook_localcalendar/README.md deleted file mode 100644 index 3c91af6e..00000000 --- a/nextpy/ai/rag/document_loaders/outlook_localcalendar/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Outlook Local Calendar Loader - -This loader reads your past and upcoming Calendar events from your local Outlook .ost or .pst and parses the relevant info into `Documents`. - -It runs on Windows only and has only been tested with Windows 11. It has been designed to have a superset of the functionality of the Google Calendar reader. - -## Usage - -Here's an example usage of the OutlookCalendar Reader. It will retrieve up to 100 future events, unless an optional `number_of_results` argument is passed. It will also retrieve only future events, unless an optional `start_date` argument is passed. Optionally, events can be restricted to those which occur on or before a specific date by specifying the optional `end_date` parameter. By default, `end_date` is 2199-01-01. - -It always returns Start, End, Subject, Location, and Organizer attributes and optionally returns additional attributes specified in the `more_attributes` parameter, which, if specified, must be a list of strings, e.g. `['Body', 'someotherattribute', ...]`. Attributes which don't exist in a calendar entry are ignored without warning. - -```python -from nextpy.ai import download_loader - -OutlookCalendarReader = download_loader('OutlookLocalCalendarReader') - -loader = OutlookCalendarReader() -documents = loader.load_data() -``` - -## Example - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -OutlookCalendarReader = download_loader('OutlookLocalCalendarReader') - -loader = OutlookCalendarReader(start_date='2022-01-01', number_of_results=1000) - -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) -index.query('When did I last see George Guava? When do I see him again?') -``` -Note: it is actually better to give a structured prompt with this data and to make sure it is clear what today's date is and whether you want any data besides the indexed data used in answering the prompt.
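Regarding the note above about structured prompts: a sketch of the kind of query it suggests, which states today's date explicitly before asking about the indexed calendar data (reusing the `index` built in the example).

```python
import datetime

today = datetime.date.today().isoformat()
query = (
    f"Today's date is {today}. Answer using only the indexed calendar events: "
    "when did I last see George Guava, and when do I see him again?"
)
print(index.query(query))  # `index` is the GPTVectorDBIndex from the example above
```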
diff --git a/nextpy/ai/rag/document_loaders/outlook_localcalendar/__init__,py b/nextpy/ai/rag/document_loaders/outlook_localcalendar/__init__,py deleted file mode 100644 index 3a5547f4..00000000 --- a/nextpy/ai/rag/document_loaders/outlook_localcalendar/__init__,py +++ /dev/null @@ -1 +0,0 @@ -"""Init file.""" \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/outlook_localcalendar/base.py b/nextpy/ai/rag/document_loaders/outlook_localcalendar/base.py deleted file mode 100644 index d6bcd5f3..00000000 --- a/nextpy/ai/rag/document_loaders/outlook_localcalendar/base.py +++ /dev/null @@ -1,116 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Outlook local calendar reader for Windows.""" - -""" -Created on Sun Apr 16 12:03:19 2023 - -@author: tevslin -""" - - -import datetime -import importlib -import platform -from typing import List, Optional, Union - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -# Copyright 2023 Evslin Consulting -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class OutlookLocalCalendarReader(BaseReader): - """Outlook local calendar reader for Windows. - Reads events from local copy of Outlook calendar. - """ - - def load_data( - self, - number_of_results: Optional[int] = 100, - start_date: Optional[Union[str, datetime.date]] = None, - end_date: Optional[Union[str, datetime.date]] = None, - more_attributes: Optional[List[str]] = None, - ) -> List[DocumentNode]: - """Load data from user's local calendar. - - Args: - number_of_results (Optional[int]): the number of events to return. Defaults to 100. - start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today. - end_date (Optional[Union[str, datetime.date]]): the last date (inclusive) to return events from. Defaults to 2199-01-01. - more_attributes (Optional[ List[str]]): additional attributes to be retrieved from calendar entries. Non-existnat attributes are ignored. - - Returns a list of documents sutitable for indexing by llam_index. Always returns Start, End, Subject, Location, and Organizer - attributes and optionally returns additional attributes specified in the more_attributes parameter. 
- """ - metadata = { - "number_of_results": number_of_results, - "start_date": start_date, - "end_date": end_date, - "more_attributes": more_attributes, - } - - if platform.system().lower() != "windows": - return [] - attributes = [ - "Start", - "End", - "Subject", - "Location", - "Organizer", - ] # base attrubutes to return - if more_attributes is not None: # if the user has specified more attributes - attributes += more_attributes - if start_date is None: - start_date = datetime.date.today() - elif isinstance(start_date, str): - start_date = datetime.date.fromisoformat(start_date) - - # Initialize the Outlook application - winstuff = importlib.import_module("win32com.client") - outlook = winstuff.Dispatch("Outlook.Application").GetNamespace("MAPI") - - # Get the Calendar folder - calendar_folder = outlook.GetDefaultFolder(9) - - # Retrieve calendar items - events = calendar_folder.Items - - if not events: - return [] - events.Sort("[Start]") # Sort items by start time - numberReturned = 0 - results = [] - for event in events: - converted_date = datetime.date( - event.Start.year, event.Start.month, event.Start.day - ) - if converted_date > start_date: # if past start date - numberReturned += 1 - eventstring = "" - for attribute in attributes: - if hasattr(event, attribute): - eventstring += f"{attribute}: {getattr(event,attribute)}, " - results.append(DocumentNode(text=eventstring, extra_info=metadata)) - if numberReturned >= number_of_results: - break - - return results - - -if __name__ == "__main__": - reader = OutlookLocalCalendarReader() - print(reader.load_data()) diff --git a/nextpy/ai/rag/document_loaders/outlook_localcalendar/requirements.txt b/nextpy/ai/rag/document_loaders/outlook_localcalendar/requirements.txt deleted file mode 100644 index 10c9d322..00000000 --- a/nextpy/ai/rag/document_loaders/outlook_localcalendar/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pywin32 diff --git a/nextpy/ai/rag/document_loaders/pandas_ai/README.md b/nextpy/ai/rag/document_loaders/pandas_ai/README.md deleted file mode 100644 index ea720314..00000000 --- a/nextpy/ai/rag/document_loaders/pandas_ai/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# Pandas AI Loader - -This loader is a light wrapper around the `PandasAI` Python package. - -See here: https://github.com/gventuri/pandas-ai. - -You can directly get the result of `pandasai.run` command, or -you can choose to load in `DocumentNode` objects via `load_data`. 
- -## Usage - -```python -from nextpy.ai import download_loader -from pandasai.llm.openai import OpenAI -import pandas as pd - -# Sample DataFrame -df = pd.DataFrame({ - "country": ["United States", "United Kingdom", "France", "Germany", "Italy", "Spain", "Canada", "Australia", "Japan", "China"], - "gdp": [21400000, 2940000, 2830000, 3870000, 2160000, 1350000, 1780000, 1320000, 516000, 14000000], - "happiness_index": [7.3, 7.2, 6.5, 7.0, 6.0, 6.3, 7.3, 7.3, 5.9, 5.0] -}) - -llm = OpenAI() - -PandasAIReader = download_loader("PandasAIReader") - -# use run_pandas_ai directly -# set is_conversational_answer=False to get parsed output -reader = PandasAIReader(llm=llm) -response = reader.run_pandas_ai( - df, - "Which are the 5 happiest countries?", - is_conversational_answer=False -) -print(response) - -# load data with is_conversational_answer=False -# will use our PandasCSVReader under the hood -docs = reader.load_data( - df, - "Which are the 5 happiest countries?", - is_conversational_answer=False -) - -# load data with is_conversational_answer=True -# will return the conversational answer as a single DocumentNode -docs = reader.load_data( - df, - "Which are the 5 happiest countries?", - is_conversational_answer=True -) - - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/pandas_ai/__init__.py b/nextpy/ai/rag/document_loaders/pandas_ai/__init__.py deleted file mode 100644 index 1c233aca..00000000 --- a/nextpy/ai/rag/document_loaders/pandas_ai/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init params.""" diff --git a/nextpy/ai/rag/document_loaders/pandas_ai/base.py b/nextpy/ai/rag/document_loaders/pandas_ai/base.py deleted file mode 100644 index f916b090..00000000 --- a/nextpy/ai/rag/document_loaders/pandas_ai/base.py +++ /dev/null @@ -1,127 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Pandas AI loader.""" - -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, List, Optional - -import numpy as np -import pandas as pd - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.readers.download import download_loader -from nextpy.ai.schema import DocumentNode - - -class PandasAIReader(BaseReader): - """Pandas AI reader. - - Light wrapper around https://github.com/gventuri/pandas-ai. - - Args: - llm (Optional[pandasai.llm]): LLM to use. Defaults to None. - concat_rows (bool): whether to concatenate all rows into one DocumentNode. - If set to False, a DocumentNode will be created for each row. - True by default. - - col_joiner (str): Separator to use for joining cols per row. - Set to ", " by default. - - row_joiner (str): Separator to use for joining each row.
- Only used when `concat_rows=True`. - Set to "\n" by default. - - pandas_config (dict): Options for the `pandas.read_csv` function call. - Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html - for more information. - Set to empty dict by default, this means pandas will try to figure - out the separators, table head, etc. on its own. - - """ - - def __init__( - self, - llm: Optional[Any] = None, - concat_rows: bool = True, - col_joiner: str = ", ", - row_joiner: str = "\n", - pandas_config: dict = {}, - ) -> None: - """Init params.""" - try: - from pandasai import PandasAI - from pandasai.llm.openai import OpenAI - except ImportError: - raise ImportError("Please install pandasai to use this reader.") - - self._llm = llm or OpenAI() - self._pandas_ai = PandasAI(llm) - - self._concat_rows = concat_rows - self._col_joiner = col_joiner - self._row_joiner = row_joiner - self._pandas_config = pandas_config - - def run_pandas_ai( - self, - initial_df: pd.DataFrame, - query: str, - is_conversational_answer: bool = False, - ) -> Any: - """Load dataframe.""" - return self._pandas_ai.run( - initial_df, prompt=query, is_conversational_answer=is_conversational_answer - ) - - def load_data( - self, - initial_df: pd.DataFrame, - query: str, - is_conversational_answer: bool = False, - ) -> List[DocumentNode]: - """Parse file.""" - metadata = { - "llm": self._llm, - "initial_df": initial_df, - "query": query, - "is_conversational_answer": is_conversational_answer, - } - - result = self.run_pandas_ai( - initial_df, query, is_conversational_answer=is_conversational_answer - ) - if is_conversational_answer: - return [DocumentNode(text=result, extra_info=metadata)] - else: - if isinstance(result, (np.generic)): - result = pd.Series(result) - elif isinstance(result, (pd.Series, pd.DataFrame)): - pass - else: - raise ValueError("Unexpected type for result: {}".format(type(result))) - # if not conversational answer, use Pandas CSV Reader - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - PandasCSVReader = import_loader("PandasCSVReader") - except ImportError: - PandasCSVReader = download_loader("PandasCSVReader") - - reader = PandasCSVReader( - concat_rows=self._concat_rows, - col_joiner=self._col_joiner, - row_joiner=self._row_joiner, - pandas_config=self._pandas_config, - ) - - with TemporaryDirectory() as tmpdir: - outpath = Path(tmpdir) / "out.csv" - with outpath.open("w") as f: - # TODO: add option to specify index=False - result.to_csv(f, index=False) - - docs = reader.load_data(outpath, metadata) - return docs diff --git a/nextpy/ai/rag/document_loaders/pandas_ai/requirements.txt b/nextpy/ai/rag/document_loaders/pandas_ai/requirements.txt deleted file mode 100644 index b758d929..00000000 --- a/nextpy/ai/rag/document_loaders/pandas_ai/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandasai \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/papers/arxiv/README.md b/nextpy/ai/rag/document_loaders/papers/arxiv/README.md deleted file mode 100644 index 311f2c94..00000000 --- a/nextpy/ai/rag/document_loaders/papers/arxiv/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Arxiv Papers Loader - -This loader fetchs the text from the most relevant scientific papers on Arxiv specified by a search query (e.g. "Artificial Intelligence"). For each paper, the abstract is extracted and put in a separate DocumentNode. 
The search query may be any string, Arxiv paper id, or a general Arxiv query string (see the full list of capabilities [here](https://info.arxiv.org/help/api/user-manual.html#query_details)). - -## Usage - -To use this loader, you need to pass in the search query. You may also optionally specify a local directory to temporarily store the paper PDFs (they are deleted automatically) and the maximum number of papers you want to parse for your search query (default is 10). - -```python -from nextpy.ai import download_loader - -ArxivReader = download_loader("ArxivReader") - -loader = ArxivReader() -documents = loader.load_data(search_query='au:Karpathy') -``` - -Alternatively, if you would like to load papers and abstracts separately: - -```python -from nextpy.ai import download_loader - -ArxivReader = download_loader("ArxivReader") - -loader = ArxivReader() -documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy') -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/papers/arxiv/__init__.py b/nextpy/ai/rag/document_loaders/papers/arxiv/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/papers/arxiv/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/papers/arxiv/base.py b/nextpy/ai/rag/document_loaders/papers/arxiv/base.py deleted file mode 100644 index 86da8c7b..00000000 --- a/nextpy/ai/rag/document_loaders/papers/arxiv/base.py +++ /dev/null @@ -1,177 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read Arxiv Papers.""" -import hashlib -import logging -import os -from typing import List, Optional, Tuple - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class ArxivReader(BaseReader): - """Arxiv Reader. - - Gets a search query, return a list of Documents of the top corresponding scientific papers on Arxiv. - """ - - def __init__( - self, - ): - """Initialize with parameters.""" - super().__init__() - - def _hacky_hash(self, some_string): - _hash = hashlib.md5(some_string.encode("utf-8")).hexdigest() - return _hash - - def load_data( - self, - search_query: str, - papers_dir: Optional[str] = ".papers", - max_results: Optional[int] = 10, - ) -> List[DocumentNode]: - """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them. - - Args: - search_query (str): A topic to search for (e.g. "Artificial Intelligence"). - papers_dir (Optional[str]): Locally directory to store the papers - max_results (Optional[int]): Maximum number of papers to fetch. 
- - Returns: - List[DocumentNode]: A list of DocumentNode objects. - """ - import arxiv - - arxiv_search = arxiv.Search( - query=search_query, - id_list=[], - max_results=max_results, - sort_by=arxiv.SortCriterion.Relevance, - ) - search_results = list(arxiv_search.results()) - logging.debug(f"> Successfully fetched {len(search_results)} paperes") - - if not os.path.exists(papers_dir): - os.makedirs(papers_dir) - - paper_lookup = {} - for paper in search_results: - # Hash filename to avoid bad charaters in file path - filename = f"{self._hacky_hash(paper.title)}.pdf" - paper_lookup[os.path.join(papers_dir, filename)] = { - "Title of this paper": paper.title, - "Authors": (", ").join([a.name for a in paper.authors]), - "Date published": paper.published.strftime("%m/%d/%Y"), - "URL": paper.entry_id, - # "summary": paper.summary - } - paper.download_pdf(dirpath=papers_dir, filename=filename) - logging.debug(f"> Downloading {filename}...") - - def get_paper_metadata(filename): - metadata = paper_lookup[filename] - metadata["search_query"] = search_query - metadata["papers_dir"] = papers_dir - metadata["max_results"] = max_results - return metadata - - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - arxiv_documents = SimpleDirectoryReader( - papers_dir, file_metadata=get_paper_metadata - ).load_data() - # Include extra documents containing the abstracts - abstract_documents = [] - for paper in search_results: - d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}" - abstract_documents.append(DocumentNode(text=d)) - - # Delete downloaded papers - try: - for f in os.listdir(papers_dir): - os.remove(os.path.join(papers_dir, f)) - logging.debug(f"> Deleted file: {f}") - os.rmdir(papers_dir) - logging.debug(f"> Deleted directory: {papers_dir}") - except OSError: - print("Unable to delete files or directory") - - return arxiv_documents + abstract_documents - - def load_papers_and_abstracts( - self, - search_query: str, - papers_dir: Optional[str] = ".papers", - max_results: Optional[int] = 10, - ) -> Tuple[List[DocumentNode], List[DocumentNode]]: - """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them. - - Args: - search_query (str): A topic to search for (e.g. "Artificial Intelligence"). - papers_dir (Optional[str]): Locally directory to store the papers - max_results (Optional[int]): Maximum number of papers to fetch. 
- - Returns: - List[DocumentNode]: A list of DocumentNode objects representing the papers themselves - List[DocumentNode]: A list of DocumentNode objects representing abstracts only - """ - import arxiv - - arxiv_search = arxiv.Search( - query=search_query, - id_list=[], - max_results=max_results, - sort_by=arxiv.SortCriterion.Relevance, - ) - search_results = list(arxiv_search.results()) - logging.debug(f"> Successfully fetched {len(search_results)} paperes") - - if not os.path.exists(papers_dir): - os.makedirs(papers_dir) - - paper_lookup = {} - for paper in search_results: - # Hash filename to avoid bad charaters in file path - filename = f"{self._hacky_hash(paper.title)}.pdf" - paper_lookup[os.path.join(papers_dir, filename)] = { - "Title of this paper": paper.title, - "Authors": (", ").join([a.name for a in paper.authors]), - "Date published": paper.published.strftime("%m/%d/%Y"), - "URL": paper.entry_id, - # "summary": paper.summary - } - paper.download_pdf(dirpath=papers_dir, filename=filename) - logging.debug(f"> Downloading {filename}...") - - def get_paper_metadata(filename): - return paper_lookup[filename] - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - SimpleDirectoryReader = import_loader("SimpleDirectoryReader") - except ImportError: - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - arxiv_documents = SimpleDirectoryReader( - papers_dir, file_metadata=get_paper_metadata - ).load_data() - # Include extra documents containing the abstracts - abstract_documents = [] - for paper in search_results: - d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}" - abstract_documents.append(DocumentNode(text=d)) - - # Delete downloaded papers - try: - for f in os.listdir(papers_dir): - os.remove(os.path.join(papers_dir, f)) - logging.debug(f"> Deleted file: {f}") - os.rmdir(papers_dir) - logging.debug(f"> Deleted directory: {papers_dir}") - except OSError: - print("Unable to delete files or directory") - - return arxiv_documents, abstract_documents diff --git a/nextpy/ai/rag/document_loaders/papers/arxiv/requirements.txt b/nextpy/ai/rag/document_loaders/papers/arxiv/requirements.txt deleted file mode 100644 index 164782d5..00000000 --- a/nextpy/ai/rag/document_loaders/papers/arxiv/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -arxiv \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/papers/pubmed/README.md b/nextpy/ai/rag/document_loaders/papers/pubmed/README.md deleted file mode 100644 index 92621ee9..00000000 --- a/nextpy/ai/rag/document_loaders/papers/pubmed/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Pubmed Papers Loader - -This loader fetchs the text from the most relevant scientific papers on Pubmed specified by a search query (e.g. "Alzheimers"). For each paper, the abstract is included in the `DocumentNode`. The search query may be any string. - -## Usage - -To use this loader, you need to pass in the search query. You may also optionally specify the maximum number of papers you want to parse for your search query (default is 10). - -```python -from nextpy.ai import download_loader - -PubmedReader = download_loader("PubmedReader") - -loader = PubmedReader() -documents = loader.load_data(search_query='amyloidosis') -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/papers/pubmed/__init__.py b/nextpy/ai/rag/document_loaders/papers/pubmed/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/papers/pubmed/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/papers/pubmed/base.py b/nextpy/ai/rag/document_loaders/papers/pubmed/base.py deleted file mode 100644 index 5827517d..00000000 --- a/nextpy/ai/rag/document_loaders/papers/pubmed/base.py +++ /dev/null @@ -1,174 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Read Pubmed Papers.""" -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PubmedReader(BaseReader): - """Pubmed Reader. - - Takes a search query and returns a list of Documents of the most relevant scientific papers on Pubmed. - """ - - def load_data_bioc( - self, - search_query: str, - max_results: Optional[int] = 10, - ) -> List[DocumentNode]: - """Search for a topic on Pubmed, fetch the text of the most relevant full-length papers. - Uses the BioC API, which has frequently been down. - - Args: - search_query (str): A topic to search for (e.g. "Alzheimers"). - max_results (Optional[int]): Maximum number of papers to fetch. - - Returns: - List[DocumentNode]: A list of DocumentNode objects.
- """ - import xml.etree.ElementTree as xml - from datetime import datetime - - import requests - - pubmed_search = [] - parameters = {"tool": "tool", "email": "email", "db": "pmc"} - parameters["term"] = search_query - parameters["retmax"] = max_results - resp = requests.get( - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", - params=parameters, - ) - root = xml.fromstring(resp.content) - - for elem in root.iter(): - if elem.tag == "Id": - _id = elem.text - try: - resp = requests.get( - f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{_id}/ascii" - ) - info = resp.json() - title = "Pubmed Paper" - try: - title = [ - p["text"] - for p in info["documents"][0]["passages"] - if p["infons"]["section_type"] == "TITLE" - ][0] - except KeyError: - pass - pubmed_search.append( - { - "title": title, - "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/", - "date": info["date"], - "documents": info["documents"], - } - ) - except Exception: - print(f"Unable to parse PMC{_id} or it does not exist") - pass - - # Then get documents from Pubmed text, which includes abstracts - pubmed_documents = [] - for paper in pubmed_search: - for d in paper["documents"]: - text = "\n".join([p["text"] for p in d["passages"]]) - pubmed_documents.append( - DocumentNode( - text=text, - extra_info={ - "Title of this paper": paper["title"], - "URL": paper["url"], - "Date published": datetime.strptime( - paper["date"], "%Y%m%d" - ).strftime("%m/%d/%Y"), - }, - ) - ) - - return pubmed_documents - - def load_data( - self, - search_query: str, - max_results: Optional[int] = 10, - ) -> List[DocumentNode]: - """Search for a topic on Pubmed, fetch the text of the most relevant full-length papers. - Args: - search_query (str): A topic to search for (e.g. "Alzheimers"). - max_results (Optional[int]): Maximum number of papers to fetch. - - Returns: - List[DocumentNode]: A list of DocumentNode objects. 
- """ - import time - import xml.etree.ElementTree as xml - - import requests - - pubmed_search = [] - parameters = {"tool": "tool", "email": "email", "db": "pmc"} - parameters["term"] = search_query - parameters["retmax"] = max_results - resp = requests.get( - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", - params=parameters, - ) - root = xml.fromstring(resp.content) - - for elem in root.iter(): - if elem.tag == "Id": - _id = elem.text - url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id={_id}&db=pmc" - print(url) - try: - resp = requests.get(url) - info = xml.fromstring(resp.content) - - raw_text = "" - title = "" - journal = "" - for element in info.iter(): - if element.tag == "article-title": - title = element.text - elif element.tag == "journal-title": - journal = element.text - - if element.text: - raw_text += element.text.strip() + " " - - pubmed_search.append( - { - "title": title, - "journal": journal, - "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/", - "text": raw_text, - } - ) - time.sleep(1) # API rate limits - except Exception as e: - print(f"Unable to parse PMC{_id} or it does not exist:", e) - pass - - # Then get documents from Pubmed text, which includes abstracts - pubmed_documents = [] - for paper in pubmed_search: - pubmed_documents.append( - DocumentNode( - text=paper["text"], - extra_info={ - "Title of this paper": paper["title"], - "Journal it was published in:": paper["journal"], - "URL": paper["url"], - "search_query": search_query, - "max_results": max_results, - }, - ) - ) - - return pubmed_documents diff --git a/nextpy/ai/rag/document_loaders/pinecone/README.md b/nextpy/ai/rag/document_loaders/pinecone/README.md deleted file mode 100644 index 7b07e293..00000000 --- a/nextpy/ai/rag/document_loaders/pinecone/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Pinecone Loader - -The Pinecone Loader returns a set of texts corresponding to embeddings retrieved from a Pinecone Index. -The user initializes the loader with a Pinecone index. They then pass in a query vector. - -## Usage - -Here's an example usage of the PineconeReader. - -```python -from nextpy.ai import download_loader -import os - -PineconeReader = download_loader('PineconeReader') - -# the id_to_text_map specifies a mapping from the ID specified in Pinecone to your text. -id_to_text_map = { - "id1": "text blob 1", - "id2": "text blob 2", -} - -# the query_vector is an embedding representation of your query_vector -# Example query vector: -# query_vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] - -query_vector=[n1, n2, n3, ...] - -reader = PineconeReader(api_key=api_key, environment="us-west1-gcp") -documents = reader.load_data( - index_name='quickstart', - id_to_text_map=id_to_text_map, - top_k=3, - vector=query_vector, - separate_documents=True -) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/pinecone/__init__.py b/nextpy/ai/rag/document_loaders/pinecone/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/pinecone/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. 
-# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/pinecone/base.py b/nextpy/ai/rag/document_loaders/pinecone/base.py deleted file mode 100644 index 216e33b6..00000000 --- a/nextpy/ai/rag/document_loaders/pinecone/base.py +++ /dev/null @@ -1,90 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Pinecone reader.""" - -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class PineconeReader(BaseReader): - """Pinecone reader. - - Args: - api_key (str): Pinecone API key. - environment (str): Pinecone environment. - """ - - def __init__(self, api_key: str, environment: str): - """Initialize with parameters.""" - import pinecone # noqa: F401 - - self._api_key = api_key - self._environment = environment - pinecone.init(api_key=api_key, environment=environment) - - def load_data( - self, - index_name: str, - id_to_text_map: Dict[str, str], - vector: Optional[List[float]], - top_k: int, - separate_documents: bool = True, - include_values: bool = True, - **query_kwargs: Any - ) -> List[DocumentNode]: - """Load data from Pinecone. - - Args: - index_name (str): Name of the index. - id_to_text_map (Dict[str, str]): A map from ID's to text. - separate_documents (Optional[bool]): Whether to return separate - documents per retrieved entry. Defaults to True. - vector (List[float]): Query vector. - top_k (int): Number of results to return. - include_values (bool): Whether to include the embedding in the response. - Defaults to True. - **query_kwargs: Keyword arguments to pass to the query. - Arguments are the exact same as those found in - Pinecone's reference documentation for the - query method. - - Returns: - List[DocumentNode]: A list of documents. 
- """ - metadata = { - "index_name": index_name, - "id_to_text_map": id_to_text_map, - "vector": vector, - "top k": top_k, - "separate_documents": separate_documents, - "include_values": include_values, - } - - import pinecone - - index = pinecone.Index(index_name) - if "include_values" not in query_kwargs: - query_kwargs["include_values"] = True - response = index.query(top_k=top_k, vector=vector, **query_kwargs) - - documents = [] - for match in response.matches: - if match.id not in id_to_text_map: - raise ValueError("ID not found in id_to_text_map.") - text = id_to_text_map[match.id] - embedding = match.values - if len(embedding) == 0: - embedding = None - documents.append( - DocumentNode(text=text, embedding=embedding, extra_info=metadata) - ) - - if not separate_documents: - text_list = [doc.get_text() for doc in documents] - text = "\n\n".join(text_list) - documents = [DocumentNode(text=text, extra_info=metadata)] - - return documents diff --git a/nextpy/ai/rag/document_loaders/pinecone/requirements.txt b/nextpy/ai/rag/document_loaders/pinecone/requirements.txt deleted file mode 100644 index 8bf0a1e2..00000000 --- a/nextpy/ai/rag/document_loaders/pinecone/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pinecone-client \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/qdrant/README.md b/nextpy/ai/rag/document_loaders/qdrant/README.md deleted file mode 100644 index 7fb414f8..00000000 --- a/nextpy/ai/rag/document_loaders/qdrant/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Qdrant Loader - -The Qdrant Loader returns a set of texts corresponding to embeddings retrieved from a Qdrant Index. -The user initializes the loader with a Qdrant index. They then pass in a query vector. - -## Usage - -Here's an example usage of the QdrantReader. - -```python -from nextpy.ai import download_loader -import os - -QdrantReader = download_loader("QdrantReader") - -reader = QdrantReader(host="localhost") -# the query_vector is an embedding representation of your query_vector -# Example query vector: -# query_vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] - -query_vector=[n1, n2, n3, ...] - -# NOTE: Required args are collection_name, query_vector. -# See the Python client: https://github.com/qdrant/qdrant_client -# for more details. -documents = reader.load_data( - collection_name="demo", - query_vector=query_vector, - limit=5 -) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/qdrant/__init__.py b/nextpy/ai/rag/document_loaders/qdrant/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/qdrant/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/qdrant/base.py b/nextpy/ai/rag/document_loaders/qdrant/base.py deleted file mode 100644 index 08be16f9..00000000 --- a/nextpy/ai/rag/document_loaders/qdrant/base.py +++ /dev/null @@ -1,205 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Qdrant reader.""" - -from typing import Dict, List, Optional, cast - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class QdrantReader(BaseReader): - """Qdrant reader. - - Retrieve documents from existing Qdrant collections. - - Args: - location: - If `:memory:` - use in-memory Qdrant instance. - If `str` - use it as a `url` parameter. - If `None` - use default values for `host` and `port`. - url: - either host or str of - "Optional[scheme], host, Optional[port], Optional[prefix]". - Default: `None` - port: Port of the REST API interface. Default: 6333 - grpc_port: Port of the gRPC interface. Default: 6334 - prefer_grpc: If `true` - use gPRC interface whenever possible in custom methods. - https: If `true` - use HTTPS(SSL) protocol. Default: `false` - api_key: API key for authentication in Qdrant Cloud. Default: `None` - prefix: - If not `None` - add `prefix` to the REST URL path. - Example: `service/v1` will result in - `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API. - Default: `None` - timeout: - Timeout for REST and gRPC API requests. - Default: 5.0 seconds for REST and unlimited for gRPC - host: Host name of Qdrant service. If url and host are None, set to 'localhost'. - Default: `None` - """ - - def __init__( - self, - location: Optional[str] = None, - url: Optional[str] = None, - port: Optional[int] = 6333, - grpc_port: int = 6334, - prefer_grpc: bool = False, - https: Optional[bool] = None, - api_key: Optional[str] = None, - prefix: Optional[str] = None, - timeout: Optional[float] = None, - host: Optional[str] = None, - path: Optional[str] = None, - ): - """Initialize with parameters.""" - import_err_msg = ( - "`qdrant-client` package not found, please run `pip install qdrant-client`" - ) - - self.url = url - - try: - import qdrant_client # noqa: F401 - except ImportError: - raise ImportError(import_err_msg) - - self._client = qdrant_client.QdrantClient( - location=location, - url=url, - port=port, - grpc_port=grpc_port, - prefer_grpc=prefer_grpc, - https=https, - api_key=api_key, - prefix=prefix, - timeout=timeout, - host=host, - path=path, - ) - - def load_data( - self, - collection_name: str, - query_vector: List[float], - should_search_mapping: Optional[Dict[str, str]] = None, - must_search_mapping: Optional[Dict[str, str]] = None, - must_not_search_mapping: Optional[Dict[str, str]] = None, - rang_search_mapping: Optional[Dict[str, Dict[str, float]]] = None, - limit: int = 10, - ) -> List[DocumentNode]: - """Load data from Qdrant. - - Args: - collection_name (str): Name of the Qdrant collection. - query_vector (List[float]): Query vector. - should_search_mapping (Optional[Dict[str, str]]): Mapping from field name - to query string. - must_search_mapping (Optional[Dict[str, str]]): Mapping from field name - to query string. - must_not_search_mapping (Optional[Dict[str, str]]): Mapping from field - name to query string. 
- rang_search_mapping (Optional[Dict[str, Dict[str, float]]]): Mapping from - field name to range query. - limit (int): Number of results to return. - Example: - reader = QdrantReader() - reader.load_data( - collection_name="test_collection", - query_vector=[0.1, 0.2, 0.3], - should_search_mapping={"text_field": "text"}, - must_search_mapping={"text_field": "text"}, - must_not_search_mapping={"text_field": "text"}, - # gte, lte, gt, lt supported - rang_search_mapping={"text_field": {"gte": 0.1, "lte": 0.2}}, - limit=10 - ) - - Returns: - List[DocumentNode]: A list of documents. - """ - metadata = { - "url": self.url, - "collection_name": collection_name, - "query_vector": query_vector, - "should_search_mapping": should_search_mapping, - "must_search_mapping": must_search_mapping, - "must_not_search_mapping": must_not_search_mapping, - "rang_search_mapping": rang_search_mapping, - "limit": limit, - } - - from qdrant_client.http.models import ( - FieldCondition, - Filter, - MatchText, - MatchValue, - Range, - ) - from qdrant_client.http.models.models import Payload - - should_search_mapping = should_search_mapping or {} - must_search_mapping = must_search_mapping or {} - must_not_search_mapping = must_not_search_mapping or {} - rang_search_mapping = rang_search_mapping or {} - - should_search_conditions = [ - FieldCondition(key=key, match=MatchText(text=value)) - for key, value in should_search_mapping.items() - if should_search_mapping - ] - must_search_conditions = [ - FieldCondition(key=key, match=MatchValue(value=value)) - for key, value in must_search_mapping.items() - if must_search_mapping - ] - must_not_search_conditions = [ - FieldCondition(key=key, match=MatchValue(value=value)) - for key, value in must_not_search_mapping.items() - if must_not_search_mapping - ] - rang_search_conditions = [ - FieldCondition( - key=key, - range=Range( - gte=value.get("gte"), - lte=value.get("lte"), - gt=value.get("gt"), - lt=value.get("lt"), - ), - ) - for key, value in rang_search_mapping.items() - if rang_search_mapping - ] - should_search_conditions.extend(rang_search_conditions) - response = self._client.search( - collection_name=collection_name, - query_vector=query_vector, - query_filter=Filter( - must=must_search_conditions, - must_not=must_not_search_conditions, - should=should_search_conditions, - ), - with_vectors=True, - with_payload=True, - limit=limit, - ) - - documents = [] - for point in response: - payload = cast(Payload, point.payload) - try: - vector = cast(List[float], point.vector) - except ValueError as e: - raise ValueError("Could not cast vector to List[float].") from e - doc = DocumentNode( - doc_id=payload.get("doc_id"), - text=payload.get("text"), - extra_info={**payload.get("extra_info", {}), **metadata}, - embedding=vector, - ) - documents.append(doc) - - return documents diff --git a/nextpy/ai/rag/document_loaders/qdrant/requirements.txt b/nextpy/ai/rag/document_loaders/qdrant/requirements.txt deleted file mode 100644 index 2f03c119..00000000 --- a/nextpy/ai/rag/document_loaders/qdrant/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -qdrant_client \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/readwise/README.md b/nextpy/ai/rag/document_loaders/readwise/README.md deleted file mode 100644 index ac5a2892..00000000 --- a/nextpy/ai/rag/document_loaders/readwise/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Readwise Reader - -Use Readwise's export API to fetch your highlights from web articles, epubs, pdfs, Kindle, YouTube, and load the resulting text into 
LLMs. - -## Setup - -1. Get your Readwise API key from [readwise.io/access_token](https://readwise.io/access_token). - -## Usage - -Here is an example usage of the Readwise Reader: - -```python -import os -from nextpy.ai import GPTVectorDBIndex, download_loader - -ReadwiseReader = download_loader("ReadwiseReader") -token = os.getenv("READWISE_API_KEY") -loader = ReadwiseReader(api_key=token) -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) - -index.query("What was the paper 'Attention is all you need' about?") -``` - -You can also query for highlights that have been created after a certain time: - -```python -import os -import datetime -from nextpy.ai import GPTVectorDBIndex, download_loader - -ReadwiseReader = download_loader("ReadwiseReader") -token = os.getenv("READWISE_API_KEY") -loader = ReadwiseReader(api_key=token) -seven_days_ago = datetime.datetime.now() - datetime.timedelta(days=7) -documents = loader.load_data(updated_after=seven_days_ago) -index = GPTVectorDBIndex.from_documents(documents) - -index.query("What has Elon Musk done this time?") -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/readwise/__init__.py b/nextpy/ai/rag/document_loaders/readwise/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/readwise/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/readwise/base.py b/nextpy/ai/rag/document_loaders/readwise/base.py deleted file mode 100644 index 06a09942..00000000 --- a/nextpy/ai/rag/document_loaders/readwise/base.py +++ /dev/null @@ -1,66 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple Reader that loads highlights from Readwise.io.""" -import datetime -import json -from typing import List, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -def _get_readwise_data(api_key: str, updated_after: Optional[datetime.datetime] = None): - """Uses Readwise's export API to export all highlights, optionally after a specified date. - - See https://readwise.io/api_deets for details. - - Args: - updated_after (datetime.datetime): The datetime to load highlights after. Useful for updating indexes over time. 
- """ - result = [] - next_page = None - while True: - response = requests.get( - url="https://readwise.io/api/v2/export/", - params={ - "pageCursor": next_page, - "updatedAfter": updated_after.isoformat() if updated_after else None, - }, - headers={"Authorization": f"Token {api_key}"}, - ) - response.raise_for_status() - result.extend(response.json()["results"]) - next_page = response.json().get("nextPageCursor") - if not next_page: - break - return result - - -class ReadwiseReader(BaseReader): - """Reader for Readwise highlights.""" - - def __init__(self, api_key: str): - self._api_key = api_key - - def load_data( - self, - updated_after: Optional[datetime.datetime] = None, - ) -> List[DocumentNode]: - """Load your Readwise.io highlights. - - Args: - updated_after (datetime.datetime): The datetime to load highlights after. Useful for updating indexes over time. - """ - metadata = {"updated_after": updated_after} - - readwise_response = _get_readwise_data( - api_key=self._api_key, updated_after=updated_after - ) - result = [ - DocumentNode(text=json.dumps(d), extra_info=metadata) - for d in readwise_response - ] - return result diff --git a/nextpy/ai/rag/document_loaders/reddit/README.md b/nextpy/ai/rag/document_loaders/reddit/README.md deleted file mode 100644 index 7e5e80db..00000000 --- a/nextpy/ai/rag/document_loaders/reddit/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Reddit Reader - -For any subreddit(s) you're interested in, search for relevant posts using keyword(s) and load the resulting text in the post and and top-level comments into LLMs/ LangChains. - -## Get your Reddit credentials ready - -1. Visit Reddit App Preferences (https://www.reddit.com/prefs/apps) or [https://old.reddit.com/prefs/apps/](https://old.reddit.com/prefs/apps/) -2. Scroll to the bottom and click "create another app..." -3. Fill out the name, description, and redirect url for your app, then click "create app" -4. Now you should be able to see the personal use script, secret, and name of your app. Store those as environment variables REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, and REDDIT_USER_AGENT respecitvely. -5. Additionally store the environment variables REDDIT_USERNAME and REDDIT_PASSWORD, which correspond to the credentials for your Reddit account. 
- -## Usage - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -RedditReader = download_loader('RedditReader') - -subreddits = ['MachineLearning'] -search_keys = ['PyTorch', 'deploy'] -post_limit = 10 - -loader = RedditReader() -documents = loader.load_data(subreddits=subreddits, search_keys=search_keys, post_limit=post_limit) -index = GPTVectorDBIndex.from_documents(documents) - -index.query("What are the pain points of PyTorch users?") -``` - -### LangChain - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -RedditReader = download_loader('RedditReader') - -subreddits = ['MachineLearning'] -search_keys = ['PyTorch', 'deploy'] -post_limit = 10 - -loader = RedditReader() -documents = loader.load_data(subreddits=subreddits, search_keys=search_keys, post_limit=post_limit) -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Reddit Index", - func=lambda q: index.query(q), - description=f"Useful when you want to read relevant posts and top-level comments in subreddits.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What are the pain points of PyTorch users?") -print(output) - -``` - -This loader is designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/reddit/__init__.py b/nextpy/ai/rag/document_loaders/reddit/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/reddit/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/reddit/base.py b/nextpy/ai/rag/document_loaders/reddit/base.py deleted file mode 100644 index f5738e16..00000000 --- a/nextpy/ai/rag/document_loaders/reddit/base.py +++ /dev/null @@ -1,70 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple Reader that loads text relevant to a certain search keyword from subreddits.""" -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class RedditReader(BaseReader): - """Subreddit post and top-level comments reader for Reddit.""" - - def load_data( - self, - subreddits: List[str], - search_keys: List[str], - post_limit: Optional[int] = [10], - ) -> List[DocumentNode]: - """Load text from relevant posts and top-level comments in subreddit(s), given keyword(s) for search. 
- - Args: - subreddits (List[str]): List of subreddits you'd like to read from - search_keys (List[str]): List of keywords you'd like to use to search from subreddit(s) - post_limit (Optional[int]): Maximum number of posts per subreddit you'd like to read from, defaults to 10 - - """ - import os - - import praw - from praw.models import MoreComments - - reddit = praw.Reddit( - client_id=os.getenv("REDDIT_CLIENT_ID"), - client_secret=os.getenv("REDDIT_CLIENT_SECRET"), - user_agent=os.getenv("REDDIT_USER_AGENT"), - username=os.getenv("REDDIT_USERNAME"), - password=os.getenv("REDDIT_PASSWORD"), - ) - - posts = [] - - for sr in subreddits: - ml_subreddit = reddit.subreddit(sr) - - for kw in search_keys: - relevant_posts = ml_subreddit.search(kw, limit=post_limit) - - for post in relevant_posts: - metadata = { - "subreddits": sr, - "search_keys": kw, - "post_limit": post_limit, - } - posts.append(DocumentNode(text=post.selftext, extra_info=metadata)) - for top_level_comment in post.comments: - if isinstance(top_level_comment, MoreComments): - continue - metadata = { - "subreddits": sr, - "search_keys": kw, - "post_limit": post_limit, - } - posts.append( - DocumentNode( - text=top_level_comment.body, extra_info=metadata - ) - ) - - return posts diff --git a/nextpy/ai/rag/document_loaders/reddit/requirements.txt b/nextpy/ai/rag/document_loaders/reddit/requirements.txt deleted file mode 100644 index c1400b24..00000000 --- a/nextpy/ai/rag/document_loaders/reddit/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -praw~=7.6 -prawcore~=2.3 -requests~=2.28 -update-checker~=0.18 -websocket-client~=1.5 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/remote/README.md b/nextpy/ai/rag/document_loaders/remote/README.md deleted file mode 100644 index 6ba610b6..00000000 --- a/nextpy/ai/rag/document_loaders/remote/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Remote Page/File Loader - -This loader makes it easy to extract the text from any remote page or file using just its url. If there's a file at the url, this loader will download it temporarily and parse it using `SimpleDirectoryReader`. It is an all-in-one tool for (almost) any url. - -As a result, any page or type of file is supported. For instance, if a `.txt` url such as a [Project Gutenberg book](https://www.gutenberg.org/cache/epub/69994/pg69994.txt) is passed in, the text will be parsed as is. On the other hand, if a hosted .mp3 url is passed in, it will be downloaded and parsed using `AudioTranscriber`. - -## Usage - -To use this loader, you need to pass in a `Path` to a local file. Optionally, you may specify a `file_extractor` for the `SimpleDirectoryReader` to use, other than the default one. - -```python -from nextpy.ai import download_loader - -RemoteReader = download_loader("RemoteReader") - -loader = RemoteReader() -documents = loader.load_data(url="https://en.wikipedia.org/wiki/File:Example.jpg") -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
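As a complementary sketch to the example above, a plain-text URL (such as the Project Gutenberg book already mentioned) is parsed as-is into a single DocumentNode, while other file types are downloaded to a temporary directory and handed to `SimpleDirectoryReader`:

```python
from nextpy.ai import download_loader

RemoteReader = download_loader("RemoteReader")

loader = RemoteReader()
# text/plain and text/html responses are read directly; anything else is
# downloaded and parsed by SimpleDirectoryReader (optionally via file_extractor).
documents = loader.load_data(
    url="https://www.gutenberg.org/cache/epub/69994/pg69994.txt"
)
```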
diff --git a/nextpy/ai/rag/document_loaders/remote/__init__.py b/nextpy/ai/rag/document_loaders/remote/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/remote/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/remote/base.py b/nextpy/ai/rag/document_loaders/remote/base.py deleted file mode 100644 index c84a1d88..00000000 --- a/nextpy/ai/rag/document_loaders/remote/base.py +++ /dev/null @@ -1,88 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Remote file reader. - -A loader that fetches an arbitrary remote page or file by URL and parses its contents. - -""" -import re -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class RemoteReader(BaseReader): - """General reader for any remote page or file.""" - - def __init__( - self, - *args: Any, - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - **kwargs: Any, - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - - self.file_extractor = file_extractor - - @staticmethod - def _is_youtube_video(url: str) -> bool: - # TODO create more global method for detecting all types - """Returns True if the given URL is a video on YouTube, False otherwise.""" - # Regular expression pattern to match YouTube video URLs - youtube_pattern = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([^\s&]+)" - - # Match the pattern against the URL - match = re.match(youtube_pattern, url) - - # If there's a match, it's a YouTube video URL - if match: - return True - - # Otherwise, it's not a YouTube video URL - return False - - def load_data(self, url: str) -> List[DocumentNode]: - """Parse whatever is at the URL.""" - import tempfile - from urllib.parse import urlparse - from urllib.request import Request, urlopen - - extra_info = {"Source": url} - - req = Request(url, headers={"User-Agent": "Magic Browser"}) - result = urlopen(req) - url_type = result.info().get_content_type() - documents = [] - if url_type == "text/html" or url_type == "text/plain": - text = "\n\n".join([str(el.decode("utf-8-sig")) for el in result]) - documents = [DocumentNode(text=text, extra_info=extra_info)] - elif self._is_youtube_video(url): - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - YoutubeTranscriptReader = import_loader("YoutubeTranscriptReader") - except ImportError: - YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader") - youtube_reader = YoutubeTranscriptReader() - # TODO should we have another langauge, like english / french? 
- documents = youtube_reader.load_data([url]) - else: - suffix = Path(urlparse(url).path).suffix - with tempfile.TemporaryDirectory() as temp_dir: - filepath = f"{temp_dir}/temp{suffix}" - with open(filepath, "wb") as output: - output.write(result.read()) - - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - loader = SimpleDirectoryReader( - temp_dir, - file_metadata=(lambda _: extra_info), - file_extractor=self.file_extractor, - ) - documents = loader.load_data() - return documents diff --git a/nextpy/ai/rag/document_loaders/remote_depth/README.md b/nextpy/ai/rag/document_loaders/remote_depth/README.md deleted file mode 100644 index e31a0196..00000000 --- a/nextpy/ai/rag/document_loaders/remote_depth/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Remote Page/File Loader - -This loader makes it easy to extract the text from the links available in a webpage URL, and extract the links presents in the page. It's based on `RemoteReader` (reading single page), that is based on `SimpleDirectoryReader` (parsing the DocumentNode if file is a pdf, etc). It is an all-in-one tool for (almost) any group of urls. - -You can try with this MIT lecture link, it will be able to extract the syllabus, the PDFs, etc: -`https://ocw.mit.edu/courses/5-05-principles-of-inorganic-chemistry-iii-spring-2005/pages/syllabus/` - -## Usage - -You need to specify the parameter `depth` to specify how many levels of links you want to extract. For example, if you want to extract the links in the page, and the links in the links in the page, you need to specify `depth=2`. - -```python -from nextpy.ai import download_loader - -RemoteDepthReader = download_loader("RemoteDepthReader") - -loader = RemoteDepthReader() -documents = loader.load_data(url="https://ocw.mit.edu/courses/5-05-principles-of-inorganic-chemistry-iii-spring-2005/pages/syllabus/") -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/remote_depth/__init__.py b/nextpy/ai/rag/document_loaders/remote_depth/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/remote_depth/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/remote_depth/base.py b/nextpy/ai/rag/document_loaders/remote_depth/base.py deleted file mode 100644 index a6cd528c..00000000 --- a/nextpy/ai/rag/document_loaders/remote_depth/base.py +++ /dev/null @@ -1,108 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Remote file reader. - -A loader that fetches any remote page or file by URL and retrieves child pages with certain constraints. The class also parses the contents of each page and provides access to the parsed data. 
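A minimal sketch of the constructor options: `depth` controls how many levels of links are followed, and `domain_lock` restricts crawling to links under the starting URL (the MIT OCW link from the usage section above is reused here):

```python
from nextpy.ai import download_loader

RemoteDepthReader = download_loader("RemoteDepthReader")

# depth=2 follows the links found on the page, plus the links found on those
# pages; domain_lock=True skips links that point outside the starting URL.
loader = RemoteDepthReader(depth=2, domain_lock=True)
documents = loader.load_data(
    url="https://ocw.mit.edu/courses/5-05-principles-of-inorganic-chemistry-iii-spring-2005/pages/syllabus/"
)
```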
-""" -from typing import Any, Dict, List, Optional, Union - -import requests - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class RemoteDepthReader(BaseReader): - def __init__( - self, - *args: Any, - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - depth: int = 1, - domain_lock: bool = False, - **kwargs: Any, - ) -> None: - """Init params.""" - super().__init__(*args, **kwargs) - self.file_extractor = file_extractor - self.depth = depth - self.domain_lock = domain_lock - - def load_data(self, url: str) -> List[DocumentNode]: - from tqdm.auto import tqdm - - """Parse whatever is at the URL.""" "" - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - RemoteReader = import_loader("RemoteReader") - except ImportError: - RemoteReader = download_loader("RemoteReader") - remote_reader = RemoteReader(file_extractor=self.file_extractor) - documents = [] - links = self.get_links(url) - urls = {-1: [url]} # -1 is the starting point - links_visited = [] - for i in range(self.depth + 1): - urls[i] = [] - new_links = [] - print(f"Reading links at depth {i}...") - for link in tqdm(links): - """Checking if the link belongs the provided domain.""" - if (self.domain_lock and link.find(url) > -1) or (not self.domain_lock): - print("Loading link: " + link) - if link in links_visited: - continue - if link: - urls[i].append(link) - new_links.extend(self.get_links(link)) - links_visited.append(link) - else: - print("Link ignored: " + link) - new_links = list(set(new_links)) - links = new_links - print(f"Found {len(urls)} links at depth {self.depth}.") - for depth_i in urls: - for url in urls[depth_i]: - try: - documents.extend(remote_reader.load_data(url)) - except Exception as e: - print(f"Error reading {url} at depth {depth_i}: {e}") - continue - - return documents - - @staticmethod - def is_url(href) -> bool: - """Check if a link is a URL.""" - return href.startswith("http") - - def get_links(self, url) -> List[str]: - from urllib.parse import urljoin, urlparse, urlunparse - - from bs4 import BeautifulSoup - - """Get all links from a page.""" - page = requests.get(url) - soup = BeautifulSoup(page.content, "html.parser") - - links = soup.find_all("a") - result = [] - for link in links: - href = link if isinstance(link, str) else link.get("href") - if href is not None and not self.is_url(href): - href = urljoin(url, href) - - url_parsed = urlparse(href) - url_without_query_string = urlunparse( - (url_parsed.scheme, url_parsed.netloc, url_parsed.path, "", "", "") - ) - - if ( - url_without_query_string not in result - and url_without_query_string - and url_without_query_string.startswith("http") - ): - result.append(url_without_query_string) - return result diff --git a/nextpy/ai/rag/document_loaders/remote_depth/requirements.txt b/nextpy/ai/rag/document_loaders/remote_depth/requirements.txt deleted file mode 100644 index ecfeee74..00000000 --- a/nextpy/ai/rag/document_loaders/remote_depth/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -tqdm~=4.64 -beautifulsoup4~=4.11 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/s3/README.md b/nextpy/ai/rag/document_loaders/s3/README.md deleted file mode 100644 index 91287be3..00000000 --- a/nextpy/ai/rag/document_loaders/s3/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# S3 File or Directory Loader - -This loader parses any file stored on S3, or the entire Bucket (with an optional prefix filter) if no 
particular file is specified. When initializing `S3Reader`, you may pass in your [AWS Access Key](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). If none are found, the loader assumes they are stored in `~/.aws/credentials`. - -All files are temporarily downloaded locally and subsequently parsed with `SimpleDirectoryReader`. Hence, you may also specify a custom `file_extractor`, relying on any of the loaders in this library (or your own)! - -## Usage - -To use this loader, you need to pass in the name of your S3 Bucket. After that, if you want to just parse a single file, pass in its key. Note that if the file is nested in a subdirectory, the key should contain that, so like `subdirectory/input.txt`. - -Otherwise, you may specify a prefix if you only want to parse certain files in the Bucket, or a subdirectory. AWS Access Key credentials may either be passed in during initialization or stored locally (see above). - -```python -from nextpy.ai import download_loader - -S3Reader = download_loader("S3Reader") - -loader = S3Reader(bucket='scrabble-dictionary', key='dictionary.txt', aws_access_id='[ACCESS_KEY_ID]', aws_access_secret='[ACCESS_KEY_SECRET]') -documents = loader.load_data() -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/s3/__init__.py b/nextpy/ai/rag/document_loaders/s3/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/s3/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/s3/base.py b/nextpy/ai/rag/document_loaders/s3/base.py deleted file mode 100644 index 3d82f714..00000000 --- a/nextpy/ai/rag/document_loaders/s3/base.py +++ /dev/null @@ -1,135 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""S3 file and directory reader. - -A loader that fetches a file or iterates through a directory on AWS S3. 
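In the whole-bucket mode described above, a prefix and an extension filter can narrow what gets parsed. A minimal sketch with hypothetical bucket and prefix names; the credential arguments can be omitted if keys are stored in `~/.aws/credentials`:

```python
from nextpy.ai import download_loader

S3Reader = download_loader("S3Reader")

# Hypothetical bucket and prefix; omitting `key` makes the loader iterate
# over every object under the prefix.
loader = S3Reader(
    bucket="my-documents-bucket",
    prefix="reports/2023/",
    required_exts=[".txt", ".pdf"],
    num_files_limit=100,
    aws_access_id="[ACCESS_KEY_ID]",
    aws_access_secret="[ACCESS_KEY_SECRET]",
)
documents = loader.load_data()
```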
- -""" -import tempfile -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class S3Reader(BaseReader): - """General reader for any S3 file or directory.""" - - def __init__( - self, - *args: Any, - bucket: str, - key: Optional[str] = None, - prefix: Optional[str] = "", - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, - required_exts: Optional[List[str]] = None, - filename_as_id: bool = False, - num_files_limit: Optional[int] = None, - file_metadata: Optional[Callable[[str], Dict]] = None, - aws_access_id: Optional[str] = None, - aws_access_secret: Optional[str] = None, - aws_session_token: Optional[str] = None, - s3_endpoint_url: Optional[str] = "https://s3.amazonaws.com", - **kwargs: Any, - ) -> None: - """Initialize S3 bucket and key, along with credentials if needed. - - If key is not set, the entire bucket (filtered by prefix) is parsed. - - Args: - bucket (str): the name of your S3 bucket - key (Optional[str]): the name of the specific file. If none is provided, - this loader will iterate through the entire bucket. - prefix (Optional[str]): the prefix to filter by in the case that the loader - iterates through the entire bucket. Defaults to empty string. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - required_exts (Optional[List[str]]): List of required extensions. - Default is None. - num_files_limit (Optional[int]): Maximum number of files to read. - Default is None. - file_metadata (Optional[Callable[str, Dict]]): A function that takes - in a filename and returns a Dict of metadata for the DocumentNode. - Default is None. - aws_access_id (Optional[str]): provide AWS access key directly. - aws_access_secret (Optional[str]): provide AWS access key directly. - s3_endpoint_url (Optional[str]): provide S3 endpoint URL directly. 
- """ - super().__init__(*args, **kwargs) - - self.bucket = bucket - self.key = key - self.prefix = prefix - - self.file_extractor = file_extractor - self.required_exts = required_exts - self.filename_as_id = filename_as_id - self.num_files_limit = num_files_limit - self.file_metadata = file_metadata - - self.aws_access_id = aws_access_id - self.aws_access_secret = aws_access_secret - self.aws_session_token = aws_session_token - self.s3_endpoint_url = s3_endpoint_url - - def load_data(self) -> List[DocumentNode]: - """Load file(s) from S3.""" - import boto3 - - s3 = boto3.resource("s3") - s3_client = boto3.client("s3") - if self.aws_access_id: - session = boto3.Session( - aws_access_key_id=self.aws_access_id, - aws_secret_access_key=self.aws_access_secret, - aws_session_token=self.aws_session_token, - ) - s3 = session.resource("s3") - s3_client = session.client("s3", endpoint_url=self.s3_endpoint_url) - - with tempfile.TemporaryDirectory() as temp_dir: - if self.key: - suffix = Path(self.key).suffix - filepath = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" - s3_client.download_file(self.bucket, self.key, filepath) - else: - bucket = s3.Bucket(self.bucket) - for i, obj in enumerate(bucket.objects.filter(Prefix=self.prefix)): - if self.num_files_limit is not None and i > self.num_files_limit: - break - - suffix = Path(obj.key).suffix - - is_dir = obj.key.endswith("/") # skip folders - is_bad_ext = ( - self.required_exts is not None - and suffix not in self.required_exts # skip other extentions - ) - - if is_dir or is_bad_ext: - continue - - filepath = ( - f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" - ) - s3_client.download_file(self.bucket, obj.key, filepath) - - try: - from nextpy.ai import SimpleDirectoryReader - except ImportError: - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") - - loader = SimpleDirectoryReader( - temp_dir, - file_extractor=self.file_extractor, - required_exts=self.required_exts, - filename_as_id=self.filename_as_id, - num_files_limit=self.num_files_limit, - file_metadata=self.file_metadata, - ) - - return loader.load_data() diff --git a/nextpy/ai/rag/document_loaders/s3/requirements.txt b/nextpy/ai/rag/document_loaders/s3/requirements.txt deleted file mode 100644 index 1db657b6..00000000 --- a/nextpy/ai/rag/document_loaders/s3/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -boto3 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/singlestore/README.md b/nextpy/ai/rag/document_loaders/singlestore/README.md deleted file mode 100644 index 82f39249..00000000 --- a/nextpy/ai/rag/document_loaders/singlestore/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# SingleStore Loader - -The SingleStore Loader retrieves a set of documents from a specified table in a SingleStore database. The user initializes the loader with database information and then provides a search embedding for retrieving similar documents. - -## Usage - -Here's an example usage of the SingleStoreReader: - -```python -from llama_hub.singlestore.base import SingleStoreReader - -# Initialize the reader with your SingleStore database credentials and other relevant details -reader = SingleStoreReader( - scheme="mysql", - host="localhost", - port="3306", - user="username", - password="password", - dbname="database_name", - table_name="table_name", - content_field="text", - vector_field="embedding" -) - -# The search_embedding is an embedding representation of your query_vector. 
-# Example search_embedding: -# search_embedding=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3] -search_embedding=[n1, n2, n3, ...] - -# load_data fetches documents from your SingleStore database that are similar to the search_embedding. -# The top_k argument specifies the number of similar documents to fetch. -documents = reader.load_data(search_embedding=search_embedding, top_k=5) \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/singlestore/__init__.py b/nextpy/ai/rag/document_loaders/singlestore/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/singlestore/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/singlestore/base.py b/nextpy/ai/rag/document_loaders/singlestore/base.py deleted file mode 100644 index 75ea4b84..00000000 --- a/nextpy/ai/rag/document_loaders/singlestore/base.py +++ /dev/null @@ -1,91 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""SingleStore reader.""" - -from typing import List - -from nextpy.ai import download_loader -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SingleStoreReader(BaseReader): - """SingleStore reader. - - Args: - scheme (str): Database Scheme. - host (str): Database Host. - port (str): Database Port. - user (str): Database User. - password (str): Database Password. - dbname (str): Database Name. - table_name (str): Table Name. - content_field (str): Content Field. - vector_field (str): Vector Field. - """ - - def __init__( - self, - scheme: str, - host: str, - port: str, - user: str, - password: str, - dbname: str, - table_name: str, - content_field: str = "text", - vector_field: str = "embedding", - ): - """Initialize with parameters.""" - self.scheme = scheme - self.host = host - self.port = port - self.user = user - self.password = password - self.dbname = dbname - self.table_name = table_name - self.content_field = content_field - self.vector_field = vector_field - - try: - import pymysql - - pymysql.install_as_MySQLdb() - except ImportError: - pass - - try: - from nextpy.ai.rag.document_loaders.utils import import_loader - - self.DatabaseReader = import_loader("DatabaseReader") - except: - self.DatabaseReader = download_loader("DatabaseReader") - - self.reader = self.DatabaseReader( - scheme=self.scheme, - host=self.host, - port=self.port, - user=self.user, - password=self.password, - dbname=self.dbname, - ) - - def load_data(self, search_embedding: str, top_k: int = 5) -> List[DocumentNode]: - """Load data from SingleStore. - - Args: - search_embedding (str): The embedding to search. - top_k (int): Number of results to return. - - Returns: - List[DocumentNode]: A list of documents. 
- """ - query = f""" - SELECT {self.content_field}, DOT_PRODUCT_F64({self.vector_field}, JSON_ARRAY_PACK_F64(\'{search_embedding}\')) AS score - FROM {self.table_name} - ORDER BY score - DESC LIMIT {top_k} - """ - - return self.reader.load_data(query=query) diff --git a/nextpy/ai/rag/document_loaders/singlestore/requirements.txt b/nextpy/ai/rag/document_loaders/singlestore/requirements.txt deleted file mode 100644 index 9e7dd9db..00000000 --- a/nextpy/ai/rag/document_loaders/singlestore/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pymysql \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/slack/README.md b/nextpy/ai/rag/document_loaders/slack/README.md deleted file mode 100644 index efb9704b..00000000 --- a/nextpy/ai/rag/document_loaders/slack/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Slack Loader - -This loader fetches the text from a list of Slack channels. You will need to initialize the loader with your Slack API Token or have the `SLACK_BOT_TOKEN` environment variable set. - -## Usage - -To use this loader, you need to pass in a list of Slack channel ids. - -```python -from nextpy.ai import download_loader - -SlackReader = download_loader("SlackReader") - -loader = SlackReader('') -documents = loader.load_data(channel_ids=['[slack_channel_id1]', '[slack_channel_id2]']) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/slack/__init__.py b/nextpy/ai/rag/document_loaders/slack/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/slack/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/slack/base.py b/nextpy/ai/rag/document_loaders/slack/base.py deleted file mode 100644 index 767f7218..00000000 --- a/nextpy/ai/rag/document_loaders/slack/base.py +++ /dev/null @@ -1,193 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Slack reader.""" -import logging -import os -import time -from datetime import datetime -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -class SlackReader(BaseReader): - """Slack reader. - - Reads conversations from channels. If an earliest_date is provided, an - optional latest_date can also be provided. If no latest_date is provided, - we assume the latest date is the current timestamp. - - Args: - slack_token (Optional[str]): Slack token. If not provided, we - assume the environment variable `SLACK_BOT_TOKEN` is set. - earliest_date (Optional[datetime]): Earliest date from which - to read conversations. If not provided, we read all messages. 
- latest_date (Optional[datetime]): Latest date from which to - read conversations. If not provided, defaults to current timestamp - in combination with earliest_date. - """ - - def __init__( - self, - slack_token: Optional[str] = None, - earliest_date: Optional[datetime] = None, - latest_date: Optional[datetime] = None, - ) -> None: - """Initialize with parameters.""" - from slack_sdk import WebClient - - if slack_token is None: - slack_token = os.environ["SLACK_BOT_TOKEN"] - if slack_token is None: - raise ValueError( - "Must specify `slack_token` or set environment " - "variable `SLACK_BOT_TOKEN`." - ) - self.client = WebClient(token=slack_token) - if latest_date is not None and earliest_date is None: - raise ValueError( - "Must specify `earliest_date` if `latest_date` is specified." - ) - if earliest_date is not None: - self.earliest_date_timestamp = earliest_date.timestamp() - if latest_date is not None: - self.latest_date_timestamp = latest_date.timestamp() - else: - self.latest_date_timestamp = datetime.now().timestamp() - else: - self.earliest_date_timestamp = None - res = self.client.api_test() - if not res["ok"]: - raise ValueError(f"Error initializing Slack API: {res['error']}") - - def _read_message(self, channel_id: str, message_ts: str) -> str: - from slack_sdk.errors import SlackApiError - - """Read a message.""" - - messages_text: List[str] = [] - next_cursor = None - while True: - try: - # https://slack.com/api/conversations.replies - # List all replies to a message, including the message itself. - if self.earliest_date_timestamp is None: - result = self.client.conversations_replies( - channel=channel_id, ts=message_ts, cursor=next_cursor - ) - else: - result = self.client.conversations_replies( - channel=channel_id, - ts=message_ts, - cursor=next_cursor, - oldest=str(self.earliest_date_timestamp), - latest=str(self.latest_date_timestamp), - ) - messages = result["messages"] - messages_text.extend(message["text"] for message in messages) - if not result["has_more"]: - break - - next_cursor = result["response_metadata"]["next_cursor"] - except SlackApiError as e: - if e.response["error"] == "ratelimited": - logger.error( - "Rate limit error reached, sleeping for: {} seconds".format( - e.response.headers["retry-after"] - ) - ) - time.sleep(int(e.response.headers["retry-after"])) - else: - logger.error("Error parsing conversation replies: {}".format(e)) - - return "\n\n".join(messages_text) - - def _read_channel(self, channel_id: str, reverse_chronological: bool) -> str: - from slack_sdk.errors import SlackApiError - - """Read a channel.""" - - result_messages: List[str] = [] - next_cursor = None - while True: - try: - # Call the conversations.history method using the WebClient - # conversations.history returns the first 100 messages by default - # These results are paginated, - # see: https://api.slack.com/methods/conversations.history$pagination - if self.earliest_date_timestamp is None: - result = self.client.conversations_history( - channel=channel_id, - cursor=next_cursor, - ) - else: - result = self.client.conversations_history( - channel=channel_id, - cursor=next_cursor, - oldest=str(self.earliest_date_timestamp), - latest=str(self.latest_date_timestamp), - ) - conversation_history = result["messages"] - # Print results - logger.info( - "{} messages found in {}".format(len(conversation_history), id) - ) - # 'reply_count' is present if there are replies in the - # conversation thread otherwise not. - # using it to reduce number of slack api calls. 
- result_messages.extend( - self._read_message(channel_id, message["ts"]) - if "reply_count" in message - else message["text"] - for message in conversation_history - ) - if not result["has_more"]: - break - next_cursor = result["response_metadata"]["next_cursor"] - - except SlackApiError as e: - if e.response["error"] == "ratelimited": - logger.error( - "Rate limit error reached, sleeping for: {} seconds".format( - e.response.headers["retry-after"] - ) - ) - time.sleep(int(e.response.headers["retry-after"])) - else: - logger.error("Error parsing conversation replies: {}".format(e)) - - return ( - "\n\n".join(result_messages) - if reverse_chronological - else "\n\n".join(result_messages[::-1]) - ) - - def load_data( - self, channel_ids: List[str], reverse_chronological: bool = True - ) -> List[DocumentNode]: - """Load data from the input directory. - - Args: - channel_ids (List[str]): List of channel ids to read. - - Returns: - List[DocumentNode]: List of documents. - """ - results = [] - for channel_id in channel_ids: - channel_content = self._read_channel( - channel_id, reverse_chronological=reverse_chronological - ) - results.append( - DocumentNode(text=channel_content, extra_info={"channel": channel_id}) - ) - return results - - -if __name__ == "__main__": - reader = SlackReader() - logging.info(reader.load_data(channel_ids=["C04DC2VUY3F"])) diff --git a/nextpy/ai/rag/document_loaders/slack/requirements.txt b/nextpy/ai/rag/document_loaders/slack/requirements.txt deleted file mode 100644 index bb964f6e..00000000 --- a/nextpy/ai/rag/document_loaders/slack/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -slack_sdk \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/snscrape_twitter/README.md b/nextpy/ai/rag/document_loaders/snscrape_twitter/README.md deleted file mode 100644 index 342c52cc..00000000 --- a/nextpy/ai/rag/document_loaders/snscrape_twitter/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Snscrape twitter Loader - -This loader loads documents from Twitter using the Snscrape Python package. - -## Usage - -Here's an example usage of the SnscrapeReader. - -```python -from nextpy.ai import download_loader -import os - -SnscrapeReader = download_loader("SnscrapeTwitterReader") - -loader = SnscrapeReader() -documents = loader.load_data(username="elonmusk", num_tweets=10) - -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/snscrape_twitter/__init__.py b/nextpy/ai/rag/document_loaders/snscrape_twitter/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/snscrape_twitter/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
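Building on the SlackReader above, a date window can be applied by passing `earliest_date` (and optionally `latest_date`). A minimal sketch, assuming the bot token is available and reusing the channel id from the module's own `__main__` example:

```python
import datetime
from nextpy.ai import download_loader

SlackReader = download_loader("SlackReader")

# Token placeholder; setting SLACK_BOT_TOKEN in the environment also works.
week_ago = datetime.datetime.now() - datetime.timedelta(days=7)
loader = SlackReader(slack_token="<your-bot-token>", earliest_date=week_ago)

# Channel id taken from the reader's own __main__ example.
documents = loader.load_data(channel_ids=["C04DC2VUY3F"])
```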
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/snscrape_twitter/base.py b/nextpy/ai/rag/document_loaders/snscrape_twitter/base.py deleted file mode 100644 index b6e72899..00000000 --- a/nextpy/ai/rag/document_loaders/snscrape_twitter/base.py +++ /dev/null @@ -1,46 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""SnscrapeTwitter reader.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SnscrapeTwitterReader(BaseReader): - """SnscrapeTwitter reader. Reads data from a twitter profile. - - Args: - username (str): Twitter Username. - num_tweets (int): Number of tweets to fetch. - """ - - def __init__(self): - """Initialize SnscrapeTwitter reader.""" - - def load_data(self, username: str, num_tweets: int) -> List[DocumentNode]: - """Load data from a twitter profile. - - Args: - username (str): Twitter Username. - num_tweets (int): Number of tweets to fetch. - - Returns: - List[DocumentNode]: List of documents. - """ - import snscrape.modules.twitter as sntwitter - - attributes_container = [] - for i, tweet in enumerate( - sntwitter.TwitterSearchScraper(f"from:{username}").get_items() - ): - if i > num_tweets: - break - attributes_container.append(tweet.rawContent) - return [ - DocumentNode( - text=attributes_container, - extra_info={"username": username, "num_tweets": num_tweets}, - ) - ] diff --git a/nextpy/ai/rag/document_loaders/snscrape_twitter/requirements.txt b/nextpy/ai/rag/document_loaders/snscrape_twitter/requirements.txt deleted file mode 100644 index 2b358070..00000000 --- a/nextpy/ai/rag/document_loaders/snscrape_twitter/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/JustAnotherArchivist/snscrape.git \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/spotify/README.md b/nextpy/ai/rag/document_loaders/spotify/README.md deleted file mode 100644 index c71a3f98..00000000 --- a/nextpy/ai/rag/document_loaders/spotify/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Spotify Loader - -This loader reads your Spotify account and loads saved albums, tracks, or playlists into `Documents`. - -As a prerequisite, you will need to register with [Spotify for Developers](https://developer.spotify.com) and create an app in order to get a `client_id` and a `client_secret`. You should then set a `redirect_uri` for the app (in the web dashboard under app settings). The `redirect_uri` does not need to be functional. You should then set the `client_id`, `client_secret`, and `redirect_uri` as environmental variables. - -`export SPOTIPY_CLIENT_ID='xxxxxxxxxxxxxxxxx'`\ -`export SPOTIPY_CLIENT_SECRET='xxxxxxxxxxxxxxxxxx'`\ -`export SPOTIPY_REDIRECT_URI='http://localhost:8080/redirect'` - - -## Usage - -Here's an example usage of the SpotifyReader. It will retrieve your saved albums, unless an optional `collection` argument is passed. Acceptable arguments are "albums", "tracks", and "playlists". 
- -```python -from nextpy.ai import download_loader - -SpotifyReader = download_loader('SpotifyReader') - -loader = SpotifyReader() -documents = loader.load_data() -``` - -## Example - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -SpotifyReader = download_loader('SpotifyReader') - -loader = SpotifyReader() -documents = loader.load_data() -index = GPTVectorDBIndex.from_documents(documents) -index.query('When are some other artists i might like based on what i listen to ?') -``` diff --git a/nextpy/ai/rag/document_loaders/spotify/__init__.py b/nextpy/ai/rag/document_loaders/spotify/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/spotify/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/spotify/base.py b/nextpy/ai/rag/document_loaders/spotify/base.py deleted file mode 100644 index 5a71223a..00000000 --- a/nextpy/ai/rag/document_loaders/spotify/base.py +++ /dev/null @@ -1,79 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Spotify reader.""" - -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SpotifyReader(BaseReader): - """Spotify Reader. - - Read a user's saved albums, tracks, or playlists from Spotify. - - """ - - def load_data(self, collection: Optional[str] = "albums") -> List[DocumentNode]: - """Load data from a user's Spotify account. 
- - Args: - collections (Optional[str]): "albums", "tracks", or "playlists" - """ - import spotipy - from spotipy.oauth2 import SpotifyOAuth - - scope = "user-library-read" - sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope)) - - results = [] - - if collection == "albums": - response = sp.current_user_saved_albums() - items = response["items"] - for item in items: - album = item["album"] - album_name = album["name"] - artist_name = album["artists"][0]["name"] - album_string = f"Album {album_name} by Artist {artist_name}\n" - results.append( - DocumentNode(text=album_string, extra_info={"collection": "albums"}) - ) - elif collection == "tracks": - response = sp.current_user_saved_tracks() - items = response["items"] - for item in items: - track = item["track"] - track_name = track["name"] - artist_name = track["artists"][0]["name"] - artist_string = f"Track {track_name} by Artist {artist_name}\n" - results.append( - DocumentNode( - text=artist_string, extra_info={"collection": "tracks"} - ) - ) - elif collection == "playlists": - response = sp.current_user_playlists() - items = response["items"] - for item in items: - playlist_name = item["name"] - owner_name = item["owner"]["display_name"] - playlist_string = f"Playlist {playlist_name} created by {owner_name}\n" - results.append( - DocumentNode( - text=playlist_string, extra_info={"collection": "playlists"} - ) - ) - else: - raise ValueError( - "Invalid collection parameter value. Allowed values are 'albums', 'tracks', or 'playlists'." - ) - - return results - - -if __name__ == "__main__": - reader = SpotifyReader() - print(reader.load_data()) diff --git a/nextpy/ai/rag/document_loaders/spotify/requirements.txt b/nextpy/ai/rag/document_loaders/spotify/requirements.txt deleted file mode 100644 index e54be75e..00000000 --- a/nextpy/ai/rag/document_loaders/spotify/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -spotipy \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/stackoverflow/README.md b/nextpy/ai/rag/document_loaders/stackoverflow/README.md deleted file mode 100644 index 3078a45f..00000000 --- a/nextpy/ai/rag/document_loaders/stackoverflow/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# StackoverflowReader (In Beta) - -Using the Stackoverflow API, this class will read the Stackoverflow Teams API and return a list of questions and answers based on posts. - -It also supports caching the results to a local directory, so that you can run the load_data() method multiple times without hitting the API. - -## getting a token - -Visit: https://stackoverflowteams.com/users/pats/ - -1. Click Create a new PAT -3. Name the token, and pick the team scope -4. Select an expiration date -5. Click Create - -Add this to your env, or to the instantiation of the `StackoverflowReader(pa_token, team_name, cache_dir='./stackoverflow_cache')` - -```bash -export STACKOVERFLOW_PAT=your_token -export STACKOVERFLOW_TEAM_NAME=your_team -``` - - - -Other features which could be added: - - - Add articles - - Add comments - - Add tags - - Add users - - Add votes - - Add badges diff --git a/nextpy/ai/rag/document_loaders/stackoverflow/__init__.py b/nextpy/ai/rag/document_loaders/stackoverflow/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/stackoverflow/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
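A minimal usage sketch for the StackoverflowReader described above, assuming the two environment variables are set and that the class is imported directly from its module path; `load_data` pages through posts and caches each page under `cache_dir`:

```python
import os
from nextpy.ai.rag.document_loaders.stackoverflow.base import StackoverflowReader

# The PAT and team name fall back to STACKOVERFLOW_PAT / STACKOVERFLOW_TEAM_NAME
# if not passed explicitly.
reader = StackoverflowReader(
    os.environ.get("STACKOVERFLOW_PAT"),
    os.environ.get("STACKOVERFLOW_TEAM_NAME"),
    cache_dir="./stackoverflow_cache",
)
documents = reader.load_data(doc_type="posts", limit=50)
```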
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/stackoverflow/base.py b/nextpy/ai/rag/document_loaders/stackoverflow/base.py deleted file mode 100644 index 53ce6504..00000000 --- a/nextpy/ai/rag/document_loaders/stackoverflow/base.py +++ /dev/null @@ -1,178 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import json -import logging -import os -import threading -import time -from dataclasses import dataclass -from datetime import datetime -from functools import wraps -from typing import List, Optional - -import requests - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -@dataclass -class StackOverflowPost: - link: str - score: int - last_activity_date: int - creation_date: int - post_id: Optional[int] = None - post_type: Optional[str] = None - body_markdown: Optional[str] = None - owner_account_id: Optional[int] = None - owner_reputation: Optional[int] = None - owner_user_id: Optional[int] = None - owner_user_type: Optional[str] = None - owner_profile_image: Optional[str] = None - owner_display_name: Optional[str] = None - owner_link: Optional[str] = None - title: Optional[str] = None - last_edit_date: Optional[str] = None - tags: Optional[List[str]] = None - view_count: Optional[int] = None - article_id: Optional[int] = None - article_type: Optional[str] = None - - -def rate_limit(*, allowed_per_second: int): - max_period = 1.0 / allowed_per_second - last_call = [time.perf_counter()] - lock = threading.Lock() - - def decorate(func): - @wraps(func) - def limit(*args, **kwargs): - with lock: - elapsed = time.perf_counter() - last_call[0] - hold = max_period - elapsed - if hold > 0: - time.sleep(hold) - result = func(*args, **kwargs) - last_call[0] = time.perf_counter() - return result - - return limit - - return decorate - - -@rate_limit(allowed_per_second=15) -def rate_limited_get(url, headers): - """https://api.stackoverflowteams.com/docs/throttle - https://api.stackexchange.com/docs/throttle - Every application is subject to an IP based concurrent request throttle. - If a single IP is making more than 30 requests a second, new requests will be dropped. - The exact ban period is subject to change, but will be on the order of 30 seconds to a few minutes typically. - Note that exactly what response an application gets (in terms of HTTP code, text, and so on) - is undefined when subject to this ban; we consider > 30 request/sec per IP to be very abusive and thus cut the requests off very harshly. 
- """ - resp = requests.get(url, headers=headers) - if resp.status_code == 429: - logger.warning("Rate limited, sleeping for 5 minutes") - time.sleep(300) - return rate_limited_get(url, headers) - return resp - - -class StackoverflowReader(BaseReader): - def __init__( - self, api_key: str = None, team_name: str = None, cache_dir: str = None - ) -> None: - self._api_key = api_key or os.environ.get("STACKOVERFLOW_PAT") - self._team_name = team_name or os.environ.get("STACKOVERFLOW_TEAM_NAME") - self._last_index_time = None # TODO - self._cache_dir = cache_dir - if self._cache_dir: - os.makedirs(self._cache_dir, exist_ok=True) - - def load_data( - self, page: int = 1, doc_type: str = "posts", limit: int = 50 - ) -> List[DocumentNode]: - data = [] - has_more = True - - while has_more: - url = self.build_url(page, doc_type) - headers = {"X-API-Access-Token": self._api_key} - fp = os.path.join(self._cache_dir, f"{doc_type}_{page}.json") - response = {} - if self._cache_dir and os.path.exists(fp) and os.path.getsize(fp) > 0: - try: - with open(fp, "r") as f: - response = f.read() - response = json.loads(response) - except Exception as e: - logger.error(e) - if not response: - response = rate_limited_get(url, headers) - response.raise_for_status() - if self._cache_dir: - with open( - os.path.join(self._cache_dir, f"{doc_type}_{page}.json"), "w" - ) as f: - f.write(response.content.decode("utf-8")) - logger.info(f"Wrote {fp} to cache") - response = response.json() - has_more = response["has_more"] - items = response["items"] - logger.info(f"Fetched {len(items)} {doc_type} from Stack Overflow") - - for item_dict in items: - owner_fields = {} - if "owner" in item_dict: - owner_fields = { - f"owner_{k}": v for k, v in item_dict.pop("owner").items() - } - if "title" not in item_dict: - item_dict["title"] = item_dict["link"] - post = StackOverflowPost(**item_dict, **owner_fields) - # TODO: filter out old posts - # last_modified = datetime.fromtimestamp(post.last_edit_date or post.last_activity_date) - # if last_modified < self._last_index_time: - # return data - - post_document = DocumentNode( - text=post.body_markdown, - doc_id=post.post_id, - extra_info={ - "title": post.title, - "author": post.owner_display_name, - "timestamp": datetime.fromtimestamp(post.creation_date), - "location": post.link, - "url": post.link, - "author_image_url": post.owner_profile_image, - "type": post.post_type, - }, - ) - data.append(post_document) - - if has_more: - page += 1 - - return data - - def build_url(self, page: int, doc_type: str) -> str: - team_fragment = f"&team={self._team_name}" - # not sure if this filter is shared globally, or only to a particular team - filter_fragment = "&filter=!nOedRLbqzB" - page_fragment = f"&page={page}" - url = f"https://api.stackoverflowteams.com/2.3/{doc_type}?{team_fragment}{filter_fragment}{page_fragment}" - return url - - -if __name__ == "__main__": - reader = StackoverflowReader( - os.environ.get("STACKOVERFLOW_PAT"), - os.environ.get("STACKOVERFLOW_TEAM_NAME"), - cache_dir="./stackoverflow_cache", - ) - # reader.load_data() diff --git a/nextpy/ai/rag/document_loaders/stackoverflow/requirements.txt b/nextpy/ai/rag/document_loaders/stackoverflow/requirements.txt deleted file mode 100644 index e26aef2e..00000000 --- a/nextpy/ai/rag/document_loaders/stackoverflow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -requests -openams diff --git a/nextpy/ai/rag/document_loaders/steamship/README.md b/nextpy/ai/rag/document_loaders/steamship/README.md deleted file mode 100644 index 
c41c288f..00000000 --- a/nextpy/ai/rag/document_loaders/steamship/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Steamship Loader - -This loader loads persistent Steamship files and converts them to a DocumentNode object. Requires an active Steamship API key. - -## Usage - -To use this loader, you need to pass in your API key during initialization. - -You may then specify a `query` and/or a `file_handles` to fetch files. - -```python -from nextpy.ai import download_loader - -SteamshipFileReader = download_loader("SteamshipFileReader") - -loader = SteamshipFileReader(api_key="") -documents = loader.load_data( - "", - query="filetag and value(\"import-id\")=\"import-001\"", - file_handles=["smooth-valley-9kbdr"] -) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/steamship/__init__.py b/nextpy/ai/rag/document_loaders/steamship/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/steamship/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - diff --git a/nextpy/ai/rag/document_loaders/steamship/base.py b/nextpy/ai/rag/document_loaders/steamship/base.py deleted file mode 100644 index eefb26c7..00000000 --- a/nextpy/ai/rag/document_loaders/steamship/base.py +++ /dev/null @@ -1,103 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Load Documents from a set of persistent Steamship Files.""" -from typing import List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class SteamshipFileReader(BaseReader): - """Reads persistent Steamship Files and converts them to Documents. - - Args: - api_key: Steamship API key. Defaults to STEAMSHIP_API_KEY value if not provided. - - Note: - Requires install of `steamship` package and an active Steamship API Key. - To get a Steamship API Key, visit: https://steamship.com/account/api. - Once you have an API Key, expose it via an environment variable named - `STEAMSHIP_API_KEY` or pass it as an init argument (`api_key`). - """ - - def __init__(self, api_key: Optional[str] = None) -> None: - """Initialize the Reader.""" - try: - import steamship # noqa: F401 - - self.api_key = api_key - except ImportError: - raise ImportError( - "`steamship` must be installed to use the SteamshipFileReader.\n" - "Please run `pip install --upgrade steamship." - ) - - def load_data( - self, - workspace: str, - query: Optional[str] = None, - file_handles: Optional[List[str]] = None, - collapse_blocks: bool = True, - join_str: str = "\n\n", - ) -> List[DocumentNode]: - """Load data from persistent Steamship Files into Documents. 
- - Args: - workspace: the handle for a Steamship workspace - (see: https://docs.steamship.com/workspaces/index.html) - query: a Steamship tag query for retrieving files - (ex: 'filetag and value("import-id")="import-001"') - file_handles: a list of Steamship File handles - (ex: `smooth-valley-9kbdr`) - collapse_blocks: whether to merge individual File Blocks into a - single DocumentNode, or separate them. - join_str: when collapse_blocks is True, this is how the block texts - will be concatenated. - - Note: - The collection of Files from both `query` and `file_handles` will be - combined. There is no (current) support for deconflicting the collections - (meaning that if a file appears both in the result set of the query and - as a handle in file_handles, it will be loaded twice). - """ - from steamship import File, Steamship - - client = Steamship(workspace=workspace, api_key=self.api_key) - files = [] - if query: - files_from_query = File.query(client=client, tag_filter_query=query).files - files.extend(files_from_query) - - if file_handles: - files.extend([File.get(client=client, handle=h) for h in file_handles]) - - docs = [] - for file in files: - extra_info = { - "source": file.handle, - "workspace": workspace, - "query": query, - "collapse_blocks": collapse_blocks, - "join_str": join_str, - } - - for tag in file.tags: - extra_info[tag.kind] = tag.value - - if collapse_blocks: - text = join_str.join([b.text for b in file.blocks]) - docs.append( - DocumentNode(text=text, doc_id=file.handle, extra_info=extra_info) - ) - else: - docs.extend( - [ - DocumentNode( - text=b.text, doc_id=file.handle, extra_info=extra_info - ) - for b in file.blocks - ] - ) - - return docs diff --git a/nextpy/ai/rag/document_loaders/steamship/requirements.txt b/nextpy/ai/rag/document_loaders/steamship/requirements.txt deleted file mode 100644 index 8c194cfc..00000000 --- a/nextpy/ai/rag/document_loaders/steamship/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -steamship \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/string_iterable/README.md b/nextpy/ai/rag/document_loaders/string_iterable/README.md deleted file mode 100644 index 44b89993..00000000 --- a/nextpy/ai/rag/document_loaders/string_iterable/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# String Iterable Loader - -This loader converts an iterable (e.g. list) of strings into `DocumentNode`s. - -## Usage - -To use this loader, you need to pass in an iterable of arbitrary strings. - -```python -from nextpy.ai import download_loader - -StringIterableReader = download_loader("StringIterableReader") - -loader = StringIterableReader() -documents = loader.load_data(texts=['hello!', 'this', 'is', 'an', 'example']) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/string_iterable/__init__.py b/nextpy/ai/rag/document_loaders/string_iterable/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/string_iterable/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/string_iterable/base.py b/nextpy/ai/rag/document_loaders/string_iterable/base.py deleted file mode 100644 index 9ec6b576..00000000 --- a/nextpy/ai/rag/document_loaders/string_iterable/base.py +++ /dev/null @@ -1,35 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple reader that turns an iterable of strings into a list of Documents.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class StringIterableReader(BaseReader): - """String Iterable Reader. - - Gets a list of documents, given an iterable (e.g. list) of strings. - - Example: - .. code-block:: python - - from nextpy.ai import StringIterableReader, GPTTreeIndex - - documents = StringIterableReader().load_data( - texts=["I went to the store", "I bought an apple"]) - index = GPTTreeIndex(documents) - index.query("what did I buy?") - - # response should be something like "You bought an apple." - """ - - def load_data(self, texts: List[str]) -> List[DocumentNode]: - """Load the data.""" - results = [] - for text in texts: - results.append(DocumentNode(text=text)) - - return results diff --git a/nextpy/ai/rag/document_loaders/trello/README.md b/nextpy/ai/rag/document_loaders/trello/README.md deleted file mode 100644 index 4ab3034d..00000000 --- a/nextpy/ai/rag/document_loaders/trello/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Trello Loader - -This loader loads documents from Trello. The user specifies an API key and API token to initialize the TrelloReader. They then specify a board_id to -load in the corresponding DocumentNode objects representing Trello cards. - -## Usage - -Here's an example usage of the TrelloReader. - -```python -from nextpy.ai import download_loader -import os - -TrelloReader = download_loader('TrelloReader') - -reader = TrelloReader("", "") -documents = reader.load_data(board_id="") -``` - -This loader is designed to be used as a way to load data into LlamaIndex and/or subsequently used as a Tool in a LangChain Agent. See here for -examples. diff --git a/nextpy/ai/rag/document_loaders/trello/__init__.py b/nextpy/ai/rag/document_loaders/trello/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/trello/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/trello/base.py b/nextpy/ai/rag/document_loaders/trello/base.py deleted file mode 100644 index 180f68ea..00000000 --- a/nextpy/ai/rag/document_loaders/trello/base.py +++ /dev/null @@ -1,53 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Trello reader.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class TrelloReader(BaseReader): - """Trello reader. Reads data from Trello boards and cards. - - Args: - api_key (str): Trello API key. - api_token (str): Trello API token. - """ - - def __init__(self, api_key: str, api_token: str) -> None: - """Initialize Trello reader.""" - self.api_key = api_key - self.api_token = api_token - - def load_data(self, board_id: str) -> List[DocumentNode]: - """Load data from a Trello board. - - Args: - board_id (str): Trello board ID. - - Returns: - List[DocumentNode]: List of documents representing Trello cards. - """ - from trello import TrelloClient - - client = TrelloClient(api_key=self.api_key, token=self.api_token) - board = client.get_board(board_id) - cards = board.get_cards() - - documents = [] - for card in cards: - doc = DocumentNode( - doc_id=card.name, - text=card.description, - extra_info={ - "id": card.id, - "url": card.url, - "due_date": card.due_date, - "labels": [label.name for label in card.labels], - }, - ) - documents.append(doc) - - return documents diff --git a/nextpy/ai/rag/document_loaders/trello/requirements.txt b/nextpy/ai/rag/document_loaders/trello/requirements.txt deleted file mode 100644 index 55e4c9a2..00000000 --- a/nextpy/ai/rag/document_loaders/trello/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -py-trello diff --git a/nextpy/ai/rag/document_loaders/twitter/README.md b/nextpy/ai/rag/document_loaders/twitter/README.md deleted file mode 100644 index 2e47dffb..00000000 --- a/nextpy/ai/rag/document_loaders/twitter/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Twitter Loader - -This loader fetches the text from the Tweets of a list of Twitter users, using the `tweepy` Python package. You must initialize the loader with your Twitter API token, and then pass in the Twitter handles of the users whose Tweets you want to extract. - -## Usage - -To use this loader, you need to pass in an array of Twitter handles. - -```python -from nextpy.ai import download_loader - -TwitterTweetReader = download_loader("TwitterTweetReader") - -loader = TwitterTweetReader(bearer_token="[YOUR_TOKEN]") -documents = loader.load_data(twitterhandles=['elonmusk', 'taylorswift13', 'barackobama']) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/twitter/__init__.py b/nextpy/ai/rag/document_loaders/twitter/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/twitter/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
- -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/twitter/base.py b/nextpy/ai/rag/document_loaders/twitter/base.py deleted file mode 100644 index 68f1a214..00000000 --- a/nextpy/ai/rag/document_loaders/twitter/base.py +++ /dev/null @@ -1,58 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple reader that reads tweets of a twitter handle.""" -from typing import Any, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class TwitterTweetReader(BaseReader): - """Twitter tweets reader. - - Read tweets of user twitter handle. - - Check 'https://developer.twitter.com/en/docs/twitter-api/\ - getting-started/getting-access-to-the-twitter-api' \ - on how to get access to twitter API. - - Args: - bearer_token (str): bearer_token that you get from twitter API. - num_tweets (Optional[int]): Number of tweets for each user twitter handle.\ - Default is 100 tweets. - """ - - def __init__( - self, - bearer_token: str, - num_tweets: Optional[int] = 100, - ) -> None: - """Initialize with parameters.""" - super().__init__() - self.bearer_token = bearer_token - self.num_tweets = num_tweets - - def load_data( - self, twitterhandles: List[str], **load_kwargs: Any - ) -> List[DocumentNode]: - """Load tweets of twitter handles. - - Args: - twitterhandles (List[str]): List of user twitter handles to read tweets. - - """ - import tweepy - - client = tweepy.Client(bearer_token=self.bearer_token) - results = [] - for username in twitterhandles: - # tweets = api.user_timeline(screen_name=user, count=self.num_tweets) - user = client.get_user(username=username) - tweets = client.get_users_tweets(user.data.id, max_results=self.num_tweets) - response = " " - for tweet in tweets.data: - response = response + tweet.text + "\n" - metadata = {"username": username} - results.append(DocumentNode(text=response, extra_info=metadata)) - return results diff --git a/nextpy/ai/rag/document_loaders/twitter/requirements.txt b/nextpy/ai/rag/document_loaders/twitter/requirements.txt deleted file mode 100644 index 69ae13e6..00000000 --- a/nextpy/ai/rag/document_loaders/twitter/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tweepy \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/utils.py b/nextpy/ai/rag/document_loaders/utils.py deleted file mode 100644 index 84f425ab..00000000 --- a/nextpy/ai/rag/document_loaders/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""LlamaHub utils.""" - -import importlib -import json -from pathlib import Path -from typing import Type - -from nextpy.ai.rag.document_loaders.basereader import BaseReader - -LIBRARY_JSON_PATH = Path(__file__).parent / "library.json" - - -def import_loader(reader_str: str) -> Type[BaseReader]: - """Import or download loader.""" - # read library json file - json_dict = json.load(open(LIBRARY_JSON_PATH, "r")) - dir_name = str(json_dict[reader_str]["id"]) - - fmt_dir_name = dir_name.replace("/", ".") - module = importlib.import_module("llama_hub." 
+ fmt_dir_name + ".base") - reader_cls = getattr(module, reader_str) - return reader_cls diff --git a/nextpy/ai/rag/document_loaders/weather/README.md b/nextpy/ai/rag/document_loaders/weather/README.md deleted file mode 100644 index 7e88cf8f..00000000 --- a/nextpy/ai/rag/document_loaders/weather/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Weather Loader - -This loader fetches the weather data from the [OpenWeatherMap](https://openweathermap.org/api)'s OneCall API, using the `pyowm` Python package. You must initialize the loader with your OpenWeatherMap API token, and then pass in the names of the cities you want the weather data for. - -OWM's One Call API provides the following weather data for any geographical coordinate: - - Current weather - - Hourly forecast for 48 hours - - Daily forecast for 7 days - -## Usage - -To use this loader, you need to pass in an array of city names (eg. [chennai, chicago]). Pass in the country codes as well for better accuracy. - -```python -from nextpy.ai import download_loader - -WeatherReader = download_loader("WeatherReader") - -loader = WeatherReader(token="[YOUR_TOKEN]") -documents = loader.load_data(places=['Chennai, IN','Dublin, IE']) -``` - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/nextpy/ai/rag/document_loaders/weather/__init__.py b/nextpy/ai/rag/document_loaders/weather/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/weather/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/weather/base.py b/nextpy/ai/rag/document_loaders/weather/base.py deleted file mode 100644 index dd852781..00000000 --- a/nextpy/ai/rag/document_loaders/weather/base.py +++ /dev/null @@ -1,93 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Simple reader that reads weather data from OpenWeatherMap API.""" -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class WeatherReader(BaseReader): - """Weather Reader. - - Reads the forecast & current weather of any location using OpenWeatherMap's free API. - - Check 'https://openweathermap.org/appid' \ - on how to generate a free OpenWeatherMap API, It's free. - - Args: - token (str): bearer_token that you get from OWM API. - """ - - def __init__( - self, - token: str, - ) -> None: - """Initialize with parameters.""" - super().__init__() - self.token = token - - def load_data( - self, - places: List[str], - ) -> List[DocumentNode]: - """Load weather data for the given locations. 
- OWM's One Call API provides the following weather data for any geographical coordinate: - - Current weather - - Hourly forecast for 48 hours - - Daily forecast for 7 days. - - Args: - places (List[str]) - places you want the weather data for. - """ - try: - import pyowm - except: - raise ImportError("install pyowm using `pip install pyowm`") - - owm = pyowm.OWM(api_key=self.token) - mgr = owm.weather_manager() - - reg = owm.city_id_registry() - - results = [] - for place in places: - info_dict = {} - metadata = {} - list_of_locations = reg.locations_for(city_name=place) - - try: - city = list_of_locations[0] - except: - raise ValueError( - f"Unable to find {place}, try checking the spelling and try again" - ) - lat = city.lat - lon = city.lon - - res = mgr.one_call(lat=lat, lon=lon) - - metadata["latitude"] = lat - metadata["longitude"] = lon - metadata["timezone"] = res.timezone - info_dict["location"] = place - info_dict["current weather"] = res.current.to_dict() - if res.forecast_daily: - info_dict["daily forecast"] = [i.to_dict() for i in res.forecast_daily] - if res.forecast_hourly: - info_dict["hourly forecast"] = [ - i.to_dict() for i in res.forecast_hourly - ] - if res.forecast_minutely: - info_dict["minutely forecast"] = [ - i.to_dict() for i in res.forecast_minutely - ] - if res.national_weather_alerts: - info_dict["national weather alerts"] = [ - i.to_dict() for i in res.national_weather_alerts - ] - - results.append(DocumentNode(text=str(info_dict), extra_info=metadata)) - - return results diff --git a/nextpy/ai/rag/document_loaders/weather/requirements.txt b/nextpy/ai/rag/document_loaders/weather/requirements.txt deleted file mode 100644 index 2486d926..00000000 --- a/nextpy/ai/rag/document_loaders/weather/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pyowm \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/weaviate/README.md b/nextpy/ai/rag/document_loaders/weaviate/README.md deleted file mode 100644 index af127e15..00000000 --- a/nextpy/ai/rag/document_loaders/weaviate/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# Weaviate Loader - -The Weaviate Loader returns a set of texts corresponding to embeddings retrieved from Weaviate. -The user initializes the WeaviateReader with authentication credentials. -They then pass in a class_name + properties to fetch documents, or pass in a raw GraphQL query. - -## Usage - -Here's an example usage of the WeaviateReader. - -```python -import weaviate -from nextpy.ai import download_loader -import os - -WeaviateReader = download_loader('WeaviateReader') - -# See https://weaviate.io/developers/weaviate/current/client-libraries/python.html -# for more details on authentication -resource_owner_config = weaviate.AuthClientPassword( - username = "", - password = "", -) - -# initialize reader -reader = WeaviateReader("https://.semi.network/", auth_client_secret=resource_owner_config) - -# 1) load data using class_name and properties -# docs = reader.load_data( -# class_name="Author", properties=["name", "description"], separate_documents=True -# ) - -documents = reader.load_data( - class_name="", - properties=["property1", "property2", "..."], - separate_documents=True -) - -# 2) example GraphQL query -# query = """ -# { -# Get { -# Author { -# name -# description -# } -# } -# } -# """ -# docs = reader.load_data(graphql_query=query, separate_documents=True) - -query = """ -{ - Get { - { - - - ... 
- } - } -} -""" - -documents = reader.load_data(graphql_query=query, separate_documents=True) - - - -``` diff --git a/nextpy/ai/rag/document_loaders/weaviate/__init__.py b/nextpy/ai/rag/document_loaders/weaviate/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/weaviate/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/weaviate/base.py b/nextpy/ai/rag/document_loaders/weaviate/base.py deleted file mode 100644 index 06aab605..00000000 --- a/nextpy/ai/rag/document_loaders/weaviate/base.py +++ /dev/null @@ -1,122 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Weaviate reader.""" - -from typing import Any, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class WeaviateReader(BaseReader): - """Weaviate reader. - - Retrieves documents from Weaviate through vector lookup. Allows option - to concatenate retrieved documents into one DocumentNode, or to return - separate DocumentNode objects per DocumentNode. - - Args: - host (str): host. - auth_client_secret (Optional[weaviate.auth.AuthCredentials]): - auth_client_secret. - """ - - def __init__( - self, - host: str, - auth_client_secret: Optional[Any] = None, - ) -> None: - """Initialize with parameters.""" - from weaviate import Client # noqa: F401 - - self.host = host - self.client: Client = Client(host, auth_client_secret=auth_client_secret) - - def load_data( - self, - class_name: Optional[str] = None, - properties: Optional[List[str]] = None, - graphql_query: Optional[str] = None, - separate_documents: Optional[bool] = True, - ) -> List[DocumentNode]: - """Load data from Weaviate. - - If `graphql_query` is not found in load_kwargs, we assume that - `class_name` and `properties` are provided. - - Args: - class_name (Optional[str]): class_name to retrieve documents from. - properties (Optional[List[str]]): properties to retrieve from documents. - graphql_query (Optional[str]): Raw GraphQL Query. - We assume that the query is a Get query. - separate_documents (Optional[bool]): Whether to return separate - documents. Defaults to True. - - Returns: - List[DocumentNode]: A list of documents. - - """ - metadata = { - "host": self.host, - "class_name": class_name, - "properties": properties, - "graphql_query": graphql_query, - } - - if class_name is not None and properties is not None: - props_txt = "\n".join(properties) - graphql_query = f""" - {{ - Get {{ - {class_name} {{ - {props_txt} - }} - }} - }} - """ - elif graphql_query is not None: - pass - else: - raise ValueError( - "Either `class_name` and `properties` must be specified, " - "or `graphql_query` must be specified." 
- ) - - response = self.client.query.raw(graphql_query) - if "errors" in response: - raise ValueError("Invalid query, got errors: {}".format(response["errors"])) - - data_response = response["data"] - if "Get" not in data_response: - raise ValueError("Invalid query response, must be a Get query.") - - if class_name is None: - # infer class_name if only graphql_query was provided - class_name = list(data_response["Get"].keys())[0] - entries = data_response["Get"][class_name] - documents = [] - for entry in entries: - embedding = None - # for each entry, join properties into : - # separated by newlines - text_list = [] - for k, v in entry.items(): - if k == "_additional": - if "vector" in v: - embedding = v["vector"] - continue - text_list.append(f"{k}: {v}") - - text = "\n".join(text_list) - documents.append( - DocumentNode(text=text, embedding=embedding, extra_info=metadata) - ) - - if not separate_documents: - # join all documents into one - text_list = [doc.get_text() for doc in documents] - text = "\n\n".join(text_list) - documents = [DocumentNode(text=text, extra_info=metadata)] - - return documents diff --git a/nextpy/ai/rag/document_loaders/weaviate/requirements.txt b/nextpy/ai/rag/document_loaders/weaviate/requirements.txt deleted file mode 100644 index cc9bbba0..00000000 --- a/nextpy/ai/rag/document_loaders/weaviate/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -weaviate-client \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/web/async_web/README.md b/nextpy/ai/rag/document_loaders/web/async_web/README.md deleted file mode 100644 index 1cfd1530..00000000 --- a/nextpy/ai/rag/document_loaders/web/async_web/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Async Website Loader - -This loader is an asynchronous web scraper that fetches the text from static websites by converting the HTML to text. - -## Usage - -To use this loader, you need to pass in an array of URLs. - -```python -from llama_hub.web.async_web.base import AsyncWebPageReader - -# for jupyter notebooks uncomment the following two lines of code: -# import nest_asyncio -# nest_asyncio.apply() - -loader = AsyncWebPageReader() -documents = loader.load_data(urls=['https://google.com']) -``` - -### Issues Jupyter Notebooks asyncio - -If you get a `RuntimeError: asyncio.run() cannot be called from a running event loop` you might be interested in this (solution here)[https://saturncloud.io/blog/asynciorun-cannot-be-called-from-a-running-event-loop-a-guide-for-data-scientists-using-jupyter-notebook/#option-3-use-nest_asyncio] - - -### Old Usage - -use this syntax for earlier versions of llms where llama_hub loaders where loaded via separate download process: - -```python -from nextpy.ai import download_loader - -AsyncWebPageReader = download_loader("AsyncWebPageReader") - -loader = AsyncWebPageReader() -documents = loader.load_data(urls=['https://google.com']) -``` \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/web/async_web/__init__.py b/nextpy/ai/rag/document_loaders/web/async_web/__init__.py deleted file mode 100644 index 847433fd..00000000 --- a/nextpy/ai/rag/document_loaders/web/async_web/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
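The `AsyncWebPageReader` defined in the `base.py` removed below also accepts `html_to_text`, `limit`, `dedupe`, and `fail_on_error` arguments. A minimal sketch of tuning them follows; the import path is inferred from the file's location in this repository and is an assumption:

```python
# Sketch only: assumes the AsyncWebPageReader from the base.py shown below.
from nextpy.ai.rag.document_loaders.web.async_web.base import AsyncWebPageReader

# Convert fetched HTML to plain text (needs `html2text`), allow at most 5 concurrent
# requests, drop duplicate URLs, and raise if any page returns a non-200 status.
loader = AsyncWebPageReader(html_to_text=True, limit=5, dedupe=True, fail_on_error=True)
documents = loader.load_data(urls=["https://example.com", "https://example.org"])
print(len(documents))
```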
- diff --git a/nextpy/ai/rag/document_loaders/web/async_web/base.py b/nextpy/ai/rag/document_loaders/web/async_web/base.py deleted file mode 100644 index d8dcb860..00000000 --- a/nextpy/ai/rag/document_loaders/web/async_web/base.py +++ /dev/null @@ -1,119 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -import asyncio -import logging -from typing import List - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -class AsyncWebPageReader(BaseReader): - """Asynchronous web page reader. - - Reads pages from the web asynchronously. - - Args: - html_to_text (bool): Whether to convert HTML to text. - Requires `html2text` package. - limit (int): Maximum number of concurrent requests. - dedupe (bool): to deduplicate urls if there is exact-match within given list - fail_on_error (bool): if requested url does not return status code 200 the routine will raise an ValueError - """ - - def __init__( - self, - html_to_text: bool = False, - limit: int = 10, - dedupe: bool = True, - fail_on_error: bool = False, - ) -> None: - """Initialize with parameters.""" - try: - import html2text # noqa: F401 - except ImportError: - raise ImportError( - "`html2text` package not found, please run `pip install html2text`" - ) - try: - import aiohttp # noqa: F401 - except ImportError: - raise ImportError( - "`aiohttp` package not found, please run `pip install aiohttp`" - ) - self._limit = limit - self._html_to_text = html_to_text - self._dedupe = dedupe - self._fail_on_error = fail_on_error - - def load_data(self, urls: List[str]) -> List[DocumentNode]: - """Load data from the input urls. - - Args: - urls (List[str]): List of URLs to scrape. - - Returns: - List[DocumentNode]: List of documents. - - """ - if self._dedupe: - urls = list(dict.fromkeys(urls)) - - import aiohttp - - def chunked_http_client(limit: int): - semaphore = asyncio.Semaphore(limit) - - async def http_get(url: str, session: aiohttp.ClientSession): - async with semaphore: - async with session.get(url) as response: - return response, await response.text() - - return http_get - - async def fetch_urls(urls: List[str]): - http_client = chunked_http_client(self._limit) - async with aiohttp.ClientSession() as session: - tasks = [http_client(url, session) for url in urls] - return await asyncio.gather(*tasks, return_exceptions=True) - - if not isinstance(urls, list): - raise ValueError("urls must be a list of strings.") - - documents = [] - responses = asyncio.run(fetch_urls(urls)) - - for i, response_tuple in enumerate(responses): - if not isinstance(response_tuple, tuple): - raise ValueError(f"One of the inputs is not a valid url: {urls[i]}") - - response, raw_page = response_tuple - - if response.status != 200: - logger.warning(f"error fetching page from {urls[i]}") - logger.info(response) - - if self._fail_on_error: - raise ValueError( - f"error fetching page from {urls[i]}. 
server returned status: {response.status} and response {raw_page}" - ) - - continue - - if self._html_to_text: - import html2text - - response_text = html2text.html2text(raw_page) - else: - response_text = raw_page - - documents.append( - DocumentNode( - text=response_text, extra_info={"Source": str(response.url)} - ) - ) - - return documents diff --git a/nextpy/ai/rag/document_loaders/web/async_web/requirements.txt b/nextpy/ai/rag/document_loaders/web/async_web/requirements.txt deleted file mode 100644 index 2687b17e..00000000 --- a/nextpy/ai/rag/document_loaders/web/async_web/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -html2text -aiohttp diff --git a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/README.md b/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/README.md deleted file mode 100644 index 87e62393..00000000 --- a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Beautiful Soup Website Loader - -This loader is a web scraper that fetches the text from websites using the `Beautiful Soup` (aka `bs4`) Python package. Furthermore, the flexibility of Beautiful Soup allows for custom templates that enable the loader to extract the desired text from specific website designs, such as Substack. Check out the code to see how to add your own. - -## Usage - -To use this loader, you need to pass in an array of URLs. - -```python -from nextpy.ai import download_loader - -BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader") - -loader = BeautifulSoupWebReader() -documents = loader.load_data(urls=['https://google.com']) -``` - -You can also add your own specific website parsers in `base.py` that automatically get used for certain URLs. Alternatively, you may tell the loader to use a certain parser by passing in the `custom_hostname` argument. For reference, this is what the Beautiful Soup parser looks like for Substack sites: - -```python -def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]: - """Extract text from Substack blog post.""" - extra_info = { - "Title of this Substack post": soup.select_one("h1.post-title").getText(), - "Subtitle": soup.select_one("h3.subtitle").getText(), - "Author": soup.select_one("span.byline-names").getText(), - } - text = soup.select_one("div.available-content").getText() - return text, extra_info -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader") - -loader = BeautifulSoupWebReader() -documents = loader.load_data(urls=['https://google.com']) -index = GPTVectorDBIndex.from_documents(documents) -index.query('What language is on this website?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. 
- -```python -from nextpy.ai import GPTVectorDBIndex, download_loader -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader") - -loader = BeautifulSoupWebReader() -documents = loader.load_data(urls=['https://google.com']) -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Website Index", - func=lambda q: index.query(q), - description=f"Useful when you want answer questions about the text on websites.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What language is on this website?") -``` - -## Custom hostname example - -To use a custom hostname like readme.co, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented. - -```python -documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io") -``` diff --git a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/__init__.py b/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/__init__.py deleted file mode 100644 index e240ed14..00000000 --- a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -"""Init file.""" diff --git a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/base.py b/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/base.py deleted file mode 100644 index 4060cda8..00000000 --- a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/base.py +++ /dev/null @@ -1,203 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. 
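To complement the `custom_hostname` example above: the `BeautifulSoupWebReader` constructor (see the removed `base.py` in this section) accepts a `website_extractor` mapping from hostname to extractor function, so a custom parser can also be registered directly. The hostname and extractor below are hypothetical placeholders:

```python
# Sketch only: `docs.example.com` and `_my_docs_reader` are made-up examples.
from typing import Any, Dict, Tuple

from nextpy.ai import download_loader


def _my_docs_reader(soup: Any, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """Extract the <main> element's text from a documentation-style page."""
    main = soup.find("main")
    text = main.get_text() if main is not None else soup.get_text()
    title = soup.title.string if soup.title else ""
    return text, {"Title": title}


BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
loader = BeautifulSoupWebReader(website_extractor={"docs.example.com": _my_docs_reader})
documents = loader.load_data(urls=["https://docs.example.com/getting-started"])
```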
- -"""Beautiful Soup Web scraper.""" - -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple -from urllib.parse import urljoin - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - -logger = logging.getLogger(__name__) - - -def _substack_reader(soup: Any, **kwargs) -> Tuple[str, Dict[str, Any]]: - """Extract text from Substack blog post.""" - extra_info = { - "Title of this Substack post": soup.select_one("h1.post-title").getText(), - "Subtitle": soup.select_one("h3.subtitle").getText(), - "Author": soup.select_one("span.byline-names").getText(), - } - text = soup.select_one("div.available-content").getText() - return text, extra_info - - -def _readthedocs_reader(soup: Any, url: str, **kwargs) -> Tuple[str, Dict[str, Any]]: - """Extract text from a ReadTheDocs documentation site.""" - import requests - from bs4 import BeautifulSoup - - links = soup.find_all("a", {"class": "reference internal"}) - rtd_links = [] - - for link in links: - rtd_links.append(link["href"]) - for i in range(len(rtd_links)): - if not rtd_links[i].startswith("http"): - rtd_links[i] = urljoin(url, rtd_links[i]) - - texts = [] - for doc_link in rtd_links: - page_link = requests.get(doc_link) - soup = BeautifulSoup(page_link.text, "html.parser") - try: - text = soup.find(attrs={"role": "main"}).get_text() - - except IndexError: - text = None - if text: - texts.append("\n".join([t for t in text.split("\n") if t])) - return "\n".join(texts), {} - - -def _readmedocs_reader( - soup: Any, url: str, include_url_in_text: bool = True -) -> Tuple[str, Dict[str, Any]]: - """Extract text from a ReadMe documentation site.""" - import requests - from bs4 import BeautifulSoup - - links = soup.find_all("a") - docs_links = [link["href"] for link in links if "/docs/" in link["href"]] - docs_links = list(set(docs_links)) - for i in range(len(docs_links)): - if not docs_links[i].startswith("http"): - docs_links[i] = urljoin(url, docs_links[i]) - - texts = [] - for doc_link in docs_links: - page_link = requests.get(doc_link) - soup = BeautifulSoup(page_link.text, "html.parser") - try: - text = "" - for element in soup.find_all("article", {"id": "content"}): - for child in element.descendants: - if child.name == "a" and child.has_attr("href"): - if include_url_in_text: - url = child.get("href") - if url is not None and "edit" in url: - text += child.text - else: - text += ( - f"{child.text} (Reference url: {doc_link}{url}) " - ) - elif child.string and child.string.strip(): - text += child.string.strip() + " " - - except IndexError: - text = None - logger.error(f"Could not extract text from {doc_link}") - continue - texts.append("\n".join([t for t in text.split("\n") if t])) - return "\n".join(texts), {} - - -def _gitbook_reader( - soup: Any, url: str, include_url_in_text: bool = True -) -> Tuple[str, Dict[str, Any]]: - """Extract text from a ReadMe documentation site.""" - import requests - from bs4 import BeautifulSoup - - links = soup.find_all("a") - docs_links = [link["href"] for link in links if "/docs/" in link["href"]] - docs_links = list(set(docs_links)) - for i in range(len(docs_links)): - if not docs_links[i].startswith("http"): - docs_links[i] = urljoin(url, docs_links[i]) - - texts = [] - for doc_link in docs_links: - page_link = requests.get(doc_link) - soup = BeautifulSoup(page_link.text, "html.parser") - try: - text = "" - text = soup.find("main") - clean_text = clean_text = ", ".join([tag.get_text() for tag in text]) - except IndexError: - 
text = None - logger.error(f"Could not extract text from {doc_link}") - continue - texts.append(clean_text) - return "\n".join(texts), {} - - -DEFAULT_WEBSITE_EXTRACTOR: Dict[ - str, Callable[[Any, str], Tuple[str, Dict[str, Any]]] -] = { - "substack.com": _substack_reader, - "readthedocs.io": _readthedocs_reader, - "readme.com": _readmedocs_reader, - "gitbook.io": _gitbook_reader, -} - - -class BeautifulSoupWebReader(BaseReader): - """BeautifulSoup web page reader. - - Reads pages from the web. - Requires the `bs4` and `urllib` packages. - - Args: - website_extractor (Optional[Dict[str, Callable]]): A mapping of website - hostname (e.g. google.com) to a function that specifies how to - extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR. - """ - - def __init__( - self, - website_extractor: Optional[Dict[str, Callable]] = None, - ) -> None: - """Initialize with parameters.""" - self.website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR - - def load_data( - self, - urls: List[str], - custom_hostname: Optional[str] = None, - include_url_in_text: Optional[bool] = True, - ) -> List[DocumentNode]: - """Load data from the urls. - - Args: - urls (List[str]): List of URLs to scrape. - custom_hostname (Optional[str]): Force a certain hostname in the case - a website is displayed under custom URLs (e.g. Substack blogs) - include_url_in_text (Optional[bool]): Include the reference url in the text of the DocumentNode - - Returns: - List[DocumentNode]: List of documents. - - """ - from urllib.parse import urlparse - - import requests - from bs4 import BeautifulSoup - - documents = [] - for url in urls: - try: - page = requests.get(url) - except Exception: - raise ValueError(f"One of the inputs is not a valid url: {url}") - - hostname = custom_hostname or urlparse(url).hostname or "" - - soup = BeautifulSoup(page.content, "html.parser") - - data = "" - extra_info = {"URL": url} - if hostname in self.website_extractor: - data, metadata = self.website_extractor[hostname]( - soup=soup, url=url, include_url_in_text=include_url_in_text - ) - extra_info.update(metadata) - - else: - data = soup.getText() - - documents.append(DocumentNode(text=data, extra_info=extra_info)) - - return documents diff --git a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/requirements.txt b/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/requirements.txt deleted file mode 100644 index 013a6eb4..00000000 --- a/nextpy/ai/rag/document_loaders/web/beautiful_soup_web/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -beautifulsoup4 -requests -urllib3 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/web/knowledge_base/README.md b/nextpy/ai/rag/document_loaders/web/knowledge_base/README.md deleted file mode 100644 index 5453ae38..00000000 --- a/nextpy/ai/rag/document_loaders/web/knowledge_base/README.md +++ /dev/null @@ -1,94 +0,0 @@ -# Knowledge Base Website Loader - -This loader is a web crawler and scraper that fetches text content from websites hosting public knowledge bases. Examples are the [Intercom help center](https://www.intercom.com/help/en/) or the [Robinhood help center](https://robinhood.com/us/en/support/). Typically these sites have a directory structure with several sections and many articles in each section. This loader crawls and finds all links that match the article path provided, and scrapes the content of each article. This can be used to create bots that answer customer questions based on public documentation. 
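The crawl-then-scrape flow described above can be pictured with a short stand-alone sketch. It drives Playwright directly, as the loader itself does, and the root URL, article path, and selectors are placeholders rather than the loader's real configuration:

```python
# Illustrative sketch of the crawl-then-scrape idea; all URLs and selectors are placeholders.
from playwright.sync_api import sync_playwright

ROOT_URL = "https://support.example.com"  # hypothetical help center
ARTICLE_PATH = "/articles"                # URLs containing this are treated as articles

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto(ROOT_URL, wait_until="domcontentloaded")
    # Collect hrefs from link elements, then keep only relative article URLs.
    hrefs = page.eval_on_selector_all("a", "nodes => nodes.map(n => n.getAttribute('href'))")
    article_urls = [ROOT_URL + h for h in hrefs if h and h.startswith("/") and ARTICLE_PATH in h]
    for url in article_urls[:5]:  # scrape a handful of articles for the sketch
        page.goto(url, wait_until="domcontentloaded")
        title = page.inner_text("h1") if page.query_selector("h1") else ""
        body = page.inner_text("article") if page.query_selector("article") else ""
        print(title, len(body))
    browser.close()
```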
- -It uses [Playwright](https://playwright.dev/python/) to drive a browser. This reduces the chance of getting blocked by Cloudflare or other CDNs, but makes it a bit more challenging to run on cloud services. - -## Usage - -First run -``` -playwright install -``` -This installs the browsers that Playwright requires. - -To use this loader, you need to pass in the root URL and the string to search for in the URL to tell if the crawler has reached an article. You also need to pass in several CSS selectors so the cralwer knows which links to follow and which elements to extract content from. use - -```python -from nextpy.ai import download_loader - -RAGWebReader = download_loader("RAGWebReader") - -loader = RAGWebReader() -documents = loader.load_data( - root_url='https://www.intercom.com/help', - link_selectors=['.article-list a', '.article-list a'] - article_path='/articles' - body_selector='.article-body' - title_selector='.article-title' - subtitle_selector='.article-subtitle' - ) -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -RAGWebReader = download_loader("RAGWebReader") - -loader = RAGWebReader() -documents = loader.load_data( - root_url='https://support.intercom.com', - link_selectors=['.article-list a', '.article-list a'] - article_path='/articles' - body_selector='.article-body' - title_selector='.article-title' - subtitle_selector='.article-subtitle' - ) -index = GPTVectorDBIndex.from_documents(documents) -index.query('What languages does Intercom support?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -RAGWebReader = download_loader("RAGWebReader") - -loader = RAGWebReader() -documents = loader.load_data( - root_url='https://support.intercom.com', - link_selectors=['.article-list a', '.article-list a'] - article_path='/articles' - body_selector='.article-body' - title_selector='.article-title' - subtitle_selector='.article-subtitle' - ) -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Website Index", - func=lambda q: index.query(q), - description=f"Useful when you want answer questions about a product that has a public knowledge base.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What languages does Intercom support?") -``` diff --git a/nextpy/ai/rag/document_loaders/web/knowledge_base/__init__.py b/nextpy/ai/rag/document_loaders/web/knowledge_base/__init__.py deleted file mode 100644 index 964de997..00000000 --- a/nextpy/ai/rag/document_loaders/web/knowledge_base/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. 
Based on successful test results, we are confident in the quality and stability of these changes. - - diff --git a/nextpy/ai/rag/document_loaders/web/knowledge_base/base.py b/nextpy/ai/rag/document_loaders/web/knowledge_base/base.py deleted file mode 100644 index 73d17ad4..00000000 --- a/nextpy/ai/rag/document_loaders/web/knowledge_base/base.py +++ /dev/null @@ -1,173 +0,0 @@ -# This file has been modified by the Nextpy Team in 2023 using AI tools and automation scripts. -# We have rigorously tested these modifications to ensure reliability and performance. Based on successful test results, we are confident in the quality and stability of these changes. - -from typing import Any, Dict, List, Optional - -from nextpy.ai.rag.document_loaders.basereader import BaseReader -from nextpy.ai.schema import DocumentNode - - -class RAGWebReader(BaseReader): - """Knowledge base reader. - - Crawls and reads articles from a knowledge base/help center with Playwright. - Tested on Zendesk and Intercom CMS, may work on others. - Can be run in headless mode but it may be blocked by Cloudflare. Run it headed to be safe. - Times out occasionally, just increase the default time out if it does. - Requires the `playwright` package. - - Args: - root_url (str): the base url of the knowledge base, with no trailing slash - e.g. 'https://support.intercom.com' - link_selectors (List[str]): list of css selectors to find links to articles while crawling - e.g. ['.article-list a', '.article-list a'] - article_path (str): the url path of articles on this domain so the crawler knows when to stop - e.g. '/articles' - title_selector (Optional[str]): css selector to find the title of the article - e.g. '.article-title' - subtitle_selector (Optional[str]): css selector to find the subtitle/description of the article - e.g. '.article-subtitle' - body_selector (Optional[str]): css selector to find the body of the article - e.g. '.article-body' - """ - - def __init__( - self, - root_url: str, - link_selectors: List[str], - article_path: str, - title_selector: Optional[str] = None, - subtitle_selector: Optional[str] = None, - body_selector: Optional[str] = None, - ) -> None: - """Initialize with parameters.""" - self.root_url = root_url - self.link_selectors = link_selectors - self.article_path = article_path - self.title_selector = title_selector - self.subtitle_selector = subtitle_selector - self.body_selector = body_selector - - def load_data(self) -> List[DocumentNode]: - """Load data from the knowledge base.""" - from playwright.sync_api import sync_playwright - - with sync_playwright() as p: - browser = p.chromium.launch(headless=False) - - # Crawl - article_urls = self.get_article_urls( - browser, - self.root_url, - self.root_url, - ) - - # Scrape - documents = [] - for url in article_urls: - article = self.scrape_article( - browser, - url, - ) - metadata = { - "title": article["title"], - "subtitle": article["subtitle"], - "url": article["url"], - "root_url": self.root_url, - "article_path": self.article_path, - } - - documents.append( - DocumentNode(text=article["body"], extra_info=metadata) - ) - - browser.close() - - return documents - - def scrape_article( - self, - browser: Any, - url: str, - ) -> Dict[str, str]: - """Scrape a single article url. - - Args: - browser (Any): a Playwright Chromium browser. - url (str): URL of the article to scrape. - - Returns: - Dict[str, str]: a mapping of article attributes to their values. 
- - """ - page = browser.new_page(ignore_https_errors=True) - page.set_default_timeout(60000) - page.goto(url, wait_until="domcontentloaded") - - title = ( - ( - page.query_selector(self.title_selector).evaluate( - "node => node.innerText" - ) - ) - if self.title_selector - else "" - ) - subtitle = ( - ( - page.query_selector(self.subtitle_selector).evaluate( - "node => node.innerText" - ) - ) - if self.subtitle_selector - else "" - ) - body = ( - (page.query_selector(self.body_selector).evaluate("node => node.innerText")) - if self.body_selector - else "" - ) - - page.close() - print("scraped:", url) - return {"title": title, "subtitle": subtitle, "body": body, "url": url} - - def get_article_urls( - self, browser: Any, root_url: str, current_url: str - ) -> List[str]: - """Recursively crawl through the knowledge base to find a list of articles. - - Args: - browser (Any): a Playwright Chromium browser. - root_url (str): root URL of the knowledge base. - current_url (str): current URL that is being crawled. - - Returns: - List[str]: a list of URLs of found articles. - - """ - page = browser.new_page(ignore_https_errors=True) - page.set_default_timeout(60000) - page.goto(current_url, wait_until="domcontentloaded") - - # If this is a leaf node aka article page, return itself - if self.article_path in current_url: - print("Found an article: ", current_url) - page.close() - return [current_url] - - # Otherwise crawl this page and find all the articles linked from it - article_urls = [] - links = [] - - for link_selector in self.link_selectors: - ahrefs = page.query_selector_all(link_selector) - links.extend(ahrefs) - - for link in links: - url = root_url + page.evaluate("(node) => node.getAttribute('href')", link) - article_urls.extend(self.get_article_urls(browser, root_url, url)) - - page.close() - - return article_urls diff --git a/nextpy/ai/rag/document_loaders/web/knowledge_base/requirements.txt b/nextpy/ai/rag/document_loaders/web/knowledge_base/requirements.txt deleted file mode 100644 index df3e475a..00000000 --- a/nextpy/ai/rag/document_loaders/web/knowledge_base/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -playwright~=1.30 \ No newline at end of file diff --git a/nextpy/ai/rag/document_loaders/web/readability_web/README.md b/nextpy/ai/rag/document_loaders/web/readability_web/README.md deleted file mode 100644 index 7f9d8dd9..00000000 --- a/nextpy/ai/rag/document_loaders/web/readability_web/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# Readability Webpage Loader - -Extracting relevant information from a fully rendered web page. -During processing, it is always assumed that web pages used as data sources contain textual content. - -It is particularly effective for websites that use client-side rendering. - -1. Load the page and wait for it to be rendered. (Playwright) -2. Inject Readability.js to extract the main content. - -## Usage - -To use this loader, you need to pass in a single URL. - -```python -from nextpy.ai import download_loader - -ReadabilityWebPageReader = download_loader("ReadabilityWebPageReader") - -# or set proxy server for playwright: loader = ReadabilityWebPageReader(proxy="http://your-proxy-server:port") -# For some specific web pages, you may need to set "wait_until" to "networkidle".
loader = ReadabilityWebPageReader(wait_until="networkidle") -loader = ReadabilityWebPageReader() - -documents = loader.load_data(url='https://support.squarespace.com/hc/en-us/articles/206795137-Pages-and-content-basics') -``` - -## Examples - -This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader - -ReadabilityWebPageReader = download_loader("ReadabilityWebPageReader") - -loader = ReadabilityWebPageReader() -documents = loader.load_data(url='https://support.squarespace.com/hc/en-us/articles/206795137-Pages-and-content-basics') - -index = GPTVectorDBIndex.from_documents(documents) -print(index.query('What is pages?')) - -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. - -```python -from nextpy.ai import GPTVectorDBIndex, download_loader -from langchain.agents import initialize_agent, Tool -from langchain.endpoints import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -ReadabilityWebPageReader = download_loader("ReadabilityWebPageReader") - -loader = ReadabilityWebPageReader() -documents = loader.load_data(url='https://support.squarespace.com/hc/en-us/articles/206795137-Pages-and-content-basics') - -index = GPTVectorDBIndex.from_documents(documents) - -tools = [ - Tool( - name="Website Index", - func=lambda q: index.query(q), - description="Useful when you want to answer questions about the text on websites.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What is pages?") -``` diff --git a/nextpy/ai/rag/document_loaders/web/readability_web/Readability.js b/nextpy/ai/rag/document_loaders/web/readability_web/Readability.js deleted file mode 100644 index 1540edd6..00000000 --- a/nextpy/ai/rag/document_loaders/web/readability_web/Readability.js +++ /dev/null @@ -1,2301 +0,0 @@ -/* - * Copyright (c) 2010 Arc90 Inc - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * This code is heavily based on Arc90's readability.js (1.7.1) script - * available at: http://code.google.com/p/arc90labs-readability - */ - -/** - * Public constructor. - * @param {HTMLDocument} doc The DocumentNode to parse. - * @param {Object} options The options object. - */ -function Readability(doc, options) { - // In some older versions, people passed a URI as the first argument.
Cope: - if (options && options.documentElement) { - doc = options; - options = arguments[2]; - } else if (!doc || !doc.documentElement) { - throw new Error("First argument to Readability constructor should be a DocumentNode object."); - } - options = options || {}; - - this._doc = doc; - this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; - this._articleTitle = null; - this._articleByline = null; - this._articleDir = null; - this._articleSiteName = null; - this._attempts = []; - - // Configurable options - this._debug = !!options.debug; - this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; - this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; - this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; - this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); - this._keepClasses = !!options.keepClasses; - this._serializer = options.serializer || function(el) { - return el.innerHTML; - }; - this._disableJSONLD = !!options.disableJSONLD; - this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; - - // Start with all flags set - this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; - - - // Control whether log messages are sent to the console - if (this._debug) { - let logNode = function(node) { - if (node.nodeType == node.TEXT_NODE) { - return `${node.nodeName} ("${node.textContent}")`; - } - let attrPairs = Array.from(node.attributes || [], function(attr) { - return `${attr.name}="${attr.value}"`; - }).join(" "); - return `<${node.localName} ${attrPairs}>`; - }; - this.log = function () { - if (typeof console !== "undefined") { - let args = Array.from(arguments, arg => { - if (arg && arg.nodeType == this.ELEMENT_NODE) { - return logNode(arg); - } - return arg; - }); - args.unshift("Reader: (Readability)"); - console.log.apply(console, args); - } else if (typeof dump !== "undefined") { - /* global dump */ - var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logNode(x) : x; - }).join(" "); - dump("Reader: (Readability) " + msg + "\n"); - } - }; - } else { - this.log = function () {}; - } -} - -Readability.prototype = { - FLAG_STRIP_UNLIKELYS: 0x1, - FLAG_WEIGHT_CLASSES: 0x2, - FLAG_CLEAN_CONDITIONALLY: 0x4, - - // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType - ELEMENT_NODE: 1, - TEXT_NODE: 3, - - // Max number of nodes supported by this parser. Default: 0 (no limit) - DEFAULT_MAX_ELEMS_TO_PARSE: 0, - - // The number of top candidates to consider when analysing how - // tight the competition is among candidates. - DEFAULT_N_TOP_CANDIDATES: 5, - - // Element tags to score by default. - DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), - - // The default number of chars an article must have in order to return a result - DEFAULT_CHAR_THRESHOLD: 500, - - // All of the regular expressions in use within readability. - // Defined up here so we don't instantiate them repeatedly in loops. - REGEXPS: { - // NOTE: These two regular expressions are duplicated in - // Readability-readerable.js. Please keep both copies in sync. 
- unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, - okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, - - positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, - extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, - byline: /byline|author|dateline|writtenby|p-author/i, - replaceFonts: /<(\/?)font[^>]*>/gi, - normalize: /\s{2,}/g, - videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, - shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, - nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, - prevLink: /(prev|earl|old|new|<|«)/i, - tokenize: /\W+/g, - whitespace: /^\s*$/, - hasContent: /\S$/, - hashUrl: /^#.+/, - srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, - b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, - // See: https://schema.org/Article - jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ - }, - - UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], - - DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), - - ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], - - PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], - - DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], - - // The commented out elements qualify as phrasing content but tend to be - // removed by readability when put into paragraphs, so we ignore them here. - PHRASING_ELEMS: [ - // "CANVAS", "IFRAME", "SVG", "VIDEO", - "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", - "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", - "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", - "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", - "SUP", "TEXTAREA", "TIME", "VAR", "WBR" - ], - - // These are the classes that readability sets itself. - CLASSES_TO_PRESERVE: [ "page" ], - - // These are the list of HTML entities that need to be escaped. - HTML_ESCAPE_MAP: { - "lt": "<", - "gt": ">", - "amp": "&", - "quot": '"', - "apos": "'", - }, - - /** - * Run any post-process modifications to article content as necessary. - * - * @param Element - * @return void - **/ - _postProcessContent: function(articleContent) { - // Readability cannot open relative uris so we convert them to absolute uris. - this._fixRelativeUris(articleContent); - - this._simplifyNestedElements(articleContent); - - if (!this._keepClasses) { - // Remove classes. 
- this._cleanClasses(articleContent); - } - }, - - /** - * Iterates over a NodeList, calls `filterFn` for each node and removes node - * if function returned `true`. - * - * If function is not passed, removes all the nodes in node list. - * - * @param NodeList nodeList The nodes to operate on - * @param Function filterFn the function to use as a filter - * @return void - */ - _removeNodes: function(nodeList, filterFn) { - // Avoid ever operating on live node lists. - if (this._docJSDOMParser && nodeList._isLiveNodeList) { - throw new Error("Do not pass live node lists to _removeNodes"); - } - for (var i = nodeList.length - 1; i >= 0; i--) { - var node = nodeList[i]; - var parentNode = node.parentNode; - if (parentNode) { - if (!filterFn || filterFn.call(this, node, i, nodeList)) { - parentNode.removeChild(node); - } - } - } - }, - - /** - * Iterates over a NodeList, and calls _setNodeTag for each node. - * - * @param NodeList nodeList The nodes to operate on - * @param String newTagName the new tag name to use - * @return void - */ - _replaceNodeTags: function(nodeList, newTagName) { - // Avoid ever operating on live node lists. - if (this._docJSDOMParser && nodeList._isLiveNodeList) { - throw new Error("Do not pass live node lists to _replaceNodeTags"); - } - for (const node of nodeList) { - this._setNodeTag(node, newTagName); - } - }, - - /** - * Iterate over a NodeList, which doesn't natively fully implement the Array - * interface. - * - * For convenience, the current object context is applied to the provided - * iterate function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The iterate function. - * @return void - */ - _forEachNode: function(nodeList, fn) { - Array.prototype.forEach.call(nodeList, fn, this); - }, - - /** - * Iterate over a NodeList, and return the first node that passes - * the supplied test function - * - * For convenience, the current object context is applied to the provided - * test function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The test function. - * @return void - */ - _findNode: function(nodeList, fn) { - return Array.prototype.find.call(nodeList, fn, this); - }, - - /** - * Iterate over a NodeList, return true if any of the provided iterate - * function calls returns true, false otherwise. - * - * For convenience, the current object context is applied to the - * provided iterate function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The iterate function. - * @return Boolean - */ - _someNode: function(nodeList, fn) { - return Array.prototype.some.call(nodeList, fn, this); - }, - - /** - * Iterate over a NodeList, return true if all of the provided iterate - * function calls return true, false otherwise. - * - * For convenience, the current object context is applied to the - * provided iterate function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The iterate function. - * @return Boolean - */ - _everyNode: function(nodeList, fn) { - return Array.prototype.every.call(nodeList, fn, this); - }, - - /** - * Concat all nodelists passed as arguments. 
- * - * @return ...NodeList - * @return Array - */ - _concatNodeLists: function() { - var slice = Array.prototype.slice; - var args = slice.call(arguments); - var nodeLists = args.map(function(list) { - return slice.call(list); - }); - return Array.prototype.concat.apply([], nodeLists); - }, - - _getAllNodesWithTag: function(node, tagNames) { - if (node.querySelectorAll) { - return node.querySelectorAll(tagNames.join(",")); - } - return [].concat.apply([], tagNames.map(function(tag) { - var collection = node.getElementsByTagName(tag); - return Array.isArray(collection) ? collection : Array.from(collection); - })); - }, - - /** - * Removes the class="" attribute from every element in the given - * subtree, except those that match CLASSES_TO_PRESERVE and - * the classesToPreserve array from the options object. - * - * @param Element - * @return void - */ - _cleanClasses: function(node) { - var classesToPreserve = this._classesToPreserve; - var className = (node.getAttribute("class") || "") - .split(/\s+/) - .filter(function(cls) { - return classesToPreserve.indexOf(cls) != -1; - }) - .join(" "); - - if (className) { - node.setAttribute("class", className); - } else { - node.removeAttribute("class"); - } - - for (node = node.firstElementChild; node; node = node.nextElementSibling) { - this._cleanClasses(node); - } - }, - - /** - * Converts each and uri in the given element to an absolute URI, - * ignoring #ref URIs. - * - * @param Element - * @return void - */ - _fixRelativeUris: function(articleContent) { - var baseURI = this._doc.baseURI; - var documentURI = this._doc.documentURI; - function toAbsoluteURI(uri) { - // Leave hash links alone if the base URI matches the DocumentNode URI: - if (baseURI == documentURI && uri.charAt(0) == "#") { - return uri; - } - - // Otherwise, resolve against base URI: - try { - return new URL(uri, baseURI).href; - } catch (ex) { - // Something went wrong, just return the original: - } - return uri; - } - - var links = this._getAllNodesWithTag(articleContent, ["a"]); - this._forEachNode(links, function(link) { - var href = link.getAttribute("href"); - if (href) { - // Remove links with javascript: URIs, since - // they won't work after scripts have been removed from the page. 
- if (href.indexOf("javascript:") === 0) { - // if the link only contains simple text content, it can be converted to a text node - if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { - var text = this._doc.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); - } else { - // if the link has multiple children, they should all be preserved - var container = this._doc.createElement("span"); - while (link.firstChild) { - container.appendChild(link.firstChild); - } - link.parentNode.replaceChild(container, link); - } - } else { - link.setAttribute("href", toAbsoluteURI(href)); - } - } - }); - - var medias = this._getAllNodesWithTag(articleContent, [ - "img", "picture", "figure", "video", "audio", "source" - ]); - - this._forEachNode(medias, function(media) { - var src = media.getAttribute("src"); - var poster = media.getAttribute("poster"); - var srcset = media.getAttribute("srcset"); - - if (src) { - media.setAttribute("src", toAbsoluteURI(src)); - } - - if (poster) { - media.setAttribute("poster", toAbsoluteURI(poster)); - } - - if (srcset) { - var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { - return toAbsoluteURI(p1) + (p2 || "") + p3; - }); - - media.setAttribute("srcset", newSrcset); - } - }); - }, - - _simplifyNestedElements: function(articleContent) { - var node = articleContent; - - while (node) { - if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { - if (this._isElementWithoutContent(node)) { - node = this._removeAndGetNext(node); - continue; - } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { - var child = node.children[0]; - for (var i = 0; i < node.attributes.length; i++) { - child.setAttribute(node.attributes[i].name, node.attributes[i].value); - } - node.parentNode.replaceChild(child, node); - node = child; - continue; - } - } - - node = this._getNextNode(node); - } - }, - - /** - * Get the article title as an H1. - * - * @return string - **/ - _getArticleTitle: function() { - var doc = this._doc; - var curTitle = ""; - var origTitle = ""; - - try { - curTitle = origTitle = doc.title.trim(); - - // If they had an element with id "title" in their HTML - if (typeof curTitle !== "string") - curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); - } catch (e) {/* ignore exceptions setting the title. */} - - var titleHadHierarchicalSeparators = false; - function wordCount(str) { - return str.split(/\s+/).length; - } - - // If there's a separator in the title, first remove the final part - if ((/ [\|\-\\\/>»] /).test(curTitle)) { - titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); - curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); - - // If the resulting title is too short (3 words or fewer), remove - // the first part instead: - if (wordCount(curTitle) < 3) - curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); - } else if (curTitle.indexOf(": ") !== -1) { - // Check if we have an heading containing this exact string, so we - // could assume it's the full title. 
- var headings = this._concatNodeLists( - doc.getElementsByTagName("h1"), - doc.getElementsByTagName("h2") - ); - var trimmedTitle = curTitle.trim(); - var match = this._someNode(headings, function(heading) { - return heading.textContent.trim() === trimmedTitle; - }); - - // If we don't, let's extract the title out of the original title string. - if (!match) { - curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); - - // If the title is now too short, try the first colon instead: - if (wordCount(curTitle) < 3) { - curTitle = origTitle.substring(origTitle.indexOf(":") + 1); - // But if we have too many words before the colon there's something weird - // with the titles and the H tags so let's just use the original title instead - } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { - curTitle = origTitle; - } - } - } else if (curTitle.length > 150 || curTitle.length < 15) { - var hOnes = doc.getElementsByTagName("h1"); - - if (hOnes.length === 1) - curTitle = this._getInnerText(hOnes[0]); - } - - curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); - // If we now have 4 words or fewer as our title, and either no - // 'hierarchical' separators (\, /, > or ») were found in the original - // title or we decreased the number of words by more than 1 word, use - // the original title. - var curTitleWordCount = wordCount(curTitle); - if (curTitleWordCount <= 4 && - (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { - curTitle = origTitle; - } - - return curTitle; - }, - - /** - * Prepare the HTML DocumentNode for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ - _prepDocument: function() { - var doc = this._doc; - - // Remove all style tags in head - this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); - - if (doc.body) { - this._replaceBrs(doc.body); - } - - this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); - }, - - /** - * Finds the next node, starting from the given node, and ignoring - * whitespace in between. If the given node is an element, the same node is - * returned. - */ - _nextNode: function (node) { - var next = node; - while (next - && (next.nodeType != this.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { - next = next.nextSibling; - } - return next; - }, - - /** - * Replaces 2 or more successive
<br> elements with a single <p>. - * Whitespace between <br> elements are ignored. For example: - * <div>foo<br>bar<br> <br><br>abc</div> - * will become: - * <div>foo<br>bar<p>abc</p></div>
- */ - _replaceBrs: function (elem) { - this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { - var next = br.nextSibling; - - // Whether 2 or more
<br> elements have been found and replaced with a - // <p> block. - var replaced = false; - - // If we find a <br> chain, remove the <br>s until we hit another node - // or non-whitespace. This leaves behind the first <br> in the chain - // (which will be replaced with a <p>
later). - while ((next = this._nextNode(next)) && (next.tagName == "BR")) { - replaced = true; - var brSibling = next.nextSibling; - next.parentNode.removeChild(next); - next = brSibling; - } - - // If we removed a
<br> chain, replace the remaining <br> with a <p>. Add - // all sibling nodes as children of the <p> until we hit another <br>
- // chain. - if (replaced) { - var p = this._doc.createElement("p"); - br.parentNode.replaceChild(p, br); - - next = p.nextSibling; - while (next) { - // If we've hit another
<br><br>, we're done adding children to this <p>
. - if (next.tagName == "BR") { - var nextElem = this._nextNode(next.nextSibling); - if (nextElem && nextElem.tagName == "BR") - break; - } - - if (!this._isPhrasingContent(next)) - break; - - // Otherwise, make this node a child of the new
<p>
. - var sibling = next.nextSibling; - p.appendChild(next); - next = sibling; - } - - while (p.lastChild && this._isWhitespace(p.lastChild)) { - p.removeChild(p.lastChild); - } - - if (p.parentNode.tagName === "P") - this._setNodeTag(p.parentNode, "DIV"); - } - }); - }, - - _setNodeTag: function (node, tag) { - this.log("_setNodeTag", node, tag); - if (this._docJSDOMParser) { - node.localName = tag.toLowerCase(); - node.tagName = tag.toUpperCase(); - return node; - } - - var replacement = node.ownerDocument.createElement(tag); - while (node.firstChild) { - replacement.appendChild(node.firstChild); - } - node.parentNode.replaceChild(replacement, node); - if (node.readability) - replacement.readability = node.readability; - - for (var i = 0; i < node.attributes.length; i++) { - try { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); - } catch (ex) { - /* it's possible for setAttribute() to throw if the attribute name - * isn't a valid XML Name. Such attributes can however be parsed from - * source in HTML docs, see https://github.com/whatwg/html/issues/4275, - * so we can hit them here and then throw. We don't care about such - * attributes so we ignore them. - */ - } - } - return replacement; - }, - - /** - * Prepare the article node for display. Clean out any inline styles, - * iframes, forms, strip extraneous
<p>
tags, etc. - * - * @param Element - * @return void - **/ - _prepArticle: function(articleContent) { - this._cleanStyles(articleContent); - - // Check for data tables before we continue, to avoid removing items in - // those tables, which will often be isolated even though they're - // visually linked to other content-ful elements (text, images, etc.). - this._markDataTables(articleContent); - - this._fixLazyImages(articleContent); - - // Clean out junk from the article content - this._cleanConditionally(articleContent, "form"); - this._cleanConditionally(articleContent, "fieldset"); - this._clean(articleContent, "object"); - this._clean(articleContent, "embed"); - this._clean(articleContent, "footer"); - this._clean(articleContent, "link"); - this._clean(articleContent, "aside"); - - // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, - // which means we don't remove the top candidates even they have "share". - - var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; - - this._forEachNode(articleContent.children, function (topCandidate) { - this._cleanMatchedNodes(topCandidate, function (node, matchString) { - return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; - }); - }); - - this._clean(articleContent, "iframe"); - this._clean(articleContent, "input"); - this._clean(articleContent, "textarea"); - this._clean(articleContent, "select"); - this._clean(articleContent, "button"); - this._cleanHeaders(articleContent); - - // Do these last as the previous stuff may have removed junk - // that will affect these - this._cleanConditionally(articleContent, "table"); - this._cleanConditionally(articleContent, "ul"); - this._cleanConditionally(articleContent, "div"); - - // replace H1 with H2 as H1 should be only title that is displayed separately - this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); - - // Remove extra paragraphs - this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { - var imgCount = paragraph.getElementsByTagName("img").length; - var embedCount = paragraph.getElementsByTagName("embed").length; - var objectCount = paragraph.getElementsByTagName("object").length; - // At this point, nasty iframes have been removed, only remain embedded video ones. - var iframeCount = paragraph.getElementsByTagName("iframe").length; - var totalCount = imgCount + embedCount + objectCount + iframeCount; - - return totalCount === 0 && !this._getInnerText(paragraph, false); - }); - - this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { - var next = this._nextNode(br.nextSibling); - if (next && next.tagName == "P") - br.parentNode.removeChild(br); - }); - - // Remove single-cell tables - this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { - var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; - if (this._hasSingleTagInsideElement(tbody, "TR")) { - var row = tbody.firstElementChild; - if (this._hasSingleTagInsideElement(row, "TD")) { - var cell = row.firstElementChild; - cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); - table.parentNode.replaceChild(cell, table); - } - } - }); - }, - - /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. 
- * - * @param Element - * @return void - **/ - _initializeNode: function(node) { - node.readability = {"contentScore": 0}; - - switch (node.tagName) { - case "DIV": - node.readability.contentScore += 5; - break; - - case "PRE": - case "TD": - case "BLOCKQUOTE": - node.readability.contentScore += 3; - break; - - case "ADDRESS": - case "OL": - case "UL": - case "DL": - case "DD": - case "DT": - case "LI": - case "FORM": - node.readability.contentScore -= 3; - break; - - case "H1": - case "H2": - case "H3": - case "H4": - case "H5": - case "H6": - case "TH": - node.readability.contentScore -= 5; - break; - } - - node.readability.contentScore += this._getClassWeight(node); - }, - - _removeAndGetNext: function(node) { - var nextNode = this._getNextNode(node, true); - node.parentNode.removeChild(node); - return nextNode; - }, - - /** - * Traverse the DOM from node to node, starting at the node passed in. - * Pass true for the second parameter to indicate this node itself - * (and its kids) are going away, and we want the next node over. - * - * Calling this in a loop will traverse the DOM depth-first. - */ - _getNextNode: function(node, ignoreSelfAndKids) { - // First check for kids if those aren't being ignored - if (!ignoreSelfAndKids && node.firstElementChild) { - return node.firstElementChild; - } - // Then for siblings... - if (node.nextElementSibling) { - return node.nextElementSibling; - } - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - do { - node = node.parentNode; - } while (node && !node.nextElementSibling); - return node && node.nextElementSibling; - }, - - // compares second text to first one - // 1 = same text, 0 = completely different text - // works the way that it splits both texts into words and then finds words that are unique in second text - // the result is given by the lower length of unique parts - _textSimilarity: function(textA, textB) { - var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); - var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); - if (!tokensA.length || !tokensB.length) { - return 0; - } - var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); - var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; - return 1 - distanceB; - }, - - _checkByline: function(node, matchString) { - if (this._articleByline) { - return false; - } - - if (node.getAttribute !== undefined) { - var rel = node.getAttribute("rel"); - var itemprop = node.getAttribute("itemprop"); - } - - if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { - this._articleByline = node.textContent.trim(); - return true; - } - - return false; - }, - - _getNodeAncestors: function(node, maxDepth) { - maxDepth = maxDepth || 0; - var i = 0, ancestors = []; - while (node.parentNode) { - ancestors.push(node.parentNode); - if (maxDepth && ++i === maxDepth) - break; - node = node.parentNode; - } - return ancestors; - }, - - /*** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @param page a DocumentNode to run upon. Needs to be a full DocumentNode, complete with body. 
- * @return Element - **/ - _grabArticle: function (page) { - this.log("**** grabArticle ****"); - var doc = this._doc; - var isPaging = page !== null; - page = page ? page : this._doc.body; - - // We can't grab an article if we don't have a page! - if (!page) { - this.log("No body found in DocumentNode. Abort."); - return null; - } - - var pageCacheHtml = page.innerHTML; - - while (true) { - this.log("Starting grabArticle loop"); - var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); - - // First, node prepping. Trash nodes that look cruddy (like ones with the - // class name "comment", etc), and turn divs into P tags where they have been - // used inappropriately (as in, where they contain no other block level elements.) - var elementsToScore = []; - var node = this._doc.documentElement; - - let shouldRemoveTitleHeader = true; - - while (node) { - - if (node.tagName === "HTML") { - this._articleLang = node.getAttribute("lang"); - } - - var matchString = node.className + " " + node.id; - - if (!this._isProbablyVisible(node)) { - this.log("Removing hidden node - " + matchString); - node = this._removeAndGetNext(node); - continue; - } - - // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" - if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { - node = this._removeAndGetNext(node); - continue; - } - - // Check to see if this node is a byline, and remove it if it is. - if (this._checkByline(node, matchString)) { - node = this._removeAndGetNext(node); - continue; - } - - if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { - this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); - shouldRemoveTitleHeader = false; - node = this._removeAndGetNext(node); - continue; - } - - // Remove unlikely candidates - if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && - !this._hasAncestorTag(node, "table") && - !this._hasAncestorTag(node, "code") && - node.tagName !== "BODY" && - node.tagName !== "A") { - this.log("Removing unlikely candidate - " + matchString); - node = this._removeAndGetNext(node); - continue; - } - - if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { - this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); - node = this._removeAndGetNext(node); - continue; - } - } - - // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || - node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || - node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && - this._isElementWithoutContent(node)) { - node = this._removeAndGetNext(node); - continue; - } - - if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { - elementsToScore.push(node); - } - - // Turn all divs that don't have children block level elements into p's - if (node.tagName === "DIV") { - // Put phrasing content into paragraphs. 
- var p = null; - var childNode = node.firstChild; - while (childNode) { - var nextSibling = childNode.nextSibling; - if (this._isPhrasingContent(childNode)) { - if (p !== null) { - p.appendChild(childNode); - } else if (!this._isWhitespace(childNode)) { - p = doc.createElement("p"); - node.replaceChild(p, childNode); - p.appendChild(childNode); - } - } else if (p !== null) { - while (p.lastChild && this._isWhitespace(p.lastChild)) { - p.removeChild(p.lastChild); - } - p = null; - } - childNode = nextSibling; - } - - // Sites like http://mobile.slate.com encloses each paragraph with a DIV - // element. DIVs with only a P element inside and no text content can be - // safely converted into plain P elements to avoid confusing the scoring - // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { - var newNode = node.children[0]; - node.parentNode.replaceChild(newNode, node); - node = newNode; - elementsToScore.push(node); - } else if (!this._hasChildBlockElement(node)) { - node = this._setNodeTag(node, "P"); - elementsToScore.push(node); - } - } - node = this._getNextNode(node); - } - - /** - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. - * - * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ - var candidates = []; - this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") - return; - - // If this paragraph is less than 25 characters, don't even count it. - var innerText = this._getInnerText(elementToScore); - if (innerText.length < 25) - return; - - // Exclude nodes with no ancestor. - var ancestors = this._getNodeAncestors(elementToScore, 5); - if (ancestors.length === 0) - return; - - var contentScore = 0; - - // Add a point for the paragraph itself as a base. - contentScore += 1; - - // Add points for any commas within this paragraph. - contentScore += innerText.split(",").length; - - // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += Math.min(Math.floor(innerText.length / 100), 3); - - // Initialize and score ancestors. - this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") - return; - - if (typeof(ancestor.readability) === "undefined") { - this._initializeNode(ancestor); - candidates.push(ancestor); - } - - // Node score divider: - // - parent: 1 (no division) - // - grandparent: 2 - // - great grandparent+: ancestor level * 3 - if (level === 0) - var scoreDivider = 1; - else if (level === 1) - scoreDivider = 2; - else - scoreDivider = level * 3; - ancestor.readability.contentScore += contentScore / scoreDivider; - }); - }); - - // After we've calculated scores, loop through all of the possible - // candidate nodes we found and find the one with the highest score. - var topCandidates = []; - for (var c = 0, cl = candidates.length; c < cl; c += 1) { - var candidate = candidates[c]; - - // Scale the final candidates score based on link density. Good content - // should have a relatively small link density (5% or less) and be mostly - // unaffected by this operation. 
- var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); - candidate.readability.contentScore = candidateScore; - - this.log("Candidate:", candidate, "with score " + candidateScore); - - for (var t = 0; t < this._nbTopCandidates; t++) { - var aTopCandidate = topCandidates[t]; - - if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { - topCandidates.splice(t, 0, candidate); - if (topCandidates.length > this._nbTopCandidates) - topCandidates.pop(); - break; - } - } - } - - var topCandidate = topCandidates[0] || null; - var neededToCreateTopCandidate = false; - var parentOfTopCandidate; - - // If we still have no top candidate, just use the body as a last resort. - // We also have to copy the body node so it is something we can modify. - if (topCandidate === null || topCandidate.tagName === "BODY") { - // Move all of the page's children into topCandidate - topCandidate = doc.createElement("DIV"); - neededToCreateTopCandidate = true; - // Move everything (not just elements, also text nodes etc.) into the container - // so we even include text directly in the body: - while (page.firstChild) { - this.log("Moving child out:", page.firstChild); - topCandidate.appendChild(page.firstChild); - } - - page.appendChild(topCandidate); - - this._initializeNode(topCandidate); - } else if (topCandidate) { - // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array - // and whose scores are quite closed with current `topCandidate` node. - var alternativeCandidateAncestors = []; - for (var i = 1; i < topCandidates.length; i++) { - if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { - alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); - } - } - var MINIMUM_TOPCANDIDATES = 3; - if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { - parentOfTopCandidate = topCandidate.parentNode; - while (parentOfTopCandidate.tagName !== "BODY") { - var listsContainingThisAncestor = 0; - for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { - listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); - } - if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { - topCandidate = parentOfTopCandidate; - break; - } - parentOfTopCandidate = parentOfTopCandidate.parentNode; - } - } - if (!topCandidate.readability) { - this._initializeNode(topCandidate); - } - - // Because of our bonus system, parents of candidates might have scores - // themselves. They get half of the node. There won't be nodes with higher - // scores than our topCandidate, but if we see the score going *up* in the first - // few steps up the tree, that's a decent sign that there might be more content - // lurking in other places that we want to unify in. The sibling stuff - // below does some of that - but only if we've looked high enough up the DOM - // tree. - parentOfTopCandidate = topCandidate.parentNode; - var lastScore = topCandidate.readability.contentScore; - // The scores shouldn't get too low. 
- var scoreThreshold = lastScore / 3; - while (parentOfTopCandidate.tagName !== "BODY") { - if (!parentOfTopCandidate.readability) { - parentOfTopCandidate = parentOfTopCandidate.parentNode; - continue; - } - var parentScore = parentOfTopCandidate.readability.contentScore; - if (parentScore < scoreThreshold) - break; - if (parentScore > lastScore) { - // Alright! We found a better parent to use. - topCandidate = parentOfTopCandidate; - break; - } - lastScore = parentOfTopCandidate.readability.contentScore; - parentOfTopCandidate = parentOfTopCandidate.parentNode; - } - - // If the top candidate is the only child, use parent instead. This will help sibling - // joining logic when adjacent content is actually located in parent's sibling node. - parentOfTopCandidate = topCandidate.parentNode; - while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { - topCandidate = parentOfTopCandidate; - parentOfTopCandidate = topCandidate.parentNode; - } - if (!topCandidate.readability) { - this._initializeNode(topCandidate); - } - } - - // Now that we have the top candidate, look through its siblings for content - // that might also be related. Things like preambles, content split by ads - // that we removed, etc. - var articleContent = doc.createElement("DIV"); - if (isPaging) - articleContent.id = "readability-content"; - - var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); - // Keep potential top candidate's parent node to try to get text direction of it later. - parentOfTopCandidate = topCandidate.parentNode; - var siblings = parentOfTopCandidate.children; - - for (var s = 0, sl = siblings.length; s < sl; s++) { - var sibling = siblings[s]; - var append = false; - - this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); - this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); - - if (sibling === topCandidate) { - append = true; - } else { - var contentBonus = 0; - - // Give a bonus if sibling nodes and top candidates have the example same classname - if (sibling.className === topCandidate.className && topCandidate.className !== "") - contentBonus += topCandidate.readability.contentScore * 0.2; - - if (sibling.readability && - ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { - append = true; - } else if (sibling.nodeName === "P") { - var linkDensity = this._getLinkDensity(sibling); - var nodeContent = this._getInnerText(sibling); - var nodeLength = nodeContent.length; - - if (nodeLength > 80 && linkDensity < 0.25) { - append = true; - } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { - append = true; - } - } - } - - if (append) { - this.log("Appending node:", sibling); - - if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { - // We have a node that isn't a common block level element, like a form or td tag. - // Turn it into a div so it doesn't get filtered out later by accident. - this.log("Altering sibling:", sibling, "to div."); - - sibling = this._setNodeTag(sibling, "DIV"); - } - - articleContent.appendChild(sibling); - // Fetch children again to make it compatible - // with DOM parsers without live collection support. - siblings = parentOfTopCandidate.children; - // siblings is a reference to the children array, and - // sibling is removed from the array when we call appendChild(). 
- // As a result, we must revisit this index since the nodes - // have been shifted. - s -= 1; - sl -= 1; - } - } - - if (this._debug) - this.log("Article content pre-prep: " + articleContent.innerHTML); - // So we have all of the content that we need. Now we clean it up for presentation. - this._prepArticle(articleContent); - if (this._debug) - this.log("Article content post-prep: " + articleContent.innerHTML); - - if (neededToCreateTopCandidate) { - // We already created a fake div thing, and there wouldn't have been any siblings left - // for the previous loop, so there's no point trying to create a new div, and then - // move all the children over. Just assign IDs and class names here. No need to append - // because that already happened anyway. - topCandidate.id = "readability-page-1"; - topCandidate.className = "page"; - } else { - var div = doc.createElement("DIV"); - div.id = "readability-page-1"; - div.className = "page"; - while (articleContent.firstChild) { - div.appendChild(articleContent.firstChild); - } - articleContent.appendChild(div); - } - - if (this._debug) - this.log("Article content after paging: " + articleContent.innerHTML); - - var parseSuccessful = true; - - // Now that we've gone through the full algorithm, check to see if - // we got any meaningful content. If we didn't, we may need to re-run - // grabArticle with different flags set. This gives us a higher likelihood of - // finding the content, and the sieve approach gives us a higher likelihood of - // finding the -right- content. - var textLength = this._getInnerText(articleContent, true).length; - if (textLength < this._charThreshold) { - parseSuccessful = false; - page.innerHTML = pageCacheHtml; - - if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { - this._removeFlag(this.FLAG_STRIP_UNLIKELYS); - this._attempts.push({articleContent: articleContent, textLength: textLength}); - } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { - this._removeFlag(this.FLAG_WEIGHT_CLASSES); - this._attempts.push({articleContent: articleContent, textLength: textLength}); - } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { - this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); - this._attempts.push({articleContent: articleContent, textLength: textLength}); - } else { - this._attempts.push({articleContent: articleContent, textLength: textLength}); - // No luck after removing flags, just return the longest text we found during the different loops - this._attempts.sort(function (a, b) { - return b.textLength - a.textLength; - }); - - // But first check if we actually have something - if (!this._attempts[0].textLength) { - return null; - } - - articleContent = this._attempts[0].articleContent; - parseSuccessful = true; - } - } - - if (parseSuccessful) { - // Find out text direction from ancestors of final top candidate. - var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); - this._someNode(ancestors, function(ancestor) { - if (!ancestor.tagName) - return false; - var articleDir = ancestor.getAttribute("dir"); - if (articleDir) { - this._articleDir = articleDir; - return true; - } - return false; - }); - return articleContent; - } - } - }, - - /** - * Check whether the input string could be a byline. - * This verifies that the input is a string, and that the length - * is less than 100 chars. - * - * @param possibleByline {string} - a string to check whether its a byline. - * @return Boolean - whether the input string is a byline. 
- */ - _isValidByline: function(byline) { - if (typeof byline == "string" || byline instanceof String) { - byline = byline.trim(); - return (byline.length > 0) && (byline.length < 100); - } - return false; - }, - - /** - * Converts some of the common HTML entities in string to their corresponding characters. - * - * @param str {string} - a string to unescape. - * @return string without HTML entity. - */ - _unescapeHtmlEntities: function(str) { - if (!str) { - return str; - } - - var htmlEscapeMap = this.HTML_ESCAPE_MAP; - return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { - return htmlEscapeMap[tag]; - }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { - var num = parseInt(hex || numStr, hex ? 16 : 10); - return String.fromCharCode(num); - }); - }, - - /** - * Try to extract metadata from JSON-LD object. - * For now, only Schema.org objects of type Article or its subtypes are supported. - * @return Object with any metadata that could be extracted (possibly none) - */ - _getJSONLD: function (doc) { - var scripts = this._getAllNodesWithTag(doc, ["script"]); - - var metadata; - - this._forEachNode(scripts, function(jsonLdElement) { - if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { - try { - // Strip CDATA markers if present - var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); - var parsed = JSON.parse(content); - if ( - !parsed["@context"] || - !parsed["@context"].match(/^https?\:\/\/schema\.org$/) - ) { - return; - } - - if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { - parsed = parsed["@graph"].find(function(it) { - return (it["@type"] || "").match( - this.REGEXPS.jsonLdArticleTypes - ); - }); - } - - if ( - !parsed || - !parsed["@type"] || - !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) - ) { - return; - } - - metadata = {}; - - if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { - // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz - // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either - // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. 
- - var title = this._getArticleTitle(); - var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; - var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; - - if (headlineMatches && !nameMatches) { - metadata.title = parsed.headline; - } else { - metadata.title = parsed.name; - } - } else if (typeof parsed.name === "string") { - metadata.title = parsed.name.trim(); - } else if (typeof parsed.headline === "string") { - metadata.title = parsed.headline.trim(); - } - if (parsed.author) { - if (typeof parsed.author.name === "string") { - metadata.byline = parsed.author.name.trim(); - } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { - metadata.byline = parsed.author - .filter(function(author) { - return author && typeof author.name === "string"; - }) - .map(function(author) { - return author.name.trim(); - }) - .join(", "); - } - } - if (typeof parsed.description === "string") { - metadata.excerpt = parsed.description.trim(); - } - if ( - parsed.publisher && - typeof parsed.publisher.name === "string" - ) { - metadata.siteName = parsed.publisher.name.trim(); - } - return; - } catch (err) { - this.log(err.message); - } - } - }); - return metadata ? metadata : {}; - }, - - /** - * Attempts to get excerpt and byline metadata for the article. - * - * @param {Object} jsonld — object containing any metadata that - * could be extracted from JSON-LD object. - * - * @return Object with optional "excerpt" and "byline" properties - */ - _getArticleMetadata: function(jsonld) { - var metadata = {}; - var values = {}; - var metaElements = this._doc.getElementsByTagName("meta"); - - // property is a space-separated list of values - var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; - - // name is a single value - var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; - - // Find description tags. - this._forEachNode(metaElements, function(element) { - var elementName = element.getAttribute("name"); - var elementProperty = element.getAttribute("property"); - var content = element.getAttribute("content"); - if (!content) { - return; - } - var matches = null; - var name = null; - - if (elementProperty) { - matches = elementProperty.match(propertyPattern); - if (matches) { - // Convert to lowercase, and remove any whitespace - // so we can match below. - name = matches[0].toLowerCase().replace(/\s/g, ""); - // multiple authors - values[name] = content.trim(); - } - } - if (!matches && elementName && namePattern.test(elementName)) { - name = elementName; - if (content) { - // Convert to lowercase, remove any whitespace, and convert dots - // to colons so we can match below. 
- name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); - values[name] = content.trim(); - } - } - }); - - // get title - metadata.title = jsonld.title || - values["dc:title"] || - values["dcterm:title"] || - values["og:title"] || - values["weibo:article:title"] || - values["weibo:webpage:title"] || - values["title"] || - values["twitter:title"]; - - if (!metadata.title) { - metadata.title = this._getArticleTitle(); - } - - // get author - metadata.byline = jsonld.byline || - values["dc:creator"] || - values["dcterm:creator"] || - values["author"]; - - // get description - metadata.excerpt = jsonld.excerpt || - values["dc:description"] || - values["dcterm:description"] || - values["og:description"] || - values["weibo:article:description"] || - values["weibo:webpage:description"] || - values["description"] || - values["twitter:description"]; - - // get site name - metadata.siteName = jsonld.siteName || - values["og:site_name"]; - - // in many sites the meta value is escaped with HTML entities, - // so here we need to unescape it - metadata.title = this._unescapeHtmlEntities(metadata.title); - metadata.byline = this._unescapeHtmlEntities(metadata.byline); - metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); - metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); - - return metadata; - }, - - /** - * Check if node is image, or if node contains exactly only one image - * whether as a direct child or as its descendants. - * - * @param Element - **/ - _isSingleImage: function(node) { - if (node.tagName === "IMG") { - return true; - } - - if (node.children.length !== 1 || node.textContent.trim() !== "") { - return false; - } - - return this._isSingleImage(node.children[0]); - }, - - /** - * Find all