Skip to content

Commit

Permalink
Rename Daxa to Pebblo, Aggregate size calculation, Initial unit tests (
Browse files Browse the repository at this point in the history
…langchain-ai#12)

* 1. Calculate the size of the page content
* 2. Rename Daxa to Pebblo

* Initial unit tests

Signed-off-by: Rahul Tripathi <[email protected]>

* Update test_imports.py

* Rename Daxa to Pebblo

---------

Signed-off-by: Rahul Tripathi <[email protected]>
Co-authored-by: Rahul Tripathi <[email protected]>
  • Loading branch information
Raj725 and Rahul Tripathi authored Jan 25, 2024
1 parent b4ffd85 commit 358cf78
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
from langchain_community.document_loaders.cube_semantic import CubeSemanticLoader
from langchain_community.document_loaders.datadog_logs import DatadogLogsLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.document_loaders.pebblo import DaxaSafeLoader
from langchain_community.document_loaders.pebblo import PebbloSafeLoader
from langchain_community.document_loaders.diffbot import DiffbotLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.discord import DiscordChatLoader
Expand Down Expand Up @@ -278,7 +278,7 @@
"CubeSemanticLoader",
"DataFrameLoader",
"DatadogLogsLoader",
"DaxaSafeLoader",
"PebbloSafeLoader",
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",
Expand Down
38 changes: 28 additions & 10 deletions libs/community/langchain_community/document_loaders/pebblo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Daxa's safe loader."""
"""Pebblo's safe loader."""

import logging
import os
Expand All @@ -23,7 +23,7 @@
logger = logging.getLogger(__name__)


class DaxaSafeLoader(BaseLoader):
class PebbloSafeLoader(BaseLoader):
def __init__(
self,
langchain_loader: BaseLoader,
Expand All @@ -41,16 +41,17 @@ def __init__(
self.owner = owner
self.description = description
self.source_path = get_loader_full_path(self.loader)
self.source_owner = DaxaSafeLoader.get_file_owner_from_path(self.source_path)
self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path)
self.docs = []
loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
self.source_type = get_loader_type(loader_name)
self.source_size = self.get_source_size(self.source_path)
self.source_path_size = self.get_source_size(self.source_path)
self.source_aggr_size = 0
self.loader_details = {
"loader": loader_name,
"source_path": self.source_path,
"source_type": self.source_type,
"source_size": self.source_size,
"source_path_size": self.source_path_size,
}
# generate app
self.app = self._get_app_details()
Expand Down Expand Up @@ -97,15 +98,18 @@ def _send_loader_doc(self, loading_end=False):
docs = []
for doc in doc_content:
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
doc_source_owner = DaxaSafeLoader.get_file_owner_from_path(doc_source_path)
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
doc_source_size = self.get_source_size(doc_source_path)
page_content = doc.get("page_content")
page_content_size = self.calculate_content_size(page_content)
self.source_aggr_size += page_content_size
docs.append(
{
"doc": doc.get("page_content"),
"doc": page_content,
"source_path": doc_source_path,
"last_modified": doc.get("metadata", {}).get("last_modified"),
"file_owner": doc_source_owner,
"source_size": doc_source_size,
"source_path_size": doc_source_size,
}
)
payload = {
Expand All @@ -120,6 +124,7 @@ def _send_loader_doc(self, loading_end=False):
}
if loading_end is True:
payload["loading_end"] = "true"
payload["loader_details"]["source_aggr_size"] = self.source_aggr_size
try:
payload = Doc.model_validate(payload).model_dump(exclude_unset=True)
except AttributeError:
Expand Down Expand Up @@ -150,7 +155,20 @@ def _send_loader_doc(self, loading_end=False):
except Exception as e:
logger.warning(f"An Exception caught in _send_loader_doc.")
if loading_end is True:
DaxaSafeLoader.set_loader_sent()
PebbloSafeLoader.set_loader_sent()

@staticmethod
def calculate_content_size(page_content):
    """Return the size of ``page_content`` in bytes when encoded as UTF-8.

    Args:
        page_content: Text of a loaded document, or ``None`` when the
            document carries no page content.

    Returns:
        Number of bytes in the UTF-8 encoding of ``page_content``;
        ``0`` for ``None`` or an empty string.
    """
    # The content is obtained with ``doc.get("page_content")``, which yields
    # ``None`` for a missing key; count that as zero bytes instead of raising
    # AttributeError on ``None.encode``.
    if page_content is None:
        return 0
    # Byte length, not character count: multi-byte characters weigh more.
    return len(page_content.encode("utf-8"))

def _send_discover(self):
headers = {"Accept": "application/json", "Content-Type": "application/json"}
Expand All @@ -175,7 +193,7 @@ def _send_discover(self):
resp.status_code == HTTPStatus.OK
or resp.status_code == HTTPStatus.BAD_GATEWAY
):
DaxaSafeLoader.set_discover_sent()
PebbloSafeLoader.set_discover_sent()
else:
logger.debug(
f"Received unexpected HTTP response code: {resp.status_code}"
Expand Down
2 changes: 1 addition & 1 deletion libs/community/langchain_community/utilities/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

PLUGIN_VERSION = "0.1.0"
IP_INFO_URL = "https://ipinfo.io/ip"
CLASSIFIER_URL = os.getenv("DAXA_CLASSIFIER_URL", "http://localhost:8000/v1")
CLASSIFIER_URL = os.getenv("PEBBLO_CLASSIFIER_URL", "http://localhost:8000/v1")

file_loader = [
"JSONLoader",
Expand Down
Empty file.
3 changes: 3 additions & 0 deletions libs/community/tests/examples/test_nominal.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
column1,column2,column3
value1,value2,value3
value4,value5,value6
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"CubeSemanticLoader",
"DataFrameLoader",
"DatadogLogsLoader",
"PebbloSafeLoader",
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",
Expand Down
81 changes: 81 additions & 0 deletions libs/community/tests/unit_tests/document_loaders/test_pebblo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import CSVLoader

# Fixture directory: the "examples" folder two levels above this test module.
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.joinpath("examples/"))

def test_pebblo_import() -> None:
    """Check that PebbloSafeLoader is importable from document_loaders."""
    # The import itself is the assertion; the name is intentionally unused.
    from langchain_community.document_loaders import PebbloSafeLoader  # noqa: F401

def test_empty_filebased_loader() -> None:
    """Wrap a CSVLoader over test_empty.csv and expect load() to return no docs."""
    from langchain_community.document_loaders import PebbloSafeLoader

    # Arrange: CSV fixture located inside the examples directory.
    empty_csv = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_empty.csv")
    no_docs: list = []

    # Act: load the fixture through the Pebblo wrapper.
    safe_loader = PebbloSafeLoader(
        CSVLoader(file_path=empty_csv),
        "dummy_app_name",
        "dummy_owner",
        "dummy_description",
    )
    loaded = safe_loader.load()

    # Assert
    assert loaded == no_docs

def test_csv_loader_load_valid_data() -> None:
    """Load test_nominal.csv through PebbloSafeLoader and compare the documents."""
    from langchain_community.document_loaders import PebbloSafeLoader

    # Arrange: one expected Document per data row of the fixture.
    csv_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
    first_row = Document(
        page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
        metadata={"source": csv_path, "row": 0},
    )
    second_row = Document(
        page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
        metadata={"source": csv_path, "row": 1},
    )

    # Act: eager load through the Pebblo wrapper.
    safe_loader = PebbloSafeLoader(
        CSVLoader(file_path=csv_path),
        "dummy_app_name",
        "dummy_owner",
        "dummy_description",
    )
    loaded = safe_loader.load()

    # Assert
    assert loaded == [first_row, second_row]

def test_csv_lazy_load() -> None:
    """Test lazy loading of test_nominal.csv through PebbloSafeLoader."""
    # Setup
    from langchain_community.document_loaders import PebbloSafeLoader

    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
    expected_docs = [
        Document(
            page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
            metadata={"source": file_path, "row": 0},
        ),
        Document(
            page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
            metadata={"source": file_path, "row": 1},
        ),
    ]

    # Exercise
    loader = PebbloSafeLoader(
        CSVLoader(file_path=file_path),
        "dummy_app_name",
        "dummy_owner",
        "dummy_description",
    )

    result: list = []
    for doc in loader.lazy_load():
        # NOTE(review): extend() treats every yielded item as an iterable of
        # Documents; confirm against what PebbloSafeLoader.lazy_load() yields.
        result.extend(doc)

    # Assert
    assert result == expected_docs

0 comments on commit 358cf78

Please sign in to comment.