langchain-ai · ccurme · Mar 20, 2024 · Mar 19, 2024 · Mar 19, 2024 · Mar 19, 2024
diff --git a/backend/db/models.py b/backend/db/models.py
@@ -141,6 +141,11 @@ class Extractor(TimestampedModel):
         server_default="",
         comment="The name of the extractor.",
     )
+    owner_id = Column(
+        UUID(as_uuid=True),
+        nullable=False,
+        comment="Owner uuid.",
+    )
     schema = Column(
         JSONB,
         nullable=False,

diff --git a/backend/server/api/extract.py b/backend/server/api/extract.py
@@ -8,6 +8,7 @@
 from db.models import Extractor, get_session
 from extraction.parsing import parse_binary_input
 from server.extraction_runnable import ExtractResponse, extract_entire_document
+from server.models import DEFAULT_MODEL
 from server.retrieval import extract_from_content
 
 router = APIRouter(
@@ -24,7 +25,7 @@ async def extract_using_existing_extractor(
     text: Optional[str] = Form(None),
     mode: Literal["entire_document", "retrieval"] = Form("entire_document"),
     file: Optional[UploadFile] = File(None),
-    model_name: Optional[str] = Form("default"),
+    model_name: Optional[str] = Form(DEFAULT_MODEL),
     session: Session = Depends(get_session),
 ) -> ExtractResponse:
     """Endpoint that is used with an existing extractor.

diff --git a/backend/server/api/extractors.py b/backend/server/api/extractors.py
@@ -22,6 +22,8 @@ class CreateExtractor(BaseModel):
 
     name: str = Field(default="", description="The name of the extractor.")
 
+    owner_id: UUID = Field(..., description="The UUID of the owner of the extractor.")
+
     description: str = Field(
         default="", description="Short description of the extractor."
     )
@@ -110,6 +112,7 @@ def create(
 
     instance = Extractor(
         name=create_request.name,
+        owner_id=create_request.owner_id,
         schema=create_request.json_schema,
         description=create_request.description,
         instruction=create_request.instruction,

diff --git a/backend/server/extraction_runnable.py b/backend/server/extraction_runnable.py
@@ -15,7 +15,7 @@
 
 from db.models import Example, Extractor
 from extraction.utils import update_json_schema
-from server.models import get_chunk_size, get_model
+from server.models import DEFAULT_MODEL, get_chunk_size, get_model
 from server.validators import validate_json_schema
 
 
@@ -188,7 +188,7 @@ async def extract_entire_document(
     text_splitter = TokenTextSplitter(
         chunk_size=get_chunk_size(model_name),
         chunk_overlap=20,
-        model_name=model_name,
+        model_name=DEFAULT_MODEL,
     )
     texts = text_splitter.split_text(content)
     extraction_requests = [

diff --git a/backend/tests/unit_tests/api/test_api_defining_extractors.py b/backend/tests/unit_tests/api/test_api_defining_extractors.py
@@ -13,7 +13,9 @@ async def test_extractors_api() -> None:
         assert response.json() == []
 
         # Verify that we can create an extractor
+        owner_id = str(uuid.uuid4())
         create_request = {
+            "owner_id": owner_id,
             "description": "Test Description",
             "schema": {"type": "object"},
             "instruction": "Test Instruction",
@@ -39,6 +41,7 @@ async def test_extractors_api() -> None:
 
         # Verify that we can create an extractor
         create_request = {
+            "owner_id": owner_id,
             "description": "Test Description",
             "schema": {"type": "object"},
             "instruction": "Test Instruction",
@@ -62,14 +65,23 @@ async def test_extractors_api() -> None:
         assert get_response.status_code == 200
         assert get_response.json() == []
 
-        # Verify that we can create an extractor
+        # Verify that we can create an extractor, including other properties
+        owner_id = str(uuid.uuid4())
         create_request = {
+            "name": "my extractor",
+            "owner_id": owner_id,
             "description": "Test Description",
             "schema": {"type": "object"},
             "instruction": "Test Instruction",
         }
         response = await client.post("/extractors", json=create_request)
+        extractor_uuid = response.json()["uuid"]
         assert response.status_code == 200
+        response = await client.get(f"/extractors/{extractor_uuid}")
+        response_data = response.json()
+        assert extractor_uuid == response_data["uuid"]
+        assert "my extractor" == response_data["name"]
+        assert "owner_id" not in response_data
 
 
 async def test_sharing_extractor() -> None:
@@ -79,25 +91,27 @@ async def test_sharing_extractor() -> None:
         assert response.status_code == 200
         assert response.json() == []
         # Verify that we can create an extractor
+        owner_id = str(uuid.uuid4())
         create_request = {
             "name": "Test Name",
+            "owner_id": owner_id,
             "description": "Test Description",
             "schema": {"type": "object"},
             "instruction": "Test Instruction",
         }
         response = await client.post("/extractors", json=create_request)
         assert response.status_code == 200
 
-        uuid = response.json()["uuid"]
+        uuid_str = response.json()["uuid"]
 
         # Verify that the extractor was created
-        response = await client.post(f"/extractors/{uuid}/share")
+        response = await client.post(f"/extractors/{uuid_str}/share")
         assert response.status_code == 200
         assert "share_uuid" in response.json()
         share_uuid = response.json()["share_uuid"]
 
         # Test idempotency
-        response = await client.post(f"/extractors/{uuid}/share")
+        response = await client.post(f"/extractors/{uuid_str}/share")
         assert response.status_code == 200
         assert "share_uuid" in response.json()
         assert response.json()["share_uuid"] == share_uuid

diff --git a/backend/tests/unit_tests/api/test_api_examples.py b/backend/tests/unit_tests/api/test_api_examples.py
@@ -1,4 +1,5 @@
 """Code to test API endpoints."""
+import uuid
 
 from tests.db import get_async_client
 
@@ -14,9 +15,11 @@ async def test_examples_api() -> None:
     """Runs through a set of API calls to test the examples API."""
     async with get_async_client() as client:
         # First create an extractor
+        owner_id = str(uuid.uuid4())
         create_request = {
             "description": "Test Description",
             "name": "Test Name",
+            "owner_id": owner_id,
             "schema": {"type": "object"},
             "instruction": "Test Instruction",
         }

diff --git a/backend/tests/unit_tests/api/test_api_extract.py b/backend/tests/unit_tests/api/test_api_extract.py
@@ -1,7 +1,7 @@
 """Code to test API endpoints."""
 import tempfile
 from unittest.mock import patch
-from uuid import UUID
+from uuid import UUID, uuid4
 
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.embeddings import FakeEmbeddings
@@ -53,8 +53,10 @@ async def test_extract_from_file() -> None:
         assert response.status_code == 404, response.text
 
         # First create an extractor
+        owner_id = str(uuid4())
         create_request = {
             "name": "Test Name",
+            "owner_id": owner_id,
             "description": "Test Description",
             "schema": {"type": "object"},
             "instruction": "Test Instruction",

diff --git a/docs/source/notebooks/earnings_call_example.ipynb b/docs/source/notebooks/earnings_call_example.ipynb
@@ -84,6 +84,8 @@
     }
    ],
    "source": [
+    "from uuid import uuid4\n",
+    "\n",
     "from pydantic import BaseModel, Field\n",
     "\n",
     "class FinancialData(BaseModel):\n",
@@ -94,7 +96,10 @@
     "    period_duration: int = Field(..., description=\"Duration of period, in months\")\n",
     "    evidence: str = Field(..., description=\"Verbatim sentence of text where figure was found.\")\n",
     "\n",
+    "owner_id = str(uuid4())\n",
+    "\n",
     "data = {\n",
+    "    \"owner_id\": owner_id,\n",
     "    \"description\": \"Financial revenues and other figures.\",\n",
     "    \"schema\": FinancialData.schema(),\n",
     "    \"instruction\": (\n",
@@ -117,7 +122,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'uuid': 'e02e30d6-af42-4783-8b5f-f94cf356cb56'}\n"
+      "{'uuid': '6f11dec7-571b-49e5-affd-811527a36708'}\n"
      ]
     }
    ],
@@ -173,19 +178,13 @@
        "{'data': [{'name': 'Adjusted EBITDA',\n",
        "   'scale': 'million',\n",
        "   'value': 1300,\n",
-       "   'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
+       "   'evidence': 'We maintained our focus on operational efficiency and disciplined expense management, which contributed to all-time high Adjusted EBITDA of $1.3 billion (note: foreign exchange was a $30 million YoY tailwind).',\n",
        "   'period_start': '2023-10-01',\n",
        "   'period_duration': 3},\n",
        "  {'name': 'GAAP operating income',\n",
        "   'scale': 'million',\n",
        "   'value': 652,\n",
-       "   'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
-       "   'period_start': '2023-10-01',\n",
-       "   'period_duration': 3},\n",
-       "  {'name': 'Gross Bookings',\n",
-       "   'scale': 'billion',\n",
-       "   'value': 37.6,\n",
-       "   'evidence': 'Both Gross Bookings and Adjusted EBITDA surpassed the high end of our Q4 outlook. Gross Bookings growth accelerated to 21% YoY on a constant-currency basis (23% excluding Freight), as we generated Gross Bookings of $37.6 billion.',\n",
+       "   'evidence': 'In Q4, we also improved our GAAP operating profitability, with income from operations of $652 million, compared to $394 million and a loss of $142 million in Q3 2023 and Q4 2022, respectively.',\n",
        "   'period_start': '2023-10-01',\n",
        "   'period_duration': 3},\n",
        "  {'name': 'Revenue',\n",
@@ -195,9 +194,9 @@
        "   'period_start': '2023-10-01',\n",
        "   'period_duration': 3},\n",
        "  {'name': 'Adjusted EBITDA',\n",
-       "   'scale': 'million',\n",
-       "   'value': 1260,\n",
-       "   'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n",
+       "   'scale': '$',\n",
+       "   'value': 1260000000,\n",
+       "   'evidence': '● We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n",
        "   'period_start': '2023-01-01',\n",
        "   'period_duration': 12}]}"
       ]
@@ -309,22 +308,22 @@
     {
      "data": {
       "text/plain": [
-       "{'data': [{'name': 'revenue',\n",
-       "   'scale': 'B',\n",
-       "   'value': 9900,\n",
-       "   'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n",
-       "   'period_start': '2023-01-01',\n",
-       "   'period_duration': 12},\n",
-       "  {'name': 'adjusted ebitda',\n",
+       "{'data': [{'name': 'adjusted ebitda',\n",
        "   'scale': 'MM',\n",
        "   'value': 1300,\n",
-       "   'evidence': 'We maintained our focus on operational eﬃciency and disciplined expense management, which contributed to all-time high Adjusted EBITDA of $1.3 billion.',\n",
-       "   'period_start': '2023-01-01',\n",
+       "   'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
+       "   'period_start': '2023-10-01',\n",
+       "   'period_duration': 3},\n",
+       "  {'name': 'revenue',\n",
+       "   'scale': 'MM',\n",
+       "   'value': 9900,\n",
+       "   'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n",
+       "   'period_start': '2023-10-01',\n",
        "   'period_duration': 3},\n",
        "  {'name': 'gaap operating income',\n",
        "   'scale': 'MM',\n",
        "   'value': 652,\n",
-       "   'evidence': 'In Q4, we also improved our GAAP operating proﬁtability, with income from operations of $652 million.',\n",
+       "   'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
        "   'period_start': '2023-10-01',\n",
        "   'period_duration': 3},\n",
        "  {'name': 'adjusted ebitda',\n",