Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add owner_id to extractors #44

Merged
merged 14 commits into from
Mar 20, 2024
5 changes: 5 additions & 0 deletions backend/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ class Extractor(TimestampedModel):
server_default="",
comment="The name of the extractor.",
)
owner_id = Column(
UUID(as_uuid=True),
nullable=False,
comment="Owner uuid.",
)
schema = Column(
JSONB,
nullable=False,
Expand Down
3 changes: 2 additions & 1 deletion backend/server/api/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from db.models import Extractor, get_session
from extraction.parsing import parse_binary_input
from server.extraction_runnable import ExtractResponse, extract_entire_document
from server.models import DEFAULT_MODEL
from server.retrieval import extract_from_content

router = APIRouter(
Expand All @@ -24,7 +25,7 @@ async def extract_using_existing_extractor(
text: Optional[str] = Form(None),
mode: Literal["entire_document", "retrieval"] = Form("entire_document"),
file: Optional[UploadFile] = File(None),
model_name: Optional[str] = Form("default"),
model_name: Optional[str] = Form(DEFAULT_MODEL),
session: Session = Depends(get_session),
) -> ExtractResponse:
"""Endpoint that is used with an existing extractor.
Expand Down
3 changes: 3 additions & 0 deletions backend/server/api/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class CreateExtractor(BaseModel):

name: str = Field(default="", description="The name of the extractor.")

owner_id: UUID = Field(..., description="The UUID of the owner of the extractor.")

description: str = Field(
default="", description="Short description of the extractor."
)
Expand Down Expand Up @@ -110,6 +112,7 @@ def create(

instance = Extractor(
name=create_request.name,
owner_id=create_request.owner_id,
schema=create_request.json_schema,
description=create_request.description,
instruction=create_request.instruction,
Expand Down
4 changes: 2 additions & 2 deletions backend/server/extraction_runnable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from db.models import Example, Extractor
from extraction.utils import update_json_schema
from server.models import get_chunk_size, get_model
from server.models import DEFAULT_MODEL, get_chunk_size, get_model
from server.validators import validate_json_schema


Expand Down Expand Up @@ -188,7 +188,7 @@ async def extract_entire_document(
text_splitter = TokenTextSplitter(
chunk_size=get_chunk_size(model_name),
chunk_overlap=20,
model_name=model_name,
model_name=DEFAULT_MODEL,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated bug. For now, we'll use the GPT-3.5 tokenizer everywhere.

)
texts = text_splitter.split_text(content)
extraction_requests = [
Expand Down
22 changes: 18 additions & 4 deletions backend/tests/unit_tests/api/test_api_defining_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ async def test_extractors_api() -> None:
assert response.json() == []

# Verify that we can create an extractor
owner_id = str(uuid.uuid4())
create_request = {
"owner_id": owner_id,
"description": "Test Description",
"schema": {"type": "object"},
"instruction": "Test Instruction",
Expand All @@ -39,6 +41,7 @@ async def test_extractors_api() -> None:

# Verify that we can create an extractor
create_request = {
"owner_id": owner_id,
"description": "Test Description",
"schema": {"type": "object"},
"instruction": "Test Instruction",
Expand All @@ -62,14 +65,23 @@ async def test_extractors_api() -> None:
assert get_response.status_code == 200
assert get_response.json() == []

# Verify that we can create an extractor
# Verify that we can create an extractor, including other properties
owner_id = str(uuid.uuid4())
create_request = {
"name": "my extractor",
"owner_id": owner_id,
"description": "Test Description",
"schema": {"type": "object"},
"instruction": "Test Instruction",
}
response = await client.post("/extractors", json=create_request)
extractor_uuid = response.json()["uuid"]
assert response.status_code == 200
response = await client.get(f"/extractors/{extractor_uuid}")
response_data = response.json()
assert extractor_uuid == response_data["uuid"]
assert "my extractor" == response_data["name"]
assert "owner_id" not in response_data


async def test_sharing_extractor() -> None:
Expand All @@ -79,25 +91,27 @@ async def test_sharing_extractor() -> None:
assert response.status_code == 200
assert response.json() == []
# Verify that we can create an extractor
owner_id = str(uuid.uuid4())
create_request = {
"name": "Test Name",
"owner_id": owner_id,
"description": "Test Description",
"schema": {"type": "object"},
"instruction": "Test Instruction",
}
response = await client.post("/extractors", json=create_request)
assert response.status_code == 200

uuid = response.json()["uuid"]
uuid_str = response.json()["uuid"]

# Share the extractor and verify that a share_uuid is returned
response = await client.post(f"/extractors/{uuid}/share")
response = await client.post(f"/extractors/{uuid_str}/share")
assert response.status_code == 200
assert "share_uuid" in response.json()
share_uuid = response.json()["share_uuid"]

# Test idempotency
response = await client.post(f"/extractors/{uuid}/share")
response = await client.post(f"/extractors/{uuid_str}/share")
assert response.status_code == 200
assert "share_uuid" in response.json()
assert response.json()["share_uuid"] == share_uuid
Expand Down
3 changes: 3 additions & 0 deletions backend/tests/unit_tests/api/test_api_examples.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Code to test API endpoints."""
import uuid

from tests.db import get_async_client

Expand All @@ -14,9 +15,11 @@ async def test_examples_api() -> None:
"""Runs through a set of API calls to test the examples API."""
async with get_async_client() as client:
# First create an extractor
owner_id = str(uuid.uuid4())
create_request = {
"description": "Test Description",
"name": "Test Name",
"owner_id": owner_id,
"schema": {"type": "object"},
"instruction": "Test Instruction",
}
Expand Down
4 changes: 3 additions & 1 deletion backend/tests/unit_tests/api/test_api_extract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Code to test API endpoints."""
import tempfile
from unittest.mock import patch
from uuid import UUID
from uuid import UUID, uuid4

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import FakeEmbeddings
Expand Down Expand Up @@ -53,8 +53,10 @@ async def test_extract_from_file() -> None:
assert response.status_code == 404, response.text

# First create an extractor
owner_id = str(uuid4())
create_request = {
"name": "Test Name",
"owner_id": owner_id,
"description": "Test Description",
"schema": {"type": "object"},
"instruction": "Test Instruction",
Expand Down
43 changes: 21 additions & 22 deletions docs/source/notebooks/earnings_call_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
}
],
"source": [
"from uuid import uuid4\n",
"\n",
"from pydantic import BaseModel, Field\n",
"\n",
"class FinancialData(BaseModel):\n",
Expand All @@ -94,7 +96,10 @@
" period_duration: int = Field(..., description=\"Duration of period, in months\")\n",
" evidence: str = Field(..., description=\"Verbatim sentence of text where figure was found.\")\n",
"\n",
"owner_id = str(uuid4())\n",
"\n",
"data = {\n",
" \"owner_id\": owner_id,\n",
" \"description\": \"Financial revenues and other figures.\",\n",
" \"schema\": FinancialData.schema(),\n",
" \"instruction\": (\n",
Expand All @@ -117,7 +122,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'uuid': 'e02e30d6-af42-4783-8b5f-f94cf356cb56'}\n"
"{'uuid': '6f11dec7-571b-49e5-affd-811527a36708'}\n"
]
}
],
Expand Down Expand Up @@ -173,19 +178,13 @@
"{'data': [{'name': 'Adjusted EBITDA',\n",
" 'scale': 'million',\n",
" 'value': 1300,\n",
" 'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
" 'evidence': 'We maintained our focus on operational efficiency and disciplined expense management, which contributed to all-time high Adjusted EBITDA of $1.3 billion (note: foreign exchange was a $30 million YoY tailwind).',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'GAAP operating income',\n",
" 'scale': 'million',\n",
" 'value': 652,\n",
" 'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'Gross Bookings',\n",
" 'scale': 'billion',\n",
" 'value': 37.6,\n",
" 'evidence': 'Both Gross Bookings and Adjusted EBITDA surpassed the high end of our Q4 outlook. Gross Bookings growth accelerated to 21% YoY on a constant-currency basis (23% excluding Freight), as we generated Gross Bookings of $37.6 billion.',\n",
" 'evidence': 'In Q4, we also improved our GAAP operating profitability, with income from operations of $652 million, compared to $394 million and a loss of $142 million in Q3 2023 and Q4 2022, respectively.',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'Revenue',\n",
Expand All @@ -195,9 +194,9 @@
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'Adjusted EBITDA',\n",
" 'scale': 'million',\n",
" 'value': 1260,\n",
" 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n",
" 'scale': '$',\n",
" 'value': 1260000000,\n",
" 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n",
" 'period_start': '2023-01-01',\n",
" 'period_duration': 12}]}"
]
Expand Down Expand Up @@ -309,22 +308,22 @@
{
"data": {
"text/plain": [
"{'data': [{'name': 'revenue',\n",
" 'scale': 'B',\n",
" 'value': 9900,\n",
" 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n",
" 'period_start': '2023-01-01',\n",
" 'period_duration': 12},\n",
" {'name': 'adjusted ebitda',\n",
"{'data': [{'name': 'adjusted ebitda',\n",
" 'scale': 'MM',\n",
" 'value': 1300,\n",
" 'evidence': 'We maintained our focus on operational efficiency and disciplined expense management, which contributed to all-time high Adjusted EBITDA of $1.3 billion.',\n",
" 'period_start': '2023-01-01',\n",
" 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'revenue',\n",
" 'scale': 'MM',\n",
" 'value': 9900,\n",
" 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'gaap operating income',\n",
" 'scale': 'MM',\n",
" 'value': 652,\n",
" 'evidence': 'In Q4, we also improved our GAAP operating profitability, with income from operations of $652 million.',\n",
" 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n",
" 'period_start': '2023-10-01',\n",
" 'period_duration': 3},\n",
" {'name': 'adjusted ebitda',\n",
Expand Down
Loading