From 5a771c95d9a186b7e95449cd4f028a0df514b673 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Thu, 29 Feb 2024 19:38:30 -0500 Subject: [PATCH] add example notebook --- .../notebooks/earnings_call_example.ipynb | 369 ++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 docs/source/notebooks/earnings_call_example.ipynb diff --git a/docs/source/notebooks/earnings_call_example.ipynb b/docs/source/notebooks/earnings_call_example.ipynb new file mode 100644 index 0000000..332d45c --- /dev/null +++ b/docs/source/notebooks/earnings_call_example.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1549a9c6-cca7-4028-9f0c-80ee3aa1d4b4", + "metadata": {}, + "source": [ + "# Example: extracting structured data from earnings call transcripts\n", + "\n", + "Most public companies host earnings calls, providing their management opportunities to discuss past financial results and future plans. Natural language transcripts of these calls may contain useful information, but often this information must first be extracted from the document and arranged into a structured form so that it can be analyzed or compared across time periods and other companies.\n", + "\n", + "Here we demonstrate the use of a LLM-powered extraction service on extracting information from Uber's Q4 2023 earnings call. We show the importance of incorporating few-shot learning to accurate extraction in a real-world context.\n", + "\n", + "Uber investor relations makes the prepared remarks for the call available [online](https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf).\n", + "\n", + "First we start our local extraction service, as described in the [README](../../../README.md), and download the PDF document:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "589ea131-e6ae-4605-8c8f-3ccb0f643477", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "url = \"http://localhost:8000\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d7d935fe-4642-4c55-bba4-6dfb8191e4bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Uber transcripts from earnings calls and other events at https://investor.uber.com/news-events/default.aspx\n", + "\n", + "pdf_url = \"https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c704a00e-f663-4bce-b482-984278dad8f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Get PDF bytes\n", + "\n", + "pdf_response = requests.get(pdf_url)\n", + "assert(pdf_response.status_code == 200)\n", + "pdf_bytes = pdf_response.content" + ] + }, + { + "cell_type": "markdown", + "id": "0a10cf8b-f05e-424b-9a72-fb4abfb9e091", + "metadata": {}, + "source": [ + "We next specify the schema of what we intend to extract. Here we specify a record of financial data. We allow the LLM to infer various attributes, such as the time period for the record.\n", + "\n", + "Note that we include an `evidence` attribute, which provides context for the predictions and supports downstream verification of the results.\n", + "\n", + "Once we've defined our schema, we create an extractor by posting it to our database." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3d5a5bb0-4284-4706-98e6-e622bcc3778d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pydantic import BaseModel, Field\n", + "\n", + "class FinancialData(BaseModel):\n", + " name: str = Field(..., description=\"Name of the financial figure, such as revenue.\")\n", + " value: int = Field(..., description=\"Nominal earnings in local currency.\")\n", + " scale: str = Field(..., description=\"Scale of figure, such as MM, B, or percent.\")\n", + " period_start: str = Field(..., description=\"The start of the time period in ISO format.\")\n", + " period_duration: int = Field(..., description=\"Duration of period, in months\")\n", + " evidence: str = Field(..., description=\"Verbatim sentence of text where figure was found.\")\n", + "\n", + "data = {\n", + " \"description\": \"Financial revenues and other figures.\",\n", + " \"schema\": FinancialData.schema(),\n", + " \"instruction\": (\n", + " \"Extract standard financial figures, specifically earnings and \"\n", + " \"revenue figures.\"\n", + " )\n", + "}\n", + "\n", + "response = requests.post(f\"{url}/extractors\", json=data)\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "74b7f3a4-07c1-4cf0-8c75-34d22eb5a661", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'uuid': 'e02e30d6-af42-4783-8b5f-f94cf356cb56'}\n" + ] + } + ], + "source": [ + "extractor = response.json()\n", + "print(extractor)" + ] + }, + { + "cell_type": "markdown", + "id": "fc18f21c-9f73-4f9e-b63d-7d0a198208c9", + "metadata": {}, + "source": [ + "We can now try the extractor on our PDF:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "15a1c7e9-3fcd-42ca-88fb-4802fe841a8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = requests.post(\n", + " f\"{url}/extract\",\n", + " data={\"extractor_id\": extractor[\"uuid\"]},\n", + " files={\"file\": pdf_bytes},\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "894ad738-5f25-4791-a2a6-365d39b583b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': [{'name': 'Adjusted EBITDA',\n", + " 'scale': 'million',\n", + " 'value': 1300,\n", + " 'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", + " 'period_start': '2023-10-01',\n", + " 'period_duration': 3},\n", + " {'name': 'GAAP operating income',\n", + " 'scale': 'million',\n", + " 'value': 652,\n", + " 'evidence': 'Q4 was a standout quarter to cap off a standout year. ... translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", + " 'period_start': '2023-10-01',\n", + " 'period_duration': 3},\n", + " {'name': 'Gross Bookings',\n", + " 'scale': 'billion',\n", + " 'value': 37.6,\n", + " 'evidence': 'Both Gross Bookings and Adjusted EBITDA surpassed the high end of our Q4 outlook. Gross Bookings growth accelerated to 21% YoY on a constant-currency basis (23% excluding Freight), as we generated Gross Bookings of $37.6 billion.',\n", + " 'period_start': '2023-10-01',\n", + " 'period_duration': 3},\n", + " {'name': 'Revenue',\n", + " 'scale': 'billion',\n", + " 'value': 9.9,\n", + " 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n", + " 'period_start': '2023-10-01',\n", + " 'period_duration': 3},\n", + " {'name': 'Adjusted EBITDA',\n", + " 'scale': 'million',\n", + " 'value': 1260,\n", + " 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", + " 'period_start': '2023-01-01',\n", + " 'period_duration': 12}]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.json()" + ] + }, + { + "cell_type": "markdown", + "id": "cdcf8c31-bbb4-40df-b94e-dcda32b975aa", + "metadata": {}, + "source": [ + "We've extracted several records capturing various earnings and revenue figures, and have conformed the records to the desired schema.\n", + "\n", + "We can convey additional instructions to the LLM efficiently via few-shot examples. For example, we can specify how the names of financial metrics should be normalized, or how scales (millions, billions, percentages, etc.) should be represented in different cases.\n", + "\n", + "The `examples` endpoint lets us associate few-shot examples with an extractor. We can specify examples by pairing text inputs with lists of `FinancialData` outputs:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b07d78e9-23d5-49e8-ae6c-e3d4aaca2d4d", + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " {\n", + " \"text\": \"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", + " \"output\": [\n", + " FinancialData(\n", + " name=\"revenue\",\n", + " value=1,\n", + " scale=\"MM\",\n", + " period_start=\"2022-01-01\",\n", + " period_duration=12,\n", + " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", + " ).dict(),\n", + " FinancialData(\n", + " name=\"ebit\",\n", + " value=2,\n", + " scale=\"MM\",\n", + " period_start=\"2022-01-01\",\n", + " period_duration=12,\n", + " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", + " ).dict()\n", + " ],\n", + " },\n", + "]\n", + "\n", + "responses = []\n", + "for example in examples:\n", + " create_request = {\n", + " \"extractor_id\": extractor[\"uuid\"],\n", + " \"content\": example[\"text\"],\n", + " \"output\": example['output'],\n", + " }\n", + " response = requests.post(f\"{url}/examples\", json=create_request)\n", + " responses.append(response)" + ] + }, + { + "cell_type": "markdown", + "id": "271a90c0-d320-4e35-9317-1ea08c5dde15", + "metadata": {}, + "source": [ + "Having posted the examples, we can re-run the extraction:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "efc8041e-e3ca-4705-8d34-7f9b93b1400c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = requests.post(\n", + " f\"{url}/extract\",\n", + " data={\"extractor_id\": extractor[\"uuid\"]},\n", + " files={\"file\": pdf_bytes},\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2101f960-6abd-4fef-9945-bafb449d5435", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': [{'name': 'revenue',\n", + " 'scale': 'B',\n", + " 'value': 9900,\n", + " 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n", + " 'period_start': '2023-01-01',\n", + " 'period_duration': 12},\n", + " {'name': 'adjusted ebitda',\n", + " 'scale': 'MM',\n", + " 'value': 1300,\n", + " 'evidence': 'We maintained our focus on operational efficiency and disciplined expense management, which contributed to all-time high Adjusted EBITDA of $1.3 billion.',\n", + " 'period_start': '2023-01-01',\n", + " 'period_duration': 3},\n", + " {'name': 'gaap operating income',\n", + " 'scale': 'MM',\n", + " 'value': 652,\n", + " 'evidence': 'In Q4, we also improved our GAAP operating profitability, with income from operations of $652 million.',\n", + " 'period_start': '2023-10-01',\n", + " 'period_duration': 3},\n", + " {'name': 'adjusted ebitda',\n", + " 'scale': 'B',\n", + " 'value': 1260,\n", + " 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", + " 'period_start': '2023-01-01',\n", + " 'period_duration': 12}]}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.json()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}