From ea1f155240a781a190b7d3f12be63e87811a59b3 Mon Sep 17 00:00:00 2001 From: Ben Lackey Date: Wed, 4 Oct 2023 13:04:00 -0400 Subject: [PATCH] revert notebooks --- .../01-text-embedding.ipynb | 61 +++---------------- .../02-semantic-search.ipynb | 35 ++++++----- 2 files changed, 31 insertions(+), 65 deletions(-) diff --git a/Lab 7 - Semantic Search/01-text-embedding.ipynb b/Lab 7 - Semantic Search/01-text-embedding.ipynb index d8d44d4..c25ba53 100644 --- a/Lab 7 - Semantic Search/01-text-embedding.ipynb +++ b/Lab 7 - Semantic Search/01-text-embedding.ipynb @@ -164,47 +164,7 @@ "outputs": [], "source": [ "from vertexai.language_models import TextEmbeddingModel\n", - "from typing import List\n", - "\n", - "EMBEDDING_MODEL = TextEmbeddingModel\n", - "MAX_REQ_PER_MIN = 60\n", - "CHUNK_SIZE = 4000\n", - "CHUNK_OVERLAP = 15\n", - "\n", - "def rate_limit(max_per_minute):\n", - " period = 60 / max_per_minute\n", - " while True:\n", - " before = time.time()\n", - " yield\n", - " after = time.time()\n", - " elapsed = after - before\n", - " sleep_time = max(0, period - elapsed)\n", - " if sleep_time > 0:\n", - " # print(f'Sleeping {sleep_time:.1f} seconds')\n", - " time.sleep(sleep_time)\n", - " \n", - "def embed_documents(texts: List[str]) -> List[List[float]]:\n", - " \"\"\"Call Vertex LLM embedding endpoint for embedding docs\n", - " Args:\n", - " texts: The list of texts to embed.\n", - " Returns:\n", - " List of embeddings, one for each text.\n", - " \"\"\"\n", - " model = EMBEDDING_MODEL.from_pretrained(\"textembedding-gecko@001\")\n", - "\n", - " limiter = rate_limit(MAX_REQ_PER_MIN)\n", - " results = []\n", - " docs = list(texts)\n", - "\n", - " while docs:\n", - " # Working in batches of 2 because the API apparently won't let\n", - " # us send more than 2 documents per request to get embeddings.\n", - " head, docs = docs[:2], docs[2:]\n", - " # print(f'Sending embedding request for: {head!r}')\n", - " chunk = model.get_embeddings(head)\n", - " 
results.extend(chunk)\n", - " next(limiter)\n", - " return results" + "EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained(\"textembedding-gecko@001\")" ] }, { @@ -231,8 +191,8 @@ "\n", "def create_text_embedding_entries(input_text:str, company_name: str, cusip: str):\n", " text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size = CHUNK_SIZE,\n", - " chunk_overlap = CHUNK_OVERLAP,\n", + " chunk_size = 2000,\n", + " chunk_overlap = 15,\n", " length_function = len,\n", " is_separator_regex = False,\n", " )\n", @@ -240,9 +200,10 @@ " res = []\n", " seq_id = -1\n", " for d in chunks(docs):\n", - " embeddings = embed_documents(d)\n", + " embeddings = EMBEDDING_MODEL.get_embeddings(d)\n", + " \n", " # throttle so we don't blow through the quota.\n", - " # time.sleep(1)\n", + " time.sleep(1)\n", " \n", " for i in range(len(d)):\n", " seq_id += 1\n", @@ -278,12 +239,10 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "\n", "import time\n", "\n", "# We're hitting the quota, so we're going to sleep for a bit to zero it out for sure, then throttle our calls\n", - "# time.sleep(60)\n", + "time.sleep(60)\n", "\n", "count = 0\n", "embedding_entries = []\n", @@ -623,9 +582,9 @@ "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111" }, "kernelspec": { - "display_name": "lab (Local)", + "display_name": "Python 3", "language": "python", - "name": "local-lab" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { @@ -637,7 +596,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/Lab 7 - Semantic Search/02-semantic-search.ipynb b/Lab 7 - Semantic Search/02-semantic-search.ipynb index 34dd799..f0fcea1 100644 --- a/Lab 7 - Semantic Search/02-semantic-search.ipynb +++ b/Lab 7 - Semantic Search/02-semantic-search.ipynb @@ -236,7 +236,17 @@ "id": "3479218e-c372-4296-8058-af73eb85096b", "metadata": {}, "source": [ - "As seen 
above, the cross-encoder finds this passages which are more relevant to the query and ranks them accordingly" + "As seen above, the cross-encoder finds this passage from Berkshire Hathaway to be more relevant to the query" ] }, { "cell_type": "code", "execution_count": null, "id": "c38a08dc-b443-48ab-8e8e-11eb9bc53062", "metadata": {}, "outputs": [], "source": [ "ranked_results['text'][0]" ] }, { @@ -326,7 +336,7 @@ "metadata": {}, "outputs": [], "source": [ - "ranked_results['text'][5]" + "ranked_results['text'][4]" ] }, { @@ -334,7 +344,7 @@ "id": "541c6aad-05b4-4391-9f27-b668a82ae910", "metadata": {}, "source": [ - "The Hybrid search brought in additional results from companies not in vector-only search but has content related to energy, oil & gas. The re-ranker helped rank the results. " + "The Hybrid search brought in additional results like `Martin Marietta Material2` which also has content related to energy, oil & gas. The re-ranker helped rank the results. " ] }, { @@ -445,14 +455,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "ccf6be00-7ae2-468f-abb9-c28242c7f9db", + "cell_type": "markdown", + "id": "458e5b2c-8bf6-4755-8e3c-9eaf06bc4096", "metadata": {}, - "outputs": [], "source": [ - "top_mgr = res_df['managerName'][0]\n", - "top_mgr" + "And we can see that our top result is HAHN CAPITAL MANAGEMENT LLC." 
] }, { @@ -478,7 +485,7 @@ "WITH m, count(DISTINCT c) AS ownedCompaniesWithDocs\n", "MATCH (m:Manager {managerName: $managerName})-[:OWNS]->(c:Company)\n", "RETURN m.managerName AS managerName, ownedCompaniesWithDocs, count(DISTINCT c) AS totalOwnedCompanies\n", - "''', params = {'managerName': top_mgr})" + "''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})" ] }, { @@ -538,7 +545,7 @@ "MATCH (m0:Manager {managerName: $managerName})-[r:SIMILAR]->(m:Manager)\n", "RETURN DISTINCT m.managerName AS managerName, r.score AS score\n", "ORDER BY score DESC LIMIT 10\n", - "''', params = {'managerName': top_mgr})" + "''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})" ] }, { @@ -568,9 +575,9 @@ "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111" }, "kernelspec": { - "display_name": "lab (Local)", + "display_name": "Python 3", "language": "python", - "name": "local-lab" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { @@ -582,7 +589,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.10.12" } }, "nbformat": 4,