revert notebooks
benofben committed Oct 4, 2023
1 parent 8a5a442 commit ea1f155
Showing 2 changed files with 31 additions and 65 deletions.
61 changes: 10 additions & 51 deletions Lab 7 - Semantic Search/01-text-embedding.ipynb
@@ -164,47 +164,7 @@
"outputs": [],
"source": [
"from vertexai.language_models import TextEmbeddingModel\n",
"from typing import List\n",
"\n",
"EMBEDDING_MODEL = TextEmbeddingModel\n",
"MAX_REQ_PER_MIN = 60\n",
"CHUNK_SIZE = 4000\n",
"CHUNK_OVERLAP = 15\n",
"\n",
"def rate_limit(max_per_minute):\n",
" period = 60 / max_per_minute\n",
" while True:\n",
" before = time.time()\n",
" yield\n",
" after = time.time()\n",
" elapsed = after - before\n",
" sleep_time = max(0, period - elapsed)\n",
" if sleep_time > 0:\n",
" # print(f'Sleeping {sleep_time:.1f} seconds')\n",
" time.sleep(sleep_time)\n",
" \n",
"def embed_documents(texts: List[str]) -> List[List[float]]:\n",
" \"\"\"Call Vertex LLM embedding endpoint for embedding docs\n",
" Args:\n",
" texts: The list of texts to embed.\n",
" Returns:\n",
" List of embeddings, one for each text.\n",
" \"\"\"\n",
" model = EMBEDDING_MODEL.from_pretrained(\"textembedding-gecko@001\")\n",
"\n",
" limiter = rate_limit(MAX_REQ_PER_MIN)\n",
" results = []\n",
" docs = list(texts)\n",
"\n",
" while docs:\n",
" # Working in batches of 2 because the API apparently won't let\n",
" # us send more than 2 documents per request to get embeddings.\n",
" head, docs = docs[:2], docs[2:]\n",
" # print(f'Sending embedding request for: {head!r}')\n",
" chunk = model.get_embeddings(head)\n",
" results.extend(chunk)\n",
" next(limiter)\n",
" return results"
"EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained(\"textembedding-gecko@001\")"
]
},
{
@@ -231,18 +191,19 @@
"\n",
"def create_text_embedding_entries(input_text:str, company_name: str, cusip: str):\n",
" text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size = CHUNK_SIZE,\n",
" chunk_overlap = CHUNK_OVERLAP,\n",
" chunk_size = 2000,\n",
" chunk_overlap = 15,\n",
" length_function = len,\n",
" is_separator_regex = False,\n",
" )\n",
" docs = text_splitter.split_text(input_text)\n",
" res = []\n",
" seq_id = -1\n",
" for d in chunks(docs):\n",
" embeddings = embed_documents(d)\n",
" embeddings = EMBEDDING_MODEL.get_embeddings(d)\n",
" \n",
" # throttle so we don't blow through the quota.\n",
" # time.sleep(1)\n",
" time.sleep(1)\n",
" \n",
" for i in range(len(d)):\n",
" seq_id += 1\n",
@@ -278,12 +239,10 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"import time\n",
"\n",
"# We're hitting the quota, so we're going to sleep for a bit to zero it out for sure, then throttle our calls\n",
"# time.sleep(60)\n",
"time.sleep(60)\n",
"\n",
"count = 0\n",
"embedding_entries = []\n",
@@ -623,9 +582,9 @@
"uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
},
"kernelspec": {
"display_name": "lab (Local)",
"display_name": "Python 3",
"language": "python",
"name": "local-lab"
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
@@ -637,7 +596,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
"version": "3.10.12"
}
},
"nbformat": 4,
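For orientation, here is a minimal sketch of the flow 01-text-embedding.ipynb reverts to: split a filing into chunks, embed the chunks in small batches with the textembedding-gecko@001 model, and pause between batches to stay under the embedding-request quota. The batching helper and batch size below are illustrative assumptions (the notebook's own chunks() helper is not shown in this diff); the chunk settings mirror the values visible above.

# Sketch only: chunk a document and embed the chunks with Vertex AI,
# sleeping between batches to avoid exhausting the embedding quota.
import time
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from vertexai.language_models import TextEmbeddingModel

EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

def batched(items: List[str], size: int = 5):
    # Hypothetical stand-in for the notebook's chunks() helper, which this diff does not show.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def embed_text(input_text: str) -> List[List[float]]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,        # matches the reverted cell above
        chunk_overlap=15,
        length_function=len,
        is_separator_regex=False,
    )
    docs = splitter.split_text(input_text)
    vectors: List[List[float]] = []
    for group in batched(docs):
        embeddings = EMBEDDING_MODEL.get_embeddings(group)
        vectors.extend(e.values for e in embeddings)
        time.sleep(1)  # throttle so we don't blow through the quota
    return vectors

Compared with the removed rate_limit() generator, the reverted notebook relies on a fixed one-second sleep between requests, which is simpler but throttles more coarsely.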
35 changes: 21 additions & 14 deletions Lab 7 - Semantic Search/02-semantic-search.ipynb
@@ -236,7 +236,17 @@
"id": "3479218e-c372-4296-8058-af73eb85096b",
"metadata": {},
"source": [
"As seen above, the cross-encoder finds this passages which are more relevant to the query and ranks them accordingly"
"As seen above, the cross-encoder finds this passage from Bershire Hathaway to be more relevant to the query"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c38a08dc-b443-48ab-8e8e-11eb9bc53062",
"metadata": {},
"outputs": [],
"source": [
"ranked_results['text'][0]"
]
},
{
@@ -326,15 +336,15 @@
"metadata": {},
"outputs": [],
"source": [
"ranked_results['text'][5]"
"ranked_results['text'][4]"
]
},
{
"cell_type": "markdown",
"id": "541c6aad-05b4-4391-9f27-b668a82ae910",
"metadata": {},
"source": [
"The Hybrid search brought in additional results from companies not in vector-only search but has content related to energy, oil & gas. The re-ranker helped rank the results. "
"The Hybrid search brought in additional results like `Martin Marietta Material2` which also has content related to energy, oil & gas. The re-ranker helped rank the results. "
]
},
{
@@ -445,14 +455,11 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccf6be00-7ae2-468f-abb9-c28242c7f9db",
"cell_type": "markdown",
"id": "458e5b2c-8bf6-4755-8e3c-9eaf06bc4096",
"metadata": {},
"outputs": [],
"source": [
"top_mgr = res_df['managerName'][0]\n",
"top_mgr"
"And we can see that our top result is HAHN CAPITAL MANAGEMENT LLC."
]
},
{
Expand All @@ -478,7 +485,7 @@
"WITH m, count(DISTINCT c) AS ownedCompaniesWithDocs\n",
"MATCH (m:Manager {managerName: $managerName})-[:OWNS]->(c:Company)\n",
"RETURN m.managerName AS managerName, ownedCompaniesWithDocs, count(DISTINCT c) AS totalOwnedCompanies\n",
"''', params = {'managerName': top_mgr})"
"''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
]
},
{
Expand Down Expand Up @@ -538,7 +545,7 @@
"MATCH (m0:Manager {managerName: $managerName})-[r:SIMILAR]->(m:Manager)\n",
"RETURN DISTINCT m.managerName AS managerName, r.score AS score\n",
"ORDER BY score DESC LIMIT 10\n",
"''', params = {'managerName': top_mgr})"
"''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
]
},
{
Expand Down Expand Up @@ -568,9 +575,9 @@
"uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
},
"kernelspec": {
"display_name": "lab (Local)",
"display_name": "Python 3",
"language": "python",
"name": "local-lab"
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
@@ -582,7 +589,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
"version": "3.10.12"
}
},
"nbformat": 4,
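Similarly, a minimal sketch of the step that the markdown cells in 02-semantic-search.ipynb describe: take the hits returned by the hybrid (vector + full-text) query and re-rank them against the query with a cross-encoder, so that the most relevant passage, such as the Berkshire Hathaway one, lands at ranked_results['text'][0]. The cross-encoder model name, the query string, and the placeholder hits are assumptions for illustration; the notebook's actual retrieval and ranking cells are not part of this diff.

# Sketch only: re-rank hybrid (vector + full-text) search hits with a cross-encoder.
import pandas as pd
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed model

def rerank(query: str, hits: list) -> pd.DataFrame:
    # `hits` is assumed to be a list of dicts merged from the vector and full-text queries.
    scores = cross_encoder.predict([(query, h["text"]) for h in hits])
    ranked = pd.DataFrame(hits)
    ranked["score"] = scores
    return ranked.sort_values("score", ascending=False).reset_index(drop=True)

# Illustrative placeholder hits; in the lab these come from the Neo4j queries.
hybrid_hits = [
    {"companyName": "Company A", "text": "Discussion of energy, oil and gas exposure..."},
    {"companyName": "Company B", "text": "Discussion of an unrelated topic..."},
]
ranked_results = rerank("risks related to energy, oil and gas", hybrid_hits)
print(ranked_results["text"][0])  # most relevant passage after re-ranking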
