Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding samples on embeddings architectures #2

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions Embeddings/Architectures/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


# faiss index files
*.index

.vscode/
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.git*
.vscode
local.settings.json
test
.venv
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import logging
import faiss
import numpy as np
import pandas as pd
import json
import os
from azure.data.tables import TableServiceClient
from azure.core.credentials import AzureNamedKeyCredential
import openai
import azure.functions as func

account_name = os.environ["azure_table_account_name"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Azure_Storage_Name" or "Storage_Account_Name"

account_key = os.environ["azure_table_key"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not table key, rather "Azure_Storage_key" or "Storage_Account_Key"

table_name = os.environ["azure_table_name"]

# creating a client to connect to Azure Tables
# (NamedKey auth: storage account name + account key read from app settings above)
credential = AzureNamedKeyCredential(account_name, account_key)
table_service_client = TableServiceClient(endpoint="https://{}.table.core.windows.net/".format(account_name), credential=credential)

# settings for the OpenAI SDK
# api_type "azure" routes the openai package to an Azure OpenAI resource endpoint
openai.api_type = "azure"
openai.api_key = os.environ['open_ai_api_key']
openai.api_base = "https://{}.openai.azure.com/".format(os.environ["open_ai_resource_name"])
# api_version is optional in app settings; fall back to the 2022-12-01 API
openai.api_version = os.getenv('open_ai_api_version') or "2022-12-01"
# name of the Azure OpenAI *deployment* (not the base model name) used for embeddings
embedding_model = os.environ["open_ai_deployment_name"]

def get_openai_embedding(text, model):
    """Embed *text* with the given Azure OpenAI deployment and return a NumPy vector.

    ``model`` is the Azure OpenAI deployment name (passed as ``engine``).
    """
    response = openai.Embedding.create(engine=model, input=text)
    vector = response["data"][0]["embedding"]
    return np.array(vector)

# function to load the data from the Azure Table
def load_data():
    """Fetch every entity from the Azure Table.

    Returns a tuple ``(np_vectors, df)`` where ``np_vectors`` is a float32
    matrix of the stored embeddings and ``df`` is the full entity DataFrame.
    """
    table_client = table_service_client.get_table_client(table_name=table_name)
    df = pd.DataFrame(table_client.list_entities())

    # the "embedding" column holds JSON-encoded lists; parse each row and
    # stack them into one float32 matrix for FAISS
    parsed = [np.array(json.loads(raw)) for raw in df["embedding"]]
    np_vectors = np.asarray(parsed, dtype=np.float32)

    return np_vectors, df

# code outside of main() function executes when the Azure Function is first spun up
# this allows the data and index to be cached across multiple invocations
vectors, df = load_data()

# creating a FAISS index from the embeddings stored in the Azure Table.
# size the index to the actual embedding dimension instead of hard-coding 1024
# (e.g. text-embedding-ada-002 produces 1536-dim vectors)
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: return the n nearest stored embeddings to the input text.

    Expected JSON body:
        text (str): the text to embed and search for
        n (int, optional): number of neighbors to return (default 5)
        force_reload (bool, optional): re-fetch the table and rebuild the index

    Returns a JSON response with the neighbor row indices and their "content" text.
    """
    global index
    global df

    logging.info('Python HTTP trigger function processed a request.')

    req_body = req.get_json()

    # read in the parameters (falsy values fall back to the defaults)
    text = req_body.get('text')
    n = req_body.get('n') or 5
    force_reload = req_body.get('force_reload') or False

    # reload the data if needed; use "is None" rather than "== None" —
    # equality on a faiss index object is not a reliable null check
    if force_reload or index is None:
        vectors, df = load_data()
        # size the index to the data instead of a hard-coded dimension
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)

    # get the embedding from the input text
    embedding = get_openai_embedding(text, embedding_model)

    # find the n most similar vectors to the input vector
    _, similar = index.search(embedding.reshape(1, -1).astype(np.float32), n)

    neighbor_ids = similar[0].tolist()

    # prep the output: positional row indices plus the matching "content" column
    output = {
        "nearest_neighbors": neighbor_ids,
        "text": df.iloc[neighbor_ids]["content"].tolist()
    }

    logging.info(output)

    # return the results object
    return func.HttpResponse(json.dumps(output), mimetype="application/json")
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"methods": [
"get",
"post"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"text": "",
"n": 5,
"force_reload": False
}
96 changes: 96 additions & 0 deletions Embeddings/Architectures/Embeddings with Azure Functions/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Embeddings retrieval with Azure Functions

For scenarios with a small number of embeddings (less than 10,000), the simplest option is to cache the data in memory within your application and only refresh the data when needed. This works well at small scales because the data is small enough to easily fit in memory, can be pulled from the data source in a few seconds, and it only takes a few milliseconds to compare all of the embeddings to the user input.

In this example, we use an Azure Function to find the most similar vectors and an Azure Table to store the data, but you could follow this same pattern with other data stores and compute options.


## Architecture Overview
![Azure Function Architecture](../images/azure_function_architecture.png)

In this approach, we need two main components:
1. A primary data store - in this case, we use an Azure Table but you could also work with Azure SQL, Cosmos DB, or any other data store you prefer.
2. A compute resource - in this case, we use an Azure Function but you could also leverage a similar approach within your existing application or other places such as an App Service or Container Instance. The important thing is that you have compute that can cache data in memory so you don't need to reload the data for every call. When using an Azure Function, it's important to host it on a premium plan or in an App Service plan so that the cache is more persistent.

The Azure function receives the text as an input

1. Convert input text to an embedding
2. Find the `n` most similar embeddings to the input embedding
3. Return the text from the most similar embeddings

Optionally, you could also create your prompt and send it to the completion API within the Azure Function. This would allow you to encapsulate all of the business logic in one place.

## Running the Azure Function

### Setting up your environment

For more details see how to [Create a function in Azure with Python using Visual Studio Code](https://learn.microsoft.com/azure/azure-functions/create-first-function-vs-code-python?pivots=python-mode-configuration)

1. Make sure you have the following items installed:
* The [Azure Functions Core Tools](https://learn.microsoft.com/azure/azure-functions/functions-run-local?tabs=v4) version 4.x.
* [Visual Studio Code](https://code.visualstudio.com/) on one of the supported platforms.
* The [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) for Visual Studio Code.
* The [Azure Functions extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-azurefunctions) for Visual Studio Code

2. Create the following resources on Azure:
* [Azure OpenAI Service]()
* [Azure Function]()
* [Azure Table]()

> Important: Make sure to host your Azure Function on a Premium Plan or in an App Service plan. This is important to ensure that the function's compute is persistent so that you don't have to reload the data as frequently.

3. Add the sample data to the Azure table by running [load_azure_table.ipynb](./load_azure_table.ipynb). If you have your own dataset, you can use that instead too.

4. Update the local.settings.json file to include the following parameters
```json
{
"IsEncrypted": false,
"Values": {
"AzureWebJobsStorage": "",
"FUNCTIONS_WORKER_RUNTIME": "python",
"azure_table_account_name": "",
"azure_table_key": "",
"azure_table_name": "",
"open_ai_resource_name": "",
"open_ai_deployment_name": "",
"open_ai_api_version": "2022-12-01",
"open_ai_api_key": ""
}
}
```

### Running the Azure Function locally

Follow the steps below to run the Azure Function locally.
1. Open Visual Studio code.
2. Open a terminal and navigate to this folder.
```cmd
cd "Embeddings with Azure Functions"
```

3. Install the dependencies.
```cmd
pip install -r requirements.txt
```

4. Run the Azure Function.
```cmd
func start
```

5. Call the Azure Function. You can use [test_azure_function.ipynb](./test_azure_function.ipynb) to test the Azure Function or call the API directly.

```http
POST localhost:7071/api/QueryEmbedding

{
"text": "I want to go to the beach",
"n": 5,
"force_reload": false
}
```

### Deploying the Azure Function


To Deploy your Azure Function, see [deploy the project to Azure](https://learn.microsoft.com/azure/azure-functions/create-first-function-vs-code-python?pivots=python-mode-decorators#deploy-the-project-to-azure).
15 changes: 15 additions & 0 deletions Embeddings/Architectures/Embeddings with Azure Functions/host.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[3.*, 4.0.0)"
}
}
Loading