From e5c518630c1bf27843aa20e8f6b3bf5fb7088768 Mon Sep 17 00:00:00 2001 From: blythed Date: Sat, 2 Dec 2023 22:47:53 +0100 Subject: [PATCH] Fix links and re-type auto-ibis meta-data --- README.md | 14 +- deploy/app_template/app_template.py | 28 +- deploy/testenv/preload.py | 1 + .../content/docs/data_integrations/mongodb.md | 4 +- .../fundamentals/component_abstraction.md | 2 +- .../docs/fundamentals/component_versioning.md | 5 +- .../docs/fundamentals/datalayer_overview.md | 8 +- .../procedural_vs_declarative_api.md | 2 +- .../fundamentals/vector_search_algorithm.md | 2 +- .../hr/content/docs/get_started/quickstart.md | 12 +- .../developer_vs_production_mode.md | 6 +- docs/hr/content/docs/setup/connecting.md | 2 +- .../content/docs/walkthrough/apply_models.md | 2 +- .../content/docs/walkthrough/vector_search.md | 2 +- .../chat_with_your_database.md | 287 ++++++++++++++++++ examples/chat_with_your_database.ipynb | 63 +++- superduperdb/base/build.py | 26 +- 17 files changed, 400 insertions(+), 66 deletions(-) create mode 100644 docs/hr/content/use_cases/question-answering/chat_with_your_database.md diff --git a/README.md b/README.md index aef415752..c27e2b642 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ SuperDuperDB eliminates the need for complex MLOps pipelines and specialized vec ### Key Features: -- **[Integration of AI with your existing data infrastructure](https://docs.superduperdb.com/docs/docs/apply_models):** Integrate any AI models and APIs with your databases in a single scalable deployment, without the need for additional pre-processing steps, ETL or boilerplate code. -- **[Streaming Inference](https://docs.superduperdb.com/docs/docs/daemonizing_models_with_listeners):** Have your models compute outputs automatically and immediately as new data arrives, keeping your deployment always up-to-date. -- **[Scalable Model Training](https://docs.superduperdb.com/docs/docs/training_models):** Train AI models on large, diverse datasets simply by querying your training data. Ensured optimal performance via in-build computational optimizations. -- **[Model Chaining](https://docs.superduperdb.com/docs/docs/linking_interdependent_models)**: Easily setup complex workflows by connecting models and APIs to work together in an interdependent and sequential manner. -- **[Simple, but Extendable Interface](https://docs.superduperdb.com/docs/docs/procedural_vs_declarative_api)**: Add and leverage any function, program, script or algorithm from the Python ecosystem to enhance your workflows and applications. Drill down on any layer as deep as it gets, up until the inner workings of your models while operating SuperDuperDB with simple Python commands. -- **[Difficult Data-Types](https://docs.superduperdb.com/docs/docs/encoding_special_data_types)**: Work directly with images, video, audio in your datastore, and any type which can be encoded as `bytes` in Python. +- **[Integration of AI with your existing data infrastructure](https://docs.superduperdb.com/docs/docs/walkthrough/apply_models):** Integrate any AI models and APIs with your databases in a single scalable deployment, without the need for additional pre-processing steps, ETL or boilerplate code. +- **[Streaming Inference](https://docs.superduperdb.com/docs/docs/walkthrough/daemonizing_models_with_listeners):** Have your models compute outputs automatically and immediately as new data arrives, keeping your deployment always up-to-date. 
+- **[Scalable Model Training](https://docs.superduperdb.com/docs/docs/walkthrough/training_models):** Train AI models on large, diverse datasets simply by querying your training data. Ensured optimal performance via in-build computational optimizations. +- **[Model Chaining](https://docs.superduperdb.com/docs/docs/walkthrough/linking_interdependent_models/)**: Easily setup complex workflows by connecting models and APIs to work together in an interdependent and sequential manner. +- **[Simple, but Extendable Interface](https://docs.superduperdb.com/docs/docs/fundamentals/procedural_vs_declarative_api)**: Add and leverage any function, program, script or algorithm from the Python ecosystem to enhance your workflows and applications. Drill down on any layer as deep as it gets, up until the inner workings of your models while operating SuperDuperDB with simple Python commands. +- **[Difficult Data-Types](https://docs.superduperdb.com/docs/docs/walkthrough/encoding_special_data_types/)**: Work directly with images, video, audio in your datastore, and any type which can be encoded as `bytes` in Python. - **[Feature Storing](https://docs.superduperdb.com/docs/docs/mongodb_query_API#inserts):** Turn your database into a centralized repository for storing and managing inputs and outputs of AI models of arbitrary data-types, making them available in a structured format and known environment. -- **[Vector Search](https://docs.superduperdb.com/docs/docs/vector_search):** No need for duplicating and migrating your data to additional specialized vector databases - turn your existing battle-tested datastore into a fully-fledged multi-modal vector-search database, including easy generation of vector embeddings and vector indexes of your data with preferred models and APIs. +- **[Vector Search](https://docs.superduperdb.com/docs/docs/walkthrough/vector_search):** No need for duplicating and migrating your data to additional specialized vector databases - turn your existing battle-tested datastore into a fully-fledged multi-modal vector-search database, including easy generation of vector embeddings and vector indexes of your data with preferred models and APIs. ### Why opt for SuperDuperDB? 
|| With SuperDuperDB | Without | diff --git a/deploy/app_template/app_template.py b/deploy/app_template/app_template.py index f1fadcb85..df52d0777 100644 --- a/deploy/app_template/app_template.py +++ b/deploy/app_template/app_template.py @@ -14,17 +14,23 @@ @app.get("/") def show(): - return {"models": db.show('model'), 'listeners': db.show('listener'), 'vector_indexes': db.show('vector_index')} + return { + "models": db.show('model'), + 'listeners': db.show('listener'), + 'vector_indexes': db.show('vector_index'), + } @app.get("/search") def search(input: str): - results = sorted(list( - collection - .like(Document({'': input}), vector_index='', n=20) - .find({}, {'_id': 0}), - key=lambda x: -x['score'], - )) + results = sorted( + list( + collection.like( + Document({'': input}), vector_index='', n=20 + ).find({}, {'_id': 0}), + key=lambda x: -x['score'], + ) + ) return {'results': results} @@ -35,10 +41,10 @@ def predict(input: str): model_name='', input=input, context_select=( - collection - .like(Document({'': input}), vector_index='', n=num_results) - .find() + collection.like( + Document({'': input}), vector_index='', n=num_results + ).find() ), context_key='txt', ) - return {'prediction': output} \ No newline at end of file + return {'prediction': output} diff --git a/deploy/testenv/preload.py b/deploy/testenv/preload.py index 6e0767a38..13a128b4d 100644 --- a/deploy/testenv/preload.py +++ b/deploy/testenv/preload.py @@ -1,2 +1,3 @@ import sys + sys.path.append('./') diff --git a/docs/hr/content/docs/data_integrations/mongodb.md b/docs/hr/content/docs/data_integrations/mongodb.md index ab7e1787b..feeb2fd44 100644 --- a/docs/hr/content/docs/data_integrations/mongodb.md +++ b/docs/hr/content/docs/data_integrations/mongodb.md @@ -55,7 +55,7 @@ db.execute( ) ``` -Read more about vector-search [here](../fundamentals/25_vector_search.mdx). +Read more about vector-search [here](../fundamentals/vector_search_algorithm.md). ## Deletes @@ -68,4 +68,4 @@ db.execute(collection.delete_many({})) Aggregates are exactly as in `pymongo`, with the exception that a `$vectorSearch` stage may be fed with an additional field `'like': Document({...})`, which plays the same role as in selects. -Read more about this in [the vector-search section](../fundamentals/25_vector_search.mdx). +Read more about this in [the vector-search section](../walkthrough/vector_search). diff --git a/docs/hr/content/docs/fundamentals/component_abstraction.md b/docs/hr/content/docs/fundamentals/component_abstraction.md index 824144440..4d827024f 100644 --- a/docs/hr/content/docs/fundamentals/component_abstraction.md +++ b/docs/hr/content/docs/fundamentals/component_abstraction.md @@ -56,7 +56,7 @@ instances. ### `Stack` -A `Stack` is a way of connecting diverse and interoperating sets of functionality. See [here](../walkthrough/28_creating_stacks_of_functionality.md) for more details. +A `Stack` is a way of connecting diverse and interoperating sets of functionality. See [here](../walkthrough/creating_stacks_of_functionality) for more details. 
## Activating components diff --git a/docs/hr/content/docs/fundamentals/component_versioning.md b/docs/hr/content/docs/fundamentals/component_versioning.md index ea1dcf111..270c1e3a3 100644 --- a/docs/hr/content/docs/fundamentals/component_versioning.md +++ b/docs/hr/content/docs/fundamentals/component_versioning.md @@ -4,7 +4,7 @@ sidebar_position: 26 # Component versioning -Whenever a `Component` is created (see [here](../fundamentals/09_component_abstraction.md) for overview of `Component` classes), +Whenever a `Component` is created (see [here](../fundamentals/component_abstraction.md) for overview of `Component` classes), information about that `Component` is saved in the `db.metadata` store. All components come with attributes `.identifier` which is a unique identifying string for that `Component` instance. @@ -47,4 +47,5 @@ When one adds the `VectorIndex` with `db.add(vector_index)`, the sub-components are also versioned, if a version has not already been assigned to those components in the same session. -Read more about `VectorIndex` and vector-searches [here](../fundamentals/25_vector_search.mdx). +Read more about `VectorIndex` and vector-searches [here](../walkthrough/vector_search.md). + diff --git a/docs/hr/content/docs/fundamentals/datalayer_overview.md b/docs/hr/content/docs/fundamentals/datalayer_overview.md index cbd9e46be..34857dfce 100644 --- a/docs/hr/content/docs/fundamentals/datalayer_overview.md +++ b/docs/hr/content/docs/fundamentals/datalayer_overview.md @@ -44,9 +44,9 @@ The databackend typically connects to your database (although `superduperdb` als and dispatches queries written in an query API which is compatible with that databackend, but which also includes additional aspects specific to `superduperdb`. -Read more [here](../walkthrough/11_supported_query_APIs.md). +Read more [here](../data_integrations/supported_query_APIs.md). -The databackend is configured by setting the URI `CFG.databackend` in the [configuration system](../walkthrough/01_configuration.md). +The databackend is configured by setting the URI `CFG.databackend` in the [configuration system](../setup/configuration.md). We support the same databackends as supported by the [`ibis` project](https://ibis-project.org/): @@ -168,7 +168,7 @@ Here are the key methods which you'll use again and again: ### `db.execute` -This method executes a query. For an overview of how this works see [here](../walkthrough/11_supported_query_APIs.md). +This method executes a query. For an overview of how this works see [here](../data_integrations/supported_query_APIs.md). ### `db.add` @@ -196,4 +196,4 @@ Validate your components (mostly models) ### `db.predict` -Infer predictions from models hosted by `superduperdb`. Read more about this and about models [here](../fundamentals/21_apply_models.mdx). +Infer predictions from models hosted by `superduperdb`. Read more about this and about models [here](../walkthrough/apply_models.md). diff --git a/docs/hr/content/docs/fundamentals/procedural_vs_declarative_api.md b/docs/hr/content/docs/fundamentals/procedural_vs_declarative_api.md index 660593f14..6c2310528 100644 --- a/docs/hr/content/docs/fundamentals/procedural_vs_declarative_api.md +++ b/docs/hr/content/docs/fundamentals/procedural_vs_declarative_api.md @@ -54,4 +54,4 @@ db.add( ) ``` -Read more about the `VectorIndex` concept [here](25_vector_search.mdx). +Read more about the `VectorIndex` concept [here](../walkthrough/vector_search.md). 
diff --git a/docs/hr/content/docs/fundamentals/vector_search_algorithm.md b/docs/hr/content/docs/fundamentals/vector_search_algorithm.md index 9559088d1..c9205ae14 100644 --- a/docs/hr/content/docs/fundamentals/vector_search_algorithm.md +++ b/docs/hr/content/docs/fundamentals/vector_search_algorithm.md @@ -50,4 +50,4 @@ The most similar `ids` are retrieved. The `select` part of the query is then tra a similar query which searches within the retrieved `ids`. The full set of results are returned to the client. -Read [here](../walkthrough/vector_search.mdx) about setting up and detailed usage of vector-search. +Read [here](../walkthrough/vector_search.md) about setting up and detailed usage of vector-search. diff --git a/docs/hr/content/docs/get_started/quickstart.md b/docs/hr/content/docs/get_started/quickstart.md index ce2b48f23..ba3e61c76 100644 --- a/docs/hr/content/docs/get_started/quickstart.md +++ b/docs/hr/content/docs/get_started/quickstart.md @@ -12,30 +12,30 @@ Follow these steps to quickly get started: hosted there, can be executed directly in the environment, but can also be cloned in [their original form in the open source repo](https://github.com/SuperDuperDB/superduperdb/tree/main/examples), and executed locally. These notebooks are also described on this documentation website in the [use-cases section](/docs/use-cases). -1. **Get setup** +2. **Get setup** Follow [the installation guide](./installation.md) and check the [minimum-working example](./minimum_working_example.md) to set-up your environment. For more detailed configuration, read the detailed [setup](/docs/category/setup) section. -1. **Dive into the documentation** +3. **Dive into the documentation** Refer to our comprehensive [`README.md`](https://github.com/superDuperDB/) for a high level of SuperDuperDB. The long-form documentation you are reading now provides deeper insights into features, usage, and best practices. -1. **Explore the community apps examples** +4. **Explore the community apps examples** Visit our [`community apps example`](https://github.com/superDuperDB/superduper-community-apps) repository to explore more examples of how SuperDuperDB can enhance your experience. Learn from real-world use cases and implementations. -1. **Grasp the fundamentals** +5. **Grasp the fundamentals** Read through the [`Fundamentals`](../fundamentals/glossary) section to gain a solid understanding of SuperDuperDB's architecture and refer to the [`API References`](https://docs.superduperdb.com/apidocs/source/superduperdb.html) for detailed information on API usage. -1. **Engage with the Community** +6. **Engage with the Community** If you encounter challenges, join our [Slack Channels](https://join.slack.com/t/superduperdb/shared_invite/zt-1zuojj0k0-RjAYBs1TDsvEa7yaFGa6QA) for assistance. Report bugs and share feature requests [by raising an issue]((https://github.com/SuperDuperDB/superduperdb/issues).). Our community is here to support you. You are welcome to join the conversation on our [discussions forum](https://github.com/SuperDuperDB/superduperdb/discussions) and follow our open-source roadmap [here](https://github.com/orgs/SuperDuperDB/projects/1/views/10). -1. **Contribute and Share** +7. **Contribute and Share** Contribute to the SuperDuperDB community by sharing your solutions and experiences. Help us grow by promoting SuperDuperDB to your peers and the wider world. Your involvement is valuable to us! Don't forget to give us a star ⭐! 
diff --git a/docs/hr/content/docs/production/developer_vs_production_mode.md b/docs/hr/content/docs/production/developer_vs_production_mode.md index 13114f7cb..0164c7505 100644 --- a/docs/hr/content/docs/production/developer_vs_production_mode.md +++ b/docs/hr/content/docs/production/developer_vs_production_mode.md @@ -33,8 +33,8 @@ There are several gradations of a more productionized deployment. In the most distributed case we have: - A `jupyter` environment running in its own process -- A [distributed **Dask** cluster](31_non_blocking_dask_jobs.md), with scheduler and workers configured to work with `superduperdb` -- A [**change-data-capture** service](32_change_data_capture.md) -- A [**vector-search** service](33_vector_comparison_service.md), which finds similar vectors, given an input vector +- A [distributed **Dask** cluster](non_blocking_dask_jobs.md), with scheduler and workers configured to work with `superduperdb` +- A [**change-data-capture** service](change_data_capture.md) +- A [**vector-search** service](vector_comparison_service.md), which finds similar vectors, given an input vector In the remainder of this section we describe the use of each of these services \ No newline at end of file diff --git a/docs/hr/content/docs/setup/connecting.md b/docs/hr/content/docs/setup/connecting.md index 124e86804..e154c0440 100644 --- a/docs/hr/content/docs/setup/connecting.md +++ b/docs/hr/content/docs/setup/connecting.md @@ -41,4 +41,4 @@ db = superduper('mongodb://localhost:27018', CFG=CFG) ``` The `db` object is an instance of `superduperdb.base.datalayer.Datalayer`. -The `Datalayer` class handles AI models and communicates with the databackend and associated components. Read more [here](07_datalayer_overview.md). +The `Datalayer` class handles AI models and communicates with the databackend and associated components. Read more [here](../fundamentals/datalayer_overview.md). diff --git a/docs/hr/content/docs/walkthrough/apply_models.md b/docs/hr/content/docs/walkthrough/apply_models.md index acad54273..2985eff97 100644 --- a/docs/hr/content/docs/walkthrough/apply_models.md +++ b/docs/hr/content/docs/walkthrough/apply_models.md @@ -77,7 +77,7 @@ my_model.predict( Under-the-hood, this call creates a `Listener` which is deployed on the query passed to the `.predict` call. -Read more about the `Listener` abstraction [here](22_daemonizing_models_with_listeners.md) +Read more about the `Listener` abstraction [here](daemonizing_models_with_listeners.md) ### Activating models for vector-search with `create_vector_index=True` diff --git a/docs/hr/content/docs/walkthrough/vector_search.md b/docs/hr/content/docs/walkthrough/vector_search.md index 3b53bccb2..26c35671d 100644 --- a/docs/hr/content/docs/walkthrough/vector_search.md +++ b/docs/hr/content/docs/walkthrough/vector_search.md @@ -60,7 +60,7 @@ SuperDuperDB supports queries via: - `pymongo` - `ibis` -Read more about this [here](../walkthrough/11_supported_query_APIs.md). +Read more about this [here](../data_integrations/supported_query_APIs.md). In order to use vector-search in a query, one combines these APIs with the `.like` operator. 
diff --git a/docs/hr/content/use_cases/question-answering/chat_with_your_database.md b/docs/hr/content/use_cases/question-answering/chat_with_your_database.md
new file mode 100644
index 000000000..e87e6ba84
--- /dev/null
+++ b/docs/hr/content/use_cases/question-answering/chat_with_your_database.md
@@ -0,0 +1,287 @@
# Chat with your Database

## Chatting instantly with a 10-million-record SQL database using SuperDuperDB and OpenAI

Imagine chatting with your database using just a few lines of code. Sounds unbelievable, right? Well, believe it! We'll show you how to chat effortlessly with a huge database containing 10 million business records, all with just a few lines of SuperDuperDB code.

Here is the behemoth 10M dataset: [FREE COMPANY DATASET](https://app.snowflake.com/marketplace/listing/GZSTZRRVYL2/people-data-labs-free-company-dataset)

Chatting with a massive dataset like this via the standard RAG pipeline is next to impossible due to the cost and scale. With SuperDuperDB, however, you can achieve the same functionality in just a few lines of code.

You can control the low-level code while enjoying writing the high-level code, so you can extend its capabilities as far as you need! Whether you're using `Snowflake` or any other `SQL` database, we've got your back.

Here's the simplicity of it:
1. Connect using your URI (works with any SQL database).
2. Specify your database/table name.
3. Craft a query in plain English.

You'll not only get results but also clear explanations!

Let SuperDuperDB and OpenAI do the heavy lifting, all within a single prompt. Stay tuned for more exciting features, including prompt chaining!

Let's bring the power of AI into your database! 🚀

```python
# Only one dependency
# %pip install superduperdb

# Import SuperDuperDB and connect your database
from superduperdb import superduper
from superduperdb.backends.ibis.query import RawSQL
```

### Import SuperDuperDB and connect your database

Here we connect to a mega database hosted on `Snowflake`, but this works with any other `SQL` database.

```python
user = "superduperuser"
password = "superduperpassword"
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
database = "FREE_COMPANY_DATASET/PUBLIC"  # DATABASE/SCHEMA

# Here we are using the Snowflake FREE_COMPANY_DATASET with 10 million company records
snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"

# Let's superduper your database
db = superduper(
    snowflake_uri,  # It could be any SQL database
    metadata_store='sqlite:///your_database_name.db',  # We need a persistent metadata store for important info such as jobs. It could be anything, including your own database; here we use a SQLite database. To reuse your data backend instead, simply omit metadata_store.
)
```

### Add OpenAI API Key

If you don't have one, call Sam Altman!

```python
import os
from superduperdb.ext.openai import OpenAIChatCompletion
from IPython.display import Markdown

# Add your OPENAI_API_KEY here, or keep it in os.environ; we will pick it up from the environment
os.environ['OPENAI_API_KEY'] = 'sk-XXX_SAM_ALTMAN_IS_NOT_FIRED_XXX'
```

### Create a helper function to chat with your database

Here you can tweak the prompts, or you can leave them as they are!
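Before defining the helper, here's an optional sanity check (a minimal sketch, assuming the connection above succeeded and the `FREECOMPANYDATASET` table is visible to your user): `RawSQL` sends a raw SQL string through the datalayer, and `.as_pandas()` converts the result into a DataFrame. The helper below leans on exactly this mechanism, both to fetch the table schema and to execute the generated SQL.

```python
# Optional sanity check: run a raw query through the datalayer.
# RawSQL wraps an arbitrary SQL string; .as_pandas() turns the result into a pandas DataFrame.
preview = db.execute(RawSQL('SELECT * FROM FREECOMPANYDATASET LIMIT 3')).as_pandas()
print(preview)
```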
```python
def chat_with_your_database(table_name, query, limit=5):
    # Define the search term
    search_term = f'Write me a SQL query for the table named {table_name}. The query is: {query}'

    # Define the prompt for the OpenAIChatCompletion model
    prompt = (
        'Act as a database administrator and an expert in SQL. You will be helping me write complex SQL queries. I will explain my needs, and you will generate SQL queries against my database. The database is a Snowflake database; please take this into consideration when generating SQL.'
        f' I will provide you with a description of the structure of my tables. You must remember them and use them for generating SQL queries.\n'
        'Here are the tables in CSV format: {context}\n\n'
        f'Generate only the SQL query. Always write "regex_pattern" in every "WHERE" query. Integrate a "LIMIT {limit}" clause into the query. Exclude any text other than the SQL query itself. Do not include markdown "```" or "```sql" at the start or end.'
        'Here\'s the CSV file:\n'
    )

    # Add the OpenAIChatCompletion instance to the database
    db.add(OpenAIChatCompletion(model='gpt-3.5-turbo', prompt=prompt))

    # Use the OpenAIChatCompletion model to generate the SQL query based on the provided context
    output, context = db.predict(
        model_name='gpt-3.5-turbo',
        input=search_term,
        context_select=db.execute(RawSQL(f'DESCRIBE {table_name}')).as_pandas().to_csv()
        # context_select=db.execute(RawSQL(f'SELECT * FROM {table_name} LIMIT 10')).as_pandas().to_csv()  # Use for SQL databases such as Postgres where `DESCRIBE` is not supported.
    )

    try:
        # Attempt to execute the predicted SQL query and retrieve the result as a pandas DataFrame
        # print(output.content)
        query_result = db.execute(RawSQL(output.content)).as_pandas()

        if query_result.empty:
            query_result = "No result found. Please edit your query based on the database. Be specific: keep everything in lowercase, use regex, etc. Rerun the same query several times if necessary."
    except Exception:
        # If an exception occurs, provide a message to guide the user on adjusting their query
        query_result = "Please edit your query based on the database so that we can find you a suitable result. Check your table schema if you encounter issues, and rerun if necessary."

    return query_result
```

### Create another helper function to explain the result

This function explains the result returned by the previous helper.

```python
def explain_the_result(query_result):
    # Define the search term
    try:
        search_term = f'Find business insights from it {query_result.to_csv()}'
    except Exception:
        return "No result found. Please edit your query and run again; be specific. The LLM will catch the error and show you the right result over multiple attempts."

    # Define the prompt for the OpenAIChatCompletion model
    prompt = (
        f'Assume the role of a database analyst. Your objective is to provide accurate business insights based on the provided CSV content. Avoid reproducing the same CSV file or rewriting the SQL query.
Conclude your response with a summary.\n' + 'Context: {context}' + 'Here\'s the CSV file for you to analyze:\n' + ) + + # Add the OpenAIChatCompletion instance to the database + db.add(OpenAIChatCompletion(model='gpt-3.5-turbo', prompt=prompt)) + + # Use the OpenAIChatCompletion model to predict insights based on the provided context + output, context = db.predict( + model_name='gpt-3.5-turbo', + input=search_term, + ) + + try: + # Attempt to format the predicted output as Markdown + query_result = Markdown(output.content) + except: + # If an exception occurs, provide a message to guide the user on adjusting their input + query_result = "Please edit your input based on the dataset so that we can find you a suitable output. Please check your data if you encounter issues." + + return query_result +``` + +### Now let's start chatting with your database. + +Run this multiple times as it will keep its context. Here you just edit the `table_name` and `query` to see the final result. + +```python +# If you see no result, Run this codeblock multiple times to make the gpt-3.5-turbo work better and change your query as well. Idea: start with a simple query. Then make it gradually complex. + +table_name = "FREECOMPANYDATASET" +query = "Find me some company in Germany in Berlin in Dortmund in automotive industry. Keep all in lowercase" + +result = chat_with_your_database(table_name, query) + +result +``` + +Result: +| COUNTRY | FOUNDED | ID | INDUSTRY | LINKEDIN_URL | LOCALITY | NAME | REGION | SIZE | WEBSITE | +|---------|---------|----------------------------|------------|------------------------------------------------|------------------------|--------------------------------|--------|-------|-------------------------------| +| germany | None | theaterscoutings-berlin | automotive | [linkedin.com/company/theaterscoutings-berlin](https://linkedin.com/company/theaterscoutings-berlin) | None | theaterscoutings berlin | berlin | 1-10 | [theaterscoutings-berlin.de](https://theaterscoutings-berlin.de) | +| germany | None | v8-garage | automotive | [linkedin.com/company/v8-garage](https://linkedin.com/company/v8-garage) | None | v8 garage | berlin | 1-10 | [v8-garage.de](https://v8-garage.de) | +| germany | None | dipsales-gmbh | automotive | [linkedin.com/company/dipsales-gmbh](https://linkedin.com/company/dipsales-gmbh) | berlin | dipsales gmbh | berlin | 1-10 | [dipsales.com](https://dipsales.com) | +| germany | None | autohaus-koschnick-gmbh | automotive | [linkedin.com/company/autohaus-koschnick-gmbh](https://linkedin.com/company/autohaus-koschnick-gmbh) | berlin | autohaus koschnick gmbh | berlin | 1-10 | [autohaus-koschnick.de](https://autohaus-koschnick.de) | +| germany | None | samoconsult-gmbh | automotive | [linkedin.com/company/samoconsult-gmbh](https://linkedin.com/company/samoconsult-gmbh) | None | samoconsult gmbh | berlin | 1-10 | [samoconsult.de](https://samoconsult.de) | + +### Let's call the `explain_the_result` function to find insights + +Call the explain_the_result function to analyze and explain the business insights + +```python +# Run multiple times if no result shown +explain_the_result(query_result=result) +``` + +```text +# Result: +1. The automotive industry is well-represented in Berlin, Germany, with multiple companies listed in this dataset. + +2. The companies listed are mostly small, with a size range of 1-10 employees. + +3. The companies' LinkedIn profiles provide an opportunity for networking and connecting with industry professionals. + +4. 
Theaterscoutings Berlin, v8 garage, dipsales gmbh, autohaus koschnick gmbh, and samoconsult gmbh are some of the automotive companies operating in Berlin.

5. These companies primarily focus on automotive-related services, such as theater scouting, garage services, sales, and consultation.

6. The websites of these companies can be visited for further information on their offerings and services.

7. The locality of these companies is primarily Berlin, indicating a concentration of automotive businesses in this area.

Further analysis could be done by examining the specific offerings and competitive landscape of these companies within the automotive industry in Berlin.
```

### Let's generate results on the fly by model chaining

You can also do model-chaining if you only care about the explanations. Here we pull what the dataset knows about a company:

```python
# Run multiple times if no result is shown
table_name = "FREECOMPANYDATASET"
query = "Find me information about BMW company in Germany. Keep all in lowercase."

# The result is generated from your dataset. Tweak the limit parameter if you want more specific results.
explain_the_result(chat_with_your_database(table_name, query, limit=1))
```

```text
## Result
#### Based on the given information, here are some possible business insights:

- **Company Name:** BMW
- **Country:** Germany
- **Founded:** 1916
- **Industry:** Automotive
- **Region:** Bavaria
- **LinkedIn URL:** linkedin.com/company/abrockman
- **Locality:** Munich
- **Size:** 10,001+
- **Website:** N/A (Not provided in the given data)

BMW is one of the oldest automotive companies, founded in 1916 in Germany. This long history indicates its established presence and experience in the industry.

The company has a significant presence in Bavaria, specifically in Munich. This location may serve as its headquarters or a key operational hub.

The size of the company is mentioned as 10,001+, which implies that the company has a large workforce or a substantial number of employees.

The LinkedIn URL provided links to the company's profile on the platform. This indicates that BMW has an active presence on LinkedIn, where it may engage with professionals and potentially recruit talent.

No website is listed for BMW in the given data, which could mean that the company's website was not included or omitted in the dataset.

Please note that these insights are based solely on the information provided and further research is recommended for a comprehensive understanding of the business and its insights.
```

## Let's chat in real time.

### Ask questions, get results.

We've boiled the whole thing down into one function.

Rerun it for new questions; don't worry, it keeps the context!

Let's have one simple interface where you just write your query and see the result. Simple.

```python
# Run multiple times if no result is shown
table_name = "FREECOMPANYDATASET"

# Be innovative and specific here
query = "Find me information about Volkswagen company in Germany. Keep all in lowercase."
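
# The function below chains the two helpers: it generates and runs the SQL, then explains the resulting DataFrame.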
def integrated_result(table_name, query):
    queried_result = chat_with_your_database(table_name, query)
    explained_result = explain_the_result(queried_result)

    display(queried_result, explained_result)

# Showing the result here
integrated_result(table_name, query)
```

Result:

| COUNTRY | FOUNDED | ID | INDUSTRY | LINKEDIN_URL | LOCALITY | NAME | REGION | SIZE | WEBSITE |
|---------|---------|-------------|------------|------------------------------------------------|------------|------------|--------------|--------|--------------------------------------|
| germany | None | volkswagen | automotive | [linkedin.com/company/volkswagen](https://linkedin.com/company/volkswagen) | wolfsburg | volkswagen | niedersachsen | 10001+ | [volkswagen-newsroom.com](https://volkswagen-newsroom.com) |

```text
# The business is Volkswagen, a German automotive company founded in Wolfsburg, Germany.
# The company has a LinkedIn profile at linkedin.com/company/volkswagen.
# It is located in the region of Niedersachsen in Germany.
# Volkswagen is classified as a large company, with a size of 10001+ employees.
# The company's official website is volkswagen-newsroom.com.
```

## Voila! You just had a conversation with your database. Let's take it from here.

This is just the beginning – feel free to customize the prompts for your dataset. One secret tip: mentioning your database schema in the `chat_with_your_database` function improves accuracy by a mile. Another is to give it more data. Anyway, it's yours; play with it. The better you prompt, the better results you get. Our prompt here is a simple one that works for everything! Your journey with SuperDuperDB is in your hands now. Let the exploration begin!

#### Give us a star. We will release more updates to this example, such as visualization and fine-tuning.

diff --git a/examples/chat_with_your_database.ipynb b/examples/chat_with_your_database.ipynb
index f9739d253..c4d8e7965 100644
--- a/examples/chat_with_your_database.ipynb
+++ b/examples/chat_with_your_database.ipynb
@@ -8,9 +8,13 @@
     "\n",
     "### Chatting with Snowflake 10M Dataset instantly with SuperDuperDB & OpenAI\n",
     "\n",
-    "Imagine chatting with your database using just a few lines of code. Sounds unbelievable, right? Well, believe it! We'll show you how you can effortlessly chat with a huge database containing 10 million business records—all with just a few lines of SuperDuperDB code.\n",
+    "Imagine chatting with your database using just a few lines of code. Sounds unbelievable, right? Well, believe it! We'll show you how you can effortlessly chat with a huge database containing 10 million business records—all with just a few lines of SuperDuperDB code. \n",
     "\n",
-    "Let's do this with SuperDuperDB's user-friendly approach. You can control the low level code while enjoying writing the high level code! Whether you're using `Snowflake` or `any other SQL dataset`, we've got your back.\n",
+    "Here is the behemoth 10M dataset: [FREE COMPANY DATASET](https://app.snowflake.com/marketplace/listing/GZSTZRRVYL2/people-data-labs-free-company-dataset)\n",
+    "\n",
+    "Chatting with this type of massive dataset using the standard RAG pipeline is next to impossible due to the cost and scale. However, with SuperDuperDB, you can achieve the same functionality with just a few lines of code.\n",
+    "\n",
+    "You can control the low level code while enjoying writing the high level code, so you can extend its capabilities!
Whether you're using `Snowflake` or `any other SQL dataset`, we've got your back.\n", "\n", "Here's the simplicity of it:\n", "1. Connect using your URI (works with any SQL Database).\n", @@ -32,7 +36,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Only one dependency\n", "# %pip install superduperdb" ] }, @@ -40,7 +43,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Import SuperDuperDB and connect your database" + "### Import SuperDuperDB and connect your database\n", + "\n", + "Here we have connected with a mega database from `Snowflake` but it works with all other `SQL` database. " ] }, { @@ -63,7 +68,7 @@ "# Let's superduper your database\n", "db = superduper(\n", " snowflake_uri,\n", - " metadata_store='mongodb://localhost:27017/documents', # We need a persistent metadata store to store everything. It could be anything including your own database. Here we are using a MongoDB database.\n", + " metadata_store='sqlite:///your_database_name.db', # We need a persistent metadata store to store important infos like job. It could be anything including your own database. Here we are using a SQLite database. You can use your same database as well. In that case you don't have to add metadata_store, we will use the same database as metadata store\n", ")" ] }, @@ -71,7 +76,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Add OpenAI API Key" + "### Add OpenAI API Key\n", + "\n", + "If you don't have any, call Sam Altman!" ] }, { @@ -93,7 +100,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a helper function to chat your database. Here you can tweak the prompts or you can leave here as it is!" + "### Create a helper function to chat your database. \n", + "\n", + "Here you can tweak the prompts or you can leave here as it is!" ] }, { @@ -144,7 +153,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a helper function explain the result" + "### Create another helper function to explain the result\n", + "\n", + "This function will be used to explain the result" ] }, { @@ -190,7 +201,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Here you just edit the `table_name` and `query` to see the final result" + "### Now let's start chatting with your database. \n", + "\n", + "Run this multiple times as it will keep its context. Here you just edit the `table_name` and `query` to see the final result." ] }, { @@ -330,6 +343,8 @@ } ], "source": [ + "# If you see no result, Run this codeblock multiple times to make the gpt-3.5-turbo work better and change your query as well. Idea: start with a simple query. Then make it gradually complex.\n", + "\n", "table_name = \"FREECOMPANYDATASET\"\n", "query = \"Find me some company in germany in berlin in dortmund in automotive industry. Keep all in lower case\"\n", "\n", @@ -373,7 +388,7 @@ ], "source": [ "# Call the explain_the_result function to analyze and explain the business insights\n", - "# Run multiple times if necessary\n", + "## Run multiple times if no result shown\n", "explain_the_result(query_result=result)" ] }, @@ -381,7 +396,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Now you can do model-chaining as well, if you only care about the explanations. Here we found from the dataset about the company " + "### Let's generate result on the fly by model chaining\n", + "\n", + "Now you can do model-chaining as well, if you only care about the explanations. 
Here we found from the dataset about the company " ] }, { @@ -426,6 +443,8 @@ } ], "source": [ + "# Run multiple times if no result shown\n", + "\n", "table_name = \"FREECOMPANYDATASET\"\n", "query = \"Find me information about BMW company in Germany. Keep all in lower case.\"\n", "\n", @@ -437,9 +456,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Final Integrated View\n", + "## Let's chat realtime. \n", "\n", - "Let's have one simple interface. Where you just write your query and see the result." + "### Ask questions, get result.\n", + "\n", + "We just boiled the whole thing in one function.\n", + "\n", + "Rerun this for new questions. Don't worry, it is keeping the context!\n", + "\n", + "Let's have one simple interface. Where you just write your query and see the result. Simple." ] }, { @@ -527,7 +552,10 @@ } ], "source": [ + "# Run multiple times if no result shown\n", "table_name = \"FREECOMPANYDATASET\"\n", + "\n", + "# Be innovative and specific here \n", "query = \"Find me information about Volkswagen company in Germany. Keep all in lower case.\"\n", "\n", "def integrated_result(table_name, query):\n", @@ -544,10 +572,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Unlocking Limitless Potential! \n", + "## Voila! You just had a conversation with your database. Let's take it from here.\n", "\n", "This is just the beginning – feel free to customize prompts for your dataset. One secret tips: Mentioning your database schema in the `chat_your_database` function enhances accuracy by a few mile. Another one is giving more data to it. Anyway, it's yours. Play with it. The better you prompt, the better result you get. This prompt of us is just a simple one works for everything! Your journey with SuperDuperDB is in your hands now. Let the exploration begin!" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Give us a star. We will release more update in this example like visualization, fine tuning, prompt chaining etc." 
+ ] } ], "metadata": { diff --git a/superduperdb/base/build.py b/superduperdb/base/build.py index 15ec84fac..7c18d2ce8 100644 --- a/superduperdb/base/build.py +++ b/superduperdb/base/build.py @@ -51,7 +51,7 @@ def build(uri, mapping, type: str = 'data_backend'): if re.match('^mongodb:\/\/', uri) is not None: name = uri.split('/')[-1] - conn = pymongo.MongoClient( + conn: pymongo.MongoClient = pymongo.MongoClient( uri, serverSelectionTimeoutMS=5000, ) @@ -69,28 +69,32 @@ def build(uri, mapping, type: str = 'data_backend'): name = uri.split('/')[-1] conn = mongomock.MongoClient() return mapping['mongodb'](conn, name) + elif uri.endswith('.csv'): if type == 'metadata': raise ValueError('Cannot build metadata from a CSV file.') import glob + csv_files = glob.glob(uri) - tables = { - re.match('^.*/(.*)\.csv$', csv_file).groups()[0]: pandas.read_csv(csv_file) - for csv_file in csv_files - } - conn = ibis.pandas.connect(tables) - return mapping['ibis'](conn, uri.split('/')[0]) + tables = {} + for csv_file in csv_files: + pattern = re.match('^.*/(.*)\.csv$', csv_file) + assert pattern is not None + tables[pattern.groups()[0]] = pandas.read_csv(csv_file) + ibis_conn = ibis.pandas.connect(tables) + return mapping['ibis'](ibis_conn, uri.split('/')[0]) else: name = uri.split('//')[0] if type == 'data_backend': - conn = ibis.connect(uri) - return mapping['ibis'](conn, name) + ibis_conn = ibis.connect(uri) + return mapping['ibis'](ibis_conn, name) else: assert type == 'metadata' from sqlalchemy import create_engine - conn = create_engine(uri) - return mapping['sqlalchemy'](conn, name) + + sql_conn = create_engine(uri) + return mapping['sqlalchemy'](sql_conn, name) def build_compute(compute):