Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configuration Manager Redesign #1272

Merged
merged 26 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ jobs:
else
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
python -c "import yaml;f = open('evadb/evadb.yml', 'r+');config_obj = yaml.load(f, Loader=yaml.FullLoader);config_obj['experimental']['ray'] = True;f.seek(0);f.write(yaml.dump(config_obj));f.truncate();"
python -c "import evadb;cur=evadb.connect().cursor();cur.query('SET ray=True';)"
else
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ludwig,qdrant,pinecone,chromadb]"
Expand Down
4 changes: 2 additions & 2 deletions apps/pandas_qa/pandas_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ def receive_user_input() -> Dict:

# get OpenAI key if needed
try:
api_key = os.environ["OPENAI_KEY"]
api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
api_key = str(input("🔑 Enter your OpenAI key: "))
os.environ["OPENAI_KEY"] = api_key
os.environ["OPENAI_API_KEY"] = api_key

return user_input

Expand Down
4 changes: 2 additions & 2 deletions apps/youtube_qa/youtube_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,10 @@ def receive_user_input() -> Dict:

# get OpenAI key if needed
try:
api_key = os.environ["OPENAI_KEY"]
api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
api_key = str(input("🔑 Enter your OpenAI key: "))
os.environ["OPENAI_KEY"] = api_key
os.environ["OPENAI_API_KEY"] = api_key

return user_input

Expand Down
2 changes: 1 addition & 1 deletion docs/source/overview/concepts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Here are some illustrative **AI queries** for a ChatGPT-based video question ans
--- The 'transcripts' table has a column called 'text' with the transcript text
--- Since ChatGPT is a built-in function in EvaDB, we don't have to define it
--- We can directly use ChatGPT() in any query
--- We will only need to set the OPENAI_KEY as an environment variable
--- We will only need to set the OPENAI_API_KEY as an environment variable
SELECT ChatGPT('Is this video summary related to Ukraine russia war', text)
FROM TEXT_SUMMARY;

Expand Down
3 changes: 0 additions & 3 deletions docs/source/reference/ai/custom-ai-function.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,6 @@ The following code can be used to create an Object Detection function using Yolo
try_to_import_openai()
import openai

#setting up the key
openai.api_key = ConfigurationManager().get_value("third_party", "OPENAI_KEY")

#getting the data
content = text_df[text_df.columns[0]]
responses = []
Expand Down
2 changes: 1 addition & 1 deletion docs/source/usecases/question-answering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ EvaDB has built-in support for ``ChatGPT`` function from ``OpenAI``. You will ne

# Set OpenAI key
import os
os.environ["OPENAI_KEY"] = "sk-..."
os.environ["OPENAI_API_KEY"] = "sk-..."

.. note::

Expand Down
21 changes: 18 additions & 3 deletions evadb/binder/statement_binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from evadb.catalog.catalog_utils import get_metadata_properties, is_document_table
from evadb.catalog.sql_config import RESTRICTED_COL_NAMES
from evadb.configuration.constants import EvaDB_INSTALLATION_DIR
from evadb.executor.execution_context import Context
from evadb.expression.abstract_expression import AbstractExpression, ExpressionType
from evadb.expression.function_expression import FunctionExpression
from evadb.expression.tuple_value_expression import TupleValueExpression
Expand Down Expand Up @@ -273,6 +274,11 @@ def _bind_tuple_expr(self, node: TupleValueExpression):

@bind.register(FunctionExpression)
def _bind_func_expr(self, node: FunctionExpression):
# setup the context
# we read the GPUs from the catalog and populate in the context
gpus_ids = self._catalog().get_configuration_catalog_value("gpu_ids")
node._context = Context(gpus_ids)

# handle the special case of "extract_object"
if node.name.upper() == str(FunctionType.EXTRACT_OBJECT):
handle_bind_extract_object_function(node, self)
Expand Down Expand Up @@ -340,9 +346,18 @@ def _bind_func_expr(self, node: FunctionExpression):
)
# certain functions take additional inputs like yolo needs the model_name
# these arguments are passed by the user as part of metadata
node.function = lambda: function_class(
**get_metadata_properties(function_obj)
)
# we also handle the special case of ChatGPT where we need to send the
# OpenAPI key as part of the parameter if not provided by the user
properties = get_metadata_properties(function_obj)
if string_comparison_case_insensitive(node.name, "CHATGPT"):
# if the user didn't provide any API_KEY, check if we have one in the catalog
if "OPENAI_API_KEY" not in properties.keys():
openapi_key = self._catalog().get_configuration_catalog_value(
"OPENAI_API_KEY"
)
properties["openai_api_key"] = openapi_key

node.function = lambda: function_class(**properties)
except Exception as e:
err_msg = (
f"{str(e)}. Please verify that the function class name in the "
Expand Down
67 changes: 46 additions & 21 deletions evadb/catalog/catalog_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.
import shutil
from pathlib import Path
from typing import List
from typing import Any, List

from evadb.catalog.catalog_type import (
ColumnType,
Expand All @@ -23,7 +23,6 @@
VideoColumnName,
)
from evadb.catalog.catalog_utils import (
cleanup_storage,
construct_function_cache_catalog_entry,
get_document_table_column_definitions,
get_image_table_column_definitions,
Expand All @@ -46,6 +45,9 @@
truncate_catalog_tables,
)
from evadb.catalog.services.column_catalog_service import ColumnCatalogService
from evadb.catalog.services.configuration_catalog_service import (
ConfigurationCatalogService,
)
from evadb.catalog.services.database_catalog_service import DatabaseCatalogService
from evadb.catalog.services.function_cache_catalog_service import (
FunctionCacheCatalogService,
Expand All @@ -61,23 +63,28 @@
from evadb.catalog.services.index_catalog_service import IndexCatalogService
from evadb.catalog.services.table_catalog_service import TableCatalogService
from evadb.catalog.sql_config import IDENTIFIER_COLUMN, SQLConfig
from evadb.configuration.configuration_manager import ConfigurationManager
from evadb.expression.function_expression import FunctionExpression
from evadb.parser.create_statement import ColumnDefinition
from evadb.parser.table_ref import TableInfo
from evadb.parser.types import FileFormatType
from evadb.third_party.databases.interface import get_database_handler
from evadb.utils.generic_utils import generate_file_path, get_file_checksum
from evadb.utils.generic_utils import (
generate_file_path,
get_file_checksum,
remove_directory_contents,
)
from evadb.utils.logging_manager import logger


class CatalogManager(object):
def __init__(self, db_uri: str, config: ConfigurationManager):
def __init__(self, db_uri: str):
self._db_uri = db_uri
self._sql_config = SQLConfig(db_uri)
self._config = config
self._bootstrap_catalog()
self._db_catalog_service = DatabaseCatalogService(self._sql_config.session)
self._config_catalog_service = ConfigurationCatalogService(
self._sql_config.session
)
self._table_catalog_service = TableCatalogService(self._sql_config.session)
self._column_service = ColumnCatalogService(self._sql_config.session)
self._function_service = FunctionCatalogService(self._sql_config.session)
Expand Down Expand Up @@ -130,10 +137,14 @@ def _clear_catalog_contents(self):
logger.info("Clearing catalog")
# drop tables which are not part of catalog
drop_all_tables_except_catalog(self._sql_config.engine)
# truncate the catalog tables
truncate_catalog_tables(self._sql_config.engine)
# truncate the catalog tables except configuration_catalog
# We do not remove the configuration entries
truncate_catalog_tables(
self._sql_config.engine, tables_not_to_truncate=["configuration_catalog"]
)
# clean up the dataset, index, and cache directories
cleanup_storage(self._config)
for folder in ["cache_dir", "index_dir", "datasets_dir"]:
remove_directory_contents(self.get_configuration_catalog_value(folder))

"Database catalog services"

Expand Down Expand Up @@ -447,7 +458,7 @@ def get_all_index_catalog_entries(self):
""" Function Cache related"""

def insert_function_cache_catalog_entry(self, func_expr: FunctionExpression):
cache_dir = self._config.get_value("storage", "cache_dir")
cache_dir = self.get_configuration_catalog_value("cache_dir")
entry = construct_function_cache_catalog_entry(func_expr, cache_dir=cache_dir)
return self._function_cache_service.insert_entry(entry)

Expand Down Expand Up @@ -510,7 +521,7 @@ def create_and_insert_table_catalog_entry(
table_name = table_info.table_name
column_catalog_entries = xform_column_definitions_to_catalog_entries(columns)

dataset_location = self._config.get_value("core", "datasets_dir")
dataset_location = self.get_configuration_catalog_value("datasets_dir")
file_url = str(generate_file_path(dataset_location, table_name))
table_catalog_entry = self.insert_table_catalog_entry(
table_name,
Expand Down Expand Up @@ -610,14 +621,28 @@ def create_and_insert_multimedia_metadata_table_catalog_entry(
)
return obj

"Configuration catalog services"

def upsert_configuration_catalog_entry(self, key: str, value: any):
"""Upserts configuration catalog entry"

Args:
key: key name
value: value name
"""
self._config_catalog_service.upsert_entry(key, value)

def get_configuration_catalog_value(self, key: str, default: Any = None) -> Any:
"""
Returns the value entry for the given key
Arguments:
key (str): key name

Returns:
ConfigurationCatalogEntry
"""

#### get catalog instance
# This function plays a crucial role in ensuring that different threads do
# not share the same catalog object, as it can result in serialization issues and
# incorrect behavior with SQLAlchemy. Therefore, whenever a catalog instance is
# required, we create a new one. One possible optimization is to share the catalog
# instance across all objects within the same thread. It is worth investigating whether
# SQLAlchemy already handles this optimization for us, which will be explored at a
# later time.
def get_catalog_instance(db_uri: str, config: ConfigurationManager):
return CatalogManager(db_uri, config)
table_entry = self._config_catalog_service.get_entry_by_name(key)
if table_entry:
return table_entry.value
return default
26 changes: 16 additions & 10 deletions evadb/catalog/catalog_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@
TableCatalogEntry,
)
from evadb.catalog.sql_config import IDENTIFIER_COLUMN
from evadb.configuration.configuration_manager import ConfigurationManager
from evadb.expression.function_expression import FunctionExpression
from evadb.expression.tuple_value_expression import TupleValueExpression
from evadb.parser.create_statement import ColConstraintInfo, ColumnDefinition
from evadb.utils.generic_utils import get_str_hash, remove_directory_contents
from evadb.utils.generic_utils import get_str_hash


def is_video_table(table: TableCatalogEntry):
Expand Down Expand Up @@ -256,12 +255,6 @@ def construct_function_cache_catalog_entry(
return entry


def cleanup_storage(config):
remove_directory_contents(config.get_value("storage", "index_dir"))
remove_directory_contents(config.get_value("storage", "cache_dir"))
remove_directory_contents(config.get_value("core", "datasets_dir"))


def get_metadata_entry_or_val(
function_obj: FunctionCatalogEntry, key: str, default_val: Any = None
) -> str:
Expand Down Expand Up @@ -300,6 +293,19 @@ def get_metadata_properties(function_obj: FunctionCatalogEntry) -> Dict:
return properties


def bootstrap_configs(catalog, configs: dict):
"""
load all the configuration values into the catalog table configuration_catalog
"""
for key, value in configs.items():
catalog.upsert_configuration_catalog_entry(key, value)


def get_configuration_value(key: str):
catalog = get_catalog_instance()
return catalog.get_configuration_catalog_value(key)


#### get catalog instance
# This function plays a crucial role in ensuring that different threads do
# not share the same catalog object, as it can result in serialization issues and
Expand All @@ -308,7 +314,7 @@ def get_metadata_properties(function_obj: FunctionCatalogEntry) -> Dict:
# instance across all objects within the same thread. It is worth investigating whether
# SQLAlchemy already handles this optimization for us, which will be explored at a
# later time.
def get_catalog_instance(db_uri: str, config: ConfigurationManager):
def get_catalog_instance(db_uri: str):
from evadb.catalog.catalog_manager import CatalogManager

return CatalogManager(db_uri, config)
return CatalogManager(db_uri)
36 changes: 0 additions & 36 deletions evadb/catalog/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib

import sqlalchemy
from sqlalchemy import Column, Integer
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy_utils import database_exists

from evadb.catalog.sql_config import CATALOG_TABLES
from evadb.utils.logging_manager import logger


Expand Down Expand Up @@ -100,33 +94,3 @@ def _commit(self, db_session):

# Custom Base Model to be inherited by all models
BaseModel = declarative_base(cls=CustomModel, constructor=None)


def truncate_catalog_tables(engine: Engine):
"""Truncate all the catalog tables"""
# https://stackoverflow.com/questions/4763472/sqlalchemy-clear-database-content-but-dont-drop-the-schema/5003705#5003705 #noqa
# reflect to refresh the metadata
BaseModel.metadata.reflect(bind=engine)
insp = sqlalchemy.inspect(engine)
if database_exists(engine.url):
with contextlib.closing(engine.connect()) as con:
trans = con.begin()
for table in reversed(BaseModel.metadata.sorted_tables):
if insp.has_table(table.name):
con.execute(table.delete())
trans.commit()


def drop_all_tables_except_catalog(engine: Engine):
"""drop all the tables except the catalog"""
# reflect to refresh the metadata
BaseModel.metadata.reflect(bind=engine)
insp = sqlalchemy.inspect(engine)
if database_exists(engine.url):
with contextlib.closing(engine.connect()) as con:
trans = con.begin()
for table in reversed(BaseModel.metadata.sorted_tables):
if table.name not in CATALOG_TABLES:
if insp.has_table(table.name):
table.drop(con)
trans.commit()
43 changes: 43 additions & 0 deletions evadb/catalog/models/configuration_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sqlalchemy import Column, String

from evadb.catalog.models.base_model import BaseModel
from evadb.catalog.models.utils import ConfigurationCatalogEntry, TextPickleType


class ConfigurationCatalog(BaseModel):
"""The `ConfigurationCatalog` catalog stores all the configuration params.
`_row_id:` an autogenerated unique identifier.
`_key:` the key for the config.
`_value:` the value for the config
"""

__tablename__ = "configuration_catalog"

_key = Column("key", String(100), unique=True)
_value = Column("value", TextPickleType())

def __init__(self, key: str, value: any):
self._key = key
self._value = value

def as_dataclass(self) -> "ConfigurationCatalogEntry":
return ConfigurationCatalogEntry(
row_id=self._row_id,
key=self._key,
value=self._value,
)
Loading