WIP: Mongo Data Migration #804

Draft pull request: 65 commits to merge into base branch main.

Commits (65):
3e645ac  Add SQL Adapter (Aug 22, 2024)
8969106  some further changes for sql adapter (skarakuzu, Jan 28, 2025)
d53fac5  Fix rebase error (danielballan, Jan 28, 2025)
7690230  few more changes (skarakuzu, Jan 28, 2025)
43c57a6  Implement from_catalog on SQLAdapter (danielballan, Jan 28, 2025)
157949d  fixed sql tests (skarakuzu, Jan 29, 2025)
4e73800  fix typing (skarakuzu, Jan 29, 2025)
8958631  add TYPE_CHECKING conditional (skarakuzu, Jan 29, 2025)
5fd1bac  Add SQL Adapter (Aug 22, 2024)
4389b68  some further changes for sql adapter (skarakuzu, Jan 28, 2025)
6f106a0  Fix rebase error (danielballan, Jan 28, 2025)
5f8d3ac  few more changes (skarakuzu, Jan 28, 2025)
35e25f5  Implement from_catalog on SQLAdapter (danielballan, Jan 28, 2025)
0d4d8f5  fixed sql tests (skarakuzu, Jan 29, 2025)
c437347  fix typing (skarakuzu, Jan 29, 2025)
4dbfd20  add TYPE_CHECKING conditional (skarakuzu, Jan 29, 2025)
f004ae2  Adjust for dataclass structures. (danielballan, Feb 11, 2025)
f1f7d4c  postgres text fix (skarakuzu, Feb 11, 2025)
2f3abf7  fix mypy in test (skarakuzu, Feb 12, 2025)
f6c8cf4  add sql tests (skarakuzu, Feb 12, 2025)
088ea39  Add comments. (danielballan, Feb 12, 2025)
0d0022e  Format index creation more nicely (danielballan, Feb 12, 2025)
40dd070  Clean up arrow to SQL type translation. (danielballan, Feb 12, 2025)
3c90df6  Use DuckDB for embedded tabular storage. (danielballan, Feb 12, 2025)
52d5b9e  Remove spurrious file (danielballan, Feb 12, 2025)
dbfe262  Remame method, and test integration. (danielballan, Feb 12, 2025)
ba4fde0  Ensure writable directory has scheme 'file:'. (danielballan, Feb 12, 2025)
0ee14e3  Reinstate original intent of test. (danielballan, Feb 12, 2025)
9c4d551  Template for the future HDF5ArrayAdapter (genematx, Feb 13, 2025)
ddfd74e  Merge branch 'add_sql_adapter' of github.com:skarakuzu/tiled into add… (genematx, Feb 13, 2025)
4f86582  fix typo (genematx, Feb 13, 2025)
0b6bd23  Support SQLite for tabular storage too. (danielballan, Feb 13, 2025)
2637e41  Merge branch 'add_sql_adapter' of github.com:skarakuzu/tiled into add… (genematx, Feb 13, 2025)
9f382a5  Remove unneeded conversion (danielballan, Feb 13, 2025)
f7f6aee  Remove unneeded conversion (danielballan, Feb 13, 2025)
159c152  Docstring improvements (danielballan, Feb 13, 2025)
6bc266b  Copyedit comment (danielballan, Feb 13, 2025)
6c91fd9  Remove another unnecessary conversion (danielballan, Feb 13, 2025)
7c748c3  Remove another unneeded conversion (danielballan, Feb 13, 2025)
781fb13  Remove more needless conversions (danielballan, Feb 13, 2025)
748cc47  Remove access policy (danielballan, Feb 13, 2025)
1b3148b  Finish removing access policy (danielballan, Feb 13, 2025)
c78ab18  ENH: Implement HDF5ArrayAdapter (genematx, Feb 13, 2025)
e554b5d  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 13, 2025)
ec2f931  FIX: convert hdf5 dataset parameter (genematx, Feb 13, 2025)
1767201  FIX: dtype check (genematx, Feb 13, 2025)
1956212  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 13, 2025)
3f374d6  mypy fixes and addressed comments (skarakuzu, Feb 13, 2025)
abadcea  FIX: shape check (genematx, Feb 13, 2025)
c7361a4  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 13, 2025)
07ae5a8  typing fix and addressed comments (skarakuzu, Feb 13, 2025)
eaa792e  mypy fix (skarakuzu, Feb 13, 2025)
ce7b60c  fix typos (genematx, Feb 13, 2025)
9476144  Merge branch 'add_sql_adapter' of github.com:skarakuzu/tiled into add… (genematx, Feb 13, 2025)
3e8676b  Merge branch 'add_sql_adapter' into migration (genematx, Feb 13, 2025)
821fe7e  ENH: convert strings to ndslices (genematx, Feb 14, 2025)
3ddf0b6  Enable slicing in HDF5ArrayAdapter (genematx, Feb 14, 2025)
3d2de2b  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 14, 2025)
c109ee8  Fix (genematx, Feb 14, 2025)
28690c8  ENH: accept str for dataset in HDF5ArrayAdapter (genematx, Feb 14, 2025)
73b10e3  ENH: accept str for dataset in HDF5ArrayAdapter (genematx, Feb 14, 2025)
a0890f2  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 14, 2025)
006f6fa  FIX: string parsing (genematx, Feb 14, 2025)
1a904d9  Merge branch 'hdf5-array-adapter' into migration (genematx, Feb 14, 2025)
774c8ad  ENH: consider native chunking in hdf5 (genematx, Feb 14, 2025)
Empty file added: main.py
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -55,6 +55,9 @@ tiled = "tiled.commandline.main:main"
 
 # This is the union of all optional dependencies.
 all = [
+    "adbc_driver_manager",
+    "adbc_driver_postgresql",
+    "adbc_driver_sqlite",
     "aiofiles",
     "aiosqlite",
     "alembic",
@@ -68,6 +71,7 @@ all = [
     "dask",
     "dask[array]",
     "dask[dataframe]",
+    "duckdb",
     "entrypoints",
     "fastapi",
     "h5netcdf",
@@ -196,6 +200,9 @@ minimal-server = [
 ]
 # This is the "kitchen sink" fully-featured server dependency set.
 server = [
+    "adbc_driver_manager",
+    "adbc_driver_postgresql",
+    "adbc_driver_sqlite",
     "aiofiles",
     "aiosqlite",
     "alembic",
@@ -209,6 +216,7 @@ server = [
     "dask",
     "dask[array]",
     "dask[dataframe]",
+    "duckdb",
     "fastapi",
     "h5netcdf",
     "h5py",
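
The new optional dependencies back this PR's SQL storage path: the adbc_driver_* packages provide Arrow-native, DBAPI (PEP 249)-style connections to PostgreSQL and SQLite, and duckdb supplies the embedded database used for tabular storage. A rough sketch of the ADBC pattern these adapters build on (editorial, not part of this diff; it assumes only that adbc_driver_sqlite and pyarrow are installed):

    import adbc_driver_sqlite.dbapi
    import pyarrow as pa

    table = pa.table({"x": [1, 2, 3]})
    with adbc_driver_sqlite.dbapi.connect(":memory:") as conn:
        cur = conn.cursor()
        # adbc_ingest bulk-loads an Arrow table; mode="create" creates the
        # target table from the Arrow schema before inserting.
        cur.adbc_ingest("demo", table, mode="create")
        cur.execute("SELECT COUNT(*) FROM demo")
        print(cur.fetchone())  # (3,)

The same pattern works with adbc_driver_postgresql against a server URI, which is presumably how SQLAdapter reaches PostgreSQL in the tests further below.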
26 changes: 23 additions & 3 deletions tiled/_tests/adapters/test_arrow.py
@@ -4,6 +4,8 @@
 import pytest
 
 from tiled.adapters.arrow import ArrowAdapter
+from tiled.structures.core import StructureFamily
+from tiled.structures.data_source import DataSource, Management, Storage
 from tiled.structures.table import TableStructure
 
 names = ["f0", "f1", "f2"]
@@ -26,11 +28,29 @@
 
 
 @pytest.fixture
-def adapter() -> ArrowAdapter:
+def data_source_from_init_storage() -> DataSource[TableStructure]:
     table = pa.Table.from_arrays(data0, names)
     structure = TableStructure.from_arrow_table(table, npartitions=3)
-    assets = ArrowAdapter.init_storage(data_uri, structure=structure)
-    return ArrowAdapter([asset.data_uri for asset in assets], structure=structure)
+    data_source = DataSource(
+        management=Management.writable,
+        mimetype="application/vnd.apache.arrow.file",
+        structure_family=StructureFamily.table,
+        structure=structure,
+        assets=[],
+    )
+    storage = Storage(filesystem=data_uri, sql=None)
+    return ArrowAdapter.init_storage(
+        data_source=data_source, storage=storage, path_parts=[]
+    )
+
+
+@pytest.fixture
+def adapter(data_source_from_init_storage: DataSource[TableStructure]) -> ArrowAdapter:
+    data_source = data_source_from_init_storage
+    return ArrowAdapter(
+        [asset.data_uri for asset in data_source.assets],
+        data_source.structure,
+    )
 
 
 def test_attributes(adapter: ArrowAdapter) -> None:
292 changes: 292 additions & 0 deletions tiled/_tests/adapters/test_sql.py
@@ -0,0 +1,292 @@
import os
from pathlib import Path
from typing import Any, Callable, Generator, Union

import adbc_driver_duckdb
import pyarrow as pa
import pytest

from tiled.adapters.sql import SQLAdapter, check_table_name
from tiled.structures.core import StructureFamily
from tiled.structures.data_source import DataSource, Management, Storage
from tiled.structures.table import TableStructure

names = ["f0", "f1", "f2", "f3"]
data0 = [
    pa.array([1, 2, 3, 4, 5]),
    pa.array([1.0, 2.0, 3.0, 4.0, 5.0]),
    pa.array(["foo0", "bar0", "baz0", None, "goo0"]),
    pa.array([True, None, False, True, None]),
]
data1 = [
    pa.array([6, 7, 8, 9, 10, 11, 12]),
    pa.array([6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]),
    pa.array(["foo1", "bar1", None, "baz1", "biz", None, "goo"]),
    pa.array([None, True, True, False, False, None, True]),
]
data2 = [
    pa.array([13, 14]),
    pa.array([13.0, 14.0]),
    pa.array(["foo2", "baz2"]),
    pa.array([False, None]),
]

batch0 = pa.record_batch(data0, names=names)
batch1 = pa.record_batch(data1, names=names)
batch2 = pa.record_batch(data2, names=names)


@pytest.fixture
def data_source_from_init_storage() -> Callable[[str], DataSource[TableStructure]]:
    def _data_source_from_init_storage(data_uri: str) -> DataSource[TableStructure]:
        table = pa.Table.from_arrays(data0, names)
        structure = TableStructure.from_arrow_table(table, npartitions=1)
        data_source = DataSource(
            management=Management.writable,
            mimetype="application/x-tiled-sql-table",
            structure_family=StructureFamily.table,
            structure=structure,
            assets=[],
        )

        storage = Storage(filesystem=None, sql=data_uri)
        return SQLAdapter.init_storage(
            data_source=data_source, storage=storage, path_parts=[]
        )

    return _data_source_from_init_storage


@pytest.fixture
def adapter_sql(
    tmp_path: Path,
    data_source_from_init_storage: Callable[[str], DataSource[TableStructure]],
) -> Generator[SQLAdapter, None, None]:
    data_uri = f"duckdb:///{tmp_path}/test.db"
    data_source = data_source_from_init_storage(data_uri)
    yield SQLAdapter(
        data_source.assets[0].data_uri,
        data_source.structure,
        data_source.parameters["table_name"],
        data_source.parameters["dataset_id"],
    )


def test_attributes(adapter_sql: SQLAdapter) -> None:
    assert adapter_sql.structure().columns == names
    assert adapter_sql.structure().npartitions == 1
    assert isinstance(adapter_sql.conn, adbc_driver_duckdb.dbapi.Connection)


def test_write_read_sql_one(adapter_sql: SQLAdapter) -> None:
    # Test writing a single batch and reading it back.
    adapter_sql.append_partition(batch0, 0)
    result = adapter_sql.read()
    # The SQL backend stores booleans as 0/1, so pandas reads the last column
    # back as integers; convert it explicitly to the nullable boolean dtype
    # for the comparison.
    result["f3"] = result["f3"].astype("boolean")

    assert pa.Table.from_arrays(data0, names) == pa.Table.from_pandas(result)


def test_write_read_sql_list(adapter_sql: SQLAdapter) -> None:
    adapter_sql.append_partition([batch0, batch1, batch2], 0)
    result = adapter_sql.read()
    # The SQL backend stores booleans as 0/1, so pandas reads the last column
    # back as integers; convert it explicitly to the nullable boolean dtype.
    result["f3"] = result["f3"].astype("boolean")
    assert pa.Table.from_batches([batch0, batch1, batch2]) == pa.Table.from_pandas(
        result
    )

    # Test writing, appending, and reading everything back.
    adapter_sql.append_partition([batch2, batch0, batch1], 0)
    adapter_sql.append_partition([batch1, batch2, batch0], 0)
    result = adapter_sql.read()
    result["f3"] = result["f3"].astype("boolean")

    assert pa.Table.from_batches(
        [batch0, batch1, batch2, batch2, batch0, batch1, batch1, batch2, batch0]
    ) == pa.Table.from_pandas(result)
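
# Editorial aside, not part of this diff: the astype("boolean") calls above
# target pandas' nullable boolean extension dtype, which restores True/False
# and keeps missing values as pd.NA rather than coercing them to float NaN.
# A minimal illustration, assuming only that pandas is installed:
import pandas as pd

_roundtrip = pd.Series([1, 0, None]).astype("boolean")
assert _roundtrip.tolist() == [True, False, pd.NA]  # pd.NA survives the cast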


@pytest.fixture
def postgres_uri() -> str:
    uri = os.getenv("TILED_TEST_POSTGRESQL_URI")
    if uri is not None:
        return uri
    pytest.skip("TILED_TEST_POSTGRESQL_URI is not set")
    return ""


@pytest.fixture
def adapter_psql(
    data_source_from_init_storage: Callable[[str], DataSource[TableStructure]],
    postgres_uri: str,
) -> SQLAdapter:
    data_source = data_source_from_init_storage(postgres_uri)
    return SQLAdapter(
        postgres_uri,
        data_source.structure,
        data_source.parameters["table_name"],
        data_source.parameters["dataset_id"],
    )


def test_psql(adapter_psql: SQLAdapter) -> None:
    assert adapter_psql.structure().columns == names
    assert adapter_psql.structure().npartitions == 1
    # assert isinstance(
    #     adapter_psql.conn, adbc_driver_postgresql.dbapi.AdbcSqliteConnection
    # )


def test_write_read_psql_one(adapter_psql: SQLAdapter) -> None:
    # Test writing a single batch and reading it back.
    adapter_psql.append_partition(batch0, 0)
    result = adapter_psql.read()
    # The SQL backend stores booleans as 0/1, so pandas reads the last column
    # back as integers; convert it explicitly to the nullable boolean dtype.
    result["f3"] = result["f3"].astype("boolean")

    assert pa.Table.from_arrays(data0, names) == pa.Table.from_pandas(result)


def test_write_read_psql_list(adapter_psql: SQLAdapter) -> None:
    adapter_psql.append_partition([batch0, batch1, batch2], 0)
    result = adapter_psql.read()
    # The SQL backend stores booleans as 0/1, so pandas reads the last column
    # back as integers; convert it explicitly to the nullable boolean dtype.
    result["f3"] = result["f3"].astype("boolean")
    assert pa.Table.from_batches([batch0, batch1, batch2]) == pa.Table.from_pandas(
        result
    )

    # Test writing, appending, and reading everything back.
    adapter_psql.append_partition([batch2, batch0, batch1], 0)
    adapter_psql.append_partition([batch1, batch2, batch0], 0)
    result = adapter_psql.read()
    result["f3"] = result["f3"].astype("boolean")

    assert pa.Table.from_batches(
        [batch0, batch1, batch2, batch2, batch0, batch1, batch1, batch2, batch0]
    ) == pa.Table.from_pandas(result)


@pytest.mark.parametrize(
    "table_name, expected",
    [
        (
            "table_abcdefg12423pnjsbldfhjdfbv_hbdhfljb128w40_ndgjfsdflfnscljm",
            pytest.raises(
                ValueError, match="Table name is too long, max character number is 63!"
            ),
        ),
        (
            "create_abcdefg12423pnjsbldfhjdfbv_hbdhfljb128w40_ndgjfsdflfnscljk_sdbf_jhvjkbefl",
            pytest.raises(
                ValueError, match="Table name is too long, max character number is 63!"
            ),
        ),
        (
            "hello_abcdefg12423pnjsbldfhjdfbv_hbdhfljb128w40_ndgjfsdflfnscljk_sdbf_jhvjkbefl",
            pytest.raises(
                ValueError, match="Table name is too long, max character number is 63!"
            ),
        ),
        ("my_table_here_123_", None),
        ("the_short_table12374620_hello_table23704ynnm", None),
    ],
)
def test_check_table_name_long_name(
    table_name: str, expected: Union[None, Any]
) -> None:
    if isinstance(expected, type(pytest.raises(ValueError))):
        with expected:
            check_table_name(table_name)
    else:
        assert check_table_name(table_name) is None  # type: ignore[func-returns-value]
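

# Editorial aside, not part of this diff: the isinstance() dispatch above
# (repeated in the two tests below) can be folded into the parametrization by
# using a context manager for both outcomes, with contextlib.nullcontext
# marking the cases that should not raise. A sketch under that assumption:
from contextlib import nullcontext


@pytest.mark.parametrize(
    "table_name, expectation",
    [
        ("a" * 64, pytest.raises(ValueError, match="Table name is too long")),
        ("my_table_here_123_", nullcontext()),
    ],
)
def test_check_table_name_sketch(table_name: str, expectation: Any) -> None:
    with expectation:
        check_table_name(table_name)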


@pytest.mark.parametrize(
    "table_name, expected",
    [
        (
            "_here_is_my_table",
            pytest.raises(ValueError, match="Illegal table name!"),
        ),
        (
            "create_this_table1246*",
            pytest.raises(ValueError, match="Illegal table name!"),
        ),
        (
            "create this_table1246",
            pytest.raises(ValueError, match="Illegal table name!"),
        ),
        (
            "drop this_table1246",
            pytest.raises(ValueError, match="Illegal table name!"),
        ),
        (
            "table_mytable!",
            pytest.raises(ValueError, match="Illegal table name!"),
        ),
        ("my_table_here_123_", None),
        ("the_short_table12374620_hello_table23704ynnm", None),
    ],
)
def test_check_table_name_illegal_name(
    table_name: str, expected: Union[None, Any]
) -> None:
    if isinstance(expected, type(pytest.raises(ValueError))):
        with expected:
            check_table_name(table_name)
    else:
        assert check_table_name(table_name) is None  # type: ignore[func-returns-value]


@pytest.mark.parametrize(
    "table_name, expected",
    [
        (
            "select",
            pytest.raises(
                ValueError,
                match="Reserved SQL keywords are not allowed in the table name!",
            ),
        ),
        (
            "create",
            pytest.raises(
                ValueError,
                match="Reserved SQL keywords are not allowed in the table name!",
            ),
        ),
        (
            "SELECT",
            pytest.raises(
                ValueError,
                match="Reserved SQL keywords are not allowed in the table name!",
            ),
        ),
        (
            "from",
            pytest.raises(
                ValueError,
                match="Reserved SQL keywords are not allowed in the table name!",
            ),
        ),
        ("drop_this_table123_", None),
        ("DROP_thistable123_hwejk", None),
    ],
)
def test_check_table_name_reserved_keywords(
    table_name: str, expected: Union[None, Any]
) -> None:
    if isinstance(expected, type(pytest.raises(ValueError))):
        with expected:
            check_table_name(table_name)
    else:
        assert check_table_name(table_name) is None  # type: ignore[func-returns-value]
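
Taken together, the three parametrized tests pin down the contract of check_table_name: at most 63 characters; an identifier shape (a leading letter followed by letters, digits, or underscores); and no bare reserved SQL keywords, case-insensitively. A hypothetical reconstruction consistent with those tests, for orientation only (the real implementation lives in tiled.adapters.sql and may differ in its keyword list and error handling):

    import re

    # Assumed keyword set; the real list is likely longer.
    RESERVED_SQL_KEYWORDS = {
        "select", "insert", "update", "delete", "create", "drop", "from", "where", "table",
    }

    def check_table_name_sketch(table_name: str) -> None:
        if len(table_name) > 63:
            raise ValueError("Table name is too long, max character number is 63!")
        if not re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", table_name):
            raise ValueError("Illegal table name!")
        if table_name.lower() in RESERVED_SQL_KEYWORDS:
            raise ValueError("Reserved SQL keywords are not allowed in the table name!")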