-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add support for using JDBC Catalog for Iceberg testing (#6144)
Fixes #6029
- Loading branch information
1 parent
e596689
commit ff1621a
Showing
26 changed files
with
739 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
# sqlite catalogs | ||
|
||
The sqlite JDBC catalog is able to support multiple catalogs through a single database file. | ||
As such, the following convention has been established for testing purposes: | ||
|
||
* database file: `<rootDir>/dh-iceberg-test.db` | ||
* warehouse directory: `<rootDir>/catalogs/<catalogName>/` | ||
|
||
Both writers and readers of this catalog need to be setup to support relative metadata locations to ensure portability. | ||
|
||
A root directory for extension-iceberg testing has been established at `extensions/iceberg/src/test/resources/io/deephaven/iceberg/sqlite/db_resource`. | ||
|
||
## Usage | ||
|
||
### Java | ||
|
||
```java | ||
import org.apache.iceberg.catalog.Catalog; | ||
import io.deephaven.iceberg.sqlite.DbResource; | ||
|
||
Catalog catalog = DbResource.openCatalog("<catalogName>"); | ||
``` | ||
|
||
### pyiceberg | ||
|
||
To setup in [pyiceberg](https://py.iceberg.apache.org/): | ||
|
||
```python | ||
from pyiceberg.catalog.sql import SqlCatalog | ||
|
||
catalog = SqlCatalog( | ||
"<catalogName>", | ||
**{ | ||
"uri": f"sqlite:///dh-iceberg-test.db", | ||
"warehouse": f"catalogs/<catalogName>", | ||
}, | ||
) | ||
``` | ||
|
||
## Generating data | ||
|
||
Note that any scripts that write data should be run relative to | ||
[db_resource](src/test/resources/io/deephaven/iceberg/sqlite/db_resource) working directory to ensure unit testability. | ||
|
||
### pyiceberg-1 | ||
|
||
Here's an example of what was needed to generate this data: | ||
|
||
```shell | ||
$ cd extensions/iceberg/src/test/resources/io/deephaven/iceberg/sqlite/db_resource | ||
|
||
# Note: 3.10 explicitly chosen b/c it doesn't seem like pyiceberg produces 3.12 wheels yet | ||
$ python3.10 -m venv /tmp/iceberg | ||
|
||
$ source /tmp/iceberg/bin/activate | ||
|
||
$ pip install --only-binary=":all:" "pyiceberg[sql-sqlite, pyarrow]" | ||
|
||
$ pip freeze | ||
annotated-types==0.7.0 | ||
certifi==2024.8.30 | ||
charset-normalizer==3.3.2 | ||
click==8.1.7 | ||
fsspec==2024.9.0 | ||
greenlet==3.1.1 | ||
idna==3.10 | ||
markdown-it-py==3.0.0 | ||
mdurl==0.1.2 | ||
mmh3==4.1.0 | ||
numpy==1.26.4 | ||
pyarrow==17.0.0 | ||
pydantic==2.9.2 | ||
pydantic_core==2.23.4 | ||
Pygments==2.18.0 | ||
pyiceberg==0.7.1 | ||
pyparsing==3.1.4 | ||
python-dateutil==2.9.0.post0 | ||
requests==2.32.3 | ||
rich==13.8.1 | ||
six==1.16.0 | ||
sortedcontainers==2.4.0 | ||
SQLAlchemy==2.0.35 | ||
strictyaml==1.7.3 | ||
tenacity==8.5.0 | ||
typing_extensions==4.12.2 | ||
urllib3==2.2.3 | ||
|
||
$ python generate-pyiceberg-1.py | ||
|
||
$ sqlite3 dh-iceberg-test.db | ||
SQLite version 3.45.1 2024-01-30 16:01:20 | ||
Enter ".help" for usage hints. | ||
sqlite> .dump | ||
PRAGMA foreign_keys=OFF; | ||
BEGIN TRANSACTION; | ||
CREATE TABLE iceberg_tables ( | ||
catalog_name VARCHAR(255) NOT NULL, | ||
table_namespace VARCHAR(255) NOT NULL, | ||
table_name VARCHAR(255) NOT NULL, | ||
metadata_location VARCHAR(1000), | ||
previous_metadata_location VARCHAR(1000), | ||
PRIMARY KEY (catalog_name, table_namespace, table_name) | ||
); | ||
INSERT INTO iceberg_tables VALUES('pyiceberg-1','dh-default','cities','catalogs/pyiceberg-1/dh-default.db/cities/metadata/00003-68091f71-d3c5-42bb-8161-e2e187dece14.metadata.json','catalogs/pyiceberg-1/dh-default.db/cities/metadata/00002-106b37f8-8818-439d-87c5-3cae608d1972.metadata.json'); | ||
CREATE TABLE iceberg_namespace_properties ( | ||
catalog_name VARCHAR(255) NOT NULL, | ||
namespace VARCHAR(255) NOT NULL, | ||
property_key VARCHAR(255) NOT NULL, | ||
property_value VARCHAR(1000) NOT NULL, | ||
PRIMARY KEY (catalog_name, namespace, property_key) | ||
); | ||
INSERT INTO iceberg_namespace_properties VALUES('pyiceberg-1','dh-default','exists','true'); | ||
COMMIT; | ||
sqlite> | ||
``` | ||
|
||
### sqlite | ||
|
||
If we add a lot of catalogs to the database, we may want to look into vacuuming the database to keep the file size small. | ||
|
||
`sqlite3 dh-iceberg-test.db 'VACUUM;'` | ||
|
||
Ideally, the sqlite database can be a small collection of catalogs that were created via external tooling to verify that | ||
we can integrate with them successfully. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
137 changes: 137 additions & 0 deletions
137
extensions/iceberg/src/main/java/io/deephaven/iceberg/relative/RelativeFileIO.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
// | ||
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending | ||
// | ||
package io.deephaven.iceberg.relative; | ||
|
||
import io.deephaven.util.annotations.VisibleForTesting; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.iceberg.CatalogUtil; | ||
import org.apache.iceberg.hadoop.HadoopConfigurable; | ||
import org.apache.iceberg.hadoop.SerializableConfiguration; | ||
import org.apache.iceberg.io.BulkDeletionFailureException; | ||
import org.apache.iceberg.io.DelegateFileIO; | ||
import org.apache.iceberg.io.FileIO; | ||
import org.apache.iceberg.io.FileInfo; | ||
import org.apache.iceberg.io.InputFile; | ||
import org.apache.iceberg.io.OutputFile; | ||
import org.apache.iceberg.io.ResolvingFileIO; | ||
import org.apache.iceberg.relocated.com.google.common.collect.Streams; | ||
import org.apache.iceberg.util.SerializableSupplier; | ||
|
||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
/** | ||
* While this class is in the public source set, it is meant to support testing use cases only and should not be used in | ||
* production. | ||
* | ||
* @see <a href="https://github.com/apache/iceberg/issues/1617">Support relative paths in Table Metadata</a> | ||
*/ | ||
@VisibleForTesting | ||
public final class RelativeFileIO implements HadoopConfigurable, DelegateFileIO { | ||
public static final String BASE_PATH = "relative.base-path"; | ||
private static final String IO_DEFAULT_IMPL = ResolvingFileIO.class.getName(); | ||
|
||
private String basePath; | ||
|
||
private DelegateFileIO io; | ||
|
||
private SerializableSupplier<Configuration> hadoopConf; | ||
|
||
public RelativeFileIO() {} | ||
|
||
public RelativeFileIO(Configuration hadoopConf) { | ||
this(new SerializableConfiguration(hadoopConf)::get); | ||
} | ||
|
||
public RelativeFileIO(SerializableSupplier<Configuration> hadoopConf) { | ||
this.hadoopConf = hadoopConf; | ||
} | ||
|
||
@Override | ||
public Configuration getConf() { | ||
return hadoopConf.get(); | ||
} | ||
|
||
@Override | ||
public void setConf(Configuration conf) { | ||
this.hadoopConf = new SerializableConfiguration(conf)::get; | ||
} | ||
|
||
@Override | ||
public void serializeConfWith(Function<Configuration, SerializableSupplier<Configuration>> confSerializer) { | ||
this.hadoopConf = confSerializer.apply(getConf()); | ||
} | ||
|
||
public String absoluteLocation(String location) { | ||
return basePath + location; | ||
} | ||
|
||
private String relativeLocation(String location) { | ||
if (!location.startsWith(basePath)) { | ||
throw new IllegalStateException(); | ||
} | ||
return location.substring(basePath.length()); | ||
} | ||
|
||
@Override | ||
public void initialize(Map<String, String> properties) { | ||
this.basePath = StringUtils.appendIfMissing(properties.get(BASE_PATH), "/"); | ||
// We can add a property here later if we need to override the default | ||
// final String impl = properties.getOrDefault(IO_IMPL, IO_DEFAULT_IMPL); | ||
final FileIO fileIO = CatalogUtil.loadFileIO(IO_DEFAULT_IMPL, properties, hadoopConf.get()); | ||
if (!(fileIO instanceof DelegateFileIO)) { | ||
throw new IllegalArgumentException("filoIO must be DelegateFileIO, " + fileIO.getClass()); | ||
} | ||
this.io = (DelegateFileIO) fileIO; | ||
} | ||
|
||
@Override | ||
public Map<String, String> properties() { | ||
return io.properties(); | ||
} | ||
|
||
@Override | ||
public InputFile newInputFile(String path) { | ||
return new RelativeInputFile(path, io.newInputFile(absoluteLocation(path))); | ||
} | ||
|
||
@Override | ||
public InputFile newInputFile(String path, long length) { | ||
return new RelativeInputFile(path, io.newInputFile(absoluteLocation(path), length)); | ||
} | ||
|
||
@Override | ||
public OutputFile newOutputFile(String path) { | ||
return new RelativeOutputFile(path, io.newOutputFile(absoluteLocation(path))); | ||
} | ||
|
||
@Override | ||
public void deleteFiles(Iterable<String> iterable) throws BulkDeletionFailureException { | ||
io.deleteFiles(Streams.stream(iterable).map(this::absoluteLocation)::iterator); | ||
} | ||
|
||
@Override | ||
public Iterable<FileInfo> listPrefix(String s) { | ||
return Streams.stream(io.listPrefix(absoluteLocation(s))) | ||
.map(x -> new FileInfo(relativeLocation(x.location()), x.size(), x.createdAtMillis()))::iterator; | ||
} | ||
|
||
@Override | ||
public void deletePrefix(String s) { | ||
io.deletePrefix(absoluteLocation(s)); | ||
} | ||
|
||
@Override | ||
public void deleteFile(String path) { | ||
io.deleteFile(absoluteLocation(path)); | ||
} | ||
|
||
@Override | ||
public void close() { | ||
if (io != null) { | ||
io.close(); | ||
} | ||
} | ||
} |
39 changes: 39 additions & 0 deletions
39
extensions/iceberg/src/main/java/io/deephaven/iceberg/relative/RelativeInputFile.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// | ||
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending | ||
// | ||
package io.deephaven.iceberg.relative; | ||
|
||
import org.apache.iceberg.io.InputFile; | ||
import org.apache.iceberg.io.SeekableInputStream; | ||
|
||
import java.util.Objects; | ||
|
||
final class RelativeInputFile implements InputFile { | ||
private final String location; | ||
private final InputFile file; | ||
|
||
public RelativeInputFile(String location, InputFile file) { | ||
this.location = Objects.requireNonNull(location); | ||
this.file = Objects.requireNonNull(file); | ||
} | ||
|
||
@Override | ||
public long getLength() { | ||
return file.getLength(); | ||
} | ||
|
||
@Override | ||
public SeekableInputStream newStream() { | ||
return file.newStream(); | ||
} | ||
|
||
@Override | ||
public String location() { | ||
return location; | ||
} | ||
|
||
@Override | ||
public boolean exists() { | ||
return file.exists(); | ||
} | ||
} |
Oops, something went wrong.