Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Define basics for collecting Iceberg metadata for the current snapshot #3559

Merged
merged 12 commits into from
Sep 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;


/**
* Any catalog from which to access {@link IcebergTable}s.
*/
public interface IcebergCatalog {
/**
* Open the table identified by the given database and table names.
*
* NOTE(review): behavior when no such table exists is implementation-specific and is not defined by this interface.
*
* @param dbName name of the database holding the table
* @param tableName name of the table within that database
* @return an {@link IcebergTable} giving access to the named table's metadata
*/
IcebergTable openTable(String dbName, String tableName);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.hive.HiveCatalogs;


/**
* Provides an {@link IcebergCatalog}.
*/
public class IcebergCatalogFactory {
/**
* Create an {@link IcebergCatalog} by loading a Hive catalog from the given configuration and wrapping it in an
* {@link IcebergHiveCatalog}.
*
* @param configuration Hadoop configuration passed to {@link HiveCatalogs#loadCatalog}
* @return a new {@link IcebergHiveCatalog} backed by the loaded Hive catalog
*/
public static IcebergCatalog create(Configuration configuration) {
return new IcebergHiveCatalog(HiveCatalogs.loadCatalog(configuration));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hive.HiveCatalog;


/**
 * {@link IcebergCatalog} implementation that resolves tables through a Hive-Metastore-based {@link HiveCatalog}.
 */
@Slf4j
@AllArgsConstructor
public class IcebergHiveCatalog implements IcebergCatalog {
  // NOTE: the concrete `HiveCatalog` type is required here, since `BaseMetastoreCatalog.newTableOps` is `protected`!
  private final HiveCatalog hc;

  /**
   * Build the identifier for `dbName`.`tableName` and wrap the table operations the Hive catalog creates for it.
   *
   * @param dbName name of the database holding the table
   * @param tableName name of the table within that database
   * @return an {@link IcebergTable} wrapping the table operations for the identified table
   */
  @Override
  public IcebergTable openTable(String dbName, String tableName) {
    TableIdentifier tableId = TableIdentifier.of(dbName, tableName);
    return new IcebergTable(hc.newTableOps(tableId));
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;

import java.time.Instant;
import java.util.List;
import java.util.stream.Collectors;

import lombok.Data;

import com.google.common.collect.Lists;


/**
* Information about the metadata file and data file paths of a single Iceberg Snapshot.
*/
@Data
public class IcebergSnapshotInfo {

@Data
public static class ManifestFileInfo {
private final String manifestFilePath;
private final List<String> listedFilePaths;
}

private final Long snapshotId;
private final Instant timestamp;
private final String metadataPath;
private final String manifestListPath;
private final List<ManifestFileInfo> manifestFiles;

public List<String> getManifestFilePaths() {
return manifestFiles.stream().map(ManifestFileInfo::getManifestFilePath).collect(Collectors.toList());
}

public List<String> getAllDataFilePaths() {
return manifestFiles.stream().map(ManifestFileInfo::getListedFilePaths).flatMap(List::stream).collect(Collectors.toList());
}

public List<String> getAllPaths() {
List<String> result = Lists.newArrayList(metadataPath, manifestListPath);
result.addAll(getManifestFilePaths());
result.addAll(getAllDataFilePaths());
return result;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gobblin.data.management.copy.iceberg;

import java.io.IOException;
import java.time.Instant;
import java.util.List;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.ManifestFiles;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;

import static org.apache.gobblin.data.management.copy.iceberg.IcebergSnapshotInfo.ManifestFileInfo;


/**
 * Exposes metadata information for a single Iceberg table.
 */
@Slf4j
@AllArgsConstructor
public class IcebergTable {
  private final TableOperations tableOps;

  /**
   * Gather the paths of the current snapshot: its metadata file, manifest list, manifest files, and the files
   * listed within each manifest.
   *
   * @return info on the table's current snapshot
   * @throws IOException if the table metadata cannot be loaded, if the table has no current snapshot, or if
   * reading any manifest file fails
   */
  public IcebergSnapshotInfo getCurrentSnapshotInfo() throws IOException {
    TableMetadata current = tableOps.current();
    if (current == null) {
      // fail with a descriptive error, rather than a bare `NullPointerException`
      throw new IOException("Unable to load Iceberg table metadata - does the table exist?");
    }
    Snapshot snapshot = current.currentSnapshot();
    if (snapshot == null) {
      // e.g. a table that has been created, but never yet written to
      throw new IOException("Iceberg table has no current snapshot - metadata file: '"
          + current.metadataFileLocation() + "'");
    }
    List<ManifestFile> manifests = snapshot.allManifests();
    return new IcebergSnapshotInfo(
        snapshot.snapshotId(),
        Instant.ofEpochMilli(snapshot.timestampMillis()),
        current.metadataFileLocation(),
        snapshot.manifestListLocation(),
        // NOTE: unable to `.stream().map(m -> calcManifestFileInfo(m, tableOps.io()))` due to checked exception
        calcAllManifestFileInfo(manifests, tableOps.io())
    );
  }

  /**
   * Calculate the {@link ManifestFileInfo} of each manifest, preserving the order of `manifests`.
   *
   * @param manifests the manifest files to inspect
   * @param io used to read each manifest file
   * @return one {@link ManifestFileInfo} per manifest
   * @throws IOException if reading any manifest file fails
   */
  @VisibleForTesting
  static List<ManifestFileInfo> calcAllManifestFileInfo(List<ManifestFile> manifests, FileIO io) throws IOException {
    List<ManifestFileInfo> result = Lists.newArrayList();
    for (ManifestFile manifest : manifests) {
      result.add(calcManifestFileInfo(manifest, io));
    }
    return result;
  }

  /**
   * Pair the manifest's own path with the file paths it lists.
   *
   * @param manifest the manifest file to inspect
   * @param io used to read the manifest file
   * @return the {@link ManifestFileInfo} for `manifest`
   * @throws IOException if reading the manifest file fails
   */
  @VisibleForTesting
  static ManifestFileInfo calcManifestFileInfo(ManifestFile manifest, FileIO io) throws IOException {
    return new ManifestFileInfo(manifest.path(), discoverDataFilePaths(manifest, io));
  }

  /**
   * Read every file path listed within the manifest, fully materializing them before closing the reader.
   *
   * @param manifest the manifest file to read
   * @param io used to read the manifest file
   * @return the file paths read from `manifest`
   * @throws IOException if closing the manifest reader fails
   */
  @VisibleForTesting
  static List<String> discoverDataFilePaths(ManifestFile manifest, FileIO io) throws IOException {
    // try-with-resources, so a failure during `close()` never masks one raised while reading
    try (CloseableIterable<String> manifestPathsIterable = ManifestFiles.readPaths(manifest, io)) {
      return Lists.newArrayList(manifestPathsIterable);
    }
  }
}