Skip to content

Commit

Permalink
Create DataFragment and DataFile during Dataset write process (#440)
Browse files Browse the repository at this point in the history
  • Loading branch information
eddyxu authored Jan 19, 2023
1 parent f7c3838 commit 180052a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 2 deletions.
8 changes: 8 additions & 0 deletions rust/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,14 @@ impl Schema {
self.fields.iter().find(|f| f.name == name)
}

/// Recursively collect all the field IDs,
pub(crate) fn field_ids(&self) -> Vec<i32> {
// TODO: make a tree travesal iterator.

let protos: Vec<pb::Field> = self.into();
protos.iter().map(|f| f.id).collect()
}

pub(crate) fn mut_field_by_id(&mut self, id: i32) -> Option<&mut Field> {
for field in self.fields.as_mut_slice() {
if field.id == id {
Expand Down
60 changes: 58 additions & 2 deletions rust/src/format/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,29 @@

use std::collections::BTreeSet;

use crate::datatypes::Schema;
use crate::format::pb;

/// Lance Data File
///
/// A data file is one piece of file storing data.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq)]
pub struct DataFile {
/// Relative path of the data file to dataset root.
pub path: String,
/// The Ids of fields in this file.
fields: Vec<i32>,
}

impl DataFile {
pub fn new(path: &str, schema: &Schema) -> Self {
Self {
path: path.to_string(),
fields: schema.field_ids(),
}
}
}

impl From<&DataFile> for pb::DataFile {
fn from(df: &DataFile) -> Self {
Self {
Expand Down Expand Up @@ -62,8 +72,19 @@ pub struct Fragment {
}

impl Fragment {
pub fn new(id: u64) -> Self {
Self { id, files: vec![] }
}

/// Create a `Fragment` with one DataFile
pub fn with_file(id: u64, path: &str, schema: &Schema) -> Self {
Self {
id,
files: vec![DataFile::new(path, schema)],
}
}

/// Get all field IDs from this fragment, sorted.
///
pub fn field_ids(&self) -> Vec<i32> {
BTreeSet::from_iter(self.files.iter().flat_map(|f| f.fields.clone()))
.into_iter()
Expand All @@ -88,3 +109,38 @@ impl From<&Fragment> for pb::DataFragment {
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};

#[test]
fn test_new_fragment() {
let path = "foobar.lance";

let arrow_schema = ArrowSchema::new(vec![
ArrowField::new(
"s",
DataType::Struct(vec![
ArrowField::new("si", DataType::Int32, false),
ArrowField::new("sb", DataType::Binary, true),
]),
true,
),
ArrowField::new("bool", DataType::Boolean, true),
]);
let schema = Schema::try_from(&arrow_schema).unwrap();
let fragment = Fragment::with_file(123, &path, &schema);

assert_eq!(123, fragment.id);
assert_eq!(fragment.field_ids(), [0, 1, 2, 3]);
assert_eq!(
fragment.files,
vec![DataFile {
path: path.to_string(),
fields: vec![0, 1, 2, 3]
}]
)
}
}

0 comments on commit 180052a

Please sign in to comment.