-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SDK-parquet] parquet default processor extractor step
- Loading branch information
Showing 4 changed files with 208 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
157 changes: 157 additions & 0 deletions
157
rust/sdk-processor/src/steps/parquet_default_processor/parquet_default_extractor.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
use crate::parquet_processors::{ParquetTypeEnum, ParquetTypeStructs}; | ||
use ahash::AHashMap; | ||
use aptos_indexer_processor_sdk::{ | ||
aptos_protos::transaction::v1::Transaction, | ||
traits::{async_step::AsyncRunType, AsyncStep, NamedStep, Processable}, | ||
types::transaction_context::TransactionContext, | ||
utils::errors::ProcessorError, | ||
}; | ||
use async_trait::async_trait; | ||
use processor::db::common::models::default_models::{ | ||
parquet_move_modules::MoveModule, | ||
parquet_move_resources::MoveResource, | ||
parquet_move_tables::TableItem, | ||
parquet_transactions::TransactionModel, | ||
parquet_write_set_changes::{WriteSetChangeDetail, WriteSetChangeModel}, | ||
}; | ||
use std::collections::HashMap; | ||
|
||
/// Extracts parquet data from transactions, allowing optional selection of specific tables.
///
/// The step's trait requirements are expressed by the `impl` blocks in this file rather than
/// by a `where Self: ...` clause on the struct itself, so the type can be named and
/// constructed without pulling those traits into scope.
pub struct ParquetDefaultExtractor {
    /// Names of the tables to emit.
    ///
    /// * `None` — no filtering; every extracted table is emitted.
    /// * `Some(names)` — only tables whose `ParquetTypeEnum` string form appears in `names`
    ///   are emitted (an empty list therefore emits nothing).
    pub opt_in_tables: Option<Vec<String>>,
}
|
||
impl ParquetDefaultExtractor { | ||
fn add_if_opted_in( | ||
&self, | ||
map: &mut HashMap<ParquetTypeEnum, ParquetTypeStructs>, | ||
enum_type: ParquetTypeEnum, | ||
data: ParquetTypeStructs, | ||
) { | ||
if let Some(ref opt_in_tables) = self.opt_in_tables { | ||
let table_name = enum_type.to_string(); | ||
if opt_in_tables.contains(&table_name) { | ||
map.insert(enum_type, data); | ||
} | ||
} else { | ||
// If there's no opt-in table, include all data | ||
map.insert(enum_type, data); | ||
} | ||
} | ||
} | ||
|
||
type ParquetTypeMap = HashMap<ParquetTypeEnum, ParquetTypeStructs>; | ||
|
||
#[async_trait] | ||
impl Processable for ParquetDefaultExtractor { | ||
type Input = Vec<Transaction>; | ||
type Output = ParquetTypeMap; | ||
type RunType = AsyncRunType; | ||
|
||
async fn process( | ||
&mut self, | ||
transactions: TransactionContext<Self::Input>, | ||
) -> anyhow::Result<Option<TransactionContext<ParquetTypeMap>>, ProcessorError> { | ||
let (move_resources, write_set_changes, parquet_transactions, table_items, move_modules) = | ||
process_transactions(transactions.data); | ||
|
||
// Print the size of each extracted data type | ||
println!("Processed data sizes:"); | ||
println!(" - MoveResources: {}", move_resources.len()); | ||
println!(" - WriteSetChanges: {}", write_set_changes.len()); | ||
println!(" - ParquetTransactions: {}", parquet_transactions.len()); | ||
println!(" - TableItems: {}", table_items.len()); | ||
println!(" - MoveModules: {}", move_modules.len()); | ||
|
||
let mut map: HashMap<ParquetTypeEnum, ParquetTypeStructs> = HashMap::new(); | ||
// Populate the map based on opt-in tables | ||
self.add_if_opted_in( | ||
&mut map, | ||
ParquetTypeEnum::MoveResource, | ||
ParquetTypeStructs::MoveResource(move_resources), | ||
); | ||
self.add_if_opted_in( | ||
&mut map, | ||
ParquetTypeEnum::WriteSetChange, | ||
ParquetTypeStructs::WriteSetChange(write_set_changes), | ||
); | ||
self.add_if_opted_in( | ||
&mut map, | ||
ParquetTypeEnum::Transaction, | ||
ParquetTypeStructs::Transaction(parquet_transactions), | ||
); | ||
self.add_if_opted_in( | ||
&mut map, | ||
ParquetTypeEnum::TableItem, | ||
ParquetTypeStructs::TableItem(table_items), | ||
); | ||
self.add_if_opted_in( | ||
&mut map, | ||
ParquetTypeEnum::MoveModule, | ||
ParquetTypeStructs::MoveModule(move_modules), | ||
); | ||
println!( | ||
"Map populated with data for the following tables: {:?}", | ||
map.keys().collect::<Vec<_>>() | ||
); | ||
|
||
Ok(Some(TransactionContext { | ||
data: map, | ||
metadata: transactions.metadata, | ||
})) | ||
} | ||
} | ||
|
||
pub fn process_transactions( | ||
transactions: Vec<Transaction>, | ||
) -> ( | ||
Vec<MoveResource>, | ||
Vec<WriteSetChangeModel>, | ||
Vec<TransactionModel>, | ||
Vec<TableItem>, | ||
Vec<MoveModule>, | ||
) { | ||
// this will be removed in the future. | ||
let mut transaction_version_to_struct_count = AHashMap::new(); | ||
let (txns, _, write_set_changes, wsc_details) = TransactionModel::from_transactions( | ||
&transactions, | ||
&mut transaction_version_to_struct_count, | ||
); | ||
|
||
let mut move_modules = vec![]; | ||
let mut move_resources = vec![]; | ||
let mut table_items = vec![]; | ||
|
||
for detail in wsc_details { | ||
match detail { | ||
WriteSetChangeDetail::Module(module) => { | ||
move_modules.push(module); | ||
}, | ||
WriteSetChangeDetail::Resource(resource) => { | ||
move_resources.push(resource); | ||
}, | ||
WriteSetChangeDetail::Table(item, _, _) => { | ||
table_items.push(item); | ||
}, | ||
} | ||
} | ||
|
||
( | ||
move_resources, | ||
write_set_changes, | ||
txns, | ||
table_items, | ||
move_modules, | ||
) | ||
} | ||
|
||
impl AsyncStep for ParquetDefaultExtractor {} | ||
|
||
impl NamedStep for ParquetDefaultExtractor { | ||
fn name(&self) -> String { | ||
"ParquetDefaultExtractor".to_string() | ||
} | ||
} |