-
Notifications
You must be signed in to change notification settings - Fork 60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[POC][wip] faster DefaultEngine
parquet reads
#595
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -202,6 +202,7 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> { | |
physical_schema.clone(), | ||
predicate, | ||
self.store.clone(), | ||
None, | ||
)), | ||
}; | ||
FileStream::new_async_read_iterator( | ||
|
@@ -215,13 +216,14 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> { | |
} | ||
|
||
/// Implements [`FileOpener`] for a parquet file | ||
struct ParquetOpener { | ||
pub(crate) struct ParquetOpener { | ||
// projection: Arc<[usize]>, | ||
batch_size: usize, | ||
table_schema: SchemaRef, | ||
predicate: Option<ExpressionRef>, | ||
limit: Option<usize>, | ||
store: Arc<DynObjectStore>, | ||
runtime: Option<Arc<tokio::runtime::Runtime>>, | ||
} | ||
|
||
impl ParquetOpener { | ||
|
@@ -230,13 +232,15 @@ impl ParquetOpener { | |
table_schema: SchemaRef, | ||
predicate: Option<ExpressionRef>, | ||
store: Arc<DynObjectStore>, | ||
runtime: Option<Arc<tokio::runtime::Runtime>>, | ||
) -> Self { | ||
Self { | ||
batch_size, | ||
table_schema, | ||
predicate, | ||
limit: None, | ||
store, | ||
runtime, | ||
} | ||
} | ||
} | ||
|
@@ -251,11 +255,18 @@ impl FileOpener for ParquetOpener { | |
let table_schema = self.table_schema.clone(); | ||
let predicate = self.predicate.clone(); | ||
let limit = self.limit; | ||
let handle = match &self.runtime { | ||
Some(runtime) => Some(runtime.handle().clone()), | ||
None => None, | ||
}; | ||
|
||
Ok(Box::pin(async move { | ||
// TODO avoid IO by converting passed file meta to ObjectMeta | ||
let meta = store.head(&path).await?; | ||
let mut reader = ParquetObjectReader::new(store, meta); | ||
if let Some(handle) = handle { | ||
reader = reader.with_runtime(handle); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does setting this do? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. new in arrow 53.3 i think - lets you push down a runtime for them to schedule their IO on. This has gotten me thinking about various ways to enable this sort of 'runtime passthrough' ourselves..
see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. Yeah, seems similar to what you're doing. |
||
} | ||
let metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; | ||
let parquet_schema = metadata.schema(); | ||
let (indicies, requested_ordering) = | ||
|
@@ -281,6 +292,9 @@ impl FileOpener for ParquetOpener { | |
|
||
let stream = builder.with_batch_size(batch_size).build()?; | ||
|
||
// println!("read IO"); | ||
// tokio::time::sleep(std::time::Duration::from_millis(10000)).await; // simulate IO delay | ||
|
||
let stream = stream.map(move |rbr| { | ||
// re-order each batch if needed | ||
rbr.map_err(Error::Parquet).and_then(|rb| { | ||
|
@@ -293,7 +307,7 @@ impl FileOpener for ParquetOpener { | |
} | ||
|
||
/// Implements [`FileOpener`] for a opening a parquet file from a presigned URL | ||
struct PresignedUrlOpener { | ||
pub(crate) struct PresignedUrlOpener { | ||
batch_size: usize, | ||
predicate: Option<ExpressionRef>, | ||
limit: Option<usize>, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For experimenting, I'd suggest using the multi-threaded reader — although I guess this does help determine how much a single call can read. Regardless,
`read-table-multi-threaded`
has a `--limit`
option for this case, so you can see that some data got returned without printing it all, and it also tells you the total row count. Maybe add that as an option here too :)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, thanks — I ended up playing with both, but yeah, the
`--limit`
option is nicer :)