From 06d8dbb0e33ae4a75193d0e2e666b87d96d1d826 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:35:17 +0100 Subject: [PATCH] feat!(ffi): expose metadata in SchemaEngineVisitor ffi api (#659) - new feature PR: "Expose metadata in SchemaEngineVisitor ffi api" ## What changes are proposed in this pull request? ### This PR affects the following public APIs A new argument "metadata" was added to EngineSchemaVisitor visitor functions. ## How was this change tested? `examples/read_table` was updated to test the change. --- ffi/examples/read-table/schema.h | 36 +++++++++++++++++++++++++++---- ffi/src/scan.rs | 7 ++++++ ffi/src/schema.rs | 37 +++++++++++++++++++++++++++++--- 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/ffi/examples/read-table/schema.h b/ffi/examples/read-table/schema.h index b4a64b4e6..8c29675a6 100644 --- a/ffi/examples/read-table/schema.h +++ b/ffi/examples/read-table/schema.h @@ -85,6 +85,24 @@ void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int paren } } +void print_physical_name(const char *name, const CStringMap* metadata) +{ +#ifdef VERBOSE + char* key_str = "delta.columnMapping.physicalName"; + KernelStringSlice key = { key_str, strlen(key_str) }; + char* value = get_from_map(metadata, key, allocate_string); + if (value) { + printf("Physical name of %s is %s\n", name, value); + free(value); + } else { + printf("No physical name\n"); + } +#else + (void)name; + (void)metadata; +#endif +} + // declare all our visitor methods uintptr_t make_field_list(void* data, uintptr_t reserve) { @@ -109,11 +127,13 @@ void visit_struct( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); PRINT_CHILD_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id); + print_physical_name(name_ptr, metadata); SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct", is_nullable); struct_item->children = child_list_id; } @@ -123,12 +143,14 @@ void visit_array( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); + print_physical_name(name_ptr, metadata); PRINT_CHILD_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array", is_nullable); array_item->children = child_list_id; @@ -139,12 +161,14 @@ void visit_map( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); + print_physical_name(name_ptr, metadata); PRINT_CHILD_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map", is_nullable); map_item->children = child_list_id; @@ -155,6 +179,7 @@ void visit_decimal( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uint8_t precision, uint8_t scale) { @@ -162,6 +187,7 @@ void visit_decimal( char* name_ptr = allocate_string(name); char* type = malloc(19 * sizeof(char)); snprintf(type, 19, "decimal(%u)(%d)", precision, scale); + print_physical_name(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } @@ -171,18 +197,20 @@ void visit_simple_type( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, char* type) { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); + print_physical_name(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } -#define DEFINE_VISIT_SIMPLE_TYPE(typename) \ - void visit_##typename(void* data, uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable)\ - { \ - visit_simple_type(data, sibling_list_id, name, is_nullable, #typename); \ +#define DEFINE_VISIT_SIMPLE_TYPE(typename) \ + void visit_##typename(void* data, uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, const CStringMap * metadata)\ + { \ + visit_simple_type(data, sibling_list_id, name, is_nullable, metadata, #typename); \ } DEFINE_VISIT_SIMPLE_TYPE(string) diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 5d3b5047b..f2fee7643 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -269,10 +269,17 @@ type CScanCallback = extern "C" fn( partition_map: &CStringMap, ); +#[derive(Default)] pub struct CStringMap { values: HashMap, } +impl From> for CStringMap { + fn from(val: HashMap) -> Self { + Self { values: val } + } +} + #[no_mangle] /// allow probing into a CStringMap. If the specified key is in the map, kernel will call /// allocate_fn with the value associated with the key and return the value returned from that diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index b74051133..23da22bc3 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -1,7 +1,9 @@ use std::os::raw::c_void; +use crate::scan::CStringMap; use crate::{handle::Handle, kernel_string_slice, KernelStringSlice, SharedSnapshot}; use delta_kernel::schema::{ArrayType, DataType, MapType, PrimitiveType, StructType}; + /// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own /// representation of a schema from a particular schema within kernel. /// @@ -28,7 +30,6 @@ use delta_kernel::schema::{ArrayType, DataType, MapType, PrimitiveType, StructTy /// that element's (already-visited) children. /// 4. The [`visit_schema`] method returns the id of the list of top-level columns // WARNING: the visitor MUST NOT retain internal references to the string slices passed to visitor methods -// TODO: struct field metadata #[repr(C)] pub struct EngineSchemaVisitor { /// opaque state pointer @@ -44,6 +45,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -54,6 +56,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -65,6 +68,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -74,6 +78,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, precision: u8, scale: u8, ), @@ -84,6 +89,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `long` belonging to the list identified by `sibling_list_id`. @@ -92,6 +98,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit an `integer` belonging to the list identified by `sibling_list_id`. @@ -100,6 +107,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `short` belonging to the list identified by `sibling_list_id`. @@ -108,6 +116,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `byte` belonging to the list identified by `sibling_list_id`. @@ -116,6 +125,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `float` belonging to the list identified by `sibling_list_id`. @@ -124,6 +134,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `double` belonging to the list identified by `sibling_list_id`. @@ -132,6 +143,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. @@ -140,6 +152,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit `binary` belonging to the list identified by `sibling_list_id`. @@ -148,6 +161,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `date` belonging to the list identified by `sibling_list_id`. @@ -156,6 +170,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. @@ -164,6 +179,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. @@ -172,6 +188,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), } @@ -197,6 +214,7 @@ pub unsafe extern "C" fn visit_schema( field.name(), field.data_type(), field.is_nullable(), + &field.metadata_with_string_values().into(), visitor, child_list_id, ); @@ -210,10 +228,12 @@ pub unsafe extern "C" fn visit_schema( contains_null: bool, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 1); + let metadata = CStringMap::default(); visit_schema_item( "array_element", &at.element_type, contains_null, + &metadata, visitor, child_list_id, ); @@ -226,11 +246,20 @@ pub unsafe extern "C" fn visit_schema( value_contains_null: bool, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 2); - visit_schema_item("map_key", &mt.key_type, false, visitor, child_list_id); + let metadata = CStringMap::default(); + visit_schema_item( + "map_key", + &mt.key_type, + false, + &metadata, + visitor, + child_list_id, + ); visit_schema_item( "map_value", &mt.value_type, value_contains_null, + &metadata, visitor, child_list_id, ); @@ -242,6 +271,7 @@ pub unsafe extern "C" fn visit_schema( name: &str, data_type: &DataType, is_nullable: bool, + metadata: &CStringMap, visitor: &EngineSchemaVisitor, sibling_list_id: usize, ) { @@ -251,7 +281,8 @@ pub unsafe extern "C" fn visit_schema( visitor.data, sibling_list_id, kernel_string_slice!(name), - is_nullable + is_nullable, + metadata $(, $extra_args) * ) };