From 4991b9e62003703311f82d8b6e9db1789052e778 Mon Sep 17 00:00:00 2001 From: Ivan G Date: Tue, 25 Apr 2023 16:13:23 +0100 Subject: [PATCH] bug: empty logical type container is created if there is no matching logical type (#314) * do not create a logical type container if there is no corresponding logical type * fix test asserts and bump version to 4.9.2 --- .github/workflows/full.yml | 2 +- src/Parquet.PerfRunner/Program.cs | 2 +- .../Integration/WriteQuestionableTypesTest.cs | 49 +++++++++++++++++ src/Parquet/Encodings/SchemaEncoder.cs | 54 ++++++++++++------- 4 files changed, 86 insertions(+), 21 deletions(-) create mode 100644 src/Parquet.Test/Integration/WriteQuestionableTypesTest.cs diff --git a/.github/workflows/full.yml b/.github/workflows/full.yml index 7fde5307..b0f2addd 100644 --- a/.github/workflows/full.yml +++ b/.github/workflows/full.yml @@ -1,7 +1,7 @@ name: 'Full Workflow' env: - VERSION: 4.9.1 + VERSION: 4.9.2 ASM_VERSION: 4.0.0 on: diff --git a/src/Parquet.PerfRunner/Program.cs b/src/Parquet.PerfRunner/Program.cs index 8eb35137..b4cd3a67 100644 --- a/src/Parquet.PerfRunner/Program.cs +++ b/src/Parquet.PerfRunner/Program.cs @@ -22,5 +22,5 @@ //var c = new Classes(); //c.SetUp(); //c.Serialise(); - await ParquetReader.ReadTableFromFileAsync("C:\\Users\\alone\\Downloads\\wide_parquet\\wide_parquet.parquet"); + //await ParquetReader.ReadTableFromFileAsync("C:\\Users\\alone\\Downloads\\wide_parquet\\wide_parquet.parquet"); } diff --git a/src/Parquet.Test/Integration/WriteQuestionableTypesTest.cs b/src/Parquet.Test/Integration/WriteQuestionableTypesTest.cs new file mode 100644 index 00000000..67947d54 --- /dev/null +++ b/src/Parquet.Test/Integration/WriteQuestionableTypesTest.cs @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Parquet.Data; +using Parquet.Schema; +using Xunit; +using F = System.IO.File; +using Path = System.IO.Path; + 
+namespace Parquet.Test.Integration { + public class WriteQuestionableTypesTest : IntegrationBase { + + private async Task<string> ReadWithPQT(ParquetSchema schema, DataColumn dc) { + string testFileName = Path.GetFullPath($"temp.{nameof(WriteQuestionableTypesTest)}.parquet"); + if(F.Exists(testFileName)) + F.Delete(testFileName); + + using(Stream s = F.OpenWrite(testFileName)) { + using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, s)) { + using ParquetRowGroupWriter rgw = writer.CreateRowGroup(); + + await rgw.WriteColumnAsync(dc); + } + } + + string? json = ExecMrCat(testFileName); + return json ?? string.Empty; + } + + [Fact] + public async Task DateTime_Default() { + var schema = new ParquetSchema(new DataField<DateTime>("qtype")); + var dc = new DataColumn(schema.DataFields.First(), new[] { new DateTime(2023, 04, 25, 1, 2, 3) }); + string json = await ReadWithPQT(schema, dc); + Assert.Equal("{\"qtype\":\"AK4X1GIDAACciSUA\"}", json); + } + + [Fact] + public async Task Timestamp_Default() { + var schema = new ParquetSchema(new DataField<TimeSpan>("qtype")); + var dc = new DataColumn(schema.DataFields.First(), new[] { TimeSpan.FromHours(7) }); + string json = await ReadWithPQT(schema, dc); + Assert.Equal("{\"qtype\":25200000}", json); + } + } +} diff --git a/src/Parquet/Encodings/SchemaEncoder.cs b/src/Parquet/Encodings/SchemaEncoder.cs index 4fbefbc9..1278dbbb 100644 --- a/src/Parquet/Encodings/SchemaEncoder.cs +++ b/src/Parquet/Encodings/SchemaEncoder.cs @@ -330,7 +330,6 @@ private static void Encode(StructField structField, Thrift.SchemaElement parent, public static Thrift.SchemaElement Encode(DataField field) { SType st = field.ClrType; var tse = new Thrift.SchemaElement(field.Name); - tse.LogicalType = new Thrift.LogicalType(); if(st == typeof(bool)) { // boolean tse.Type = Thrift.Type.BOOLEAN; @@ -348,9 +347,11 @@ public static Thrift.SchemaElement Encode(DataField field) { bw = 32; bool signed = st == typeof(sbyte) || st == typeof(short) || st == typeof(int); - 
tse.LogicalType.INTEGER = new Thrift.IntType { - BitWidth = bw, - IsSigned = signed + tse.LogicalType = new LogicalType { + INTEGER = new Thrift.IntType { + BitWidth = bw, + IsSigned = signed + } }; tse.Converted_type = bw switch { 8 => signed ? Thrift.ConvertedType.INT_8 : Thrift.ConvertedType.UINT_8, @@ -360,7 +361,12 @@ public static Thrift.SchemaElement Encode(DataField field) { }; } else if(st == typeof(long) || st == typeof(ulong)) { // 64-bit number tse.Type = Thrift.Type.INT64; - tse.LogicalType.INTEGER = new Thrift.IntType { BitWidth = 64, IsSigned = st == typeof(long) }; + tse.LogicalType = new LogicalType { + INTEGER = new Thrift.IntType { + BitWidth = 64, + IsSigned = st == typeof(long) + } + }; tse.Converted_type = st == typeof(long) ? Thrift.ConvertedType.INT_64 : Thrift.ConvertedType.UINT_64; } else if(st == typeof(float)) { // float tse.Type = Thrift.Type.FLOAT; @@ -370,7 +376,9 @@ public static Thrift.SchemaElement Encode(DataField field) { tse.Type = Thrift.Type.INT96; } else if(st == typeof(string)) { // string tse.Type = Thrift.Type.BYTE_ARRAY; - tse.LogicalType.STRING = new Thrift.StringType(); + tse.LogicalType = new LogicalType { + STRING = new Thrift.StringType() + }; tse.Converted_type = Thrift.ConvertedType.UTF8; } else if(st == typeof(decimal)) { // decimal @@ -398,9 +406,11 @@ public static Thrift.SchemaElement Encode(DataField field) { tse.Type_length = 16; } - tse.LogicalType.DECIMAL = new Thrift.DecimalType { - Precision = precision, - Scale = scale + tse.LogicalType = new LogicalType { + DECIMAL = new Thrift.DecimalType { + Precision = precision, + Scale = scale + } }; tse.Precision = precision; tse.Scale = scale; @@ -427,7 +437,7 @@ public static Thrift.SchemaElement Encode(DataField field) { #if NET6_0_OR_GREATER } else if(st == typeof(DateOnly)) { // DateOnly tse.Type = Thrift.Type.INT32; - tse.LogicalType.DATE = new Thrift.DateType(); + tse.LogicalType = new LogicalType { DATE = new Thrift.DateType() }; tse.Converted_type = 
Thrift.ConvertedType.DATE; #endif } else if(st == typeof(TimeSpan)) { // TimeSpan @@ -435,17 +445,21 @@ public static Thrift.SchemaElement Encode(DataField field) { switch(dfTime.TimeSpanFormat) { case TimeSpanFormat.MilliSeconds: tse.Type = Thrift.Type.INT32; - tse.LogicalType.TIME = new Thrift.TimeType { - IsAdjustedToUTC = true, - Unit = new Thrift.TimeUnit { MILLIS = new Thrift.MilliSeconds() } + tse.LogicalType = new LogicalType { + TIME = new Thrift.TimeType { + IsAdjustedToUTC = true, + Unit = new Thrift.TimeUnit { MILLIS = new Thrift.MilliSeconds() } + } }; tse.Converted_type = Thrift.ConvertedType.TIME_MILLIS; break; case TimeSpanFormat.MicroSeconds: tse.Type = Thrift.Type.INT64; - tse.LogicalType.TIME = new Thrift.TimeType { - IsAdjustedToUTC = true, - Unit = new Thrift.TimeUnit { MICROS = new Thrift.MicroSeconds() } + tse.LogicalType = new LogicalType { + TIME = new Thrift.TimeType { + IsAdjustedToUTC = true, + Unit = new Thrift.TimeUnit { MICROS = new Thrift.MicroSeconds() } + } }; tse.Converted_type = Thrift.ConvertedType.TIME_MICROS; break; @@ -454,9 +468,11 @@ public static Thrift.SchemaElement Encode(DataField field) { } } else { tse.Type = Thrift.Type.INT32; - tse.LogicalType.TIME = new Thrift.TimeType { - IsAdjustedToUTC = true, - Unit = new Thrift.TimeUnit { MILLIS = new Thrift.MilliSeconds() } + tse.LogicalType = new LogicalType { + TIME = new Thrift.TimeType { + IsAdjustedToUTC = true, + Unit = new Thrift.TimeUnit { MILLIS = new Thrift.MilliSeconds() } + } }; tse.Converted_type = Thrift.ConvertedType.TIME_MILLIS; }