Skip to content

Commit cf2bfb0

Browse files
Update vendored DuckDB sources to 44b706b2b7
1 parent a10cb2b commit cf2bfb0

File tree

76 files changed

+1416
-658
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+1416
-658
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/**
2+
* Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages.
3+
* Swaps Unicode mappings for EBCDIC LF and NL codes, as used on
4+
* S/390 (z/OS) Unix System Services (Open Edition).
5+
* For example, ucnv_open("ibm-1047,swaplfnl", &errorCode);
6+
* See convrtrs.txt.
7+
*
8+
* @see ucnv_open
9+
* @stable ICU 2.4
10+
*/
11+
#define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl"

src/duckdb/extension/parquet/column_reader.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -895,9 +895,6 @@ unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const
895895
default:
896896
throw NotImplementedException("Unrecognized Parquet type for Decimal");
897897
}
898-
case LogicalTypeId::GEOMETRY:
899-
// TODO: Make GeometryColumnReader
900-
return make_uniq<StringColumnReader>(reader, schema);
901898
case LogicalTypeId::UUID:
902899
return make_uniq<UUIDColumnReader>(reader, schema);
903900
case LogicalTypeId::INTERVAL:

src/duckdb/extension/parquet/include/parquet_column_schema.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace duckdb {
1515
using duckdb_parquet::FileMetaData;
1616
struct ParquetOptions;
1717

18-
enum class ParquetColumnSchemaType { COLUMN, FILE_ROW_NUMBER, EXPRESSION, VARIANT };
18+
enum class ParquetColumnSchemaType { COLUMN, FILE_ROW_NUMBER, EXPRESSION, VARIANT, GEOMETRY };
1919

2020
enum class ParquetExtraTypeInfo {
2121
NONE,
@@ -35,7 +35,7 @@ struct ParquetColumnSchema {
3535
ParquetColumnSchemaType schema_type = ParquetColumnSchemaType::COLUMN);
3636
ParquetColumnSchema(string name, LogicalType type, idx_t max_define, idx_t max_repeat, idx_t schema_index,
3737
idx_t column_index, ParquetColumnSchemaType schema_type = ParquetColumnSchemaType::COLUMN);
38-
ParquetColumnSchema(ParquetColumnSchema parent, LogicalType result_type, ParquetColumnSchemaType schema_type);
38+
ParquetColumnSchema(ParquetColumnSchema child, LogicalType result_type, ParquetColumnSchemaType schema_type);
3939

4040
ParquetColumnSchemaType schema_type;
4141
string name;

src/duckdb/extension/parquet/include/parquet_geometry.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@ struct ParquetColumnSchema;
2121
class ParquetReader;
2222
class ColumnReader;
2323
class ClientContext;
24-
class ExpressionExecutor;
24+
25+
struct GeometryColumnReader {
26+
static unique_ptr<ColumnReader> Create(ParquetReader &reader, const ParquetColumnSchema &schema,
27+
ClientContext &context);
28+
};
2529

2630
enum class GeoParquetColumnEncoding : uint8_t {
2731
WKB = 1,
@@ -90,9 +94,6 @@ class GeoParquetFileMetadata {
9094
const ClientContext &context);
9195
const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
9296

93-
static unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
94-
ClientContext &context);
95-
9697
bool IsGeometryColumn(const string &column_name) const;
9798

9899
static bool IsGeoParquetConversionEnabled(const ClientContext &context);

src/duckdb/extension/parquet/parquet_geometry.cpp

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
#include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
66
#include "duckdb/execution/expression_executor.hpp"
77
#include "duckdb/function/scalar_function.hpp"
8+
#include "duckdb/function/scalar/geometry_functions.hpp"
89
#include "duckdb/planner/expression/bound_function_expression.hpp"
910
#include "duckdb/planner/expression/bound_reference_expression.hpp"
1011
#include "duckdb/main/extension_helper.hpp"
1112
#include "reader/expression_column_reader.hpp"
1213
#include "parquet_reader.hpp"
1314
#include "yyjson.hpp"
15+
#include "reader/string_column_reader.hpp"
1416

1517
namespace duckdb {
1618

@@ -283,34 +285,24 @@ const unordered_map<string, GeoParquetColumnMetadata> &GeoParquetFileMetadata::G
283285
return geometry_columns;
284286
}
285287

286-
unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReader &reader,
287-
const ParquetColumnSchema &schema,
288-
ClientContext &context) {
289-
// Get the catalog
290-
auto &catalog = Catalog::GetSystemCatalog(context);
291-
292-
// WKB encoding
293-
if (schema.children[0].type.id() == LogicalTypeId::BLOB) {
294-
// Look for a conversion function in the catalog
295-
auto &conversion_func_set =
296-
catalog.GetEntry<ScalarFunctionCatalogEntry>(context, DEFAULT_SCHEMA, "st_geomfromwkb");
297-
auto conversion_func = conversion_func_set.functions.GetFunctionByArguments(context, {LogicalType::BLOB});
298-
299-
// Create a bound function call expression
300-
auto args = vector<unique_ptr<Expression>>();
301-
args.push_back(std::move(make_uniq<BoundReferenceExpression>(LogicalType::BLOB, 0)));
302-
auto expr = make_uniq<BoundFunctionExpression>(conversion_func.GetReturnType(), conversion_func,
303-
std::move(args), nullptr);
304-
305-
// Create a child reader
306-
auto child_reader = ColumnReader::CreateReader(reader, schema.children[0]);
307-
308-
// Create an expression reader that applies the conversion function to the child reader
309-
return make_uniq<ExpressionColumnReader>(context, std::move(child_reader), std::move(expr), schema);
310-
}
288+
unique_ptr<ColumnReader> GeometryColumnReader::Create(ParquetReader &reader, const ParquetColumnSchema &schema,
289+
ClientContext &context) {
290+
D_ASSERT(schema.type.id() == LogicalTypeId::GEOMETRY);
291+
D_ASSERT(schema.children.size() == 1 && schema.children[0].type.id() == LogicalTypeId::BLOB);
292+
293+
// Make a string reader for the underlying WKB data
294+
auto string_reader = make_uniq<StringColumnReader>(reader, schema.children[0]);
295+
296+
// Wrap the string reader in a geometry reader
297+
auto args = vector<unique_ptr<Expression>>();
298+
auto ref = make_uniq_base<Expression, BoundReferenceExpression>(LogicalTypeId::BLOB, 0);
299+
args.push_back(std::move(ref));
311300

312-
// Otherwise, unrecognized encoding
313-
throw NotImplementedException("Unsupported geometry encoding");
301+
// TODO: Pass the actual target type here so we get the CRS information too
302+
auto func = StGeomfromwkbFun::GetFunction();
303+
func.name = "ST_GeomFromWKB";
304+
auto expr = make_uniq_base<Expression, BoundFunctionExpression>(schema.type, func, std::move(args), nullptr);
305+
return make_uniq<ExpressionColumnReader>(context, std::move(string_reader), std::move(expr), schema);
314306
}
315307

316308
} // namespace duckdb

src/duckdb/extension/parquet/parquet_reader.cpp

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,6 @@ LogicalType ParquetReader::DeriveLogicalType(const SchemaElement &s_ele, Parquet
225225
return LogicalType::TIME_TZ;
226226
}
227227
return LogicalType::TIME;
228-
} else if (s_ele.logicalType.__isset.GEOMETRY) {
229-
// TODO: Set CRS too
230-
return LogicalType::GEOMETRY();
231-
} else if (s_ele.logicalType.__isset.GEOGRAPHY) {
232-
return LogicalType::GEOMETRY();
233228
}
234229
}
235230
if (s_ele.__isset.converted_type) {
@@ -409,6 +404,9 @@ unique_ptr<ColumnReader> ParquetReader::CreateReaderRecursive(ClientContext &con
409404
switch (schema.schema_type) {
410405
case ParquetColumnSchemaType::FILE_ROW_NUMBER:
411406
return make_uniq<RowNumberColumnReader>(*this, schema);
407+
case ParquetColumnSchemaType::GEOMETRY: {
408+
return GeometryColumnReader::Create(*this, schema, context);
409+
}
412410
case ParquetColumnSchemaType::COLUMN: {
413411
if (schema.children.empty()) {
414412
// leaf reader
@@ -486,11 +484,11 @@ ParquetColumnSchema::ParquetColumnSchema(string name_p, LogicalType type_p, idx_
486484
max_repeat(max_repeat), schema_index(schema_index), column_index(column_index) {
487485
}
488486

489-
ParquetColumnSchema::ParquetColumnSchema(ParquetColumnSchema parent, LogicalType result_type,
487+
ParquetColumnSchema::ParquetColumnSchema(ParquetColumnSchema child, LogicalType result_type,
490488
ParquetColumnSchemaType schema_type)
491-
: schema_type(schema_type), name(parent.name), type(std::move(result_type)), max_define(parent.max_define),
492-
max_repeat(parent.max_repeat), schema_index(parent.schema_index), column_index(parent.column_index) {
493-
children.push_back(std::move(parent));
489+
: schema_type(schema_type), name(child.name), type(std::move(result_type)), max_define(child.max_define),
490+
max_repeat(child.max_repeat), schema_index(child.schema_index), column_index(child.column_index) {
491+
children.push_back(std::move(child));
494492
}
495493

496494
unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(const FileMetaData &file_meta_data,
@@ -517,6 +515,32 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(const FileMetaData &file_m
517515
return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, parquet_options.can_have_nan);
518516
}
519517

518+
static bool IsGeometryType(const SchemaElement &s_ele, const ParquetFileMetadataCache &metadata, idx_t depth) {
519+
const auto is_blob = s_ele.__isset.type && s_ele.type == Type::BYTE_ARRAY;
520+
if (!is_blob) {
521+
return false;
522+
}
523+
524+
// TODO: Handle CRS in the future
525+
const auto is_native_geom = s_ele.__isset.logicalType && s_ele.logicalType.__isset.GEOMETRY;
526+
const auto is_native_geog = s_ele.__isset.logicalType && s_ele.logicalType.__isset.GEOGRAPHY;
527+
if (is_native_geom || is_native_geog) {
528+
return true;
529+
}
530+
531+
// geoparquet types have to be at the root of the schema, and have to be present in the kv metadata.
532+
const auto is_at_root = depth == 1;
533+
const auto is_in_gpq_metadata = metadata.geo_metadata && metadata.geo_metadata->IsGeometryColumn(s_ele.name);
534+
const auto is_leaf = s_ele.num_children == 0;
535+
const auto is_geoparquet_geom = is_at_root && is_in_gpq_metadata && is_leaf;
536+
537+
if (is_geoparquet_geom) {
538+
return true;
539+
}
540+
541+
return false;
542+
}
543+
520544
ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
521545
idx_t &next_schema_idx, idx_t &next_file_idx,
522546
ClientContext &context) {
@@ -540,16 +564,26 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
540564
max_repeat++;
541565
}
542566

543-
// Check for geoparquet spatial types
544-
if (depth == 1) {
545-
// geoparquet types have to be at the root of the schema, and have to be present in the kv metadata.
546-
// geoarrow types, although geometry columns, are structs and have children and are handled below.
547-
if (metadata->geo_metadata && metadata->geo_metadata->IsGeometryColumn(s_ele.name) && s_ele.num_children == 0) {
548-
auto geom_schema = ParseColumnSchema(s_ele, max_define, max_repeat, this_idx, next_file_idx++);
549-
// overwrite the derived type with GEOMETRY
550-
geom_schema.type = LogicalType::GEOMETRY();
551-
return geom_schema;
552-
}
567+
// Check for geometry type
568+
if (IsGeometryType(s_ele, *metadata, depth)) {
569+
// Geometries in both GeoParquet and native parquet are stored as a WKB-encoded BLOB.
570+
// Because we don't just want to validate that the WKB encoding is correct, but also transform it into
571+
// little-endian if necessary, we can't just make use of the StringColumnReader without heavily modifying it.
572+
// Therefore, we create a dedicated GEOMETRY parquet column schema type, which wraps the underlying BLOB column.
573+
// This schema type gets instantiated as an ExpressionColumnReader on top of the standard Blob/String reader,
574+
// which performs the WKB validation/transformation using the `ST_GeomFromWKB` function of DuckDB.
575+
// This enables us to also support other geometry encodings (such as GeoArrow geometries) more easily in the future.
576+
577+
// Inner BLOB schema
578+
ParquetColumnSchema blob_schema(max_define, max_repeat, this_idx, next_file_idx++,
579+
ParquetColumnSchemaType::COLUMN);
580+
blob_schema.name = s_ele.name;
581+
blob_schema.type = LogicalType::BLOB;
582+
583+
// Wrap in geometry schema
584+
ParquetColumnSchema geom_schema(std::move(blob_schema), LogicalType::GEOMETRY(),
585+
ParquetColumnSchemaType::GEOMETRY);
586+
return geom_schema;
553587
}
554588

555589
if (s_ele.__isset.num_children && s_ele.num_children > 0) { // inner node

src/duckdb/src/common/allocator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ static void MallocTrim(idx_t pad) {
256256
return; // Another thread has updated LAST_TRIM_TIMESTAMP_MS since we loaded it
257257
}
258258

259-
// We succesfully updated LAST_TRIM_TIMESTAMP_MS, we can trim
259+
// We successfully updated LAST_TRIM_TIMESTAMP_MS, we can trim
260260
malloc_trim(pad);
261261
#endif
262262
}

src/duckdb/src/common/arrow/arrow_query_result.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,7 @@ ArrowQueryResult::ArrowQueryResult(StatementType statement_type, StatementProper
1616
ArrowQueryResult::ArrowQueryResult(ErrorData error) : QueryResult(QueryResultType::ARROW_RESULT, std::move(error)) {
1717
}
1818

19-
unique_ptr<DataChunk> ArrowQueryResult::Fetch() {
20-
throw NotImplementedException("Can't 'Fetch' from ArrowQueryResult");
21-
}
22-
unique_ptr<DataChunk> ArrowQueryResult::FetchRaw() {
19+
unique_ptr<DataChunk> ArrowQueryResult::FetchInternal() {
2320
throw NotImplementedException("Can't 'FetchRaw' from ArrowQueryResult");
2421
}
2522

src/duckdb/src/common/arrow/arrow_type_extension.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include "duckdb/common/arrow/schema_metadata.hpp"
88
#include "duckdb/common/types/vector.hpp"
99

10+
#include "yyjson.hpp"
11+
1012
namespace duckdb {
1113

1214
ArrowTypeExtension::ArrowTypeExtension(string extension_name, string arrow_format,
@@ -365,6 +367,72 @@ struct ArrowBool8 {
365367
}
366368
};
367369

370+
struct ArrowGeometry {
371+
static unique_ptr<ArrowType> GetType(const ArrowSchema &schema, const ArrowSchemaMetadata &schema_metadata) {
372+
// Validate extension metadata. This metadata also contains a CRS, which we drop
373+
// because the GEOMETRY type does not implement a CRS at the type level (yet).
374+
const auto extension_metadata = schema_metadata.GetOption(ArrowSchemaMetadata::ARROW_METADATA_KEY);
375+
if (!extension_metadata.empty()) {
376+
unique_ptr<duckdb_yyjson::yyjson_doc, void (*)(duckdb_yyjson::yyjson_doc *)> doc(
377+
duckdb_yyjson::yyjson_read(extension_metadata.data(), extension_metadata.size(),
378+
duckdb_yyjson::YYJSON_READ_NOFLAG),
379+
duckdb_yyjson::yyjson_doc_free);
380+
if (!doc) {
381+
throw SerializationException("Invalid JSON in GeoArrow metadata");
382+
}
383+
384+
duckdb_yyjson::yyjson_val *val = yyjson_doc_get_root(doc.get());
385+
if (!yyjson_is_obj(val)) {
386+
throw SerializationException("Invalid GeoArrow metadata: not a JSON object");
387+
}
388+
389+
duckdb_yyjson::yyjson_val *edges = yyjson_obj_get(val, "edges");
390+
if (edges && yyjson_is_str(edges) && std::strcmp(yyjson_get_str(edges), "planar") != 0) {
391+
throw NotImplementedException("Can't import non-planar edges");
392+
}
393+
}
394+
395+
const auto format = string(schema.format);
396+
if (format == "z") {
397+
return make_uniq<ArrowType>(LogicalType::GEOMETRY(),
398+
make_uniq<ArrowStringInfo>(ArrowVariableSizeType::NORMAL));
399+
}
400+
if (format == "Z") {
401+
return make_uniq<ArrowType>(LogicalType::GEOMETRY(),
402+
make_uniq<ArrowStringInfo>(ArrowVariableSizeType::SUPER_SIZE));
403+
}
404+
if (format == "vz") {
405+
return make_uniq<ArrowType>(LogicalType::GEOMETRY(),
406+
make_uniq<ArrowStringInfo>(ArrowVariableSizeType::VIEW));
407+
}
408+
throw InvalidInputException("Arrow extension type \"%s\" not supported for geoarrow.wkb", format.c_str());
409+
}
410+
411+
static void PopulateSchema(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &schema, const LogicalType &type,
412+
ClientContext &context, const ArrowTypeExtension &extension) {
413+
ArrowSchemaMetadata schema_metadata;
414+
schema_metadata.AddOption(ArrowSchemaMetadata::ARROW_EXTENSION_NAME, "geoarrow.wkb");
415+
schema_metadata.AddOption(ArrowSchemaMetadata::ARROW_METADATA_KEY, "{}");
416+
root_holder.metadata_info.emplace_back(schema_metadata.SerializeMetadata());
417+
schema.metadata = root_holder.metadata_info.back().get();
418+
419+
const auto options = context.GetClientProperties();
420+
if (options.arrow_offset_size == ArrowOffsetSize::LARGE) {
421+
schema.format = "Z";
422+
} else {
423+
schema.format = "z";
424+
}
425+
}
426+
427+
static void ArrowToDuck(ClientContext &, Vector &source, Vector &result, idx_t count) {
428+
Geometry::FromBinary(source, result, count, true);
429+
}
430+
431+
static void DuckToArrow(ClientContext &context, Vector &source, Vector &result, idx_t count) {
432+
Geometry::ToBinary(source, result, count);
433+
}
434+
};
435+
368436
void ArrowTypeExtensionSet::Initialize(const DBConfig &config) {
369437
// Types that are 1:1
370438
config.RegisterArrowExtension({"arrow.uuid", "w:16", make_shared_ptr<ArrowTypeExtensionData>(LogicalType::UUID)});
@@ -380,6 +448,11 @@ void ArrowTypeExtensionSet::Initialize(const DBConfig &config) {
380448
config.RegisterArrowExtension(
381449
{"DuckDB", "time_tz", "w:8", make_shared_ptr<ArrowTypeExtensionData>(LogicalType::TIME_TZ)});
382450

451+
config.RegisterArrowExtension(
452+
{"geoarrow.wkb", ArrowGeometry::PopulateSchema, ArrowGeometry::GetType,
453+
make_shared_ptr<ArrowTypeExtensionData>(LogicalType::GEOMETRY(), LogicalType::BLOB, ArrowGeometry::ArrowToDuck,
454+
ArrowGeometry::DuckToArrow)});
455+
383456
// Types that are 1:n
384457
config.RegisterArrowExtension({"arrow.json", &ArrowJson::PopulateSchema, &ArrowJson::GetType,
385458
make_shared_ptr<ArrowTypeExtensionData>(LogicalType::JSON())});

src/duckdb/src/common/encryption_key_manager.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ EncryptionKey::~EncryptionKey() {
3131
void EncryptionKey::LockEncryptionKey(data_ptr_t key, idx_t key_len) {
3232
#if defined(_WIN32)
3333
VirtualLock(key, key_len);
34+
#elif defined(__MVS__)
35+
__mlockall(_BPX_NONSWAP);
3436
#else
3537
mlock(key, key_len);
3638
#endif
@@ -40,6 +42,8 @@ void EncryptionKey::UnlockEncryptionKey(data_ptr_t key, idx_t key_len) {
4042
memset(key, 0, key_len);
4143
#if defined(_WIN32)
4244
VirtualUnlock(key, key_len);
45+
#elif defined(__MVS__)
46+
__mlockall(_BPX_SWAP);
4347
#else
4448
munlock(key, key_len);
4549
#endif

0 commit comments

Comments
 (0)