Skip to content

Commit d7e1b51

Browse files
letian-jiangMoonm3n
authored andcommitted
[BugFix] Fix writing array column into parquet file (StarRocks#23359)
Signed-off-by: Letian Jiang <[email protected]> Signed-off-by: Moonm3n <[email protected]>
1 parent ec3a438 commit d7e1b51

File tree

2 files changed

+1
-100
lines changed

2 files changed

+1
-100
lines changed

be/src/formats/parquet/level_builder.cpp

+1-96
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ void LevelBuilder::_write_column_chunk(const LevelBuilderContext& ctx, const Typ
113113
break;
114114
}
115115
case TYPE_ARRAY: {
116-
_write_array_column_chunk_branchless(ctx, type_desc, node, col, write_leaf_callback);
116+
_write_array_column_chunk(ctx, type_desc, node, col, write_leaf_callback);
117117
break;
118118
}
119119
case TYPE_MAP: {
@@ -399,101 +399,6 @@ void LevelBuilder::_write_array_column_chunk(const LevelBuilderContext& ctx, con
399399
_write_column_chunk(derived_ctx, type_desc.children[0], inner_node, elements, write_leaf_callback);
400400
}
401401

402-
void LevelBuilder::_write_array_column_chunk_branchless(const LevelBuilderContext& ctx, const TypeDescriptor& type_desc,
403-
const ::parquet::schema::NodePtr& node, const ColumnPtr& col,
404-
const CallbackFunction& write_leaf_callback) {
405-
DCHECK(type_desc.type == TYPE_ARRAY);
406-
auto outer_node = std::static_pointer_cast<::parquet::schema::GroupNode>(node);
407-
auto mid_node = std::static_pointer_cast<::parquet::schema::GroupNode>(outer_node->field(0));
408-
auto inner_node = mid_node->field(0);
409-
410-
auto* null_col = get_raw_null_column(col);
411-
auto* array_col = down_cast<ArrayColumn*>(ColumnHelper::get_data_column(col.get()));
412-
const auto& elements = array_col->elements_column();
413-
const auto& offsets = array_col->offsets_column()->get_data();
414-
415-
size_t num_levels_upper_bound = ctx._num_levels + elements->size();
416-
auto def_levels = std::make_shared<std::vector<int16_t>>(num_levels_upper_bound,
417-
ctx._max_def_level + node->is_optional() + 1);
418-
auto rep_levels = std::make_shared<std::vector<int16_t>>(num_levels_upper_bound, ctx._max_rep_level + 1);
419-
420-
size_t num_levels = 0; // pointer to def/rep levels
421-
int offset = 0; // pointer to current column
422-
int ctx_ptr = 0; // pointer to levels in context
423-
424-
if (null_col != nullptr) {
425-
while (ctx_ptr < ctx._num_levels && offset < col->size()) {
426-
auto def_level = ctx._def_levels ? (*ctx._def_levels)[ctx_ptr] : 0;
427-
auto rep_level = ctx._rep_levels ? (*ctx._rep_levels)[ctx_ptr] : 0;
428-
429-
uint8_t null_in_parent_column = def_level != ctx._max_def_level;
430-
uint8_t null_in_current_column = null_col[offset] == 1;
431-
uint8_t empty_array = offsets[offset + 1] == offsets[offset];
432-
uint8_t is_optional = node->is_optional();
433-
auto array_size = offsets[offset + 1] - offsets[offset];
434-
435-
// set def_level back for null in parent column
436-
(*def_levels)[num_levels] += null_in_parent_column * (def_level - (*def_levels)[num_levels]);
437-
// decrement 1 for empty array
438-
(*def_levels)[num_levels] -= ((1 - null_in_parent_column) & (1 - null_in_current_column) & empty_array);
439-
// decrement node->is_optional for null in current column
440-
(*def_levels)[num_levels] -= ((1 - null_in_parent_column) & null_in_current_column & is_optional);
441-
(*rep_levels)[num_levels] = rep_level;
442-
443-
// update pointers
444-
uint8_t advance_one_step = null_in_parent_column | null_in_current_column | empty_array;
445-
num_levels += advance_one_step + (1 - advance_one_step) * array_size;
446-
offset += (1 - null_in_parent_column);
447-
ctx_ptr++;
448-
}
449-
} else {
450-
while (ctx_ptr < ctx._num_levels && offset < col->size()) {
451-
auto def_level = ctx._def_levels ? (*ctx._def_levels)[ctx_ptr] : 0;
452-
auto rep_level = ctx._rep_levels ? (*ctx._rep_levels)[ctx_ptr] : 0;
453-
454-
uint8_t null_in_parent_column = def_level != ctx._max_def_level;
455-
uint8_t empty_array = offsets[offset + 1] == offsets[offset];
456-
auto array_size = offsets[offset + 1] - offsets[offset];
457-
458-
// set def_level back for null in parent column
459-
(*def_levels)[num_levels] =
460-
null_in_parent_column * def_level + (1 - null_in_parent_column) * (*def_levels)[num_levels];
461-
// decrement 1 for empty array
462-
(*def_levels)[num_levels] -= ((1 - null_in_parent_column) & empty_array);
463-
(*rep_levels)[num_levels] = rep_level;
464-
465-
// update pointers
466-
uint8_t advance_one_step = null_in_parent_column | empty_array;
467-
num_levels += advance_one_step + (1 - advance_one_step) * array_size;
468-
offset += (1 - null_in_parent_column);
469-
ctx_ptr++;
470-
}
471-
}
472-
DCHECK(col->size() == offset);
473-
474-
// handle remaining undefined entries
475-
if (ctx_ptr < ctx._num_levels) {
476-
if (ctx._def_levels == nullptr) {
477-
memset(def_levels->data() + num_levels, 0, ctx._num_levels - ctx_ptr);
478-
} else {
479-
memcpy(def_levels->data() + num_levels, ctx._def_levels->data() + ctx_ptr, ctx._num_levels - ctx_ptr);
480-
}
481-
482-
if (ctx._rep_levels == nullptr) {
483-
memset(rep_levels->data() + num_levels, 0, ctx._num_levels - ctx_ptr);
484-
} else {
485-
memcpy(rep_levels->data() + num_levels, ctx._rep_levels->data() + ctx_ptr, ctx._num_levels - ctx_ptr);
486-
}
487-
}
488-
489-
def_levels->resize(num_levels);
490-
rep_levels->resize(num_levels);
491-
LevelBuilderContext derived_ctx(def_levels->size(), def_levels, ctx._max_def_level + node->is_optional() + 1,
492-
rep_levels, ctx._max_rep_level + 1);
493-
494-
_write_column_chunk(derived_ctx, type_desc.children[0], inner_node, elements, write_leaf_callback);
495-
}
496-
497402
void LevelBuilder::_write_map_column_chunk(const LevelBuilderContext& ctx, const TypeDescriptor& type_desc,
498403
const ::parquet::schema::NodePtr& node, const ColumnPtr& col,
499404
const CallbackFunction& write_leaf_callback) {

be/src/formats/parquet/level_builder.h

-4
Original file line numberDiff line numberDiff line change
@@ -126,10 +126,6 @@ class LevelBuilder {
126126
const ::parquet::schema::NodePtr& node, const ColumnPtr& col,
127127
const CallbackFunction& write_leaf_callback);
128128

129-
void _write_array_column_chunk_branchless(const LevelBuilderContext& ctx, const TypeDescriptor& type_desc,
130-
const ::parquet::schema::NodePtr& node, const ColumnPtr& col,
131-
const CallbackFunction& write_leaf_callback);
132-
133129
void _write_map_column_chunk(const LevelBuilderContext& ctx, const TypeDescriptor& type_desc,
134130
const ::parquet::schema::NodePtr& node, const ColumnPtr& col,
135131
const CallbackFunction& write_leaf_callback);

0 commit comments

Comments
 (0)