Skip to content

Commit 0dd5949

Browse files
authored
Fixes for Multisearch and Append command (#4512)
* Fix for Multisearch and Append command Signed-off-by: Kai Huang <[email protected]> # Conflicts: # docs/category.json * fix tests Signed-off-by: Kai Huang <[email protected]> * fix test Signed-off-by: Kai Huang <[email protected]> * remove error location Signed-off-by: Kai Huang <[email protected]> * Allow same SqlTypeName but with different nullability to be merged Signed-off-by: Kai Huang <[email protected]> * Update error message Signed-off-by: Kai Huang <[email protected]> --------- Signed-off-by: Kai Huang <[email protected]>
1 parent e6eb808 commit 0dd5949

File tree

9 files changed

+171
-251
lines changed

9 files changed

+171
-251
lines changed

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1809,18 +1809,16 @@ public RelNode visitMultisearch(Multisearch node, CalcitePlanContext context) {
18091809
}
18101810

18111811
/**
1812-
* Finds the timestamp field for multisearch ordering.
1812+
* Finds the @timestamp field for multisearch ordering. Only @timestamp field is used for
1813+
* timestamp interleaving. Other timestamp-like fields are ignored.
18131814
*
1814-
* @param rowType The row type to search for timestamp fields
1815-
* @return The name of the timestamp field, or null if not found
1815+
* @param rowType The row type to search for @timestamp field
1816+
* @return "@timestamp" if the field exists, or null if not found
18161817
*/
18171818
private String findTimestampField(RelDataType rowType) {
1818-
String[] candidates = {"@timestamp", "_time", "timestamp", "time"};
1819-
for (String fieldName : candidates) {
1820-
RelDataTypeField field = rowType.getField(fieldName, false, false);
1821-
if (field != null) {
1822-
return fieldName;
1823-
}
1819+
RelDataTypeField field = rowType.getField("@timestamp", false, false);
1820+
if (field != null) {
1821+
return "@timestamp";
18241822
}
18251823
return null;
18261824
}

core/src/main/java/org/opensearch/sql/calcite/SchemaUnifier.java

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,27 @@
77

88
import java.util.ArrayList;
99
import java.util.HashMap;
10-
import java.util.HashSet;
1110
import java.util.List;
1211
import java.util.Map;
13-
import java.util.Set;
1412
import java.util.stream.Collectors;
1513
import org.apache.calcite.rel.RelNode;
1614
import org.apache.calcite.rel.type.RelDataType;
1715
import org.apache.calcite.rel.type.RelDataTypeField;
1816
import org.apache.calcite.rex.RexNode;
19-
import org.apache.calcite.sql.validate.SqlValidatorUtil;
2017

2118
/**
22-
* Utility class for unifying schemas across multiple RelNodes with type conflict resolution. Uses
23-
* the same strategy as append command - renames conflicting fields to avoid type conflicts.
19+
* Utility class for unifying schemas across multiple RelNodes. Throws an exception when type
20+
* conflicts are detected.
2421
*/
2522
public class SchemaUnifier {
2623

2724
/**
28-
* Builds a unified schema for multiple nodes with type conflict resolution.
25+
* Builds a unified schema for multiple nodes. Throws an exception if type conflicts are detected.
2926
*
3027
* @param nodes List of RelNodes to unify schemas for
3128
* @param context Calcite plan context
3229
* @return List of projected RelNodes with unified schema
30+
* @throws IllegalArgumentException if type conflicts are detected
3331
*/
3432
public static List<RelNode> buildUnifiedSchemaWithConflictResolution(
3533
List<RelNode> nodes, CalcitePlanContext context) {
@@ -41,7 +39,7 @@ public static List<RelNode> buildUnifiedSchemaWithConflictResolution(
4139
return nodes;
4240
}
4341

44-
// Step 1: Build the unified schema by processing all nodes
42+
// Step 1: Build the unified schema by processing all nodes (throws on conflict)
4543
List<SchemaField> unifiedSchema = buildUnifiedSchema(nodes);
4644

4745
// Step 2: Create projections for each node to align with unified schema
@@ -55,47 +53,37 @@ public static List<RelNode> buildUnifiedSchemaWithConflictResolution(
5553
projectedNodes.add(projectedNode);
5654
}
5755

58-
// Step 3: Unify names to handle type conflicts (this creates age0, age1, etc.)
59-
List<String> uniqueNames =
60-
SqlValidatorUtil.uniquify(fieldNames, SqlValidatorUtil.EXPR_SUGGESTER, true);
61-
62-
// Step 4: Re-project with unique names if needed
63-
if (!uniqueNames.equals(fieldNames)) {
64-
List<RelNode> renamedNodes = new ArrayList<>();
65-
for (RelNode node : projectedNodes) {
66-
RelNode renamedNode =
67-
context.relBuilder.push(node).project(context.relBuilder.fields(), uniqueNames).build();
68-
renamedNodes.add(renamedNode);
69-
}
70-
return renamedNodes;
71-
}
72-
7356
return projectedNodes;
7457
}
7558

7659
/**
77-
* Builds a unified schema by merging fields from all nodes. Fields with the same name but
78-
* different types are added as separate entries (which will be renamed during uniquification).
60+
* Builds a unified schema by merging fields from all nodes. Throws an exception if fields with
61+
* the same name have different types.
7962
*
8063
* @param nodes List of RelNodes to merge schemas from
81-
* @return List of SchemaField representing the unified schema (may contain duplicate names)
64+
* @return List of SchemaField representing the unified schema
65+
* @throws IllegalArgumentException if type conflicts are detected
8266
*/
8367
private static List<SchemaField> buildUnifiedSchema(List<RelNode> nodes) {
8468
List<SchemaField> schema = new ArrayList<>();
85-
Map<String, Set<RelDataType>> seenFields = new HashMap<>();
69+
Map<String, RelDataType> seenFields = new HashMap<>();
8670

8771
for (RelNode node : nodes) {
8872
for (RelDataTypeField field : node.getRowType().getFieldList()) {
8973
String fieldName = field.getName();
9074
RelDataType fieldType = field.getType();
9175

92-
// Track which (name, type) combinations we've seen
93-
Set<RelDataType> typesForName = seenFields.computeIfAbsent(fieldName, k -> new HashSet<>());
94-
95-
if (!typesForName.contains(fieldType)) {
96-
// New field or same name with different type - add to schema
76+
RelDataType existingType = seenFields.get(fieldName);
77+
if (existingType == null) {
78+
// New field - add to schema
9779
schema.add(new SchemaField(fieldName, fieldType));
98-
typesForName.add(fieldType);
80+
seenFields.put(fieldName, fieldType);
81+
} else if (!areTypesCompatible(existingType, fieldType)) {
82+
// Same field name but different type - throw exception
83+
throw new IllegalArgumentException(
84+
String.format(
85+
"Unable to process column '%s' due to incompatible types: '%s' and '%s'",
86+
fieldName, existingType.getSqlTypeName(), fieldType.getSqlTypeName()));
9987
}
10088
// If we've seen this exact (name, type) combination, skip it
10189
}
@@ -104,6 +92,10 @@ private static List<SchemaField> buildUnifiedSchema(List<RelNode> nodes) {
10492
return schema;
10593
}
10694

95+
private static boolean areTypesCompatible(RelDataType type1, RelDataType type2) {
96+
return type1.getSqlTypeName() != null && type1.getSqlTypeName().equals(type2.getSqlTypeName());
97+
}
98+
10799
/**
108100
* Builds a projection for a node to align with the unified schema. For each field in the unified
109101
* schema: - If the node has a matching field with the same type, use it - Otherwise, project NULL
@@ -125,8 +117,8 @@ private static List<RexNode> buildProjectionForNode(
125117
RelDataType expectedType = schemaField.getType();
126118
RelDataTypeField nodeField = nodeFieldMap.get(fieldName);
127119

128-
if (nodeField != null && nodeField.getType().equals(expectedType)) {
129-
// Field exists with matching type - use it
120+
if (nodeField != null && areTypesCompatible(nodeField.getType(), expectedType)) {
121+
// Field exists with compatible type - use it
130122
projection.add(context.rexBuilder.makeInputRef(node, nodeField.getIndex()));
131123
} else {
132124
// Field missing or type mismatch - project NULL

docs/category.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"user/ppl/cmd/rare.rst",
4141
"user/ppl/cmd/regex.rst",
4242
"user/ppl/cmd/rename.rst",
43+
"user/ppl/cmd/multisearch.rst",
4344
"user/ppl/cmd/replace.rst",
4445
"user/ppl/cmd/rex.rst",
4546
"user/ppl/cmd/search.rst",

docs/user/ppl/cmd/append.rst

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ append <sub-search>
2424

2525
* sub-search: mandatory. Executes PPL commands as a secondary search.
2626

27+
Limitations
28+
===========
29+
30+
* **Schema Compatibility**: When fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with ``eval`` or using ``fields`` to select non-conflicting columns).
31+
2732
Example 1: Append rows from a count aggregation to existing search result
2833
===============================================================
2934

@@ -64,23 +69,3 @@ PPL query::
6469
| 101 | M | null |
6570
+-----+--------+-------+
6671

67-
Example 3: Append rows with column type conflict
68-
=============================================
69-
70-
This example shows how column type conflicts are handled when appending results. Same name columns with different types will generate two different columns in appended result.
71-
72-
PPL query::
73-
74-
os> source=accounts | stats sum(age) as sum by gender, state | sort -sum | head 5 | append [ source=accounts | stats sum(age) as sum by gender | eval sum = cast(sum as double) ];
75-
fetched rows / total rows = 6/6
76-
+------+--------+-------+-------+
77-
| sum | gender | state | sum0 |
78-
|------+--------+-------+-------|
79-
| 36 | M | TN | null |
80-
| 33 | M | MD | null |
81-
| 32 | M | IL | null |
82-
| 28 | F | VA | null |
83-
| null | F | null | 28.0 |
84-
| null | M | null | 101.0 |
85-
+------+--------+-------+-------+
86-

docs/user/ppl/cmd/multisearch.rst

Lines changed: 23 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,6 @@ Description
3030
* **A/B Testing Analysis**: Combine results from different test groups for comparison
3131
* **Time-series Data Merging**: Interleave events from multiple sources based on timestamps
3232

33-
Version
34-
=======
35-
3.3.0
36-
3733
Syntax
3834
======
3935
| multisearch <subsearch1> <subsearch2> <subsearch3> ...
@@ -59,7 +55,7 @@ Limitations
5955
===========
6056

6157
* **Minimum Subsearches**: At least two subsearches must be specified
62-
* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed with a numeric suffix (e.g., ``age`` becomes ``age0``, ``age1``, etc.). This ensures all data is preserved while maintaining schema consistency.
58+
* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type across all subsearches, or use different field names (e.g., by renaming with ``eval`` or using ``fields`` to select non-conflicting columns).
6359

6460
Usage
6561
=====
@@ -84,8 +80,8 @@ PPL query::
8480
|-----------+-----+-----------|
8581
| Nanette | 28 | young |
8682
| Amber | 32 | adult |
83+
| Dale | 33 | adult |
8784
| Hattie | 36 | adult |
88-
| Dale | 37 | adult |
8985
+-----------+-----+-----------+
9086

9187
Example 2: Success Rate Pattern
@@ -97,14 +93,14 @@ PPL query::
9793

9894
os> | multisearch [search source=accounts | where balance > 20000 | eval query_type = "high_balance" | fields firstname, balance, query_type] [search source=accounts | where balance > 0 AND balance <= 20000 | eval query_type = "regular" | fields firstname, balance, query_type] | sort balance desc;
9995
fetched rows / total rows = 4/4
100-
+-----------+---------+-------------+
101-
| firstname | balance | query_type |
102-
|-----------+---------+-------------|
103-
| Amber | 39225 | high_balance|
104-
| Nanette | 32838 | high_balance|
105-
| Hattie | 5686 | regular |
106-
| Dale | 4180 | regular |
107-
+-----------+---------+-------------+
96+
+-----------+---------+--------------+
97+
| firstname | balance | query_type |
98+
|-----------+---------+--------------|
99+
| Amber | 39225 | high_balance |
100+
| Nanette | 32838 | high_balance |
101+
| Hattie | 5686 | regular |
102+
| Dale | 4180 | regular |
103+
+-----------+---------+--------------+
108104

109105
Example 3: Timestamp Interleaving
110106
==================================
@@ -113,37 +109,19 @@ Combine time-series data from multiple sources with automatic timestamp-based or
113109

114110
PPL query::
115111

116-
os> | multisearch [search source=time_data | where category IN ("A", "B")] [search source=time_data2 | where category IN ("E", "F")] | head 5;
112+
os> | multisearch [search source=time_data | where category IN ("A", "B")] [search source=time_data2 | where category IN ("E", "F")] | fields @timestamp, category, value, timestamp | head 5;
117113
fetched rows / total rows = 5/5
118-
+-------+---------------------+----------+-------+---------------------+
119-
| index | @timestamp | category | value | timestamp |
120-
|-------+---------------------+----------+-------+---------------------|
121-
| null | 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 |
122-
| null | 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 |
123-
| null | 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 |
124-
| null | 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 |
125-
| null | 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 |
126-
+-------+---------------------+----------+-------+---------------------+
127-
128-
Example 4: Handling Empty Results
129-
==================================
130-
131-
Multisearch gracefully handles cases where some subsearches return no results.
132-
133-
PPL query::
134-
135-
os> | multisearch [search source=accounts | where age > 25 | fields firstname, age] [search source=accounts | where age > 200 | eval impossible = "yes" | fields firstname, age, impossible] | head 5;
136-
fetched rows / total rows = 4/4
137-
+-----------+-----+------------+
138-
| firstname | age | impossible |
139-
|-----------+-----+------------|
140-
| Nanette | 28 | null |
141-
| Amber | 32 | null |
142-
| Hattie | 36 | null |
143-
| Dale | 37 | null |
144-
+-----------+-----+------------+
145-
146-
Example 5: Type Compatibility - Missing Fields
114+
+---------------------+----------+-------+---------------------+
115+
| @timestamp | category | value | timestamp |
116+
|---------------------+----------+-------+---------------------|
117+
| 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 |
118+
| 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 |
119+
| 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 |
120+
| 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 |
121+
| 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 |
122+
+---------------------+----------+-------+---------------------+
123+
124+
Example 4: Type Compatibility - Missing Fields
147125
=================================================
148126

149127
Demonstrate how missing fields are handled with NULL insertion.
@@ -157,26 +135,7 @@ PPL query::
157135
|-----------+-----+------------|
158136
| Nanette | 28 | yes |
159137
| Amber | 32 | null |
138+
| Dale | 33 | null |
160139
| Hattie | 36 | null |
161-
| Dale | 37 | null |
162140
+-----------+-----+------------+
163141

164-
Example 6: Type Conflict Resolution - Automatic Renaming
165-
===========================================================
166-
167-
When the same field name has incompatible types across subsearches, the system automatically renames conflicting fields with numeric suffixes.
168-
169-
PPL query::
170-
171-
os> | multisearch [search source=accounts | fields firstname, age, balance | head 2] [search source=locations | fields description, age, place_id | head 2];
172-
fetched rows / total rows = 4/4
173-
+-----------+-----+---------+------------------+------+----------+
174-
| firstname | age | balance | description | age0 | place_id |
175-
|-----------+-----+---------+------------------+------+----------|
176-
| Amber | 32 | 39225 | null | null | null |
177-
| Hattie | 36 | 5686 | null | null | null |
178-
| null | null| null | Central Park | old | 1001 |
179-
| null | null| null | Times Square | modern| 1002 |
180-
+-----------+-----+---------+------------------+------+----------+
181-
182-
In this example, the ``age`` field has type ``bigint`` in accounts but type ``string`` in locations. The system keeps the first occurrence as ``age`` (bigint) and renames the second occurrence to ``age0`` (string), preserving all data while avoiding type conflicts.

0 commit comments

Comments
 (0)