Skip to content

Commit eff717a

Browse files
authored
Support OpenSearch alias field type (#1032)
Signed-off-by: Peng Huo <[email protected]>
1 parent 785d02b commit eff717a

File tree

6 files changed

+161
-8
lines changed

6 files changed

+161
-8
lines changed

Diff for: docs/opensearch-table.md

+31
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,37 @@ Using a wildcard index name:
4747
val df = spark.sql("SELECT * FROM dev.default.`my_index*`")
4848
df.show()
4949
```
50+
## Data Types
51+
The following table defines the data type mapping between OpenSearch index field type and Spark data type.
52+
53+
| **OpenSearch FieldType** | **SparkDataType** |
54+
|--------------------------|-----------------------------------|
55+
| boolean | BooleanType |
56+
| long | LongType |
57+
| integer | IntegerType |
58+
| short | ShortType |
59+
| byte | ByteType |
60+
| double | DoubleType |
61+
| float | FloatType |
62+
| date(Timestamp) | TimestampType |
63+
| date(Date) | DateType |
64+
| keyword | StringType, VarcharType, CharType |
65+
| text | StringType(meta(osType)=text) |
66+
| object | StructType |
67+
| alias | Inherits referenced field type |
68+
69+
* OpenSearch data type date is mapped to Spark data type based on the format:
70+
* Map to DateType if format = strict_date (we also support format = date; this may change in the future)
71+
* Map to TimestampType if format = strict_date_optional_time_nanos (we also support format =
72+
strict_date_optional_time | epoch_millis; this may change in the future)
73+
* Spark data types VarcharType(length) and CharType(length) are both currently mapped to Flint data
74+
type *keyword*, dropping their length property. On the other hand, Flint data type *keyword* only
75+
maps to StringType.
76+
* Spark data type MapType is mapped to an empty OpenSearch object. The inner fields then rely on
77+
dynamic mapping. On the other hand, Flint data type *object* only maps to StructType.
78+
* Spark data type DecimalType is mapped to an OpenSearch double. On the other hand, Flint data type
79+
*double* only maps to DoubleType.
80+
* OpenSearch alias fields allow alternative names for existing fields in the schema without duplicating data. They inherit the data type and nullability of the referenced field and resolve dynamically to the primary field in queries.
5081

5182
Join two indices
5283
```scala

Diff for: flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala

+30-4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ object FlintDataType {
3030
"dateFormat" -> DateFormatter.defaultPattern,
3131
"timestampFormat" -> STRICT_DATE_OPTIONAL_TIME_FORMATTER_WITH_NANOS)
3232

33+
val METADATA_ALIAS_PATH_NAME = "aliasPath"
34+
3335
/**
3436
* parse Flint metadata and extract properties to StructType.
3537
*/
@@ -39,14 +41,38 @@ object FlintDataType {
3941

4042
def deserializeJValue(json: JValue): StructType = {
4143
val properties = (json \ "properties").extract[Map[String, JValue]]
42-
val fields = properties.map { case (fieldName, fieldProperties) =>
43-
deserializeFiled(fieldName, fieldProperties)
44+
val (aliasProps, normalProps) = properties.partition { case (_, fieldProperties) =>
45+
(fieldProperties \ "type") match {
46+
case JString("alias") => true
47+
case _ => false
48+
}
4449
}
4550

46-
StructType(fields.toSeq)
51+
val fields: Seq[StructField] = normalProps.map { case (fieldName, fieldProperties) =>
52+
deserializeField(fieldName, fieldProperties)
53+
}.toSeq
54+
55+
val normalFieldMap: Map[String, StructField] = fields.map(f => f.name -> f).toMap
56+
57+
val aliasFields: Seq[StructField] = aliasProps.map { case (fieldName, fieldProperties) =>
58+
val aliasPath = (fieldProperties \ "path").extract[String]
59+
if (!normalFieldMap.contains(aliasPath)) {
60+
throw new IllegalStateException(
61+
s"Alias field [$fieldName] references undefined field [$aliasPath]")
62+
}
63+
val metadataBuilder = new MetadataBuilder()
64+
metadataBuilder.putString(METADATA_ALIAS_PATH_NAME, aliasPath)
65+
DataTypes.createStructField(
66+
fieldName,
67+
normalFieldMap(aliasPath).dataType,
68+
true,
69+
metadataBuilder.build())
70+
}.toSeq
71+
72+
StructType(fields ++ aliasFields)
4773
}
4874

49-
def deserializeFiled(fieldName: String, fieldProperties: JValue): StructField = {
75+
def deserializeField(fieldName: String, fieldProperties: JValue): StructField = {
5076
val metadataBuilder = new MetadataBuilder()
5177
val dataType = fieldProperties \ "type" match {
5278
// boolean

Diff for: flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/json/FlintJacksonParser.scala

+25-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.json.{JacksonUtils, JsonFilters, JSONOption
2121
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, DateFormatter, DateTimeUtils, GenericArrayData, IntervalUtils, MapData, PartialResultException, RebaseDateTime, TimestampFormatter}
2222
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
2323
import org.apache.spark.sql.errors.QueryExecutionErrors
24+
import org.apache.spark.sql.flint.datatype.FlintDataType
2425
import org.apache.spark.sql.internal.SQLConf
2526
import org.apache.spark.sql.sources.Filter
2627
import org.apache.spark.sql.types._
@@ -448,13 +449,33 @@ class FlintJacksonParser(
448449
var badRecordException: Option[Throwable] = None
449450
var skipRow = false
450451

452+
// Build mapping from JSON key to sequence of schema field indices.
453+
val fieldMapping: Map[String, Seq[Int]] = {
454+
schema.fields.zipWithIndex.foldLeft(Map.empty[String, Seq[Int]]) {
455+
case (acc, (field, idx)) =>
456+
val jsonKey = if (field.metadata.contains(FlintDataType.METADATA_ALIAS_PATH_NAME)) {
457+
field.metadata.getString(FlintDataType.METADATA_ALIAS_PATH_NAME)
458+
} else {
459+
field.name
460+
}
461+
acc.updated(jsonKey, acc.getOrElse(jsonKey, Seq.empty[Int]) :+ idx)
462+
}
463+
}
464+
451465
structFilters.reset()
452466
while (!skipRow && nextUntil(parser, JsonToken.END_OBJECT)) {
453-
schema.getFieldIndex(parser.getCurrentName) match {
454-
case Some(index) =>
467+
fieldMapping.get(parser.getCurrentName) match {
468+
case Some(indices) =>
455469
try {
456-
row.update(index, fieldConverters(index).apply(parser))
457-
skipRow = structFilters.skipRow(row, index)
470+
// All fields in indices are of the same type.
471+
val fieldValue = fieldConverters(indices.head).apply(parser)
472+
// Assign the parsed value to all schema fields mapped to this JSON key.
473+
indices.foreach { idx =>
474+
row.update(idx, fieldValue)
475+
if (structFilters.skipRow(row, idx)) {
476+
skipRow = true
477+
}
478+
}
458479
} catch {
459480
case e: SparkUpgradeException => throw e
460481
case NonFatal(e) if isRoot =>

Diff for: flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala

+41
Original file line numberDiff line numberDiff line change
@@ -279,4 +279,45 @@ class FlintDataTypeSuite extends FlintSuite with Matchers {
279279
val data: JValue = JsonMethods.parse(json)
280280
JsonMethods.compact(JsonMethods.render(data))
281281
}
282+
283+
test("alias field deserialize") {
284+
val flintDataType =
285+
"""{
286+
| "properties": {
287+
| "distance": {
288+
| "type": "long"
289+
| },
290+
| "route_length_miles": {
291+
| "type": "alias",
292+
| "path": "distance"
293+
| },
294+
| "transit_mode": {
295+
| "type": "keyword"
296+
| }
297+
| }
298+
|}""".stripMargin
299+
300+
val expectedStructType = StructType(
301+
Seq(
302+
StructField("distance", LongType, true),
303+
StructField("transit_mode", StringType, true),
304+
StructField(
305+
"route_length_miles",
306+
LongType,
307+
true,
308+
new MetadataBuilder().putString("aliasPath", "distance").build())))
309+
310+
val deserialized = FlintDataType.deserialize(flintDataType)
311+
312+
deserialized.fields should have length (3)
313+
deserialized.fields(0) shouldEqual expectedStructType.fields(0)
314+
deserialized.fields(1) shouldEqual expectedStructType.fields(1)
315+
316+
val aliasField = deserialized.fields(2)
317+
aliasField.name shouldEqual "route_length_miles"
318+
aliasField.dataType shouldEqual LongType
319+
aliasField.metadata.contains("aliasPath") shouldBe true
320+
aliasField.metadata.getString("aliasPath") shouldEqual "distance"
321+
}
322+
282323
}

Diff for: integ-test/src/integration/scala/org/apache/spark/opensearch/table/OpenSearchTableQueryITSuite.scala

+18
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,22 @@ class OpenSearchTableQueryITSuite extends OpenSearchCatalogSuite with FlintPPLSu
4747
}
4848
}
4949
}
50+
51+
test("Query index with alias data type") {
52+
val index1 = "t0001"
53+
withIndexName(index1) {
54+
indexWithAlias(index1)
55+
// select original field and alias field
56+
var df = spark.sql(s"""SELECT id, alias FROM ${catalogName}.default.$index1""")
57+
checkAnswer(df, Seq(Row(1, 1), Row(2, 2)))
58+
59+
// filter on alias field
60+
df = spark.sql(s"""SELECT id, alias FROM ${catalogName}.default.$index1 WHERE alias=1""")
61+
checkAnswer(df, Row(1, 1))
62+
63+
// filter on original field
64+
df = spark.sql(s"""SELECT id, alias FROM ${catalogName}.default.$index1 WHERE id=1""")
65+
checkAnswer(df, Row(1, 1))
66+
}
67+
}
5068
}

Diff for: integ-test/src/integration/scala/org/opensearch/flint/OpenSearchSuite.scala

+16
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,22 @@ trait OpenSearchSuite extends BeforeAndAfterAll {
123123
index(indexName, oneNodeSetting, mappings, docs)
124124
}
125125

126+
def indexWithAlias(indexName: String): Unit = {
127+
val mappings = """{
128+
| "properties": {
129+
| "id": {
130+
| "type": "integer"
131+
| },
132+
| "alias": {
133+
| "type": "alias",
134+
| "path": "id"
135+
| }
136+
| }
137+
|}""".stripMargin
138+
val docs = Seq("""{"id": 1}""", """{"id": 2}""")
139+
index(indexName, oneNodeSetting, mappings, docs)
140+
}
141+
126142
def index(index: String, settings: String, mappings: String, docs: Seq[String]): Unit = {
127143
openSearchClient.indices.create(
128144
new CreateIndexRequest(index)

0 commit comments

Comments
 (0)