feat: Support table format: Iceberg, Delta, and Hudi #5650
Status: Open. HaoXuAI wants to merge 10 commits into master from support-table-format.
Commits (10):
8ae5ed5  add support for table format such as Iceberg, Delta, Hudi etc. (HaoXuAI)
456b97f  linting (HaoXuAI)
afa1444  linting (HaoXuAI)
7a34eaf  add tests (HaoXuAI)
99f85b1  fix tests (HaoXuAI)
eb15424  fix tests (HaoXuAI)
05e3a65  Merge branch 'master' into support-table-format (HaoXuAI)
b11c083  linting (HaoXuAI)
021f164  Merge branch 'master' into support-table-format (HaoXuAI)
48b166b  Merge branch 'master' into support-table-format (HaoXuAI)
Diff, from all commits. The modified file is the Spark offline store data source module (evidently spark_source.py):

```diff
@@ -1,3 +1,4 @@
+import json
 import logging
 import traceback
 import warnings
```
```diff
@@ -14,17 +15,17 @@
 )
 from feast.repo_config import RepoConfig
 from feast.saved_dataset import SavedDatasetStorage
+from feast.table_format import TableFormat, table_format_from_dict
 from feast.type_map import spark_to_feast_value_type
 from feast.value_type import ValueType

 logger = logging.getLogger(__name__)


-class SparkSourceFormat(Enum):
+class SparkFileSourceFormat(Enum):
     csv = "csv"
     json = "json"
     parquet = "parquet"
     delta = "delta"
     avro = "avro"
```
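Note the rename: downstream code importing SparkSourceFormat must switch to SparkFileSourceFormat. A quick sanity check of the allowed file formats (import path taken from the Feast source tree):

```python
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
    SparkFileSourceFormat,
)

# The values backing SparkOptions.allowed_formats after this change.
print([f.value for f in SparkFileSourceFormat])
# ['csv', 'json', 'parquet', 'delta', 'avro']
```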
```diff
@@ -42,6 +43,7 @@ def __init__(
         query: Optional[str] = None,
         path: Optional[str] = None,
         file_format: Optional[str] = None,
+        table_format: Optional[TableFormat] = None,
         created_timestamp_column: Optional[str] = None,
         field_mapping: Optional[Dict[str, str]] = None,
         description: Optional[str] = "",
```
```diff
@@ -58,7 +60,9 @@ def __init__(
             table: The name of a Spark table.
             query: The query to be executed in Spark.
             path: The path to file data.
-            file_format: The format of the file data.
+            file_format: The underlying file format (parquet, avro, csv, json).
+            table_format: The table metadata format (iceberg, delta, hudi, etc.).
+                Optional and separate from file_format.
             created_timestamp_column: Timestamp column indicating when the row
                 was created, used for deduplicating rows.
             field_mapping: A dictionary mapping of column names in this data
```

Inline review comment on the file_format line: "why not consolidate now?"
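For context, a sketch of how a source might be declared once this lands. The dict passed to table_format_from_dict is an assumed shape ({"format_type": ..., "properties": ...}); the real schema lives in the new feast.table_format module, which this page does not show:

```python
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
    SparkSource,
)
from feast.table_format import table_format_from_dict

# Assumed dict shape -- see the note above.
iceberg = table_format_from_dict(
    {"format_type": "iceberg", "properties": {"snapshot-id": "1234567890"}}
)

driver_stats = SparkSource(
    name="driver_stats",
    path="my_catalog.db.driver_stats",  # for Iceberg, "path" acts as a table identifier
    table_format=iceberg,               # file_format can now be omitted
    timestamp_field="event_timestamp",
)
```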
```diff
@@ -70,7 +74,7 @@ def __init__(
             timestamp_field: Event timestamp field used for point-in-time joins of
                 feature values.
             date_partition_column: The column to partition the data on for faster
-                retrieval. This is useful for large tables and will limit the number ofi
+                retrieval. This is useful for large tables and will limit the number of
         """
         # If no name, use the table as the default name.
         if name is None and table is None:
```
```diff
@@ -102,6 +106,7 @@ def __init__(
             path=path,
             file_format=file_format,
             date_partition_column_format=date_partition_column_format,
+            table_format=table_format,
         )

     @property
```
```diff
@@ -132,6 +137,13 @@ def file_format(self):
         """
         return self.spark_options.file_format

+    @property
+    def table_format(self):
+        """
+        Returns the table format of this feature data source.
+        """
+        return self.spark_options.table_format
+
     @property
     def date_partition_column_format(self):
         """
```
```diff
@@ -151,6 +163,7 @@ def from_proto(data_source: DataSourceProto) -> Any:
             query=spark_options.query,
             path=spark_options.path,
             file_format=spark_options.file_format,
+            table_format=spark_options.table_format,
             date_partition_column_format=spark_options.date_partition_column_format,
             date_partition_column=data_source.date_partition_column,
             timestamp_field=data_source.timestamp_field,
```
```diff
@@ -219,7 +232,7 @@ def get_table_query_string(self) -> str:
         if spark_session is None:
             raise AssertionError("Could not find an active spark session.")
         try:
-            df = spark_session.read.format(self.file_format).load(self.path)
+            df = self._load_dataframe_from_path(spark_session)
         except Exception:
             logger.exception(
                 "Spark read of file source failed.\n" + traceback.format_exc()
```
```diff
@@ -230,6 +243,24 @@ def get_table_query_string(self) -> str:

         return f"`{tmp_table_name}`"

+    def _load_dataframe_from_path(self, spark_session):
+        """Load DataFrame from path, considering both file format and table format."""
+
+        if self.table_format is None:
+            # No table format specified, use standard file reading with file_format
+            return spark_session.read.format(self.file_format).load(self.path)
+
+        # Build reader with table format and options
+        reader = spark_session.read.format(self.table_format.format_type.value)
+
+        # Add table format specific options
+        for key, value in self.table_format.properties.items():
+            reader = reader.option(key, value)
+
+        # For catalog-based table formats like Iceberg, the path is actually a table name
+        # For file-based formats, it's still a file path
+        return reader.load(self.path)
+
     def __eq__(self, other):
         base_eq = super().__eq__(other)
         if not base_eq:
```
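Written out by hand, the helper above reduces to ordinary PySpark reads; a sketch with illustrative paths and option values:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Case 1 -- no table_format: plain file read driven by file_format.
df = spark.read.format("parquet").load("s3://bucket/driver_stats/")

# Case 2 -- table_format set (e.g. Iceberg): the reader format comes from
# format_type, each entry in properties becomes a reader option, and the
# configured "path" may actually be a catalog table identifier.
df = (
    spark.read.format("iceberg")
    .option("snapshot-id", "1234567890")
    .load("my_catalog.db.driver_stats")
)
```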
```diff
@@ -245,7 +276,7 @@ def __hash__(self):


 class SparkOptions:
-    allowed_formats = [format.value for format in SparkSourceFormat]
+    allowed_formats = [format.value for format in SparkFileSourceFormat]

     def __init__(
         self,
```
```diff
@@ -254,6 +285,7 @@ def __init__(
         path: Optional[str],
         file_format: Optional[str],
         date_partition_column_format: Optional[str] = "%Y-%m-%d",
+        table_format: Optional[TableFormat] = None,
     ):
         # Check that only one of the ways to load a spark dataframe can be used. We have
         # to treat empty string and null the same due to proto (de)serialization.
```
```diff
@@ -262,11 +294,14 @@ def __init__(
                 "Exactly one of params(table, query, path) must be specified."
             )
         if path:
-            if not file_format:
+            # If table_format is specified, file_format is optional (table format determines the reader)
+            # If no table_format, file_format is required for basic file reading
+            if not table_format and not file_format:
                 raise ValueError(
-                    "If 'path' is specified, then 'file_format' is required."
+                    "If 'path' is specified without 'table_format', then 'file_format' is required."
                 )
-            if file_format not in self.allowed_formats:
+            # Only validate file_format if it's provided (it's optional with table_format)
+            if file_format and file_format not in self.allowed_formats:
                 raise ValueError(
                     f"'file_format' should be one of {self.allowed_formats}"
                 )
```
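The resulting validation matrix, as a runnable sketch (iceberg built with the same assumed dict shape as earlier):

```python
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
    SparkOptions,
)
from feast.table_format import table_format_from_dict

iceberg = table_format_from_dict({"format_type": "iceberg", "properties": {}})

# OK (pre-existing behavior): path plus file_format.
SparkOptions(table=None, query=None, path="s3://bucket/data/", file_format="parquet")

# OK after this PR: path plus table_format, file_format omitted.
SparkOptions(
    table=None, query=None, path="db.tbl", file_format=None, table_format=iceberg
)

# Still rejected: path with neither file_format nor table_format.
try:
    SparkOptions(table=None, query=None, path="s3://bucket/data/", file_format=None)
except ValueError as err:
    print(err)  # If 'path' is specified without 'table_format', then 'file_format' is required.
```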
```diff
@@ -276,6 +311,7 @@ def __init__(
         self._path = path
         self._file_format = file_format
         self._date_partition_column_format = date_partition_column_format
+        self._table_format = table_format

     @property
     def table(self):
```
```diff
@@ -317,6 +353,14 @@ def date_partition_column_format(self):
     def date_partition_column_format(self, date_partition_column_format):
         self._date_partition_column_format = date_partition_column_format

+    @property
+    def table_format(self):
+        return self._table_format
+
+    @table_format.setter
+    def table_format(self, table_format):
+        self._table_format = table_format
+
     @classmethod
     def from_proto(cls, spark_options_proto: DataSourceProto.SparkOptions):
         """
```
```diff
@@ -326,12 +370,20 @@ def from_proto(cls, spark_options_proto: DataSourceProto.SparkOptions):
         Returns:
             Returns a SparkOptions object based on the spark_options protobuf
         """
+        # Parse table_format if present
+        table_format = None
+        if spark_options_proto.table_format:
+            table_format = table_format_from_dict(
+                json.loads(spark_options_proto.table_format)
+            )
+
         spark_options = cls(
             table=spark_options_proto.table,
             query=spark_options_proto.query,
             path=spark_options_proto.path,
             file_format=spark_options_proto.file_format,
             date_partition_column_format=spark_options_proto.date_partition_column_format,
+            table_format=table_format,
         )

         return spark_options
```
```diff
@@ -348,6 +400,9 @@ def to_proto(self) -> DataSourceProto.SparkOptions:
             path=self.path,
             file_format=self.file_format,
             date_partition_column_format=self.date_partition_column_format,
+            table_format=json.dumps(self.table_format.to_dict())
+            if self.table_format
+            else "",
         )

         return spark_options_proto
```
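Until the dedicated proto exists (see the TODO review comment at the bottom of this page), a TableFormat crosses the wire as a JSON string, "" when unset. A round-trip sketch, again under the assumed dict shape and assuming to_dict/table_format_from_dict are inverses:

```python
import json

from feast.table_format import table_format_from_dict

tf = table_format_from_dict(
    {"format_type": "iceberg", "properties": {"snapshot-id": "1234567890"}}
)

# to_proto() side: flatten to a JSON string for DataSourceProto.SparkOptions.
wire = json.dumps(tf.to_dict())

# from_proto() side: parse the string back into a TableFormat.
restored = table_format_from_dict(json.loads(wire))
assert restored.to_dict() == tf.to_dict()
```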
```diff
@@ -364,12 +419,14 @@ def __init__(
         query: Optional[str] = None,
         path: Optional[str] = None,
         file_format: Optional[str] = None,
+        table_format: Optional[TableFormat] = None,
     ):
         self.spark_options = SparkOptions(
             table=table,
             query=query,
             path=path,
             file_format=file_format,
+            table_format=table_format,
         )

     @staticmethod
```
```diff
@@ -380,6 +437,7 @@ def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage:
             query=spark_options.query,
             path=spark_options.path,
             file_format=spark_options.file_format,
+            table_format=spark_options.table_format,
         )

     def to_proto(self) -> SavedDatasetStorageProto:
```
```diff
@@ -391,4 +449,5 @@ def to_data_source(self) -> DataSource:
             query=self.spark_options.query,
             path=self.spark_options.path,
             file_format=self.spark_options.file_format,
+            table_format=self.spark_options.table_format,
         )
```
Review comment: TODO, create TableFormat proto, consolidate with FileFormat proto