Closed
Description
I'm experiencing an issue with the Hudi configuration for the Parquet compression codec. Despite setting "hoodie.parquet.compression.codec": "GZIP" in my Hudi write options, the output files in my data lake do not appear to be compressed; I only see standard Parquet files.
Configuration:
Hudi Version: 1.0.0-beta2
Spark Version: 3.4
Java Version: OpenJDK 11
hudi_options = {
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.table.type': table_type,
    'hoodie.datasource.write.table.name': table_name,
    'hoodie.datasource.write.operation': method,
    'hoodie.datasource.write.recordkey.field': recordkey,
    'hoodie.datasource.write.precombine.field': precombine,
    'hoodie.datasource.write.partitionpath.field': partition_fields,
    'hoodie.parquet.compression.codec': 'GZIP',
    'parquet.compression.codec': 'GZIP'
}
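Note that a GZIP-compressed Parquet file keeps the regular .parquet extension, so a directory listing alone does not reveal the codec; it is recorded per column chunk in the file footer. A minimal way to check it, assuming pyarrow is installed and using the local table path produced by the test code below:
import glob
import pyarrow.parquet as pq

# Local path written by write_to_hudi() for db_name="default", table_name="messages"
table_path = "/Users/soumilshah/IdeaProjects/SparkProject/tem/database=default/table_name=messages"

for f in glob.glob(f"{table_path}/**/*.parquet", recursive=True):
    # Read only the footer metadata and print the codec of the first column chunk
    meta = pq.ParquetFile(f).metadata
    if meta.num_row_groups > 0:
        print(f, meta.row_group(0).column(0).compression)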
Test Code
try:
    import os
    import sys
    import uuid
    import random
    import pyspark
    import pandas as pd  # Pandas for pretty printing
    from pyspark.sql import SparkSession
    from pyspark import SparkConf, SparkContext
    from faker import Faker
    from datetime import datetime

    print("Imports loaded")
except Exception as e:
    print("error", e)
HUDI_VERSION = '1.0.0-beta2'
SPARK_VERSION = '3.4'
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11"
SUBMIT_ARGS = f"--packages org.apache.hudi:hudi-spark{SPARK_VERSION}-bundle_2.12:{HUDI_VERSION} pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
os.environ['PYSPARK_PYTHON'] = sys.executable
# Spark session
spark = SparkSession.builder \
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer') \
    .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension') \
    .config('className', 'org.apache.hudi') \
    .config('spark.sql.hive.convertMetastoreParquet', 'false') \
    .getOrCreate()
def write_to_hudi(spark_df,
                  table_name,
                  db_name,
                  method='upsert',
                  table_type='COPY_ON_WRITE',
                  recordkey='',
                  precombine='',
                  partition_fields='',
                  index_type='BLOOM',
                  curr_region='us-east-1'):
    path = f"file:///Users/soumilshah/IdeaProjects/SparkProject/tem/database={db_name}/table_name={table_name}"

    hudi_options = {
        'hoodie.table.name': table_name,
        'hoodie.datasource.write.table.type': table_type,
        'hoodie.datasource.write.table.name': table_name,
        'hoodie.datasource.write.operation': method,
        'hoodie.datasource.write.recordkey.field': recordkey,
        'hoodie.datasource.write.precombine.field': precombine,
        'hoodie.datasource.write.partitionpath.field': partition_fields,
        'hoodie.clustering.plan.strategy.target.file.max.bytes': '1073741824',
        'hoodie.clustering.plan.strategy.small.file.limit': '629145600',
        'hoodie.clustering.execution.strategy.class': 'org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy',
        'hoodie.clean.automatic': 'true',
        'hoodie.parquet.max.file.size': 512 * 1024 * 1024,
        'hoodie.parquet.small.file.limit': 104857600,
        'hoodie.parquet.compression.codec': 'GZIP',
        'parquet.compression.codec': 'GZIP'
    }

    spark_df.write.format("hudi") \
        .options(**hudi_options) \
        .mode("append") \
        .save(path)
from pyspark.sql.types import StructType, StructField, StringType, LongType
schema = StructType([
    StructField("id", StringType(), True),
    StructField("message", StringType(), True)
])
# Loop to generate data and write to Hudi
for i in range(1, 5):  # Number of iterations
    print("Epoch", str(i))

    # Generate an epoch timestamp (not used further in this test)
    epoch_time = int(datetime.now().timestamp())

    # Create the data for this batch
    updated_data = [(str(i), "Batch : {} ".format(i))]

    # Create the DataFrame with the new data
    df = spark.createDataFrame(updated_data, schema)

    # Write to Hudi
    write_to_hudi(
        spark_df=df,
        method="upsert",
        db_name="default",
        table_name="messages",
        recordkey="id",
        precombine="message"
    )
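To confirm the rows landed in the table, the path can be read back with the Hudi datasource. A quick sketch, assuming the same local path that write_to_hudi builds for db_name="default" and table_name="messages":
base_path = "file:///Users/soumilshah/IdeaProjects/SparkProject/tem/database=default/table_name=messages"

# Read the Hudi table back and show the written records
df_read = spark.read.format("hudi").load(base_path)
df_read.select("id", "message").show(truncate=False)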
Output