     ),
 )
 
+gemma3_12b_32768_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
+gemma3_12b_32768_2x_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-2x-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": 1,
+            "ici_fsdp_transpose_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
+gemma3_12b_32768_4x_v6e256 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="gemma3-12b-32768-4x-v6e256",
+        model_type="gemma3-12b",
+        tuning_params={
+            "per_device_batch_size": 1,
+            "num_vocab_tiling": 16,
+            "ici_fsdp_parallelism": 1,
+            "ici_fsdp_transpose_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "device",
+            "query_proj": "remat",
+            "key_proj": "remat",
+            "value_proj": "remat",
+            "max_target_length": 32768,
+            "attention": "flash",
+            "gcs_metrics": True,
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "reuse_example_batch": 1,
+            "enable_checkpointing": False,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 2,
+            "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"),
+            "sa_block_q": 1024,
+            "sa_block_kv": 1024,
+            "sa_block_kv_compute": 1024,
+            "sa_block_q_dkv": 512,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 512,
+            "sa_block_q_dq": 1024,
+            "sa_block_kv_dq": 1024,
+        },
+        xla_flags=(xla_flags_library.CUSTOM_VMEM_LIMIT_FLAG(vmem_limit=122880)),
+    ),
+)
+
 # Config for Llama3.1 70B model with 131072 max target length aka context length
 llama3_1_70b_131072 = _add_to_model_dictionary(
     trillium_model_dict,