Commit 460542b

Merge branch 'master' into fix/preprocessing-documentation
2 parents f24e859 + 2035f3f commit 460542b

11 files changed: +164, -36 lines


docs/benchmarks/image_classification/resnet50.md

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,8 @@ hide:

{{ mlperf_inference_implementation_readme (4, "resnet50", "nvidia") }}

+<!-->
+
=== "Intel"
## Intel MLPerf Implementation

@@ -31,3 +33,5 @@ hide:
## MLPerf Modular Implementation in C++

{{ mlperf_inference_implementation_readme (4, "resnet50", "cpp") }}
+
+-->

docs/benchmarks/language/bert.md

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@ hide:

{{ mlperf_inference_implementation_readme (4, "bert-99.9", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation

@@ -32,3 +33,4 @@ hide:
{{ mlperf_inference_implementation_readme (4, "bert-99", "qualcomm") }}

{{ mlperf_inference_implementation_readme (4, "bert-99.9", "qualcomm") }}
+-->s

docs/benchmarks/language/gpt-j.md

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@ hide:

{{ mlperf_inference_implementation_readme (4, "gptj-99.9", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation

@@ -35,3 +36,4 @@ hide:

{{ mlperf_inference_implementation_readme (4, "gptj-99", "qualcomm") }}

+-->

docs/benchmarks/language/llama2-70b.md

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@ hide:

{{ mlperf_inference_implementation_readme (4, "llama2-70b-99.9", "nvidia") }}

+<!--
=== "Neural Magic"
## Neural Magic MLPerf Implementation

@@ -32,3 +33,4 @@ hide:
{{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "amd") }}

{{ mlperf_inference_implementation_readme (4, "llama2-70b-99.9", "amd") }}
+-->
Lines changed: 105 additions & 0 deletions (new file)

---
hide:
  - toc
---

# Text Summarization with Llama2-70b for Student Cluster Competition 2025

## Introduction
This guide is designed for the [Student Cluster Competition 2025](https://sc25.supercomputing.org/students/student-cluster-competition/) to walk participants through running and optimizing the [MLPerf Inference Benchmark](https://arxiv.org/abs/1911.02549) using [Llama2 70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) across various software and hardware configurations. The goal is to maximize system throughput (measured in tokens per second) without compromising accuracy. Since the model performs poorly on CPUs, it is essential to run it on GPUs.

For a valid MLPerf Inference submission in this competition, you must run both a performance test and an accuracy test—**no compliance runs are required**. We use the **Offline** scenario, where throughput is the key metric (higher is better). For Llama 2-70B with the OpenOrca dataset (24,576 samples), the **performance run** must process an integer multiple of the full dataset (24,576 × *N* samples; for example, N=2 means 49,152 samples), while the **accuracy run** must process **exactly** the full dataset (24,576 samples). These requirements are handled automatically by the MLPerf inference implementations. Setup for NVIDIA GPUs typically takes 2–3 hours and can be done offline. The final output is a tarball (`mlperf_submission.tar.gz`) containing MLPerf-compatible results, which can be submitted to the organizers via a CLI command.

## Scoring
In the SCC, your first objective will be to get a valid MLPerf benchmark run. Traditionally, running the reference MLPerf inference implementation (in Python) is easier than running the Nvidia MLPerf inference implementation. However, since SCC25 uses the Llama2-70b model, running the reference implementation needs around 600GB of VRAM and has been tested only on 8xH100 Nvidia GPUs. If you have less VRAM, a vendor implementation such as Nvidia's or AMD's is the best option.

MLCommons provides [automation](https://github.com/mlcommons/mlperf-automations/) for running the MLPerf inference benchmarks which you can make use of. The automation currently supports the reference implementation as well as the Nvidia implementation, and it is useful for getting a quick valid result because it produces the required final output. You can also follow the manual steps in the [reference](https://github.com/mlcommons/inference/tree/master/language/llama2-70b), [Nvidia](https://github.com/mlcommons/inference_results_v5.0/tree/main/closed/NVIDIA) or [AMD](https://github.com/mlcommons/inference_results_v5.0/tree/main/closed/AMD) implementation readmes.

Once the initial run is successful, you'll have the opportunity to optimize the benchmark further by maximizing system utilization, applying quantization techniques, adjusting ML frameworks, experimenting with batch sizes, and more, all of which can earn you additional points.

Since vendor implementations of the MLPerf inference benchmark vary, teams will compete within their respective hardware categories (e.g., Nvidia GPUs, AMD GPUs). Points will be awarded based on the throughput achieved on your system.

Additionally, significant bonus points will be awarded if your team enhances an existing implementation, enables multi-node execution, or adds/extends scripts in the [mlperf-automations repository](https://github.com/mlcommons/mlperf-automations/tree/dev/script) to support new devices, frameworks, implementations, etc. All improvements must be made publicly available under the Apache 2.0 license and submitted as pull requests by November 10, 2025; only code that is *merge ready* will be considered for evaluation. As a guideline, below are some examples that can earn you bonus points.
* Adding multi-node execution support for the Nvidia, AMD or reference implementations
* Supporting automation for the AMD implementation
* Supporting fp8/fp4 quantization for the reference implementation
* Automating the [network reference implementation](https://github.com/mlcommons/inference/blob/master/language/llama2-70b/SUT_API.py), which uses OpenAI-compatible endpoints (see the sketch after this list)
* The MLPerf automation supports Docker runs of the Nvidia implementation; supporting Apptainer would be a valuable contribution
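For orientation only: "OpenAI-compatible endpoints" means the network reference implementation forwards inference requests to an HTTP server exposing the OpenAI API surface (for example a vLLM server). The snippet below is a hypothetical sketch of such a request; the host, port, model name and prompt are placeholder assumptions, not values prescribed by the benchmark.

```bash
# Hypothetical example of querying an OpenAI-compatible completions endpoint.
# Host, port, model name and prompt are placeholders for illustration only.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-2-70b-chat-hf",
        "prompt": "Summarize the following article: ...",
        "max_tokens": 128
      }'
```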
PS: For any query regarding contributions, feel free to raise an issue in the [Inference](https://github.com/mlcommons/inference) or [MLPerf automations](https://github.com/mlcommons/mlperf-automations) repositories.

!!! info
    Both MLPerf and MLC automation are evolving projects.
    If you encounter issues related to SCC, please submit them [here](https://github.com/mlcommons/inference/issues) with the **scc-25** label,
    including the command used, error logs, and any additional useful information needed to debug the issue.
## Artifacts to submit to the SCC committee
You will need to submit the following files:

* `mlperf_submission.run` - the MLC commands used to run the MLPerf inference benchmark, saved to this file (an illustrative sketch follows this list).
* `mlperf_submission.md` - a description of your platform and some highlights of the MLPerf benchmark execution.
* The `<Team Name>` under which the results are pushed to the GitHub repository.
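As a rough illustration of what `mlperf_submission.run` could contain, the sketch below shows the general shape of an MLC run command. It is an assumed example only; the exact commands for your system are generated by the tabs in the "Run Commands" section below and will differ by implementation, framework, device and scenario.

```bash
# Hypothetical contents of mlperf_submission.run.
# The real commands come from the "Run Commands" section and depend on your
# chosen implementation, framework, device and scenario.
mlcr run-mlperf,inference,_full,_r5.1-dev \
   --model=llama2-70b-99 \
   --implementation=nvidia \
   --framework=tensorrt \
   --category=datacenter \
   --scenario=Offline \
   --execution_mode=valid \
   --device=cuda \
   --quiet
```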
## SCC interview
You are encouraged to highlight and explain the MLPerf inference throughput obtained on your system
and to describe any improvements and extensions to this benchmark (such as adding a new hardware backend
or supporting multi-node execution) that are useful for the community and [MLCommons](https://mlcommons.org).

## Run Commands
=== "MLCommons-Python"
57+
## MLPerf Reference Implementation in Python
58+
59+
{{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "reference", fixed_scenarios=["Offline"], categories=["Datacenter"], setup_tips=False, implementation_tips=False, skip_test_query_count=True) }}
60+
61+
=== "Nvidia"
62+
## Nvidia MLPerf Implementation
63+
64+
{{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "nvidia", fixed_scenarios=["Offline"], categories=["Datacenter"], setup_tips=False, implementation_tips=False, skip_test_query_count=True) }}
65+
66+
## Submission Commands
### Generate actual submission tree

```bash
mlcr generate,inference,submission,_wg-inference \
   --clean \
   --run-checker \
   --tar=yes \
   --env.MLC_TAR_OUTFILE=submission.tar.gz \
   --division=open \
   --category=datacenter \
   --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \
   --quiet \
   --submitter=<Team Name>
```
* Use `--hw_name="My system name"` to give a meaningful system name.
* At the end, a **.tar** file will be generated inside the current working directory (see the quick check below).
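Before submitting, it can help to sanity-check that the tarball exists and contains the expected submission tree. A minimal check, assuming the output file name set via `--env.MLC_TAR_OUTFILE` above:

```bash
# List the first entries of the generated tarball to confirm the submission
# tree is present (file name as set via --env.MLC_TAR_OUTFILE).
tar -tzf submission.tar.gz | head -n 20
```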
### Submit Results
> **Note:**
> Further instructions on the final submission will be published as the deadline approaches.

<!-- Fork the `mlperf-inference-results-scc25` branch of the repository URL at [mlperf-automations](https://github.com/mlcommons/mlperf-automations).

Run the following command after **replacing `--repo_url` with your GitHub fork URL**.

```bash
mlcr push,github,mlperf,inference,submission \
   --repo_url=https://github.com/<myfork>/mlperf-automations \
   --repo_branch=mlperf-inference-results-scc25 \
   --commit_message="Results on system <HW Name>" \
   --quiet
```

Once uploaded, open a Pull Request to the origin repository. A GitHub Action will run there, and once it
finishes you can see your submitted results at [https://docs.mlcommons.org/mlperf-automations](https://docs.mlcommons.org/mlperf-automations). -->

docs/benchmarks/medical_imaging/3d-unet.md

Lines changed: 2 additions & 0 deletions
@@ -22,10 +22,12 @@ hide:

{{ mlperf_inference_implementation_readme (4, "3d-unet-99.9", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation

{{ mlperf_inference_implementation_readme (4, "3d-unet-99", "intel") }}


{{ mlperf_inference_implementation_readme (4, "3d-unet-99.9", "intel") }}
+-->

docs/benchmarks/object_detection/retinanet.md

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ hide:

{{ mlperf_inference_implementation_readme (4, "retinanet", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation

@@ -29,3 +30,4 @@ hide:
## MLPerf Modular Implementation in C++

{{ mlperf_inference_implementation_readme (4, "retinanet", "cpp") }}
+-->

docs/benchmarks/recommendation/dlrm-v2.md

Lines changed: 2 additions & 0 deletions
@@ -19,9 +19,11 @@ hide:

{{ mlperf_inference_implementation_readme (4, "dlrm-v2-99.9", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation

{{ mlperf_inference_implementation_readme (4, "dlrm-v2-99", "intel") }}

{{ mlperf_inference_implementation_readme (4, "dlrm-v2-99.9", "intel") }}
+-->

docs/benchmarks/text_to_image/sdxl.md

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,8 @@ hide:

{{ mlperf_inference_implementation_readme (4, "sdxl", "nvidia") }}

+<!--
=== "Intel"
## Intel MLPerf Implementation
{{ mlperf_inference_implementation_readme (4, "sdxl", "intel") }}
-
+-->

main.py

Lines changed: 32 additions & 26 deletions
@@ -28,6 +28,7 @@ def mlperf_inference_implementation_readme(
    content = ""

    execution_envs = ["Docker", "Native"]
+    run_modes = ["performance-only", "accuracy-only"]
    code_version = "r5.0-dev"
    implementation_run_options = []

@@ -67,7 +68,7 @@

    elif implementation == "nvidia":
        if model in ["retinanet", "resnet50",
-                     "3d-unet-99", "3d-unet-99.9"]:
+                     "3d-unet-99", "3d-unet-99.9", "llama2-70b-99", "llama2-70b-99.9"]:
            code_version = "r5.1-dev"
        if model in ["mixtral-8x7b"]:
            return pre_space + " WIP"
@@ -186,6 +187,7 @@
    cur_space2 = cur_space1 + " "
    cur_space3 = cur_space2 + " "
    cur_space4 = cur_space3 + " "
+    cur_space5 = cur_space4 + " "

    content += f"{cur_space1}=== \"{device}\"\n"
    content += f"{cur_space2}##### {device} device\n\n"
@@ -305,6 +307,8 @@

    if implementation.lower() == "nvidia":
        content += f"{cur_space3}* `--gpu_name=<Name of the GPU>` : The GPUs with supported configs in MLC are `orin`, `rtx_4090`, `rtx_a6000`, `rtx_6000_ada`, `l4`, `t4`and `a100`. For other GPUs, default configuration as per the GPU memory will be used.\n"
+        if "llama2-70b" in model.lower():
+            content += f"{cur_space3}* Add `--adr.llama2-model.tags=_pre-quantized` to use the Nvidia quantized models available in the MLC Storage. These models were quantized with three different configurations of tensor parallelism and pipeline parallelism: TP1–PP2, TP2–PP1, and TP1–PP1. The appropriate model will be automatically selected based on the values provided for `--tp_size` and `--pp_size` in the run command. By default, a tp_size of 2 and a pp_size of 1 will be used.\n"

    if device.lower() not in ["cuda"]:
        content += f"{cur_space3}* `--docker_os=ubuntu`: ubuntu and rhel are supported. \n"
@@ -373,25 +377,27 @@

    for scenario in scenarios:
        content += f"{cur_space3}=== \"{scenario}\"\n{cur_space4}###### {scenario}\n\n"
-        run_cmd = mlperf_inference_run_command(
-            spaces + 21,
-            model,
-            implementation,
-            framework.lower(),
-            category.lower(),
-            scenario,
-            device.lower(),
-            final_run_mode,
-            test_query_count,
-            False,
-            skip_test_query_count,
-            scenarios,
-            code_version,
-            extra_variation_tags,
-            extra_input_string,
-        )
-        content += run_cmd
-        # content += run_suffix
+        for run_mode in run_modes:
+            content += f"{cur_space4}=== \"{run_mode}\"\n{cur_space5}###### {run_mode}\n\n"
+            run_cmd = mlperf_inference_run_command(
+                spaces + 25,
+                model,
+                implementation,
+                framework.lower(),
+                category.lower(),
+                scenario,
+                device.lower(),
+                final_run_mode,
+                test_query_count,
+                False,
+                skip_test_query_count,
+                scenarios,
+                code_version,
+                extra_variation_tags + f",_{run_mode}",
+                extra_input_string,
+            )
+            content += run_cmd
+            # content += run_suffix

    if len(scenarios) > 1:
        content += f"{cur_space3}=== \"All Scenarios\"\n{cur_space4}###### All Scenarios\n\n"
@@ -481,7 +487,7 @@ def get_min_system_requirements(spaces, model, implementation, device):
    ds = {
        "dlrm": "500GB",
        "pointpainting": "500GB",
-        "llama2-70b": "600GB",
+        "llama2-70b": "900GB",
        "llama3_1-405b": "2.3TB",
        "mixtral": "100GB",
        "retinanet": "200GB",
@@ -498,7 +504,12 @@
            disk_space = ds[key]
            break

+    if "llama2" in model.lower():
+        disk_space = f" 900GB for manual execution of {'reference' if implementation.lower() == 'reference' else 'vendor'} implementation and 1.5TB for automated run through MLC-Scripts"
+
+    if implementation.lower() == "reference" or "llama2" in model.lower():
        min_sys_req_content += f"{spaces}* **Disk Space**: {disk_space}\n\n"
+
    # System memory
    if "dlrm" in model:
        system_memory = "512GB"
@@ -583,9 +594,6 @@ def get_docker_info(spaces, model, implementation,
    if implementation.lower() == "nvidia":
        info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n"
        info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n"
-        if "llama2" in model.lower():
-            info += f"{pre_space} - The dataset for NVIDIA's implementation of Llama2 is not publicly available. The user must fill [this](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform?pli=1&fbzx=-8842630989397184967) form and be verified as a MLCommons member to access the dataset.\n\n"
-            info += f"{pre_space} - `PATH_TO_PICKE_FILE` should be replaced with path to the downloaded pickle file.\n\n"
    else:
        if model == "sdxl":
            info += f"\n{pre_space}!!! tip\n\n"
@@ -731,7 +739,6 @@
    if "llama2-70b" in model.lower():
        if implementation == "nvidia":
            docker_cmd_suffix += f" \\\n{pre_space} --tp_size=2"
-            docker_cmd_suffix += f" \\\n{pre_space} --nvidia_llama2_dataset_file_path=<PATH_TO_PICKLE_FILE>"
        elif implementation == "neuralmagic":
            docker_cmd_suffix += (
                f" \\\n{pre_space} --api_server=http://localhost:8000"
@@ -779,7 +786,6 @@
    if "llama2-70b" in model.lower():
        if implementation == "nvidia":
            cmd_suffix += f" \\\n{pre_space} --tp_size=<TP_SIZE>"
-            cmd_suffix += f" \\\n{pre_space} --nvidia_llama2_dataset_file_path=<PATH_TO_PICKE_FILE>"
        elif implementation == "neuralmagic":
            cmd_suffix += f" \\\n{pre_space} --api_server=http://localhost:8000"
            cmd_suffix += f" \\\n{pre_space} --vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8"
