
Commit e46faff

xrsrke, NouamaneTazi, c8ef, eliebak, and zzhhjjj authored and committed
MoE without token dropping (#355)
* can only merge to main from dev (#348)
* move moe from qwen modeling to src/nn
* add grouped MLP
* add token permute and unpermute
* fix num_tokens_per_expert counting when < num_experts
* fix init and init scaling factor and run evals in background (#353, #349, #352): InitScalingMethod, hook a LightEval runner into the trainer, qos set to low, add nanotron_path, fixes to logs and config, cp instead of sync, eval_interval, serialize sanity checks, add output dir and s3_save path to the config, only use s3 if defined
* Fix UnboundLocalError in `clm_collator.py` (#339)
* inference for Qwen-MoE works
* update readme
* fix router weight initialization and wrong hidden size for non-MoE MLP in qwen
* add source for router weight and compute router logits in float32
* parametrize grouped MLP in column and row linear
* add logging of per-param grad norm
* fix conversion failure due to buffer on cpu
* config_qwen
* fix moe convert config

Co-authored-by: Nouamane Tazi <[email protected]>
Co-authored-by: Connector Switch <[email protected]>
Co-authored-by: elie <[email protected]>
Co-authored-by: “eliebak” <[email protected]>
Co-authored-by: zzhhjjj <[email protected]>
Co-authored-by: nouamanetazi <[email protected]>
1 parent 184dd0b commit e46faff
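
The headline change is dropless routing: every token is dispatched to its assigned expert rather than being discarded once an expert's capacity is full. The sketch below is illustrative only and is not the nanotron implementation; it shows the token permute/unpermute pattern named in the commit message (sort tokens by expert, count tokens per expert for the grouped MLP, restore the original order afterwards). All names are hypothetical.

```python
# Illustrative sketch (not nanotron's code): dropless token permutation for MoE.
import torch

def permute_tokens(hidden_states: torch.Tensor, expert_indices: torch.Tensor, num_experts: int):
    """Sort tokens by assigned expert; no capacity limit, so no token is dropped."""
    order = torch.argsort(expert_indices, stable=True)           # stable keeps intra-expert order
    permuted = hidden_states[order]                              # expert-contiguous layout
    num_tokens_per_expert = torch.bincount(expert_indices, minlength=num_experts)
    return permuted, order, num_tokens_per_expert

def unpermute_tokens(expert_output: torch.Tensor, order: torch.Tensor) -> torch.Tensor:
    """Restore the original token order after the expert MLPs ran."""
    restored = torch.empty_like(expert_output)
    restored[order] = expert_output
    return restored

# Top-1 routing over dummy data; router logits are kept in float32.
tokens = torch.randn(16, 64)                                     # (num_tokens, hidden_size)
router_logits = torch.randn(16, 8)                               # (num_tokens, num_experts)
expert_indices = router_logits.float().argmax(dim=-1)
permuted, order, counts = permute_tokens(tokens, expert_indices, num_experts=8)
assert torch.equal(unpermute_tokens(permuted, order), tokens)    # round trip is lossless
```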

File tree

17 files changed: +865 -233 lines

examples/config_qwen.py

Lines changed: 1 addition & 1 deletion

@@ -108,7 +108,7 @@ def get_model_config(model_size: str) -> Qwen2Config:
         is_qwen2_config=True,
         pad_token_id=None,
         _attn_implementation="flash_attention_2",
-        sliding_window_size=20,
+        # sliding_window_size=20,
     )

examples/config_qwen.yaml

Lines changed: 7 additions & 7 deletions

@@ -32,7 +32,7 @@ general:
   consumed_train_samples: null
   ignore_sanity_checks: false
   project: debug
-  run: qwen_20250410_014907_16027793
+  run: qwen_20250423_201000_16423158
   seed: 42
   step: null
 lighteval: null
@@ -45,14 +45,15 @@ model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
   init_method:
+    scaling_method: NUM_LAYERS
     std: 0.025
   make_vocab_size_divisible_by: 1
   model_config:
     _attn_implementation: flash_attention_2
-    _fused_rms_norm: true
-    _fused_rotary_emb: true
-    _use_doc_masking: true
-    _use_qkv_packed: true
+    _fused_rms_norm: false
+    _fused_rotary_emb: false
+    _use_doc_masking: false
+    _use_qkv_packed: false
     attention_bias: false
     bos_token_id: 1
     eos_token_id: 2
@@ -74,7 +75,7 @@ model:
     rope_interleaved: false
     rope_scaling: null
     rope_theta: 10000.0
-    sliding_window_size: 20
+    sliding_window_size: null
     tie_word_embeddings: true
     use_cache: true
     vocab_size: 128256
@@ -104,7 +105,6 @@ parallelism:
   context_parallel_size: 1
   dp: 2
   expert_parallel_size: 1
-  moe_layer_recompute: false
   pp: 1
   pp_engine: 1f1b
   recompute_layer: false
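
The new `scaling_method: NUM_LAYERS` entry under `init_method` selects the depth-aware initialization added by this commit. The exact rule lives in the nanotron source; below is a minimal sketch of the usual convention this setting suggests (shrinking the init std of residual-output projections as the layer count grows), with hypothetical function names.

```python
# Hedged sketch, not nanotron's implementation: a common NUM_LAYERS-style scaling divides
# the init std of residual-output projections by sqrt(2 * num_layers) so the variance of
# the residual stream does not grow with depth (GPT-2-style convention).
import math
import torch.nn as nn

def init_residual_proj(layer: nn.Linear, std: float, num_layers: int) -> None:
    scaled_std = std / math.sqrt(2 * num_layers)  # assumption: two residual adds per layer
    nn.init.normal_(layer.weight, mean=0.0, std=scaled_std)
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)

# Example with the config's std of 0.025 and a hypothetical 12-layer model.
init_residual_proj(nn.Linear(256, 256), std=0.025, num_layers=12)
```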

examples/config_qwen_with_moe.yaml

Lines changed: 132 additions & 0 deletions

@@ -0,0 +1,132 @@ (entire file added)
checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /fsx/phuc/new_workspace/experiments/qwen2_moe_test
  checkpoints_path_is_shared_file_system: false
  load_lr_scheduler: true
  load_optimizer: true
  resume_checkpoint_path: null
  save_final_state: true
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder:
      - /fsx/loubna/datasets/llama_tokenized/fineweb-edu/merged
      dataset_max_tokens: null
      dataset_read_path: null
      dataset_weights: null
      pad_samples_to_global_batch_size: false
      return_positions: true
      shuffle_files: false
      skip_in_stream: false
      token_size_in_bytes: 4
      tokenizer_name: meta-llama/Llama-3.2-1B
      use_old_brrr_dataloader: false
      vocab_size: 128256
    num_loading_workers: 1
    seed: 42
  name: Stable Training Stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: false
  project: qwen_moe
  run: qwen_20250410_014907_16027793
  seed: 42
  step: null
lighteval: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
metrics_logging: null
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    _attn_implementation: flash_attention_2
    _fused_rms_norm: true
    _fused_rotary_emb: true
    _use_doc_masking: true
    _use_qkv_packed: true
    attention_bias: false
    bos_token_id: 1
    eos_token_id: 2
    flex_attention_mask: null
    hidden_act: silu
    hidden_size: 256
    initializer_range: 0.02
    intermediate_size: 768
    is_qwen2_config: true
    max_position_embeddings: 4096
    moe_config: null
    no_rope_layer: null
    num_attention_heads: 4
    num_hidden_layers: 12
    num_key_value_heads: 4
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-06
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000.0
    sliding_window_size: 20
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 128256
    z_loss_coefficient: 0.0001
    z_loss_enabled: false
  moe_config:
    num_experts: 8
    top_k: 1
    enable_shared_expert: true
    token_dispatcher_type: alltoall
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_starting_step: null
    lr_decay_steps: 31998
    lr_decay_style: cosine
    lr_warmup_steps: 2
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  weight_decay_exclude_named_params: []
  zero_stage: 0
parallelism:
  context_parallel_size: 1
  dp: 2
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
s3_upload: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: meta-llama/Llama-3.2-1B
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 3
  sequence_length: 4096
  train_steps: 32000
  val_check_interval: -1
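
The `moe_config` block above (`num_experts: 8`, `top_k: 1`, shared expert enabled) implies top-1 routing plus a shared expert that processes every token. Below is a minimal, self-contained sketch of that routing pattern for one layer; it is not the nanotron module (which uses grouped GEMMs and an all-to-all token dispatcher), and every class and parameter name here is illustrative.

```python
# Minimal top-1 MoE layer with a shared expert: an illustrative sketch, not nanotron's code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyMoELayer(nn.Module):
    def __init__(self, hidden_size=256, intermediate_size=768, num_experts=8):
        super().__init__()
        self.router = nn.Linear(hidden_size, num_experts, bias=False)

        def make_mlp():
            return nn.Sequential(
                nn.Linear(hidden_size, intermediate_size), nn.SiLU(),
                nn.Linear(intermediate_size, hidden_size),
            )

        self.experts = nn.ModuleList(make_mlp() for _ in range(num_experts))
        self.shared_expert = make_mlp()

    def forward(self, x: torch.Tensor) -> torch.Tensor:    # x: (num_tokens, hidden_size)
        probs = F.softmax(self.router(x).float(), dim=-1)  # router logits in float32
        top_p, top_idx = probs.max(dim=-1)                  # top_k = 1
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):           # loop form; grouped GEMM in practice
            mask = top_idx == e
            if mask.any():
                out[mask] = top_p[mask, None].to(x.dtype) * expert(x[mask])
        return out + self.shared_expert(x)                  # shared expert sees every token

print(TinyMoELayer()(torch.randn(4, 256)).shape)  # torch.Size([4, 256])
```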
Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
# Qwen-MoE Inference

This guide explains how to convert Hugging Face Qwen-MoE models to Nanotron format and run inference with them.

## Convert Qwen-MoE to Nanotron Format

Navigate to the `inference/qwen_moe` directory and run:

```bash
torchrun --nproc-per-node 1 examples/inference/qwen_moe/convert.py \
    --nanotron-checkpoint-path nanotron_checkpoints/Qwen1.5-MoE-A2.7B \
    --pretrained-model-name-or-path Qwen/Qwen1.5-MoE-A2.7B
```

This command will save the converted model weights to the specified path in `nanotron_checkpoints`.

## Run Inference

From the root directory of Nanotron, run:

```bash
torchrun --rdzv_endpoint=localhost:29700 --rdzv-backend=c10d --nproc_per_node=1 \
    run_generate.py \
    --ckpt-path nanotron_checkpoints/Qwen1.5-MoE-A2.7B
```

This command will load the converted model weights and run inference.
