Commit 80a2c97

quanru and Claude authored
feat(core,shared): enforce VL mode requirement for Planning (#1332)
This change ensures that Planning functionality only supports vision language models (VL mode) and removes DOM-based planning support.

Changes:
- Add validation in ModelConfigManager.getModelConfig() to require VL mode for the Planning intent
- Remove DOM mode logic from llm-planning.ts (describeUserPage, markupImageForLLM)
- Simplify image processing to only support VL mode paths
- Add comprehensive JSDoc documentation for the Planning VL mode requirement
- Add 6 new unit tests covering Planning VL mode validation in both isolated and normal modes
- Fix existing tests to provide VL mode for the Planning intent

Breaking Change:
- Planning without VL mode configured will now throw an error with clear instructions
- The error message includes all supported VL modes and configuration examples

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <[email protected]>
1 parent 46ffc1d commit 80a2c97
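To illustrate the breaking change, here is a minimal sketch of the environment-variable route named in the new error message. The variable name and the 'qwen-vl' value come from this commit; the model name and the idea of setting it via process.env are illustrative assumptions.

```ts
// Sketch: satisfy the new Planning VL mode requirement via env vars.
// MIDSCENE_PLANNING_VL_MODE and 'qwen-vl' are cited in the new error message;
// 'qwen-vl-plus' mirrors the model name used in this commit's tests.
process.env.MIDSCENE_PLANNING_VL_MODE = 'qwen-vl';
process.env.MIDSCENE_PLANNING_MODEL_NAME = 'qwen-vl-plus';
```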

File tree

4 files changed: +212 -43 lines changed

packages/core/src/ai-model/llm-planning.ts

Lines changed: 17 additions & 38 deletions
@@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env';
 import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
-import type {
-  ChatCompletionContentPart,
-  ChatCompletionMessageParam,
-} from 'openai/resources/index';
+import type { ChatCompletionMessageParam } from 'openai/resources/index';
 import {
   AIActionType,
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  markupImageForLLM,
   warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
-import { describeUserPage } from './prompt/util';
 import { callAIWithObjectResponse } from './service-caller/index';
 
 const debug = getDebug('planning');
@@ -43,10 +38,9 @@ export async function plan(
 
   const { modelName, vlMode } = modelConfig;
 
-  const { description: pageDescription, elementById } = await describeUserPage(
-    context,
-    { vlMode },
-  );
+  // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
+  assert(vlMode, 'Planning requires vlMode to be configured.');
+
   const systemPrompt = await systemPromptToTaskPlanning({
     actionSpace: opts.actionSpace,
     vlMode: vlMode,
@@ -57,21 +51,19 @@ export async function plan(
   let imageHeight = size.height;
   const rightLimit = imageWidth;
   const bottomLimit = imageHeight;
+
+  // Process image based on VL mode requirements
   if (vlMode === 'qwen-vl') {
     const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
+    // Reserved for qwen3-vl specific processing
     // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
     // imageWidth = paddedResult.width;
     // imageHeight = paddedResult.height;
     // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
-      width: imageWidth,
-      height: imageHeight,
-    });
   }
 
   warnGPT4oSizeLimit(size, modelName);
@@ -120,14 +112,7 @@ export async function plan(
           detail: 'high',
         },
       },
-      ...(vlMode
-        ? []
-        : ([
-            {
-              type: 'text',
-              text: pageDescription,
-            },
-          ] as ChatCompletionContentPart[])),
+      // Planning uses pure vision mode, no DOM description needed
     ],
   },
 ];
@@ -173,21 +158,15 @@ export async function plan(
   locateFields.forEach((field) => {
     const locateResult = action.param[field];
     if (locateResult) {
-      if (vlMode) {
-        action.param[field] = fillBboxParam(
-          locateResult,
-          imageWidth,
-          imageHeight,
-          rightLimit,
-          bottomLimit,
-          vlMode,
-        );
-      } else {
-        const element = elementById(locateResult);
-        if (element) {
-          action.param[field].id = element.id;
-        }
-      }
+      // Always use VL mode to fill bbox parameters
+      action.param[field] = fillBboxParam(
+        locateResult,
+        imageWidth,
+        imageHeight,
+        rightLimit,
+        bottomLimit,
+        vlMode,
+      );
     }
   });
 });

packages/shared/src/env/model-config-manager.ts

Lines changed: 23 additions & 2 deletions
@@ -5,6 +5,7 @@ import {
 import type { GlobalConfigManager } from './global-config-manager';
 
 import type { IModelConfig, TIntent, TModelConfigFn } from './types';
+import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types';
 
 const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning'];
 
@@ -101,13 +102,15 @@ export class ModelConfigManager {
    * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined
    */
   getModelConfig(intent: TIntent): IModelConfig {
+    let config: IModelConfig;
+
     if (this.isolatedMode) {
       if (!this.modelConfigMap) {
         throw new Error(
           'modelConfigMap is not initialized in isolated mode, which should not happen',
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     } else {
       if (!this.modelConfigMap) {
         if (!this.globalConfigManager) {
@@ -119,8 +122,26 @@ export class ModelConfigManager {
           this.globalConfigManager.getAllEnvConfig(),
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
+    }
+
+    // Validate Planning must use VL mode
+    if (intent === 'planning' && !config.vlMode) {
+      throw new Error(
+        `Planning requires a vision language model (VL model). DOM-based planning is not supported.
+
+Please configure one of the following VL modes:
+  ${VL_MODES.map((mode) => `- ${mode}`).join('\n  ')}
+
+Configuration examples:
+- Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl
+- Or use modelConfig function with planning intent
+
+Learn more: https://midscenejs.com/choose-a-model`,
+      );
     }
+
+    return config;
   }
 
   getUploadTestServerUrl(): string | undefined {
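To show how the new validation surfaces to callers, here is a sketch mirroring the config-function pattern used by the new unit tests below. The '@midscene/shared/env' entry point is an assumption (llm-planning.ts imports IModelConfig from it); the constants and constructor usage match the test file.

```ts
import {
  ModelConfigManager, // assumed to be exported from this entry point
  MIDSCENE_MODEL_NAME,
  MIDSCENE_OPENAI_API_KEY,
  MIDSCENE_OPENAI_BASE_URL,
  MIDSCENE_PLANNING_MODEL_NAME,
  MIDSCENE_PLANNING_OPENAI_API_KEY,
  MIDSCENE_PLANNING_OPENAI_BASE_URL,
  type TModelConfigFn,
} from '@midscene/shared/env';

// A config function that omits MIDSCENE_PLANNING_VL_MODE for the planning
// intent -- the failing shape exercised by the new tests.
const modelConfigFn: TModelConfigFn = ({ intent }) => {
  if (intent === 'planning') {
    return {
      [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
      [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
      [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
    };
  }
  return {
    [MIDSCENE_MODEL_NAME]: 'gpt-4',
    [MIDSCENE_OPENAI_API_KEY]: 'test-key',
    [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
  };
};

const manager = new ModelConfigManager(modelConfigFn);

try {
  manager.getModelConfig('planning');
} catch (err) {
  // Message: "Planning requires a vision language model (VL model). ..."
  // followed by the supported VL modes and configuration examples.
  console.error((err as Error).message);
}
```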

packages/shared/src/env/types.ts

Lines changed: 15 additions & 0 deletions
@@ -238,6 +238,21 @@ export interface IModelConfigForVQA {
   [MIDSCENE_VQA_VL_MODE]?: TVlModeValues;
 }
 
+/**
+ * Model configuration for Planning intent.
+ *
+ * IMPORTANT: Planning MUST use a vision language model (VL mode).
+ * DOM-based planning is not supported.
+ *
+ * Required: MIDSCENE_PLANNING_VL_MODE must be set to one of:
+ * - 'qwen-vl'
+ * - 'qwen3-vl'
+ * - 'gemini'
+ * - 'doubao-vision'
+ * - 'vlm-ui-tars'
+ * - 'vlm-ui-tars-doubao'
+ * - 'vlm-ui-tars-doubao-1.5'
+ */
 export interface IModelConfigForPlanning {
   // model name
   [MIDSCENE_PLANNING_MODEL_NAME]: string;
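To make the documented requirement concrete, a sketch of a planning config object carrying the required VL mode key. The key constants are the ones the tests below import; the entry-point path, and the shape of the interface beyond the model-name field shown in this diff, are assumptions (hence Partial).

```ts
// Assumed entry point; adjust to wherever the package exports these symbols.
import type { IModelConfigForPlanning } from '@midscene/shared/env';
import {
  MIDSCENE_PLANNING_MODEL_NAME,
  MIDSCENE_PLANNING_VL_MODE,
} from '@midscene/shared/env';

// Partial<> because only the model-name field of the interface is visible in
// this diff; real configs also carry API key / base URL keys.
const planningConfig: Partial<IModelConfigForPlanning> = {
  [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
  [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl', // required: one of the values listed above
};
```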

packages/shared/tests/unit-test/env/modle-config-manager.test.ts

Lines changed: 157 additions & 3 deletions
@@ -13,6 +13,7 @@ import {
   MIDSCENE_PLANNING_MODEL_NAME,
   MIDSCENE_PLANNING_OPENAI_API_KEY,
   MIDSCENE_PLANNING_OPENAI_BASE_URL,
+  MIDSCENE_PLANNING_VL_MODE,
   MIDSCENE_VQA_MODEL_NAME,
   MIDSCENE_VQA_OPENAI_API_KEY,
   MIDSCENE_VQA_OPENAI_BASE_URL,
@@ -48,9 +49,10 @@ describe('ModelConfigManager', () => {
       };
     case 'planning':
       return {
-        [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+        [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
         [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
         [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
       };
     case 'grounding':
       return {
@@ -105,9 +107,10 @@ describe('ModelConfigManager', () => {
       };
     case 'planning':
       return {
-        [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+        [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
         [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
         [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl',
       };
     case 'grounding':
       return {
@@ -131,10 +134,11 @@ describe('ModelConfigManager', () => {
     expect(vqaConfig.from).toBe('modelConfig');
 
     const planningConfig = manager.getModelConfig('planning');
-    expect(planningConfig.modelName).toBe('gpt-4');
+    expect(planningConfig.modelName).toBe('qwen-vl-plus');
     expect(planningConfig.openaiApiKey).toBe('test-planning-key');
     expect(planningConfig.intent).toBe('planning');
     expect(planningConfig.from).toBe('modelConfig');
+    expect(planningConfig.vlMode).toBe('qwen-vl');
 
     const groundingConfig = manager.getModelConfig('grounding');
     expect(groundingConfig.modelName).toBe('gpt-4-vision');
@@ -263,4 +267,154 @@ describe('ModelConfigManager', () => {
       expect(config.openaiBaseURL).toBe('https://isolated.openai.com/v1');
     });
   });
+
+  describe('Planning VL mode validation', () => {
+    it('should throw error when planning has no vlMode in isolated mode', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          // Missing VL mode for planning
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        }
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
+      );
+    });
+
+    it('should succeed when planning has valid vlMode in isolated mode', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+            [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
+          };
+        }
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+      const config = manager.getModelConfig('planning');
+
+      expect(config.vlMode).toBe('qwen-vl');
+      expect(config.modelName).toBe('qwen-vl-plus');
+    });
+
+    it('should throw error when planning has no vlMode in normal mode', () => {
+      vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'gpt-4');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
+      // Intentionally not setting MIDSCENE_PLANNING_VL_MODE
+
+      const manager = new ModelConfigManager();
+      manager.registerGlobalConfigManager(new GlobalConfigManager());
+
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
+      );
+    });
+
+    it('should succeed when planning has valid vlMode in normal mode', () => {
+      vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'qwen-vl-plus');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
+      vi.stubEnv(MIDSCENE_PLANNING_VL_MODE, 'qwen-vl');
+
+      const manager = new ModelConfigManager();
+      manager.registerGlobalConfigManager(new GlobalConfigManager());
+
+      const config = manager.getModelConfig('planning');
+
+      expect(config.vlMode).toBe('qwen-vl');
+      expect(config.modelName).toBe('qwen-vl-plus');
+      expect(config.intent).toBe('planning');
+    });
+
+    it('should not affect other intents when planning validation fails', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          // Missing VL mode for planning - should fail
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        }
+        // Other intents should work fine
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+
+      // Planning should fail
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model',
+      );
+
+      // Other intents should succeed
+      expect(() => manager.getModelConfig('default')).not.toThrow();
+      expect(() => manager.getModelConfig('VQA')).not.toThrow();
+      expect(() => manager.getModelConfig('grounding')).not.toThrow();
+    });
+
+    it('should accept all valid VL modes for planning', () => {
+      const vlModeTestCases: Array<{
+        raw: 'qwen-vl' | 'qwen3-vl' | 'gemini' | 'doubao-vision' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
+        expected: string;
+      }> = [
+        { raw: 'qwen-vl', expected: 'qwen-vl' },
+        { raw: 'qwen3-vl', expected: 'qwen3-vl' },
+        { raw: 'gemini', expected: 'gemini' },
+        { raw: 'doubao-vision', expected: 'doubao-vision' },
+        // UI-TARS variants all normalize to 'vlm-ui-tars'
+        { raw: 'vlm-ui-tars', expected: 'vlm-ui-tars' },
+        { raw: 'vlm-ui-tars-doubao', expected: 'vlm-ui-tars' },
+        { raw: 'vlm-ui-tars-doubao-1.5', expected: 'vlm-ui-tars' },
+      ];
+
+      for (const { raw, expected } of vlModeTestCases) {
+        const modelConfigFn: TModelConfigFn = ({ intent }) => {
+          if (intent === 'planning') {
+            return {
+              [MIDSCENE_PLANNING_MODEL_NAME]: 'test-model',
+              [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+              [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+              [MIDSCENE_PLANNING_VL_MODE]: raw,
+            };
+          }
+          return {
+            [MIDSCENE_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        };
+
+        const manager = new ModelConfigManager(modelConfigFn);
+        const config = manager.getModelConfig('planning');
+
+        expect(config.vlMode).toBe(expected);
+      }
+    });
+  });
 });
