Commit 80a2c97

quanru and Claude authored
feat(core,shared): enforce VL mode requirement for Planning (#1332)
This change ensures that Planning functionality only supports vision language models (VL mode) and removes DOM-based planning support.

Changes:
- Add validation in ModelConfigManager.getModelConfig() to require VL mode for the Planning intent
- Remove DOM mode logic from llm-planning.ts (describeUserPage, markupImageForLLM)
- Simplify image processing to only support VL mode paths
- Add comprehensive JSDoc documentation for the Planning VL mode requirement
- Add 6 new unit tests covering Planning VL mode validation in both isolated and normal modes
- Fix existing tests to provide VL mode for the Planning intent

Breaking Change:
- Planning without VL mode configured will now throw an error with clear instructions
- The error message includes all supported VL modes and configuration examples

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <[email protected]>
1 parent 46ffc1d commit 80a2c97
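To illustrate the breaking change, here is a minimal sketch of the environment-variable route named in the new error message. The variable name and the 'qwen-vl' value come from this commit; the model name and the idea of setting it via process.env are illustrative assumptions.

```ts
// Sketch: satisfy the new Planning VL mode requirement via env vars.
// MIDSCENE_PLANNING_VL_MODE and 'qwen-vl' are cited in the new error message;
// 'qwen-vl-plus' mirrors the model name used in this commit's tests.
process.env.MIDSCENE_PLANNING_VL_MODE = 'qwen-vl';
process.env.MIDSCENE_PLANNING_MODEL_NAME = 'qwen-vl-plus';
```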

File tree

4 files changed: +212 -43 lines changed

packages/core/src/ai-model/llm-planning.ts

Lines changed: 17 additions & 38 deletions
@@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env';
 import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
-import type {
-  ChatCompletionContentPart,
-  ChatCompletionMessageParam,
-} from 'openai/resources/index';
+import type { ChatCompletionMessageParam } from 'openai/resources/index';
 import {
   AIActionType,
   buildYamlFlowFromPlans,
   fillBboxParam,
   findAllMidsceneLocatorField,
-  markupImageForLLM,
   warnGPT4oSizeLimit,
 } from './common';
 import type { ConversationHistory } from './conversation-history';
 import { systemPromptToTaskPlanning } from './prompt/llm-planning';
-import { describeUserPage } from './prompt/util';
 import { callAIWithObjectResponse } from './service-caller/index';
 
 const debug = getDebug('planning');
@@ -43,10 +38,9 @@ export async function plan(
 
   const { modelName, vlMode } = modelConfig;
 
-  const { description: pageDescription, elementById } = await describeUserPage(
-    context,
-    { vlMode },
-  );
+  // Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
+  assert(vlMode, 'Planning requires vlMode to be configured.');
+
   const systemPrompt = await systemPromptToTaskPlanning({
     actionSpace: opts.actionSpace,
     vlMode: vlMode,
@@ -57,21 +51,19 @@ export async function plan(
   let imageHeight = size.height;
   const rightLimit = imageWidth;
   const bottomLimit = imageHeight;
+
+  // Process image based on VL mode requirements
   if (vlMode === 'qwen-vl') {
     const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
     imageWidth = paddedResult.width;
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
+    // Reserved for qwen3-vl specific processing
     // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
     // imageWidth = paddedResult.width;
     // imageHeight = paddedResult.height;
     // imagePayload = paddedResult.imageBase64;
-  } else if (!vlMode) {
-    imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
-      width: imageWidth,
-      height: imageHeight,
-    });
   }
 
   warnGPT4oSizeLimit(size, modelName);
@@ -120,14 +112,7 @@ export async function plan(
           detail: 'high',
         },
       },
-      ...(vlMode
-        ? []
-        : ([
-            {
-              type: 'text',
-              text: pageDescription,
-            },
-          ] as ChatCompletionContentPart[])),
+      // Planning uses pure vision mode, no DOM description needed
     ],
   },
 ];
@@ -173,21 +158,15 @@ export async function plan(
   locateFields.forEach((field) => {
     const locateResult = action.param[field];
     if (locateResult) {
-      if (vlMode) {
-        action.param[field] = fillBboxParam(
-          locateResult,
-          imageWidth,
-          imageHeight,
-          rightLimit,
-          bottomLimit,
-          vlMode,
-        );
-      } else {
-        const element = elementById(locateResult);
-        if (element) {
-          action.param[field].id = element.id;
-        }
-      }
+      // Always use VL mode to fill bbox parameters
+      action.param[field] = fillBboxParam(
+        locateResult,
+        imageWidth,
+        imageHeight,
+        rightLimit,
+        bottomLimit,
+        vlMode,
+      );
     }
   });
 });

packages/shared/src/env/model-config-manager.ts

Lines changed: 23 additions & 2 deletions
@@ -5,6 +5,7 @@ import {
 import type { GlobalConfigManager } from './global-config-manager';
 
 import type { IModelConfig, TIntent, TModelConfigFn } from './types';
+import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types';
 
 const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning'];
 
@@ -101,13 +102,15 @@ export class ModelConfigManager {
    * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined
    */
   getModelConfig(intent: TIntent): IModelConfig {
+    let config: IModelConfig;
+
     if (this.isolatedMode) {
       if (!this.modelConfigMap) {
         throw new Error(
           'modelConfigMap is not initialized in isolated mode, which should not happen',
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
     } else {
       if (!this.modelConfigMap) {
         if (!this.globalConfigManager) {
@@ -119,8 +122,26 @@ export class ModelConfigManager {
           this.globalConfigManager.getAllEnvConfig(),
         );
       }
-      return this.modelConfigMap[intent];
+      config = this.modelConfigMap[intent];
+    }
+
+    // Validate Planning must use VL mode
+    if (intent === 'planning' && !config.vlMode) {
+      throw new Error(
+        `Planning requires a vision language model (VL model). DOM-based planning is not supported.
+
+Please configure one of the following VL modes:
+  ${VL_MODES.map((mode) => `- ${mode}`).join('\n  ')}
+
+Configuration examples:
+- Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl
+- Or use modelConfig function with planning intent
+
+Learn more: https://midscenejs.com/choose-a-model`,
+      );
     }
+
+    return config;
   }
 
   getUploadTestServerUrl(): string | undefined {
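To show how the new validation surfaces to callers, here is a sketch mirroring the config-function pattern used by the new unit tests below. The '@midscene/shared/env' entry point is an assumption (llm-planning.ts imports IModelConfig from it); the constants and constructor usage match the test file.

```ts
import {
  ModelConfigManager, // assumed to be exported from this entry point
  MIDSCENE_MODEL_NAME,
  MIDSCENE_OPENAI_API_KEY,
  MIDSCENE_OPENAI_BASE_URL,
  MIDSCENE_PLANNING_MODEL_NAME,
  MIDSCENE_PLANNING_OPENAI_API_KEY,
  MIDSCENE_PLANNING_OPENAI_BASE_URL,
  type TModelConfigFn,
} from '@midscene/shared/env';

// A config function that omits MIDSCENE_PLANNING_VL_MODE for the planning
// intent -- the failing shape exercised by the new tests.
const modelConfigFn: TModelConfigFn = ({ intent }) => {
  if (intent === 'planning') {
    return {
      [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
      [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
      [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
    };
  }
  return {
    [MIDSCENE_MODEL_NAME]: 'gpt-4',
    [MIDSCENE_OPENAI_API_KEY]: 'test-key',
    [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
  };
};

const manager = new ModelConfigManager(modelConfigFn);

try {
  manager.getModelConfig('planning');
} catch (err) {
  // Message: "Planning requires a vision language model (VL model). ..."
  // followed by the supported VL modes and configuration examples.
  console.error((err as Error).message);
}
```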

packages/shared/src/env/types.ts

Lines changed: 15 additions & 0 deletions
@@ -238,6 +238,21 @@ export interface IModelConfigForVQA {
   [MIDSCENE_VQA_VL_MODE]?: TVlModeValues;
 }
 
+/**
+ * Model configuration for Planning intent.
+ *
+ * IMPORTANT: Planning MUST use a vision language model (VL mode).
+ * DOM-based planning is not supported.
+ *
+ * Required: MIDSCENE_PLANNING_VL_MODE must be set to one of:
+ * - 'qwen-vl'
+ * - 'qwen3-vl'
+ * - 'gemini'
+ * - 'doubao-vision'
+ * - 'vlm-ui-tars'
+ * - 'vlm-ui-tars-doubao'
+ * - 'vlm-ui-tars-doubao-1.5'
+ */
 export interface IModelConfigForPlanning {
   // model name
   [MIDSCENE_PLANNING_MODEL_NAME]: string;
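To make the documented requirement concrete, a sketch of a planning config object carrying the required VL mode key. The key constants are the ones the tests below import; the entry-point path, and the shape of the interface beyond the model-name field shown in this diff, are assumptions (hence Partial).

```ts
// Assumed entry point; adjust to wherever the package exports these symbols.
import type { IModelConfigForPlanning } from '@midscene/shared/env';
import {
  MIDSCENE_PLANNING_MODEL_NAME,
  MIDSCENE_PLANNING_VL_MODE,
} from '@midscene/shared/env';

// Partial<> because only the model-name field of the interface is visible in
// this diff; real configs also carry API key / base URL keys.
const planningConfig: Partial<IModelConfigForPlanning> = {
  [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
  [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl', // required: one of the values listed above
};
```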

packages/shared/tests/unit-test/env/modle-config-manager.test.ts

Lines changed: 157 additions & 3 deletions
@@ -13,6 +13,7 @@ import {
   MIDSCENE_PLANNING_MODEL_NAME,
   MIDSCENE_PLANNING_OPENAI_API_KEY,
   MIDSCENE_PLANNING_OPENAI_BASE_URL,
+  MIDSCENE_PLANNING_VL_MODE,
   MIDSCENE_VQA_MODEL_NAME,
   MIDSCENE_VQA_OPENAI_API_KEY,
   MIDSCENE_VQA_OPENAI_BASE_URL,
@@ -48,9 +49,10 @@ describe('ModelConfigManager', () => {
       };
     case 'planning':
       return {
-        [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+        [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
         [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
         [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
       };
     case 'grounding':
       return {
@@ -105,9 +107,10 @@ describe('ModelConfigManager', () => {
       };
     case 'planning':
       return {
-        [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+        [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
         [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
         [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl',
       };
     case 'grounding':
       return {
@@ -131,10 +134,11 @@ describe('ModelConfigManager', () => {
     expect(vqaConfig.from).toBe('modelConfig');
 
     const planningConfig = manager.getModelConfig('planning');
-    expect(planningConfig.modelName).toBe('gpt-4');
+    expect(planningConfig.modelName).toBe('qwen-vl-plus');
     expect(planningConfig.openaiApiKey).toBe('test-planning-key');
     expect(planningConfig.intent).toBe('planning');
     expect(planningConfig.from).toBe('modelConfig');
+    expect(planningConfig.vlMode).toBe('qwen-vl');
 
     const groundingConfig = manager.getModelConfig('grounding');
     expect(groundingConfig.modelName).toBe('gpt-4-vision');
@@ -263,4 +267,154 @@ describe('ModelConfigManager', () => {
       expect(config.openaiBaseURL).toBe('https://isolated.openai.com/v1');
     });
   });
+
+  describe('Planning VL mode validation', () => {
+    it('should throw error when planning has no vlMode in isolated mode', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          // Missing VL mode for planning
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        }
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
+      );
+    });
+
+    it('should succeed when planning has valid vlMode in isolated mode', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+            [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
+          };
+        }
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+      const config = manager.getModelConfig('planning');
+
+      expect(config.vlMode).toBe('qwen-vl');
+      expect(config.modelName).toBe('qwen-vl-plus');
+    });
+
+    it('should throw error when planning has no vlMode in normal mode', () => {
+      vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'gpt-4');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
+      // Intentionally not setting MIDSCENE_PLANNING_VL_MODE
+
+      const manager = new ModelConfigManager();
+      manager.registerGlobalConfigManager(new GlobalConfigManager());
+
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
+      );
+    });
+
+    it('should succeed when planning has valid vlMode in normal mode', () => {
+      vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'qwen-vl-plus');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
+      vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
+      vi.stubEnv(MIDSCENE_PLANNING_VL_MODE, 'qwen-vl');
+
+      const manager = new ModelConfigManager();
+      manager.registerGlobalConfigManager(new GlobalConfigManager());
+
+      const config = manager.getModelConfig('planning');
+
+      expect(config.vlMode).toBe('qwen-vl');
+      expect(config.modelName).toBe('qwen-vl-plus');
+      expect(config.intent).toBe('planning');
+    });
+
+    it('should not affect other intents when planning validation fails', () => {
+      const modelConfigFn: TModelConfigFn = ({ intent }) => {
+        if (intent === 'planning') {
+          // Missing VL mode for planning - should fail
+          return {
+            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        }
+        // Other intents should work fine
+        return {
+          [MIDSCENE_MODEL_NAME]: 'gpt-4',
+          [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+          [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+        };
+      };
+
+      const manager = new ModelConfigManager(modelConfigFn);
+
+      // Planning should fail
+      expect(() => manager.getModelConfig('planning')).toThrow(
+        'Planning requires a vision language model',
+      );
+
+      // Other intents should succeed
+      expect(() => manager.getModelConfig('default')).not.toThrow();
+      expect(() => manager.getModelConfig('VQA')).not.toThrow();
+      expect(() => manager.getModelConfig('grounding')).not.toThrow();
+    });
+
+    it('should accept all valid VL modes for planning', () => {
+      const vlModeTestCases: Array<{
+        raw: 'qwen-vl' | 'qwen3-vl' | 'gemini' | 'doubao-vision' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
+        expected: string;
+      }> = [
+        { raw: 'qwen-vl', expected: 'qwen-vl' },
+        { raw: 'qwen3-vl', expected: 'qwen3-vl' },
+        { raw: 'gemini', expected: 'gemini' },
+        { raw: 'doubao-vision', expected: 'doubao-vision' },
+        // UI-TARS variants all normalize to 'vlm-ui-tars'
+        { raw: 'vlm-ui-tars', expected: 'vlm-ui-tars' },
+        { raw: 'vlm-ui-tars-doubao', expected: 'vlm-ui-tars' },
+        { raw: 'vlm-ui-tars-doubao-1.5', expected: 'vlm-ui-tars' },
+      ];
+
+      for (const { raw, expected } of vlModeTestCases) {
+        const modelConfigFn: TModelConfigFn = ({ intent }) => {
+          if (intent === 'planning') {
+            return {
+              [MIDSCENE_PLANNING_MODEL_NAME]: 'test-model',
+              [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
+              [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+              [MIDSCENE_PLANNING_VL_MODE]: raw,
+            };
+          }
+          return {
+            [MIDSCENE_MODEL_NAME]: 'gpt-4',
+            [MIDSCENE_OPENAI_API_KEY]: 'test-key',
+            [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
+          };
+        };
+
+        const manager = new ModelConfigManager(modelConfigFn);
+        const config = manager.getModelConfig('planning');
+
+        expect(config.vlMode).toBe(expected);
+      }
+    });
+  });
 });
