@@ -13,6 +13,7 @@ import {
1313 MIDSCENE_PLANNING_MODEL_NAME ,
1414 MIDSCENE_PLANNING_OPENAI_API_KEY ,
1515 MIDSCENE_PLANNING_OPENAI_BASE_URL ,
16+ MIDSCENE_PLANNING_VL_MODE ,
1617 MIDSCENE_VQA_MODEL_NAME ,
1718 MIDSCENE_VQA_OPENAI_API_KEY ,
1819 MIDSCENE_VQA_OPENAI_BASE_URL ,
@@ -48,9 +49,10 @@ describe('ModelConfigManager', () => {
4849 } ;
4950 case 'planning' :
5051 return {
51- [ MIDSCENE_PLANNING_MODEL_NAME ] : 'gpt-4 ' ,
52+ [ MIDSCENE_PLANNING_MODEL_NAME ] : 'qwen-vl-plus ' ,
5253 [ MIDSCENE_PLANNING_OPENAI_API_KEY ] : 'test-planning-key' ,
5354 [ MIDSCENE_PLANNING_OPENAI_BASE_URL ] : 'https://api.openai.com/v1' ,
55+ [ MIDSCENE_PLANNING_VL_MODE ] : 'qwen-vl' as const ,
5456 } ;
5557 case 'grounding' :
5658 return {
@@ -105,9 +107,10 @@ describe('ModelConfigManager', () => {
105107 } ;
106108 case 'planning' :
107109 return {
108- [ MIDSCENE_PLANNING_MODEL_NAME ] : 'gpt-4 ' ,
110+ [ MIDSCENE_PLANNING_MODEL_NAME ] : 'qwen-vl-plus ' ,
109111 [ MIDSCENE_PLANNING_OPENAI_API_KEY ] : 'test-planning-key' ,
110112 [ MIDSCENE_PLANNING_OPENAI_BASE_URL ] : 'https://api.openai.com/v1' ,
113+ [ MIDSCENE_PLANNING_VL_MODE ] : 'qwen-vl' ,
111114 } ;
112115 case 'grounding' :
113116 return {
@@ -131,10 +134,11 @@ describe('ModelConfigManager', () => {
131134 expect ( vqaConfig . from ) . toBe ( 'modelConfig' ) ;
132135
133136 const planningConfig = manager . getModelConfig ( 'planning' ) ;
134- expect ( planningConfig . modelName ) . toBe ( 'gpt-4 ' ) ;
137+ expect ( planningConfig . modelName ) . toBe ( 'qwen-vl-plus ' ) ;
135138 expect ( planningConfig . openaiApiKey ) . toBe ( 'test-planning-key' ) ;
136139 expect ( planningConfig . intent ) . toBe ( 'planning' ) ;
137140 expect ( planningConfig . from ) . toBe ( 'modelConfig' ) ;
141+ expect ( planningConfig . vlMode ) . toBe ( 'qwen-vl' ) ;
138142
139143 const groundingConfig = manager . getModelConfig ( 'grounding' ) ;
140144 expect ( groundingConfig . modelName ) . toBe ( 'gpt-4-vision' ) ;
@@ -263,4 +267,154 @@ describe('ModelConfigManager', () => {
263267 expect ( config . openaiBaseURL ) . toBe ( 'https://isolated.openai.com/v1' ) ;
264268 } ) ;
265269 } ) ;
270+
describe('Planning VL mode validation', () => {
  // Complete error text the tests below expect from getModelConfig('planning')
  // whenever the planning configuration carries no VL mode.
  const vlModelRequiredMessage =
    'Planning requires a vision language model (VL model). DOM-based planning is not supported.';

  it('should throw error when planning has no vlMode in isolated mode', () => {
    // The planning branch deliberately leaves out MIDSCENE_PLANNING_VL_MODE.
    const isolatedConfigFn: TModelConfigFn = ({ intent }) => {
      switch (intent) {
        case 'planning':
          return {
            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
          };
        default:
          return {
            [MIDSCENE_MODEL_NAME]: 'gpt-4',
            [MIDSCENE_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
          };
      }
    };

    const configManager = new ModelConfigManager(isolatedConfigFn);
    const resolvePlanning = () => configManager.getModelConfig('planning');

    expect(resolvePlanning).toThrow(vlModelRequiredMessage);
  });

  it('should succeed when planning has valid vlMode in isolated mode', () => {
    const isolatedConfigFn: TModelConfigFn = ({ intent }) => {
      switch (intent) {
        case 'planning':
          return {
            [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
            [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
          };
        default:
          return {
            [MIDSCENE_MODEL_NAME]: 'gpt-4',
            [MIDSCENE_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
          };
      }
    };

    const planningConfig = new ModelConfigManager(
      isolatedConfigFn,
    ).getModelConfig('planning');

    expect(planningConfig.vlMode).toBe('qwen-vl');
    expect(planningConfig.modelName).toBe('qwen-vl-plus');
  });

  it('should throw error when planning has no vlMode in normal mode', () => {
    // Only name, key and base URL are stubbed; MIDSCENE_PLANNING_VL_MODE is
    // intentionally not set by this test.
    vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'gpt-4');
    vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
    vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');

    const configManager = new ModelConfigManager();
    configManager.registerGlobalConfigManager(new GlobalConfigManager());
    const resolvePlanning = () => configManager.getModelConfig('planning');

    expect(resolvePlanning).toThrow(vlModelRequiredMessage);
  });

  it('should succeed when planning has valid vlMode in normal mode', () => {
    vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'qwen-vl-plus');
    vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
    vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
    vi.stubEnv(MIDSCENE_PLANNING_VL_MODE, 'qwen-vl');

    const configManager = new ModelConfigManager();
    configManager.registerGlobalConfigManager(new GlobalConfigManager());

    const planningConfig = configManager.getModelConfig('planning');

    expect(planningConfig.vlMode).toBe('qwen-vl');
    expect(planningConfig.modelName).toBe('qwen-vl-plus');
    expect(planningConfig.intent).toBe('planning');
  });

  it('should not affect other intents when planning validation fails', () => {
    const isolatedConfigFn: TModelConfigFn = ({ intent }) => {
      switch (intent) {
        case 'planning':
          // No VL mode here, so resolving the planning intent should fail.
          return {
            [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
            [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
          };
        default:
          // Every non-planning intent receives the same plain configuration.
          return {
            [MIDSCENE_MODEL_NAME]: 'gpt-4',
            [MIDSCENE_OPENAI_API_KEY]: 'test-key',
            [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
          };
      }
    };

    const configManager = new ModelConfigManager(isolatedConfigFn);

    // The planning intent is expected to be rejected...
    expect(() => configManager.getModelConfig('planning')).toThrow(
      'Planning requires a vision language model',
    );

    // ...while the remaining intents are expected to resolve without throwing.
    for (const otherIntent of ['default', 'VQA', 'grounding'] as const) {
      expect(() => configManager.getModelConfig(otherIntent)).not.toThrow();
    }
  });

  it('should accept all valid VL modes for planning', () => {
    type RawVlMode =
      | 'qwen-vl'
      | 'qwen3-vl'
      | 'gemini'
      | 'doubao-vision'
      | 'vlm-ui-tars'
      | 'vlm-ui-tars-doubao'
      | 'vlm-ui-tars-doubao-1.5';

    // Each entry is [raw value supplied in the config, vlMode expected back].
    const vlModeCases: ReadonlyArray<readonly [RawVlMode, string]> = [
      ['qwen-vl', 'qwen-vl'],
      ['qwen3-vl', 'qwen3-vl'],
      ['gemini', 'gemini'],
      ['doubao-vision', 'doubao-vision'],
      // All UI-TARS variants are expected to come back as 'vlm-ui-tars'.
      ['vlm-ui-tars', 'vlm-ui-tars'],
      ['vlm-ui-tars-doubao', 'vlm-ui-tars'],
      ['vlm-ui-tars-doubao-1.5', 'vlm-ui-tars'],
    ];

    vlModeCases.forEach(([rawMode, normalizedMode]) => {
      const isolatedConfigFn: TModelConfigFn = ({ intent }) => {
        switch (intent) {
          case 'planning':
            return {
              [MIDSCENE_PLANNING_MODEL_NAME]: 'test-model',
              [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
              [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
              [MIDSCENE_PLANNING_VL_MODE]: rawMode,
            };
          default:
            return {
              [MIDSCENE_MODEL_NAME]: 'gpt-4',
              [MIDSCENE_OPENAI_API_KEY]: 'test-key',
              [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
            };
        }
      };

      const planningConfig = new ModelConfigManager(
        isolatedConfigFn,
      ).getModelConfig('planning');

      expect(planningConfig.vlMode).toBe(normalizedMode);
    });
  });
});
266420} ) ;
0 commit comments