-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
668 lines (580 loc) · 36.4 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners — the page's "business card". -->
  <!-- Content is filled in from the page's own abstract and title. -->
  <meta name="description" content="UrBench: a comprehensive benchmark for evaluating Large Multimodal Models (LMMs) in complex multi-view urban scenarios, with 11.6K questions covering 4 task dimensions and 14 task types.">
  <meta property="og:title" content="UrBench: A Comprehensive Benchmark for Evaluating Large Multimodal Models in Multi-View Urban Scenarios">
  <meta property="og:description" content="UrBench contains 11.6K region-level and role-level questions across 14 task types for evaluating LMMs in multi-view urban environments.">
  <meta property="og:url" content="https://opendatalab.github.io/UrBench/">
  <!-- Path to banner image; optimal dimensions are 1200x630. -->
  <!-- NOTE(review): banner/icon assets are inherited from the SkyDiffusion template — confirm they exist under static/images/. -->
  <meta property="og:image" content="static/images/SkyDiffusion.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">
  <meta name="twitter:title" content="UrBench: Evaluating Large Multimodal Models in Multi-View Urban Scenarios">
  <meta name="twitter:description" content="A multi-view urban benchmark with 11.6K questions across 14 task types; evaluations of 21 LMMs reveal an average 17.4% gap to human experts.">
  <!-- Path to banner image; optimal dimensions are 1200x600. -->
  <meta name="twitter:image" content="static/images/SkyDiffusion.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by. -->
  <meta name="keywords" content="UrBench, benchmark, large multimodal models, LMM evaluation, urban scenarios, multi-view, cross-view">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>UrBench</title>
  <!-- Google Tag Manager -->
  <script>(function (w, d, s, l, i) {
    w[l] = w[l] || []; w[l].push({
      'gtm.start':
        new Date().getTime(), event: 'gtm.js'
    }); var f = d.getElementsByTagName(s)[0],
      j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : ''; j.async = true; j.src =
        'https://www.googletagmanager.com/gtm.js?id=' + i + dl; f.parentNode.insertBefore(j, f);
  })(window, document, 'script', 'dataLayer', 'GTM-MFCT45H');</script>
  <!-- End Google Tag Manager -->
  <link rel="icon" type="image/x-icon" href="static/images/SkyDiffusion.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-2 publication-title">
            <b>UrBench</b>: A Comprehensive Benchmark for Evaluating Large Multimodal Models in Multi-View Urban Scenarios</h1>
          <!-- Paper authors. Stray duplicate </span> closers from the template have been removed. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://baichuanzhou.github.io/" target="_blank">Baichuan Zhou</a><sup>1<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>*</mo></math></sup>,</span>
            <span class="author-block">
              Haote Yang<sup>1<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>*</mo></math></sup>,</span>
            <span class="author-block">
              Dairong Chen<sup>4,2<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>*</mo></math></sup>,</span>
            <span class="author-block">
              <a href="https://yejy53.github.io/" target="_blank">Junyan Ye</a><sup>2,1<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>*</mo></math></sup>,</span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://beccabai.github.io/" target="_blank">Tianyi Bai</a><sup>1</sup>,</span>
            <span class="author-block">
              Jinhua Yu<sup>2</sup>,</span>
            <span class="author-block">
              <a href="https://www.zhangsongyang.com/" target="_blank">Songyang Zhang</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="http://dahua.site/" target="_blank">Dahua Lin</a><sup>1,3</sup>,</span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://conghui.github.io/" target="_blank">Conghui He</a><sup>1,3<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>&#8224;</mo></math></sup>,</span>
            <span class="author-block">
              <a href="https://liweijia.github.io/" target="_blank">Weijia Li</a><sup>2<math xmlns="http://www.w3.org/1998/Math/MathML"><mo>&#8224;</mo></math></sup></span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>Shanghai AI Laboratory </span>
            <span class="author-block"><sup>2</sup>Sun Yat-Sen University </span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>3</sup>SenseTime Research</span>
            <span class="author-block"><sup>4</sup>Wuhan University</span>
          </div>
          <!-- Trailing unclosed <sup> tags removed from the two lines below. -->
          <h4><sup><math xmlns="http://www.w3.org/1998/Math/MathML"><mo>*</mo></math></sup>Equal Contribution</h4>
          <h4><sup><math xmlns="http://www.w3.org/1998/Math/MathML"><mo>&#8224;</mo></math></sup>Correspondence</h4>
          <!-- ArXiv abstract link -->
          <!-- NOTE(review): this button currently links to the project page itself — update href to the paper's arXiv abs URL when available. -->
          <span class="link-block">
            <a href="https://opendatalab.github.io/UrBench/" target="_blank"
               class="external-link button is-normal is-rounded is-dark">
              <span class="icon">
                <i class="ai ai-arxiv"></i>
              </span>
              <span>arXiv</span>
            </a>
          </span>
          <!-- Huggingface dataset link -->
          <!-- NOTE(review): this button also links to the project page — update href to the Hugging Face dataset URL when available. -->
          <span class="link-block">
            <a href="https://opendatalab.github.io/UrBench/" target="_blank"
               class="external-link button is-normal is-rounded is-dark">
              <span class="icon">
                <i class="fas fa-database"></i>
              </span>
              <span>Huggingface Dataset</span>
            </a>
          </span>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Teaser image-->
<!--<section class="hero is-small">-->
<!-- <div class="container">-->
<!-- <div class="hero-body">-->
<!-- <h2 class="title is-3 has-text-centered">UrBench Overview</h2>-->
<!-- </div>-->
<!-- <div class="hero is-small">-->
<!-- <div id="results-carousel" class="carousel results-carousel">-->
<!-- <div class="item item-image1" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- <span style="font-weight:bold;">UrBench</span> is a comprehensive benchmark designed for-->
<!-- evaluating LMMs in complex multi-view urban scenarios. Compared to existing works, UrBench offers:-->
<!-- <ul>-->
<!-- <li>(1) <i><b>Region-level and role-level questions.</b></i> <b>UrBench</b> contains diverse questions at both region and role level,-->
<!-- while previous benchmarks generally focus on region-level questions, </li>-->
<!-- <li>(2) <i><b>Multi-view data.</b></i> <b>UrBench</b> incorporates both street and satellite data, as well as their paired-up cross-view data.-->
<!-- Prior benchmarks generally focus on evaluations from a single view perspective.</li>-->
<!-- <li>(3) <i><b>Diverse task types.</b></i> <b>UrBench</b> contains 14 diverse task types categorized into four task dimensions,-->
<!-- while previous benchmarks only offer limited task types such as counting, object recognition, etc.</li>-->
<!-- </ul>-->
<!-- </h2>-->
<!-- <img src="static/images/comparison_plot.png" class="center-image" style="max-width: 100%; height: auto;"/>-->
<!-- <h2 class="subtitle has-text-centered">Example tasks in <span style="font-weight:bold;">BLINK</span>.</h2> -->
<!-- </div>-->
<!-- <div class="item item-image2" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <img src="static/images/tasks.png" class="center-image" style="max-width: 100%; height: auto;"/>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!--</section>-->
<!--<section class="hero is-large">-->
<!-- <div >-->
<!-- <div class="container">-->
<!-- <div id="teaser-carousel" class="carousel results-carousel">-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/tasks.png" class="center-image" style="max-width: 100%; height: auto;"/>-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- We propose UrBench, a multi-view benchmark designed-->
<!-- to evaluate LMMs’ performances in urban environments.-->
<!-- Our benchmark includes 14 urban tasks that we categorize into various dimensions. These tasks encompass-->
<!-- both region-level evaluations that assess LMMs’ capabilities in urban planning, as well as role-level evaluations-->
<!-- that examine LMMs’ responses to daily issues.-->
<!-- </h2>-->
<!-- </div>-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/data_process_plot.png" class="center-image" style="max-width: 100%; height: auto;"/>-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- We introduce a novel benchmark curation pipeline that-->
<!-- involves a cross-view detection-matching algorithm for-->
<!-- object-level annotation generation and a question generation approach that integrates LMM-based, rule-based,-->
<!-- and human-based methods. This pipeline ensures the creation of a large-scale and high-quality corpus of questions, significantly enhancing the diversity and depth of-->
<!-- evaluation across multiple urban tasks.-->
<!-- </h2>-->
<!-- </div>-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/radar_chart.png" class="center-image" style="max-width: 80%; height: 60%; top:auto; bottom: auto; align-items: center;"/>-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- We evaluate 21 popular LMMs on <b>UrBench</b>. Our evaluation results show that current models lag behind human-->
<!-- experts in most tasks and reveal LMMs’ inconsistent behaviors with different urban views, which demonstrates-->
<!-- the limitations of current LMMs in urban environments. Here, we show several leading LMMs' performances on <b>UrBench</b>.-->
<!-- </h2>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!--</section>-->
<!-- Paper abstract -->
<!-- Paper abstract -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p style="font-size: 18px">
            Recent evaluations of Large Multimodal Models (LMMs) have explored their capabilities in various domains, with only a few benchmarks specifically focusing on urban environments.
            Moreover, existing urban benchmarks have been limited to evaluating LMMs with basic region-level urban tasks under singular views, leading to incomplete evaluations of LMMs' abilities in urban environments.
            To address these issues, we present <b>UrBench</b>, a comprehensive benchmark designed for evaluating LMMs in complex multi-view urban scenarios.
            <b>UrBench</b> contains 11.6K meticulously curated questions at both region-level and role-level that cover 4 task dimensions: Geo-Localization, Scene Reasoning, Scene Understanding, and Object Understanding, totaling 14 task types.
            In constructing <b>UrBench</b>, we utilize data from existing datasets and additionally collect data from 11 cities, creating new annotations using a cross-view detection-matching method.
            With these images and annotations, we then integrate LMM-based, rule-based, and human-based methods to construct large-scale high-quality questions.
            Our evaluations on 21 LMMs show that current LMMs struggle in urban environments in several aspects. Even the best performing GPT-4o lags behind humans in most tasks, ranging from simple tasks such as counting to complex tasks such as orientation, localization and object attribute recognition, with an average performance gap of 17.4%.
            Our benchmark also reveals that LMMs exhibit inconsistent behaviors with different urban views, especially with respect to understanding cross-view relations.
            <b>UrBench</b> datasets and benchmark results will be publicly available.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->
<!-- UrBench overview -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- "is-five-fifths" is not a Bulma column size; "is-full" is the full-width modifier. -->
      <div class="column is-full">
        <h2 class="title is-3">UrBench Overview</h2>
        <div class="content has-text-justified">
          <div class="column is-centered has-text-centered">
            <img src="./static/images/tasks.png" alt="Overview of the 14 UrBench task types, grouped into four evaluation dimensions" style="max-width: 100%; height: auto">
          </div>
          <div class="has-text-justified" style="font-size: 20px">
            We propose <b>UrBench</b>, a multi-view benchmark designed
            to evaluate LMMs&rsquo; performances in urban environments.
            Our benchmark includes 14 urban tasks that we categorize into various dimensions. These tasks encompass
            both region-level evaluations that assess LMMs&rsquo; capabilities in urban planning, as well as role-level evaluations
            that examine LMMs&rsquo; responses to daily issues.
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Dataset -->
<!-- Comparisons with existing benchmarks -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- "is-five-fifths" is not a Bulma column size; "is-full" is the full-width modifier. -->
      <div class="column is-full">
        <h2 class="title is-3">Comparisons with Existing Benchmarks</h2>
        <div class="content has-text-justified" style="font-size: 20px">
          Compared to previous benchmarks, <b>UrBench</b> offers:
          <ul>
            <li> <i><b>Region-level and role-level questions.</b></i> <b>UrBench</b> contains diverse questions at both region and role level,
              while previous benchmarks generally focus on region-level questions.</li>
            <li> <i><b>Multi-view data.</b></i> <b>UrBench</b> incorporates both street and satellite data, as well as their paired-up cross-view data.
              Prior benchmarks generally focus on evaluations from a single view perspective.</li>
            <li> <i><b>Diverse task types.</b></i> <b>UrBench</b> contains 14 diverse task types categorized into four task dimensions,
              while previous benchmarks only offer limited task types such as counting, object recognition, etc.</li>
          </ul>
          <div class="column is-centered has-text-centered">
            <img src="./static/images/comparison_plot.png" alt="Comparison of UrBench with existing benchmarks" style="max-width: 85%; height: auto">
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Data curation pipeline -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div>
        <h2 class="title is-3">Data Curation Pipeline of UrBench</h2>
        <div class="content has-text-justified">
          <div class="column is-centered has-text-justified">
            <img src="./static/images/data_process_plot.png" alt="The UrBench data curation pipeline: cross-view detection-matching followed by question generation" style="max-width: 100%; height: auto">
            <p style="font-size: 20px">
              We introduce a novel benchmark curation pipeline that
              involves a cross-view detection-matching algorithm for
              object-level annotation generation and a question generation approach that integrates LMM-based, rule-based,
              and human-based methods. This pipeline ensures the creation of a large-scale and high-quality corpus of questions, significantly enhancing the diversity and depth of
              evaluation across multiple urban tasks.
            </p>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End Dataset -->
<!--<section class="section hero">-->
<!-- <div class="hero-body">-->
<!-- <div class="container is-max-desktop">-->
<!-- <h2 class="title is-3 has-text-centered">Detailed Statistics of UrBench</h2>-->
<!-- <div id="teaser-carousel" class="carousel results-carousel">-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/pie_plot.png" class="center-image" style="max-width: 65%; height: auto;"/>-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- The 14 types of tasks under 4 evaluation dimensions.-->
<!-- </h2>-->
<!-- </div>-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/task_view_histogram.png" class="center-image" style="max-width: 55%; height: auto;"/>-->
<!-- <h2 class="subtitle is-5 has-text-justified">-->
<!-- The view types of each task.-->
<!-- </h2>-->
<!-- </div>-->
<!-- <div class="item" style=" height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">-->
<!-- <!– Your image here –>-->
<!-- <img src="static/images/data_statistics.png" class="center-image" style="max-width: 55%; height: auto; top:auto; bottom: auto; align-items: center;"/>-->
<!-- <p style="font-size: 20px">-->
<!-- The statistics of UrBench. cross, sat, and str are the abbreviations for cross-view, satellite-view, and street-view. mono, pano, and multi are the-->
<!-- abbreviations of monocular, panoramic, and multiple. MC and open means multiple-choice and open-ended, respectively.-->
<!-- </p>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!--</section>-->
<!-- Statistics and characteristics -->
<section class="section hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Statistics &amp; Characteristics of UrBench</h2>
      <p style="font-size: 20px">
        <b>UrBench</b> introduces 14 diverse tasks in the urban environment, covering multiple different views. While humans handle most tasks easily, we find LMMs still struggle.
      </p>
      <div id="teaser-flex" style="display: flex; justify-content: space-between; align-items: flex-start; flex-wrap: nowrap; top: auto">
        <div class="item" style="flex: 1; display: flex; flex-direction: column; justify-content: center; align-items: center; margin: 10px;">
          <img src="static/images/pie_plot.png" alt="Distribution of UrBench questions across the 14 task types" class="center-image" style="width: 80%; height: auto;">
        </div>
        <div class="item" style="flex: 1.2; display: flex; flex-direction: column; justify-content: center; align-items: center; margin: 10px;">
          <img src="static/images/radar_chart.png" alt="Radar chart of leading LMMs' performances on UrBench" class="center-image" style="width: 120%; height: auto;">
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Qualitative results carousel: one slide per task dimension. -->
<!-- Empty <h2 class="subtitle"> elements (headings with no content) were removed for accessibility. -->
<section class="section hero is-light">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Qualitative Results</h2>
      <div id="teaser-carousel" class="carousel results-carousel">
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <h2 class="title is-4 has-text-justified" style="font-family: 'Times New Roman'">Geo-Localization</h2>
          <img src="static/images/dimension1.png" alt="Qualitative examples of Geo-Localization tasks" class="center-image" style="max-width: 100%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <h2 class="title is-4 has-text-justified" style="font-family: 'Times New Roman'">Scene Reasoning</h2>
          <img src="static/images/dimension2.png" alt="Qualitative examples of Scene Reasoning tasks" class="center-image" style="max-width: 100%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <h2 class="title is-4 has-text-justified" style="font-family: 'Times New Roman'">Scene Understanding</h2>
          <img src="static/images/dimension3.png" alt="Qualitative examples of Scene Understanding tasks" class="center-image" style="max-width: 100%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <h2 class="title is-4 has-text-justified" style="font-family: 'Times New Roman'">Object Understanding</h2>
          <img src="static/images/dimension4.png" alt="Qualitative examples of Object Understanding tasks" class="center-image" style="max-width: 100%; height: auto;">
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Evaluation -->
<!-- Evaluation results -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- "is-five-fifths" is not a Bulma column size; "is-full" is the full-width modifier. -->
      <div class="column is-full">
        <h2 class="title is-3">Evaluation Results</h2>
        <div class="content has-text-justified">
          <div class="column is-centered has-text-justified">
            <p style="font-size: 20px;">
              UrBench poses significant challenges to current SoTA LMMs. We find that the best performing closed-source model GPT-4o and open-source model VILA-1.5-40B
              only achieve a <b>61.2%</b> and a <b>53.1%</b> accuracy, respectively. Interestingly, our findings indicate that the primary
              limitation of these models lies in their ability to comprehend <b>UrBench</b> questions, not in their capacity to process multiple images, as the performance between multi-image and
              their single-image counterparts shows little difference, such as LLaVA-NeXT-8B and LLaVA-NeXT-Interleave in the table.
              Overall, the challenging nature of our benchmark indicates that current LMMs&rsquo; strong performance on the general
              benchmarks are not generalized to the multi-view urban scenarios.
            </p>
            <!-- figcaption must live inside a figure element to be valid HTML. -->
            <figure>
              <img src="./static/images/evaluation_results.png" alt="Table comparing LMM and human expert accuracies on the UrBench test set" class="center-image">
              <figcaption style="font-size: 20px; text-align: center;">Performances of LMMs and human experts on the <b>UrBench</b> test set.</figcaption>
            </figure>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End evaluation -->
<!-- Image carousel -->
<!-- Case study carousel: one slide per randomly selected task sample. -->
<!-- Alt text uses the sample's task code; the codes themselves are not expanded here
     because their meanings are defined in the paper, not on this page. -->
<section class="section hero is-light">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <h2 class="title is-3 has-text-centered">Case Study</h2>
      <div class="has-text-justified" style="font-size: 20px">
        We present randomly selected samples from the 14 tasks of <b>UrBench</b>, with GPT-4o, VILA-1.5-40B and Claude-3.5-Sonnet's responses attached.
      </div>
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/CO1.png" alt="UrBench case study sample CO1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/CO2.png" alt="UrBench case study sample CO2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/CR1.png" alt="UrBench case study sample CR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/CR2.png" alt="UrBench case study sample CR2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/CR3.png" alt="UrBench case study sample CR3 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/IR1.png" alt="UrBench case study sample IR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OAR1.png" alt="UrBench case study sample OAR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OAR2.png" alt="UrBench case study sample OAR2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OAR3.png" alt="UrBench case study sample OAR3 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OAR4.png" alt="UrBench case study sample OAR4 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OG1.png" alt="UrBench case study sample OG1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OG2.png" alt="UrBench case study sample OG2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OM1.png" alt="UrBench case study sample OM1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/OR1.png" alt="UrBench case study sample OR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/RBR1.png" alt="UrBench case study sample RBR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/RU1.png" alt="UrBench case study sample RU1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/RU2.png" alt="UrBench case study sample RU2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SC1.png" alt="UrBench case study sample SC1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SC2.png" alt="UrBench case study sample SC2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SC3.png" alt="UrBench case study sample SC3 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SC4.png" alt="UrBench case study sample SC4 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SR1.png" alt="UrBench case study sample SR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SR2.png" alt="UrBench case study sample SR2 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/SR3.png" alt="UrBench case study sample SR3 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/TSR1.png" alt="UrBench case study sample TSR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
        <div class="item" style="height: 100%; display: flex; flex-direction: column; justify-content: center; align-items: center;">
          <img src="static/images/VPR1.png" alt="UrBench case study sample VPR1 with model responses" class="center-image" style="max-width: 80%; height: auto;">
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End image carousel -->
<!--<!–BibTex citation –>-->
<!-- <section class="section" id="BibTeX">-->
<!-- <div class="container is-max-desktop content">-->
<!-- <h2 class="title">BibTeX</h2>-->
<!-- <pre><code>@misc{ye2024skydiffusionstreettosatelliteimagesynthesis,-->
<!-- title={SkyDiffusion: Street-to-Satellite Image Synthesis with Diffusion Models and BEV Paradigm}, -->
<!-- author={Junyan Ye and Jun He and Weijia Li and Zhutao Lv and Jinhua Yu and Haote Yang and Conghui He},-->
<!-- year={2024},-->
<!-- eprint={2408.01812},-->
<!-- archivePrefix={arXiv},-->
<!-- primaryClass={cs.CV},-->
<!-- url={https://arxiv.org/abs/2408.01812}, -->
<!-- }</code></pre>-->
<!-- </div>-->
<!--</section>-->
<!--<!–End BibTex citation –>-->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a>, which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
            You are free to borrow the source code of this website; we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>