Skip to content

Commit e3bef82

Browse files
tejasupintel-lab-lkp
authored andcommitted
drm/xe/guc: Use exec queue hints for GT frequency
Allow user to provide a low latency hint per exec queue. When set, KMD sends a hint to GuC which results in special handling for this exec queue. SLPC will ramp the GT frequency aggressively every time it switches to this exec queue. We need to enable the use of SLPC Compute strategy during init, but it will apply only to exec queues that set this bit during exec queue creation. Improvement with this approach is as below: Before, :~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency Platform: Intel(R) OpenCL Graphics Device: Intel(R) Graphics [0xe20b] Driver version : 24.52.0 (Linux x64) Compute units : 160 Clock frequency : 2850 MHz Kernel launch latency : 283.16 us After, :~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency Platform: Intel(R) OpenCL Graphics Device: Intel(R) Graphics [0xe20b] Driver version : 24.52.0 (Linux x64) Compute units : 160 Clock frequency : 2850 MHz Kernel launch latency : 63.38 us UMD will indicate low latency hint with flag as mentioned below, * struct drm_xe_exec_queue_create exec_queue_create = { * .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT or 0 * .extensions = 0, * .vm_id = vm, * .num_bb_per_exec = 1, * .num_eng_per_bb = 1, * .instances = to_user_pointer(&instance), * }; * ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create); Link to UMD PR : intel/compute-runtime#794 Note: There is an outstanding issue on the GuC side where it is not able to switch to max frequency as per the strategy indicated by KMD, so for the experiment/test results a hardcoding approach was taken and passed to GuC as policy. Debugging effort on the GuC side is going on in parallel. V3: - Convert user flag to kernel internal flag and use (Oak) - Support query config for user to check kernel support (Jose) - Don't need to take runtime pm (Vinay) V2: - DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT 1 is already planned for other hint (Szymon) - Add motivation to description (Lucas) Signed-off-by: Tejas Upadhyay <[email protected]>
1 parent 6a04bb5 commit e3bef82

File tree

7 files changed

+43
-6
lines changed

7 files changed

+43
-6
lines changed

drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h

+3
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ struct slpc_task_state_data {
174174
};
175175
} __packed;
176176

177+
#define SLPC_EXEC_QUEUE_FREQ_REQ_IS_COMPUTE REG_BIT(28)
178+
#define SLPC_OPTIMIZED_STRATEGY_COMPUTE REG_BIT(0)
179+
177180
struct slpc_shared_data_header {
178181
/* Total size in bytes of this shared buffer. */
179182
u32 size;

drivers/gpu/drm/xe/xe_exec_queue.c

+8-3
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
544544
struct drm_xe_engine_class_instance __user *user_eci =
545545
u64_to_user_ptr(args->instances);
546546
struct xe_hw_engine *hwe;
547+
unsigned long flags;
547548
struct xe_vm *vm;
548549
struct xe_gt *gt;
549550
struct xe_tile *tile;
@@ -553,7 +554,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
553554
u32 len;
554555
int err;
555556

556-
if (XE_IOCTL_DBG(xe, args->flags) ||
557+
if (XE_IOCTL_DBG(xe, args->flags &&
558+
!(args->flags & DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT)) ||
557559
XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
558560
return -EINVAL;
559561

@@ -570,6 +572,9 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
570572
if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
571573
return -EINVAL;
572574

575+
if (args->flags & DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT)
576+
flags |= EXEC_QUEUE_FLAG_LOW_LATENCY;
577+
573578
if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
574579
if (XE_IOCTL_DBG(xe, args->width != 1) ||
575580
XE_IOCTL_DBG(xe, args->num_placements != 1) ||
@@ -578,8 +583,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
578583

579584
for_each_tile(tile, xe, id) {
580585
struct xe_exec_queue *new;
581-
u32 flags = EXEC_QUEUE_FLAG_VM;
582586

587+
flags |= EXEC_QUEUE_FLAG_VM;
583588
if (id)
584589
flags |= EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD;
585590

@@ -626,7 +631,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
626631
}
627632

628633
q = xe_exec_queue_create(xe, vm, logical_mask,
629-
args->width, hwe, 0,
634+
args->width, hwe, flags,
630635
args->extensions);
631636
up_read(&vm->lock);
632637
xe_vm_put(vm);

drivers/gpu/drm/xe/xe_exec_queue_types.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ struct xe_exec_queue {
8585
#define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD BIT(3)
8686
/* kernel exec_queue only, set priority to highest level */
8787
#define EXEC_QUEUE_FLAG_HIGH_PRIORITY BIT(4)
88-
88+
/* flag to indicate low latency hint to guc */
89+
#define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5)
8990
/**
9091
* @flags: flags for this exec queue, should statically setup aside from ban
9192
* bit

drivers/gpu/drm/xe/xe_guc_pc.c

+16
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,17 @@ static int pc_init_freqs(struct xe_guc_pc *pc)
992992
return ret;
993993
}
994994

995+
static int xe_guc_pc_set_strategy(struct xe_guc_pc *pc, u32 val)
996+
{
997+
int ret = 0;
998+
999+
ret = pc_action_set_param(pc,
1000+
SLPC_PARAM_STRATEGIES,
1001+
val);
1002+
1003+
return ret;
1004+
}
1005+
9951006
/**
9961007
* xe_guc_pc_start - Start GuC's Power Conservation component
9971008
* @pc: Xe_GuC_PC instance
@@ -1051,6 +1062,11 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
10511062
}
10521063

10531064
ret = pc_action_setup_gucrc(pc, GUCRC_FIRMWARE_CONTROL);
1065+
if (ret)
1066+
goto out;
1067+
1068+
/* Enable SLPC Optimized Strategy for compute */
1069+
ret = xe_guc_pc_set_strategy(pc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
10541070

10551071
out:
10561072
xe_force_wake_put(gt_to_fw(gt), fw_ref);

drivers/gpu/drm/xe/xe_guc_submit.c

+7
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <drm/drm_managed.h>
1616

1717
#include "abi/guc_actions_abi.h"
18+
#include "abi/guc_actions_slpc_abi.h"
1819
#include "abi/guc_klvs_abi.h"
1920
#include "regs/xe_lrc_layout.h"
2021
#include "xe_assert.h"
@@ -400,6 +401,7 @@ static void __guc_exec_queue_policy_add_##func(struct exec_queue_policy *policy,
400401
MAKE_EXEC_QUEUE_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM)
401402
MAKE_EXEC_QUEUE_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
402403
MAKE_EXEC_QUEUE_POLICY_ADD(priority, SCHEDULING_PRIORITY)
404+
MAKE_EXEC_QUEUE_POLICY_ADD(slpc_ctx_freq_req, SLPM_GT_FREQUENCY)
403405
#undef MAKE_EXEC_QUEUE_POLICY_ADD
404406

405407
static const int xe_exec_queue_prio_to_guc[] = {
@@ -414,14 +416,19 @@ static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q)
414416
struct exec_queue_policy policy;
415417
enum xe_exec_queue_priority prio = q->sched_props.priority;
416418
u32 timeslice_us = q->sched_props.timeslice_us;
419+
u32 slpc_ctx_freq_req = 0;
417420
u32 preempt_timeout_us = q->sched_props.preempt_timeout_us;
418421

419422
xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
420423

424+
if (q->flags & EXEC_QUEUE_FLAG_LOW_LATENCY)
425+
slpc_ctx_freq_req |= SLPC_EXEC_QUEUE_FREQ_REQ_IS_COMPUTE;
426+
421427
__guc_exec_queue_policy_start_klv(&policy, q->guc->id);
422428
__guc_exec_queue_policy_add_priority(&policy, xe_exec_queue_prio_to_guc[prio]);
423429
__guc_exec_queue_policy_add_execution_quantum(&policy, timeslice_us);
424430
__guc_exec_queue_policy_add_preemption_timeout(&policy, preempt_timeout_us);
431+
__guc_exec_queue_policy_add_slpc_ctx_freq_req(&policy, slpc_ctx_freq_req);
425432

426433
xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g,
427434
__guc_exec_queue_policy_action_size(&policy), 0, 0);

drivers/gpu/drm/xe/xe_query.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
334334
xe->info.devid | (xe->info.revid << 16);
335335
if (xe_device_get_root_tile(xe)->mem.vram.usable_size)
336336
config->info[DRM_XE_QUERY_CONFIG_FLAGS] =
337-
DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM;
337+
DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM |
338+
DRM_XE_QUERY_CONFIG_FLAG_EQUEUE_HAS_LOW_LATENCY;
338339
config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
339340
xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
340341
config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;

include/uapi/drm/xe_drm.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,8 @@ struct drm_xe_query_mem_regions {
393393
*
394394
* - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
395395
* has usable VRAM
396+
* - %DRM_XE_QUERY_CONFIG_FLAG_EQUEUE_HAS_LOW_LATENCY - Flag is set if the
397+
* kernel supports the exec queue low latency hint
396398
* - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
397399
* required by this device, typically SZ_4K or SZ_64K
398400
* - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
@@ -409,6 +411,7 @@ struct drm_xe_query_config {
409411
#define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID 0
410412
#define DRM_XE_QUERY_CONFIG_FLAGS 1
411413
#define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM (1 << 0)
414+
#define DRM_XE_QUERY_CONFIG_FLAG_EQUEUE_HAS_LOW_LATENCY (1 << 1)
412415
#define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2
413416
#define DRM_XE_QUERY_CONFIG_VA_BITS 3
414417
#define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4
@@ -1097,6 +1100,7 @@ struct drm_xe_vm_bind {
10971100
* .engine_class = DRM_XE_ENGINE_CLASS_RENDER,
10981101
* };
10991102
* struct drm_xe_exec_queue_create exec_queue_create = {
1103+
* .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT or 0
11001104
* .extensions = 0,
11011105
* .vm_id = vm,
11021106
* .num_bb_per_exec = 1,
@@ -1110,7 +1114,6 @@ struct drm_xe_exec_queue_create {
11101114
#define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY 0
11111115
#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY 0
11121116
#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE 1
1113-
11141117
/** @extensions: Pointer to the first extension struct, if any */
11151118
__u64 extensions;
11161119

@@ -1123,6 +1126,7 @@ struct drm_xe_exec_queue_create {
11231126
/** @vm_id: VM to use for this exec queue */
11241127
__u32 vm_id;
11251128

1129+
#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT (0x1 << 1)
11261130
/** @flags: flags for this exec queue, e.g. %DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT */
11271131
__u32 flags;
11281132

0 commit comments

Comments
 (0)