Skip to content

Commit ce1e4c7

Browse files
committed
Have dispatch skip hedging delays when dispatching to unsupported relations
The cluster dispatcher now tracks which resource relations and subject relations are unsupported by the secondary dispatcher(s) and, if dispatching to one of those relations, skips the hedging delay for the primary. The secondary dispatching still does occur, and if the resource or subject relation is now supported, it will be marked as such again. This ensures that in the majority of unsupported cases, the dispatcher is not injecting an unnecessary hedging delay
1 parent ff12fd3 commit ce1e4c7

File tree

3 files changed

+402
-15
lines changed

3 files changed

+402
-15
lines changed

internal/dispatch/remote/cluster.go

+167-15
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,20 @@ import (
1414
"github.com/prometheus/client_golang/prometheus"
1515
"github.com/rs/zerolog"
1616
"go.uber.org/atomic"
17+
"google.golang.org/genproto/googleapis/rpc/errdetails"
1718
"google.golang.org/grpc"
19+
"google.golang.org/grpc/codes"
1820
"google.golang.org/grpc/connectivity"
21+
"google.golang.org/grpc/status"
1922
"google.golang.org/protobuf/proto"
2023

2124
"github.com/authzed/spicedb/internal/dispatch"
2225
"github.com/authzed/spicedb/internal/dispatch/keys"
2326
log "github.com/authzed/spicedb/internal/logging"
27+
corev1 "github.com/authzed/spicedb/pkg/proto/core/v1"
2428
v1 "github.com/authzed/spicedb/pkg/proto/dispatch/v1"
2529
"github.com/authzed/spicedb/pkg/spiceerrors"
30+
"github.com/authzed/spicedb/pkg/tuple"
2631
)
2732

2833
var dispatchCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
@@ -130,6 +135,7 @@ func NewClusterDispatcher(client ClusterClient, conn *grpc.ClientConn, config Cl
130135
secondaryDispatch: secondaryDispatch,
131136
secondaryDispatchExprs: secondaryDispatchExprs,
132137
secondaryInitialResponseDigests: secondaryInitialResponseDigests,
138+
supportedResourceSubjectTracker: newSupportedResourceSubjectTracker(),
133139
}, nil
134140
}
135141

@@ -142,6 +148,7 @@ type clusterDispatcher struct {
142148
secondaryDispatch map[string]SecondaryDispatch
143149
secondaryDispatchExprs map[string]*DispatchExpr
144150
secondaryInitialResponseDigests map[string]*digestAndLock
151+
supportedResourceSubjectTracker *supportedResourceSubjectTracker
145152
}
146153

147154
// digestAndLock is a struct that holds a TDigest and a lock to protect it.
@@ -188,15 +195,22 @@ func (cr *clusterDispatcher) DispatchCheck(ctx context.Context, req *v1.Dispatch
188195

189196
ctx = context.WithValue(ctx, consistent.CtxKey, requestKey)
190197

191-
resp, err := dispatchSyncRequest(ctx, cr, "check", req, func(ctx context.Context, client ClusterClient) (*v1.DispatchCheckResponse, error) {
192-
resp, err := client.DispatchCheck(ctx, req)
193-
if err != nil {
194-
return resp, err
195-
}
198+
resp, err := dispatchSyncRequest(
199+
ctx,
200+
cr,
201+
"check",
202+
req,
203+
tuple.FromCoreRelationReference(req.ResourceRelation),
204+
tuple.RR(req.Subject.Namespace, req.Subject.Relation),
205+
func(ctx context.Context, client ClusterClient) (*v1.DispatchCheckResponse, error) {
206+
resp, err := client.DispatchCheck(ctx, req)
207+
if err != nil {
208+
return resp, err
209+
}
196210

197-
err = adjustMetadataForDispatch(resp.Metadata)
198-
return resp, err
199-
})
211+
err = adjustMetadataForDispatch(resp.Metadata)
212+
return resp, err
213+
})
200214
if err != nil {
201215
return &v1.DispatchCheckResponse{Metadata: requestFailureMetadata}, err
202216
}
@@ -216,6 +230,13 @@ type responseMessage interface {
216230
GetMetadata() *v1.ResponseMeta
217231
}
218232

233+
type streamingRequestMessage interface {
234+
requestMessage
235+
236+
GetResourceRelation() *corev1.RelationReference
237+
GetSubjectRelation() *corev1.RelationReference
238+
}
239+
219240
type respTuple[S responseMessage] struct {
220241
resp S
221242
err error
@@ -226,7 +247,15 @@ type secondaryRespTuple[S responseMessage] struct {
226247
resp S
227248
}
228249

229-
func dispatchSyncRequest[Q requestMessage, S responseMessage](ctx context.Context, cr *clusterDispatcher, reqKey string, req Q, handler func(context.Context, ClusterClient) (S, error)) (S, error) {
250+
func dispatchSyncRequest[Q requestMessage, S responseMessage](
251+
ctx context.Context,
252+
cr *clusterDispatcher,
253+
reqKey string,
254+
req Q,
255+
resourceTypeAndRelation tuple.RelationReference,
256+
subjectTypeAndRelation tuple.RelationReference,
257+
handler func(context.Context, ClusterClient) (S, error),
258+
) (S, error) {
230259
withTimeout, cancelFn := context.WithTimeout(ctx, cr.dispatchOverallTimeout)
231260
defer cancelFn()
232261

@@ -248,9 +277,11 @@ func dispatchSyncRequest[Q requestMessage, S responseMessage](ctx context.Contex
248277
go func() {
249278
// Have the main dispatch wait some time before returning, to allow the secondaries to
250279
// potentially return first.
251-
computedWait := cr.secondaryInitialResponseDigests[reqKey].getWaitTime()
280+
computedWait := cr.getPrimaryWaitTime(reqKey, resourceTypeAndRelation, subjectTypeAndRelation)
252281
log.Trace().Stringer("computed-wait", computedWait).Msg("primary dispatch started; sleeping for computed wait time")
253-
time.Sleep(computedWait)
282+
if computedWait > 0 {
283+
time.Sleep(computedWait)
284+
}
254285

255286
log.Trace().Msg("running primary dispatch after wait")
256287
select {
@@ -296,10 +327,12 @@ func dispatchSyncRequest[Q requestMessage, S responseMessage](ctx context.Contex
296327
// For secondary dispatches, ignore any errors, as only the primary will be handled in
297328
// that scenario.
298329
log.Trace().Stringer("duration", handlerDuration).Str("secondary", secondary.Name).Err(err).Msg("got ignored secondary dispatch error")
330+
cr.supportedResourceSubjectTracker.updateForError(err)
299331
return
300332
}
301333

302334
log.Trace().Stringer("duration", handlerDuration).Str("secondary", secondary.Name).Msg("secondary dispatch completed")
335+
go cr.supportedResourceSubjectTracker.updateForSuccess(resourceTypeAndRelation, subjectTypeAndRelation)
303336
cr.secondaryInitialResponseDigests[reqKey].addResultTime(handlerDuration)
304337
secondaryResultChan <- secondaryRespTuple[S]{resp: resp, handlerName: secondary.Name}
305338
}
@@ -397,7 +430,7 @@ type ctxAndCancel struct {
397430
// secondary dispatchers. Unlike the non-streaming version, this will first attempt to dispatch
398431
// from the allowed secondary dispatchers before falling back to the primary, rather than running
399432
// them in parallel.
400-
func dispatchStreamingRequest[Q requestMessage, R responseMessage](
433+
func dispatchStreamingRequest[Q streamingRequestMessage, R responseMessage](
401434
ctx context.Context,
402435
cr *clusterDispatcher,
403436
reqKey string,
@@ -495,9 +528,14 @@ func dispatchStreamingRequest[Q requestMessage, R responseMessage](
495528
isPrimary := name == primaryDispatcher
496529
if isPrimary {
497530
// Have the primary wait a bit to ensure the secondaries have a chance to return first.
498-
computedWait := cr.secondaryInitialResponseDigests[reqKey].getWaitTime()
499-
time.Sleep(computedWait)
500-
hedgeWaitHistogram.WithLabelValues(reqKey).Observe(computedWait.Seconds())
531+
computedWait := cr.getPrimaryWaitTime(reqKey,
532+
tuple.FromCoreRelationReference(req.GetResourceRelation()),
533+
tuple.FromCoreRelationReference(req.GetSubjectRelation()),
534+
)
535+
if computedWait > 0 {
536+
time.Sleep(computedWait)
537+
hedgeWaitHistogram.WithLabelValues(reqKey).Observe(computedWait.Seconds())
538+
}
501539
} else {
502540
startTime = time.Now()
503541
}
@@ -521,6 +559,10 @@ func dispatchStreamingRequest[Q requestMessage, R responseMessage](
521559
primaryDispatch.WithLabelValues("false", reqKey).Inc()
522560
}
523561
if err != nil {
562+
if !isPrimary {
563+
cr.supportedResourceSubjectTracker.updateForError(err)
564+
}
565+
524566
log.Warn().Err(err).Str("dispatcher", name).Msg("error when trying to run secondary dispatcher")
525567
errorsLock.Lock()
526568
errorsByDispatcherName[name] = err
@@ -549,6 +591,10 @@ func dispatchStreamingRequest[Q requestMessage, R responseMessage](
549591
finishTime := time.Now()
550592
duration := finishTime.Sub(startTime)
551593
cr.secondaryInitialResponseDigests[reqKey].addResultTime(duration)
594+
go cr.supportedResourceSubjectTracker.updateForSuccess(
595+
tuple.FromCoreRelationReference(req.GetResourceRelation()),
596+
tuple.FromCoreRelationReference(req.GetSubjectRelation()),
597+
)
552598
}
553599

554600
// If a valid result, and we have not yet returned any results, try take a "lock" on
@@ -576,6 +622,10 @@ func dispatchStreamingRequest[Q requestMessage, R responseMessage](
576622
}
577623

578624
if err != nil {
625+
if !isPrimary {
626+
cr.supportedResourceSubjectTracker.updateForError(err)
627+
}
628+
579629
errorsLock.Lock()
580630
errorsByDispatcherName[name] = err
581631
errorsLock.Unlock()
@@ -728,6 +778,108 @@ func (cr *clusterDispatcher) ReadyState() dispatch.ReadyState {
728778
}
729779
}
730780

781+
// getPrimaryWaitTime returns the wait time for the primary dispatch, based on the request key and the
782+
// resource and subject types and relations. If the resource or subject is unsupported, the wait time
783+
// will be zero.
784+
func (cr *clusterDispatcher) getPrimaryWaitTime(
785+
reqKey string,
786+
resourceTypeAndRelation tuple.RelationReference,
787+
subjectTypeAndRelation tuple.RelationReference,
788+
) time.Duration {
789+
// If the resource or subject is unsupported, return zero time.
790+
if cr.supportedResourceSubjectTracker.isUnsupported(
791+
resourceTypeAndRelation,
792+
subjectTypeAndRelation,
793+
) {
794+
return 0
795+
}
796+
797+
return cr.secondaryInitialResponseDigests[reqKey].getWaitTime()
798+
}
799+
800+
// supportedResourceSubjectTracker is a struct that tracks the resources and subjects that are
801+
// unsupported by the secondary dispatcher(s). If a resource or subject is unsupported, the primary
802+
// dispatcher will not wait for the secondary dispatchers to return results, as it is extremely
803+
// likely that hedging will not be beneficial.
804+
type supportedResourceSubjectTracker struct {
805+
unsupportedResources sync.Map
806+
unsupportedSubjects sync.Map
807+
}
808+
809+
func newSupportedResourceSubjectTracker() *supportedResourceSubjectTracker {
810+
return &supportedResourceSubjectTracker{
811+
unsupportedResources: sync.Map{},
812+
unsupportedSubjects: sync.Map{},
813+
}
814+
}
815+
816+
// isUnsupported returns whether the resource or subject is unsupported by the secondary dispatcher(s).
817+
func (srst *supportedResourceSubjectTracker) isUnsupported(
818+
resourceTypeAndRelation tuple.RelationReference,
819+
subjectTypeAndRelation tuple.RelationReference,
820+
) bool {
821+
isUnsupportedResource, found := srst.unsupportedResources.Load(resourceTypeAndRelation)
822+
if found && isUnsupportedResource.(bool) {
823+
return true
824+
}
825+
826+
isUnsupportedSubject, found := srst.unsupportedSubjects.Load(subjectTypeAndRelation)
827+
return found && isUnsupportedSubject.(bool)
828+
}
829+
830+
// updateForSuccess updates the tracker for a successful dispatch, removing the resource and subject
831+
// from the unsupported set.
832+
func (srst *supportedResourceSubjectTracker) updateForSuccess(
833+
resourceTypeAndRelation tuple.RelationReference,
834+
subjectTypeAndRelation tuple.RelationReference,
835+
) {
836+
srst.unsupportedResources.CompareAndSwap(resourceTypeAndRelation, true, false)
837+
srst.unsupportedSubjects.CompareAndSwap(subjectTypeAndRelation, true, false)
838+
}
839+
840+
// updateForError updates the tracker for an error, adding the resource or subject to the unsupported set
841+
// if the error indicates that the resource or subject is unsupported.
842+
func (srst *supportedResourceSubjectTracker) updateForError(err error) {
843+
// Check for a FAILED_PRECONDITION with error details that indicate an unsupported
844+
// resource or subject.
845+
st, ok := status.FromError(err)
846+
if !ok {
847+
return
848+
}
849+
850+
if st.Code() != codes.FailedPrecondition {
851+
return
852+
}
853+
854+
for _, detail := range st.Details() {
855+
errDetail, ok := detail.(*errdetails.ErrorInfo)
856+
if !ok {
857+
continue
858+
}
859+
860+
// If the error is an unsupported resource or subject, add it to the tracker.
861+
if errDetail.Reason == "UNSUPPORTED_RESOURCE_RELATION" {
862+
definitionName := errDetail.Metadata["definition_name"]
863+
relationName := errDetail.Metadata["relation_name"]
864+
rr := tuple.RR(definitionName, relationName)
865+
existing, loaded := srst.unsupportedResources.LoadOrStore(rr, true)
866+
if !loaded || !existing.(bool) {
867+
srst.unsupportedResources.Store(rr, true)
868+
}
869+
}
870+
871+
if errDetail.Reason == "UNSUPPORTED_SUBJECT_RELATION" {
872+
definitionName := errDetail.Metadata["definition_name"]
873+
relationName := errDetail.Metadata["relation_name"]
874+
rr := tuple.RR(definitionName, relationName)
875+
existing, loaded := srst.unsupportedSubjects.LoadOrStore(rr, true)
876+
if !loaded || !existing.(bool) {
877+
srst.unsupportedSubjects.Store(rr, true)
878+
}
879+
}
880+
}
881+
}
882+
731883
// Always verify that we implement the interface
732884
var _ dispatch.Dispatcher = &clusterDispatcher{}
733885

0 commit comments

Comments
 (0)