How tidb retries request to tikv
Source code version v2.1.18
Data structures
type Backoffer struct {
ctx context.Context
fn map[backoffType]func(context.Context) int
maxSleep int
totalSleep int
errors []error
types []backoffType
vars *kv.Variables
}
- Sleep setting for each type retry
- Time in MS
EqualJittermeans half of the duration is randomized
NewBackoffFn(base, cap, jitter int) func(ctx context.Context) int
case boTiKVRPC:
return NewBackoffFn(100, 2000, EqualJitter)
case BoTxnLock:
return NewBackoffFn(200, 3000, EqualJitter)
case boTxnLockFast:
return NewBackoffFn(vars.BackoffLockFast, 3000, EqualJitter)
case boPDRPC:
return NewBackoffFn(500, 3000, EqualJitter)
case BoRegionMiss:
return NewBackoffFn(100, 500, NoJitter)
case BoUpdateLeader:
return NewBackoffFn(1, 10, NoJitter)
case boServerBusy:
return NewBackoffFn(2000, 10000, EqualJitter)
Default max total sleep time in each case
// Maximum total sleep time(in ms) for kv/cop commands.
const (
copBuildTaskMaxBackoff = 5000
tsoMaxBackoff = 15000
scannerNextMaxBackoff = 20000
batchGetMaxBackoff = 20000
copNextMaxBackoff = 20000
getMaxBackoff = 20000
prewriteMaxBackoff = 20000
cleanupMaxBackoff = 20000
GcOneRegionMaxBackoff = 20000
GcResolveLockMaxBackoff = 100000
deleteRangeOneRegionMaxBackoff = 100000
rawkvMaxBackoff = 20000
splitRegionBackoff = 20000
scatterRegionBackoff = 20000
waitScatterRegionFinishBackoff = 120000
)
When do we stop trying
If the backoff attempts have slept more that configured duration, we will return a MySql error that singals that do not retry anymore
realSleep := f(b.ctx)
backoffDuration.Observe(float64(realSleep) / 1000)
b.totalSleep += realSleep
b.types = append(b.types, typ)
var startTs interface{} = ""
if ts := b.ctx.Value(txnStartKey); ts != nil {
startTs = ts
}
logutil.Logger(context.Background()).Debug("retry later",
zap.Error(err),
zap.Int("totalSleep", b.totalSleep),
zap.Int("maxSleep", b.maxSleep),
zap.Stringer("type", typ),
zap.Reflect("txnStartTS", startTs))
b.errors = append(b.errors, errors.Errorf("%s at %s", err.Error(), time.Now().Format(time.RFC3339Nano)))
if b.maxSleep > 0 && b.totalSleep >= b.maxSleep {
errMsg := fmt.Sprintf("backoffer.maxSleep %dms is exceeded, errors:", b.maxSleep)
for i, err := range b.errors {
// Print only last 3 errors for non-DEBUG log levels.
if log.GetLevel() == zapcore.DebugLevel || i >= len(b.errors)-3 {
errMsg += "\n" + err.Error()
}
}
logutil.Logger(context.Background()).Warn(errMsg)
// Use the first backoff type to generate a MySQL error.
return b.types[0].TError()
}
Max sleep for each case
func splitTableRanges(t table.PhysicalTable, store kv.Storage, startHandle, endHandle int64) ([]kv.KeyRange, error)
maxSleep := 10000 // ms
bo := tikv.NewBackoffer(context.Background(), maxSleep)
}
func getPhysicalTableRegions(physicalTableID int64, tableInfo *model.TableInfo, tikvStore tikv.Storage, s kv.SplitableStore, uniqueRegionMap map[uint64]struct{}) ([]regionMeta, error) {
recordRegionMetas, err := regionCache.LoadRegionsInKeyRange(tikv.NewBackoffer(context.Background(), 20000), startKey, endKey)
}
func (c *twoPhaseCommitter) execute(ctx context.Context) error {
prewriteBo := NewBackoffer(ctx, prewriteMaxBackoff).WithVars(c.txn.vars)//defaults 20 seconds
commitBo := NewBackoffer(ctx, CommitMaxBackoff).WithVars(c.txn.vars)//defaults 41 seconds
}