From d6549feb33b56cce72f3c80fa940b31f0c72020c Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Mon, 22 Aug 2022 12:19:31 +0800 Subject: [PATCH 1/2] webgpu: Add non-shared argminmax program The perf of ArgMax[1, 1025, 2049, 19] in cityscapes architecture in DeepLabV3 is very poor. With this change, this op drops from 22.36ms to 6.3ms. --- tfjs-backend-webgpu/src/argminmax_webgpu.ts | 62 +++++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/tfjs-backend-webgpu/src/argminmax_webgpu.ts b/tfjs-backend-webgpu/src/argminmax_webgpu.ts index b55a53a7f70..58c40b946d2 100644 --- a/tfjs-backend-webgpu/src/argminmax_webgpu.ts +++ b/tfjs-backend-webgpu/src/argminmax_webgpu.ts @@ -15,7 +15,7 @@ * ============================================================================= */ -import {backend_util} from '@tensorflow/tfjs-core'; +import {backend_util, util} from '@tensorflow/tfjs-core'; import {getCoordsXYZ, getMainHeaderString as main, WebGPUProgram} from './webgpu_program'; import {computeDispatch, flatDispatchLayout} from './webgpu_util'; @@ -31,37 +31,37 @@ export class ArgMinMaxProgram implements WebGPUProgram { reductionFactor: number; op: string; size = true; + private type: string; constructor(inputShape: number[], axis: number, reduceType: 'min'|'max') { const axes = [axis]; - backend_util.assertAxesAreInnerMostDims( - 'arg' + reduceType.charAt(0).toUpperCase() + reduceType.slice(1), axes, - inputShape.length); this.op = reduceType === 'min' ? '<' : '>'; // |outShape| is the shape with the removed axis - const [outputShape] = + const [outputShape, reduceShape] = backend_util.computeOutAndReduceShapes(inputShape, axes); this.outputShape = outputShape.length === 0 ? [1] : outputShape; this.dispatchLayout = flatDispatchLayout(this.outputShape); - // A work group only outputs a data, so we transfer [1, 1, 1] to compute - // dispatch size. 
- this.dispatch = - computeDispatch(this.dispatchLayout, this.outputShape, [1, 1, 1]); + if (util.sizeFromShape(reduceShape) < 32 || + util.sizeFromShape(outputShape) > 1000) { + this.type = 'plain'; + this.dispatch = computeDispatch( + this.dispatchLayout, this.outputShape, this.workGroupSize); + } else { + this.type = 'shared'; + // A work group only outputs a data, so we transfer [1, 1, 1] to compute + // dispatch size. + this.dispatch = + computeDispatch(this.dispatchLayout, this.outputShape, [1, 1, 1]); + } this.inputShape = inputShape; - this.shaderKey = `argMinMax${this.op}`; + this.shaderKey = `argMinMax_${this.op}_${this.type}`; } getUserCode(): string { - const sharedMemorySnippet = ` - var xBestIndices : array; - var xBestValues : array; - `; - const getInputShapeLastDim = () => { if (this.inputShape.length === 1) { return 'uniforms.xShape'; @@ -84,7 +84,12 @@ export class ArgMinMaxProgram implements WebGPUProgram { return snippet; }; - const userCode = ` + if (this.type === 'shared') { + const sharedMemorySnippet = ` + var xBestIndices : array; + var xBestValues : array; + `; + const userCode = ` fn DIV_CEIL(a : u32, b : u32) -> u32 { return ((a - 1u) / b + 1u); } @@ -131,6 +136,27 @@ export class ArgMinMaxProgram implements WebGPUProgram { } } `; - return userCode; + return userCode; + } else { + const userCode = ` + ${main('index')} { + if (index < uniforms.size) { + let outputCoords = getCoordsFromIndex(index); + var bestIndex = 0; + var bestValue = getX(${splitOutputCoords()} 0); + let reduceLength = ${getInputShapeLastDim()}; + for (var i = 1; i < reduceLength; i++) { + let candidate = getX(${splitOutputCoords()} i); + if (candidate ${this.op} bestValue) { + bestValue = candidate; + bestIndex = i; + } + } + setOutputAtIndexI32(index, bestIndex); + } + } + `; + return userCode; + } } } From 7adb9198435d39868df78bf549162afe3ff62f51 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Tue, 23 Aug 2022 09:41:26 +0800 Subject: [PATCH 2/2] Add annotation --- 
tfjs-backend-webgpu/src/argminmax_webgpu.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tfjs-backend-webgpu/src/argminmax_webgpu.ts b/tfjs-backend-webgpu/src/argminmax_webgpu.ts index 58c40b946d2..0cc5edfb87e 100644 --- a/tfjs-backend-webgpu/src/argminmax_webgpu.ts +++ b/tfjs-backend-webgpu/src/argminmax_webgpu.ts @@ -44,6 +44,11 @@ export class ArgMinMaxProgram implements WebGPUProgram { this.outputShape = outputShape.length === 0 ? [1] : outputShape; this.dispatchLayout = flatDispatchLayout(this.outputShape); + // The shared algorithm is mainly used for large reduce sizes. It fully + // utilizes the threads in one workgroup to do the reduction. However, + // when the reduce size is very small or the output shape is too large, it's + // better to use the plain algorithm to reduce the number of workgroups and + // speed up execution. The threshold can be further tuned. if (util.sizeFromShape(reduceShape) < 32 || util.sizeFromShape(outputShape) > 1000) { this.type = 'plain';