Spaces:

Alan311
/

Face-Recognition-SDK

Runtime error

App Files Files Community

Face-Recognition-SDK / openvino /vpu_custom_kernels /region_hwc.cl

Turing311

Upload 72 files

2cc8629 about 1 year ago

raw

history blame contribute delete

3.23 kB

	// Copyright (C) 2018-2022 Intel Corporation
	// SPDX-License-Identifier: Apache-2.0
	//

	#pragma OPENCL EXTENSION cl_khr_fp16 : enable
	#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

	__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

	#define ALLOW_EARLY_RETURN 1

	static void inline logistic_activate_hwc(
	__local const half *restrict src,
	__local half *restrict dst,
	int offset,
	int stride)
	{
	half val = src[offset];
	val = 1.0h / (1.0h + exp2(val * -log_2_e));
	dst[offset * stride] = val;
	}

	__kernel void region_hwc(
	__global const half *restrict src,
	__global half *restrict dst,
	int W,
	int H,
	int classes,
	int coords,
	int num,
	int maskSize,
	int doSoftmax)
	{
	__local half local_src[13 * 13 * (4 + 1 + 80)];
	__local half local_dst[13 * 13 * (4 + 1 + 80)];

	const int pixel_pos = get_local_id(0);

	const int local_C = classes + coords + 1;
	const int c = get_group_id(1) * local_C;
	const int h = get_group_id(0);

	num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
	const int C = local_C * num;

	event_t e1 = async_work_group_copy_2D2D(
	local_src, // dst
	src + h * W * C + c, // src
	local_C, // num_elements_per_line,
	H * W, // num_lines,
	C - local_C, // src_line_stride,
	0, // dst_line_stride,
	0);

	wait_group_events(1, &e1);

	#if ALLOW_EARLY_RETURN
	if (pixel_pos < W * H)
	#endif
	{
	const int w = pixel_pos % W;
	const int h = pixel_pos / W;

	__local const half restrict src = local_src + h W * local_C + w * local_C;
	__local half restrict dst = local_dst + h W + w;

	const int stride = H * W;
	logistic_activate_hwc(src, dst, 0, stride);
	logistic_activate_hwc(src, dst, 1, stride);

	//copy plane 2 and 3
	dst[2 * stride] = src[2];
	dst[3 * stride] = src[3];

	logistic_activate_hwc(src, dst, 4, stride);

	src += coords + 1;
	dst += (coords + 1) * stride;

	if (doSoftmax) {
	half max_val = src[0];
	#pragma unroll 4
	for (int c = 1; c < classes; c++) {
	max_val = max(max_val, src[c]);
	}

	half expSum = 0.0h;
	#pragma unroll 4
	for (int c = 0; c < classes; c++) {
	const half e = src[c] - max_val;
	const half tmp = exp2(e * log_2_e);
	dst[c * stride] = tmp;
	expSum += tmp;
	}

	const half invExpSum = 1.0h / expSum;
	#pragma unroll 4
	for (int c = 0; c < classes; c++) {
	dst[c * stride] *= invExpSum;
	}
	} else {
	#pragma unroll 4
	for (int c = 0; c < classes; c++) {
	logistic_activate_hwc(src, dst, c, stride);
	}
	}
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	const int box_sz = W * H * (classes + coords + 1);
	event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
	wait_group_events(1, &e2);
	}