Turing311's picture
Upload 72 files
2cc8629
raw
history blame
3.23 kB
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
#define ALLOW_EARLY_RETURN 1
static void inline logistic_activate_hwc(
__local const half *restrict src,
__local half *restrict dst,
int offset,
int stride)
{
half val = src[offset];
val = 1.0h / (1.0h + exp2(val * -log_2_e));
dst[offset * stride] = val;
}
__kernel void region_hwc(
__global const half *restrict src,
__global half *restrict dst,
int W,
int H,
int classes,
int coords,
int num,
int maskSize,
int doSoftmax)
{
__local half local_src[13 * 13 * (4 + 1 + 80)];
__local half local_dst[13 * 13 * (4 + 1 + 80)];
const int pixel_pos = get_local_id(0);
const int local_C = classes + coords + 1;
const int c = get_group_id(1) * local_C;
const int h = get_group_id(0);
num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
const int C = local_C * num;
event_t e1 = async_work_group_copy_2D2D(
local_src, // dst
src + h * W * C + c, // src
local_C, // num_elements_per_line,
H * W, // num_lines,
C - local_C, // src_line_stride,
0, // dst_line_stride,
0);
wait_group_events(1, &e1);
#if ALLOW_EARLY_RETURN
if (pixel_pos < W * H)
#endif
{
const int w = pixel_pos % W;
const int h = pixel_pos / W;
__local const half *restrict src = local_src + h * W * local_C + w * local_C;
__local half *restrict dst = local_dst + h * W + w;
const int stride = H * W;
logistic_activate_hwc(src, dst, 0, stride);
logistic_activate_hwc(src, dst, 1, stride);
//copy plane 2 and 3
dst[2 * stride] = src[2];
dst[3 * stride] = src[3];
logistic_activate_hwc(src, dst, 4, stride);
src += coords + 1;
dst += (coords + 1) * stride;
if (doSoftmax) {
half max_val = src[0];
#pragma unroll 4
for (int c = 1; c < classes; c++) {
max_val = max(max_val, src[c]);
}
half expSum = 0.0h;
#pragma unroll 4
for (int c = 0; c < classes; c++) {
const half e = src[c] - max_val;
const half tmp = exp2(e * log_2_e);
dst[c * stride] = tmp;
expSum += tmp;
}
const half invExpSum = 1.0h / expSum;
#pragma unroll 4
for (int c = 0; c < classes; c++) {
dst[c * stride] *= invExpSum;
}
} else {
#pragma unroll 4
for (int c = 0; c < classes; c++) {
logistic_activate_hwc(src, dst, c, stride);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
const int box_sz = W * H * (classes + coords + 1);
event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
wait_group_events(1, &e2);
}