Turing311's picture
Upload 72 files
2cc8629
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
__kernel void reorg_chw(
__global const half *restrict src,
__global half *restrict dst,
int W,
int H,
int C,
int stride)
{
__local half local_src[8 * 1024];
__local half local_dst[8 * 1024];
event_t e1 = async_work_group_copy_2D2D(
local_src, // dst
src + get_group_id(1) * W * stride
+ get_group_id(0) * W * stride * stride, // src
W * stride, // num_elements_per_line,
get_local_size(0), // num_lines,
W * stride * (stride * get_num_groups(0) - 1), // src_line_stride,
0, // dst_line_stride,
0);
wait_group_events(1, &e1);
const int c = get_local_id(0);
const int stride_x = get_local_id(1);
const int srcIdx = stride_x + c * W * stride;
const int dstIdx = stride_x * W * get_local_size(0) + c * W;
int x = 0;
for (; x <= W - 8; x += 8) {
half8 data = (half8){
local_src[srcIdx + (x + 0) * stride],
local_src[srcIdx + (x + 1) * stride],
local_src[srcIdx + (x + 2) * stride],
local_src[srcIdx + (x + 3) * stride],
local_src[srcIdx + (x + 4) * stride],
local_src[srcIdx + (x + 5) * stride],
local_src[srcIdx + (x + 6) * stride],
local_src[srcIdx + (x + 7) * stride]};
*((__local half8 *)(&local_dst[dstIdx + x])) = data;
}
for (; x < W; x++) {
local_dst[dstIdx + x] = local_src[srcIdx + x * stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
event_t e2 = async_work_group_copy_2D2D(
dst + get_group_id(0) * W
+ get_group_id(1) * W * stride * get_global_size(0), // dst
local_dst, // src
W, // num_elements_per_line
get_local_size(0) * stride, // num_lines
0, // src_line_stride
W * (get_num_groups(0) - 1), // dst_line_stride
0);
wait_group_events(1, &e2);
}