File size: 2,609 Bytes
2cc8629
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void experimental_detectron_prior_grid_generator(
    __global const half *restrict input_priors,
    __global const half *restrict input_feature_map,
    __global const half *restrict input_rois,
    __global half *restrict output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior)
{
    __local half local_input_priors[8 * 1024];
    __local half local_output[8 * 1024];

    event_t e1 = async_work_group_copy(
        local_input_priors,
        input_priors,
        num_anchors_per_prior * num_priors,
        0);
    wait_group_events(1, &e1);

    int width_start = get_group_id(0) * get_local_size(0);
    int width_end   = min(width_start + get_local_size(0), (unsigned)grid_w);
    int width       = width_end - width_start;

    int h     = get_group_id(1);
    int w_idx = get_group_id(0) * get_local_size(0);
    for (int w = 0; w < width; ++w) {
        #pragma unroll 4
        for (int p = 0; p < num_priors; ++p) {
            local_output[(w * num_priors + p) * num_anchors_per_prior + 0] =
                local_input_priors[4 * p + 0]
                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
                local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 2] =
                local_input_priors[4 * p + 2]
                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
                local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
            + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
                  * num_priors, // dst
        local_output, // src
        width * num_anchors_per_prior * num_priors, // num_elements_per_line
        1, // num_lines
        (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
        (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
        0);
    wait_group_events(1, &e2);
}