NVCC does not unroll a small nested loop
I was wondering why NVCC cannot fully unroll the loops in the following Cholesky factorization kernel for small matrices (N = 4).
// In-place Cholesky factorization (Cholesky-Banachiewicz ordering, row by
// row) of an N x N matrix stored row-major in C. Only the lower triangle
// is written; entries above the diagonal are left untouched.
// NOTE(review): the accumulator is hard-coded to double even though the
// element type is T — confirm this is intentional for T = float.
template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
// Inner bound depends on the outer index i (triangular iteration space).
for (int j = 0; j <= i; j++) {
// Dot product of the already-computed prefixes of rows i and j.
double s = 0;
#pragma unroll
for (int k = 0; k < j; k++){
s += C[i*N+k] * C[j*N+k];
}
s = C[i*N+j] - s;
// Diagonal entry: square root; off-diagonal entry: scale by the
// reciprocal of the pivot C[j][j] computed earlier this pass.
C[i*N+j] = (i == j) ?
sqrt(s) :
(1.0 / C[j*N+j] * (s));
}
}
}
The generated PTX code looks like this:
sqrt.rn.f64 %fd12, %fd29;
st.local.f64 [%rd1], %fd12;
rcp.rn.f64 %fd34, %fd12;
mul.f64 %fd13, %fd30, %fd34;
st.local.f64 [%rd1+32], %fd13;
fma.rn.f64 %fd35, %fd13, %fd13, 0d0000000000000000;
sub.f64 %fd36, %fd31, %fd35;
sqrt.rn.f64 %fd14, %fd36;
st.local.f64 [%rd1+40], %fd14;
mul.f64 %fd15, %fd32, %fd34;
st.local.f64 [%rd1+64], %fd15;
ld.local.f64 %fd37, [%rd1+32];
fma.rn.f64 %fd38, %fd15, %fd37, 0d0000000000000000;
sub.f64 %fd39, %fd33, %fd38;
rcp.rn.f64 %fd40, %fd14;
mul.f64 %fd16, %fd39, %fd40;
st.local.f64 [%rd1+72], %fd16;
mov.f64 %fd58, 0d0000000000000000;
mov.u32 %r58, -2;
mov.u64 %rd40, -8;
BB1_5:
shl.b64 %rd23, %rd40, 3;
sub.s64 %rd24, %rd1, %rd23;
ld.local.f64 %fd41, [%rd24];
fma.rn.f64 %fd58, %fd41, %fd41, %fd58;
add.s64 %rd40, %rd40, -1;
add.s32 %r58, %r58, 1;
setp.ne.s32 %p3, %r58, 0;
@%p3 bra BB1_5;
sub.f64 %fd43, %fd6, %fd58;
sqrt.rn.f64 %fd19, %fd43;
st.local.f64 [%rd1+80], %fd19;
mul.f64 %fd20, %fd8, %fd34;
st.local.f64 [%rd1+96], %fd20;
ld.local.f64 %fd45, [%rd1+32];
fma.rn.f64 %fd46, %fd20, %fd45, 0d0000000000000000;
sub.f64 %fd47, %fd9, %fd46;
mul.f64 %fd21, %fd47, %fd40;
st.local.f64 [%rd1+104], %fd21;
mov.f64 %fd59, 0d0000000000000000;
mov.u32 %r59, -2;
mov.u64 %rd41, %rd1;
BB1_7:
mov.u64 %rd5, %rd41;
ld.local.f64 %fd49, [%rd5+64];
ld.local.f64 %fd50, [%rd5+96];
fma.rn.f64 %fd59, %fd50, %fd49, %fd59;
add.s64 %rd6, %rd5, 8;
add.s32 %r59, %r59, 1;
setp.ne.s32 %p4, %r59, 0;
mov.u64 %rd41, %rd6;
@%p4 bra BB1_7;
sub.f64 %fd52, %fd10, %fd59;
rcp.rn.f64 %fd53, %fd19;
mul.f64 %fd24, %fd52, %fd53;
st.local.f64 [%rd1+112], %fd24;
mov.f64 %fd60, 0d0000000000000000;
mov.u32 %r60, -3;
mov.u64 %rd42, -12;
BB1_9:
shl.b64 %rd26, %rd42, 3;
sub.s64 %rd27, %rd1, %rd26;
ld.local.f64 %fd54, [%rd27];
fma.rn.f64 %fd60, %fd54, %fd54, %fd60;
add.s64 %rd42, %rd42, -1;
add.s32 %r60, %r60, 1;
setp.ne.s32 %p5, %r60, 0;
@%p5 bra BB1_9;
It is clear that the loops are not fully unrolled and that expensive local memory is used (my goal is to keep everything in registers).
I am calling the function like this:
// Per-thread staging array; intended to live in registers, but it will
// spill to local memory if the compiler cannot resolve every index to a
// compile-time constant (i.e. if the loops are not fully unrolled).
T l[N*N];
// Load this thread's matrix from global memory.
for(int i = 0; i < N*N; ++i){
l[i] = buffer[offset+i];
}
choleskyKernel2<T,N>(l);
// Write the factorized matrix back to global memory.
for(int i = 0; i < N*N; ++i){
buffer[offset+i] = l[i];
}
Is there a way to get these loops fully unrolled so that everything can be done in registers?
EDIT:
Complete code:
#include <thrust/device_vector.h>
// In-place Cholesky factorization (Cholesky-Banachiewicz ordering) of an
// N x N matrix stored row-major in C. Only the lower triangle is written;
// entries above the diagonal are left untouched.
//
// Fix: the inner loop now runs over the full rectangular range [0, N)
// with a guard (j <= i), instead of the triangular bound j <= i in the
// loop condition. A trip count that depends on the outer index prevents
// cicc from fully unrolling the nest, which forces C into local memory;
// the rectangular form has constant trip counts, so every level unrolls
// and C can stay in registers. Iteration order and results are unchanged.
// NOTE(review): the accumulator is hard-coded to double even for T = float
// — preserved as in the original; confirm this is intentional.
template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
for (int j = 0; j < N; j++)
if (j <= i) {
// Dot product of the already-computed prefixes of rows i and j.
double s = 0;
#pragma unroll
for (int k = 0; k < j; k++){
s += C[i*N+k] * C[j*N+k];
}
s = C[i*N+j] - s;
// Diagonal entry: square root; off-diagonal: divide by pivot C[j][j].
C[i*N+j] = (i == j) ?
sqrt(s) :
(1.0 / C[j*N+j] * (s));
}
}
}
// Test kernel: stages an N x N matrix of T from global memory into a
// per-thread local array, factorizes it in place, and copies the result
// back. Intended to be launched with a single thread per matrix.
template<typename T, int N>
__global__ static
void test3(T* buffer){
    T local[N * N];
    // Stage the matrix in thread-local storage before factorizing.
    for(int idx = 0; idx < N * N; ++idx)
        local[idx] = buffer[idx];
    choleskyKernel2<T, N>(local);
    // Copy the factor back out to global memory.
    for(int idx = 0; idx < N * N; ++idx)
        buffer[idx] = local[idx];
}
// Host driver: allocates one 4x4 double matrix on the device and launches
// the test kernel with a single thread.
int main(){
// 16 elements = 4*4. NOTE(review): device_vector value-initializes to
// zero, so the matrix factorized here is singular (sqrt(0) pivots) —
// presumably fine for inspecting codegen only; confirm if results matter.
thrust::device_vector<double> d_data(16);
test3<double,4> <<< 1,1 >>>(thrust::raw_pointer_cast(d_data.data()));
}
source to share
While I can't tell you why nvcc (or rather cicc, which compiles device code on behalf of nvcc) isn't unrolling your loops, I can show you how to change the code so that it does.
Turn
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
for (int j = 0; j <= i; j++) {
into
#pragma unroll
for (int i = 0; i < N; i++) {
#pragma unroll
for (int j = 0; j < N; j++)
if (j <= i) {
and you will find that all loops are unrolled without using any local memory.
This happens even without unrolling of the load and store loops that copy the matrix in and out. In fact, with my change above, you don't need any #pragma unroll directives at all.
source to share