Browse Source

Do some micro optimizations

This results in a moderate performance increase of 25 per cent on my machine
with a GTX 580.
Matthias Vogelgesang 10 years ago
parent
commit
aba0112c2d
3 changed files with 72 additions and 53 deletions
  1. 56 46
      src/lamino_bp_generic.cl
  2. 9 3
      src/lamino_ft_conv.cl
  3. 7 4
      src/ufo-lamino-bp-task.c

+ 56 - 46
src/lamino_bp_generic.cl

@@ -7,92 +7,102 @@
 
 #include <lamino-filter-def.h>
 
-
-__kernel void lamino_bp_generic ( __global float *proj,
-				  __global float *volume,
-				  __global CLParameters *param)
+kernel void
+lamino_bp_generic (global float *proj,
+				   global float *volume,
+				   global CLParameters *param)
 {
-    const int vX = get_global_id(0);
-    const int vY = get_global_id(1);
-    const int vZ = get_global_id(2);
+    const ushort vX = get_global_id(0);
+    const ushort vY = get_global_id(1);
+    const ushort vZ = get_global_id(2);
 
-    const int vSX = get_global_size(0);
-    const int vSY = get_global_size(1);
-    const int vSZ = get_global_size(2);
+    /* const ushort vSX = get_global_size(0); */
+    /* const ushort vSY = get_global_size(1); */
+    /* const ushort vSZ = get_global_size(2); */
 
-    const long int idx = (vZ * vSY * vSX) + (vY * vSX) + vX;
-    //const int idx = (vY * vSX) + vX;
+    /* const long int idx = (vZ * vSY * vSX) + (vY * vSX) + vX; */
 
+    const long int idx = (vZ * get_global_size(1) * get_global_size(0)) +
+                         (vY * get_global_size(0)) + vX;
+    //const int idx = (vY * vSX) + vX;
 
-    float newz = (float)vZ - param->vol_oz;
+    const float newz = (float)vZ - param->vol_oz;
 
-    float newz_matr02 = newz * param->mat_2 + param->proj_ox;
-    float newz_matr12 = newz * param->mat_5 + param->proj_oy;
+    /* float newz_matr02 = newz * param->mat_2 + param->proj_ox; */
+    /* float newz_matr12 = newz * param->mat_5 + param->proj_oy; */
 
     /// prepare y info
-    float newy = (float)vY - param->vol_oy;
+    const float newy = (float)vY - param->vol_oy;
 
-    float newy_matr01 = newy * param->mat_1 + newz_matr02;
-    float newy_matr11 = newy * param->mat_4 + newz_matr12;
+    /* float newy_matr01 = newy * param->mat_1 + newz_matr02; */
+    /* float newy_matr11 = newy * param->mat_4 + newz_matr12; */
+    /* float newy_matr01 = newy * param->mat_1 + newz * param->mat_2 + param->proj_ox; */
+    /* float newy_matr11 = newy * param->mat_4 + newz * param->mat_5 + param->proj_oy; */
 
     /// prepare x info
-    float newx = (float)vX - param->vol_ox;
-    float oldy = newx * param->mat_3 + newy_matr11;
-    float yo = floor(oldy);
+    const float newx = (float)vX - param->vol_ox;
+
+    /* float oldx = newx*param->mat_0 + newy_matr01; */
+    const float oldx = newx * param->mat_0 + newy * param->mat_1 + newz * param->mat_2 + param->proj_ox;
 
-    float oldx = newx*param->mat_0 + newy_matr01;
-    float xo = floor(oldx);
+    /* float oldy = newx * param->mat_3 + newy_matr11; */
+    const float oldy = newx * param->mat_3 + newy * param->mat_4 + newz * param->mat_5 + param->proj_oy;
+
+    /* const float yo = floor(oldy); */
+    /* const float xo = floor(oldx); */
 
     // bilinear interpolation
-    float yf_1 = oldy - yo;
-    float yf_0 = 1.0f - yf_1;
-    float xf_1 = oldx - xo;
-    float xf_0 = 1.0f  - xf_1;
+    /* const float yf_1 = oldy - yo; */
+    const float yf_1 = oldy - floor(oldy);
+    const float yf_0 = 1.0f - yf_1;
+
+    /* const float xf_1 = oldx - xo; */
+    const float xf_1 = oldx - floor(oldx);
+    const float xf_0 = 1.0f  - xf_1;
 
-    int base = (int)xo + (int)yo * param->proj_sx;
+    const int base = ((int) floor(oldx)) + ((int) floor(oldy)) * param->proj_sx;
+    /* const int base = (int)xo + (int)yo * param->proj_sx; */
     float result;
 // TODO: check that out of proj plain
     result  = proj[base    ] * xf_0 * yf_0;
     result += proj[base + 1] * xf_1 * yf_0;
     result += proj[base + param->proj_sx    ] * xf_0 * yf_1;
     result += proj[base + param->proj_sx + 1] * xf_1 * yf_1;
-    volume[idx] +=  result; 
-
-
+    volume[idx] += result;
 }
 
 __kernel void lamino_clean_vol(__global float *volume)
 {
-    const int vX = get_global_id(0);
-    const int vY = get_global_id(1);
-    const int vZ = get_global_id(2);
+    const ushort vX = get_global_id(0);
+    const ushort vY = get_global_id(1);
+    const ushort vZ = get_global_id(2);
 
-    const int vSX = get_global_size(0);
-    const int vSY = get_global_size(1);
-    const int vSZ = get_global_size(2);
+    const ushort vSX = get_global_size(0);
+    const ushort vSY = get_global_size(1);
+    const ushort vSZ = get_global_size(2);
 
     const int idx = (vZ * vSY * vSX) + (vY * vSX) + vX;
     //const int idx = (vY * vSX) + vX;
 
-    volume[idx] = 0; 
+    volume[idx] = 0;
 }
 
-__kernel void lamino_norm_vol(__global float *volume, 
+__kernel void lamino_norm_vol(__global float *volume,
 				const float factor)
 {
-    const int vX = get_global_id(0);
-    const int vY = get_global_id(1);
-    const int vZ = get_global_id(2);
+    const ushort vX = get_global_id(0);
+    const ushort vY = get_global_id(1);
+    const ushort vZ = get_global_id(2);
 
-    const int vSX = get_global_size(0);
-    const int vSY = get_global_size(1);
-    const int vSZ = get_global_size(2);
+    const ushort vSX = get_global_size(0);
+    const ushort vSY = get_global_size(1);
+    const ushort vSZ = get_global_size(2);
 
     const int idx = (vZ * vSY * vSX) + (vY * vSX) + vX;
     //const int idx = (vY * vSX) + vX;
 
     float val = volume[idx] * factor;
-    volume[idx] = val; 
+    volume[idx] = val;
 }
 
 

+ 9 - 3
src/lamino_ft_conv.cl

@@ -3,14 +3,20 @@ __kernel void lamino_c( __global float * in, __global float * flt,  __global flo
     const int idx = get_global_id(0);
     const int idy = get_global_id(1);
     const int index = idy * get_global_size(0) + idx;
+    const uchar m = idx % 2;
 
+    float result = in[index] * flt[index - m];
 
-    if(idx % 2 == 0) // real part
+    if (m == 0) // real part
     {
-         out[index]= in[index] * flt[index] - in[index+1] * flt[index+1];
+        /* out[index]= in[index] * flt[index] - in[index+1] * flt[index+1]; */
+        result -= in[index + 1] * flt[index+1];
     }
     else // imaginary part
     {
-        out[index]=  in[index] * flt[index - 1] + in[index-1] * flt[index];
+        /* out[index]=  in[index] * flt[index - 1] + in[index-1] * flt[index]; */
+        result += in[index - 1] * flt[index];
     }
+
+    out[index] = result;
 }

+ 7 - 4
src/ufo-lamino-bp-task.c

@@ -135,9 +135,11 @@ ufo_lamino_bp_task_process (UfoTask *task,
     cl_mem in_mem;
     cl_mem out_mem;
     cl_kernel kernel;
+    cl_event event;
     /* cl_event process_event; */
     gfloat cf, ct, cg;
     gfloat sf, st, sg;
+    size_t local_work_size[] = { 16, 16, 1 };
     
     priv = UFO_LAMINO_BP_TASK_GET_PRIVATE (task);
 
@@ -163,9 +165,9 @@ ufo_lamino_bp_task_process (UfoTask *task,
     cmd_queue = ufo_gpu_node_get_cmd_queue (node);
 
     UFO_RESOURCES_CHECK_CLERR (clEnqueueWriteBuffer (cmd_queue,
-                                                     priv->param_mem, CL_TRUE,
+                                                     priv->param_mem, CL_FALSE,
                                                      0, sizeof(CLParameters), &priv->params,
-                                                     0, NULL, NULL));
+                                                     0, NULL, &event));
 
     in_mem = ufo_buffer_get_device_array (inputs[0], cmd_queue);
     out_mem = ufo_buffer_get_device_array (output, cmd_queue);
@@ -189,9 +191,10 @@ ufo_lamino_bp_task_process (UfoTask *task,
     // call backprojection routine
     g_message("processing of %d-th projection", priv->proj_idx);
     UFO_RESOURCES_CHECK_CLERR (clEnqueueNDRangeKernel (cmd_queue, kernel,
-                                                       3, NULL, requisition->dims, NULL,
-                                                       0, NULL, NULL));
+                                                       3, NULL, requisition->dims, local_work_size,
+                                                       1, &event, NULL));
 
+    UFO_RESOURCES_CHECK_CLERR (clReleaseEvent (event));
     priv->proj_idx++;
     return TRUE;
 }