Browse Source

Merge branch 'backproject-from-3d' into 'master'

Backproject From 3d
Farago, Tomas 9 years ago
parent
commit
d74437ef47

+ 2 - 0
CMakeLists.txt

@@ -21,6 +21,8 @@ configure_paths(UFO_ANKAFILTERS)
 set(UFO_ANKAFILTERS_PLUGINDIR "${UFO_ANKAFILTERS_LIBDIR}/ufo")
 set(UFO_ANKAFILTERS_KERNELDIR "${UFO_ANKAFILTERS_DATADIR}/ufo")
 set(PKG_UFO_CORE_MIN_REQUIRED "0.6")
+# Backprojection burst mode, must be one of 1, 2, 4, 8, 16
+set(BP_BURST "16")
 #}}}
 
 #{{{ Common dependencies

+ 5 - 5
src/CMakeLists.txt

@@ -7,7 +7,6 @@ set(ufofilter_SRCS
     ufo-remove-stripes-task.c
     )
 
-file(GLOB ufofilter_KERNELS "kernels/*.cl")
 #}}}
 #{{{ Variables
 if (CMAKE_COMPILER_IS_GNUCC OR ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
@@ -16,6 +15,8 @@ if (CMAKE_COMPILER_IS_GNUCC OR ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
                     "-Wno-missing-field-initializers -Wpointer-arith "
                     "-Wredundant-decls -Wshadow -Wstrict-prototypes -Wwrite-strings")
 endif()
+add_definitions ("-DBURST=${BP_BURST}")
+
 #}}}
 #{{{ Plugin targets
 include_directories(${CMAKE_CURRENT_BINARY_DIR}
@@ -57,9 +58,8 @@ foreach(_src ${ufofilter_SRCS})
                 LIBRARY DESTINATION ${UFO_ANKAFILTERS_PLUGINDIR})
     endif()
 endforeach()
+#}}}
 
-# copy kernels
-foreach(_kernel ${ufofilter_KERNELS})
-    install(FILES ${_kernel} DESTINATION ${UFO_ANKAFILTERS_KERNELDIR})
-endforeach()
+#{{{ Subdirectories
+add_subdirectory(kernels)
 #}}}

+ 27 - 0
src/kernels/CMakeLists.txt

@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 2.6)
+
+# make burst backprojection kernels
+set(KERNEL_TEMPLATE templates/ankabackprojectburst.in)
+set(GENERATOR tools/make_burst_kernels.py)
+set(KERNEL_NAME ankabackprojectburst.cl)
+
+add_custom_command(
+    OUTPUT ${KERNEL_NAME}
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${GENERATOR} ${GENERATOR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL_TEMPLATE} ${KERNEL_TEMPLATE}
+    COMMAND python ${GENERATOR} ${KERNEL_TEMPLATE} 1 > ${KERNEL_NAME}
+    COMMAND python ${GENERATOR} ${KERNEL_TEMPLATE} ${BP_BURST} >> ${KERNEL_NAME}
+    DEPENDS ${GENERATOR} ${KERNEL_TEMPLATE}
+    COMMENT "Generating burst backprojection kernels")
+
+add_custom_target(burst ALL
+    DEPENDS ankabackprojectburst.cl)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME} DESTINATION ${UFO_ANKAFILTERS_KERNELDIR})
+
+# copy kernels
+file(GLOB ufofilter_KERNELS "*.cl")
+
+foreach(_kernel ${ufofilter_KERNELS})
+    install(FILES ${_kernel} DESTINATION ${UFO_ANKAFILTERS_KERNELDIR})
+endforeach()

+ 52 - 19
src/kernels/ankabackproject.cl

@@ -17,25 +17,25 @@
  * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
  */
 
- __kernel void backproject (global float *volume,
-                            __read_only image2d_t projection,
-                            const sampler_t sampler,
-                            const float2 x_region,
-                            const float2 y_region,
-                            const float2 z_region,
-                            const float8 tmatrix,
-                            const uint cumulate)
+kernel void backproject (global float *volume,
+                         read_only image2d_t projection,
+                         const sampler_t sampler,
+                         const float2 x_region,
+                         const float2 y_region,
+                         const float2 z_region,
+                         const float8 tmatrix,
+                         const uint cumulate)
 {
-     int3 id = (int3) (get_global_id (0), get_global_id (1), get_global_id (2));
-     float2 pixel;
-     float3 voxel;
- 
-     voxel.x = mad((float) id.x, x_region.y, x_region.x);
-     voxel.y = mad((float) id.y, y_region.y, y_region.x);
-     voxel.z = mad((float) id.z, z_region.y, z_region.x);
- 
-     pixel.x = mad(voxel.x, tmatrix.s0, mad(voxel.y, tmatrix.s1, tmatrix.s3));
-     pixel.y = mad(voxel.x, tmatrix.s4, mad(voxel.y, tmatrix.s5, mad(voxel.z, tmatrix.s6, tmatrix.s7)));
+    int3 id = (int3) (get_global_id (0), get_global_id (1), get_global_id (2));
+    float2 pixel;
+    float3 voxel;
+
+    voxel.x = mad((float) id.x, x_region.y, x_region.x);
+    voxel.y = mad((float) id.y, y_region.y, y_region.x);
+    voxel.z = mad((float) id.z, z_region.y, z_region.x);
+
+    pixel.x = mad(voxel.x, tmatrix.s0, mad(voxel.y, tmatrix.s1, tmatrix.s3));
+    pixel.y = mad(voxel.x, tmatrix.s4, mad(voxel.y, tmatrix.s5, mad(voxel.z, tmatrix.s6, tmatrix.s7)));
  
     if (cumulate) {
          volume[id.z * get_global_size (0) * get_global_size (1) +
@@ -45,4 +45,37 @@
          volume[id.z * get_global_size (0) * get_global_size (1) +
                 id.y * get_global_size (0) + id.x] = read_imagef (projection, sampler, pixel).x;
     }
- }
+}
+
+
+kernel void backproject_from_3d (global float *volume,
+                                 read_only image3d_t projections,
+                                 const sampler_t sampler,
+                                 const float2 x_region,
+                                 const float2 y_region,
+                                 const float2 z_region,
+                                 constant float *sines,
+                                 constant float *cosines,
+                                 const float2 center,
+                                 const float sin_lamino,
+                                 const float cos_lamino)
+{
+    int3 id = (int3) (get_global_id (0), get_global_id (1), get_global_id (2));
+    int i;
+    float result = 0.0f;
+    float4 pixel;
+    float3 voxel;
+
+    voxel.x = mad((float) id.x, x_region.y, x_region.x);
+    voxel.y = mad((float) id.y, y_region.y, y_region.x);
+    voxel.z = mad((float) id.z, z_region.y, z_region.x);
+
+    for (i = 0; i < get_image_depth (projections); i++) {
+        pixel.x = voxel.x * cosines[i] + voxel.y * sines[i] + center.x;
+        pixel.y = voxel.x * cos_lamino * sines[i] - voxel.y * cos_lamino * cosines[i] + voxel.z * sin_lamino + center.y;
+        pixel.z = (float) i;
+        result += read_imagef (projections, sampler, pixel).x;
+    }
+
+    volume[id.z * get_global_size (0) * get_global_size (1) + id.y * get_global_size (0) + id.x] = result;
+}

+ 5 - 5
src/kernels/ankapadding.cl

@@ -17,11 +17,11 @@
  * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-__kernel void pad (image2d_t in_image,
-                   sampler_t sampler,
-                   global float *result,
-                   const uint2 input_shape,
-                   const int2 offset)
+kernel void pad (read_only image2d_t in_image,
+                 sampler_t sampler,
+                 global float *result,
+                 const uint2 input_shape,
+                 const int2 offset)
 {
     int2 pixel = (int2) (get_global_id (0), get_global_id (1));
     float2 norm_pixel = (float2) (((float) pixel.x - offset.x) / input_shape.x,

+ 45 - 0
src/kernels/templates/ankabackprojectburst.in

@@ -0,0 +1,45 @@
+                                    read_only image2d_t projection_{},
+%nl
+        pixel.x = mad(voxel.x, cosines{1}, mad(voxel.y, sines{1}, center.x));
+        pixel.y = mad(tmp_x, sines{1}, mad(tmp_y, cosines{1}, tmp));
+        result {2}= read_imagef (projection_{0}, sampler, pixel).x;
+%nl
+kernel void backproject_burst_{0} (
+{1}
+                                    global float *volume,
+                                    const sampler_t sampler,
+                                    const int3 real_size,
+                                    const float2 center,
+                                    const float2 x_region,
+                                    const float2 y_region,
+                                    const float2 z_region,
+                                    const float sin_lamino,
+                                    const float cos_lamino,
+                                    const float{2} sines,
+                                    const float{2} cosines,
+                                    const int cumulate)
+{{
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    int idz = get_global_id (2);
+    float result, tmp, tmp_x, tmp_y;
+    float2 pixel;
+    float3 voxel;
+
+    if (idx < real_size.x && idy < real_size.y && idz < real_size.z) {{
+        voxel.x = mad((float) idx, x_region.y, x_region.x);
+        voxel.y = mad((float) idy, y_region.y, y_region.x);
+        voxel.z = mad((float) idz, z_region.y, z_region.x);
+        tmp = mad(voxel.z, sin_lamino, center.y);
+        tmp_x = voxel.x * cos_lamino;
+        tmp_y = -voxel.y * cos_lamino;
+
+{3}
+
+        if (cumulate) {{
+            volume[idz * real_size.x * real_size.y + idy * real_size.x + idx] += result;
+        }} else {{
+            volume[idz * real_size.x * real_size.y + idy * real_size.x + idx] = result;
+        }}
+    }}
+}}

+ 53 - 0
src/kernels/tools/make_burst_kernels.py

@@ -0,0 +1,53 @@
+"""Generate burst laminographic backprojection OpenCL kernels."""
+import argparse
+
+
+IDX_TO_VEC_ELEM = dict(zip(range(10), range(10)))
+IDX_TO_VEC_ELEM[10] = 'a'
+IDX_TO_VEC_ELEM[11] = 'b'
+IDX_TO_VEC_ELEM[12] = 'c'
+IDX_TO_VEC_ELEM[13] = 'd'
+IDX_TO_VEC_ELEM[14] = 'e'
+IDX_TO_VEC_ELEM[15] = 'f'
+
+
+def fill_compute_template(tmpl, num_items, index):
+    """Fill the template doing the pixel computation and texture fetch."""
+    operation = '+' if index else ''
+    access = '.s{}'.format(IDX_TO_VEC_ELEM[index]) if num_items > 1 else ''
+    return tmpl.format(index, access, operation)
+
+
+def fill_kernel_template(input_tmpl, compute_tmpl, kernel_tmpl, num_items):
+    """Construct the whole kernel."""
+    vector_length = num_items if num_items > 1 else ''
+    computes = '\n'.join([fill_compute_template(compute_tmpl, num_items, i)
+                          for i in range(num_items)])
+    inputs = '\n'.join([input_tmpl.format(i) for i in range(num_items)])
+
+    return kernel_tmpl.format(num_items, inputs, vector_length, computes)
+
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', type=str, help='File name with the kernel template')
+    parser.add_argument('burst', type=int,
+                        help='Number of projections processed by one kernel invocation')
+
+    return parser.parse_args()
+
+
+def main():
+    """execute program."""
+    args = parse_args()
+    allowed_bursts = [2 ** i for i in range(5)]
+    if args.burst not in allowed_bursts:
+        raise ValueError('Specified burst mode `{}` must be one of `{}`'.format(args.burst,
+                                                                                allowed_bursts))
+    in_tmpl, comp_tmpl, ker_tmpl = open(args.filename, 'r').read().split('\n%nl\n')
+    print fill_kernel_template(in_tmpl, comp_tmpl, ker_tmpl, args.burst)
+
+
+if __name__ == '__main__':
+    main()

+ 370 - 51
src/ufo-anka-backproject-task.c

@@ -16,8 +16,11 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
  */
+#include <stdio.h>
 #include <math.h>
 #include <string.h>
+#include <glib.h>
+#include <glib/gprintf.h>
 
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
@@ -27,11 +30,16 @@
 
 #include "ufo-anka-backproject-task.h"
 
+/* Copy only neccessary projection region */
+/* TODO: make this a parameter? */
+#define COPY_PROJECTION_REGION 1
 #define EXTRACT_INT(region, index) g_value_get_int (g_value_array_get_nth ((region), (index)))
 #define EXTRACT_FLOAT(region, index) g_value_get_float (g_value_array_get_nth ((region), (index)))
 #define REGION_SIZE(region) ((EXTRACT_INT ((region), 2)) == 0) ? 0 : \
                             ((EXTRACT_INT ((region), 1) - EXTRACT_INT ((region), 0) - 1) /\
                             EXTRACT_INT ((region), 2) + 1)
+#define PAD_TO_DIVIDE(dividend, divisor) ((dividend) + (divisor) - (dividend) % (divisor))
+
 
 /**
  * SECTION:ufo-anka-backproject-task
@@ -44,12 +52,18 @@ struct _UfoAnkaBackprojectTaskPrivate {
     /* private */
     gboolean generated;
     guint count;
-    float tmatrix[8];
+    /* sine and cosine table size based on BURST */
+    gsize table_size;
 
     /* OpenCL */
     cl_context context;
-    cl_kernel bp_kernel;
+    cl_kernel vector_kernel;
+    cl_kernel scalar_kernel;
     cl_sampler sampler;
+    /* Buffered images for invoking backprojection on BURST projections at once.
+     * We potentially don't need to copy the last image and can use the one from
+     * framework directly but it seems to have no performance effects. */
+    cl_mem images[BURST];
 
     /* properties */
     GValueArray *x_region;
@@ -57,7 +71,9 @@ struct _UfoAnkaBackprojectTaskPrivate {
     GValueArray *z_region;
     GValueArray *center;
     GValueArray *projection_offset;
-    gboolean tomo_angle_is_absolute;
+    float sines[BURST], cosines[BURST];
+    guint num_projections;
+    gfloat overall_angle;
     gfloat tomo_angle;
     gfloat lamino_angle;
 };
@@ -77,7 +93,8 @@ enum {
     PROP_Z_REGION,
     PROP_PROJECTION_OFFSET,
     PROP_CENTER,
-    PROP_TOMO_ANGLE_IS_ABSOLUTE,
+    PROP_NUM_PROJECTIONS,
+    PROP_OVERALL_ANGLE,
     PROP_TOMO_ANGLE,
     PROP_LAMINO_ANGLE,
     N_PROPERTIES
@@ -85,17 +102,151 @@ enum {
 
 static GParamSpec *properties[N_PROPERTIES] = { NULL, };
 
+static inline void
+swap (gint *first, gint *second) {
+    gint tmp;
+
+    tmp = *first;
+    *first = *second;
+    *second = tmp;
+}
+
+/**
+ * Determine the left and right column to read from a projection at a given
+ * tomographic angle.
+ */
 static void
-create_transformation_matrix (UfoAnkaBackprojectTaskPrivate *priv, float tomo_angle)
+determine_x_extrema (gfloat extrema[2], GValueArray *x_extrema, GValueArray *y_extrema,
+                     gfloat tomo_angle, gfloat x_center)
 {
-    priv->tmatrix[0] = cos (tomo_angle);
-    priv->tmatrix[1] = sin (tomo_angle);
-    priv->tmatrix[2] = 0.0f;
-    priv->tmatrix[3] = EXTRACT_FLOAT (priv->center, 0) - EXTRACT_INT (priv->projection_offset, 0);
-    priv->tmatrix[4] = cos (priv->lamino_angle) * sin (tomo_angle);
-    priv->tmatrix[5] = -cos (priv->lamino_angle) * cos(tomo_angle);
-    priv->tmatrix[6] = sin(priv->lamino_angle);
-    priv->tmatrix[7] = EXTRACT_FLOAT (priv->center, 1) - EXTRACT_INT (priv->projection_offset, 1);
+    gfloat sin_tomo, cos_tomo;
+    gint x_min, x_max, y_min, y_max;
+
+    sin_tomo = sin (tomo_angle);
+    cos_tomo = cos (tomo_angle);
+    x_min = EXTRACT_INT (x_extrema, 0);
+    /* The interval is right-opened when OpenCL indices for both x and y are generated, */
+    /* so the last index doesn't count */
+    x_max = EXTRACT_INT (x_extrema, 1) - 1;
+    y_min = EXTRACT_INT (y_extrema, 0);
+    y_max = EXTRACT_INT (y_extrema, 1) - 1;
+
+    if (sin_tomo < 0) {
+        swap (&y_min, &y_max);
+    }
+    if (cos_tomo < 0) {
+        swap (&x_min, &x_max);
+    }
+
+    extrema[0] = cos_tomo * x_min + sin_tomo * y_min + x_center;
+    /* +1 because extrema[1] will be accessed by interpolation
+     * but the region in copying is right-open */
+    extrema[1] = cos_tomo * x_max + sin_tomo * y_max + x_center + 1;
+}
+
+/**
+ * Determine the top and bottom row to read from a projection at given
+ * tomographic and laminographic angles.
+ */
+static void
+determine_y_extrema (gfloat extrema[2], GValueArray *x_extrema, GValueArray *y_extrema,
+                     GValueArray *z_extrema, gfloat tomo_angle, gfloat lamino_angle,
+                     gfloat y_center)
+{
+    gfloat sin_tomo, cos_tomo, sin_lamino, cos_lamino;
+    gint x_min, x_max, y_min, y_max;
+
+    sin_tomo = sin (tomo_angle);
+    cos_tomo = cos (tomo_angle);
+    sin_lamino = sin (lamino_angle);
+    cos_lamino = cos (lamino_angle);
+    x_min = EXTRACT_INT (x_extrema, 0);
+    x_max = EXTRACT_INT (x_extrema, 1) - 1;
+    y_min = EXTRACT_INT (y_extrema, 0);
+    y_max = EXTRACT_INT (y_extrema, 1) - 1;
+
+    if (sin_tomo < 0) {
+        swap (&x_min, &x_max);
+    }
+    if (cos_tomo > 0) {
+        swap (&y_min, &y_max);
+    }
+
+    extrema[0] = sin_tomo * x_min - cos_tomo * y_min;
+    extrema[1] = sin_tomo * x_max - cos_tomo * y_max;
+    extrema[0] = extrema[0] * cos_lamino + EXTRACT_INT (z_extrema, 0) * sin_lamino + y_center;
+    extrema[1] = extrema[1] * cos_lamino + EXTRACT_INT (z_extrema, 1) * sin_lamino + y_center + 1;
+}
+
+/**
+ * clip:
+ * @result: resulting clipped extrema
+ * @extrema: (min, max)
+ * @maximum: projection width or height
+ *
+ * Clip extrema to an allowed interval [0, projection width/height)
+ */
+static void
+clip (gint result[2], gfloat extrema[2], gint maximum)
+{
+    result[0] = (gint) floorf (extrema[0]);
+    result[1] = (gint) ceilf (extrema[1]);
+
+    if (result[0] < 0) {
+        result[0] = 0;
+    }
+    if (result[0] > maximum) {
+        result[0] = maximum;
+    }
+    if (result[1] < 0) {
+        result[1] = 0;
+    }
+    if (result[1] > maximum) {
+        result[1] = maximum;
+    }
+
+    if (result[0] == result[1]) {
+        if (result[1] < maximum) {
+            result[1]++;
+        } else if (result[0] > 0) {
+            result[0]--;
+        } else {
+            g_warning ("Cannot extend");
+        }
+    } else if (result[0] > result[1]) {
+        g_warning ("Invalid extrema: minimum larger than maximum");
+    }
+}
+
+/**
+ * Determine the left and right column to read from a projection at a given
+ * tomographic angle. The result is bound to [0, projection width)
+ */
+static void
+determine_x_region (gint result[2], GValueArray *x_extrema, GValueArray *y_extrema, gfloat tomo_angle,
+                    gfloat x_center, gint width)
+{
+    gfloat extrema[2];
+
+    determine_x_extrema (extrema, x_extrema, y_extrema, tomo_angle, x_center);
+
+    clip (result, extrema, width);
+}
+
+/**
+ * Determine the top and bottom column to read from a projection at given
+ * tomographic and laminographic angles. The result is bound to
+ * [0, projection height).
+ */
+static void
+determine_y_region (gint result[2], GValueArray *x_extrema, GValueArray *y_extrema, GValueArray *z_extrema,
+                    gfloat tomo_angle, gfloat lamino_angle, gfloat y_center, gint height)
+{
+    gfloat extrema[2];
+
+    determine_y_extrema (extrema, x_extrema, y_extrema, z_extrema, tomo_angle,
+                         lamino_angle, y_center);
+    clip (result, extrema, height);
 }
 
 static void
@@ -127,10 +278,19 @@ ufo_anka_backproject_task_setup (UfoTask *task,
 {
     UfoAnkaBackprojectTaskPrivate *priv;
     cl_int cl_error;
+    gint i;
+    gchar vector_kernel_name[30];
+
+    if (!g_sprintf (vector_kernel_name, "backproject_burst_%d", BURST)) {
+        g_warning ("Error making burst kernel name");
+    }
 
     priv = UFO_ANKA_BACKPROJECT_TASK_GET_PRIVATE (task);
     priv->context = ufo_resources_get_context (resources);
-    priv->bp_kernel = ufo_resources_get_kernel (resources, "ankabackproject.cl", "backproject", error);
+    priv->vector_kernel = ufo_resources_get_kernel (resources, "ankabackprojectburst.cl",
+                                                    vector_kernel_name, error);
+    priv->scalar_kernel = ufo_resources_get_kernel (resources, "ankabackprojectburst.cl",
+                                                    "backproject_burst_1", error);
     priv->sampler = clCreateSampler (priv->context,
                                      (cl_bool) FALSE,
                                      CL_ADDRESS_CLAMP,
@@ -139,8 +299,24 @@ ufo_anka_backproject_task_setup (UfoTask *task,
 
     UFO_RESOURCES_CHECK_CLERR (clRetainContext (priv->context));
     UFO_RESOURCES_CHECK_CLERR (cl_error);
-    if (priv->bp_kernel) {
-        UFO_RESOURCES_CHECK_CLERR (clRetainKernel (priv->bp_kernel));
+    if (priv->vector_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clRetainKernel (priv->vector_kernel));
+    }
+    if (priv->scalar_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clRetainKernel (priv->scalar_kernel));
+    }
+
+    for (i = 0; i < BURST; i++) {
+        priv->images[i] = NULL;
+    }
+
+    switch (BURST) {
+        case 1: priv->table_size = sizeof (cl_float); break;
+        case 2: priv->table_size = sizeof (cl_float2); break;
+        case 4: priv->table_size = sizeof (cl_float4); break;
+        case 8: priv->table_size = sizeof (cl_float8); break;
+        case 16: priv->table_size = sizeof (cl_float16); break;
+        default: g_warning ("Unsupported vector size"); break;
     }
 }
 
@@ -153,6 +329,10 @@ ufo_anka_backproject_task_get_requisition (UfoTask *task,
 
     priv = UFO_ANKA_BACKPROJECT_TASK_GET_PRIVATE (task);
 
+    if (!priv->num_projections) {
+        g_warning ("Number of projections has not been set");
+    }
+
     requisition->n_dims = 3;
     requisition->dims[0] = REGION_SIZE (priv->x_region);
     requisition->dims[1] = REGION_SIZE (priv->y_region);
@@ -180,7 +360,7 @@ ufo_anka_backproject_task_equal_real (UfoNode *n1,
 {
     g_return_val_if_fail (UFO_IS_ANKA_BACKPROJECT_TASK (n1) && UFO_IS_ANKA_BACKPROJECT_TASK (n2), FALSE);
 
-    return UFO_ANKA_BACKPROJECT_TASK (n1)->priv->bp_kernel == UFO_ANKA_BACKPROJECT_TASK (n2)->priv->bp_kernel;
+    return UFO_ANKA_BACKPROJECT_TASK (n1)->priv->vector_kernel == UFO_ANKA_BACKPROJECT_TASK (n2)->priv->vector_kernel;
 }
 
 static UfoTaskMode
@@ -198,42 +378,151 @@ ufo_anka_backproject_task_process (UfoTask *task,
     UfoAnkaBackprojectTaskPrivate *priv;
     UfoGpuNode *node;
     UfoProfiler *profiler;
-    gfloat tomo_angle;
+    gfloat tomo_angle, *sines, *cosines;
+    gint i, index;
+    gint cumulate;
+    gsize table_size;
+    gboolean scalar;
     /* regions stripped off the "to" value */
-    gfloat x_region[2], y_region[2], z_region[2];
+    gfloat x_region[2], y_region[2], z_region[2], center[2], sin_lamino, cos_lamino;
+    gint x_copy_region[2], y_copy_region[2];
+    cl_kernel kernel;
     cl_command_queue cmd_queue;
     cl_mem image;
     cl_mem out_mem;
+    cl_int cl_error;
+    /* image creation and copying */
+    size_t im_width, im_height;
+    cl_image_format image_fmt;
+    size_t origin[3];
+    size_t region[3];
+    /* keep the warp size satisfied but make sure the local grid is localized
+     * around a point in 3D for efficient caching */
+    const gint real_size[4] = {requisition->dims[0], requisition->dims[1], requisition->dims[2], 0};
+    const gsize local_work_size[] = {16, 8, 8};
+    gsize global_work_size[3];
+
+    global_work_size[0] = requisition->dims[0] % local_work_size[0] ?
+                          PAD_TO_DIVIDE (requisition->dims[0], local_work_size[0]) :
+                          requisition->dims[0];
+    global_work_size[1] = requisition->dims[1] % local_work_size[1] ?
+                          PAD_TO_DIVIDE (requisition->dims[1], local_work_size[1]) :
+                          requisition->dims[1];
+    global_work_size[2] = requisition->dims[2] % local_work_size[2] ?
+                          PAD_TO_DIVIDE (requisition->dims[2], local_work_size[2]) :
+                          requisition->dims[2];
 
     priv = UFO_ANKA_BACKPROJECT_TASK (task)->priv;
     node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
     cmd_queue = ufo_gpu_node_get_cmd_queue (node);
     out_mem = ufo_buffer_get_device_array (output, cmd_queue);
+    /* TODO: Get host/buffer/image depending on where the data *currently* resides. */
+    /* This Must be made available in UFO core first. */
     image = ufo_buffer_get_device_image (inputs[0], cmd_queue); 
 
+    index = priv->count % BURST;
+    tomo_angle = priv->tomo_angle > -G_MAXFLOAT ? priv->tomo_angle :
+                 priv->overall_angle * priv->count / priv->num_projections;
+    priv->sines[index] = sin (tomo_angle);
+    priv->cosines[index] = cos (tomo_angle);
     x_region[0] = (gfloat) EXTRACT_INT (priv->x_region, 0);
     x_region[1] = (gfloat) EXTRACT_INT (priv->x_region, 2);
-
     y_region[0] = (gfloat) EXTRACT_INT (priv->y_region, 0);
     y_region[1] = (gfloat) EXTRACT_INT (priv->y_region, 2);
-
     z_region[0] = (gfloat) EXTRACT_INT (priv->z_region, 0);
     z_region[1] = (gfloat) EXTRACT_INT (priv->z_region, 2);
+    center[0] = EXTRACT_FLOAT (priv->center, 0) - EXTRACT_INT (priv->projection_offset, 0);
+    center[1] = EXTRACT_FLOAT (priv->center, 1) - EXTRACT_INT (priv->projection_offset, 1);
+    sin_lamino = sinf (priv->lamino_angle);
+    cos_lamino = cosf (priv->lamino_angle);
+    scalar = priv->count >= priv->num_projections / BURST * BURST ? 1 : 0;
+
+    /* copy the image for further usage once BURST images arrived */
+    clGetImageInfo (image, CL_IMAGE_WIDTH, sizeof (size_t), &im_width, NULL);
+    clGetImageInfo (image, CL_IMAGE_HEIGHT, sizeof (size_t), &im_height, NULL);
+
+    /* If COPY_PROJECTION_REGION is True we copy only the part necessary  */
+    /* for a given tomographic and laminographic angle */
+    if (COPY_PROJECTION_REGION) {
+        determine_x_region (x_copy_region, priv->x_region, priv->y_region, tomo_angle,
+                            EXTRACT_FLOAT (priv->center, 0), im_width);
+        determine_y_region (y_copy_region, priv->x_region, priv->y_region, priv->z_region,
+                            tomo_angle, priv->lamino_angle, EXTRACT_FLOAT (priv->center, 1),
+                            im_height);
+        origin[0] = x_copy_region[0];
+        origin[1] = y_copy_region[0];
+        origin[2] = 0;
+        region[0] = x_copy_region[1] - x_copy_region[0];
+        region[1] = y_copy_region[1] - y_copy_region[0];
+    } else {
+        origin[0] = origin[1] = origin[2] = 0;
+        region[0] = im_width;
+        region[1] = im_height;
+    }
+    region[2] = 1;
+
+    if (priv->images[index] == NULL) {
+        clGetImageInfo (image, CL_IMAGE_FORMAT, sizeof (cl_image_format), &image_fmt, NULL);
+        /* TODO: what with the "other" API? */
+        priv->images[index] = clCreateImage2D (priv->context,
+                                               CL_MEM_READ_ONLY,
+                                               &image_fmt,
+                                               im_width,
+                                               im_height,
+                                               0,
+                                               NULL,
+                                               &cl_error);
+        UFO_RESOURCES_CHECK_CLERR (cl_error);
+    }
+    UFO_RESOURCES_CHECK_CLERR (clEnqueueCopyImage (cmd_queue,
+                               image,
+                               priv->images[index],
+                               origin,
+                               origin,
+                               region,
+                               0,
+                               NULL,
+                               NULL));
+
+    if (scalar) {
+        kernel = priv->scalar_kernel;
+        cumulate = priv->count;
+        table_size = sizeof (cl_float);
+        sines = &priv->sines[index];
+        cosines = &priv->cosines[index];
+        i = 1;
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, 0, sizeof (cl_mem), &priv->images[index]));
+    } else {
+        kernel = priv->vector_kernel;
+        cumulate = priv->count + 1 == BURST ? 0 : 1;
+        table_size = priv->table_size;
+        sines = priv->sines;
+        cosines = priv->cosines;
+        i = BURST;
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, index, sizeof (cl_mem), &priv->images[index]));
+    }
 
-    tomo_angle = priv->tomo_angle_is_absolute ? priv->tomo_angle : priv->tomo_angle * priv->count;
-    create_transformation_matrix (priv, tomo_angle);
-
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 0, sizeof (cl_mem), &out_mem));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 1, sizeof (cl_mem), &image));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 2, sizeof (cl_sampler), &priv->sampler));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 3, sizeof (cl_float2), x_region));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 4, sizeof (cl_float2), y_region));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 5, sizeof (cl_float2), z_region));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 6, sizeof (cl_float8), priv->tmatrix));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->bp_kernel, 7, sizeof (cl_uint), (cl_uint *) &priv->count));
-
-    profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
-    ufo_profiler_call (profiler, cmd_queue, priv->bp_kernel, 3, requisition->dims, NULL);
+    if (scalar || index == BURST - 1) {
+        /* Execute the kernel after BURST images have arrived, i.e. we use more
+         * projections at one invocation, so the number of read/writes to the
+         * result is reduced by a factor of BURST. If there are not enough
+         * projecttions left, execute the scalar kernel */
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_sampler), &priv->sampler));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_int3), real_size));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float2), center));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float2), x_region));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float2), y_region));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float2), z_region));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float), &sin_lamino));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, sizeof (cl_float), &cos_lamino));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, table_size, sines));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i++, table_size, cosines));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (kernel, i, sizeof (cl_int), (cl_int *) &cumulate));
+
+        profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
+        ufo_profiler_call (profiler, cmd_queue, kernel, 3, global_work_size, local_work_size);
+    }
 
     priv->count++;
 
@@ -262,6 +551,7 @@ static void
 ufo_anka_backproject_task_finalize (GObject *object)
 {
     UfoAnkaBackprojectTaskPrivate *priv;
+    gint i;
 
     priv = UFO_ANKA_BACKPROJECT_TASK_GET_PRIVATE (object);
     g_value_array_free (priv->x_region);
@@ -270,9 +560,13 @@ ufo_anka_backproject_task_finalize (GObject *object)
     g_value_array_free (priv->projection_offset);
     g_value_array_free (priv->center);
 
-    if (priv->bp_kernel) {
-        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->bp_kernel));
-        priv->bp_kernel = NULL;
+    if (priv->vector_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->vector_kernel));
+        priv->vector_kernel = NULL;
+    }
+    if (priv->scalar_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->scalar_kernel));
+        priv->scalar_kernel = NULL;
     }
     if (priv->context) {
         UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context));
@@ -283,6 +577,13 @@ ufo_anka_backproject_task_finalize (GObject *object)
         priv->sampler = NULL;
     }
 
+    for (i = 0; i < BURST; i++) {
+        if (priv->images[i] != NULL) {
+            UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->images[i]));
+            priv->images[i] = NULL;
+        }
+    }
+
     G_OBJECT_CLASS (ufo_anka_backproject_task_parent_class)->finalize (object);
 }
 
@@ -331,8 +632,11 @@ ufo_anka_backproject_task_set_property (GObject *object,
             g_value_array_free (priv->center);
             priv->center = g_value_array_copy (array);
             break;
-        case PROP_TOMO_ANGLE_IS_ABSOLUTE:
-            priv->tomo_angle_is_absolute = g_value_get_boolean (value);
+        case PROP_NUM_PROJECTIONS:
+            priv->num_projections = g_value_get_uint (value);
+            break;
+        case PROP_OVERALL_ANGLE:
+            priv->overall_angle = g_value_get_float (value);
             break;
         case PROP_TOMO_ANGLE:
             priv->tomo_angle = g_value_get_float (value);
@@ -370,8 +674,11 @@ ufo_anka_backproject_task_get_property (GObject *object,
         case PROP_CENTER:
             g_value_set_boxed (value, priv->center);
             break;
-        case PROP_TOMO_ANGLE_IS_ABSOLUTE:
-            g_value_set_boolean (value, priv->tomo_angle_is_absolute);
+        case PROP_NUM_PROJECTIONS:
+            g_value_set_uint (value, priv->num_projections);
+            break;
+        case PROP_OVERALL_ANGLE:
+            g_value_set_float (value, priv->overall_angle);
             break;
         case PROP_TOMO_ANGLE:
             g_value_set_float (value, priv->tomo_angle);
@@ -451,13 +758,24 @@ ufo_anka_backproject_task_class_init (UfoAnkaBackprojectTaskClass *klass)
                                   float_region_vals,
                                   G_PARAM_READWRITE);
 
-    properties[PROP_TOMO_ANGLE_IS_ABSOLUTE] =
-        g_param_spec_boolean ("tomo-angle-is-absolute",
-                              "Tomographic angle is absolute",
-                              "If TRUE, the value stored in tomo-angle property represents \
-                              an absolute angle, relative otherwise",
-                              FALSE,
-                              G_PARAM_READWRITE);
+    properties[PROP_OVERALL_ANGLE] =
+        g_param_spec_float ("overall-angle",
+                            "Angle covered by all projections",
+                            "Angle covered by all projections (can be negative for negative steps "
+                            "in case only num-projections is specified",
+                            -G_MAXFLOAT,
+                            G_MAXFLOAT,
+                            G_PI,
+                            G_PARAM_READWRITE);
+
+    properties[PROP_NUM_PROJECTIONS] =
+        g_param_spec_uint ("num-projections",
+                          "Number of projections",
+                          "Number of projections",
+                          0,
+                          16384,
+                          0,
+                          G_PARAM_READWRITE);
 
     properties[PROP_TOMO_ANGLE] =
         g_param_spec_float ("tomo-angle",
@@ -514,8 +832,9 @@ ufo_anka_backproject_task_init(UfoAnkaBackprojectTask *self)
         }
     }
 
-    self->priv->tomo_angle_is_absolute = FALSE;
-    self->priv->tomo_angle = 0.0f;
+    self->priv->num_projections = 0;
+    self->priv->overall_angle = G_PI;
+    self->priv->tomo_angle = -G_MAXFLOAT;
     self->priv->lamino_angle = 0.0f;
     self->priv->count = 0;
     self->priv->generated = FALSE;