Różnice

Różnice między wybraną wersją a wersją aktualną.

--- studia:magisterskie:1sem:systemy_rownolegle_i_rozproszone [2016/03/10 11:10]
149.156.112.6
+++ studia:magisterskie:1sem:systemy_rownolegle_i_rozproszone [2016/05/19 11:13] (aktualna)
149.156.112.6 [tmp/opencl_mat_transp/mat_transp/mat_transp.c]
@@ Linia 508: / Linia 508: @@
 ===== Zajęcia 3 =====
 ==== child.c ====
-<code>MPI_Status status;
+<code c># include <stdlib.h>
-  double a = 0.0; // lewy brzeg przedzialu
+# include <stdio.h>
-  double b = 1.0; // prawy brzeg przedzialu
+# include <math.h>
-  int i;
+# include "mpi.h"
+int main ( int argc, char *argv[] );
+void heat_part ( int n, int p, int id, double x_min, double x_max );
+/******************************************************************************/
+int main ( int argc, char *argv[] )
+{
   int id; // rank
   int n;  // liczba punktow dla kazdego wezla
@@ Linia 517: / Linia 526: @@
   double x_max;
   double x_min;
+  MPI_Comm parentcomm;
+  MPI_Status status;
   MPI_Init ( &argc, &argv );
-MPI_Comm_get_parent(&parentcomm);
-if (parentcomm == MPI_COMM_NULL) {
-//rodzic
-}
-else{
-//dziecko
-printf("Proces dzidzia\n");
- MPI_Comm_rank ( MPI_COMM_WORLD, &id );
- MPI_Comm_size ( MPI_COMM_WORLD, &p );
-//double
- MPI_Recv(&n, 1, MPI_INT, 0, 1, parentcomm, &status);
+  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
- MPI_Recv(&x_min, 1, MPI_DOUBLE, 0, 2, parentcomm, &status);
- MPI_Recv(&x_max, 1, MPI_DOUBLE, 0, 3, parentcomm, &status);
+  MPI_Comm_size ( MPI_COMM_WORLD, &p );
+  MPI_Comm_get_parent(&parentcomm);
+  MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status );
+  MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status );
+  MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &status );
+  //MPI_Barrier(MPI_COMM_WORLD);
-// printf("xmin: %lf, xmax: %lf\n",x_min, x_max);
   heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
-}
   MPI_Finalize ( );
- // return p;
+  return 0;
 }
@@ Linia 581: / Linia 590: @@
   for ( i = 0; i <= n + 1; i++ )
   {
     x[i] = ( ( double ) (         i ) * x_max
            + ( double ) ( n + 1 - i ) * x_min )
            / ( double ) ( n + 1     );
   }
@@ Linia 621: / Linia 630: @@
     tag = 2;
-    if ( 0 < id )
+ if ( 0 < id ) {
-    {
+    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
-      // DO UZUPELNIENIA
+  }
-	MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
-    }
+  if ( id < p - 1 ) {
+    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
-    if ( id < p - 1 )
+  }
-    {
-      // DO UZUPELNIENIA
- 	MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
-    }
@@ Linia 637: / Linia 642: @@
     for ( i = 1; i <= n; i++ )
     {
       h_new[i] = h[i] + t_del * (
         k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
         + 2.0 * sin ( x[i] * t ) );
     }
     // nowa chwila czasu
     t = ( ( double ) (         j - j_min ) * t_max
             + ( double ) ( j_max - j         ) * t_min )
             / ( double ) ( j_max     - j_min );
@@ Linia 659: / Linia 664: @@
   // koncowa wymiana informacji z wezlami sasiednimi
   tag = 11;
   if ( id < p - 1 ) {
     MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
   }
   if ( 0 < id ) {
     MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
   }
   tag = 12;
   if ( 0 < id ) {
     // DO UZUPELNIENIA
     MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
   }
   if ( id < p - 1 ) {
     // DO UZUPELNIENIA
     MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
   }
   wtime = MPI_Wtime ( ) - wtime;
@@ Linia 711: / Linia 716: @@
 ==== parent.c ====
-<code>
+<code c>
 # include <stdlib.h>
 # include <stdio.h>
@@ Linia 717: / Linia 722: @@
 # include "mpi.h"
+int main ( int argc, char *argv[] );
 /******************************************************************************/
@@ Linia 722: / Linia 729: @@
 int main ( int argc, char *argv[] )
 {
-	int np = 8;
-	int errcodes[8];
-	MPI_Comm parentcomm, intercomm;
   double a = 0.0; // lewy brzeg przedzialu
   double b = 1.0; // prawy brzeg przedzialu
   int i;
   int id; // rank
   int n;  // liczba punktow dla kazdego wezla
   int p;  // size
+  int np = 8; // liczba procesow
   double x_max;
   double x_min;
+  MPI_Comm komunikator;
+  int *errcodes;
   MPI_Init ( &argc, &argv );
-MPI_Comm_get_parent(&parentcomm);
-if (parentcomm == MPI_COMM_NULL) {
+  //MPI_Comm_rank ( MPI_COMM_WORLD, &id );
-//rodzic
-MPI_Comm_spawn("heat", MPI_ARGV_NULL, np,MPI_INFO_NULL, 0, MPI_COMM_WORLD, &intercomm, errcodes );
-printf("Proces rodzic\n");
- //MPI_Comm_rank ( MPI_COMM_WORLD, &id );
   //MPI_Comm_size ( MPI_COMM_WORLD, &p );
- int p = np;
+  MPI_Comm_spawn("dziecko", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &komunikator, errcodes);
- int j=0;
-for(j=0; j<p; j++){
+  n = 12; // liczba punktow dla kazdego wezla
-  n = 12; // liczba punktow dla kazdego wezla
+  int j;
-  i = 0;  // poczatkowa chwila czasu
+  for(j=0;j<np;j++){
-  // wspolrzedna lewego punktu dla wezla id
+   i = 0;  // poczatkowa chwila czasu
-  x_min = ( ( double )( p * n + 1 - j * n - i ) * a
+    // wspolrzedna lewego punktu dla wezla id
-          + ( double )(             j * n + i ) * b )
+    x_min = ( ( double )( np * n + 1 - j * n - i ) * a
-          / ( double ) ( p * n + 1              );
+          + ( double )(             j * n + i ) * b )
+          / ( double ) ( np * n + 1              );
-  i = n + 1;
+    i = n + 1;
-  // wspolrzedna prawego punktu dla wezla id
+    // wspolrzedna prawego punktu dla wezla id
-  x_max = ( ( double )( p * n + 1 - j * n - i ) * a
+    x_max = ( ( double )( np * n + 1 - j * n - i ) * a
           + ( double )(             j * n + i ) * b )
-          / ( double )( p * n + 1              );
+          / ( double )( np * n + 1              );
+    MPI_Send ( &x_min, 1, MPI_DOUBLE, j, 1, komunikator );
+    MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 2, komunikator );
+    MPI_Send ( &n, 1, MPI_INT, j, 3, komunikator );
-	//printf("x_min %d , x_max %d\n",x_min, x_max);
+  }
-  //heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
-MPI_Send ( &n, 1, MPI_INT, j, 1, intercomm );
-MPI_Send ( &x_min, 1, MPI_DOUBLE, j , 2, intercomm );
-MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 3, intercomm );
-}   }
-else{
-//dziecko
-printf("Proces dzidzia\n");
-}
   MPI_Finalize ( );
- // return p;
+  return 0;
 }
+    struct rekord wys;
+    MPI_Datatype rekord_typ;
+    int tab_dlug_blokow[3] = {1, 1, 1};
+    MPI_Datatype tab_typow[3] = {MPI_DOUBLE, MPI_DOUBLE, MPI_INT};
+    MPI_Aint podstawa, tab_odstepow[3];
+    MPI_Get_address(&wys.x_min, &tab_odstepow[0]);
+    MPI_Get_address(&wys.x_max, &tab_odstepow[1]);
+    MPI_Get_address(&wys.n, &tab_odstepow[2]);
+    MPI_Type_struct(3, tab_dlug_blokow, tab_odstepow, tab_typow, &rekord_typ);
+    MPI_Type_commit(&rekord_typ);
+    MPI_Recv ( &wys, 1, rekord_typ, 0, 1, parentcomm, &status );
 </code>
-==== root_proc.c ====
+====== Zajęcia 4 ======
-<code c># include <stdlib.h>
+wkrótce
-# include <stdio.h>
+====== Zajęcia 5 ======
-# include <math.h>
+==== Makefile ====
-# include "mpi.h"
+<code bash># optimization and other system dependent options
+#include  make.$(SRR_ARCH)
+# or directly
+include  make.lab_404_NVIDIA
-#define NUM_SPAWNS 1
+NAME = Hello_GPU
+program: main.o
+	$(CC) $(LDFL) main.o  $(LIB) -o $(NAME)
+main.o: main.c
+	$(CC) $(CFL) -c main.c  $(INC) -o main.o
+clean:
+	rm -f obj/*
+	rm -f $(NAME)
+</code>
+==== make.lab_404_NVIDIA ====
+<code bash># C compiler
+#CC = icc
+CC = gcc
+# C++ compiler
+CPPC = icpc
+#CPPC = g++
+# Loader (to link C/C++ and Fortran libraries)
+LD = icpc
+#LD = g++
+# Archiver
+AR = ar r
+# For removing files
+RM = rm -f
+# Include directories
+INC = -I/opt/cuda7/include
+# Standard and/or local libraries
+LIB = -L/opt/cuda7/lib64 -lOpenCL
+# C optimization and other flags
+#CFL(icc) = -O3 -openmp
+#CFL(gcc) = -O3 -fopenmp
+#CFL(icc,debug) = -g
+#CFL(gcc) = -g
+CFL =
+# Loader optimization and other flags
+#LDFL(debug) = -g
+#LDFL(profile) = -p
+LDFL =
+</code>
+==== main.c ====
+<code c>#include<stdlib.h>
+#include<stdio.h>
+#include <CL/cl.h>
+// functions to display platform and device properties
+void DisplayPlatformInfo(
+			 cl_platform_id id,
+			 cl_platform_info name,
+			 char* str)
+{
+  cl_int retval;
+  size_t paramValueSize;
+  retval = clGetPlatformInfo(
+			     id,
+			     name,
+,
+			     NULL,
+			     &paramValueSize);
+  if (retval != CL_SUCCESS){
+    printf("Failed to find OpenCL platform %s.\n", str);
+    return;
+  }
+  char * info = (char *)malloc(sizeof(char) * paramValueSize);
+  retval = clGetPlatformInfo(
+			     id,
+			     name,
+			     paramValueSize,
+			     info,
+			     NULL);
+  if (retval != CL_SUCCESS)  {
+    printf("Failed to find OpenCL platform %s.\n", str);
+    return;
+  }
+  printf("\t%s:\t%s\n", str, info );
+  free(info);
+}
+void DisplayDeviceInfo_char(
+		       cl_device_id id,
+		       cl_device_info name,
+		       char* str)
+{
+  cl_int retval;
+  size_t paramValueSize;
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+,
+			   NULL,
+			   &paramValueSize);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  char * info = (char *)malloc(sizeof(char) * paramValueSize);
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+			   paramValueSize,
+			   info,
+			   NULL);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  printf("\t\t%s:\t%s\n", str, info );
+  free(info);
+};
+void DisplayDeviceInfo_ulong(
+		       cl_device_id id,
+		       cl_device_info name,
+		       char* str)
+{
+  cl_int retval;
+  size_t paramValueSize;// = sizeof(cl_ulong);
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+,
+			   NULL,
+			   &paramValueSize);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  cl_ulong * info = (cl_ulong *)malloc(sizeof(cl_ulong) * paramValueSize);
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+			   paramValueSize,
+			   info,
+			   NULL);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  printf("\t\t%s:\t%lu MB\n", str, info[0] / 1024 / 1024 );
+  free(info);
+};
+void DisplayDeviceInfo_uint(
+		       cl_device_id id,
+		       cl_device_info name,
+		       char* str)
+{
+  cl_int retval;
+  size_t paramValueSize;// = sizeof(cl_ulong);
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+,
+			   NULL,
+			   &paramValueSize);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  cl_uint * info = (cl_uint *)malloc(sizeof(cl_uint) * paramValueSize);
+  retval = clGetDeviceInfo(
+			   id,
+			   name,
+			   paramValueSize,
+			   info,
+			   NULL);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to find OpenCL device info %s.\n", str);
+    return;
+  }
+  printf("\t\t%s:\t%d MHz\n", str, info[0] );
+  free(info);
+};
+// main program controlling execution of CPU code and OpenCL kernels
+int main(int argc, char** argv)
+{
+  cl_uint number_of_contexts = 2;
+  cl_context context = NULL;
+  cl_context list_of_contexts[2] = {0,0};
+  cl_command_queue commandQueue = 0;
+  cl_program program = 0;
+  cl_uint number_of_devices;
+  cl_device_id device = 0;
+  cl_device_id *list_of_devices;
+  cl_device_type type;
+  cl_kernel kernel = 0;
+  cl_mem memObjects[3] = { 0, 0, 0 };
+  cl_int retval;
+  int icon, idev;
+  cl_uint numPlatforms;
+  cl_platform_id * platformIds;
+  cl_uint i,j;
+  // flag to control displaying
+  int Monitor = 1;
+  // Create OpenCL contexts
+  // First, query the total number of platforms
+  retval = clGetPlatformIDs(0, (cl_platform_id *) NULL, &numPlatforms);
+  // Next, allocate memory for the installed plaforms, and qeury
+  // to get the list.
+  platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
+  // Then, query the platform IDs
+  retval = clGetPlatformIDs(numPlatforms, platformIds, NULL);
+  if(Monitor>=0){
+    printf("Number of platforms: \t%d\n", numPlatforms);
+  }
+  // Iterate through the list of platforms displaying associated information
+  for (i = 0; i < numPlatforms; i++) {
+    if(Monitor>0){
+      printf("Platform ID - %d\n",i);
+      // First we display information associated with the platform
+      DisplayPlatformInfo(
+			platformIds[i],
+			CL_PLATFORM_NAME,
+			"CL_PLATFORM_NAME");
+      DisplayPlatformInfo(
+			platformIds[i],
+			CL_PLATFORM_PROFILE,
+			"CL_PLATFORM_PROFILE");
+      DisplayPlatformInfo(
+			platformIds[i],
+			CL_PLATFORM_VERSION,
+			"CL_PLATFORM_VERSION");
+      DisplayPlatformInfo(
+			platformIds[i],
+			CL_PLATFORM_VENDOR,
+			"CL_PLATFORM_VENDOR");
+    }
+  }
+  // For the first platform
+  int iplat;
+  printf("Select Platform ID: "); scanf("%d", &iplat);
+  // Query the set of devices associated with the platform
+  retval = clGetDeviceIDs(
+			  platformIds[iplat],
+			  CL_DEVICE_TYPE_ALL,
+,
+			  NULL,
+			  &number_of_devices);
+  list_of_devices =
+    (cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices);
+  retval = clGetDeviceIDs(
+			  platformIds[iplat],
+			  CL_DEVICE_TYPE_ALL,
+			  number_of_devices,
+			  list_of_devices,
+			  NULL);
+  if(Monitor>=0){
+    printf("Number of devices: \t%d\n", number_of_devices);
+  }
+  // Iterate through each device, displaying associated information
+  for (j = 0; j < number_of_devices; j++) {
+    clGetDeviceInfo(list_of_devices[j], CL_DEVICE_TYPE,
+		    sizeof(cl_device_type), &type, NULL);
+    if(Monitor>0){
+      DisplayDeviceInfo_char(
+			list_of_devices[j],
+			CL_DEVICE_NAME,
+			"CL_DEVICE_NAME");
+      DisplayDeviceInfo_char(
+			list_of_devices[j],
+			CL_DEVICE_VENDOR,
+			"CL_DEVICE_VENDOR");
+      DisplayDeviceInfo_char(
+			list_of_devices[j],
+			CL_DEVICE_VERSION,
+			"CL_DEVICE_VERSION");
+      DisplayDeviceInfo_ulong(
+			list_of_devices[j],
+			CL_DEVICE_GLOBAL_MEM_SIZE,
+			"CL_DEVICE_GLOBAL_MEM_SIZE");
+      DisplayDeviceInfo_uint(
+			list_of_devices[j],
+			CL_DEVICE_MAX_CLOCK_FREQUENCY,
+			"CL_DEVICE_MAX_CLOCK_FREQUENCY");
+      printf("\n");
+    }
+  }
+  // Next, create OpenCL contexts on platforms
+  cl_context_properties contextProperties[] = {
+    CL_CONTEXT_PLATFORM,
+    (cl_context_properties)platformIds[iplat],
+
+  };
+  if(Monitor>0){
+    printf("Creating CPU context %d on platform %d\n", 1, iplat);
+  }
+  list_of_contexts[1] =
+    clCreateContextFromType(contextProperties,
+			    CL_DEVICE_TYPE_CPU, NULL, NULL, &retval);
+  if(Monitor>=0 && retval != CL_SUCCESS){
+    printf("Could not create CPU context on platform %d\n", i);
+  }
+  if(Monitor>0){
+    printf("Creating GPU context 0 on platform %d\n", iplat);
+  }
+  list_of_contexts[0] =
+    clCreateContextFromType(contextProperties,
+			    CL_DEVICE_TYPE_GPU, NULL, NULL, &retval);
+  if(Monitor>=0 && retval != CL_SUCCESS){
+    printf("Could not create GPU context on platform %d\n", i);
+  }
+  // in a loop over devices of the seleceted platform
+  for(idev=0; idev<number_of_devices;idev++){
+    if(Monitor>0){
+      printf("\nFor context %d and device %d:\n",
+	     idev, idev);
+    }
+    device = list_of_devices[idev];
+    icon = idev;
+    // choose OpenCL context on first available platform
+    context = list_of_contexts[icon];
+    if(context !=0){
+      commandQueue = clCreateCommandQueue(context, device, 0, NULL);
+      if (commandQueue == NULL) {
+	printf("Failed to create commandQueue for device %d\n", idev);
+	exit(0);
+      }
+      if(Monitor>0){
+	printf("Reading program from source\n");
+      }
+      // read source code from file
+      FILE *fp;
+      char* source;
+      long int size;
+      fp = fopen("HelloWorld.cl", "rb");
+      if(!fp) {
+	printf("Could not open kernel file\n");
+	exit(-1);
+      }
+      int status = fseek(fp, 0, SEEK_END);
+      if(status != 0) {
+	printf("Error seeking to end of file\n");
+	exit(-1);
+      }
+      size = ftell(fp);
+      if(size < 0) {
+	printf("Error getting file position\n");
+	exit(-1);
+      }
+      rewind(fp);
+      source = (char *)malloc(size + 1);
+      int i;
+      for (i = 0; i < size+1; i++) {
+	source[i]='\0';
+      }
+      if(source == NULL) {
+	printf("Error allocating space for the kernel source\n");
+	exit(-1);
+      }
+      fread(source, 1, size, fp);
+      source[size] = '\0';
+      const char* source_const = source;
+      cl_program program = clCreateProgramWithSource(context, 1,
+						     &source_const,
+						     NULL, NULL);
+      if (program == NULL)
+	{
+	  printf("Failed to create CL program from source.\n");
+	  exit(-1);
+	}
+      if(Monitor>0){
+	printf("Creating program and kernel\n");
+      }
+      // build program (passing options to compiler if necessary
+      retval = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+      char* buildLog; size_t size_of_buildLog;
+      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+, NULL, &size_of_buildLog);
+      buildLog = malloc(size_of_buildLog+1);
+      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+			    size_of_buildLog, buildLog, NULL);
+      buildLog[size_of_buildLog]= '\0';
+      printf("Kernel buildLog: %s\n", buildLog);
+      if (retval != CL_SUCCESS)
+	{
+	  printf("Error in kernel\n");
+	  clReleaseProgram(program);
+	  exit(-1);
+	}
+      // Create OpenCL kernel
+      kernel = clCreateKernel(program, "hello_kernel", NULL);
+      if (kernel == NULL)
+	{
+	  printf("Failed to create kernel.\n");
+	  exit(0);
+	}
+      if(Monitor>0){
+	printf("Creating memory objects\n");
+      }
+      // Create memory objects that will be used as arguments to
+      // kernel.  First create host memory arrays that will be
+      // used to store the arguments to the kernel
+      float result[1];
+      float a[1];
+      float b[1];
+      a[0] = 2;
+      b[0] = 2;
+      memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
+				     sizeof(float), NULL, NULL);
+      memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
+				     sizeof(float), NULL, NULL);
+      memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
+				     sizeof(float), NULL, NULL);
+      if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){
+	printf("Error creating memory objects.\n");
+	return 0;
+      }
+      if(Monitor>0){
+	printf("Sending kernel arguments\n");
+      }
+      retval = clEnqueueWriteBuffer(
+				    commandQueue,
+				    memObjects[0],
+				    CL_FALSE,
+,
+				    sizeof(float),
+				    a,
+,
+				    NULL,
+				    NULL);
+      // Use clEnqueueWriteBuffer() to write input array B to
+      // the device buffer bufferB
+      retval = clEnqueueWriteBuffer(
+				    commandQueue,
+				    memObjects[1],
+				    CL_FALSE,
+,
+				    sizeof(float),
+				    b,
+,
+				    NULL,
+				    NULL);
+      // Set the kernel arguments (result, a, b)
+      retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+      retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+      retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+      if (retval != CL_SUCCESS)
+	{
+	  printf("Failed to Set the kernel arguments.\n");
+	  //Cleanup(context, commandQueue, program, kernel, memObjects);
+	  return 1;
+	}
+      if(Monitor>0){
+	printf("Running the kernel!\n");
+      }
+      size_t globalWorkSize[1] = { 1 };
+      size_t localWorkSize[1] = { 1 };
+      // Queue the kernel up for execution across the array
+      retval = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
+				      globalWorkSize, localWorkSize,
+, NULL, NULL);
+      if (retval != CL_SUCCESS)
+	{
+	  printf("Failed to queue kernel for execution.\n");
+	  //Cleanup(context, commandQueue, program, kernel, memObjects);
+	  return 1;
+	}
+      if(Monitor>0){
+	printf("Transfering back results\n");
+      }
+      // Read the output buffer back to the Host
+      retval = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
+, sizeof(float), result,
+, NULL, NULL);
+      if (retval != CL_SUCCESS)
+	{
+	  printf("Failed to read result buffer.\n");
+	  //Cleanup(context, commandQueue, program, kernel, memObjects);
+	  return 1;
+	}
+      // Verify the output
+      if(result[0]==4)  {
+        printf("Output is correct: %lf + %lf = %lf\n",
+	       a[0], b[0], result[0]);
+      } else {
+        printf("Output is incorrect: %lf + %lf != %lf\n",
+	       a[0], b[0], result[0]);
+      }
+      for (i = 0; i < 3; i++)
+      	{
+      	  if (memObjects[i] != 0)
+            clReleaseMemObject(memObjects[i]);
+      	}
+      if (commandQueue != 0)
+        clReleaseCommandQueue(commandQueue);
+      if (kernel != 0)
+        clReleaseKernel(kernel);
+      if (program != 0)
+        clReleaseProgram(program);
+    }
+  }
+  free(list_of_devices);
+  free(platformIds);
+  return 0;
+}
+</code>
+==== HelloWorld.cl ====
+<code c>
+__kernel void hello_kernel(__global const float *a,
+		           __global const float *b,
+			   __global float *result)
+{
+    int gid = get_global_id(0);
+    if(gid<1) result[gid] = a[gid] + b[gid];
+}</code>
+===== Zajęcia 6 =====
+Pliki zadania: {{:studia:magisterskie:1sem:opencl_vecadd.tgz|}}
+Działające 3 kernele: {{:studia:magisterskie:1sem:sala404.zip|}}
+==== opencl_vecadd/vecadd_host/execute_kernels.c ====
+<code c>#include<stdlib.h>
+#include<stdio.h>
+#include <math.h>
+#include<omp.h>
+#include <CL/cl.h>
+#include"uth_ocl_intf.h"
+#include"./vecadd_host.h"
+#include"./hardware_params.h"
+#define time_measurments
+#ifdef time_measurments
+#include"uth_time_intf.h"
+  static double t_begin, t_end, t_total;
+#endif
+int execute_kernel_2_CPU(
+			 int platform_index,
+			 int device_index,
+			 int kernel_index,
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
+{
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for CPU
+  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
+				  // kernel name:         , file:
+				  "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. Restoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  int nr_cores_CPU = omp_get_num_procs();
+  int nr_threads = nr_cores_CPU; // total number of threads
+  int work_group_size = 1; // number of threads in work-group
+  int nr_work_groups = nr_threads / work_group_size;
+  globalWorkSize[0] = nr_threads ;
+  localWorkSize[0] = work_group_size ;
+  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	 nr_cores_CPU, nr_cores_CPU, 1);
+  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	 work_group_size, nr_threads / work_group_size,
+	 nr_threads / work_group_size / nr_cores_CPU);
+  printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
+	 globalWorkSize[0]/nr_cores_CPU);
+  printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  int size = array_size;
+  int size_per_thread = ceilf((float)size/nr_threads);
+  // Set the kernel arguments (result, a, b)
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
+  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to Set the kernel arguments.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  if(monitor>UTC_BASIC_INFO){
+    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
+int execute_kernel_3_CPU(
+			 int platform_index,
+			 int device_index,
+			 int kernel_index,
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
+{
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for CPU
+  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
+				  // kernel name:         , file:
+				  "vecadd_3_opt_blocks_kernel","vecadd_3_opt_blocks.cl",monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  int nr_cores_CPU = omp_get_num_procs();
+  int nr_threads = nr_cores_CPU; // total number of threads
+  int work_group_size = 1; // number of threads in work-group
+  int nr_work_groups = nr_threads / work_group_size;
+  globalWorkSize[0] = nr_threads ;
+  localWorkSize[0] = work_group_size ;
+  printf("\t\tExecuting kernel %d, on platform %d and device %d\n",
+	 kernel_index, platform_index, device_index);
+  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	 nr_cores_CPU, nr_cores_CPU, 1);
+  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	 work_group_size, nr_threads / work_group_size,
+	 nr_threads / work_group_size / nr_cores_CPU);
+  printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
+	 globalWorkSize[0]/nr_cores_CPU);
+  printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  int size = array_size;
+  // Set the kernel arguments (result, a, b)
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
+  if (retval != CL_SUCCESS) {
+    printf("Failed to Set the kernel arguments.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    return 1;
+  }
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
+// tutaj modyfikujemy
+int execute_kernel_0_GPU(
+			 int platform_index,
+			 int device_index,
+			 int kernel_index,
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
+{
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for GPU
+    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				    // kernel name:         , file:
+				    "vecadd_0_kernel", "vecadd_0.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  // globalna liczba wątków i lokalna liczba wątków
+  int work_group_size = WORK_GROUP_SIZE;
+  globalWorkSize[0] = array_size;
+  localWorkSize[0] = work_group_size;
+  if(monitor>UTC_BASIC_INFO){
+    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
+    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
+	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
+    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+  }
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  // Set the kernel arguments (result, a, b)
+  // ustawiamy liczbę argumentów
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  if (retval != CL_SUCCESS) {
+      printf("Failed to Set the kernel arguments.\n");
+      //Cleanup(context, command_queue, program, kernel, memObjects);
+      return 1;
+    }
-int main ( int argc, char *argv[] );
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
-/******************************************************************************/
+int execute_kernel_1_GPU(
+			 int platform_index,
+			 int device_index,
+			 int kernel_index,
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
+{
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for GPU
+/*
+    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				    // kernel name:         , file:
+				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
+    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+ 				    // kernel name:         , file: */
+ 				    "vecadd_1_kernel", "vecadd_1.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  int work_group_size = WORK_GROUP_SIZE;
+  globalWorkSize[0] = array_size / 4;
+  localWorkSize[0] = work_group_size;
+  if(monitor>UTC_BASIC_INFO){
+    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
+    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
+	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
+    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+  }
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  // Set the kernel arguments (result, a, b)
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  if (retval != CL_SUCCESS) {
+      printf("Failed to Set the kernel arguments.\n");
+      //Cleanup(context, command_queue, program, kernel, memObjects);
+      return 1;
+    }
-int main ( int argc, char *argv[] )
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
+int execute_kernel_2_GPU(
+			 int platform_index,
+			 int device_index,
+			 int kernel_index,
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
 {
-	int np = NUM_SPAWNS;
-	int errcodes[NUM_SPAWNS];
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for GPU
+/*    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				    // kernel name:         , file:
+				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
+     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+ 				    // kernel name:         , file: */
+ 				    "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  int work_group_size = WORK_GROUP_SIZE;
+  globalWorkSize[0] = array_size / work_group_size;
+  localWorkSize[0] = work_group_size;
-	MPI_Comm parentcomm, intercomm;
+  int size_per_thread = work_group_size;
+  int size = array_size;
+  if(monitor>UTC_BASIC_INFO){
+    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
+    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
+	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
+    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+  }
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  // Set the kernel arguments (result, a, b)
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
+  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
+  if (retval != CL_SUCCESS) {
+      printf("Failed to Set the kernel arguments.\n");
+      //Cleanup(context, command_queue, program, kernel, memObjects);
+      return 1;
+    }
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
-	MPI_Init(&argc, &argv);
-	MPI_Comm_get_parent(&parentcomm);
-	if (parentcomm == MPI_COMM_NULL) {
+int execute_kernel_3_GPU(
-		MPI_Comm_spawn("root_proc", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &intercomm, errcodes);
+			 int platform_index,
-	printf("proces rodzic 1\n");
+			 int device_index,
-	} else {
+			 int kernel_index,
-		printf("Proces dziecko 1\n");
+			 int array_size,
+			 cl_mem* memObjects,
+			 int monitor
+			 )
+{
+  cl_kernel kernel;
+  cl_event kernel_execution;
+  cl_ulong startTime;
+  cl_ulong endTime;
+  double exec_time;
+  cl_int retval;
+  size_t globalWorkSize[1] = { 0 };
+  size_t localWorkSize[1] = { 0 };
+  int i,j,k;
+  int size = array_size;
+  /*----------------KERNEL CREATION PHASE----------------------*/
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the kernel for GPU
+/*    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				    // kernel name:         , file:
+				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
+     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+ 				    // kernel name:         , file:
+				    "vecadd_3_opt_blocks_kernel", "vecadd_3_opt_blocks.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
+#endif
+  // choose the context
+  cl_context context = utr_ocl_select_context(platform_index, device_index);
+  // choose the command queue
+  cl_command_queue command_queue =
+    utr_ocl_select_command_queue(platform_index, device_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
+	   platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL){
+    printf("failed to restore context and command queue for platform %d, device %d\n",
+	   platform_index, device_index);
+    printf("%lu %lu\n", context, command_queue);
+  }
+  int NDRange_dim = 1;
+  int work_group_size = WORK_GROUP_SIZE;
+  globalWorkSize[0] = array_size;
+  localWorkSize[0] = work_group_size;
+  if(monitor>UTC_BASIC_INFO){
+    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
+	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
+    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
+	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
+	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
+    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
+	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
+	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
+  }
+  // choose the kernel
+  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+  if(monitor>UTC_BASIC_INFO){
+    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	   kernel_index, platform_index, device_index);
+  }
+  if(context == NULL || command_queue == NULL || kernel == NULL){
+    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	   platform_index, device_index, kernel_index);
+    printf("context %lu, command queue %lu, kernel %lu\n",
+	   context, command_queue, kernel);
+  }
+  // Set the kernel arguments (result, a, b)
+  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
+  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
+  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
+  if (retval != CL_SUCCESS) {
+      printf("Failed to Set the kernel arguments.\n");
+      //Cleanup(context, command_queue, program, kernel, memObjects);
+      return 1;
+    }
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_begin = time_clock();
+#endif
+  // Queue the kernel up for execution across the array
+  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+				  globalWorkSize, localWorkSize,
+, NULL, &kernel_execution);
+  clWaitForEvents(1, &kernel_execution);
+#ifdef time_measurments
+  clFinish(command_queue);
+  t_end = time_clock();
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(kernel_execution,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
+  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
+	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
+  printf("\tNumber of operations %d, performance %lf GFlops\n",
+	 array_size, array_size / exec_time * 1e-9);
+  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
+*array_size*sizeof(float)*1e-9,
+*array_size*sizeof(float)/exec_time*1e-9);
+#endif
+  if (retval != CL_SUCCESS) {
+    printf("Failed to queue kernel for execution.\n");
+    //Cleanup(context, command_queue, program, kernel, memObjects);
+    exit(-1);
+  }
+  return(0);
+}
+/*     // create the fourth kernel for GPU */
+/*     kernel_index = 4;  */
+/*     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, */
+/* 				    // kernel name:         , file: */
+/* 				    "vecadd_4_opt_blocks_vect_kernel", "vecadd_4_opt_blocks_vect.cl", monitor); */</code>
+==== opencl_vecadd/vecadd_2_blocks.cl ====
+<code c>
+__kernel void vecadd_2_blocks_kernel(__global const float *a,
+				     __global const float *b,
+				     __global float *result,
+		                     const int size,
+				     const int size_per_thread)
+{
+  int gid = get_global_id(0);
+  int index_start = gid * size_per_thread;
+  int index_end = (gid+1) * size_per_thread;
+  if (gid == 0) {
+	//printf("size: %d, %d", size, size_per_thread);
+  }
+  for (int i=index_start; i < index_end && i < size; i++) {
+  //for (int i=index_start; i < index_end ; i++) {
+    result[i] = a[i]+b[i];
+  }
+}</code>
+===== Zajęcia 7 =====
+{{:studia:magisterskie:1sem:zajecia_7.zip|}}
+==== tmp/opencl_mat_transp/mat_transp_1.cl ====
+<code c> #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+__kernel void mat_transp_1_kernel(
+  __global float *A ,
+  __global float *B ,
+  __global float *C ,
+  int N)
+{
+int id = get_global_id(0) * get_global_size(0) + get_global_id(1);
+float temp = 0.0;
+for(int i = 0; i<N;i++){
+	temp += A[id*N + i] *B[i];
+}
+C[id] = temp;
+}</code>
+==== tmp/opencl_mat_transp/mat_transp/mat_transp.c ====
+<code c>#include<stdlib.h>
+#include<stdio.h>
+#include <math.h>
+#include <CL/cl.h>
+#include"uth_ocl_intf.h"
+#define time_measurments
+#ifdef time_measurments
+#include"uth_time_intf.h"
+  static double t_begin, t_end, t_total;
+#endif
+#define BLOCK_SIZE 16
+#define NR_GROUPS 16
+#define MULT 16
+#define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT)
+#define ROZMIAR (WYMIAR*WYMIAR)
+// Matrices are stored in row-major order:
+// M(row, col) = M( row * WYMIAR + col )
+int verify_result(
+  float* result,
+  float* result_compare
+		  )
+{
+  // Verify the result
+  int result_OK = 1;
+  int i,j;
+  for(i = 0; i < WYMIAR; i++) {
+      if(fabs(result[i] - result_compare[i])>1.e-6) {
+	result_OK = 0;
+	break;
+    }
+  }
+  printf("\t\t6. verifying results: ");
+  if(result_OK) {
+    printf("Output is correct\n");
+  } else {
+    printf("Output is incorrect\n");
+    j = 0;
+    for(i = 0; i < WYMIAR, j<100; i++) {
+	if(fabs(result[i] - result_compare[i])>1.e-9) {
+	  j++;
+	  printf("%d %16.8f %16.8f\n",
+	  	 i, result[i], result_compare[i]);
 	}
-	fflush(stdout);
+    }
-	MPI_Finalize();
+    exit(0);
+  }
+    /* for(i = 0; i < length; i++) { */
+    /*   printf("%16.8f %16.8f\n", result[i], result_compare[i]); */
+    /* } */
+  return(result_OK);
+}
+  /*----------------KERNEL CREATION PHASE----------------------*/
+void create_kernels()
+{
+  // for all operations indicate explicit info messages
+  int monitor = UTC_BASIC_INFO + 1;
+  int kernel_index;
+  int platform_index = utv_ocl_struct.current_platform_index;
+  if(utr_ocl_CPU_context_exists(platform_index)){
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the first kernel for CPU
+  /* kernel_index = 0;  */
+  /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, */
+  /* 				  // kernel name:         , file: */
+  /* 				  "mat_transp_1_kernel", "mat_transp_1.cl", monitor); */
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
+#endif
+  }
+  if(utr_ocl_GPU_context_exists(platform_index)){
+#ifdef time_measurments
+  t_begin = time_clock();
+#endif
+  // create the first kernel for GPU
+  kernel_index = 0;
+  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				  // kernel name:         , file:
+				  "mat_transp_1_kernel", "mat_transp_1.cl", monitor);
+#ifdef time_measurments
+  t_end = time_clock();
+  printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
+#endif
+  // create the second kernel for GPU
+  kernel_index = 1;
+  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
+				  // kernel name:         , file:
+				  "mat_transp_2_kernel", "mat_transp_2.cl", monitor);
+  }
+}
+  /*----------------EXECUTION PHASE----------------------*/
+int execute_kernels()
+{
+  // for all operations indicate explicit info messages
+  int monitor = UTC_BASIC_INFO + 1;
+  int platform_index = utv_ocl_struct.current_platform_index;
+  utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index];
+  int kernel_index;
+  int i,j,n;
+  double nr_access;
+  double t1,t2;
+  if(monitor>UTC_BASIC_INFO){
+    printf("\n------------Starting execution phase----------------\n");
+  }
+  // create matrices
+  n=WYMIAR;
+  float* A = (float *) malloc(ROZMIAR*sizeof(float));
+  float* B = (float *) malloc(WYMIAR*sizeof(float));
+  float* C = (float *) malloc(WYMIAR*sizeof(float));
+  float* D = (float *) malloc(WYMIAR*sizeof(float));
+  for(i=0;i<ROZMIAR;i++) A[i]=1.0*i/10000000.0;
+  for(i=0;i<WYMIAR;i++) B[i]=1.0*i/10000000.0;
+  // mnozenie macierz wektor
+  for(i=0; i<WYMIAR; i++){
+	for(j=0; j<WYMIAR; j++){
+		D[i] += A[i*WYMIAR + j] * B[j];
+	}
+  }
+  nr_access= ROZMIAR + 2.0*WYMIAR; // read + write
+  printf("mat_transp: nr_access %lf\n", nr_access);
+  // get hardware characteristics to select good matrix shape
+  // the set of device characteristics stored in data structure
+  int device_index = 0;
+  utt_ocl_device_struct device_struct =
+    utv_ocl_struct.list_of_platforms[platform_index].list_of_devices[device_index];
+  double global_mem_bytes = device_struct.global_mem_bytes;
+  double global_max_alloc = device_struct.global_max_alloc;
+  double local_mem_bytes = device_struct.local_mem_bytes;
+  double constant_mem_bytes = device_struct.constant_mem_bytes;
+  int max_num_comp_units = device_struct.max_num_comp_units;
+  int max_work_group_size = device_struct.max_work_group_size;
+  // in a loop over devices (or for a selected device)
+  int idev=0;
+  for(idev=0; idev<platform_struct.number_of_devices; idev++){
+    // int device_type = .....
+    // choose device_index
+    // int device_index = utr_ocl_select_device(platform_index, device_type);
+    int device_index = idev;
+    int device_type = utr_ocl_device_type(platform_index, device_index);
+    if(device_index>0 && device_type==utr_ocl_device_type(platform_index, device_index-1)) break;
+    if(device_type == UTC_OCL_DEVICE_CPU) break;
+    // choose the context
+    cl_context context = utr_ocl_select_context(platform_index, device_index);
+    // choose the command queue
+    cl_command_queue command_queue =
+      utr_ocl_select_command_queue(platform_index, device_index);
+    if(monitor>UTC_BASIC_INFO){
+      printf("\nExecution: \t0. restoring context and command queue for platform %d and device %d\n",
+	     platform_index, device_index);
+    }
+    if(context == NULL || command_queue == NULL){
+      printf("failed to restore context and command queue for platform %d, device %d\n",
+	     platform_index, device_index);
+      printf("%lu %lu\n", context, command_queue);
+    }
+    // choose the kernel
+    kernel_index = 0;
+    cl_kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
+    if(monitor>UTC_BASIC_INFO){
+      printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
+	     kernel_index, platform_index, device_index);
+    }
+    if(context == NULL || command_queue == NULL || kernel == NULL){
+      printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
+	     platform_index, device_index, kernel_index);
+      printf("context %lu, command queue %lu, kernel %lu\n",
+	     context, command_queue, kernel);
+    }
+    for(i=0;i<WYMIAR;i++) C[i]=0.0;
+    time_init(); t1 = time_clock();
+    // call routine to perform matrix transposition
+    mat_transp_host(kernel_index, A, B, C, n, context, kernel, command_queue);
+    t2 = time_clock(); time_print();
+    printf("GB/s = %lf\n\n", nr_access*sizeof(float)/(t2-t1)/1024/1024/1024);
+    // verify result
+    verify_result(C, D);
+  } // end loop over devices
+  return(0);
+}
+// Matrix transposition - Host code
+// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
+int mat_transp_host(
+  int kernel_index,
+  float* A,
+  float* B,
+  float* C,
+  int N,
+  const cl_context context,
+  const cl_kernel mat_transp_kernel,
+  const cl_command_queue queue
+		)
+{
+// Load A to device memory
+  size_t size_bytesm = N*N*sizeof(float);
+  size_t size_bytesw = N*sizeof(float);
+  cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY,
+				size_bytesm, NULL, NULL);
+  // Write A to device memory
+  clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, size_bytesm, A, 0, 0, 0);
+  cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY,
+				size_bytesw, NULL, NULL);
+  // Write A to device memory
+  clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, size_bytesw, B, 0, 0, 0);
+  // Allocate B in device memory
+  cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size_bytesw, NULL, NULL);
+  // Invoke kernel
+  clSetKernelArg(mat_transp_kernel, 0, sizeof(cl_mem), (void*)&d_A);
+  clSetKernelArg(mat_transp_kernel, 1, sizeof(cl_mem), (void*)&d_B);
+  clSetKernelArg(mat_transp_kernel, 2, sizeof(cl_mem), (void*)&d_C);
+  clSetKernelArg(mat_transp_kernel, 3, sizeof(int), (void*)&N);
+  size_t localWorkSize[3];
+  size_t globalWorkSize[3];
+  cl_uint work_dim;
+  if(kernel_index==3){
+    work_dim = 2;
+    localWorkSize[0] = BLOCK_SIZE;
+    globalWorkSize[0] = N/MULT;
+    localWorkSize[1] = BLOCK_SIZE;
+    globalWorkSize[1] = N/MULT;
+    localWorkSize[2] = 0;
+    globalWorkSize[2] = 0;
+  }
+  else{
+    work_dim = 2;
+    localWorkSize[0] = BLOCK_SIZE;
+    globalWorkSize[0] = sqrt(N);
+    localWorkSize[1] = BLOCK_SIZE;
+    globalWorkSize[1] = sqrt(N);
+    localWorkSize[2] = 0;
+    globalWorkSize[2] = 0;
+  }
+  clFinish(queue);
+  double t1 = time_clock();
+  // Enqueue a kernel run call
+  cl_event ndrEvt;
+  clEnqueueNDRangeKernel(queue, mat_transp_kernel, work_dim, 0,
+			 globalWorkSize, localWorkSize, 0, 0, &ndrEvt);
+  clWaitForEvents(1, &ndrEvt);
+  clFinish(queue);
+  double t2 = time_clock();
+  // Calculate performance
+  cl_ulong startTime;
+  cl_ulong endTime;
+  // Get kernel profiling info
+  clGetEventProfilingInfo(ndrEvt,
+			  CL_PROFILING_COMMAND_START,
+			  sizeof(cl_ulong),
+			  &startTime,
+);
+  clGetEventProfilingInfo(ndrEvt,
+			  CL_PROFILING_COMMAND_END,
+			  sizeof(cl_ulong),
+			  &endTime,
+);
+  double time = (double)endTime - (double)startTime;
+  printf("\nKernel execution internal: time %lf, GB/s = %lf\n",
+	 time*1.0e-9, 2.0*N*N*sizeof(float)/(time*1.0e-9)/1024/1024/1024);
+  printf("Kernel execution external: time %lf,  GB/s = %lf\n\n",
+	 t2-t1, 2.0*N*N*sizeof(float)/(t2-t1)/1024/1024/1024);
+  // Read B from device memory
+  clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, size_bytesw, C, 0, 0, 0);
+  // Free device memory
+  clReleaseMemObject(d_A);
+  clReleaseMemObject(d_B);
+  clReleaseMemObject(d_C);
+  return(0);
 }</code>
+==== Kod wogu ====
+=== server.c ====
+<code c>
+/*
+    C socket server example, handles multiple clients using threads
+    Compile
+    gcc server.c -lpthread -o server
+*/
+#include<stdio.h>
+#include<string.h>    //strlen
+#include<stdlib.h>    //strlen
+#include<sys/socket.h>
+#include<arpa/inet.h> //inet_addr
+#include<unistd.h>    //write
+#include<pthread.h> //for threading , link with lpthread
+#include <unistd.h>
+//the thread function
+void *connection_handler(void *);
+int main(int argc , char *argv[])
+{
+    int counter = 0;
+    pid_t pid = fork();
+    if (pid == 0)
+    {
+       	    int sock;
+	    struct sockaddr_in server;
+	    char message[1000] , server_reply[2000];
+            int counter;
+	    unsigned int ip[] = { 192, 168, 102, 1 };
+            char* ip_final;
+		char *buf;
+		size_t sz;
+	    //Create socket
+            for ( counter = 0; counter < 255; ++counter ) {
+		    sock = socket(AF_INET , SOCK_STREAM , 0);
+		    if (sock == -1)
+		    {
+			printf("Could not create socket");
+		    }
+		    puts("Socket created");
+		    //printf("192.168.102.%d", counter);
+		sz = snprintf(NULL, 0, "192.168.102.%d", counter);
+		buf = (char *)malloc(sz + 1); /* make sure you check for != NULL in real code */
+		    server.sin_addr.s_addr = inet_addr(buf);
+		    server.sin_family = AF_INET;
+		    server.sin_port = htons( 65001 );
+		    //Connect to remote server
+		    if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
+		    {
+			perror("connect failed. Error");
+			continue;
+		    }
+		    puts("Connected\n");
+		    //keep communicating with server
+		    while(1)
+		    {
+			//Send some data
+			if( send(sock , "Werset 6-client" , 15 , 0) < 0)
+			{
+			    puts("Send failed");
+			    return 1;
+			}
+			//Receive a reply from the server
+			if( recv(sock , server_reply , 2000 , 0) < 0)
+			{
+			    puts("recv failed");
+			    break;
+			}
+			puts(server_reply);
+			break;
+		    }
+		    close(sock);
+           }
+    }
+    else if (pid > 0)
+    {
+        int socket_desc , client_sock , c;
+	    struct sockaddr_in server , client;
+	    //Create socket
+	    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
+	    if (socket_desc == -1)
+	    {
+		printf("Could not create socket");
+	    }
+	    puts("Socket created");
+	    //Prepare the sockaddr_in structure
+	    server.sin_family = AF_INET;
+	    server.sin_addr.s_addr = INADDR_ANY;
+	    server.sin_port = htons( 65001 );
+	    //Bind
+	    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
+	    {
+		//print the error message
+		perror("bind failed. Error");
+		return 1;
+	    }
+	    puts("bind done");
+	    //Listen
+	    listen(socket_desc , 3);
+	    //Accept and incoming connection
+	    puts("Waiting for incoming connections...");
+	    c = sizeof(struct sockaddr_in);
+	    //Accept and incoming connection
+	    puts("Waiting for incoming connections...");
+	    c = sizeof(struct sockaddr_in);
+		pthread_t thread_id;
+	    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
+	    {
+		puts("Connection accepted");
+		if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
+		{
+		    perror("could not create thread");
+		    return 1;
+		}
+		//Now join the thread , so that we dont terminate before the thread
+		//pthread_join( thread_id , NULL);
+		puts("Handler assigned");
+	    }
+	    if (client_sock < 0)
+	    {
+		perror("accept failed");
+		return 1;
+	    }
+    }
+    else
+    {
+        // fork failed
+        printf("fork() failed!\n");
+        return 1;
+    }
+    return 0;
+}
+/*
+ * This will handle connection for each client
+ * */
+void *connection_handler(void *socket_desc)
+{
+    //Get the socket descriptor
+    int sock = *(int*)socket_desc;
+    int read_size;
+    char *message , client_message[2000];
+    //Receive a message from client
+    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
+    {
+	printf("%s\n", client_message);
+	//end of string marker
+	client_message[read_size] = '\0';
+	//Send the message back to client
+	write(sock , "Werset 6" , 8);
+	//clear the message buffer
+	memset(client_message, 0, 2000);
+    }
+    if(read_size == 0)
+    {
+        puts("Client disconnected");
+        fflush(stdout);
+    }
+    else if(read_size == -1)
+    {
+        perror("recv failed");
+    }
+    return 0;
+}</code>
+=== Makefile ===
+<code bash># kompilator c
+CC = gcc
+# konsolidator
+CFLAGS=-g
+all: server client
+server: server.o
+	$(CC) $(LFLAGS) server.o -lpthread -o server
+server.o: server.c
+	$(CC) -c server.c -o server.o
+client: client.o
+	$(CC) client.o -o client
+client.o: client.c
+	$(CC) -c client.c -o client.o
+clean:
+	rm -f *.o</code>
+===== Zajęcia 9 =====
+<code c>/*
+    C socket server example, handles multiple clients using threads
+    Compile
+    gcc server.c -lpthread -o server
+*/
+#include<stdio.h>
+#include<string.h>    //strlen
+#include<stdlib.h>    //strlen
+#include<sys/socket.h>
+#include<arpa/inet.h> //inet_addr
+#include<unistd.h>    //write
+#include<pthread.h> //for threading , link with lpthread
+//the thread function
+void *connection_handler(void *);
+int main(int argc , char *argv[])
+{
+    int socket_desc , client_sock , c;
+    struct sockaddr_in server , client;
+    //Create socket
+    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
+    if (socket_desc == -1)
+    {
+        printf("Could not create socket");
+    }
+    puts("Socket created");
+    //Prepare the sockaddr_in structure
+    server.sin_family = AF_INET;
+    server.sin_addr.s_addr = INADDR_ANY;
+    server.sin_port = htons( 8888 );
+    //Bind
+    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
+    {
+        //print the error message
+        perror("bind failed. Error");
+        return 1;
+    }
+    puts("bind done");
+    //Listen
+    listen(socket_desc , 3);
+    //Accept and incoming connection
+    puts("Waiting for incoming connections...");
+    c = sizeof(struct sockaddr_in);
+    //Accept and incoming connection
+    puts("Waiting for incoming connections...");
+    c = sizeof(struct sockaddr_in);
+	pthread_t thread_id;
+    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
+    {
+        puts("Connection accepted");
+        if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
+        {
+            perror("could not create thread");
+            return 1;
+        }
+        //Now join the thread , so that we dont terminate before the thread
+        //pthread_join( thread_id , NULL);
+        puts("Handler assigned");
+    }
+    if (client_sock < 0)
+    {
+        perror("accept failed");
+        return 1;
+    }
+    return 0;
+}
+/*
+ * This will handle connection for each client
+ * */
+void *connection_handler(void *socket_desc)
+{
+    //Get the socket descriptor
+    int sock = *(int*)socket_desc;
+    int read_size;
+    char *message , client_message[2000];
+    //Receive a message from client
+    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
+    {
+	//end of string marker
+	client_message[read_size] = '\0';
+	//Send the message back to client
+	write(sock , "Werset 6" , strlen(client_message));
+	//clear the message buffer
+	memset(client_message, 0, 2000);
+    }
+    if(read_size == 0)
+    {
+        puts("Client disconnected");
+        fflush(stdout);
+    }
+    else if(read_size == -1)
+    {
+        perror("recv failed");
+    }
+    return 0;
+} </code>
+==== Client C ====
+<code c>#include<stdio.h> //printf
+#include<string.h>    //strlen
+#include<sys/socket.h>    //socket
+#include<arpa/inet.h> //inet_addr
+int main(int argc , char *argv[])
+{
+    int sock;
+    struct sockaddr_in server;
+    char message[1000] , server_reply[2000];
+    //Create socket
+    sock = socket(AF_INET , SOCK_STREAM , 0);
+    if (sock == -1)
+    {
+        printf("Could not create socket");
+    }
+    puts("Socket created");
+    server.sin_addr.s_addr = inet_addr("127.0.0.1");
+    server.sin_family = AF_INET;
+    server.sin_port = htons( 8888 );
+    //Connect to remote server
+    if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
+    {
+        perror("connect failed. Error");
+        return 1;
+    }
+    puts("Connected\n");
+    //keep communicating with server
+    while(1)
+    {
+        printf("Enter message : ");
+        scanf("%s" , message);
+        //Send some data
+        if( send(sock , message , strlen(message) , 0) < 0)
+        {
+            puts("Send failed");
+            return 1;
+        }
+        //Receive a reply from the server
+        if( recv(sock , server_reply , 2000 , 0) < 0)
+        {
+            puts("recv failed");
+            break;
+        }
+        puts("Server reply :");
+        puts(server_reply);
+    }
+    close(sock);
+    return 0;
+}</code>
+==== Kod Maciek Gonzo itp ====
+<code c>
+#include<stdio.h>
+#include<string.h>    //strlen
+#include<stdlib.h>    //strlen
+#include<sys/socket.h>
+#include<arpa/inet.h> //inet_addr
+#include<unistd.h>    //write
+#include<pthread.h> //for threading , link with lpthread
+//the thread function
+void *connection_handler(void *);
+void *client_function(void *socket_desc){
+  int sock;
+    struct sockaddr_in server_id;
+    char message[1000] , server_reply[2000];
+    //Create socket
+    sock = socket(AF_INET , SOCK_STREAM , 0);
+    if (sock == -1)
+    {
+        printf("Could not create socket");
+    }
+    puts("Socket created");
+     // 192.168.102.63 wogu
+    server_id.sin_addr.s_addr = inet_addr("localhost");
+    server_id.sin_family = AF_INET;
+    server_id.sin_port = htons( 65000 );
+    //Connect to remote server
+    if (connect(sock , (struct sockaddr *)&server_id , sizeof(server_id)) < 0)
+    {
+        perror("connect failed. Error");
+    }else{
+    puts("Connected\n");
+    //keep communicating with server
+    while(1)
+    {
+        printf("Enter message : ");
+        scanf("%s" , message);
+        //Send some data
+        if( send(sock , message , strlen(message) , 0) < 0)
+        {
+            puts("Send failed");
+        }
+        //Receive a reply from the server
+        if( recv(sock , server_reply , 2000 , 0) < 0)
+        {
+            puts("recv failed");
+            break;
+        }
+        puts("Server reply :");
+        puts(server_reply);
+    }
+    close(sock);
+}
+};
+void *server_function(void * sock){
+    printf("SERVER Start");
+fflush(stdout);
+    int socket_desc , client_sock , c;
+    pthread_t thread_id;
+    struct sockaddr_in server , client;
+    //Create socket
+    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
+    if (socket_desc == -1)
+    {
+        printf("Could not create socket");
+    }
+    puts("Socket created");
+    //Prepare the sockaddr_in structure
+    server.sin_family = AF_INET;
+    server.sin_addr.s_addr = INADDR_ANY;
+    server.sin_port = htons( 65000);
+    //Bind
+    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
+    {
+        //print the error message
+        perror("bind failed. Error");
+        return;
+    }
+    puts("bind done");
+    //Listen
+    listen(socket_desc , 3);
+    //Accept and incoming connection
+    puts("Waiting for incoming connections...");
+    c = sizeof(struct sockaddr_in);
+fflush(stdout);
+    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
+    {
+        puts("Connection accepted");
+        if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
+        {
+            perror("could not create thread");
+            return;
+        }
+        //Now join the thread , so that we dont terminate before the thread
+        //pthread_join( thread_id , NULL);
+        puts("Handler assigned");
+    }
+    if (client_sock < 0)
+    {
+        perror("accept failed");
+        return;
+    }
+}
+int main(int argc , char *argv[])
+{
+    pthread_t thread_id;
+//Create server
+        if( pthread_create( &thread_id , NULL ,  server_function , NULL) != 0)
+        {
+            perror("could not create server thread");
+	    fflush(stdout);
+            return 1;
+        }
+    pthread_t thread_id2;
+	// Create client
+        if( pthread_create( &thread_id2 , NULL ,  client_function , NULL) != 0)
+        {
+            perror("could not create client thread");
+	    fflush(stdout);
+            return 1;
+        }
+    return 0;
+}
+/*
+ * This will handle connection for each client
+ * */
+void *connection_handler(void *socket_desc)
+{
+    //Get the socket descriptor
+    int sock = *(int*)socket_desc;
+    int read_size;
+    char *message , client_message[2000];
+    //Receive a message from client
+    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
+    {
+        //end of string marker
+		client_message[read_size] = '\0';
+		//Send the message back to client
+        write(sock , "Werset 5" , strlen(client_message));
+		//clear the message buffer
+		memset(client_message, 0, 2000);
+    }
+    if(read_size == 0)
+    {
+        puts("Client disconnected");
+        fflush(stdout);
+    }
+    else if(read_size == -1)
+    {
+        perror("recv failed");
+    }
+    return 0;
+}
+</code>

WoGu Wiki

Narzędzia użytkownika

Narzędzia witryny

Różnice

Narzędzia strony