Narzędzia użytkownika

Narzędzia witryny


studia:magisterskie:1sem:systemy_rownolegle_i_rozproszone

Różnice

Różnice między wybraną wersją a wersją aktualną.

Odnośnik do tego porównania

Poprzednia rewizja po obu stronach Poprzednia wersja
Nowa wersja
Poprzednia wersja
studia:magisterskie:1sem:systemy_rownolegle_i_rozproszone [2016/03/10 11:10]
149.156.112.6
studia:magisterskie:1sem:systemy_rownolegle_i_rozproszone [2016/05/19 11:13] (aktualna)
149.156.112.6 [tmp/opencl_mat_transp/mat_transp/mat_transp.c]
Linia 508: Linia 508:
 ===== Zajęcia 3 ===== ===== Zajęcia 3 =====
 ==== child.c ==== ==== child.c ====
-<​code>​MPI_Status status;  +<​code ​c># include <​stdlib.h
-  ​double a = 0.0// lewy brzeg przedzialu +# include <stdio.h> 
-  double ​b = 1.0; // prawy brzeg przedzialu +# include <​math.h>​ 
-  int i;  ​+ 
 +# include "​mpi.h"​ 
 + 
 +int main ( int argc, char *argv[] )
 +void heat_part ( int n, int p, int id, double ​x_min, double x_max ); 
 + 
 +/******************************************************************************
 + 
 +int main ( int argc, char *argv[] ) 
 +{
   int id; // rank   int id; // rank
   int n;  // liczba punktow dla kazdego wezla   int n;  // liczba punktow dla kazdego wezla
Linia 517: Linia 526:
   double x_max;   double x_max;
   double x_min;   double x_min;
 +  MPI_Comm parentcomm; ​
 +  MPI_Status status;
  
   MPI_Init ( &argc, &argv );   MPI_Init ( &argc, &argv );
-MPI_Comm_get_parent(&​parentcomm);​ 
-if (parentcomm == MPI_COMM_NULL) { 
-//rodzic 
-}    ​ 
-else{ 
-//dziecko 
-printf("​Proces dzidzia\n"​);​ 
- ​MPI_Comm_rank ( MPI_COMM_WORLD,​ &id ); 
- ​MPI_Comm_size ( MPI_COMM_WORLD,​ &p ); 
-//​double ​ 
  
- ​MPI_Recv(&​n, 1, MPI_INT, 0, 1, parentcomm, &​status);​ +  MPI_Comm_rank ( MPI_COMM_WORLD,​ &id ); 
- ​MPI_Recv(&​x_min, 1, MPI_DOUBLE, 0, 2, parentcomm, &​status);​ + 
- ​MPI_Recv(&​x_max, 1, MPI_DOUBLE, 0, 3, parentcomm, &​status);​+  MPI_Comm_size ( MPI_COMM_WORLD,​ &p ); 
 + 
 +  MPI_Comm_get_parent(&​parentcomm);​ 
 + 
 +  ​MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status ); 
 + 
 +  ​MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status ); 
 +  MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &​status ​); 
 + 
 +  //​MPI_Barrier(MPI_COMM_WORLD);
  
-// printf("​xmin:​ %lf, xmax: %lf\n",​x_min,​ x_max); 
   heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla   heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
-}+
   MPI_Finalize ( );   MPI_Finalize ( );
  
- // ​return ​p;+  ​return ​0;
 } }
  
Linia 581: Linia 590:
   for ( i = 0; i <= n + 1; i++ )   for ( i = 0; i <= n + 1; i++ )
   {   {
-    x[i] = ( ( double ) (         i ) * x_max   ​ +    x[i] = ( ( double ) (         i ) * x_max   
-           + ( double ) ( n + 1 - i ) * x_min ) +           + ( double ) ( n + 1 - i ) * x_min )
            / ( double ) ( n + 1     );            / ( double ) ( n + 1     );
   }   }
Linia 621: Linia 630:
     tag = 2;     tag = 2;
  
-    ​if ( 0 < id ) + if ( 0 < id ) { 
-    ​+    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); 
-      // DO UZUPELNIENIA +  
- MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); +  
-    +  if ( id < p - 1 ) { 
- +    MPI_Recv ( &​h[n+1],​ 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD,​ &status );     ​ 
-    if ( id < p - 1 ) +  }
-    ​+
-      // DO UZUPELNIENIA ​   +
-  MPI_Recv ( &​h[n+1],​ 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD,​ &status ); +
-    }+
  
  
Linia 637: Linia 642:
     for ( i = 1; i <= n; i++ )     for ( i = 1; i <= n; i++ )
     {     {
-      h_new[i] = h[i] + t_del * (  +      h_new[i] = h[i] + t_del * ( 
-        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del +        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
         + 2.0 * sin ( x[i] * t ) );         + 2.0 * sin ( x[i] * t ) );
     }     }
  
     // nowa chwila czasu     // nowa chwila czasu
-    t = ( ( double ) (         j - j_min ) * t_max   ​ +    t = ( ( double ) (         j - j_min ) * t_max   
-            + ( double ) ( j_max - j         ) * t_min ) +            + ( double ) ( j_max - j         ) * t_min )
             / ( double ) ( j_max     - j_min );             / ( double ) ( j_max     - j_min );
  
Linia 659: Linia 664:
   // koncowa wymiana informacji z wezlami sasiednimi   // koncowa wymiana informacji z wezlami sasiednimi
   tag = 11;   tag = 11;
-  ​+ 
   if ( id < p - 1 ) {   if ( id < p - 1 ) {
     MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );     MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
   }   }
-  ​+ 
   if ( 0 < id ) {   if ( 0 < id ) {
     MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD,​ &status );     MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD,​ &status );
   }   }
-  ​+ 
   tag = 12;   tag = 12;
-  ​+ 
   if ( 0 < id ) {   if ( 0 < id ) {
     // DO UZUPELNIENIA     // DO UZUPELNIENIA
     MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );     MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
   }   }
-  ​+ 
   if ( id < p - 1 ) {   if ( id < p - 1 ) {
     // DO UZUPELNIENIA     // DO UZUPELNIENIA
-    MPI_Recv ( &​h[n+1],​ 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD,​ &status );      +    MPI_Recv ( &​h[n+1],​ 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD,​ &status );     ​
   }   }
-  ​+ 
   wtime = MPI_Wtime ( ) - wtime;   wtime = MPI_Wtime ( ) - wtime;
  
Linia 711: Linia 716:
  
 ==== parent.c ==== ==== parent.c ====
-<​code>​+<​code ​c>
 # include <​stdlib.h>​ # include <​stdlib.h>​
 # include <​stdio.h>​ # include <​stdio.h>​
Linia 717: Linia 722:
  
 # include "​mpi.h"​ # include "​mpi.h"​
 +
 +int main ( int argc, char *argv[] );
  
 /​******************************************************************************/​ /​******************************************************************************/​
Linia 722: Linia 729:
 int main ( int argc, char *argv[] ) int main ( int argc, char *argv[] )
 { {
- int np = 8; 
- int errcodes[8];​  
- MPI_Comm parentcomm, intercomm; 
   double a = 0.0; // lewy brzeg przedzialu   double a = 0.0; // lewy brzeg przedzialu
   double b = 1.0; // prawy brzeg przedzialu   double b = 1.0; // prawy brzeg przedzialu
-  int i;  +  int i; 
   int id; // rank   int id; // rank
   int n;  // liczba punktow dla kazdego wezla   int n;  // liczba punktow dla kazdego wezla
   int p;  // size   int p;  // size
 +  int np = 8; // liczba procesow
   double x_max;   double x_max;
   double x_min;   double x_min;
 +  MPI_Comm komunikator;​
 +  int *errcodes;
  
   MPI_Init ( &argc, &argv );   MPI_Init ( &argc, &argv );
-MPI_Comm_get_parent(&​parentcomm);​ + 
-if (parentcomm == MPI_COMM_NULL) { +  //​MPI_Comm_rank ( MPI_COMM_WORLD,​ &id );
-//rodzic +
-MPI_Comm_spawn("​heat",​ MPI_ARGV_NULL,​ np,​MPI_INFO_NULL,​ 0, MPI_COMM_WORLD,​ &​intercomm,​ errcodes ); +
-printf("​Proces rodzic\n"​);​ +
- //​MPI_Comm_rank ( MPI_COMM_WORLD,​ &id );+
  
   //​MPI_Comm_size ( MPI_COMM_WORLD,​ &p );   //​MPI_Comm_size ( MPI_COMM_WORLD,​ &p );
  
- int p = np+  MPI_Comm_spawn("​dziecko",​ MPI_ARGV_NULL, ​np, MPI_INFO_NULL, ​0, MPI_COMM_WORLD,​ &​komunikator,​ errcodes);
- int j=0;+
  
-for(j=0; j<p; j++){ +  n = 12; // liczba punktow dla kazdego wezla 
-  n = 12; // liczba punktow dla kazdego wezla  +  int j; 
-  ​i = 0;  // poczatkowa chwila czasu +  ​for(j=0;​j<​np;j++){ 
-  // wspolrzedna lewego punktu dla wezla id +   ​i = 0;  // poczatkowa chwila czasu 
-  x_min = ( ( double )( * n + 1 - j * n - i ) * a   ​ +    // wspolrzedna lewego punktu dla wezla id 
-          + ( double )(             j * n + i ) * b )  +    x_min = ( ( double )( np * n + 1 - j * n - i ) * a   
-          / ( double ) ( * n + 1              );+          + ( double )(             j * n + i ) * b ) 
 +          / ( double ) ( np * n + 1              );
  
-  ​i = n + 1;+    ​i = n + 1;
  
-  ​// wspolrzedna prawego punktu dla wezla id +    ​// wspolrzedna prawego punktu dla wezla id 
-  x_max = ( ( double )( * n + 1 - j * n - i ) * a   ​ +    x_max = ( ( double )( np * n + 1 - j * n - i ) * a   
-          + ( double )(             j * n + i ) * b )  +          + ( double )(             j * n + i ) * b ) 
-          / ( double )( * n + 1              );+          / ( double )( np * n + 1              ​); 
 +  
 +    MPI_Send ( &x_min, 1, MPI_DOUBLE, j, 1, komunikator ); 
 +    MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 2, komunikator ); 
 +    MPI_Send ( &n, 1, MPI_INT, j, 3, komunikator ​);
  
- //​printf("​x_min %d , x_max %d\n",​x_min,​ x_max); +  }
-  //heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla +
-MPI_Send ( &n, 1, MPI_INT, j, 1, intercomm ); +
-MPI_Send ( &x_min, 1, MPI_DOUBLE, j , 2, intercomm ); +
-MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 3, intercomm ); +
-  }  +
-else{ +
-//dziecko +
-printf("​Proces dzidzia\n"​);​+
  
-} 
   MPI_Finalize ( );   MPI_Finalize ( );
  
- // ​return ​p;+  ​return ​0;
 } }
 +
 +
 +
 +
 +
 +
 +    struct rekord wys;
 +    MPI_Datatype rekord_typ;
 +    int tab_dlug_blokow[3] = {1, 1, 1};
 +    MPI_Datatype tab_typow[3] = {MPI_DOUBLE,​ MPI_DOUBLE, MPI_INT};
 +    MPI_Aint podstawa, tab_odstepow[3];​
 +
 +    MPI_Get_address(&​wys.x_min,​ &​tab_odstepow[0]);​
 +    MPI_Get_address(&​wys.x_max,​ &​tab_odstepow[1]);​
 +    MPI_Get_address(&​wys.n,​ &​tab_odstepow[2]);​
 +
 +    MPI_Type_struct(3,​ tab_dlug_blokow,​ tab_odstepow,​ tab_typow, &​rekord_typ);​
 +    MPI_Type_commit(&​rekord_typ);​
 +    MPI_Recv ( &wys, 1, rekord_typ, 0, 1, parentcomm, &status );
 </​code>​ </​code>​
  
-==== root_proc.c ​==== +====== Zajęcia 4 ====== 
-<​code ​c># include <​stdlib.h>​ +wkrótce 
-# include ​<stdio.h> +====== Zajęcia 5 ====== 
-include <​math.h>​ +==== Makefile ​==== 
-include ​"mpi.h"+<​code ​bash># optimization and other system dependent options 
 +#​include ​ make.$(SRR_ARCH) 
 +or directly 
 +include ​ make.lab_404_NVIDIA
  
-#​define ​NUM_SPAWNS ​1+NAME = Hello_GPU 
 + 
 +program: main.o  
 + $(CC) $(LDFL) main.o ​ $(LIB) -o $(NAME) 
 + 
 +main.o: main.c  
 + $(CC) $(CFL) -c main.c ​ $(INC) -o main.o 
 + 
 +clean: 
 + rm -f obj/* 
 + rm -f $(NAME) 
 +</​code>​ 
 +==== make.lab_404_NVIDIA ==== 
 +<code bash># C compiler 
 +#CC = icc 
 +CC = gcc 
 + 
 +# C++ compiler 
 +CPPC = icpc 
 +#CPPC = g++ 
 + 
 +# Loader (to link C/C++ and Fortran libraries) 
 +LD = icpc 
 +#LD = g++ 
 + 
 +# Archiver 
 +AR = ar r 
 + 
 +# For removing files 
 +RM = rm -f 
 + 
 +# Include directories 
 +INC = -I/​opt/​cuda7/​include 
 + 
 +# Standard and/or local libraries 
 +LIB = -L/​opt/​cuda7/​lib64 -lOpenCL  
 + 
 +# C optimization and other flags 
 +#CFL(icc) = -O3 -openmp 
 +#CFL(gcc) = -O3 -fopenmp 
 +#​CFL(icc,​debug) = -g 
 +#CFL(gcc) = -g 
 +CFL = 
 + 
 +# Loader optimization and other flags 
 +#​LDFL(debug) = -g  
 +#​LDFL(profile) = -p                                 
 +LDFL =  
 + 
 +</​code>​ 
 +==== main.c ==== 
 +<code c>#​include<​stdlib.h>​ 
 +#​include<​stdio.h>​ 
 + 
 +#include <​CL/​cl.h>​ 
 + 
 +// functions to display platform and device properties 
 +void DisplayPlatformInfo( 
 + cl_platform_id id,  
 + cl_platform_info name, 
 + char* str) 
 +
 +  cl_int retval; 
 +  size_t paramValueSize;​ 
 +   
 +  retval = clGetPlatformInfo( 
 +      id, 
 +      ​name,​ 
 +      0, 
 +      ​NULL,​ 
 +      &​paramValueSize);​ 
 +  if (retval != CL_SUCCESS){ 
 +    printf("​Failed to find OpenCL platform %s.\n",​ str); 
 +    return; 
 +  } 
 +   
 +  char * info = (char *)malloc(sizeof(char) * paramValueSize);​ 
 +  retval = clGetPlatformInfo( 
 +      id, 
 +      ​name,​ 
 +      ​paramValueSize,​ 
 +      ​info,​ 
 +      ​NULL);​ 
 +  if (retval != CL_SUCCESS) ​ { 
 +    printf("​Failed to find OpenCL platform %s.\n",​ str); 
 +    return; 
 +  } 
 +   
 +  printf("​\t%s:​\t%s\n",​ str, info ); 
 +  free(info);  
 +
 + 
 +void DisplayDeviceInfo_char( 
 +        ​cl_device_id id,  
 +        ​cl_device_info name, 
 +        char* str) 
 +
 +  cl_int retval; 
 +  size_t paramValueSize;​ 
 +   
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    0, 
 +    ​NULL,​ 
 +    &​paramValueSize);​ 
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 +   
 +  char * info = (char *)malloc(sizeof(char) * paramValueSize);​ 
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    ​paramValueSize,​ 
 +    ​info,​ 
 +    ​NULL);​ 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 + 
 +  printf("​\t\t%s:​\t%s\n",​ str, info ); 
 +  free(info);​ 
 +}; 
 + 
 +void DisplayDeviceInfo_ulong( 
 +        ​cl_device_id id,  
 +        ​cl_device_info name, 
 +        char* str) 
 +
 +  cl_int retval; 
 +  size_t paramValueSize;//​ = sizeof(cl_ulong);​ 
 +   
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    0, 
 +    ​NULL,​ 
 +    &​paramValueSize);​ 
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 +   
 +  cl_ulong * info = (cl_ulong *)malloc(sizeof(cl_ulong) * paramValueSize);​ 
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    ​paramValueSize,​ 
 +    ​info,​ 
 +    ​NULL);​ 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 + 
 +  printf("​\t\t%s:​\t%lu MB\n", str, info[0] / 1024 / 1024 ); 
 +  free(info);​ 
 +}; 
 + 
 +void DisplayDeviceInfo_uint( 
 +        ​cl_device_id id,  
 +        ​cl_device_info name, 
 +        char* str) 
 +
 +  cl_int retval; 
 +  size_t paramValueSize;//​ = sizeof(cl_ulong);​ 
 +   
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    0, 
 +    ​NULL,​ 
 +    &​paramValueSize);​ 
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 +   
 +  cl_uint * info = (cl_uint *)malloc(sizeof(cl_uint) * paramValueSize);​ 
 +  retval = clGetDeviceInfo( 
 +    id, 
 +    ​name,​ 
 +    ​paramValueSize,​ 
 +    ​info,​ 
 +    ​NULL);​ 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to find OpenCL device info %s.\n",​ str); 
 +    return; 
 +  } 
 + 
 +  printf("​\t\t%s:​\t%d MHz\n",​ str, info[0] ); 
 +  free(info);​ 
 +}; 
 + 
 + 
 +// main program controlling execution of CPU code and OpenCL kernels 
 +int main(int argc, char** argv) 
 +
 +  cl_uint number_of_contexts = 2; 
 +  cl_context context = NULL; 
 +  cl_context list_of_contexts[2] = {0,0}; 
 +  cl_command_queue commandQueue = 0; 
 +  cl_program program = 0; 
 +  cl_uint number_of_devices;​ 
 +  cl_device_id device = 0; 
 +  cl_device_id *list_of_devices;​ 
 +  cl_device_type type; 
 +  cl_kernel kernel = 0; 
 +  cl_mem memObjects[3] = { 0, 0, 0 }; 
 +  cl_int retval; 
 +  int icon, idev; 
 +  cl_uint numPlatforms;​ 
 +  cl_platform_id * platformIds;​ 
 +  cl_uint i,j; 
 + 
 +  // flag to control displaying 
 +  int Monitor = 1; 
 + 
 + 
 +  // Create OpenCL contexts  
 + 
 +  // First, query the total number of platforms 
 +  retval = clGetPlatformIDs(0,​ (cl_platform_id *) NULL, &​numPlatforms);​ 
 + 
 +  // Next, allocate memory for the installed plaforms, and qeury  
 +  // to get the list. 
 +  platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);​ 
 + 
 +  // Then, query the platform IDs 
 +  retval = clGetPlatformIDs(numPlatforms,​ platformIds,​ NULL); 
 + 
 +  if(Monitor>​=0){ 
 +    printf("​Number of platforms: \t%d\n",​ numPlatforms);​  
 +  } 
 + 
 +  // Iterate through the list of platforms displaying associated information 
 +  for (i = 0; i < numPlatforms;​ i++) { 
 + 
 +    if(Monitor>​0){ 
 + 
 +      printf("​Platform ID - %d\n",​i);​ 
 +      // First we display information associated with the platform 
 +      DisplayPlatformInfo( 
 + platformIds[i],​  
 + CL_PLATFORM_NAME,​  
 + "​CL_PLATFORM_NAME"​);​ 
 +      DisplayPlatformInfo( 
 + platformIds[i],​  
 + CL_PLATFORM_PROFILE,​  
 + "​CL_PLATFORM_PROFILE"​);​ 
 +      DisplayPlatformInfo( 
 + platformIds[i],​  
 + CL_PLATFORM_VERSION,​  
 + "​CL_PLATFORM_VERSION"​);​ 
 +      DisplayPlatformInfo( 
 + platformIds[i],​  
 + CL_PLATFORM_VENDOR,​  
 + "​CL_PLATFORM_VENDOR"​);​ 
 +    } 
 + 
 +  } 
 + 
 +  // For the first platform 
 +  int iplat; 
 +  printf("​Select Platform ID: "); scanf("​%d",​ &​iplat);​ 
 + 
 +  // Query the set of devices associated with the platform 
 +  retval = clGetDeviceIDs( 
 +   platformIds[iplat],​ 
 +   CL_DEVICE_TYPE_ALL,​ 
 +   0, 
 +   NULL, 
 +   &​number_of_devices);​ 
 +   
 +   
 +  list_of_devices =  
 +    (cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices);​ 
 +   
 +  retval = clGetDeviceIDs( 
 +   platformIds[iplat],​ 
 +   CL_DEVICE_TYPE_ALL,​ 
 +   number_of_devices,​ 
 +   list_of_devices,​ 
 +   NULL); 
 +   
 +  if(Monitor>​=0){ 
 +    printf("​Number of devices: \t%d\n",​ number_of_devices);​  
 +  } 
 + 
 +  // Iterate through each device, displaying associated information 
 +  for (j = 0; j < number_of_devices;​ j++) { 
 +     
 +    clGetDeviceInfo(list_of_devices[j],​ CL_DEVICE_TYPE,​  
 +     sizeof(cl_device_type),​ &type, NULL); 
 +     
 +    if(Monitor>​0){ 
 +       
 +      DisplayDeviceInfo_char( 
 + list_of_devices[j],​  
 + CL_DEVICE_NAME,​  
 + "​CL_DEVICE_NAME"​);​ 
 +       
 +      DisplayDeviceInfo_char( 
 + list_of_devices[j],​  
 + CL_DEVICE_VENDOR,​  
 + "​CL_DEVICE_VENDOR"​);​ 
 +       
 +      DisplayDeviceInfo_char( 
 + list_of_devices[j],​  
 + CL_DEVICE_VERSION,​  
 + "​CL_DEVICE_VERSION"​);​ 
 + 
 +      DisplayDeviceInfo_ulong( 
 + list_of_devices[j],​  
 + CL_DEVICE_GLOBAL_MEM_SIZE,​  
 + "​CL_DEVICE_GLOBAL_MEM_SIZE"​);​ 
 + 
 +      DisplayDeviceInfo_uint( 
 + list_of_devices[j],​  
 + CL_DEVICE_MAX_CLOCK_FREQUENCY,​  
 + "​CL_DEVICE_MAX_CLOCK_FREQUENCY"​);​ 
 +      printf("​\n"​);​ 
 +    } 
 +  } 
 +   
 +  // Next, create OpenCL contexts on platforms 
 +  cl_context_properties contextProperties[] = { 
 +    CL_CONTEXT_PLATFORM,​ 
 +    (cl_context_properties)platformIds[iplat],​ 
 +    0 
 +  }; 
 +   
 +  if(Monitor>​0){ 
 +    printf("​Creating CPU context %d on platform %d\n", 1, iplat); 
 +  } 
 +   
 +  list_of_contexts[1] =  
 +    clCreateContextFromType(contextProperties,​  
 +     CL_DEVICE_TYPE_CPU,​ NULL, NULL, &​retval);​ 
 +   
 +  if(Monitor>​=0 && retval != CL_SUCCESS){ 
 +    printf("​Could not create CPU context on platform %d\n", i); 
 +  } 
 +   
 +  if(Monitor>​0){ 
 +    printf("​Creating GPU context 0 on platform %d\n", iplat); 
 +  } 
 +   
 +  list_of_contexts[0] =  
 +    clCreateContextFromType(contextProperties,​  
 +     CL_DEVICE_TYPE_GPU,​ NULL, NULL, &​retval);​ 
 +   
 +  if(Monitor>​=0 && retval != CL_SUCCESS){ 
 +    printf("​Could not create GPU context on platform %d\n", i); 
 +  } 
 +   
 +  // in a loop over devices of the seleceted platform ​   
 +  for(idev=0; idev<​number_of_devices;​idev++){ 
 +     
 +    if(Monitor>​0){ 
 +      printf("​\nFor context %d and device %d:​\n",​ 
 +      idev, idev); 
 +    } 
 +    device = list_of_devices[idev];​ 
 +    icon = idev; 
 +     
 +    // choose OpenCL context on first available platform 
 +    context = list_of_contexts[icon];​ 
 +     
 +    if(context !=0){ 
 +             
 +      commandQueue = clCreateCommandQueue(context,​ device, 0, NULL); 
 +      if (commandQueue == NULL) { 
 + printf("​Failed to create commandQueue for device %d\n", idev); 
 + exit(0); 
 +      } 
 +       
 +      if(Monitor>​0){ 
 + printf("​Reading program from source\n"​);​ 
 +      } 
 + 
 +      // read source code from file 
 +      FILE *fp; 
 +      char* source; 
 +      long int size; 
 +       
 +      fp = fopen("​HelloWorld.cl",​ "​rb"​);​ 
 +      if(!fp) { 
 + printf("​Could not open kernel file\n"​);​ 
 + exit(-1);​ 
 +      } 
 +      int status = fseek(fp, 0, SEEK_END);​ 
 +      if(status != 0) { 
 + printf("​Error seeking to end of file\n"​);​ 
 + exit(-1);​ 
 +      } 
 +      size = ftell(fp);​ 
 +      if(size < 0) { 
 + printf("​Error getting file position\n"​);​ 
 + exit(-1);​ 
 +      } 
 +       
 +      rewind(fp);​ 
 +       
 +      source = (char *)malloc(size + 1); 
 +       
 +      int i; 
 +      for (i = 0; i < size+1; i++) { 
 + source[i]='​\0';​ 
 +      } 
 +       
 +      if(source == NULL) { 
 + printf("​Error allocating space for the kernel source\n"​);​ 
 + exit(-1);​ 
 +      } 
 +       
 +      fread(source,​ 1, size, fp); 
 +      source[size] = '​\0';​ 
 + 
 +      const char* source_const = source; 
 +      cl_program program = clCreateProgramWithSource(context,​ 1, 
 +  ​    &​source_const,​ 
 +  ​    NULL, NULL); 
 +      if (program == NULL) 
 +
 +   printf("​Failed to create CL program from source.\n"​);​ 
 +   exit(-1); 
 +
 +       
 +      if(Monitor>​0){ 
 + printf("​Creating program and kernel\n"​);​ 
 +      } 
 +      // build program (passing options to compiler if necessary 
 +      retval = clBuildProgram(program,​ 0, NULL, NULL, NULL, NULL); 
 +      char* buildLog; size_t size_of_buildLog;​  
 +      clGetProgramBuildInfo(program,​ device, CL_PROGRAM_BUILD_LOG,​  
 +     0, NULL, &​size_of_buildLog);​  
 +      buildLog = malloc(size_of_buildLog+1);​  
 +      clGetProgramBuildInfo(program,​ device, CL_PROGRAM_BUILD_LOG,​  
 +     size_of_buildLog,​ buildLog, NULL);  
 +      buildLog[size_of_buildLog]= '​\0';​  
 +      printf("​Kernel buildLog: %s\n", buildLog);  
 +      if (retval != CL_SUCCESS) 
 +
 +   printf("​Error in kernel\n"​);​ 
 +   clReleaseProgram(program);​ 
 +   exit(-1); 
 +
 +       
 +       
 +      // Create OpenCL kernel 
 +      kernel = clCreateKernel(program,​ "​hello_kernel",​ NULL); 
 +      if (kernel == NULL) 
 +
 +   printf("​Failed to create kernel.\n"​);​ 
 +   exit(0); 
 +
 +       
 +      if(Monitor>​0){ 
 + printf("​Creating memory objects\n"​);​ 
 +      } 
 +      // Create memory objects that will be used as arguments to 
 +      // kernel. ​ First create host memory arrays that will be 
 +      // used to store the arguments to the kernel 
 +      float result[1];​ 
 +      float a[1]; 
 +      float b[1]; 
 +      a[0] = 2; 
 +      b[0] = 2; 
 +       
 +      memObjects[0] = clCreateBuffer(context,​ CL_MEM_READ_ONLY , 
 +      ​sizeof(float),​ NULL, NULL); 
 +      memObjects[1] = clCreateBuffer(context,​ CL_MEM_READ_ONLY , 
 +      ​sizeof(float),​ NULL, NULL); 
 +      memObjects[2] = clCreateBuffer(context,​ CL_MEM_READ_WRITE,​ 
 +      ​sizeof(float),​ NULL, NULL); 
 +       
 +      if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){ 
 + printf("​Error creating memory objects.\n"​);​ 
 + return 0; 
 +      } 
 +       
 +      if(Monitor>​0){ 
 + printf("​Sending kernel arguments\n"​);​ 
 +      } 
 +      retval = clEnqueueWriteBuffer( 
 +     commandQueue,​  
 +     memObjects[0],​  
 +     CL_FALSE,  
 +     0,  
 +     sizeof(float), ​                         
 +     a,  
 +     0,  
 +     NULL,  
 +     NULL); 
 +       
 +      // Use clEnqueueWriteBuffer() to write input array B to  
 +      // the device buffer bufferB 
 +      retval = clEnqueueWriteBuffer( 
 +     commandQueue,​  
 +     memObjects[1],​  
 +     CL_FALSE,  
 +     0,  
 +     sizeof(float), ​                         
 +     b, 
 +     0,  
 +     NULL,  
 +     NULL); 
 +       
 +      // Set the kernel arguments (result, a, b) 
 +      retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +      retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +      retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +      if (retval != CL_SUCCESS) 
 +
 +   printf("​Failed to Set the kernel arguments.\n"​);​ 
 +   //​Cleanup(context,​ commandQueue,​ program, kernel, memObjects);​ 
 +   return 1; 
 +
 +       
 +      if(Monitor>​0){ 
 + printf("​Running the kernel!\n"​);​ 
 +      } 
 +      size_t globalWorkSize[1] = { 1 }; 
 +      size_t localWorkSize[1] = { 1 }; 
 +       
 +      // Queue the kernel up for execution across the array 
 +      retval = clEnqueueNDRangeKernel(commandQueue,​ kernel, 1, NULL, 
 +       globalWorkSize,​ localWorkSize,​ 
 +       0, NULL, NULL); 
 +      if (retval != CL_SUCCESS) 
 +
 +   printf("​Failed to queue kernel for execution.\n"​);​ 
 +   //​Cleanup(context,​ commandQueue,​ program, kernel, memObjects);​ 
 +   return 1; 
 +
 +       
 +      if(Monitor>​0){ 
 + printf("​Transfering back results\n"​);​ 
 +      } 
 +      // Read the output buffer back to the Host 
 +      retval = clEnqueueReadBuffer(commandQueue,​ memObjects[2],​ CL_TRUE, 
 +    0, sizeof(float),​ result, 
 +    0, NULL, NULL); 
 +      if (retval != CL_SUCCESS) 
 +
 +   printf("​Failed to read result buffer.\n"​);​ 
 +   //​Cleanup(context,​ commandQueue,​ program, kernel, memObjects);​ 
 +   return 1; 
 +
 +       
 +      // Verify the output 
 +      if(result[0]==4) ​ { 
 +        printf("​Output is correct: %lf + %lf = %lf\n",​ 
 +        a[0], b[0], result[0]);​ 
 +      } else { 
 +        printf("​Output is incorrect: %lf + %lf != %lf\n",​ 
 +        a[0], b[0], result[0]);​ 
 +      } 
 +       
 +      for (i = 0; i < 3; i++) 
 +      { 
 +        if (memObjects[i] != 0) 
 +            clReleaseMemObject(memObjects[i]);​ 
 +      } 
 +      if (commandQueue != 0) 
 +        clReleaseCommandQueue(commandQueue);​ 
 +       
 +      if (kernel != 0) 
 +        clReleaseKernel(kernel);​ 
 +       
 +      if (program != 0) 
 +        clReleaseProgram(program);​ 
 +       
 +    } 
 +  } 
 +   
 +  free(list_of_devices);​ 
 +  free(platformIds);​ 
 +   
 +  return 0; 
 +
 +</​code>​ 
 +==== HelloWorld.cl ==== 
 +<code c> 
 +__kernel void hello_kernel(__global const float *a, 
 +            ​__global const float *b, 
 +    ​__global float *result) 
 +
 +    int gid = get_global_id(0);​ 
 + 
 +    if(gid<​1) result[gid] = a[gid] + b[gid]; 
 +}</​code>​ 
 +===== Zajęcia 6 ===== 
 +Pliki zadania: {{:​studia:​magisterskie:​1sem:​opencl_vecadd.tgz|}} 
 +Działające 3 kernele: {{:​studia:​magisterskie:​1sem:​sala404.zip|}} 
 + 
 +==== opencl_vecadd/​vecadd_host/​execute_kernels.c ==== 
 +<code c>#​include<​stdlib.h>​ 
 +#​include<​stdio.h>​ 
 +#include <​math.h>​ 
 + 
 +#​include<​omp.h>​ 
 +#include <​CL/​cl.h>​ 
 + 
 +#​include"​uth_ocl_intf.h"​ 
 + 
 +#​include"​./​vecadd_host.h"​ 
 +#​include"​./​hardware_params.h"​ 
 + 
 +#​define ​time_measurments 
 + 
 +#ifdef time_measurments 
 +#​include"​uth_time_intf.h"​ 
 +  static double t_begin, t_end, t_total; 
 +#endif 
 + 
 +int execute_kernel_2_CPU( 
 + int platform_index,​ 
 + int device_index,​ 
 + int kernel_index,​ 
 + int array_size,​ 
 + cl_mem* memObjects,​ 
 + int monitor 
 +
 +
 +   
 +  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  int i,j,k; 
 +   
 +   
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +   
 +  // create the kernel for CPU 
 +  utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_CPU,​ kernel_index,​ 
 +   // kernel name:         , file: 
 +   "​vecadd_2_blocks_kernel",​ "​vecadd_2_blocks.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. Restoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL){  
 +     
 +    printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +    ​platform_index,​ device_index);​ 
 +    printf("​%lu %lu\n",​ context, command_queue);​ 
 +  } 
 +   
 +   
 +  int NDRange_dim = 1; 
 +  int nr_cores_CPU = omp_get_num_procs();​ 
 +  int nr_threads = nr_cores_CPU;​ // total number of threads 
 +  int work_group_size = 1; // number of threads in work-group 
 +  int nr_work_groups = nr_threads / work_group_size;​ 
 +  globalWorkSize[0] = nr_threads ; 
 +  localWorkSize[0] = work_group_size ; 
 +   
 +  printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 + nr_cores_CPU,​ nr_cores_CPU,​ 1); 
 +  printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 + work_group_size,​ nr_threads / work_group_size,​  
 + nr_threads / work_group_size / nr_cores_CPU);​ 
 +  printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 + globalWorkSize[0],​ globalWorkSize[0]/​nr_cores_CPU,​  
 + globalWorkSize[0]/​nr_cores_CPU);​ 
 +  printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 + array_size, array_size/​globalWorkSize[0],​ array_size/​nr_cores_CPU);​ 
 +   
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  int size = array_size;​ 
 +  int size_per_thread = ceilf((float)size/​nr_threads);​  
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +  retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  retval |= clSetKernelArg(kernel,​ 3, sizeof(int),​ &​size);​ 
 +  retval |= clSetKernelArg(kernel,​ 4, sizeof(int),​ &​size_per_thread);​ 
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to Set the kernel arguments.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\t\t4. executing kernel %d, on platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 + 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queue,​ kernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9;​ 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",​ 
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0);​ 
 +
 + 
 + 
 +int execute_kernel_3_CPU( 
 + int platform_index,​ 
 + int device_index,​ 
 + int kernel_index,​ 
 + int array_size,​ 
 + cl_mem* memObjects,​ 
 + int monitor 
 +
 +
 +   
 +  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  int i,j,k; 
 +   
 +   
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +  // create the kernel for CPU 
 +  utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_CPU,​ kernel_index,​ 
 +   // kernel name:         , file: 
 +   "​vecadd_3_opt_blocks_kernel","​vecadd_3_opt_blocks.cl",​monitor);​ 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL){  
 +    printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +    ​platform_index,​ device_index);​ 
 +    printf("​%lu %lu\n",​ context, command_queue);​ 
 +  } 
 +   
 +   
 +  int NDRange_dim = 1; 
 +  int nr_cores_CPU = omp_get_num_procs();​ 
 +  int nr_threads = nr_cores_CPU;​ // total number of threads 
 +  int work_group_size = 1; // number of threads in work-group 
 +  int nr_work_groups = nr_threads / work_group_size;​ 
 +  globalWorkSize[0] = nr_threads ; 
 +  localWorkSize[0] = work_group_size ; 
 +   
 +  printf("​\t\tExecuting kernel %d, on platform %d and device %d\n",​ 
 + kernel_index,​ platform_index,​ device_index);​ 
 +  printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 + nr_cores_CPU,​ nr_cores_CPU,​ 1); 
 +  printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 + work_group_size,​ nr_threads / work_group_size,​  
 + nr_threads / work_group_size / nr_cores_CPU);​ 
 +  printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 + globalWorkSize[0],​ globalWorkSize[0]/​nr_cores_CPU,​  
 + globalWorkSize[0]/​nr_cores_CPU);​ 
 +  printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 + array_size, array_size/​globalWorkSize[0],​ array_size/​nr_cores_CPU);​ 
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  int size = array_size;​ 
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +  retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  retval |= clSetKernelArg(kernel,​ 3, sizeof(int),​ &​size);​ 
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to Set the kernel arguments.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    return 1; 
 +  } 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queue,​ kernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9;​ 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",​ 
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0);​ 
 +
 + 
 +// tutaj modyfikujemy 
 +int execute_kernel_0_GPU( 
 + int platform_index,​ 
 + int device_index,​ 
 + int kernel_index,​ 
 + int array_size,​ 
 + cl_mem* memObjects,​ 
 + int monitor 
 +
 +
 +   
 +  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  int i,j,k; 
 +   
 +   
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +  // create the kernel for GPU 
 +    utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +     // kernel name:         , file: 
 +     "​vecadd_0_kernel",​ "​vecadd_0.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL){  
 +    printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +    ​platform_index,​ device_index);​ 
 +    printf("​%lu %lu\n",​ context, command_queue);​ 
 +  } 
 +   
 +   
 +  int NDRange_dim = 1; 
 +   
 +  // globalna liczba wątków i lokalna liczba wątków 
 +  int work_group_size = WORK_GROUP_SIZE;​ 
 +  globalWorkSize[0] = array_size;​ 
 +  localWorkSize[0] = work_group_size;​ 
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\t\t4. executing kernel %d, on platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +    printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 +    ​NR_COMP_UNITS,​ NR_CORES_PER_CU*NR_COMP_UNITS,​ NR_CORES_PER_CU);​ 
 +    printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 +    ​localWorkSize[0],​ globalWorkSize[0]/​localWorkSize[0],​ 
 +    ​globalWorkSize[0]/​localWorkSize[0]/​NR_COMP_UNITS);​ 
 +    printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​NR_COMP_UNITS,​ 
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +    printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​globalWorkSize[0],​  
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +  } 
 +    
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  // ustawiamy liczbę argumentów 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +  retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  if (retval != CL_SUCCESS) { 
 +      printf("​Failed to Set the kernel arguments.\n"​);​ 
 +      //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +      return ​1
 +    } 
 +  ​
    
-int main int argcchar *argv[] ​);+#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queuekernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9; 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",​ 
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0); 
 +}
  
-/******************************************************************************/+ 
 +int execute_kernel_1_GPU( 
 + int platform_index,​ 
 + int device_index,​ 
 + int kernel_index,​ 
 + int array_size,​ 
 + cl_mem* memObjects,​ 
 + int monitor 
 +
 +
 +   
 +  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  int i,j,k; 
 +   
 +   
 +  ​/*----------------KERNEL CREATION PHASE----------------------*
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +  // create the kernel for GPU 
 +/* 
 +    utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +     // kernel name:         , file: 
 +     "​vecadd_0_kernel",​ "​vecadd_0.cl",​ monitor);*
 + 
 +    utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 + ​  ​   // kernel name:         , file: *
 + ​  ​   "​vecadd_1_kernel",​ "​vecadd_1.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL){  
 +    printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +    ​platform_index,​ device_index);​ 
 +    printf("​%lu %lu\n",​ context, command_queue);​ 
 +  } 
 +   
 +   
 +  int NDRange_dim = 1; 
 +   
 +  int work_group_size = WORK_GROUP_SIZE;​ 
 +  globalWorkSize[0] = array_size / 4; 
 +  localWorkSize[0] = work_group_size;​ 
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\t\t4. executing kernel %d, on platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +    printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 +    ​NR_COMP_UNITS,​ NR_CORES_PER_CU*NR_COMP_UNITS,​ NR_CORES_PER_CU);​ 
 +    printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 +    ​localWorkSize[0],​ globalWorkSize[0]/​localWorkSize[0],​ 
 +    ​globalWorkSize[0]/​localWorkSize[0]/​NR_COMP_UNITS);​ 
 +    printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​NR_COMP_UNITS,​ 
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +    printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​globalWorkSize[0],​  
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +  } 
 +    
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +  retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  if (retval != CL_SUCCESS) { 
 +      printf("​Failed to Set the kernel arguments.\n"​);​ 
 +      //Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +      return 1; 
 +    } 
 +  ​
    
-int main ( int argcchar *argv[] ​)+#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queue,​ kernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9;​ 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",​ 
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0);​ 
 +
 + 
 + 
 +int execute_kernel_2_GPU( 
 +  int platform_index, 
 + int device_index,​ 
 + int kernel_index,​ 
 + int array_size,​ 
 + cl_memmemObjects,​ 
 + int monitor 
 +  )
 { {
- int np NUM_SPAWNS+   
- int errcodes[NUM_SPAWNS];+  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  ​int i,j,k; 
 +   
 +   
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +#ifdef time_measurments 
 +  t_begin ​time_clock()
 +#endif 
 +   
 +  // create the kernel for GPU 
 +/*    utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +     // kernel name:         , file: 
 +     "​vecadd_0_kernel",​ "​vecadd_0.cl",​ monitor);​*/​ 
 +     ​utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 + ​  ​   // kernel name:         , file: */ 
 + ​  ​   "​vecadd_2_blocks_kernel",​ "​vecadd_2_blocks.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL){  
 +    printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +    ​platform_index,​ device_index);​ 
 +    printf("​%lu %lu\n",​ context, command_queue);​ 
 +  } 
 +   
 +   
 +  ​int NDRange_dim = 1; 
 +   
 +  int work_group_size = WORK_GROUP_SIZE;​ 
 +  globalWorkSize[0= array_size / work_group_size;​ 
 +  localWorkSize[0] = work_group_size;
  
- MPI_Comm parentcommintercomm;+  int size_per_thread = work_group_size;​ 
 +  int size = array_size;​ 
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\t\t4. executing kernel %don platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +    printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 +    ​NR_COMP_UNITS,​ NR_CORES_PER_CU*NR_COMP_UNITS,​ NR_CORES_PER_CU);​ 
 +    printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 +    ​localWorkSize[0],​ globalWorkSize[0]/​localWorkSize[0],​ 
 +    ​globalWorkSize[0]/​localWorkSize[0]/​NR_COMP_UNITS);​ 
 +    printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​NR_COMP_UNITS,​ 
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +    printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​globalWorkSize[0],​  
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +  } 
 +    
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem),​ &​memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel,​ 1, sizeof(cl_mem),​ &​memObjects[1]);​ 
 +  retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  retval |= clSetKernelArg(kernel,​ 3, sizeof(int),​ &​size);​ 
 +  retval |= clSetKernelArg(kernel,​ 4, sizeof(int),​ &​size_per_thread);​ 
 +  if (retval != CL_SUCCESS) { 
 +      printf("​Failed to Set the kernel arguments.\n"​);​ 
 +      //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +      return 1; 
 +    } 
 +   
 +  
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queue,​ kernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9;​ 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",​ 
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0); 
 +}
  
- MPI_Init(&​argc,​ &argv); 
- MPI_Comm_get_parent(&​parentcomm);​ 
  
- if (parentcomm ​== MPI_COMM_NULL) { +int execute_kernel_3_GPU( 
- MPI_Comm_spawn("root_proc", ​MPI_ARGV_NULLnpMPI_INFO_NULL, 0, MPI_COMM_WORLD, &intercommerrcodes); + int platform_index,​ 
- printf("​proces rodzic ​1\n"​);​ + int device_index,​ 
- } else { + int kernel_index,​ 
- printf("​Proces dziecko ​1\n");+ int array_size,​ 
 + cl_mem* memObjects,​ 
 + int monitor 
 +
 +
 +   
 +  cl_kernel kernel; 
 +  cl_event kernel_execution;​ 
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +  double exec_time;​ 
 +  cl_int retval; 
 +   
 +  size_t globalWorkSize[1] = { 0 }; 
 +  size_t localWorkSize[1] = { 0 }; 
 +   
 +  int i,j,k; 
 +   
 +  int size = array_size;​ 
 +   
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +  // create the kernel for GPU 
 +/*    utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +     // kernel name:         , file: 
 +     "​vecadd_0_kernel",​ "​vecadd_0.cl",​ monitor);​*/​ 
 +     ​utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 + ​  ​   // kernel name:         , file: 
 +     "​vecadd_3_opt_blocks_kernel",​ "​vecadd_3_opt_blocks.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel: %lf\n",​ t_end-t_begin);​ 
 +#endif 
 +   
 +   
 +  // choose the context 
 +  cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +   
 +  // choose the command queue 
 +  cl_command_queue command_queue =  
 +    utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +   
 +  ​if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \tRestoring context and command queue for platform %d and device %d\n",​ 
 +    ​platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue ​== NULL){  
 +    ​printf("failed to restore context and command queue for platform %d, device %d\n", ​ 
 +    ​platform_indexdevice_index);​ 
 +    printf("​%lu %lu\n"contextcommand_queue);​ 
 +  } 
 +   
 +   
 +  int NDRange_dim = 1; 
 +   
 +  int work_group_size = WORK_GROUP_SIZE;​ 
 +  globalWorkSize[0] = array_size;​ 
 +  localWorkSize[0] = work_group_size;​ 
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\t\t4. executing kernel %don platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +    printf("​PARAMETERS:​ \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",​ 
 +    ​NR_COMP_UNITS,​ NR_CORES_PER_CU*NR_COMP_UNITS,​ NR_CORES_PER_CU);​ 
 +    printf("​\t\tworkgroup size %d, nr_workgroups %d ,​nr_workgroups_per_CU %d\n",​ 
 +    ​localWorkSize[0],​ globalWorkSize[0]/​localWorkSize[0],​ 
 +    ​globalWorkSize[0]/​localWorkSize[0]/​NR_COMP_UNITS);​ 
 +    printf("​\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​NR_COMP_UNITS,​ 
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +    printf("​\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",​ 
 +    ​globalWorkSize[0],​ globalWorkSize[0]/​globalWorkSize[0],​  
 +    ​globalWorkSize[0]/​(NR_CORES_PER_CU*NR_COMP_UNITS));​ 
 +  } 
 +    
 +   
 +  // choose the kernel 
 +  kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +   
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +    ​kernel_index,​ platform_index,​ device_index);​ 
 +  } 
 +   
 +  if(context == NULL || command_queue == NULL || kernel == NULL){  
 +     
 +    printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +    ​platform_index,​ device_index,​ kernel_index);​ 
 +    printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +    ​context,​ command_queue,​ kernel); 
 +  } 
 +   
 +  // Set the kernel arguments (result, a, b) 
 +  retval = clSetKernelArg(kernel,​ 0, sizeof(cl_mem), &memObjects[0]);​ 
 +  retval |= clSetKernelArg(kernel1, sizeof(cl_mem),​ &​memObjects[1]); 
 +  ​retval |= clSetKernelArg(kernel,​ 2, sizeof(cl_mem),​ &​memObjects[2]);​ 
 +  retval |= clSetKernelArg(kernel,​ 3, sizeof(int),​ &​size);​ 
 + 
 +  if (retval != CL_SUCCESS) { 
 +      ​printf("​Failed to Set the kernel arguments.\n"​);​ 
 +      //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +      return ​1
 +    } 
 +   
 +  
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // Queue the kernel up for execution across the array 
 +  retval = clEnqueueNDRangeKernel(command_queue,​ kernel, 1, NULL, 
 +   globalWorkSize,​ localWorkSize,​ 
 +   0, NULL, &​kernel_execution);​ 
 +  clWaitForEvents(1,​ &​kernel_execution);​ 
 +   
 +   
 +#ifdef time_measurments 
 +  clFinish(command_queue);​ 
 +  t_end = time_clock();​ 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(kernel_execution,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  exec_time = ((double)endTime - (double)startTime)*1.0e-9;​ 
 +  printf("​EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n"
 + t_end-t_begin,​ ((double)endTime - (double)startTime)*1.0e-9);​ 
 +  printf("​\tNumber of operations %d, performance %lf GFlops\n",​ 
 + array_size, array_size / exec_time * 1e-9); 
 +  printf("​\tGBytes transferred to processor %lf, speed %lf GB/​s\n",​ 
 + 3*array_size*sizeof(float)*1e-9,​ 
 + 3*array_size*sizeof(float)/​exec_time*1e-9);​ 
 +#endif 
 +   
 +  if (retval != CL_SUCCESS) { 
 +    printf("​Failed to queue kernel for execution.\n"​);​ 
 +    //​Cleanup(context,​ command_queue,​ program, kernel, memObjects);​ 
 +    exit(-1); 
 +  } 
 +   
 +  return(0);​ 
 +
 + 
 + 
 + 
 + 
 +     
 +/*     // create the fourth kernel for GPU */ 
 +/*     ​kernel_index = 4;  */ 
 +/*     ​utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ */ 
 +/*     // kernel name:         , file: */ 
 +/*     "​vecadd_4_opt_blocks_vect_kernel",​ "​vecadd_4_opt_blocks_vect.cl",​ monitor); */</​code>​ 
 + 
 +==== opencl_vecadd/​vecadd_2_blocks.cl ==== 
 +<code c> 
 +__kernel void vecadd_2_blocks_kernel(__global const float *a, 
 +      ​__global const float *b, 
 +      ​__global float *result, 
 +                      const int size, 
 +      const int size_per_thread) 
 +
 +  int gid = get_global_id(0);​ 
 +  int index_start = gid * size_per_thread;​ 
 +  int index_end = (gid+1) * size_per_thread;​ 
 + 
 +  if (gid == 0) { 
 + //​printf("​size:​ %d, %d", size, size_per_thread);​ 
 +  } 
 + 
 +  for (int i=index_start;​ i < index_end && i < size; i++) {  
 +  //for (int i=index_start;​ i < index_end ; i++) {  
 +    result[i] = a[i]+b[i];​ 
 +  } 
 +}</​code>​ 
 +===== Zajęcia 7 ===== 
 +{{:​studia:​magisterskie:​1sem:​zajecia_7.zip|}} 
 + 
 +==== tmp/​opencl_mat_transp/​mat_transp_1.cl ==== 
 +<code c> #pragma OPENCL EXTENSION cl_khr_fp64 : enable 
 + 
 +__kernel void mat_transp_1_kernel( 
 +  __global float *A ,  
 +  __global float *B , 
 +  __global float *C , 
 +  int N) 
 +
 + 
 +int id = get_global_id(0) * get_global_size(0) + get_global_id(1);​ 
 +float temp = 0.0; 
 + 
 +for(int i = 0; i<​N;​i++){ 
 + temp += A[id*N + i] *B[i]; 
 +
 + 
 +C[id] = temp; 
 +}</​code>​ 
 + 
 +==== tmp/​opencl_mat_transp/​mat_transp/​mat_transp.c ==== 
 +<code c>#​include<​stdlib.h>​ 
 +#​include<​stdio.h>​ 
 +#include <​math.h>​ 
 + 
 +#include <​CL/​cl.h>​ 
 + 
 +#​include"​uth_ocl_intf.h"​ 
 + 
 +#define time_measurments 
 + 
 +#ifdef time_measurments 
 +#​include"​uth_time_intf.h"​ 
 +  static double t_begin, t_end, t_total; 
 +#endif 
 + 
 + 
 + 
 +#define BLOCK_SIZE 16 
 +#define NR_GROUPS 16 
 +#define MULT 16 
 +#define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT) 
 +#define ROZMIAR (WYMIAR*WYMIAR) 
 +// Matrices are stored in row-major order:  
 +// M(row, col) = M( row * WYMIAR + col )  
 + 
 +int verify_result( 
 +  float* result, 
 +  float* result_compare 
 +   ) 
 +
 +  // Verify the result 
 +  int result_OK = 1; 
 +  int i,j; 
 +  for(i = 0; i < WYMIAR; i++) { 
 +      if(fabs(result[i] - result_compare[i])>​1.e-6) { 
 + result_OK = 0; 
 + break; 
 +    } 
 +  } 
 +  printf("​\t\t6. verifying results: "); 
 +  if(result_OK) { 
 +    printf("​Output is correct\n"​);​ 
 +  ​} else { 
 +    printf("​Output is incorrect\n"​);​ 
 +    j = 0; 
 +    for(i = 0; i < WYMIAR, j<100; i++) { 
 + if(fabs(result[i] - result_compare[i])>​1.e-9) { 
 +   j++; 
 +   printf("​%d %16.8f %16.8f\n",  
 +   i, result[i], result_compare[i]);
  }  }
- fflush(stdout); +    } 
- MPI_Finalize();+    exit(0); 
 +  } 
 +    /* for(i = 0; i < length; i++) { */ 
 +    /*   ​printf("​%16.8f %16.8f\n",​ result[i], result_compare[i]);​ */ 
 +    /* } */ 
 + 
 +  return(result_OK);​ 
 +
 + 
 +  /​*----------------KERNEL CREATION PHASE----------------------*/​ 
 +void create_kernels() 
 +
 + 
 +  // for all operations indicate explicit info messages 
 +  int monitor = UTC_BASIC_INFO + 1; 
 + 
 +  int kernel_index;​ 
 +   
 +  int platform_index = utv_ocl_struct.current_platform_index;​ 
 + 
 +  if(utr_ocl_CPU_context_exists(platform_index)){ 
 + 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +   
 +  // create the first kernel for CPU 
 +  /* kernel_index = 0;  */ 
 +  /* utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_CPU,​ kernel_index,​ */ 
 +  /*   // kernel name:         , file: */ 
 +  /*   "​mat_transp_1_kernel",​ "​mat_transp_1.cl",​ monitor); */ 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel %d: %lf\n",​ kernel_index,​ t_end-t_begin);​ 
 +#endif 
 +   
 +  } 
 + 
 +  if(utr_ocl_GPU_context_exists(platform_index)){ 
 + 
 + 
 +#ifdef time_measurments 
 +  t_begin = time_clock();​ 
 +#endif 
 +  // create the first kernel for GPU 
 +  kernel_index = 0;  
 +  utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +   // kernel name:         , file: 
 +   "​mat_transp_1_kernel",​ "​mat_transp_1.cl",​ monitor); 
 +   
 +#ifdef time_measurments 
 +  t_end = time_clock();​ 
 +  printf("​EXECUTION TIME: creating CPU kernel %d: %lf\n",​ kernel_index,​ t_end-t_begin);​ 
 +#endif 
 +   
 +  // create the second kernel for GPU 
 +  kernel_index = 1;  
 +  utr_ocl_create_kernel_dev_type( platform_index,​ UTC_OCL_DEVICE_GPU,​ kernel_index,​ 
 +   // kernel name:         , file: 
 +   "​mat_transp_2_kernel",​ "​mat_transp_2.cl",​ monitor); 
 + 
 +   
 + 
 +  } 
 + 
 +
 + 
 +  /​*----------------EXECUTION PHASE----------------------*/​ 
 +int execute_kernels() 
 +
 + 
 +  // for all operations indicate explicit info messages 
 +  int monitor = UTC_BASIC_INFO + 1; 
 + 
 +  int platform_index = utv_ocl_struct.current_platform_index;​ 
 +  utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index];​ 
 + 
 +  int kernel_index;​ 
 +  int i,j,n; 
 + 
 +  double nr_access;​ 
 +  double t1,t2; 
 + 
 +  if(monitor>​UTC_BASIC_INFO){ 
 +    printf("​\n------------Starting execution phase----------------\n"​);​ 
 +  } 
 +   
 +  // create matrices 
 +  n=WYMIAR; 
 +  float* A = (float *) malloc(ROZMIAR*sizeof(float));​ 
 +  float* B = (float *) malloc(WYMIAR*sizeof(float));​ 
 +  float* C = (float *) malloc(WYMIAR*sizeof(float));​ 
 +  float* D = (float *) malloc(WYMIAR*sizeof(float));​ 
 +   
 +  for(i=0;​i<​ROZMIAR;​i++) A[i]=1.0*i/​10000000.0;​ 
 + 
 +  for(i=0;​i<​WYMIAR;​i++) B[i]=1.0*i/​10000000.0;​ 
 + 
 +  // mnozenie macierz wektor 
 +  for(i=0; i<​WYMIAR;​ i++){ 
 + for(j=0; j<​WYMIAR;​ j++){ 
 + D[i] += A[i*WYMIAR + j] * B[j];  
 +
 +  } 
 +   
 +  nr_access= ROZMIAR + 2.0*WYMIAR; // read + write 
 +   
 +  printf("​mat_transp:​ nr_access %lf\n",​ nr_access); 
 +   
 +  // get hardware characteristics to select good matrix shape 
 +  // the set of device characteristics stored in data structure 
 +  int device_index = 0;  
 +  utt_ocl_device_struct device_struct =  
 +    utv_ocl_struct.list_of_platforms[platform_index].list_of_devices[device_index];​ 
 +  double global_mem_bytes = device_struct.global_mem_bytes;​ 
 +  double global_max_alloc = device_struct.global_max_alloc;​ 
 +  double local_mem_bytes = device_struct.local_mem_bytes;​ 
 +  double constant_mem_bytes = device_struct.constant_mem_bytes;​ 
 +  int max_num_comp_units = device_struct.max_num_comp_units;​ 
 +  int max_work_group_size = device_struct.max_work_group_size;​ 
 +   
 +  // in a loop over devices ​(or for a selected device) 
 +  int idev=0 
 +  for(idev=0; idev<​platform_struct.number_of_devices;​ idev++){ 
 +     
 +    // int device_type = ..... 
 +    // choose device_index 
 +    // int device_index = utr_ocl_select_device(platform_index,​ device_type);​ 
 +    int device_index = idev; 
 +    int device_type = utr_ocl_device_type(platform_index,​ device_index);​  
 +     
 +    if(device_index>​0 && device_type==utr_ocl_device_type(platform_index,​ device_index-1)) break;  
 +    if(device_type == UTC_OCL_DEVICE_CPU) break; 
 + 
 +    // choose the context 
 +    cl_context context = utr_ocl_select_context(platform_index,​ device_index); ​  
 +     
 +    // choose the command queue 
 +    cl_command_queue command_queue =  
 +      utr_ocl_select_command_queue(platform_index,​ device_index); ​  
 +     
 +    if(monitor>​UTC_BASIC_INFO){ 
 +      printf("​\nExecution:​ \t0. restoring context and command queue for platform %d and device %d\n",​ 
 +      ​platform_index,​ device_index);​ 
 +    } 
 +     
 +    if(context == NULL || command_queue == NULL){  
 +       
 +      printf("​failed to restore context and command queue for platform %d, device %d\n",  
 +      ​platform_index,​ device_index);​ 
 +      printf("​%lu %lu\n",​ context, command_queue);​ 
 +    } 
 +     
 +    // choose the kernel 
 +    kernel_index = 0; 
 +    cl_kernel kernel = utr_ocl_select_kernel(platform_index,​ device_index,​ kernel_index); ​  
 +     
 +    if(monitor>​UTC_BASIC_INFO){ 
 +      printf("​\nExecution:​ \t3. restoring kernel %d for platform %d and device %d\n",​ 
 +      ​kernel_index,​ platform_index,​ device_index);​ 
 +    } 
 +     
 +    if(context == NULL || command_queue == NULL || kernel == NULL){  
 +       
 +      printf("​failed to restore kernel for platform %d, device %d, kernel %d\n",  
 +      ​platform_index,​ device_index,​ kernel_index);​ 
 +      printf("​context %lu, command queue %lu, kernel %lu\n",​  
 +      ​context,​ command_queue,​ kernel); 
 +    } 
 +     
 +    for(i=0;​i<​WYMIAR;​i++) C[i]=0.0; 
 +    time_init();​ t1 = time_clock();​ 
 +     
 +    // call routine to perform matrix transposition 
 +    mat_transp_host(kernel_index,​ A, B, C, n, context, kernel, command_queue);​ 
 +     
 +    t2 = time_clock();​ time_print();​ 
 +    printf("​GB/​s = %lf\n\n",​ nr_access*sizeof(float)/​(t2-t1)/​1024/​1024/​1024);​ 
 +     
 +     
 +    // verify result  
 +    verify_result(C,​ D); 
 +     
 +     
 +  } // end loop over devices 
 +   
 +  return(0);​ 
 +
 + 
 + 
 + 
 +// Matrix transposition - Host code  
 +// Matrix dimensions are assumed to be multiples of BLOCK_SIZE  
 +int mat_transp_host( 
 +  int kernel_index,​ 
 +  float* A,  
 +  float* B, 
 +  float* C,  
 +  int N,  
 +  const cl_context context,  
 +  const cl_kernel mat_transp_kernel,​  
 +  const cl_command_queue queue 
 + )  
 +{  
 + 
 +// Load A to device memory  
 +  size_t size_bytesm = N*N*sizeof(float);​  
 +  size_t size_bytesw = N*sizeof(float);​  
 +   
 + 
 +  cl_mem d_A = clCreateBuffer(context,​ CL_MEM_READ_ONLY,​ 
 + size_bytesm,​ NULL, NULL);  
 + 
 +  // Write A to device memory  
 +  clEnqueueWriteBuffer(queue,​ d_A, CL_TRUE, 0, size_bytesm,​ A, 0, 0, 0);  
 + 
 +  cl_mem d_B = clCreateBuffer(context,​ CL_MEM_READ_ONLY,​ 
 + size_bytesw,​ NULL, NULL);  
 + 
 +  // Write A to device memory  
 +  clEnqueueWriteBuffer(queue,​ d_B, CL_TRUE, 0, size_bytesw,​ B, 0, 0, 0);  
 + 
 +  // Allocate B in device memory  
 +  cl_mem d_C = clCreateBuffer(context,​ CL_MEM_WRITE_ONLY,​ size_bytesw,​ NULL, NULL);  
 + 
 + 
 +  // Invoke kernel  
 +  clSetKernelArg(mat_transp_kernel,​ 0, sizeof(cl_mem),​ (void*)&​d_A);​ 
 +  clSetKernelArg(mat_transp_kernel,​ 1, sizeof(cl_mem),​ (void*)&​d_B);​ 
 +  clSetKernelArg(mat_transp_kernel,​ 2, sizeof(cl_mem),​ (void*)&​d_C);​ 
 +  clSetKernelArg(mat_transp_kernel,​ 3, sizeof(int),​ (void*)&​N);​ 
 + 
 +  size_t localWorkSize[3];​ 
 +  size_t globalWorkSize[3];​ 
 +  cl_uint work_dim; 
 + 
 +  if(kernel_index==3){ 
 + 
 +    work_dim = 2; 
 +    localWorkSize[0] = BLOCK_SIZE;  
 +    globalWorkSize[0] = N/MULT;  
 +    localWorkSize[1] = BLOCK_SIZE;  
 +    globalWorkSize[1] = N/MULT;  
 +    localWorkSize[2] = 0;  
 +    globalWorkSize[2] = 0;  
 + 
 + 
 +  } 
 +  else{ 
 + 
 +    work_dim = 2; 
 +    localWorkSize[0] = BLOCK_SIZE;  
 +    globalWorkSize[0] = sqrt(N);  
 +    localWorkSize[1] = BLOCK_SIZE;  
 +    globalWorkSize[1] = sqrt(N);  
 +    localWorkSize[2] = 0;  
 +    globalWorkSize[2] = 0;  
 +   
 +  } 
 + 
 +  clFinish(queue);​ 
 +  double t1 = time_clock();​ 
 +  // Enqueue a kernel run call 
 +  cl_event ndrEvt; 
 +  clEnqueueNDRangeKernel(queue,​ mat_transp_kernel,​ work_dim, 0,  
 + globalWorkSize,​ localWorkSize,​ 0, 0, &​ndrEvt);​  
 +  clWaitForEvents(1,​ &​ndrEvt);​ 
 +  clFinish(queue);​ 
 +  double t2 = time_clock();​ 
 +  // Calculate performance  
 +  cl_ulong startTime;​ 
 +  cl_ulong endTime; 
 +   
 +  // Get kernel profiling info  
 +  clGetEventProfilingInfo(ndrEvt,​ 
 +   CL_PROFILING_COMMAND_START,​ 
 +   sizeof(cl_ulong),​ 
 +   &​startTime,​ 
 +   0); 
 +  clGetEventProfilingInfo(ndrEvt,​ 
 +   CL_PROFILING_COMMAND_END,​ 
 +   sizeof(cl_ulong),​ 
 +   &​endTime,​ 
 +   0); 
 +  double time = (double)endTime - (double)startTime;​ 
 +  printf("​\nKernel execution internal: time %lf, GB/s = %lf\n",​  
 + time*1.0e-9,​ 2.0*N*N*sizeof(float)/​(time*1.0e-9)/​1024/​1024/​1024);​ 
 + 
 +  printf("​Kernel execution external: time %lf,  GB/s = %lf\n\n",​  
 + t2-t1, 2.0*N*N*sizeof(float)/​(t2-t1)/​1024/​1024/​1024);​ 
 + 
 +  // Read B from device memory  
 +  clEnqueueReadBuffer(queue,​ d_C, CL_TRUE, 0, size_bytesw,​ C, 0, 0, 0);  
 + 
 +  // Free device memory  
 +  clReleaseMemObject(d_A);​  
 +  clReleaseMemObject(d_B);​  
 +  clReleaseMemObject(d_C);​ 
 + 
 +  return(0);​ 
 }</​code>​ }</​code>​
 +==== Kod wogu ====
 +=== server.c ====
 +<code c>
 +/*
 +    C socket server example, handles multiple clients using threads
 +    Compile
 +    gcc server.c -lpthread -o server
 +*/
 + 
 +#​include<​stdio.h>​
 +#​include<​string.h> ​   //strlen
 +#​include<​stdlib.h> ​   //strlen
 +#​include<​sys/​socket.h>​
 +#​include<​arpa/​inet.h>​ //inet_addr
 +#​include<​unistd.h> ​   //write
 +#​include<​pthread.h>​ //for threading , link with lpthread
 +#include <​unistd.h>​
 + 
 +//the thread function
 +void *connection_handler(void *);
 + 
 +int main(int argc , char *argv[])
 +{
 +    int counter = 0;
 +    pid_t pid = fork();
 +
 +    if (pid == 0)
 +    {
 +       ​  ​   int sock;
 +     struct sockaddr_in server;
 +     char message[1000] , server_reply[2000];​
 +            int counter;
 +     unsigned int ip[] = { 192, 168, 102, 1 };
 +            char* ip_final;
 + char *buf;
 + size_t sz;
 +  
 +     //Create socket
 +
 +            for ( counter = 0; counter < 255; ++counter ) {
 +
 +     sock = socket(AF_INET , SOCK_STREAM , 0);
 +     if (sock == -1)
 +     {
 + printf("​Could not create socket"​);​
 +     }
 +     puts("​Socket created"​);​
 +
 +     //​printf("​192.168.102.%d",​ counter);
 +
 + sz = snprintf(NULL,​ 0, "​192.168.102.%d",​ counter);
 + buf = (char *)malloc(sz + 1); /* make sure you check for != NULL in real code */
 +     ​
 +     server.sin_addr.s_addr = inet_addr(buf);​
 +     server.sin_family = AF_INET;
 +     server.sin_port = htons( 65001 );
 +  
 +     //Connect to remote server
 +     if (connect(sock , (struct sockaddr *)&​server , sizeof(server)) < 0)
 +     {
 + perror("​connect failed. Error"​);​
 + continue;​
 +     }
 +  
 +     puts("​Connected\n"​);​
 +  
 +     //keep communicating with server
 +     while(1)
 +     {
 + //Send some data
 + if( send(sock , "​Werset 6-client"​ , 15 , 0) < 0)
 + {
 +     puts("​Send failed"​);​
 +     return 1;
 + }
 +  
 + //​Receive a reply from the server
 + if( recv(sock , server_reply , 2000 , 0) < 0)
 + {
 +     puts("​recv failed"​);​
 +     break;
 + }
 +
 + puts(server_reply);​
 +
 + break;
 +     }
 +  
 +     close(sock);​
 +           }
 +    }
 +    else if (pid > 0)
 +    {
 +        int socket_desc , client_sock , c;
 +     struct sockaddr_in server , client;
 +      
 +     //Create socket
 +     socket_desc = socket(AF_INET , SOCK_STREAM , 0);
 +     if (socket_desc == -1)
 +     {
 + printf("​Could not create socket"​);​
 +     }
 +     puts("​Socket created"​);​
 +      
 +     //Prepare the sockaddr_in structure
 +     server.sin_family = AF_INET;
 +     server.sin_addr.s_addr = INADDR_ANY;
 +     server.sin_port = htons( 65001 );
 +      
 +     //Bind
 +     if( bind(socket_desc,​(struct sockaddr *)&​server , sizeof(server)) < 0)
 +     {
 + //print the error message
 + perror("​bind failed. Error"​);​
 + return 1;
 +     }
 +     puts("​bind done"​);​
 +      
 +     //Listen
 +     listen(socket_desc , 3);
 +      
 +     //Accept and incoming connection
 +     puts("​Waiting for incoming connections..."​);​
 +     c = sizeof(struct sockaddr_in);​
 +      
 +      
 +     //Accept and incoming connection
 +     puts("​Waiting for incoming connections..."​);​
 +     c = sizeof(struct sockaddr_in);​
 + pthread_t thread_id;
 +
 +     while( (client_sock = accept(socket_desc,​ (struct sockaddr *)&​client,​ (socklen_t*)&​c)) )
 +     {
 + puts("​Connection accepted"​);​
 +  
 + if( pthread_create( &​thread_id , NULL ,  connection_handler , (void*) &​client_sock) < 0)
 + {
 +     perror("​could not create thread"​);​
 +     return 1;
 + }
 +  
 + //Now join the thread , so that we dont terminate before the thread
 + //​pthread_join( thread_id , NULL);
 + puts("​Handler assigned"​);​
 +     }
 +      
 +     if (client_sock < 0)
 +     {
 + perror("​accept failed"​);​
 + return 1;
 +     }
 +    }
 +    else
 +    {
 +        // fork failed
 +        printf("​fork() failed!\n"​);​
 +        return 1;
 +    }
 +     
 +    return 0;
 +}
 + 
 +/*
 + * This will handle connection for each client
 + * */
 +void *connection_handler(void *socket_desc)
 +{
 +    //Get the socket descriptor
 +    int sock = *(int*)socket_desc;​
 +    int read_size;
 +    char *message , client_message[2000];​
 +
 +    //Receive a message from client
 +    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
 +    {
 + printf("​%s\n",​ client_message);​
 +
 + //end of string marker
 + client_message[read_size] = '​\0';​
 +
 + //Send the message back to client
 + write(sock , "​Werset 6" , 8);
 +
 + //clear the message buffer
 + memset(client_message,​ 0, 2000);
 +    }
 +     
 +    if(read_size == 0)
 +    {
 +        puts("​Client disconnected"​);​
 +        fflush(stdout);​
 +    }
 +    else if(read_size == -1)
 +    {
 +        perror("​recv failed"​);​
 +    }
 +         
 +    return 0;
 +}</​code>​
 +=== Makefile ===
 +<code bash># kompilator c
 +CC = gcc
 + 
 +# konsolidator
 +CFLAGS=-g
 +
 +all: server client
 + 
 +server: server.o
 + $(CC) $(LFLAGS) server.o -lpthread -o server
 + 
 +server.o: server.c
 + $(CC) -c server.c -o server.o
 +
 +client: client.o
 + $(CC) client.o -o client
 + 
 +client.o: client.c
 + $(CC) -c client.c -o client.o
 + 
 +clean:
 + rm -f *.o</​code>​
 +===== Zajęcia 9 =====
 +<code c>/*
 +    C socket server example, handles multiple clients using threads
 +    Compile
 +    gcc server.c -lpthread -o server
 +*/
 + 
 +#​include<​stdio.h>​
 +#​include<​string.h> ​   //strlen
 +#​include<​stdlib.h> ​   //strlen
 +#​include<​sys/​socket.h>​
 +#​include<​arpa/​inet.h>​ //inet_addr
 +#​include<​unistd.h> ​   //write
 +#​include<​pthread.h>​ //for threading , link with lpthread
 + 
 +//the thread function
 +void *connection_handler(void *);
 + 
 +int main(int argc , char *argv[])
 +{
 +    int socket_desc , client_sock , c;
 +    struct sockaddr_in server , client;
 +     
 +    //Create socket
 +    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
 +    if (socket_desc == -1)
 +    {
 +        printf("​Could not create socket"​);​
 +    }
 +    puts("​Socket created"​);​
 +     
 +    //Prepare the sockaddr_in structure
 +    server.sin_family = AF_INET;
 +    server.sin_addr.s_addr = INADDR_ANY;
 +    server.sin_port = htons( 8888 );
 +     
 +    //Bind
 +    if( bind(socket_desc,​(struct sockaddr *)&​server , sizeof(server)) < 0)
 +    {
 +        //print the error message
 +        perror("​bind failed. Error"​);​
 +        return 1;
 +    }
 +    puts("​bind done"​);​
 +     
 +    //Listen
 +    listen(socket_desc , 3);
 +     
 +    //Accept and incoming connection
 +    puts("​Waiting for incoming connections..."​);​
 +    c = sizeof(struct sockaddr_in);​
 +     
 +     
 +    //Accept and incoming connection
 +    puts("​Waiting for incoming connections..."​);​
 +    c = sizeof(struct sockaddr_in);​
 + pthread_t thread_id;
 +
 +    while( (client_sock = accept(socket_desc,​ (struct sockaddr *)&​client,​ (socklen_t*)&​c)) )
 +    {
 +        puts("​Connection accepted"​);​
 +         
 +        if( pthread_create( &​thread_id , NULL ,  connection_handler , (void*) &​client_sock) < 0)
 +        {
 +            perror("​could not create thread"​);​
 +            return 1;
 +        }
 +         
 +        //Now join the thread , so that we dont terminate before the thread
 +        //​pthread_join( thread_id , NULL);
 +        puts("​Handler assigned"​);​
 +    }
 +     
 +    if (client_sock < 0)
 +    {
 +        perror("​accept failed"​);​
 +        return 1;
 +    }
 +     
 +    return 0;
 +}
 + 
 +/*
 + * This will handle connection for each client
 + * */
 +void *connection_handler(void *socket_desc)
 +{
 +    //Get the socket descriptor
 +    int sock = *(int*)socket_desc;​
 +    int read_size;
 +    char *message , client_message[2000];​
 +
 +    //Receive a message from client
 +    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
 +    {
 + //end of string marker
 + client_message[read_size] = '​\0';​
 +
 + //Send the message back to client
 + write(sock , "​Werset 6" , strlen(client_message));​
 +
 + //clear the message buffer
 + memset(client_message,​ 0, 2000);
 +    }
 +     
 +    if(read_size == 0)
 +    {
 +        puts("​Client disconnected"​);​
 +        fflush(stdout);​
 +    }
 +    else if(read_size == -1)
 +    {
 +        perror("​recv failed"​);​
 +    }
 +         
 +    return 0;
 +} </​code>​
 +
 +==== Client C ====
 +
 +<code c>#​include<​stdio.h>​ //printf
 +#​include<​string.h> ​   //strlen
 +#​include<​sys/​socket.h> ​   //socket
 +#​include<​arpa/​inet.h>​ //inet_addr
 + 
 +int main(int argc , char *argv[])
 +{
 +    int sock;
 +    struct sockaddr_in server;
 +    char message[1000] , server_reply[2000];​
 +     
 +    //Create socket
 +    sock = socket(AF_INET , SOCK_STREAM , 0);
 +    if (sock == -1)
 +    {
 +        printf("​Could not create socket"​);​
 +    }
 +    puts("​Socket created"​);​
 +     
 +    server.sin_addr.s_addr = inet_addr("​127.0.0.1"​);​
 +    server.sin_family = AF_INET;
 +    server.sin_port = htons( 8888 );
 + 
 +    //Connect to remote server
 +    if (connect(sock , (struct sockaddr *)&​server , sizeof(server)) < 0)
 +    {
 +        perror("​connect failed. Error"​);​
 +        return 1;
 +    }
 +     
 +    puts("​Connected\n"​);​
 +     
 +    //keep communicating with server
 +    while(1)
 +    {
 +        printf("​Enter message : ");
 +        scanf("​%s"​ , message);
 +         
 +        //Send some data
 +        if( send(sock , message , strlen(message) , 0) < 0)
 +        {
 +            puts("​Send failed"​);​
 +            return 1;
 +        }
 +         
 +        //Receive a reply from the server
 +        if( recv(sock , server_reply , 2000 , 0) < 0)
 +        {
 +            puts("​recv failed"​);​
 +            break;
 +        }
 +         
 +        puts("​Server reply :");
 +        puts(server_reply);​
 +    }
 +     
 +    close(sock);​
 +    return 0;
 +}</​code>​
 +
 +
 +==== Kod Maciek Gonzo itp ====
 +<code c>
 +#​include<​stdio.h>​
 +#​include<​string.h> ​   //strlen
 +#​include<​stdlib.h> ​   //strlen
 +#​include<​sys/​socket.h>​
 +#​include<​arpa/​inet.h>​ //inet_addr
 +#​include<​unistd.h> ​   //write
 +#​include<​pthread.h>​ //for threading , link with lpthread
 + 
 +//the thread function
 +void *connection_handler(void *);
 +
 +void *client_function(void *socket_desc){
 +  int sock;
 +    struct sockaddr_in server_id;
 +    char message[1000] , server_reply[2000];​
 +     
 +    //Create socket
 +    sock = socket(AF_INET , SOCK_STREAM , 0);
 +    if (sock == -1)
 +    {
 +        printf("​Could not create socket"​);​
 +    }
 +    puts("​Socket created"​);​
 +     // 192.168.102.63 wogu
 +    server_id.sin_addr.s_addr = inet_addr("​localhost"​);​
 +    server_id.sin_family = AF_INET;
 +    server_id.sin_port = htons( 65000 );
 + 
 +    //Connect to remote server
 +
 +    if (connect(sock , (struct sockaddr *)&​server_id , sizeof(server_id)) < 0)
 +    {
 +        perror("​connect failed. Error"​);​
 +
 +    }else{
 +
 +    puts("​Connected\n"​);​
 +     
 +    //keep communicating with server
 +    while(1)
 +    {
 +        printf("​Enter message : ");
 +        scanf("​%s"​ , message);
 +         
 +        //Send some data
 +        if( send(sock , message , strlen(message) , 0) < 0)
 +        {
 +            puts("​Send failed"​);​
 +
 +        }
 +         
 +        //Receive a reply from the server
 +        if( recv(sock , server_reply , 2000 , 0) < 0)
 +        {
 +            puts("​recv failed"​);​
 +            break;
 +        }
 +         
 +        puts("​Server reply :");
 +        puts(server_reply);​
 +    }
 +     
 +    close(sock); ​
 +}
 +   
 +};
 + 
 +
 +void *server_function(void * sock){
 +    printf("​SERVER Start"​);​
 +fflush(stdout);​
 +    int socket_desc , client_sock , c;
 +    pthread_t thread_id;
 +    struct sockaddr_in server , client;
 + 
 +    //Create socket
 +    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
 +    if (socket_desc == -1)
 +    {
 +        printf("​Could not create socket"​);​
 +    }
 +    puts("​Socket created"​);​
 + 
 +    //Prepare the sockaddr_in structure
 +    server.sin_family = AF_INET;
 +    server.sin_addr.s_addr = INADDR_ANY;
 +    server.sin_port = htons( 65000);
 + 
 +    //Bind
 +    if( bind(socket_desc,​(struct sockaddr *)&​server , sizeof(server)) < 0)
 +    {
 +        //print the error message
 +        perror("​bind failed. Error"​);​
 +        return;
 +    }
 +    puts("​bind done"​);​
 +
 +
 +
 +    //Listen
 +    listen(socket_desc , 3);
 + 
 +    //Accept and incoming connection
 +    puts("​Waiting for incoming connections..."​);​
 +    c = sizeof(struct sockaddr_in);​
 +fflush(stdout);​
 + 
 +    while( (client_sock = accept(socket_desc,​ (struct sockaddr *)&​client,​ (socklen_t*)&​c)) )
 +    {
 +        puts("​Connection accepted"​);​
 + 
 +        if( pthread_create( &​thread_id , NULL ,  connection_handler , (void*) &​client_sock) < 0)
 +        {
 +            perror("​could not create thread"​);​
 +            return;
 +        }
 + 
 +        //Now join the thread , so that we dont terminate before the thread
 +        //​pthread_join( thread_id , NULL);
 +        puts("​Handler assigned"​);​
 +    }
 + 
 +    if (client_sock < 0)
 +    {
 +        perror("​accept failed"​);​
 +        return;
 +    }
 +}
 +
 +
 +int main(int argc , char *argv[])
 +{
 +    pthread_t thread_id;
 +//Create server
 +        if( pthread_create( &​thread_id , NULL ,  server_function , NULL) != 0)
 +        {
 +            perror("​could not create server thread"​);​
 +     fflush(stdout);​
 +            return 1;
 +        }
 +    pthread_t thread_id2;
 +
 + // Create client
 +        if( pthread_create( &​thread_id2 , NULL ,  client_function , NULL) != 0)
 +        {
 +            perror("​could not create client thread"​);​
 +     fflush(stdout);​
 +            return 1;
 +        }
 +
 +    return 0;
 +}
 + 
 +/*
 + * This will handle connection for each client
 + * */
 +void *connection_handler(void *socket_desc)
 +{
 +    //Get the socket descriptor
 +    int sock = *(int*)socket_desc;​
 +    int read_size;
 +    char *message , client_message[2000];​
 + 
 +    //Receive a message from client
 +    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
 +    {
 +        //end of string marker
 + client_message[read_size] = '​\0';​
 + 
 + //Send the message back to client
 +        write(sock , "​Werset 5" , strlen(client_message));​
 + 
 + //clear the message buffer
 + memset(client_message,​ 0, 2000);
 +    }
 + 
 +    if(read_size == 0)
 +    {
 +        puts("​Client disconnected"​);​
 +        fflush(stdout);​
 +    }
 +    else if(read_size == -1)
 +    {
 +        perror("​recv failed"​);​
 +    }
 + 
 +    return 0;
 +
 +
 +</​code>​
studia/magisterskie/1sem/systemy_rownolegle_i_rozproszone.1457604633.txt.gz · ostatnio zmienione: 2016/03/10 11:10 przez 149.156.112.6