====== Zajęcia 1 ======
==== Makefile ====

# C compiler (MPI compiler wrapper)
CCOMP = mpicc

# linker
LINK = mpicc

# MPI launcher used by the "run" target
MPIRUN = /usr/lib64/openmpi/bin/mpiexec

# optimization options:
# debug build
# OPT = -g -DDEBUG -p
# optimized build for timing measurements
# OPT = -O3 -fopenmp -p

# header search paths
#INC = -I../pomiar_czasu

# libraries
#LIB = -L../pomiar_czasu -lm
LIB = -lm

# targets that do not name files
.PHONY: run clean

# dependencies and commands

heat: heat.o
	$(LINK) $(OPT) heat.o -o heat $(LIB)

heat.o: heat.c
	$(CCOMP) -c $(OPT) heat.c $(INC)

# build the program if necessary, then launch it on 8 processes
run: heat
	$(MPIRUN) -np 8 ./heat

# remove the object files and the linked executable
clean:
	rm -f *.o heat

==== heat.c ====

# include <stdlib.h>
# include <stdio.h>
# include <math.h>

# include "mpi.h"

int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );

/******************************************************************************/

int main ( int argc, char *argv[] )
{
  const double a = 0.0; // left end of the whole interval
  const double b = 1.0; // right end of the whole interval
  int id;               // rank of this process
  int p;                // number of processes
  int n;                // number of points handled by each process
  int i;                // offset of a point inside this process's range
  int total;            // index of the last point of the whole interval
  int first;            // global index of this process's point 0
  double x_min;
  double x_max;

  MPI_Init ( &argc, &argv );
  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
  MPI_Comm_size ( MPI_COMM_WORLD, &p );

  n = 12;
  total = p * n + 1;
  first = id * n;

  // coordinate of the leftmost point of process id: weighted mean of a and b
  i = 0;
  x_min = ( ( double ) ( total - first - i ) * a
          + ( double ) ( first + i ) * b )
          / ( double ) total;

  // coordinate of the rightmost point of process id
  i = n + 1;
  x_max = ( ( double ) ( total - first - i ) * a
          + ( double ) ( first + i ) * b )
          / ( double ) total;

  // computation for this process's subdomain
  heat_part ( n, p, id, x_min, x_max );

  MPI_Finalize ( );

  return 0;
}

/******************************************************************************/
// Computation for a single node, i.e. a single subdomain.
//
// Advances the values h at the n points 1..n of [x_min, x_max] through j_max
// explicit finite-difference time steps. h[0] and h[n+1] hold the values
// received from the neighbouring ranks id-1 and id+1; rank 0 and rank p-1
// set them from the boundary conditions instead. After the last step every
// rank prints its coordinates and values, and rank 0 also prints the elapsed
// wall-clock time.
//
//   n     - number of points updated by this rank
//   p     - number of ranks taking part in the exchange
//   id    - rank of the calling process
//   x_min - coordinate of point 0 of this subdomain
//   x_max - coordinate of point n+1 of this subdomain
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;

  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // point coordinates

  if ( h == NULL || h_new == NULL || x == NULL )
  {
    fprintf ( stderr, "  Memory allocation failed on process %d.\n", id );
    MPI_Abort ( MPI_COMM_WORLD, 1 );
    exit ( 1 );
  }

  k = 0.002 / ( double ) p; // thermal conductivity

  j_min = 0; // time step indices - min and max
  j_max = 100;
  t_min = 0.0; // time instants - min and max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // time step Delta t

  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // spacing between points
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max
           + ( double ) ( n + 1 - i ) * x_min )
           / ( double ) ( n + 1     );
  }

  // initial condition
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }

  // stability check of the explicit scheme
  cfl = k * t_del / x_del / x_del;

  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }

  // the source term of the first step is evaluated at the initial time;
  // without this assignment t would be read before it is ever written
  t = t_min;

  wtime = MPI_Wtime ( ); // start of the time measurement

  for ( j = 1; j <= j_max; j++ )
  {

    // exchange with neighbouring ranks: h[n] goes right, h[1] goes left
    tag = 1;

    if ( id < p - 1 )
    {
      MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
    }

    if ( 0 < id )
    {
      MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
    }

    tag = 2;

    if ( 0 < id )
    {
      MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
    }

    if ( id < p - 1 )
    {
      MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
    }

    // finite-difference update
    for ( i = 1; i <= n; i++ )
    {
      h_new[i] = h[i] + t_del * (
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
        + 2.0 * sin ( x[i] * t ) );
    }

    // new time instant
    t = ( ( double ) (         j - j_min ) * t_max
            + ( double ) ( j_max - j         ) * t_min )
            / ( double ) ( j_max     - j_min );

    // prepare the next time step
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;

  }

  // final exchange with neighbouring ranks so the printed h[0] and h[n+1]
  // match the last computed step
  tag = 11;

  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }

  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }

  tag = 12;

  if ( 0 < id ) {
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }

  if ( id < p - 1 ) {
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
  }

  wtime = MPI_Wtime ( ) - wtime;

  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }

  // print the result
  printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" );

  free ( h );
  free ( h_new );
  free ( x );

  return;
}

====== Zajęcia 2 ======
==== heat.c ====

# include <stdlib.h>
# include <stdio.h>
# include <math.h>

# include "mpi.h"

int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );

/******************************************************************************/

int main ( int argc, char *argv[] )
{
  const double a = 0.0;       // left end of the whole interval
  const double b = 1000000.0; // right end of the whole interval
  int id;                     // rank of this process
  int p;                      // number of processes
  int n;                      // number of points handled by each process
  int i;                      // offset of a point inside this process's range
  int total;                  // index of the last point of the whole interval
  int first;                  // global index of this process's point 0
  double x_min;
  double x_max;

  MPI_Init ( &argc, &argv );
  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
  MPI_Comm_size ( MPI_COMM_WORLD, &p );

  n = 1000000;
  total = p * n + 1;
  first = id * n;

  // coordinate of the leftmost point of process id: weighted mean of a and b
  i = 0;
  x_min = ( ( double ) ( total - first - i ) * a
          + ( double ) ( first + i ) * b )
          / ( double ) total;

  // coordinate of the rightmost point of process id
  i = n + 1;
  x_max = ( ( double ) ( total - first - i ) * a
          + ( double ) ( first + i ) * b )
          / ( double ) total;

  // computation for this process's subdomain
  heat_part ( n, p, id, x_min, x_max );

  MPI_Finalize ( );

  return 0;
}

/******************************************************************************/
// Computation for a single node, i.e. a single subdomain (non-blocking
// exchange variant).
//
// Advances the values h at the n points 1..n of [x_min, x_max] through j_max
// explicit finite-difference time steps. In every step the receives into
// h[0] and h[n+1] and the sends of h[1] and h[n] are started first, points
// 2..n-1 are updated while the messages are in flight, and points 1 and n
// are updated once the receives have completed. Rank 0 prints the elapsed
// wall-clock time at the end.
//
//   n     - number of points updated by this rank
//   p     - number of ranks taking part in the exchange
//   id    - rank of the calling process
//   x_min - coordinate of point 0 of this subdomain
//   x_max - coordinate of point n+1 of this subdomain
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;

  MPI_Request req1, req2, req3, req4;
  MPI_Status stat1, stat2, stat3, stat4;

  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // point coordinates

  if ( h == NULL || h_new == NULL || x == NULL )
  {
    fprintf ( stderr, "  Memory allocation failed on process %d.\n", id );
    MPI_Abort ( MPI_COMM_WORLD, 1 );
    exit ( 1 );
  }

  k = 0.002 / ( double ) p; // thermal conductivity

  j_min = 0; // time step indices - min and max
  j_max = 100;
  t_min = 0.0; // time instants - min and max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // time step Delta t

  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // spacing between points
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max
           + ( double ) ( n + 1 - i ) * x_min )
           / ( double ) ( n + 1     );
  }

  // initial condition
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }

  // stability check of the explicit scheme
  cfl = k * t_del / x_del / x_del;

  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }

  // the source term of the first step is evaluated at the initial time;
  // without this assignment t would be read before it is ever written
  t = t_min;

  wtime = MPI_Wtime ( ); // start of the time measurement

  for ( j = 1; j <= j_max; j++ )
  {

    // start the exchange with neighbouring ranks: receives first ...
    if ( id > 0 )
    {
      MPI_Irecv ( &h[0], 1, MPI_DOUBLE, id-1, 1, MPI_COMM_WORLD, &req1 );
    }
    if ( id < p - 1 )
    {
      MPI_Irecv ( &h[n+1], 1, MPI_DOUBLE, id+1, 2, MPI_COMM_WORLD, &req2 );
    }

    // ... then the matching sends (tag 2 travels left, tag 1 travels right)
    if ( id > 0 )
    {
      MPI_Isend ( &h[1], 1, MPI_DOUBLE, id-1, 2, MPI_COMM_WORLD, &req3 );
    }
    if ( id < p - 1 )
    {
      MPI_Isend ( &h[n], 1, MPI_DOUBLE, id+1, 1, MPI_COMM_WORLD, &req4 );
    }

    // finite-difference update of the points that do not need h[0] or h[n+1]
    for ( i = 2; i <= n-1; i++ )
    {
      h_new[i] = h[i] + t_del * (
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
        + 2.0 * sin ( x[i] * t ) );
    }

    // wait for the received neighbour values
    if ( id > 0 )
    {
      MPI_Wait ( &req1, &stat1 );
    }
    if ( id < p - 1 )
    {
      MPI_Wait ( &req2, &stat2 );
    }

    // update the two end points, which depend on h[0] and h[n+1]
    int tmp = 1;
    h_new[tmp] = h[tmp] + t_del * (
        k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del
        + 2.0 * sin ( x[tmp] * t ) );
    tmp = n;
    h_new[tmp] = h[tmp] + t_del * (
        k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del
        + 2.0 * sin ( x[tmp] * t ) );

    // the send buffers h[1] and h[n] are overwritten below, so the sends
    // have to be complete first
    if ( id > 0 )
    {
      MPI_Wait ( &req3, &stat3 );
    }
    if ( id < p - 1 )
    {
      MPI_Wait ( &req4, &stat4 );
    }

    // new time instant
    t = ( ( double ) (         j - j_min ) * t_max
            + ( double ) ( j_max - j         ) * t_min )
            / ( double ) ( j_max     - j_min );

    // prepare the next time step
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;

  }

  // final (blocking) exchange with neighbouring ranks
  tag = 11;

  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }

  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }

  tag = 12;

  if ( 0 < id ) {
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }

  if ( id < p - 1 ) {
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
  }

  wtime = MPI_Wtime ( ) - wtime;

  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }

  // printing of the result is disabled in this variant
  /*printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" ); */

  free ( h );
  free ( h_new );
  free ( x );

  return;
}

====== Zajęcia 3 ======
==== child.c ====

# include <stdlib.h>
# include <stdio.h>
# include <math.h>

# include "mpi.h"

int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );

/******************************************************************************/

int main ( int argc, char *argv[] )
{
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  double x_max;
  double x_min;
  MPI_Comm parentcomm; 
  MPI_Status status;

  MPI_Init ( &argc, &argv );

  MPI_Comm_rank ( MPI_COMM_WORLD, &id );

  MPI_Comm_size ( MPI_COMM_WORLD, &p );

  MPI_Comm_get_parent(&parentcomm);

  MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status );

  MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status );
  MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &status );

  //MPI_Barrier(MPI_COMM_WORLD);

  heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla

  MPI_Finalize ( );

  return 0;
}

/******************************************************************************/
// Computation for a single node, i.e. a single subdomain.
//
// Advances the values h at the n points 1..n of [x_min, x_max] through j_max
// explicit finite-difference time steps. h[0] and h[n+1] hold the values
// received from the neighbouring ranks id-1 and id+1; rank 0 and rank p-1
// set them from the boundary conditions instead. After the last step every
// rank prints its coordinates and values, and rank 0 also prints the elapsed
// wall-clock time.
//
//   n     - number of points updated by this rank
//   p     - number of ranks taking part in the exchange
//   id    - rank of the calling process
//   x_min - coordinate of point 0 of this subdomain
//   x_max - coordinate of point n+1 of this subdomain
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;

  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // solution at t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // point coordinates

  if ( h == NULL || h_new == NULL || x == NULL )
  {
    fprintf ( stderr, "  Memory allocation failed on process %d.\n", id );
    MPI_Abort ( MPI_COMM_WORLD, 1 );
    exit ( 1 );
  }

  k = 0.002 / ( double ) p; // thermal conductivity

  j_min = 0; // time step indices - min and max
  j_max = 100;
  t_min = 0.0; // time instants - min and max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // time step Delta t

  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // spacing between points
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max
           + ( double ) ( n + 1 - i ) * x_min )
           / ( double ) ( n + 1     );
  }

  // initial condition
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }

  // stability check of the explicit scheme
  cfl = k * t_del / x_del / x_del;

  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }

  // the source term of the first step is evaluated at the initial time;
  // without this assignment t would be read before it is ever written
  t = t_min;

  wtime = MPI_Wtime ( ); // start of the time measurement

  for ( j = 1; j <= j_max; j++ )
  {

    // exchange with neighbouring ranks: h[n] goes right, h[1] goes left
    tag = 1;

    if ( id < p - 1 )
    {
      MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
    }

    if ( 0 < id )
    {
      MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
    }

    tag = 2;

    if ( 0 < id )
    {
      MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
    }

    if ( id < p - 1 )
    {
      MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
    }

    // finite-difference update
    for ( i = 1; i <= n; i++ )
    {
      h_new[i] = h[i] + t_del * (
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
        + 2.0 * sin ( x[i] * t ) );
    }

    // new time instant
    t = ( ( double ) (         j - j_min ) * t_max
            + ( double ) ( j_max - j         ) * t_min )
            / ( double ) ( j_max     - j_min );

    // prepare the next time step
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;

  }

  // final exchange with neighbouring ranks so the printed h[0] and h[n+1]
  // match the last computed step
  tag = 11;

  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }

  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }

  tag = 12;

  if ( 0 < id ) {
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }

  if ( id < p - 1 ) {
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
  }

  wtime = MPI_Wtime ( ) - wtime;

  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }

  // print the result
  printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" );

  free ( h );
  free ( h_new );
  free ( x );

  return;
}

==== parent.c ====


# include 
# include 
# include 

# include "mpi.h"

int main ( int argc, char *argv[] );

/******************************************************************************/

int main ( int argc, char *argv[] )
{
  double a = 0.0; // lewy brzeg przedzialu
  double b = 1.0; // prawy brzeg przedzialu
  int i; 
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  int np = 8; // liczba procesow
  double x_max;
  double x_min;
  MPI_Comm komunikator;
  int *errcodes;

  MPI_Init ( &argc, &argv );

  //MPI_Comm_rank ( MPI_COMM_WORLD, &id );

  //MPI_Comm_size ( MPI_COMM_WORLD, &p );

  MPI_Comm_spawn("dziecko", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &komunikator, errcodes);

  n = 12; // liczba punktow dla kazdego wezla
  int j;
  for(j=0;j

====== Zajęcia 4 ======
wkrótce
====== Zajęcia 5 ======
==== Makefile ====
# optimization and other system dependent options
#include  make.$(SRR_ARCH)
# or directly
include  make.lab_404_NVIDIA

# name of the executable produced by the link step
NAME = Hello_GPU

# targets that do not name files
.PHONY: program clean

# link the host program
program: main.o
	$(CC) $(LDFL) main.o  $(LIB) -o $(NAME)

# compile the host source
main.o: main.c
	$(CC) $(CFL) -c main.c  $(INC) -o main.o

# remove build products; main.o is created in the current directory,
# the obj/ rule is kept from the original
clean:
	rm -f main.o
	rm -f obj/*
	rm -f $(NAME)

==== make.lab_404_NVIDIA ====
# Toolchain and path settings: every variable below is a plain assignment;
# lines starting with '#' are alternatives that are currently switched off.

# C compiler
#CC = icc
CC = gcc

# C++ compiler
CPPC = icpc
#CPPC = g++

# Loader (to link C/C++ and Fortran libraries)
LD = icpc
#LD = g++

# Archiver
AR = ar r

# For removing files
RM = rm -f

# Include directories (OpenCL headers from the CUDA 7 installation path)
INC = -I/opt/cuda7/include

# Standard and/or local libraries (OpenCL library from the same installation)
LIB = -L/opt/cuda7/lib64 -lOpenCL 

# C optimization and other flags
# (the lines below are example values per compiler; none is active, so CFL
# is empty)
#CFL(icc) = -O3 -openmp
#CFL(gcc) = -O3 -fopenmp
#CFL(icc,debug) = -g
#CFL(gcc) = -g
CFL =

# Loader optimization and other flags
# (example values for debug and profile builds; none is active)
#LDFL(debug) = -g 
#LDFL(profile) = -p                                
LDFL = 


==== main.c ====
#include<stdlib.h>
#include<stdio.h>

#include <CL/cl.h>

// functions to display platform and device properties
// Print one string-valued property of an OpenCL platform as "\t<str>:\t<value>".
//   id   - platform to query
//   name - property selector passed to clGetPlatformInfo
//   str  - label printed in front of the value and in error messages
// On any failure a message is printed and the function returns without
// printing the property.
void DisplayPlatformInfo(
			 cl_platform_id id, 
			 cl_platform_info name,
			 char* str)
{
  cl_int retval;
  size_t paramValueSize;
  
  // first call: ask only for the size of the value
  retval = clGetPlatformInfo(
			     id,
			     name,
			     0,
			     NULL,
			     &paramValueSize);
  if (retval != CL_SUCCESS){
    printf("Failed to find OpenCL platform %s.\n", str);
    return;
  }
  
  // one extra byte so the buffer is NUL-terminated whatever the query returns
  char * info = (char *)malloc(sizeof(char) * (paramValueSize + 1));
  if (info == NULL){
    printf("Failed to allocate memory for OpenCL platform %s.\n", str);
    return;
  }
  info[paramValueSize] = '\0';

  // second call: fetch the value itself
  retval = clGetPlatformInfo(
			     id,
			     name,
			     paramValueSize,
			     info,
			     NULL);
  if (retval != CL_SUCCESS)  {
    printf("Failed to find OpenCL platform %s.\n", str);
    free(info);
    return;
  }
  
  printf("\t%s:\t%s\n", str, info );
  free(info); 
}

// Print one string-valued property of an OpenCL device as "\t\t<str>:\t<value>".
//   id   - device to query
//   name - property selector passed to clGetDeviceInfo
//   str  - label printed in front of the value and in error messages
// On any failure a message is printed and the function returns without
// printing the property.
void DisplayDeviceInfo_char(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;
  
  // first call: ask only for the size of the value
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
  
  // one extra byte so the buffer is NUL-terminated whatever the query returns
  char * info = (char *)malloc(sizeof(char) * (paramValueSize + 1));
  if (info == NULL) {
    printf("Failed to allocate memory for OpenCL device info %s.\n", str);
    return;
  }
  info[paramValueSize] = '\0';

  // second call: fetch the value itself
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
  
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    free(info);
    return;
  }

  printf("\t\t%s:\t%s\n", str, info );
  free(info);
}

// Print one cl_ulong-valued property of an OpenCL device, divided by
// 1024*1024 and labelled "MB", as "\t\t<str>:\t<value> MB".
//   id   - device to query
//   name - property selector passed to clGetDeviceInfo
//   str  - label printed in front of the value and in error messages
// On any failure a message is printed and the function returns without
// printing the property.
void DisplayDeviceInfo_ulong(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;
  
  // first call: ask only for the size (in bytes) of the value
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  // info[0] is read below, so the value must hold at least one cl_ulong
  if (retval != CL_SUCCESS || paramValueSize < sizeof(cl_ulong)) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
  
  // paramValueSize is already a byte count
  cl_ulong * info = (cl_ulong *)malloc(paramValueSize);
  if (info == NULL) {
    printf("Failed to allocate memory for OpenCL device info %s.\n", str);
    return;
  }

  // second call: fetch the value itself
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
  
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    free(info);
    return;
  }

  // the cast makes the argument match %llu on every platform
  printf("\t\t%s:\t%llu MB\n", str,
	 (unsigned long long)(info[0] / 1024 / 1024) );
  free(info);
}

// Print one cl_uint-valued property of an OpenCL device, labelled "MHz",
// as "\t\t<str>:\t<value> MHz".
//   id   - device to query
//   name - property selector passed to clGetDeviceInfo
//   str  - label printed in front of the value and in error messages
// On any failure a message is printed and the function returns without
// printing the property.
void DisplayDeviceInfo_uint(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;
  
  // first call: ask only for the size (in bytes) of the value
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  // info[0] is read below, so the value must hold at least one cl_uint
  if (retval != CL_SUCCESS || paramValueSize < sizeof(cl_uint)) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
  
  // paramValueSize is already a byte count
  cl_uint * info = (cl_uint *)malloc(paramValueSize);
  if (info == NULL) {
    printf("Failed to allocate memory for OpenCL device info %s.\n", str);
    return;
  }

  // second call: fetch the value itself
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
  
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    free(info);
    return;
  }

  printf("\t\t%s:\t%u MHz\n", str, (unsigned int)info[0] );
  free(info);
}


// main program controlling execution of CPU code and OpenCL kernels
int main(int argc, char** argv)
{
  cl_uint number_of_contexts = 2;
  cl_context context = NULL;
  cl_context list_of_contexts[2] = {0,0};
  cl_command_queue commandQueue = 0;
  cl_program program = 0;
  cl_uint number_of_devices;
  cl_device_id device = 0;
  cl_device_id *list_of_devices;
  cl_device_type type;
  cl_kernel kernel = 0;
  cl_mem memObjects[3] = { 0, 0, 0 };
  cl_int retval;
  int icon, idev;
  cl_uint numPlatforms;
  cl_platform_id * platformIds;
  cl_uint i,j;

  // flag to control displaying
  int Monitor = 1;


  // Create OpenCL contexts 

  // First, query the total number of platforms
  retval = clGetPlatformIDs(0, (cl_platform_id *) NULL, &numPlatforms);

  // Next, allocate memory for the installed plaforms, and qeury 
  // to get the list.
  platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);

  // Then, query the platform IDs
  retval = clGetPlatformIDs(numPlatforms, platformIds, NULL);

  if(Monitor>=0){
    printf("Number of platforms: \t%d\n", numPlatforms); 
  }

  // Iterate through the list of platforms displaying associated information
  for (i = 0; i < numPlatforms; i++) {

    if(Monitor>0){

      printf("Platform ID - %d\n",i);
      // First we display information associated with the platform
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_NAME, 
			"CL_PLATFORM_NAME");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_PROFILE, 
			"CL_PLATFORM_PROFILE");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_VERSION, 
			"CL_PLATFORM_VERSION");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_VENDOR, 
			"CL_PLATFORM_VENDOR");
    }

  }

  // For the first platform
  int iplat;
  printf("Select Platform ID: "); scanf("%d", &iplat);

  // Query the set of devices associated with the platform
  retval = clGetDeviceIDs(
			  platformIds[iplat],
			  CL_DEVICE_TYPE_ALL,
			  0,
			  NULL,
			  &number_of_devices);
  
  
  list_of_devices = 
    (cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices);
  
  retval = clGetDeviceIDs(
			  platformIds[iplat],
			  CL_DEVICE_TYPE_ALL,
			  number_of_devices,
			  list_of_devices,
			  NULL);
  
  if(Monitor>=0){
    printf("Number of devices: \t%d\n", number_of_devices); 
  }

  // Iterate through each device, displaying associated information
  for (j = 0; j < number_of_devices; j++) {
    
    clGetDeviceInfo(list_of_devices[j], CL_DEVICE_TYPE, 
		    sizeof(cl_device_type), &type, NULL);
    
    if(Monitor>0){
      
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_NAME, 
			"CL_DEVICE_NAME");
      
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_VENDOR, 
			"CL_DEVICE_VENDOR");
      
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_VERSION, 
			"CL_DEVICE_VERSION");

      DisplayDeviceInfo_ulong(
			list_of_devices[j], 
			CL_DEVICE_GLOBAL_MEM_SIZE, 
			"CL_DEVICE_GLOBAL_MEM_SIZE");

      DisplayDeviceInfo_uint(
			list_of_devices[j], 
			CL_DEVICE_MAX_CLOCK_FREQUENCY, 
			"CL_DEVICE_MAX_CLOCK_FREQUENCY");
      printf("\n");
    }
  }
  
  // Next, create OpenCL contexts on platforms
  cl_context_properties contextProperties[] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties)platformIds[iplat],
    0
  };
  
  if(Monitor>0){
    printf("Creating CPU context %d on platform %d\n", 1, iplat);
  }
  
  list_of_contexts[1] = 
    clCreateContextFromType(contextProperties, 
			    CL_DEVICE_TYPE_CPU, NULL, NULL, &retval);
  
  if(Monitor>=0 && retval != CL_SUCCESS){
    printf("Could not create CPU context on platform %d\n", i);
  }
  
  if(Monitor>0){
    printf("Creating GPU context 0 on platform %d\n", iplat);
  }
  
  list_of_contexts[0] = 
    clCreateContextFromType(contextProperties, 
			    CL_DEVICE_TYPE_GPU, NULL, NULL, &retval);
  
  if(Monitor>=0 && retval != CL_SUCCESS){
    printf("Could not create GPU context on platform %d\n", i);
  }
  
  // in a loop over devices of the seleceted platform   
  for(idev=0; idev0){
      printf("\nFor context %d and device %d:\n",
	     idev, idev);
    }
    device = list_of_devices[idev];
    icon = idev;
    
    // choose OpenCL context on first available platform
    context = list_of_contexts[icon];
    
    if(context !=0){
            
      commandQueue = clCreateCommandQueue(context, device, 0, NULL);
      if (commandQueue == NULL) {
	printf("Failed to create commandQueue for device %d\n", idev);
	exit(0);
      }
      
      if(Monitor>0){
	printf("Reading program from source\n");
      }

      // read source code from file
      FILE *fp;
      char* source;
      long int size;
      
      fp = fopen("HelloWorld.cl", "rb");
      if(!fp) {
	printf("Could not open kernel file\n");
	exit(-1);
      }
      int status = fseek(fp, 0, SEEK_END);
      if(status != 0) {
	printf("Error seeking to end of file\n");
	exit(-1);
      }
      size = ftell(fp);
      if(size < 0) {
	printf("Error getting file position\n");
	exit(-1);
      }
      
      rewind(fp);
      
      source = (char *)malloc(size + 1);
      
      int i;
      for (i = 0; i < size+1; i++) {
	source[i]='\0';
      }
      
      if(source == NULL) {
	printf("Error allocating space for the kernel source\n");
	exit(-1);
      }
      
      fread(source, 1, size, fp);
      source[size] = '\0';

      const char* source_const = source;
      cl_program program = clCreateProgramWithSource(context, 1,
						     &source_const,
						     NULL, NULL);
      if (program == NULL)
	{
	  printf("Failed to create CL program from source.\n");
	  exit(-1);
	}
      
      if(Monitor>0){
	printf("Creating program and kernel\n");
      }
      // build program (passing options to compiler if necessary
      retval = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
      char* buildLog; size_t size_of_buildLog; 
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
			    0, NULL, &size_of_buildLog); 
      buildLog = malloc(size_of_buildLog+1); 
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
			    size_of_buildLog, buildLog, NULL); 
      buildLog[size_of_buildLog]= '\0'; 
      printf("Kernel buildLog: %s\n", buildLog); 
      if (retval != CL_SUCCESS)
	{
	  printf("Error in kernel\n");
	  clReleaseProgram(program);
	  exit(-1);
	}
      
      
      // Create OpenCL kernel
      kernel = clCreateKernel(program, "hello_kernel", NULL);
      if (kernel == NULL)
	{
	  printf("Failed to create kernel.\n");
	  exit(0);
	}
      
      if(Monitor>0){
	printf("Creating memory objects\n");
      }
      // Create memory objects that will be used as arguments to
      // kernel.  First create host memory arrays that will be
      // used to store the arguments to the kernel
      float result[1];
      float a[1];
      float b[1];
      a[0] = 2;
      b[0] = 2;
      
      memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
				     sizeof(float), NULL, NULL);
      memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
				     sizeof(float), NULL, NULL);
      memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
				     sizeof(float), NULL, NULL);
      
      if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){
	printf("Error creating memory objects.\n");
	return 0;
      }
      
      if(Monitor>0){
	printf("Sending kernel arguments\n");
      }
      retval = clEnqueueWriteBuffer(
				    commandQueue, 
				    memObjects[0], 
				    CL_FALSE, 
				    0, 
				    sizeof(float),                         
				    a, 
				    0, 
				    NULL, 
				    NULL);
      
      // Use clEnqueueWriteBuffer() to write input array B to 
      // the device buffer bufferB
      retval = clEnqueueWriteBuffer(
				    commandQueue, 
				    memObjects[1], 
				    CL_FALSE, 
				    0, 
				    sizeof(float),                         
				    b,
				    0, 
				    NULL, 
				    NULL);
      
      // Set the kernel arguments (result, a, b)
      retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
      retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
      retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to Set the kernel arguments.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
      
      if(Monitor>0){
	printf("Running the kernel!\n");
      }
      size_t globalWorkSize[1] = { 1 };
      size_t localWorkSize[1] = { 1 };
      
      // Queue the kernel up for execution across the array
      retval = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
				      globalWorkSize, localWorkSize,
				      0, NULL, NULL);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to queue kernel for execution.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
      
      if(Monitor>0){
	printf("Transfering back results\n");
      }
      // Read the output buffer back to the Host
      retval = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
				   0, sizeof(float), result,
				   0, NULL, NULL);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to read result buffer.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
      
      // Verify the output
      if(result[0]==4)  {
        printf("Output is correct: %lf + %lf = %lf\n",
	       a[0], b[0], result[0]);
      } else {
        printf("Output is incorrect: %lf + %lf != %lf\n",
	       a[0], b[0], result[0]);
      }
      
      for (i = 0; i < 3; i++)
      	{
      	  if (memObjects[i] != 0)
            clReleaseMemObject(memObjects[i]);
      	}
      if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
      
      if (kernel != 0)
        clReleaseKernel(kernel);
      
      if (program != 0)
        clReleaseProgram(program);
      
    }
  }
  
  free(list_of_devices);
  free(platformIds);
  
  return 0;
}

==== HelloWorld.cl ====

// Adds the first element of a and b and stores the sum in result[0].
// Only work-items whose global id is below 1 write anything.
__kernel void hello_kernel(__global const float *a,
                           __global const float *b,
                           __global float *result)
{
    const int gid = get_global_id(0);

    // every other work-item has nothing to do
    if (gid >= 1) {
        return;
    }

    const float sum = a[gid] + b[gid];
    result[gid] = sum;
}
====== Zajęcia 6 ======
Pliki zadania: {{:studia:magisterskie:1sem:opencl_vecadd.tgz|}}
Działające 3 kernele: {{:studia:magisterskie:1sem:sala404.zip|}}

==== opencl_vecadd/vecadd_host/execute_kernels.c ====
#include
#include
#include 

#include
#include 

#include"uth_ocl_intf.h"

#include"./vecadd_host.h"
#include"./hardware_params.h"

#define time_measurments

#ifdef time_measurments
#include"uth_time_intf.h"
  static double t_begin, t_end, t_total;
#endif

/**
 * Build and run "vecadd_2_blocks_kernel" (vecadd_2_blocks.cl) on the CPU device.
 *
 * One work-item is launched per processor reported by omp_get_num_procs(),
 * in work-groups of size 1; each work-item is given a contiguous block of
 * size_per_thread = ceil(array_size / nr_threads) entries.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries to process
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (a, b, result in the kernel's signature)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success; every failure path terminates with exit(-1)
 */
int execute_kernel_2_CPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for CPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);

#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. Restoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    exit(-1);
  }

  int nr_cores_CPU = omp_get_num_procs();
  int nr_threads = nr_cores_CPU; // total number of threads
  int work_group_size = 1; // number of threads in work-group
  globalWorkSize[0] = nr_threads ;
  localWorkSize[0] = work_group_size ;

  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	 nr_cores_CPU, nr_cores_CPU, 1);
  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	 work_group_size, nr_threads / work_group_size,
	 nr_threads / work_group_size / nr_cores_CPU);
  // the size_t operands make these expressions size_t, hence %zu
  printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
	 globalWorkSize[0]/nr_cores_CPU);
  printf("\t\tarray size %d, nr_entries_per_thread %zu, nr_entries_per_core %d\n",
	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    exit(-1);
  }

  int size = array_size;
  // integer ceiling division: exact for every int, unlike ceilf((float)size/n),
  // which loses precision once size exceeds 2^24
  int size_per_thread = size / nr_threads + (size % nr_threads > 0);

  // Set the kernel arguments (a, b, result, size, size_per_thread)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    exit(-1);
  }

  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}


/**
 * Build and run "vecadd_3_opt_blocks_kernel" (vecadd_3_opt_blocks.cl) on the
 * CPU device.
 *
 * One work-item is launched per processor reported by omp_get_num_procs(),
 * in work-groups of size 1. The kernel receives three buffers and the array
 * size.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries to process
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (the original comment listed them as result, a, b;
 *                        TODO confirm against the kernel's parameter order)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success, 1 when a handle cannot be restored or an argument
 *         cannot be set; a failed enqueue terminates with exit(-1)
 */
int execute_kernel_3_CPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for CPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_3_opt_blocks_kernel","vecadd_3_opt_blocks.cl",monitor);

#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    return 1;
  }

  int nr_cores_CPU = omp_get_num_procs();
  int nr_threads = nr_cores_CPU; // total number of threads
  int work_group_size = 1; // number of threads in work-group
  globalWorkSize[0] = nr_threads ;
  localWorkSize[0] = work_group_size ;

  printf("\t\tExecuting kernel %d, on platform %d and device %d\n",
	 kernel_index, platform_index, device_index);
  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	 nr_cores_CPU, nr_cores_CPU, 1);
  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	 work_group_size, nr_threads / work_group_size,
	 nr_threads / work_group_size / nr_cores_CPU);
  // the size_t operands make these expressions size_t, hence %zu
  printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
	 globalWorkSize[0]/nr_cores_CPU);
  printf("\t\tarray size %d, nr_entries_per_thread %zu, nr_entries_per_core %d\n",
	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    return 1;
  }

  int size = array_size;

  // Set the kernel arguments: three buffers followed by the array size
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    return 1;
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}

// this is the function to modify in the exercise
/**
 * Build and run "vecadd_0_kernel" (vecadd_0.cl) on the GPU device.
 *
 * Launches array_size work-items in work-groups of WORK_GROUP_SIZE.
 * NOTE(review): array_size is presumably expected to be a multiple of
 * WORK_GROUP_SIZE; the kernel source is not visible here, so the global
 * size is not padded.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries; also the global work size
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (the original comment listed them as result, a, b;
 *                        TODO confirm against the kernel's parameter order)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success, 1 when a handle cannot be restored or an argument
 *         cannot be set; a failed enqueue terminates with exit(-1)
 */
int execute_kernel_0_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for GPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_0_kernel", "vecadd_0.cl", monitor);

#ifdef time_measurments
  t_end = time_clock();
  // this function builds a GPU kernel; the message used to say "CPU"
  printf("EXECUTION TIME: creating GPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    return 1;
  }

  // global and local number of threads
  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size;
  localWorkSize[0] = work_group_size;

  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    // the size_t operands make these expressions size_t, hence %zu
    printf("\t\tworkgroup size %zu, nr_workgroups %zu ,nr_workgroups_per_CU %zu\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %zu, nr_entries_per_core %zu\n",
	   array_size, array_size/globalWorkSize[0],
	   (size_t)array_size/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    return 1;
  }

  // Set the kernel arguments: one call per kernel parameter (three buffers)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    return 1;
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}


/**
 * Build and run "vecadd_1_kernel" (vecadd_1.cl) on the GPU device.
 *
 * Launches array_size / 4 work-items in work-groups of WORK_GROUP_SIZE.
 * NOTE(review): the division by 4 suggests each work-item handles four
 * entries; the kernel source is not visible here, so confirm.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries to process
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (the original comment listed them as result, a, b;
 *                        TODO confirm against the kernel's parameter order)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success, 1 when a handle cannot be restored or an argument
 *         cannot be set; a failed enqueue terminates with exit(-1)
 */
int execute_kernel_1_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for GPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_1_kernel", "vecadd_1.cl", monitor);

#ifdef time_measurments
  t_end = time_clock();
  // this function builds a GPU kernel; the message used to say "CPU"
  printf("EXECUTION TIME: creating GPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    return 1;
  }

  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size / 4;
  localWorkSize[0] = work_group_size;

  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    // the size_t operands make these expressions size_t, hence %zu
    printf("\t\tworkgroup size %zu, nr_workgroups %zu ,nr_workgroups_per_CU %zu\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    // report the real array size, not the number of work-items
    printf("\t\tarray size %d, nr_entries_per_thread %zu, nr_entries_per_core %zu\n",
	   array_size, array_size/globalWorkSize[0],
	   (size_t)array_size/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    return 1;
  }

  // Set the kernel arguments (three buffers)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    return 1;
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}


/**
 * Build and run "vecadd_2_blocks_kernel" (vecadd_2_blocks.cl) on the GPU device.
 *
 * Each work-item adds a contiguous block of size_per_thread = WORK_GROUP_SIZE
 * entries. The number of work-items is rounded up so that the whole array is
 * covered, then padded to a multiple of the work-group size; the kernel's
 * own "i < size" guard keeps padding work-items from touching memory.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries to process
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (a, b, result in the kernel's signature)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success, 1 when a handle cannot be restored or an argument
 *         cannot be set; a failed enqueue terminates with exit(-1)
 */
int execute_kernel_2_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for GPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);

#ifdef time_measurments
  t_end = time_clock();
  // this function builds a GPU kernel; the message used to say "CPU"
  printf("EXECUTION TIME: creating GPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    return 1;
  }

  int work_group_size = WORK_GROUP_SIZE;
  int size = array_size;
  int size_per_thread = work_group_size;

  // work-items needed to cover every entry (ceiling division), so a tail
  // shorter than size_per_thread is no longer dropped
  size_t nr_threads = ((size_t)size + size_per_thread - 1) / size_per_thread;
  // the global size must be a multiple of the local size
  globalWorkSize[0] =
    (nr_threads + work_group_size - 1) / work_group_size * work_group_size;
  localWorkSize[0] = work_group_size;

  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    // the size_t operands make these expressions size_t, hence %zu
    printf("\t\tworkgroup size %zu, nr_workgroups %zu ,nr_workgroups_per_CU %zu\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    // report the real array size and block length, not the work-item count
    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %zu\n",
	   array_size, size_per_thread,
	   (size_t)array_size/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    return 1;
  }

  // Set the kernel arguments (a, b, result, size, size_per_thread)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    return 1;
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}


/**
 * Build and run "vecadd_3_opt_blocks_kernel" (vecadd_3_opt_blocks.cl) on the
 * GPU device.
 *
 * Launches array_size work-items in work-groups of WORK_GROUP_SIZE and
 * passes the array size to the kernel as its fourth argument.
 *
 * @param platform_index  platform used to look up context, queue and kernel
 * @param device_index    device used to look up context, queue and kernel
 * @param kernel_index    slot under which the kernel is created and selected
 * @param array_size      number of vector entries; also the global work size
 * @param memObjects      three buffers bound to kernel arguments 0, 1 and 2
 *                        (the original comment listed them as result, a, b;
 *                        TODO confirm against the kernel's parameter order)
 * @param monitor         verbosity; values above UTC_BASIC_INFO print progress
 * @return 0 on success, 1 when a handle cannot be restored or an argument
 *         cannot be set; a failed enqueue terminates with exit(-1)
 */
int execute_kernel_3_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_int retval;

  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };

  int size = array_size;

  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif

  // create the kernel for GPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_3_opt_blocks_kernel", "vecadd_3_opt_blocks.cl", monitor);

#ifdef time_measurments
  t_end = time_clock();
  // this function builds a GPU kernel; the message used to say "CPU"
  printf("EXECUTION TIME: creating GPU kernel: %lf\n", t_end-t_begin);
#endif

  // choose the context and the command queue
  cl_context context = utr_ocl_select_context(platform_index, device_index);
  cl_command_queue command_queue =
    utr_ocl_select_command_queue(platform_index, device_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }

  if(context == NULL || command_queue == NULL){
    printf("failed to restore context and command queue for platform %d, device %d\n",
	   platform_index, device_index);
    // handles are pointers: print them with %p, not %lu
    printf("%p %p\n", (void *)context, (void *)command_queue);
    // nothing below can work without a context and a queue
    return 1;
  }

  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size;
  localWorkSize[0] = work_group_size;

  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    // the size_t operands make these expressions size_t, hence %zu
    printf("\t\tworkgroup size %zu, nr_workgroups %zu ,nr_workgroups_per_CU %zu\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %zu, nr_threads_per_CU %zu, nr_threads_per_core %zu\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %zu, nr_entries_per_core %zu\n",
	   array_size, array_size/globalWorkSize[0],
	   (size_t)array_size/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }

  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);

  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }

  if(kernel == NULL){
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
	   platform_index, device_index, kernel_index);
    printf("context %p, command queue %p, kernel %p\n",
	   (void *)context, (void *)command_queue, (void *)kernel);
    return 1;
  }

  // Set the kernel arguments: three buffers followed by the array size
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    return 1;
  }

#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif

  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  // check before waiting: the event is only valid when the enqueue succeeded
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    exit(-1);
  }
  clWaitForEvents(1, &kernel_execution);

#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  {
    cl_ulong startTime = 0;
    cl_ulong endTime = 0;
    double exec_time;

    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_START,
			    sizeof(cl_ulong),
			    &startTime,
			    0);
    clGetEventProfilingInfo(kernel_execution,
			    CL_PROFILING_COMMAND_END,
			    sizeof(cl_ulong),
			    &endTime,
			    0);
    // profiling counters are in nanoseconds
    exec_time = ((double)endTime - (double)startTime)*1.0e-9;
    printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	   t_end-t_begin, exec_time);
    printf("\tNumber of operations %d, performance %lf GFlops\n",
	   array_size, array_size / exec_time * 1e-9);
    // 3.0 forces double arithmetic so 3*array_size cannot overflow int
    printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	   3.0*array_size*sizeof(float)*1e-9,
	   3.0*array_size*sizeof(float)/exec_time*1e-9);
  }
#endif

  return(0);
}




    
/*     // create the fourth kernel for GPU */
/*     kernel_index = 4;  */
/*     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, */
/* 				    // kernel name:         , file: */
/* 				    "vecadd_4_opt_blocks_vect_kernel", "vecadd_4_opt_blocks_vect.cl", monitor); */

==== opencl_vecadd/vecadd_2_blocks.cl ====

/*
 * Block-wise vector addition: result[i] = a[i] + b[i].
 *
 * Work-item g handles the contiguous index range
 * [g * size_per_thread, (g + 1) * size_per_thread), clipped to size, so
 * work-items whose block lies past the end of the arrays do nothing.
 */
__kernel void vecadd_2_blocks_kernel(__global const float *a,
				     __global const float *b,
				     __global float *result,
		                     const int size,
				     const int size_per_thread)
{
  const int work_item = get_global_id(0);
  const int first = work_item * size_per_thread;
  const int block_end = (work_item + 1) * size_per_thread;
  /* never run past the end of the arrays */
  const int last = (block_end < size) ? block_end : size;

  for (int idx = first; idx < last; ++idx) {
    result[idx] = a[idx] + b[idx];
  }
}
===== Zajęcia 7 =====
{{:studia:magisterskie:1sem:zajecia_7.zip|}}

==== tmp/opencl_mat_transp/mat_transp_1.cl ====
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void mat_transp_1_kernel(
  __global float *A , 
  __global float *B ,
  __global float *C ,
  int N)
{

int id = get_global_id(0) * get_global_size(0) + get_global_id(1);
float temp = 0.0;

for(int i = 0; i

==== tmp/opencl_mat_transp/mat_transp/mat_transp.c ====
#include
#include
#include 

#include 

#include"uth_ocl_intf.h"

#define time_measurments

#ifdef time_measurments
#include"uth_time_intf.h"
  static double t_begin, t_end, t_total;
#endif



#define BLOCK_SIZE 16
#define NR_GROUPS 16
#define MULT 16
#define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT)
#define ROZMIAR (WYMIAR*WYMIAR)
// Matrices are stored in row-major order: 
// M(row, col) = M( row * WYMIAR + col ) 

/*
 * verify_result - compare a computed vector with a reference vector.
 *
 * The first WYMIAR elements of both arrays are compared.
 *
 * result         - values to check
 * result_compare - reference values
 *
 * Returns 1 when no pair of elements differs by more than 1e-6.
 * Otherwise prints up to 100 elements that differ by more than 1e-9 and
 * terminates the program with exit(0); in that case the function does not
 * return.
 * NOTE(review): exit status 0 on a failed verification looks unintended,
 * but it is kept so that existing run scripts behave as before.
 */
int verify_result(
  float* result,
  float* result_compare
		  )
{
  int result_OK = 1;
  int i, j;

  // Pass 1: stop at the first element outside the 1e-6 tolerance.
  for (i = 0; i < WYMIAR; i++) {
    if (fabs(result[i] - result_compare[i]) > 1.e-6) {
      result_OK = 0;
      break;
    }
  }

  printf("\t\t6. verifying results: ");
  if (result_OK) {
    printf("Output is correct\n");
  } else {
    printf("Output is incorrect\n");
    // Pass 2: list at most 100 differing elements. Both limits must hold
    // ("&&"); a comma here would drop the bound on i and let the loop run
    // past the end of the arrays.
    j = 0;
    for (i = 0; i < WYMIAR && j < 100; i++) {
      if (fabs(result[i] - result_compare[i]) > 1.e-9) {
	j++;
	printf("%d %16.8f %16.8f\n",
	       i, result[i], result_compare[i]);
      }
    }
    exit(0);
  }

  return(result_OK);
}

  /*----------------KERNEL CREATION PHASE----------------------*/
/*
 * create_kernels - build the OpenCL kernels used by this program.
 *
 * Works on the platform selected in utv_ocl_struct.current_platform_index.
 * When a GPU context exists, kernel 0 ("mat_transp_1_kernel") and kernel 1
 * ("mat_transp_2_kernel") are created for it. CPU kernel creation is
 * currently disabled; only its (empty) timing section remains.
 * With time_measurments defined, creation times are printed to stdout.
 */
void create_kernels()
{

  // for all operations indicate explicit info messages
  int monitor = UTC_BASIC_INFO + 1;

  // Initialised because the CPU branch prints it although the assignment
  // that used to precede the print is disabled.
  int kernel_index = 0;

  int platform_index = utv_ocl_struct.current_platform_index;

  if(utr_ocl_CPU_context_exists(platform_index)){

#ifdef time_measurments
    t_begin = time_clock();
#endif

    // CPU kernel creation is disabled; the call it would make is:
    /* kernel_index = 0;  */
    /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, */
    /* 				  "mat_transp_1_kernel", "mat_transp_1.cl", monitor); */

#ifdef time_measurments
    t_end = time_clock();
    printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif

  }

  if(utr_ocl_GPU_context_exists(platform_index)){

#ifdef time_measurments
    t_begin = time_clock();
#endif
    // create the first kernel for GPU
    kernel_index = 0;
    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "mat_transp_1_kernel", "mat_transp_1.cl", monitor);

#ifdef time_measurments
    t_end = time_clock();
    // This section times a GPU kernel, so the message says GPU.
    printf("EXECUTION TIME: creating GPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif

    // create the second kernel for GPU
    kernel_index = 1;
    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "mat_transp_2_kernel", "mat_transp_2.cl", monitor);

  }

}

  /*----------------EXECUTION PHASE----------------------*/
int execute_kernels()
{

  // for all operations indicate explicit info messages
  int monitor = UTC_BASIC_INFO + 1;

  int platform_index = utv_ocl_struct.current_platform_index;
  utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index];

  int kernel_index;
  int i,j,n;

  double nr_access;
  double t1,t2;

  if(monitor>UTC_BASIC_INFO){
    printf("\n------------Starting execution phase----------------\n");
  }
  
  // create matrices
  n=WYMIAR;
  float* A = (float *) malloc(ROZMIAR*sizeof(float));
  float* B = (float *) malloc(WYMIAR*sizeof(float));
  float* C = (float *) malloc(WYMIAR*sizeof(float));
  float* D = (float *) malloc(WYMIAR*sizeof(float));
  
  for(i=0;i0 && device_type==utr_ocl_device_type(platform_index, device_index-1)) break; 
    if(device_type == UTC_OCL_DEVICE_CPU) break;

    // choose the context
    cl_context context = utr_ocl_select_context(platform_index, device_index);  
    
    // choose the command queue
    cl_command_queue command_queue = 
      utr_ocl_select_command_queue(platform_index, device_index);  
    
    if(monitor>UTC_BASIC_INFO){
      printf("\nExecution: \t0. restoring context and command queue for platform %d and device %d\n",
	     platform_index, device_index);
    }
    
    if(context == NULL || command_queue == NULL){ 
      
      printf("failed to restore context and command queue for platform %d, device %d\n", 
	     platform_index, device_index);
      printf("%lu %lu\n", context, command_queue);
    }
    
    // choose the kernel
    kernel_index = 0;
    cl_kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
    
    if(monitor>UTC_BASIC_INFO){
      printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	     kernel_index, platform_index, device_index);
    }
    
    if(context == NULL || command_queue == NULL || kernel == NULL){ 
      
      printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	     platform_index, device_index, kernel_index);
      printf("context %lu, command queue %lu, kernel %lu\n", 
	     context, command_queue, kernel);
    }
    
    for(i=0;i
==== Kod wogu ====
=== server.c ===

/*
    C socket server example, handles multiple clients using threads
    Compile
    gcc server.c -lpthread -o server
*/
 
#include
#include    //strlen
#include    //strlen
#include
#include //inet_addr
#include    //write
#include //for threading , link with lpthread
#include 
 
//the thread function
void *connection_handler(void *);
 
/*
 * Combined client/server: the process forks into two roles.
 *
 *  - child:  tries to connect to 192.168.102.0 .. 192.168.102.254 on port
 *            65001; to every host that accepts it sends "Werset 6-client"
 *            and prints the reply.
 *  - parent: listens on port 65001 and starts one connection_handler thread
 *            per accepted connection.
 *
 * Returns 0 when the child finishes its scan, 1 on a fatal error
 * (fork, socket, bind, listen, accept, send or thread creation failure).
 */
int main(int argc , char *argv[])
{
    (void)argc;
    (void)argv;

    pid_t pid = fork();

    if (pid == 0)
    {
        struct sockaddr_in server;
        char server_reply[2000];
        char address[32];   /* large enough for "192.168.102.NNN" */
        int counter;

        for ( counter = 0; counter < 255; ++counter ) {

            int sock = socket(AF_INET , SOCK_STREAM , 0);
            if (sock == -1)
            {
                perror("Could not create socket");
                continue;
            }
            puts("Socket created");

            /* Actually format the address; measuring its length alone
               leaves the buffer contents undefined. */
            snprintf(address, sizeof address, "192.168.102.%d", counter);

            memset(&server, 0, sizeof server);
            server.sin_addr.s_addr = inet_addr(address);
            server.sin_family = AF_INET;
            server.sin_port = htons( 65001 );

            //Connect to remote server
            if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
            {
                perror("connect failed. Error");
                close(sock);    /* do not leak one descriptor per host */
                continue;
            }

            puts("Connected\n");

            //Send some data
            if( send(sock , "Werset 6-client" , 15 , 0) < 0)
            {
                puts("Send failed");
                close(sock);
                return 1;
            }

            //Receive a reply; keep one byte free for the terminator
            ssize_t received = recv(sock , server_reply , sizeof server_reply - 1 , 0);
            if (received < 0)
            {
                puts("recv failed");
            }
            else
            {
                server_reply[received] = '\0';
                puts(server_reply);
            }

            close(sock);
        }
    }
    else if (pid > 0)
    {
        int socket_desc , client_sock;
        struct sockaddr_in server , client;
        socklen_t client_len;
        pthread_t thread_id;

        //Create socket
        socket_desc = socket(AF_INET , SOCK_STREAM , 0);
        if (socket_desc == -1)
        {
            perror("Could not create socket");
            return 1;
        }
        puts("Socket created");

        //Prepare the sockaddr_in structure
        memset(&server, 0, sizeof server);
        server.sin_family = AF_INET;
        server.sin_addr.s_addr = INADDR_ANY;
        server.sin_port = htons( 65001 );

        //Bind
        if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
        {
            perror("bind failed. Error");
            close(socket_desc);
            return 1;
        }
        puts("bind done");

        //Listen
        if (listen(socket_desc , 3) < 0)
        {
            perror("listen failed");
            close(socket_desc);
            return 1;
        }

        puts("Waiting for incoming connections...");

        for (;;)
        {
            client_len = sizeof client;   /* value-result argument: reset every call */
            client_sock = accept(socket_desc, (struct sockaddr *)&client, &client_len);
            if (client_sock < 0)
            {
                /* accept reports failure with -1, which is non-zero, so it
                   must be tested explicitly */
                break;
            }
            puts("Connection accepted");

            /* NOTE(review): every handler thread receives the address of
               this one client_sock variable; a connection accepted before
               the previous thread has read it would overwrite the
               descriptor. Consider passing a per-connection copy. */
            int rc = pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock);
            if (rc != 0)
            {
                /* pthread_create returns an error number; it does not set errno */
                fprintf(stderr, "could not create thread: %s\n", strerror(rc));
                close(client_sock);
                close(socket_desc);
                return 1;
            }

            /* Nobody joins the handlers, so let them release their
               resources on exit. */
            pthread_detach(thread_id);
            puts("Handler assigned");
        }

        perror("accept failed");
        close(socket_desc);
        return 1;
    }
    else
    {
        // fork failed
        printf("fork() failed!\n");
        return 1;
    }

    return 0;
}
 
/*
 * Thread body that serves one connected client.
 *
 * socket_desc points to an int holding the connected socket's descriptor.
 * Every chunk received from the client is printed to stdout and answered
 * with the fixed 8-byte reply "Werset 6". When the client disconnects or an
 * I/O error occurs the socket is closed.
 *
 * Always returns NULL.
 */
void *connection_handler(void *socket_desc)
{
    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    ssize_t read_size;
    char client_message[2000];

    //Receive at most size-1 bytes so the terminator always fits
    while( (read_size = recv(sock , client_message , sizeof client_message - 1 , 0)) > 0 )
    {
        //Terminate first, then print: the buffer is not a string before this
        client_message[read_size] = '\0';
        printf("%s\n", client_message);

        //Send the fixed reply to the client
        if (write(sock , "Werset 6" , 8) < 0)
        {
            perror("write failed");
            break;
        }
    }

    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }

    //This thread is the only user of the connection
    close(sock);

    return NULL;
}
=== Makefile ===
# C compiler (also used as the linker driver)
CC = gcc

# compiler flags: debug information
CFLAGS = -g

# extra linker flags (none by default; may be set on the command line)
LFLAGS =

# "all" and "clean" do not name files
.PHONY: all clean

all: server client

server: server.o
	$(CC) $(LFLAGS) server.o -lpthread -o server

server.o: server.c
	$(CC) $(CFLAGS) -c server.c -o server.o

client: client.o
	$(CC) $(LFLAGS) client.o -o client

client.o: client.c
	$(CC) $(CFLAGS) -c client.c -o client.o

clean:
	rm -f *.o
===== Zajęcia 9 =====
/*
    C socket server example, handles multiple clients using threads
    Compile
    gcc server.c -lpthread -o server
*/
 
#include
#include    //strlen
#include    //strlen
#include
#include //inet_addr
#include    //write
#include //for threading , link with lpthread
 
//the thread function
void *connection_handler(void *);
 
/*
 * Threaded TCP server: listens on port 8888 (all interfaces) and starts one
 * connection_handler thread per accepted connection.
 *
 * The accept loop only ends on an error, so the function returns 1 in every
 * case: socket, bind, listen, accept or thread creation failure.
 */
int main(int argc , char *argv[])
{
    (void)argc;
    (void)argv;

    int socket_desc , client_sock;
    struct sockaddr_in server , client;
    socklen_t client_len;
    pthread_t thread_id;

    //Create socket
    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
    if (socket_desc == -1)
    {
        perror("Could not create socket");
        return 1;
    }
    puts("Socket created");

    //Prepare the sockaddr_in structure
    memset(&server, 0, sizeof server);
    server.sin_family = AF_INET;
    server.sin_addr.s_addr = INADDR_ANY;
    server.sin_port = htons( 8888 );

    //Bind
    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
    {
        perror("bind failed. Error");
        close(socket_desc);
        return 1;
    }
    puts("bind done");

    //Listen
    if (listen(socket_desc , 3) < 0)
    {
        perror("listen failed");
        close(socket_desc);
        return 1;
    }

    puts("Waiting for incoming connections...");

    for (;;)
    {
        client_len = sizeof client;   /* value-result argument: reset every call */
        client_sock = accept(socket_desc, (struct sockaddr *)&client, &client_len);
        if (client_sock < 0)
        {
            /* accept reports failure with -1, which is non-zero, so it must
               be tested explicitly */
            break;
        }
        puts("Connection accepted");

        /* NOTE(review): every handler thread receives the address of this
           one client_sock variable; a connection accepted before the
           previous thread has read it would overwrite the descriptor.
           Consider passing a per-connection copy. */
        int rc = pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock);
        if (rc != 0)
        {
            /* pthread_create returns an error number; it does not set errno */
            fprintf(stderr, "could not create thread: %s\n", strerror(rc));
            close(client_sock);
            close(socket_desc);
            return 1;
        }

        /* Nobody joins the handlers, so let them release their resources
           on exit. */
        pthread_detach(thread_id);
        puts("Handler assigned");
    }

    perror("accept failed");
    close(socket_desc);
    return 1;
}
 
/*
 * Thread body that serves one connected client.
 *
 * socket_desc points to an int holding the connected socket's descriptor.
 * Every chunk received from the client is answered with the complete fixed
 * reply "Werset 6". When the client disconnects or an I/O error occurs the
 * socket is closed.
 *
 * Always returns NULL.
 */
void *connection_handler(void *socket_desc)
{
    static const char reply[] = "Werset 6";

    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    ssize_t read_size;
    char client_message[2000];

    //Receive at most size-1 bytes so the terminator always fits
    while( (read_size = recv(sock , client_message , sizeof client_message - 1 , 0)) > 0 )
    {
        //end of string marker
        client_message[read_size] = '\0';

        //The reply length is that of the reply itself, not of the client's
        //message: the latter truncates the answer or reads past the literal
        if (write(sock , reply , sizeof reply - 1) < 0)
        {
            perror("write failed");
            break;
        }
    }

    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }

    //This thread is the only user of the connection
    close(sock);

    return NULL;
}

==== Client C ====

#include //printf
#include    //strlen
#include    //socket
#include //inet_addr
 
/*
 * Interactive TCP client: connects to 127.0.0.1:8888, then repeatedly reads
 * one whitespace-delimited word from stdin, sends it and prints the server's
 * reply.
 *
 * The loop ends on end of input, when the server closes the connection, or
 * when recv fails. Returns 0 in those cases and 1 when the socket cannot be
 * created, the connection cannot be established, or sending fails.
 */
int main(int argc , char *argv[])
{
    (void)argc;
    (void)argv;

    int sock;
    struct sockaddr_in server;
    char message[1000] , server_reply[2000];

    //Create socket
    sock = socket(AF_INET , SOCK_STREAM , 0);
    if (sock == -1)
    {
        perror("Could not create socket");
        return 1;
    }
    puts("Socket created");

    memset(&server, 0, sizeof server);
    server.sin_addr.s_addr = inet_addr("127.0.0.1");
    server.sin_family = AF_INET;
    server.sin_port = htons( 8888 );

    //Connect to remote server
    if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
    {
        perror("connect failed. Error");
        close(sock);
        return 1;
    }

    puts("Connected\n");

    //keep communicating with server
    while(1)
    {
        printf("Enter message : ");
        fflush(stdout);

        //Field width 999 = sizeof message - 1, so a long word cannot
        //overflow the buffer; stop on end of input instead of resending
        if (scanf("%999s" , message) != 1)
        {
            break;
        }

        //Send some data
        if( send(sock , message , strlen(message) , 0) < 0)
        {
            puts("Send failed");
            close(sock);
            return 1;
        }

        //Receive a reply; keep one byte free for the terminator
        ssize_t received = recv(sock , server_reply , sizeof server_reply - 1 , 0);
        if (received < 0)
        {
            puts("recv failed");
            break;
        }
        if (received == 0)
        {
            //orderly shutdown by the server: nothing more will arrive
            puts("Server closed the connection");
            break;
        }
        server_reply[received] = '\0';

        puts("Server reply :");
        puts(server_reply);
    }

    close(sock);
    return 0;
}


==== Kod Maciek Gonzo itp ====

#include
#include    //strlen
#include    //strlen
#include
#include //inet_addr
#include    //write
#include //for threading , link with lpthread
 
//the thread function
void *connection_handler(void *);

/*
 * Thread body of the interactive client.
 *
 * Connects to 127.0.0.1:65000, then repeatedly reads one whitespace-delimited
 * word from stdin, sends it and prints the server's reply. The loop ends on
 * end of input, on a send/recv failure, or when the server closes the
 * connection.
 *
 * socket_desc is unused. Always returns NULL.
 */
void *client_function(void *socket_desc){
    (void)socket_desc;

    int sock;
    struct sockaddr_in server_id;
    char message[1000] , server_reply[2000];

    //Create socket
    sock = socket(AF_INET , SOCK_STREAM , 0);
    if (sock == -1)
    {
        perror("Could not create socket");
        return NULL;
    }
    puts("Socket created");

    // inet_addr only parses dotted-decimal notation; a host name such as
    // "localhost" yields INADDR_NONE, so the loopback address is spelled out
    memset(&server_id, 0, sizeof server_id);
    server_id.sin_addr.s_addr = inet_addr("127.0.0.1");
    server_id.sin_family = AF_INET;
    server_id.sin_port = htons( 65000 );

    //Connect to remote server
    if (connect(sock , (struct sockaddr *)&server_id , sizeof(server_id)) < 0)
    {
        perror("connect failed. Error");
        close(sock);
        return NULL;
    }

    puts("Connected\n");

    //keep communicating with server
    while(1)
    {
        printf("Enter message : ");
        fflush(stdout);

        //Field width 999 = sizeof message - 1; stop on end of input
        if (scanf("%999s" , message) != 1)
        {
            break;
        }

        //Send some data
        if( send(sock , message , strlen(message) , 0) < 0)
        {
            puts("Send failed");
            break;
        }

        //Receive a reply; keep one byte free for the terminator
        ssize_t received = recv(sock , server_reply , sizeof server_reply - 1 , 0);
        if (received < 0)
        {
            puts("recv failed");
            break;
        }
        if (received == 0)
        {
            //orderly shutdown by the server: nothing more will arrive
            puts("Server closed the connection");
            break;
        }
        server_reply[received] = '\0';

        puts("Server reply :");
        puts(server_reply);
    }

    close(sock);
    return NULL;
}
 

/*
 * Thread body of the server.
 *
 * Listens on port 65000 (all interfaces) and starts one connection_handler
 * thread per accepted connection. The accept loop only ends on an error.
 *
 * sock is unused. Always returns NULL.
 */
void *server_function(void * sock){
    (void)sock;

    printf("SERVER Start");
    fflush(stdout);

    int socket_desc , client_sock;
    socklen_t client_len;
    pthread_t thread_id;
    struct sockaddr_in server , client;

    //Create socket
    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
    if (socket_desc == -1)
    {
        perror("Could not create socket");
        return NULL;
    }
    puts("Socket created");

    //Prepare the sockaddr_in structure
    memset(&server, 0, sizeof server);
    server.sin_family = AF_INET;
    server.sin_addr.s_addr = INADDR_ANY;
    server.sin_port = htons( 65000);

    //Bind
    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
    {
        perror("bind failed. Error");
        close(socket_desc);
        return NULL;
    }
    puts("bind done");

    //Listen
    if (listen(socket_desc , 3) < 0)
    {
        perror("listen failed");
        close(socket_desc);
        return NULL;
    }

    puts("Waiting for incoming connections...");
    fflush(stdout);

    for (;;)
    {
        client_len = sizeof client;   /* value-result argument: reset every call */
        client_sock = accept(socket_desc, (struct sockaddr *)&client, &client_len);
        if (client_sock < 0)
        {
            /* accept reports failure with -1, which is non-zero, so it must
               be tested explicitly */
            perror("accept failed");
            break;
        }
        puts("Connection accepted");

        /* NOTE(review): every handler thread receives the address of this
           one client_sock variable; a connection accepted before the
           previous thread has read it would overwrite the descriptor.
           Consider passing a per-connection copy. */
        int rc = pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock);
        if (rc != 0)
        {
            /* pthread_create returns an error number; it does not set errno */
            fprintf(stderr, "could not create thread: %s\n", strerror(rc));
            close(client_sock);
            break;
        }

        /* Nobody joins the handlers, so let them release their resources
           on exit. */
        pthread_detach(thread_id);
        puts("Handler assigned");
    }

    close(socket_desc);
    return NULL;
}


/*
 * Starts the server and the client, each in its own thread, and waits for
 * both to finish.
 *
 * Returning from main ends the whole process, so without the joins the
 * program would exit before either thread had done any work.
 *
 * Returns 0 after both threads have ended, 1 when a thread cannot be created.
 * NOTE(review): nothing orders the client's connect after the server's
 * listen, so the first connection attempt may fail - confirm whether that
 * matters here.
 */
int main(int argc , char *argv[])
{
    (void)argc;
    (void)argv;

    pthread_t server_thread;
    pthread_t client_thread;
    int rc;

    //Create server
    rc = pthread_create( &server_thread , NULL ,  server_function , NULL);
    if (rc != 0)
    {
        /* pthread_create returns an error number; it does not set errno */
        fprintf(stderr, "could not create server thread: %s\n", strerror(rc));
        return 1;
    }

    // Create client
    rc = pthread_create( &client_thread , NULL ,  client_function , NULL);
    if (rc != 0)
    {
        fprintf(stderr, "could not create client thread: %s\n", strerror(rc));
        return 1;
    }

    //Keep the process alive while the threads are running
    pthread_join(client_thread, NULL);
    pthread_join(server_thread, NULL);

    return 0;
}
 
/*
 * Thread body that serves one connected client.
 *
 * socket_desc points to an int holding the connected socket's descriptor.
 * Every chunk received from the client is answered with the complete fixed
 * reply "Werset 5". When the client disconnects or an I/O error occurs the
 * socket is closed.
 *
 * Always returns NULL.
 */
void *connection_handler(void *socket_desc)
{
    static const char reply[] = "Werset 5";

    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    ssize_t read_size;
    char client_message[2000];

    //Receive at most size-1 bytes so the terminator always fits
    while( (read_size = recv(sock , client_message , sizeof client_message - 1 , 0)) > 0 )
    {
        //end of string marker
        client_message[read_size] = '\0';

        //The reply length is that of the reply itself, not of the client's
        //message: the latter truncates the answer or reads past the literal
        if (write(sock , reply , sizeof reply - 1) < 0)
        {
            perror("write failed");
            break;
        }
    }

    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }

    //This thread is the only user of the connection
    close(sock);

    return NULL;
}