Spis treści

Zajęcia 1

Makefile

# kompilator c
CCOMP = mpicc
 
# konsolidator
LINK = mpicc
 
MPIRUN = /usr/lib64/openmpi/bin/mpiexec 
 
# opcje optymalizacji:
# wersja do debugowania
# OPT = -g -DDEBUG -p
# wersja zoptymalizowana do mierzenia czasu
# OPT = -O3 -fopenmp -p
 
# pliki naglowkowe
#INC = -I../pomiar_czasu
 
# biblioteki
#LIB = -L../pomiar_czasu -lm
LIB = -lm
 
# zaleznosci i komendy
 
heat: heat.o
	$(LINK) $(OPT) heat.o -o heat $(LIB)
 
heat.o: heat.c
	$(CCOMP) -c $(OPT) heat.c $(INC)
 
run:
	$(MPIRUN) -np 8 ./heat
 
clean:
	rm -f *.o

heat.c

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
 
# include "mpi.h"
 
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
 
/******************************************************************************/
 
int main ( int argc, char *argv[] )
{
  double a = 0.0; // lewy brzeg przedzialu
  double b = 1.0; // prawy brzeg przedzialu
  int i;  
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  double x_max;
  double x_min;
 
  MPI_Init ( &argc, &argv );
 
  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
 
  MPI_Comm_size ( MPI_COMM_WORLD, &p );
 
  n = 12; // liczba punktow dla kazdego wezla 
  i = 0;  // poczatkowa chwila czasu
 
  // wspolrzedna lewego punktu dla wezla id
  x_min = ( ( double )( p * n + 1 - id * n - i ) * a   
          + ( double )(             id * n + i ) * b ) 
          / ( double ) ( p * n + 1              );
 
  i = n + 1;
 
  // wspolrzedna prawego punktu dla wezla id
  x_max = ( ( double )( p * n + 1 - id * n - i ) * a   
          + ( double )(             id * n + i ) * b ) 
          / ( double )( p * n + 1              );
 
  heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
 
  MPI_Finalize ( );
}
 
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int ierr;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;
 
  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
 
  k = 0.002 / ( double ) p; // przewodniosc cieplna
 
  j_min = 0; // indeksy krokow czasowych - min i max
  j_max = 100;
  t_min = 0.0; // chwile czasu - min i max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
 
  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max   
           + ( double ) ( n + 1 - i ) * x_min ) 
           / ( double ) ( n + 1     );
  }
 
  // ustawienie warunku poczatkowego
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }
 
  // sprawdzenie stabilnosci schematu
  cfl = k * t_del / x_del / x_del;
 
  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }
 
  wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
 
  for ( j = 1; j <= j_max; j++ )
  {
 
    // wymiana informacji z wezlami sasiednimi
    tag = 1;
 
    if ( id < p - 1 )
    {
      MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
    }
 
    if ( 0 < id )
    {
      MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
    }
 
    tag = 2;
 
    if ( 0 < id )
    {
      // DO UZUPELNIENIA
	MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
    }
 
    if ( id < p - 1 )
    {
      // DO UZUPELNIENIA   
	MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );  
    }
 
 
    // implementacja wzoru roznicowego
    for ( i = 1; i <= n; i++ )
    {
      h_new[i] = h[i] + t_del * ( 
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del 
        + 2.0 * sin ( x[i] * t ) );
    }
 
    // nowa chwila czasu
    t = ( ( double ) (         j - j_min ) * t_max   
            + ( double ) ( j_max - j         ) * t_min ) 
            / ( double ) ( j_max     - j_min );
 
    // przygotowanie do nastepnego kroku czasowego
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;
 
  }
 
  // koncowa wymiana informacji z wezlami sasiednimi
  tag = 11;
 
  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }
 
  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }
 
  tag = 12;
 
  if ( 0 < id ) {
    // DO UZUPELNIENIA
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }
 
  if ( id < p - 1 ) {
    // DO UZUPELNIENIA
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );      
  }
 
  wtime = MPI_Wtime ( ) - wtime;
 
  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }
 
  // wydruk wyniku
  printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" );
 
  free ( h );
  free ( h_new );
  free ( x );
 
  return;
}

Zajęcia 2

heat.c

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
 
# include "mpi.h"
 
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
 
/******************************************************************************/
 
int main ( int argc, char *argv[] )
{
  double a = 0.0; // lewy brzeg przedzialu
  double b = 1000000.0; // prawy brzeg przedzialu
  int i;  
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  double x_max;
  double x_min;
 
  MPI_Init ( &argc, &argv );
 
  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
 
  MPI_Comm_size ( MPI_COMM_WORLD, &p );
 
  n = 1000000; // liczba punktow dla kazdego wezla 
  i = 0;  // poczatkowa chwila czasu
 
  // wspolrzedna lewego punktu dla wezla id
  x_min = ( ( double )( p * n + 1 - id * n - i ) * a   
          + ( double )(             id * n + i ) * b ) 
          / ( double ) ( p * n + 1              );
 
  i = n + 1;
 
  // wspolrzedna prawego punktu dla wezla id
  x_max = ( ( double )( p * n + 1 - id * n - i ) * a   
          + ( double )(             id * n + i ) * b ) 
          / ( double )( p * n + 1              );
 
  heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
 
  MPI_Finalize ( );
 
 
}
 
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int ierr;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;
 
  MPI_Request req1, req2, req3, req4;
  MPI_Status stat1, stat2, stat3, stat4;
 
  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
 
  k = 0.002 / ( double ) p; // przewodniosc cieplna
 
  j_min = 0; // indeksy krokow czasowych - min i max
  j_max = 100;
  t_min = 0.0; // chwile czasu - min i max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
 
  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max   
           + ( double ) ( n + 1 - i ) * x_min ) 
           / ( double ) ( n + 1     );
  }
 
  // ustawienie warunku poczatkowego
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }
 
  // sprawdzenie stabilnosci schematu
  cfl = k * t_del / x_del / x_del;
 
  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }
 
  wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
 
  for ( j = 1; j <= j_max; j++ )
  {
 
    // wymiana informacji z wezlami sasiednimi
 
    ///NON BLOCK
    if ( id > 0 )
    {
        MPI_Irecv ( &h[0], 1, MPI_DOUBLE, id-1, 1, MPI_COMM_WORLD, &req1 );
    }
    if ( id < p - 1 )
    { 
	MPI_Irecv ( &h[n+1], 1, MPI_DOUBLE, id+1, 2, MPI_COMM_WORLD, &req2 );
    }
    ///END NON BLOCK
 
    if ( id > 0 )
    {
	MPI_Isend ( &h[1], 1, MPI_DOUBLE, id-1, 2, MPI_COMM_WORLD, &req3 );
    }
    if ( id < p - 1 )
    {
        MPI_Isend ( &h[n], 1, MPI_DOUBLE, id+1, 1, MPI_COMM_WORLD, &req4 );
    }
 
 
 
    // implementacja wzoru roznicowego
    for ( i = 2; i <= n-1; i++ )
    {
      h_new[i] = h[i] + t_del * ( 
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del 
        + 2.0 * sin ( x[i] * t ) );
    }
 
 
    ///NON BLOCK 
    ///WAIT FOR RECEIVE
    if ( id > 0 ) 
    {
      MPI_Wait(&req1, &stat1); 
    }
    if ( id < p - 1 ) 
    {
      MPI_Wait(&req2, &stat2); 
    }
 
    int tmp = 1;
    h_new[tmp] = h[tmp] + t_del * ( 
        k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del 
        + 2.0 * sin ( x[tmp] * t ) );
    tmp = n;
    h_new[tmp] = h[tmp] + t_del * ( 
        k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del 
        + 2.0 * sin ( x[tmp] * t ) );
 
 
    ///WAIT FOR SEND
    if ( id > 0 ) 
    {
      MPI_Wait(&req3, &stat3); 
    }
    if ( id < p - 1 ) 
    {
      MPI_Wait(&req4, &stat4); 
    }
 
    ///END NON BLOCK
 
 
 
    // nowa chwila czasu
    t = ( ( double ) (         j - j_min ) * t_max   
            + ( double ) ( j_max - j         ) * t_min ) 
            / ( double ) ( j_max     - j_min );
 
    // przygotowanie do nastepnego kroku czasowego
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;
 
  }
 
  // koncowa wymiana informacji z wezlami sasiednimi
  tag = 11;
 
  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }
 
  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }
 
  tag = 12;
 
  if ( 0 < id ) {
    // DO UZUPELNIENIA
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }
 
  if ( id < p - 1 ) {
    // DO UZUPELNIENIA
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );      
  }
 
  wtime = MPI_Wtime ( ) - wtime;
 
  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }
 
  // wydruk wyniku
  /*printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" ); */
 
  free ( h );
  free ( h_new );
  free ( x );
 
  return;
}

Zajęcia 3

child.c

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
 
# include "mpi.h"
 
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
 
/******************************************************************************/
 
int main ( int argc, char *argv[] )
{
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  double x_max;
  double x_min;
  MPI_Comm parentcomm; 
  MPI_Status status;
 
  MPI_Init ( &argc, &argv );
 
  MPI_Comm_rank ( MPI_COMM_WORLD, &id );
 
  MPI_Comm_size ( MPI_COMM_WORLD, &p );
 
  MPI_Comm_get_parent(&parentcomm);
 
  MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status );
 
  MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status );
  MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &status );
 
  //MPI_Barrier(MPI_COMM_WORLD);
 
  heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
 
  MPI_Finalize ( );
 
  return 0;
}
 
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
  double cfl;
  double *h;
  double *h_new;
  int i;
  int ierr;
  int j;
  int j_max;
  int j_min;
  double k;
  MPI_Status status;
  double t;
  double t_del;
  double t_max;
  double t_min;
  int tag;
  double wtime;
  double *x;
  double x_del;
 
  h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
  h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
  x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
 
  k = 0.002 / ( double ) p; // przewodniosc cieplna
 
  j_min = 0; // indeksy krokow czasowych - min i max
  j_max = 100;
  t_min = 0.0; // chwile czasu - min i max
  t_max = 10.0;
  t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
 
  x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
  for ( i = 0; i <= n + 1; i++ )
  {
    x[i] = ( ( double ) (         i ) * x_max  
           + ( double ) ( n + 1 - i ) * x_min )
           / ( double ) ( n + 1     );
  }
 
  // ustawienie warunku poczatkowego
  for ( i = 0; i <= n + 1; i++ )
  {
    h[i] = 95.0;
  }
 
  // sprawdzenie stabilnosci schematu
  cfl = k * t_del / x_del / x_del;
 
  if ( 0.5 <= cfl )
  {
    printf ( "  CFL condition failed.\n" );
    exit ( 1 );
  }
 
  wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
 
  for ( j = 1; j <= j_max; j++ )
  {
 
    // wymiana informacji z wezlami sasiednimi
    tag = 1;
 
    if ( id < p - 1 )
    {
      MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
    }
 
    if ( 0 < id )
    {
      MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
    }
 
    tag = 2;
 
 if ( 0 < id ) {
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }
 
  if ( id < p - 1 ) {
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );     
  }
 
 
    // implementacja wzoru roznicowego
    for ( i = 1; i <= n; i++ )
    {
      h_new[i] = h[i] + t_del * (
        k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
        + 2.0 * sin ( x[i] * t ) );
    }
 
    // nowa chwila czasu
    t = ( ( double ) (         j - j_min ) * t_max  
            + ( double ) ( j_max - j         ) * t_min )
            / ( double ) ( j_max     - j_min );
 
    // przygotowanie do nastepnego kroku czasowego
    for ( i = 1; i < n + 1; i++ )
    {
      h[i] = h_new[i];
    }
    if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
    if ( id == p - 1 ) h[n+1] = 75;
 
  }
 
  // koncowa wymiana informacji z wezlami sasiednimi
  tag = 11;
 
  if ( id < p - 1 ) {
    MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
  }
 
  if ( 0 < id ) {
    MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
  }
 
  tag = 12;
 
  if ( 0 < id ) {
    // DO UZUPELNIENIA
    MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
  }
 
  if ( id < p - 1 ) {
    // DO UZUPELNIENIA
    MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );     
  }
 
  wtime = MPI_Wtime ( ) - wtime;
 
  if ( id == 0 )
  {
    printf ( "\n" );
    printf ( "  Wall clock elapsed seconds = %f\n", wtime );
  }
 
  // wydruk wyniku
  printf ( "%2d  T= %f\n", id, t );
  printf ( "%2d  X= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", x[i] );
  }
  printf ( "\n" );
  printf ( "%2d  H= ", id );
  for ( i = 0; i <= n + 1; i++ )
  {
    printf ( "%7.2f", h[i] );
  }
  printf ( "\n" );
 
  free ( h );
  free ( h_new );
  free ( x );
 
  return;
}

parent.c

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
 
# include "mpi.h"
 
int main ( int argc, char *argv[] );
 
/******************************************************************************/
 
int main ( int argc, char *argv[] )
{
  double a = 0.0; // lewy brzeg przedzialu
  double b = 1.0; // prawy brzeg przedzialu
  int i; 
  int id; // rank
  int n;  // liczba punktow dla kazdego wezla
  int p;  // size
  int np = 8; // liczba procesow
  double x_max;
  double x_min;
  MPI_Comm komunikator;
  int *errcodes;
 
  MPI_Init ( &argc, &argv );
 
  //MPI_Comm_rank ( MPI_COMM_WORLD, &id );
 
  //MPI_Comm_size ( MPI_COMM_WORLD, &p );
 
  MPI_Comm_spawn("dziecko", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &komunikator, errcodes);
 
  n = 12; // liczba punktow dla kazdego wezla
  int j;
  for(j=0;j<np;j++){
   i = 0;  // poczatkowa chwila czasu
    // wspolrzedna lewego punktu dla wezla id
    x_min = ( ( double )( np * n + 1 - j * n - i ) * a  
          + ( double )(             j * n + i ) * b )
          / ( double ) ( np * n + 1              );
 
    i = n + 1;
 
    // wspolrzedna prawego punktu dla wezla id
    x_max = ( ( double )( np * n + 1 - j * n - i ) * a  
          + ( double )(             j * n + i ) * b )
          / ( double )( np * n + 1              );
 
    MPI_Send ( &x_min, 1, MPI_DOUBLE, j, 1, komunikator );
    MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 2, komunikator );
    MPI_Send ( &n, 1, MPI_INT, j, 3, komunikator );
 
  }
 
  MPI_Finalize ( );
 
  return 0;
}
 
 
 
 
 
 
    struct rekord wys;
    MPI_Datatype rekord_typ;
    int tab_dlug_blokow[3] = {1, 1, 1};
    MPI_Datatype tab_typow[3] = {MPI_DOUBLE, MPI_DOUBLE, MPI_INT};
    MPI_Aint podstawa, tab_odstepow[3];
 
    MPI_Get_address(&wys.x_min, &tab_odstepow[0]);
    MPI_Get_address(&wys.x_max, &tab_odstepow[1]);
    MPI_Get_address(&wys.n, &tab_odstepow[2]);
 
    MPI_Type_struct(3, tab_dlug_blokow, tab_odstepow, tab_typow, &rekord_typ);
    MPI_Type_commit(&rekord_typ);
    MPI_Recv ( &wys, 1, rekord_typ, 0, 1, parentcomm, &status );

Zajęcia 4

wkrótce

Zajęcia 5

Makefile

# optimization and other system dependent options
#include  make.$(SRR_ARCH)
# or directly
include  make.lab_404_NVIDIA
 
NAME = Hello_GPU
 
program: main.o 
	$(CC) $(LDFL) main.o  $(LIB) -o $(NAME)
 
main.o: main.c 
	$(CC) $(CFL) -c main.c  $(INC) -o main.o
 
clean:
	rm -f obj/*
	rm -f $(NAME)

make.lab_404_NVIDIA

# C compiler
#CC = icc
CC = gcc
 
# C++ compiler
CPPC = icpc
#CPPC = g++
 
# Loader (to link C/C++ and Fortran libraries)
LD = icpc
#LD = g++
 
# Archiver
AR = ar r
 
# For removing files
RM = rm -f
 
# Include directories
INC = -I/opt/cuda7/include
 
# Standard and/or local libraries
LIB = -L/opt/cuda7/lib64 -lOpenCL 
 
# C optimization and other flags
#CFL(icc) = -O3 -openmp
#CFL(gcc) = -O3 -fopenmp
#CFL(icc,debug) = -g
#CFL(gcc) = -g
CFL =
 
# Loader optimization and other flags
#LDFL(debug) = -g 
#LDFL(profile) = -p                                
LDFL =

main.c

#include<stdlib.h>
#include<stdio.h>
 
#include <CL/cl.h>
 
// functions to display platform and device properties
void DisplayPlatformInfo(
			 cl_platform_id id, 
			 cl_platform_info name,
			 char* str)
{
  cl_int retval;
  size_t paramValueSize;
 
  retval = clGetPlatformInfo(
			     id,
			     name,
			     0,
			     NULL,
			     &paramValueSize);
  if (retval != CL_SUCCESS){
    printf("Failed to find OpenCL platform %s.\n", str);
    return;
  }
 
  char * info = (char *)malloc(sizeof(char) * paramValueSize);
  retval = clGetPlatformInfo(
			     id,
			     name,
			     paramValueSize,
			     info,
			     NULL);
  if (retval != CL_SUCCESS)  {
    printf("Failed to find OpenCL platform %s.\n", str);
    return;
  }
 
  printf("\t%s:\t%s\n", str, info );
  free(info); 
}
 
void DisplayDeviceInfo_char(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;
 
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  char * info = (char *)malloc(sizeof(char) * paramValueSize);
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
 
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  printf("\t\t%s:\t%s\n", str, info );
  free(info);
};
 
void DisplayDeviceInfo_ulong(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;// = sizeof(cl_ulong);
 
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  cl_ulong * info = (cl_ulong *)malloc(sizeof(cl_ulong) * paramValueSize);
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
 
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  printf("\t\t%s:\t%lu MB\n", str, info[0] / 1024 / 1024 );
  free(info);
};
 
void DisplayDeviceInfo_uint(
		       cl_device_id id, 
		       cl_device_info name,
		       char* str)
{
  cl_int retval;
  size_t paramValueSize;// = sizeof(cl_ulong);
 
  retval = clGetDeviceInfo(
			   id,
			   name,
			   0,
			   NULL,
			   &paramValueSize);
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  cl_uint * info = (cl_uint *)malloc(sizeof(cl_uint) * paramValueSize);
  retval = clGetDeviceInfo(
			   id,
			   name,
			   paramValueSize,
			   info,
			   NULL);
 
  if (retval != CL_SUCCESS) {
    printf("Failed to find OpenCL device info %s.\n", str);
    return;
  }
 
  printf("\t\t%s:\t%d MHz\n", str, info[0] );
  free(info);
};
 
 
// main program controlling execution of CPU code and OpenCL kernels
int main(int argc, char** argv)
{
  cl_uint number_of_contexts = 2;
  cl_context context = NULL;
  cl_context list_of_contexts[2] = {0,0};
  cl_command_queue commandQueue = 0;
  cl_program program = 0;
  cl_uint number_of_devices;
  cl_device_id device = 0;
  cl_device_id *list_of_devices;
  cl_device_type type;
  cl_kernel kernel = 0;
  cl_mem memObjects[3] = { 0, 0, 0 };
  cl_int retval;
  int icon, idev;
  cl_uint numPlatforms;
  cl_platform_id * platformIds;
  cl_uint i,j;
 
  // flag to control displaying
  int Monitor = 1;
 
 
  // Create OpenCL contexts 
 
  // First, query the total number of platforms
  retval = clGetPlatformIDs(0, (cl_platform_id *) NULL, &numPlatforms);
 
  // Next, allocate memory for the installed plaforms, and qeury 
  // to get the list.
  platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
 
  // Then, query the platform IDs
  retval = clGetPlatformIDs(numPlatforms, platformIds, NULL);
 
  if(Monitor>=0){
    printf("Number of platforms: \t%d\n", numPlatforms); 
  }
 
  // Iterate through the list of platforms displaying associated information
  for (i = 0; i < numPlatforms; i++) {
 
    if(Monitor>0){
 
      printf("Platform ID - %d\n",i);
      // First we display information associated with the platform
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_NAME, 
			"CL_PLATFORM_NAME");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_PROFILE, 
			"CL_PLATFORM_PROFILE");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_VERSION, 
			"CL_PLATFORM_VERSION");
      DisplayPlatformInfo(
			platformIds[i], 
			CL_PLATFORM_VENDOR, 
			"CL_PLATFORM_VENDOR");
    }
 
  }
 
  // For the first platform
  int iplat;
  printf("Select Platform ID: "); scanf("%d", &iplat);
 
  // Query the set of devices associated with the platform
  retval = clGetDeviceIDs(
			  platformIds[iplat],
			  CL_DEVICE_TYPE_ALL,
			  0,
			  NULL,
			  &number_of_devices);
 
 
  list_of_devices = 
    (cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices);
 
  retval = clGetDeviceIDs(
			  platformIds[iplat],
			  CL_DEVICE_TYPE_ALL,
			  number_of_devices,
			  list_of_devices,
			  NULL);
 
  if(Monitor>=0){
    printf("Number of devices: \t%d\n", number_of_devices); 
  }
 
  // Iterate through each device, displaying associated information
  for (j = 0; j < number_of_devices; j++) {
 
    clGetDeviceInfo(list_of_devices[j], CL_DEVICE_TYPE, 
		    sizeof(cl_device_type), &type, NULL);
 
    if(Monitor>0){
 
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_NAME, 
			"CL_DEVICE_NAME");
 
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_VENDOR, 
			"CL_DEVICE_VENDOR");
 
      DisplayDeviceInfo_char(
			list_of_devices[j], 
			CL_DEVICE_VERSION, 
			"CL_DEVICE_VERSION");
 
      DisplayDeviceInfo_ulong(
			list_of_devices[j], 
			CL_DEVICE_GLOBAL_MEM_SIZE, 
			"CL_DEVICE_GLOBAL_MEM_SIZE");
 
      DisplayDeviceInfo_uint(
			list_of_devices[j], 
			CL_DEVICE_MAX_CLOCK_FREQUENCY, 
			"CL_DEVICE_MAX_CLOCK_FREQUENCY");
      printf("\n");
    }
  }
 
  // Next, create OpenCL contexts on platforms
  cl_context_properties contextProperties[] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties)platformIds[iplat],
    0
  };
 
  if(Monitor>0){
    printf("Creating CPU context %d on platform %d\n", 1, iplat);
  }
 
  list_of_contexts[1] = 
    clCreateContextFromType(contextProperties, 
			    CL_DEVICE_TYPE_CPU, NULL, NULL, &retval);
 
  if(Monitor>=0 && retval != CL_SUCCESS){
    printf("Could not create CPU context on platform %d\n", i);
  }
 
  if(Monitor>0){
    printf("Creating GPU context 0 on platform %d\n", iplat);
  }
 
  list_of_contexts[0] = 
    clCreateContextFromType(contextProperties, 
			    CL_DEVICE_TYPE_GPU, NULL, NULL, &retval);
 
  if(Monitor>=0 && retval != CL_SUCCESS){
    printf("Could not create GPU context on platform %d\n", i);
  }
 
  // in a loop over devices of the seleceted platform   
  for(idev=0; idev<number_of_devices;idev++){
 
    if(Monitor>0){
      printf("\nFor context %d and device %d:\n",
	     idev, idev);
    }
    device = list_of_devices[idev];
    icon = idev;
 
    // choose OpenCL context on first available platform
    context = list_of_contexts[icon];
 
    if(context !=0){
 
      commandQueue = clCreateCommandQueue(context, device, 0, NULL);
      if (commandQueue == NULL) {
	printf("Failed to create commandQueue for device %d\n", idev);
	exit(0);
      }
 
      if(Monitor>0){
	printf("Reading program from source\n");
      }
 
      // read source code from file
      FILE *fp;
      char* source;
      long int size;
 
      fp = fopen("HelloWorld.cl", "rb");
      if(!fp) {
	printf("Could not open kernel file\n");
	exit(-1);
      }
      int status = fseek(fp, 0, SEEK_END);
      if(status != 0) {
	printf("Error seeking to end of file\n");
	exit(-1);
      }
      size = ftell(fp);
      if(size < 0) {
	printf("Error getting file position\n");
	exit(-1);
      }
 
      rewind(fp);
 
      source = (char *)malloc(size + 1);
 
      int i;
      for (i = 0; i < size+1; i++) {
	source[i]='\0';
      }
 
      if(source == NULL) {
	printf("Error allocating space for the kernel source\n");
	exit(-1);
      }
 
      fread(source, 1, size, fp);
      source[size] = '\0';
 
      const char* source_const = source;
      cl_program program = clCreateProgramWithSource(context, 1,
						     &source_const,
						     NULL, NULL);
      if (program == NULL)
	{
	  printf("Failed to create CL program from source.\n");
	  exit(-1);
	}
 
      if(Monitor>0){
	printf("Creating program and kernel\n");
      }
      // build program (passing options to compiler if necessary
      retval = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
      char* buildLog; size_t size_of_buildLog; 
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
			    0, NULL, &size_of_buildLog); 
      buildLog = malloc(size_of_buildLog+1); 
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
			    size_of_buildLog, buildLog, NULL); 
      buildLog[size_of_buildLog]= '\0'; 
      printf("Kernel buildLog: %s\n", buildLog); 
      if (retval != CL_SUCCESS)
	{
	  printf("Error in kernel\n");
	  clReleaseProgram(program);
	  exit(-1);
	}
 
 
      // Create OpenCL kernel
      kernel = clCreateKernel(program, "hello_kernel", NULL);
      if (kernel == NULL)
	{
	  printf("Failed to create kernel.\n");
	  exit(0);
	}
 
      if(Monitor>0){
	printf("Creating memory objects\n");
      }
      // Create memory objects that will be used as arguments to
      // kernel.  First create host memory arrays that will be
      // used to store the arguments to the kernel
      float result[1];
      float a[1];
      float b[1];
      a[0] = 2;
      b[0] = 2;
 
      memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
				     sizeof(float), NULL, NULL);
      memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
				     sizeof(float), NULL, NULL);
      memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
				     sizeof(float), NULL, NULL);
 
      if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){
	printf("Error creating memory objects.\n");
	return 0;
      }
 
      if(Monitor>0){
	printf("Sending kernel arguments\n");
      }
      retval = clEnqueueWriteBuffer(
				    commandQueue, 
				    memObjects[0], 
				    CL_FALSE, 
				    0, 
				    sizeof(float),                         
				    a, 
				    0, 
				    NULL, 
				    NULL);
 
      // Use clEnqueueWriteBuffer() to write input array B to 
      // the device buffer bufferB
      retval = clEnqueueWriteBuffer(
				    commandQueue, 
				    memObjects[1], 
				    CL_FALSE, 
				    0, 
				    sizeof(float),                         
				    b,
				    0, 
				    NULL, 
				    NULL);
 
      // Set the kernel arguments (result, a, b)
      retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
      retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
      retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to Set the kernel arguments.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
 
      if(Monitor>0){
	printf("Running the kernel!\n");
      }
      size_t globalWorkSize[1] = { 1 };
      size_t localWorkSize[1] = { 1 };
 
      // Queue the kernel up for execution across the array
      retval = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
				      globalWorkSize, localWorkSize,
				      0, NULL, NULL);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to queue kernel for execution.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
 
      if(Monitor>0){
	printf("Transfering back results\n");
      }
      // Read the output buffer back to the Host
      retval = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
				   0, sizeof(float), result,
				   0, NULL, NULL);
      if (retval != CL_SUCCESS)
	{
	  printf("Failed to read result buffer.\n");
	  //Cleanup(context, commandQueue, program, kernel, memObjects);
	  return 1;
	}
 
      // Verify the output
      if(result[0]==4)  {
        printf("Output is correct: %lf + %lf = %lf\n",
	       a[0], b[0], result[0]);
      } else {
        printf("Output is incorrect: %lf + %lf != %lf\n",
	       a[0], b[0], result[0]);
      }
 
      for (i = 0; i < 3; i++)
      	{
      	  if (memObjects[i] != 0)
            clReleaseMemObject(memObjects[i]);
      	}
      if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
 
      if (kernel != 0)
        clReleaseKernel(kernel);
 
      if (program != 0)
        clReleaseProgram(program);
 
    }
  }
 
  free(list_of_devices);
  free(platformIds);
 
  return 0;
}

HelloWorld.cl

__kernel void hello_kernel(__global const float *a,
		           __global const float *b,
			   __global float *result)
{
    int gid = get_global_id(0);
 
    if(gid<1) result[gid] = a[gid] + b[gid];
}

Zajęcia 6

Pliki zadania: opencl_vecadd.tgz Działające 3 kernele: sala404.zip

opencl_vecadd/vecadd_host/execute_kernels.c

#include<stdlib.h>
#include<stdio.h>
#include <math.h>
 
#include<omp.h>
#include <CL/cl.h>
 
#include"uth_ocl_intf.h"
 
#include"./vecadd_host.h"
#include"./hardware_params.h"
 
#define time_measurments
 
#ifdef time_measurments
#include"uth_time_intf.h"
  static double t_begin, t_end, t_total;
#endif
 
int execute_kernel_2_CPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
 
  // create the kernel for CPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. Restoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
  int nr_cores_CPU = omp_get_num_procs();
  int nr_threads = nr_cores_CPU; // total number of threads
  int work_group_size = 1; // number of threads in work-group
  int nr_work_groups = nr_threads / work_group_size;
  globalWorkSize[0] = nr_threads ;
  localWorkSize[0] = work_group_size ;
 
  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	 nr_cores_CPU, nr_cores_CPU, 1);
  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	 work_group_size, nr_threads / work_group_size, 
	 nr_threads / work_group_size / nr_cores_CPU);
  printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU, 
	 globalWorkSize[0]/nr_cores_CPU);
  printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
 
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  int size = array_size;
  int size_per_thread = ceilf((float)size/nr_threads); 
 
  // Set the kernel arguments (result, a, b)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
 
  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
 
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
 
int execute_kernel_3_CPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the kernel for CPU
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
				  // kernel name:         , file:
				  "vecadd_3_opt_blocks_kernel","vecadd_3_opt_blocks.cl",monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
  int nr_cores_CPU = omp_get_num_procs();
  int nr_threads = nr_cores_CPU; // total number of threads
  int work_group_size = 1; // number of threads in work-group
  int nr_work_groups = nr_threads / work_group_size;
  globalWorkSize[0] = nr_threads ;
  localWorkSize[0] = work_group_size ;
 
  printf("\t\tExecuting kernel %d, on platform %d and device %d\n",
	 kernel_index, platform_index, device_index);
  printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	 nr_cores_CPU, nr_cores_CPU, 1);
  printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	 work_group_size, nr_threads / work_group_size, 
	 nr_threads / work_group_size / nr_cores_CPU);
  printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	 globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU, 
	 globalWorkSize[0]/nr_cores_CPU);
  printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	 array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  int size = array_size;
 
  // Set the kernel arguments (result, a, b)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  if (retval != CL_SUCCESS) {
    printf("Failed to Set the kernel arguments.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    return 1;
  }
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
// tutaj modyfikujemy
int execute_kernel_0_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the kernel for GPU
    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "vecadd_0_kernel", "vecadd_0.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
 
  // globalna liczba wątków i lokalna liczba wątków
  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size;
  localWorkSize[0] = work_group_size;
 
  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], 
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }
 
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  // Set the kernel arguments (result, a, b)
  // ustawiamy liczbę argumentów
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  if (retval != CL_SUCCESS) {
      printf("Failed to Set the kernel arguments.\n");
      //Cleanup(context, command_queue, program, kernel, memObjects);
      return 1;
    }
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
 
int execute_kernel_1_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the kernel for GPU
/*
    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
 
    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
 				    // kernel name:         , file: */
 				    "vecadd_1_kernel", "vecadd_1.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
 
  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size / 4;
  localWorkSize[0] = work_group_size;
 
  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], 
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }
 
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  // Set the kernel arguments (result, a, b)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  if (retval != CL_SUCCESS) {
      printf("Failed to Set the kernel arguments.\n");
      //Cleanup(context, command_queue, program, kernel, memObjects);
      return 1;
    }
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
 
int execute_kernel_2_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the kernel for GPU
/*    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
 				    // kernel name:         , file: */
 				    "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
 
  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size / work_group_size;
  localWorkSize[0] = work_group_size;
 
  int size_per_thread = work_group_size;
  int size = array_size;
 
  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], 
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }
 
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  // Set the kernel arguments (result, a, b)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
  retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
  if (retval != CL_SUCCESS) {
      printf("Failed to Set the kernel arguments.\n");
      //Cleanup(context, command_queue, program, kernel, memObjects);
      return 1;
    }
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
 
int execute_kernel_3_GPU(
			 int platform_index,
			 int device_index,
			 int kernel_index,
			 int array_size,
			 cl_mem* memObjects,
			 int monitor
			 )
{
 
  cl_kernel kernel;
  cl_event kernel_execution;
  cl_ulong startTime;
  cl_ulong endTime;
  double exec_time;
  cl_int retval;
 
  size_t globalWorkSize[1] = { 0 };
  size_t localWorkSize[1] = { 0 };
 
  int i,j,k;
 
  int size = array_size;
 
  /*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the kernel for GPU
/*    utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				    // kernel name:         , file:
				    "vecadd_0_kernel", "vecadd_0.cl", monitor);*/
     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
 				    // kernel name:         , file:
				    "vecadd_3_opt_blocks_kernel", "vecadd_3_opt_blocks.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
 
 
  // choose the context
  cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
  // choose the command queue
  cl_command_queue command_queue = 
    utr_ocl_select_command_queue(platform_index, device_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
	   platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL){ 
    printf("failed to restore context and command queue for platform %d, device %d\n", 
	   platform_index, device_index);
    printf("%lu %lu\n", context, command_queue);
  }
 
 
  int NDRange_dim = 1;
 
  int work_group_size = WORK_GROUP_SIZE;
  globalWorkSize[0] = array_size;
  localWorkSize[0] = work_group_size;
 
  if(monitor>UTC_BASIC_INFO){
    printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
    printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
	   NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
    printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
	   localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
	   globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
    printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
    printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
	   globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], 
	   globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
  }
 
 
  // choose the kernel
  kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
  if(monitor>UTC_BASIC_INFO){
    printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	   kernel_index, platform_index, device_index);
  }
 
  if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
    printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	   platform_index, device_index, kernel_index);
    printf("context %lu, command queue %lu, kernel %lu\n", 
	   context, command_queue, kernel);
  }
 
  // Set the kernel arguments (result, a, b)
  retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
  retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
  retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
  retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
 
  if (retval != CL_SUCCESS) {
      printf("Failed to Set the kernel arguments.\n");
      //Cleanup(context, command_queue, program, kernel, memObjects);
      return 1;
    }
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_begin = time_clock();
#endif
  // Queue the kernel up for execution across the array
  retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
				  globalWorkSize, localWorkSize,
				  0, NULL, &kernel_execution);
  clWaitForEvents(1, &kernel_execution);
 
 
#ifdef time_measurments
  clFinish(command_queue);
  t_end = time_clock();
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(kernel_execution,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  exec_time = ((double)endTime - (double)startTime)*1.0e-9;
  printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
	 t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
  printf("\tNumber of operations %d, performance %lf GFlops\n",
	 array_size, array_size / exec_time * 1e-9);
  printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
	 3*array_size*sizeof(float)*1e-9,
	 3*array_size*sizeof(float)/exec_time*1e-9);
#endif
 
  if (retval != CL_SUCCESS) {
    printf("Failed to queue kernel for execution.\n");
    //Cleanup(context, command_queue, program, kernel, memObjects);
    exit(-1);
  }
 
  return(0);
}
 
 
 
 
 
/*     // create the fourth kernel for GPU */
/*     kernel_index = 4;  */
/*     utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, */
/* 				    // kernel name:         , file: */
/* 				    "vecadd_4_opt_blocks_vect_kernel", "vecadd_4_opt_blocks_vect.cl", monitor); */

opencl_vecadd/vecadd_2_blocks.cl

__kernel void vecadd_2_blocks_kernel(__global const float *a,
				     __global const float *b,
				     __global float *result,
		                     const int size,
				     const int size_per_thread)
{
  int gid = get_global_id(0);
  int index_start = gid * size_per_thread;
  int index_end = (gid+1) * size_per_thread;
 
  if (gid == 0) {
	//printf("size: %d, %d", size, size_per_thread);
  }
 
  for (int i=index_start; i < index_end && i < size; i++) {	
  //for (int i=index_start; i < index_end ; i++) {	
    result[i] = a[i]+b[i];
  }
}

Zajęcia 7

zajecia_7.zip

tmp/opencl_mat_transp/mat_transp_1.cl

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
__kernel void mat_transp_1_kernel(
  __global float *A , 
  __global float *B ,
  __global float *C ,
  int N)
{
 
int id = get_global_id(0) * get_global_size(0) + get_global_id(1);
float temp = 0.0;
 
for(int i = 0; i<N;i++){
	temp += A[id*N + i] *B[i];
}
 
C[id] = temp;
}

tmp/opencl_mat_transp/mat_transp/mat_transp.c

#include<stdlib.h>
#include<stdio.h>
#include <math.h>
 
#include <CL/cl.h>
 
#include"uth_ocl_intf.h"
 
#define time_measurments
 
#ifdef time_measurments
#include"uth_time_intf.h"
  static double t_begin, t_end, t_total;
#endif
 
 
 
#define BLOCK_SIZE 16
#define NR_GROUPS 16
#define MULT 16
#define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT)
#define ROZMIAR (WYMIAR*WYMIAR)
// Matrices are stored in row-major order: 
// M(row, col) = M( row * WYMIAR + col ) 
 
int verify_result(
  float* result,
  float* result_compare
		  )
{
  // Verify the result
  int result_OK = 1;
  int i,j;
  for(i = 0; i < WYMIAR; i++) {
      if(fabs(result[i] - result_compare[i])>1.e-6) {
	result_OK = 0;
	break;
    }
  }
  printf("\t\t6. verifying results: ");
  if(result_OK) {
    printf("Output is correct\n");
  } else {
    printf("Output is incorrect\n");
    j = 0;
    for(i = 0; i < WYMIAR, j<100; i++) {
	if(fabs(result[i] - result_compare[i])>1.e-9) {
	  j++;
	  printf("%d %16.8f %16.8f\n", 
	  	 i, result[i], result_compare[i]);
	}
    }
    exit(0);
  }
    /* for(i = 0; i < length; i++) { */
    /*   printf("%16.8f %16.8f\n", result[i], result_compare[i]); */
    /* } */
 
  return(result_OK);
}
 
  /*----------------KERNEL CREATION PHASE----------------------*/
void create_kernels()
{
 
  // for all operations indicate explicit info messages
  int monitor = UTC_BASIC_INFO + 1;
 
  int kernel_index;
 
  int platform_index = utv_ocl_struct.current_platform_index;
 
  if(utr_ocl_CPU_context_exists(platform_index)){
 
#ifdef time_measurments
  t_begin = time_clock();
#endif
 
  // create the first kernel for CPU
  /* kernel_index = 0;  */
  /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, */
  /* 				  // kernel name:         , file: */
  /* 				  "mat_transp_1_kernel", "mat_transp_1.cl", monitor); */
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif
 
  }
 
  if(utr_ocl_GPU_context_exists(platform_index)){
 
 
#ifdef time_measurments
  t_begin = time_clock();
#endif
  // create the first kernel for GPU
  kernel_index = 0; 
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "mat_transp_1_kernel", "mat_transp_1.cl", monitor);
 
#ifdef time_measurments
  t_end = time_clock();
  printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif
 
  // create the second kernel for GPU
  kernel_index = 1; 
  utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
				  // kernel name:         , file:
				  "mat_transp_2_kernel", "mat_transp_2.cl", monitor);
 
 
 
  }
 
}
 
  /*----------------EXECUTION PHASE----------------------*/
int execute_kernels()
{
 
  // for all operations indicate explicit info messages
  int monitor = UTC_BASIC_INFO + 1;
 
  int platform_index = utv_ocl_struct.current_platform_index;
  utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index];
 
  int kernel_index;
  int i,j,n;
 
  double nr_access;
  double t1,t2;
 
  if(monitor>UTC_BASIC_INFO){
    printf("\n------------Starting execution phase----------------\n");
  }
 
  // create matrices
  n=WYMIAR;
  float* A = (float *) malloc(ROZMIAR*sizeof(float));
  float* B = (float *) malloc(WYMIAR*sizeof(float));
  float* C = (float *) malloc(WYMIAR*sizeof(float));
  float* D = (float *) malloc(WYMIAR*sizeof(float));
 
  for(i=0;i<ROZMIAR;i++) A[i]=1.0*i/10000000.0;
 
  for(i=0;i<WYMIAR;i++) B[i]=1.0*i/10000000.0;
 
  // mnozenie macierz wektor
  for(i=0; i<WYMIAR; i++){
	for(j=0; j<WYMIAR; j++){
		D[i] += A[i*WYMIAR + j] * B[j];	
	}
  }
 
  nr_access= ROZMIAR + 2.0*WYMIAR; // read + write
 
  printf("mat_transp: nr_access %lf\n", nr_access);
 
  // get hardware characteristics to select good matrix shape
  // the set of device characteristics stored in data structure
  int device_index = 0; 
  utt_ocl_device_struct device_struct = 
    utv_ocl_struct.list_of_platforms[platform_index].list_of_devices[device_index];
  double global_mem_bytes = device_struct.global_mem_bytes;
  double global_max_alloc = device_struct.global_max_alloc;
  double local_mem_bytes = device_struct.local_mem_bytes;
  double constant_mem_bytes = device_struct.constant_mem_bytes;
  int max_num_comp_units = device_struct.max_num_comp_units;
  int max_work_group_size = device_struct.max_work_group_size;
 
  // in a loop over devices (or for a selected device)
  int idev=0; 
  for(idev=0; idev<platform_struct.number_of_devices; idev++){
 
    // int device_type = .....
    // choose device_index
    // int device_index = utr_ocl_select_device(platform_index, device_type);
    int device_index = idev;
    int device_type = utr_ocl_device_type(platform_index, device_index); 
 
    if(device_index>0 && device_type==utr_ocl_device_type(platform_index, device_index-1)) break; 
    if(device_type == UTC_OCL_DEVICE_CPU) break;
 
    // choose the context
    cl_context context = utr_ocl_select_context(platform_index, device_index);  
 
    // choose the command queue
    cl_command_queue command_queue = 
      utr_ocl_select_command_queue(platform_index, device_index);  
 
    if(monitor>UTC_BASIC_INFO){
      printf("\nExecution: \t0. restoring context and command queue for platform %d and device %d\n",
	     platform_index, device_index);
    }
 
    if(context == NULL || command_queue == NULL){ 
 
      printf("failed to restore context and command queue for platform %d, device %d\n", 
	     platform_index, device_index);
      printf("%lu %lu\n", context, command_queue);
    }
 
    // choose the kernel
    kernel_index = 0;
    cl_kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);  
 
    if(monitor>UTC_BASIC_INFO){
      printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
	     kernel_index, platform_index, device_index);
    }
 
    if(context == NULL || command_queue == NULL || kernel == NULL){ 
 
      printf("failed to restore kernel for platform %d, device %d, kernel %d\n", 
	     platform_index, device_index, kernel_index);
      printf("context %lu, command queue %lu, kernel %lu\n", 
	     context, command_queue, kernel);
    }
 
    for(i=0;i<WYMIAR;i++) C[i]=0.0;
    time_init(); t1 = time_clock();
 
    // call routine to perform matrix transposition
    mat_transp_host(kernel_index, A, B, C, n, context, kernel, command_queue);
 
    t2 = time_clock(); time_print();
    printf("GB/s = %lf\n\n", nr_access*sizeof(float)/(t2-t1)/1024/1024/1024);
 
 
    // verify result 
    verify_result(C, D);
 
 
  } // end loop over devices
 
  return(0);
}
 
 
 
// Matrix transposition - Host code 
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE 
int mat_transp_host(
  int kernel_index,
  float* A, 
  float* B,
  float* C, 
  int N, 
  const cl_context context, 
  const cl_kernel mat_transp_kernel, 
  const cl_command_queue queue
		) 
{ 
 
// Load A to device memory 
  size_t size_bytesm = N*N*sizeof(float); 
  size_t size_bytesw = N*sizeof(float); 
 
 
  cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY,
				size_bytesm, NULL, NULL); 
 
  // Write A to device memory 
  clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, size_bytesm, A, 0, 0, 0); 
 
  cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY,
				size_bytesw, NULL, NULL); 
 
  // Write A to device memory 
  clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, size_bytesw, B, 0, 0, 0); 
 
  // Allocate B in device memory 
  cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size_bytesw, NULL, NULL); 
 
 
  // Invoke kernel 
  clSetKernelArg(mat_transp_kernel, 0, sizeof(cl_mem), (void*)&d_A);
  clSetKernelArg(mat_transp_kernel, 1, sizeof(cl_mem), (void*)&d_B);
  clSetKernelArg(mat_transp_kernel, 2, sizeof(cl_mem), (void*)&d_C);
  clSetKernelArg(mat_transp_kernel, 3, sizeof(int), (void*)&N);
 
  size_t localWorkSize[3];
  size_t globalWorkSize[3];
  cl_uint work_dim;
 
  if(kernel_index==3){
 
    work_dim = 2;
    localWorkSize[0] = BLOCK_SIZE; 
    globalWorkSize[0] = N/MULT; 
    localWorkSize[1] = BLOCK_SIZE; 
    globalWorkSize[1] = N/MULT; 
    localWorkSize[2] = 0; 
    globalWorkSize[2] = 0; 
 
 
  }
  else{
 
    work_dim = 2;
    localWorkSize[0] = BLOCK_SIZE; 
    globalWorkSize[0] = sqrt(N); 
    localWorkSize[1] = BLOCK_SIZE; 
    globalWorkSize[1] = sqrt(N); 
    localWorkSize[2] = 0; 
    globalWorkSize[2] = 0; 
 
  }
 
  clFinish(queue);
  double t1 = time_clock();
  // Enqueue a kernel run call
  cl_event ndrEvt;
  clEnqueueNDRangeKernel(queue, mat_transp_kernel, work_dim, 0, 
			 globalWorkSize, localWorkSize, 0, 0, &ndrEvt); 
  clWaitForEvents(1, &ndrEvt);
  clFinish(queue);
  double t2 = time_clock();
  // Calculate performance 
  cl_ulong startTime;
  cl_ulong endTime;
 
  // Get kernel profiling info 
  clGetEventProfilingInfo(ndrEvt,
			  CL_PROFILING_COMMAND_START,
			  sizeof(cl_ulong),
			  &startTime,
			  0);
  clGetEventProfilingInfo(ndrEvt,
			  CL_PROFILING_COMMAND_END,
			  sizeof(cl_ulong),
			  &endTime,
			  0);
  double time = (double)endTime - (double)startTime;
  printf("\nKernel execution internal: time %lf, GB/s = %lf\n", 
	 time*1.0e-9, 2.0*N*N*sizeof(float)/(time*1.0e-9)/1024/1024/1024);
 
  printf("Kernel execution external: time %lf,  GB/s = %lf\n\n", 
	 t2-t1, 2.0*N*N*sizeof(float)/(t2-t1)/1024/1024/1024);
 
  // Read B from device memory 
  clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, size_bytesw, C, 0, 0, 0); 
 
  // Free device memory 
  clReleaseMemObject(d_A); 
  clReleaseMemObject(d_B); 
  clReleaseMemObject(d_C);
 
  return(0);
 
}

Kod wogu

server.c

/*
    C socket server example, handles multiple clients using threads
    Compile
    gcc server.c -lpthread -o server
*/
 
#include<stdio.h>
#include<string.h>    //strlen
#include<stdlib.h>    //strlen
#include<sys/socket.h>
#include<arpa/inet.h> //inet_addr
#include<unistd.h>    //write
#include<pthread.h> //for threading , link with lpthread
#include <unistd.h>
 
//the thread function
void *connection_handler(void *);
 
int main(int argc , char *argv[])
{
    int counter = 0;
    pid_t pid = fork();
 
    if (pid == 0)
    {
       	    int sock;
	    struct sockaddr_in server;
	    char message[1000] , server_reply[2000];
            int counter;
	    unsigned int ip[] = { 192, 168, 102, 1 };
            char* ip_final;
		char *buf;
		size_t sz;
 
	    //Create socket
 
            for ( counter = 0; counter < 255; ++counter ) {
 
		    sock = socket(AF_INET , SOCK_STREAM , 0);
		    if (sock == -1)
		    {
			printf("Could not create socket");
		    }
		    puts("Socket created");
 
		    //printf("192.168.102.%d", counter);
 
		sz = snprintf(NULL, 0, "192.168.102.%d", counter);
		buf = (char *)malloc(sz + 1); /* make sure you check for != NULL in real code */
 
		    server.sin_addr.s_addr = inet_addr(buf);
		    server.sin_family = AF_INET;
		    server.sin_port = htons( 65001 );
 
		    //Connect to remote server
		    if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
		    {
			perror("connect failed. Error");
			continue;
		    }
 
		    puts("Connected\n");
 
		    //keep communicating with server
		    while(1)
		    {
			//Send some data
			if( send(sock , "Werset 6-client" , 15 , 0) < 0)
			{
			    puts("Send failed");
			    return 1;
			}
 
			//Receive a reply from the server
			if( recv(sock , server_reply , 2000 , 0) < 0)
			{
			    puts("recv failed");
			    break;
			}
 
			puts(server_reply);
 
			break;
		    }
 
		    close(sock);
           }
    }
    else if (pid > 0)
    {
        int socket_desc , client_sock , c;
	    struct sockaddr_in server , client;
 
	    //Create socket
	    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
	    if (socket_desc == -1)
	    {
		printf("Could not create socket");
	    }
	    puts("Socket created");
 
	    //Prepare the sockaddr_in structure
	    server.sin_family = AF_INET;
	    server.sin_addr.s_addr = INADDR_ANY;
	    server.sin_port = htons( 65001 );
 
	    //Bind
	    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
	    {
		//print the error message
		perror("bind failed. Error");
		return 1;
	    }
	    puts("bind done");
 
	    //Listen
	    listen(socket_desc , 3);
 
	    //Accept and incoming connection
	    puts("Waiting for incoming connections...");
	    c = sizeof(struct sockaddr_in);
 
 
	    //Accept and incoming connection
	    puts("Waiting for incoming connections...");
	    c = sizeof(struct sockaddr_in);
		pthread_t thread_id;
 
	    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
	    {
		puts("Connection accepted");
 
		if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
		{
		    perror("could not create thread");
		    return 1;
		}
 
		//Now join the thread , so that we dont terminate before the thread
		//pthread_join( thread_id , NULL);
		puts("Handler assigned");
	    }
 
	    if (client_sock < 0)
	    {
		perror("accept failed");
		return 1;
	    }
    }
    else
    {
        // fork failed
        printf("fork() failed!\n");
        return 1;
    }
 
    return 0;
}
 
/*
 * This will handle connection for each client
 * */
void *connection_handler(void *socket_desc)
{
    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    int read_size;
    char *message , client_message[2000];
 
    //Receive a message from client
    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
    {
	printf("%s\n", client_message);
 
	//end of string marker
	client_message[read_size] = '\0';
 
	//Send the message back to client
	write(sock , "Werset 6" , 8);
 
	//clear the message buffer
	memset(client_message, 0, 2000);
    }
 
    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }
 
    return 0;
}

Makefile

# kompilator c
CC = gcc
 
# konsolidator
CFLAGS=-g
 
all: server client
 
server: server.o
	$(CC) $(LFLAGS) server.o -lpthread -o server
 
server.o: server.c
	$(CC) -c server.c -o server.o
 
client: client.o
	$(CC) client.o -o client
 
client.o: client.c
	$(CC) -c client.c -o client.o
 
clean:
	rm -f *.o

Zajęcia 9

/*
    C socket server example, handles multiple clients using threads
    Compile
    gcc server.c -lpthread -o server
*/
 
#include<stdio.h>
#include<string.h>    //strlen
#include<stdlib.h>    //strlen
#include<sys/socket.h>
#include<arpa/inet.h> //inet_addr
#include<unistd.h>    //write
#include<pthread.h> //for threading , link with lpthread
 
//the thread function
void *connection_handler(void *);
 
int main(int argc , char *argv[])
{
    int socket_desc , client_sock , c;
    struct sockaddr_in server , client;
 
    //Create socket
    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
    if (socket_desc == -1)
    {
        printf("Could not create socket");
    }
    puts("Socket created");
 
    //Prepare the sockaddr_in structure
    server.sin_family = AF_INET;
    server.sin_addr.s_addr = INADDR_ANY;
    server.sin_port = htons( 8888 );
 
    //Bind
    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
    {
        //print the error message
        perror("bind failed. Error");
        return 1;
    }
    puts("bind done");
 
    //Listen
    listen(socket_desc , 3);
 
    //Accept and incoming connection
    puts("Waiting for incoming connections...");
    c = sizeof(struct sockaddr_in);
 
 
    //Accept and incoming connection
    puts("Waiting for incoming connections...");
    c = sizeof(struct sockaddr_in);
	pthread_t thread_id;
 
    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
    {
        puts("Connection accepted");
 
        if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
        {
            perror("could not create thread");
            return 1;
        }
 
        //Now join the thread , so that we dont terminate before the thread
        //pthread_join( thread_id , NULL);
        puts("Handler assigned");
    }
 
    if (client_sock < 0)
    {
        perror("accept failed");
        return 1;
    }
 
    return 0;
}
 
/*
 * This will handle connection for each client
 * */
void *connection_handler(void *socket_desc)
{
    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    int read_size;
    char *message , client_message[2000];
 
    //Receive a message from client
    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
    {
	//end of string marker
	client_message[read_size] = '\0';
 
	//Send the message back to client
	write(sock , "Werset 6" , strlen(client_message));
 
	//clear the message buffer
	memset(client_message, 0, 2000);
    }
 
    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }
 
    return 0;
} 

Client C

#include<stdio.h> //printf
#include<string.h>    //strlen
#include<sys/socket.h>    //socket
#include<arpa/inet.h> //inet_addr
 
int main(int argc , char *argv[])
{
    int sock;
    struct sockaddr_in server;
    char message[1000] , server_reply[2000];
 
    //Create socket
    sock = socket(AF_INET , SOCK_STREAM , 0);
    if (sock == -1)
    {
        printf("Could not create socket");
    }
    puts("Socket created");
 
    server.sin_addr.s_addr = inet_addr("127.0.0.1");
    server.sin_family = AF_INET;
    server.sin_port = htons( 8888 );
 
    //Connect to remote server
    if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
    {
        perror("connect failed. Error");
        return 1;
    }
 
    puts("Connected\n");
 
    //keep communicating with server
    while(1)
    {
        printf("Enter message : ");
        scanf("%s" , message);
 
        //Send some data
        if( send(sock , message , strlen(message) , 0) < 0)
        {
            puts("Send failed");
            return 1;
        }
 
        //Receive a reply from the server
        if( recv(sock , server_reply , 2000 , 0) < 0)
        {
            puts("recv failed");
            break;
        }
 
        puts("Server reply :");
        puts(server_reply);
    }
 
    close(sock);
    return 0;
}

Kod Maciek Gonzo itp

#include<stdio.h>
#include<string.h>    //strlen
#include<stdlib.h>    //strlen
#include<sys/socket.h>
#include<arpa/inet.h> //inet_addr
#include<unistd.h>    //write
#include<pthread.h> //for threading , link with lpthread
 
//the thread function
void *connection_handler(void *);
 
void *client_function(void *socket_desc){
  int sock;
    struct sockaddr_in server_id;
    char message[1000] , server_reply[2000];
 
    //Create socket
    sock = socket(AF_INET , SOCK_STREAM , 0);
    if (sock == -1)
    {
        printf("Could not create socket");
    }
    puts("Socket created");
     // 192.168.102.63 wogu
    server_id.sin_addr.s_addr = inet_addr("localhost");
    server_id.sin_family = AF_INET;
    server_id.sin_port = htons( 65000 );
 
    //Connect to remote server
 
    if (connect(sock , (struct sockaddr *)&server_id , sizeof(server_id)) < 0)
    {
        perror("connect failed. Error");
 
    }else{
 
    puts("Connected\n");
 
    //keep communicating with server
    while(1)
    {
        printf("Enter message : ");
        scanf("%s" , message);
 
        //Send some data
        if( send(sock , message , strlen(message) , 0) < 0)
        {
            puts("Send failed");
 
        }
 
        //Receive a reply from the server
        if( recv(sock , server_reply , 2000 , 0) < 0)
        {
            puts("recv failed");
            break;
        }
 
        puts("Server reply :");
        puts(server_reply);
    }
 
    close(sock); 
}
 
};
 
 
void *server_function(void * sock){
    printf("SERVER Start");
fflush(stdout);
    int socket_desc , client_sock , c;
    pthread_t thread_id;
    struct sockaddr_in server , client;
 
    //Create socket
    socket_desc = socket(AF_INET , SOCK_STREAM , 0);
    if (socket_desc == -1)
    {
        printf("Could not create socket");
    }
    puts("Socket created");
 
    //Prepare the sockaddr_in structure
    server.sin_family = AF_INET;
    server.sin_addr.s_addr = INADDR_ANY;
    server.sin_port = htons( 65000);
 
    //Bind
    if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
    {
        //print the error message
        perror("bind failed. Error");
        return;
    }
    puts("bind done");
 
 
 
    //Listen
    listen(socket_desc , 3);
 
    //Accept and incoming connection
    puts("Waiting for incoming connections...");
    c = sizeof(struct sockaddr_in);
fflush(stdout);
 
    while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
    {
        puts("Connection accepted");
 
        if( pthread_create( &thread_id , NULL ,  connection_handler , (void*) &client_sock) < 0)
        {
            perror("could not create thread");
            return;
        }
 
        //Now join the thread , so that we dont terminate before the thread
        //pthread_join( thread_id , NULL);
        puts("Handler assigned");
    }
 
    if (client_sock < 0)
    {
        perror("accept failed");
        return;
    }
}
 
 
int main(int argc , char *argv[])
{
    pthread_t thread_id;
//Create server
        if( pthread_create( &thread_id , NULL ,  server_function , NULL) != 0)
        {
            perror("could not create server thread");
	    fflush(stdout);
            return 1;
        }
    pthread_t thread_id2;
 
	// Create client
        if( pthread_create( &thread_id2 , NULL ,  client_function , NULL) != 0)
        {
            perror("could not create client thread");
	    fflush(stdout);
            return 1;
        }
 
    return 0;
}
 
/*
 * This will handle connection for each client
 * */
void *connection_handler(void *socket_desc)
{
    //Get the socket descriptor
    int sock = *(int*)socket_desc;
    int read_size;
    char *message , client_message[2000];
 
    //Receive a message from client
    while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
    {
        //end of string marker
		client_message[read_size] = '\0';
 
		//Send the message back to client
        write(sock , "Werset 5" , strlen(client_message));
 
		//clear the message buffer
		memset(client_message, 0, 2000);
    }
 
    if(read_size == 0)
    {
        puts("Client disconnected");
        fflush(stdout);
    }
    else if(read_size == -1)
    {
        perror("recv failed");
    }
 
    return 0;
}