# kompilator c CCOMP = mpicc # konsolidator LINK = mpicc MPIRUN = /usr/lib64/openmpi/bin/mpiexec # opcje optymalizacji: # wersja do debugowania # OPT = -g -DDEBUG -p # wersja zoptymalizowana do mierzenia czasu # OPT = -O3 -fopenmp -p # pliki naglowkowe #INC = -I../pomiar_czasu # biblioteki #LIB = -L../pomiar_czasu -lm LIB = -lm # zaleznosci i komendy heat: heat.o $(LINK) $(OPT) heat.o -o heat $(LIB) heat.o: heat.c $(CCOMP) -c $(OPT) heat.c $(INC) run: $(MPIRUN) -np 8 ./heat clean: rm -f *.o
# include <stdlib.h> # include <stdio.h> # include <math.h> # include "mpi.h" int main ( int argc, char *argv[] ); void heat_part ( int n, int p, int id, double x_min, double x_max ); /******************************************************************************/ int main ( int argc, char *argv[] ) { double a = 0.0; // lewy brzeg przedzialu double b = 1.0; // prawy brzeg przedzialu int i; int id; // rank int n; // liczba punktow dla kazdego wezla int p; // size double x_max; double x_min; MPI_Init ( &argc, &argv ); MPI_Comm_rank ( MPI_COMM_WORLD, &id ); MPI_Comm_size ( MPI_COMM_WORLD, &p ); n = 12; // liczba punktow dla kazdego wezla i = 0; // poczatkowa chwila czasu // wspolrzedna lewego punktu dla wezla id x_min = ( ( double )( p * n + 1 - id * n - i ) * a + ( double )( id * n + i ) * b ) / ( double ) ( p * n + 1 ); i = n + 1; // wspolrzedna prawego punktu dla wezla id x_max = ( ( double )( p * n + 1 - id * n - i ) * a + ( double )( id * n + i ) * b ) / ( double )( p * n + 1 ); heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla MPI_Finalize ( ); } /******************************************************************************/ // obliczenia dla pojedynczego wezla - pojedynczego podobszaru /******************************************************************************/ void heat_part ( int n, int p, int id, double x_min, double x_max ) { double cfl; double *h; double *h_new; int i; int ierr; int j; int j_max; int j_min; double k; MPI_Status status; double t; double t_del; double t_max; double t_min; int tag; double wtime; double *x; double x_del; h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1 x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow k = 0.002 / ( double ) p; // przewodniosc cieplna j_min = 0; // indeksy krokow czasowych - min i max j_max = 100; t_min = 0.0; // chwile czasu - min i max t_max = 10.0; t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami for ( i = 0; i <= n + 1; i++ ) { x[i] = ( ( double ) ( i ) * x_max + ( double ) ( n + 1 - i ) * x_min ) / ( double ) ( n + 1 ); } // ustawienie warunku poczatkowego for ( i = 0; i <= n + 1; i++ ) { h[i] = 95.0; } // sprawdzenie stabilnosci schematu cfl = k * t_del / x_del / x_del; if ( 0.5 <= cfl ) { printf ( " CFL condition failed.\n" ); exit ( 1 ); } wtime = MPI_Wtime ( ); //poczatek pomiaru czasu for ( j = 1; j <= j_max; j++ ) { // wymiana informacji z wezlami sasiednimi tag = 1; if ( id < p - 1 ) { MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD ); } if ( 0 < id ) { MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status ); } tag = 2; if ( 0 < id ) { // DO UZUPELNIENIA MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); } if ( id < p - 1 ) { // DO UZUPELNIENIA MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status ); } // implementacja wzoru roznicowego for ( i = 1; i <= n; i++ ) { h_new[i] = h[i] + t_del * ( k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del + 2.0 * sin ( x[i] * t ) ); } // nowa chwila czasu t = ( ( double ) ( j - j_min ) * t_max + ( double ) ( j_max - j ) * t_min ) / ( double ) ( j_max - j_min ); // przygotowanie do nastepnego kroku czasowego for ( i = 1; i < n + 1; i++ ) { h[i] = h_new[i]; } if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t ); if ( id == p - 1 ) h[n+1] = 75; } // koncowa wymiana informacji z wezlami sasiednimi tag = 11; if ( id < p - 1 ) { MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD ); } if ( 0 < id ) { MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status ); } tag = 12; if ( 0 < id ) { // DO UZUPELNIENIA MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); } if ( id < p - 1 ) { // DO UZUPELNIENIA MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status ); } wtime = MPI_Wtime ( ) - wtime; if ( id == 0 ) { printf ( "\n" ); printf ( " Wall clock elapsed seconds = %f\n", wtime ); } // wydruk wyniku printf ( "%2d T= %f\n", id, t ); printf ( "%2d X= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", x[i] ); } printf ( "\n" ); printf ( "%2d H= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", h[i] ); } printf ( "\n" ); free ( h ); free ( h_new ); free ( x ); return; }
# include <stdlib.h> # include <stdio.h> # include <math.h> # include "mpi.h" int main ( int argc, char *argv[] ); void heat_part ( int n, int p, int id, double x_min, double x_max ); /******************************************************************************/ int main ( int argc, char *argv[] ) { double a = 0.0; // lewy brzeg przedzialu double b = 1000000.0; // prawy brzeg przedzialu int i; int id; // rank int n; // liczba punktow dla kazdego wezla int p; // size double x_max; double x_min; MPI_Init ( &argc, &argv ); MPI_Comm_rank ( MPI_COMM_WORLD, &id ); MPI_Comm_size ( MPI_COMM_WORLD, &p ); n = 1000000; // liczba punktow dla kazdego wezla i = 0; // poczatkowa chwila czasu // wspolrzedna lewego punktu dla wezla id x_min = ( ( double )( p * n + 1 - id * n - i ) * a + ( double )( id * n + i ) * b ) / ( double ) ( p * n + 1 ); i = n + 1; // wspolrzedna prawego punktu dla wezla id x_max = ( ( double )( p * n + 1 - id * n - i ) * a + ( double )( id * n + i ) * b ) / ( double )( p * n + 1 ); heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla MPI_Finalize ( ); } /******************************************************************************/ // obliczenia dla pojedynczego wezla - pojedynczego podobszaru /******************************************************************************/ void heat_part ( int n, int p, int id, double x_min, double x_max ) { double cfl; double *h; double *h_new; int i; int ierr; int j; int j_max; int j_min; double k; MPI_Status status; double t; double t_del; double t_max; double t_min; int tag; double wtime; double *x; double x_del; MPI_Request req1, req2, req3, req4; MPI_Status stat1, stat2, stat3, stat4; h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1 x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow k = 0.002 / ( double ) p; // przewodniosc cieplna j_min = 0; // indeksy krokow czasowych - min i max j_max = 100; t_min = 0.0; // chwile czasu - min i max t_max = 10.0; t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami for ( i = 0; i <= n + 1; i++ ) { x[i] = ( ( double ) ( i ) * x_max + ( double ) ( n + 1 - i ) * x_min ) / ( double ) ( n + 1 ); } // ustawienie warunku poczatkowego for ( i = 0; i <= n + 1; i++ ) { h[i] = 95.0; } // sprawdzenie stabilnosci schematu cfl = k * t_del / x_del / x_del; if ( 0.5 <= cfl ) { printf ( " CFL condition failed.\n" ); exit ( 1 ); } wtime = MPI_Wtime ( ); //poczatek pomiaru czasu for ( j = 1; j <= j_max; j++ ) { // wymiana informacji z wezlami sasiednimi ///NON BLOCK if ( id > 0 ) { MPI_Irecv ( &h[0], 1, MPI_DOUBLE, id-1, 1, MPI_COMM_WORLD, &req1 ); } if ( id < p - 1 ) { MPI_Irecv ( &h[n+1], 1, MPI_DOUBLE, id+1, 2, MPI_COMM_WORLD, &req2 ); } ///END NON BLOCK if ( id > 0 ) { MPI_Isend ( &h[1], 1, MPI_DOUBLE, id-1, 2, MPI_COMM_WORLD, &req3 ); } if ( id < p - 1 ) { MPI_Isend ( &h[n], 1, MPI_DOUBLE, id+1, 1, MPI_COMM_WORLD, &req4 ); } // implementacja wzoru roznicowego for ( i = 2; i <= n-1; i++ ) { h_new[i] = h[i] + t_del * ( k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del + 2.0 * sin ( x[i] * t ) ); } ///NON BLOCK ///WAIT FOR RECEIVE if ( id > 0 ) { MPI_Wait(&req1, &stat1); } if ( id < p - 1 ) { MPI_Wait(&req2, &stat2); } int tmp = 1; h_new[tmp] = h[tmp] + t_del * ( k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del + 2.0 * sin ( x[tmp] * t ) ); tmp = n; h_new[tmp] = h[tmp] + t_del * ( k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del + 2.0 * sin ( x[tmp] * t ) ); ///WAIT FOR SEND if ( id > 0 ) { MPI_Wait(&req3, &stat3); } if ( id < p - 1 ) { MPI_Wait(&req4, &stat4); } ///END NON BLOCK // nowa chwila czasu t = ( ( double ) ( j - j_min ) * t_max + ( double ) ( j_max - j ) * t_min ) / ( double ) ( j_max - j_min ); // przygotowanie do nastepnego kroku czasowego for ( i = 1; i < n + 1; i++ ) { h[i] = h_new[i]; } if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t ); if ( id == p - 1 ) h[n+1] = 75; } // koncowa wymiana informacji z wezlami sasiednimi tag = 11; if ( id < p - 1 ) { MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD ); } if ( 0 < id ) { MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status ); } tag = 12; if ( 0 < id ) { // DO UZUPELNIENIA MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); } if ( id < p - 1 ) { // DO UZUPELNIENIA MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status ); } wtime = MPI_Wtime ( ) - wtime; if ( id == 0 ) { printf ( "\n" ); printf ( " Wall clock elapsed seconds = %f\n", wtime ); } // wydruk wyniku /*printf ( "%2d T= %f\n", id, t ); printf ( "%2d X= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", x[i] ); } printf ( "\n" ); printf ( "%2d H= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", h[i] ); } printf ( "\n" ); */ free ( h ); free ( h_new ); free ( x ); return; }
# include <stdlib.h> # include <stdio.h> # include <math.h> # include "mpi.h" int main ( int argc, char *argv[] ); void heat_part ( int n, int p, int id, double x_min, double x_max ); /******************************************************************************/ int main ( int argc, char *argv[] ) { int id; // rank int n; // liczba punktow dla kazdego wezla int p; // size double x_max; double x_min; MPI_Comm parentcomm; MPI_Status status; MPI_Init ( &argc, &argv ); MPI_Comm_rank ( MPI_COMM_WORLD, &id ); MPI_Comm_size ( MPI_COMM_WORLD, &p ); MPI_Comm_get_parent(&parentcomm); MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status ); MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status ); MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &status ); //MPI_Barrier(MPI_COMM_WORLD); heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla MPI_Finalize ( ); return 0; } /******************************************************************************/ // obliczenia dla pojedynczego wezla - pojedynczego podobszaru /******************************************************************************/ void heat_part ( int n, int p, int id, double x_min, double x_max ) { double cfl; double *h; double *h_new; int i; int ierr; int j; int j_max; int j_min; double k; MPI_Status status; double t; double t_del; double t_max; double t_min; int tag; double wtime; double *x; double x_del; h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1 x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow k = 0.002 / ( double ) p; // przewodniosc cieplna j_min = 0; // indeksy krokow czasowych - min i max j_max = 100; t_min = 0.0; // chwile czasu - min i max t_max = 10.0; t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami for ( i = 0; i <= n + 1; i++ ) { x[i] = ( ( double ) ( i ) * x_max + ( double ) ( n + 1 - i ) * x_min ) / ( double ) ( n + 1 ); } // ustawienie warunku poczatkowego for ( i = 0; i <= n + 1; i++ ) { h[i] = 95.0; } // sprawdzenie stabilnosci schematu cfl = k * t_del / x_del / x_del; if ( 0.5 <= cfl ) { printf ( " CFL condition failed.\n" ); exit ( 1 ); } wtime = MPI_Wtime ( ); //poczatek pomiaru czasu for ( j = 1; j <= j_max; j++ ) { // wymiana informacji z wezlami sasiednimi tag = 1; if ( id < p - 1 ) { MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD ); } if ( 0 < id ) { MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status ); } tag = 2; if ( 0 < id ) { MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); } if ( id < p - 1 ) { MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status ); } // implementacja wzoru roznicowego for ( i = 1; i <= n; i++ ) { h_new[i] = h[i] + t_del * ( k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del + 2.0 * sin ( x[i] * t ) ); } // nowa chwila czasu t = ( ( double ) ( j - j_min ) * t_max + ( double ) ( j_max - j ) * t_min ) / ( double ) ( j_max - j_min ); // przygotowanie do nastepnego kroku czasowego for ( i = 1; i < n + 1; i++ ) { h[i] = h_new[i]; } if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t ); if ( id == p - 1 ) h[n+1] = 75; } // koncowa wymiana informacji z wezlami sasiednimi tag = 11; if ( id < p - 1 ) { MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD ); } if ( 0 < id ) { MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status ); } tag = 12; if ( 0 < id ) { // DO UZUPELNIENIA MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD ); } if ( id < p - 1 ) { // DO UZUPELNIENIA MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status ); } wtime = MPI_Wtime ( ) - wtime; if ( id == 0 ) { printf ( "\n" ); printf ( " Wall clock elapsed seconds = %f\n", wtime ); } // wydruk wyniku printf ( "%2d T= %f\n", id, t ); printf ( "%2d X= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", x[i] ); } printf ( "\n" ); printf ( "%2d H= ", id ); for ( i = 0; i <= n + 1; i++ ) { printf ( "%7.2f", h[i] ); } printf ( "\n" ); free ( h ); free ( h_new ); free ( x ); return; }
# include <stdlib.h> # include <stdio.h> # include <math.h> # include "mpi.h" int main ( int argc, char *argv[] ); /******************************************************************************/ int main ( int argc, char *argv[] ) { double a = 0.0; // lewy brzeg przedzialu double b = 1.0; // prawy brzeg przedzialu int i; int id; // rank int n; // liczba punktow dla kazdego wezla int p; // size int np = 8; // liczba procesow double x_max; double x_min; MPI_Comm komunikator; int *errcodes; MPI_Init ( &argc, &argv ); //MPI_Comm_rank ( MPI_COMM_WORLD, &id ); //MPI_Comm_size ( MPI_COMM_WORLD, &p ); MPI_Comm_spawn("dziecko", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &komunikator, errcodes); n = 12; // liczba punktow dla kazdego wezla int j; for(j=0;j<np;j++){ i = 0; // poczatkowa chwila czasu // wspolrzedna lewego punktu dla wezla id x_min = ( ( double )( np * n + 1 - j * n - i ) * a + ( double )( j * n + i ) * b ) / ( double ) ( np * n + 1 ); i = n + 1; // wspolrzedna prawego punktu dla wezla id x_max = ( ( double )( np * n + 1 - j * n - i ) * a + ( double )( j * n + i ) * b ) / ( double )( np * n + 1 ); MPI_Send ( &x_min, 1, MPI_DOUBLE, j, 1, komunikator ); MPI_Send ( &x_max, 1, MPI_DOUBLE, j, 2, komunikator ); MPI_Send ( &n, 1, MPI_INT, j, 3, komunikator ); } MPI_Finalize ( ); return 0; } struct rekord wys; MPI_Datatype rekord_typ; int tab_dlug_blokow[3] = {1, 1, 1}; MPI_Datatype tab_typow[3] = {MPI_DOUBLE, MPI_DOUBLE, MPI_INT}; MPI_Aint podstawa, tab_odstepow[3]; MPI_Get_address(&wys.x_min, &tab_odstepow[0]); MPI_Get_address(&wys.x_max, &tab_odstepow[1]); MPI_Get_address(&wys.n, &tab_odstepow[2]); MPI_Type_struct(3, tab_dlug_blokow, tab_odstepow, tab_typow, &rekord_typ); MPI_Type_commit(&rekord_typ); MPI_Recv ( &wys, 1, rekord_typ, 0, 1, parentcomm, &status );
wkrótce
# optimization and other system dependent options #include make.$(SRR_ARCH) # or directly include make.lab_404_NVIDIA NAME = Hello_GPU program: main.o $(CC) $(LDFL) main.o $(LIB) -o $(NAME) main.o: main.c $(CC) $(CFL) -c main.c $(INC) -o main.o clean: rm -f obj/* rm -f $(NAME)
# C compiler #CC = icc CC = gcc # C++ compiler CPPC = icpc #CPPC = g++ # Loader (to link C/C++ and Fortran libraries) LD = icpc #LD = g++ # Archiver AR = ar r # For removing files RM = rm -f # Include directories INC = -I/opt/cuda7/include # Standard and/or local libraries LIB = -L/opt/cuda7/lib64 -lOpenCL # C optimization and other flags #CFL(icc) = -O3 -openmp #CFL(gcc) = -O3 -fopenmp #CFL(icc,debug) = -g #CFL(gcc) = -g CFL = # Loader optimization and other flags #LDFL(debug) = -g #LDFL(profile) = -p LDFL =
#include<stdlib.h> #include<stdio.h> #include <CL/cl.h> // functions to display platform and device properties void DisplayPlatformInfo( cl_platform_id id, cl_platform_info name, char* str) { cl_int retval; size_t paramValueSize; retval = clGetPlatformInfo( id, name, 0, NULL, ¶mValueSize); if (retval != CL_SUCCESS){ printf("Failed to find OpenCL platform %s.\n", str); return; } char * info = (char *)malloc(sizeof(char) * paramValueSize); retval = clGetPlatformInfo( id, name, paramValueSize, info, NULL); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL platform %s.\n", str); return; } printf("\t%s:\t%s\n", str, info ); free(info); } void DisplayDeviceInfo_char( cl_device_id id, cl_device_info name, char* str) { cl_int retval; size_t paramValueSize; retval = clGetDeviceInfo( id, name, 0, NULL, ¶mValueSize); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } char * info = (char *)malloc(sizeof(char) * paramValueSize); retval = clGetDeviceInfo( id, name, paramValueSize, info, NULL); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } printf("\t\t%s:\t%s\n", str, info ); free(info); }; void DisplayDeviceInfo_ulong( cl_device_id id, cl_device_info name, char* str) { cl_int retval; size_t paramValueSize;// = sizeof(cl_ulong); retval = clGetDeviceInfo( id, name, 0, NULL, ¶mValueSize); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } cl_ulong * info = (cl_ulong *)malloc(sizeof(cl_ulong) * paramValueSize); retval = clGetDeviceInfo( id, name, paramValueSize, info, NULL); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } printf("\t\t%s:\t%lu MB\n", str, info[0] / 1024 / 1024 ); free(info); }; void DisplayDeviceInfo_uint( cl_device_id id, cl_device_info name, char* str) { cl_int retval; size_t paramValueSize;// = sizeof(cl_ulong); retval = clGetDeviceInfo( id, name, 0, NULL, ¶mValueSize); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } cl_uint * info = (cl_uint *)malloc(sizeof(cl_uint) * paramValueSize); retval = clGetDeviceInfo( id, name, paramValueSize, info, NULL); if (retval != CL_SUCCESS) { printf("Failed to find OpenCL device info %s.\n", str); return; } printf("\t\t%s:\t%d MHz\n", str, info[0] ); free(info); }; // main program controlling execution of CPU code and OpenCL kernels int main(int argc, char** argv) { cl_uint number_of_contexts = 2; cl_context context = NULL; cl_context list_of_contexts[2] = {0,0}; cl_command_queue commandQueue = 0; cl_program program = 0; cl_uint number_of_devices; cl_device_id device = 0; cl_device_id *list_of_devices; cl_device_type type; cl_kernel kernel = 0; cl_mem memObjects[3] = { 0, 0, 0 }; cl_int retval; int icon, idev; cl_uint numPlatforms; cl_platform_id * platformIds; cl_uint i,j; // flag to control displaying int Monitor = 1; // Create OpenCL contexts // First, query the total number of platforms retval = clGetPlatformIDs(0, (cl_platform_id *) NULL, &numPlatforms); // Next, allocate memory for the installed plaforms, and qeury // to get the list. platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms); // Then, query the platform IDs retval = clGetPlatformIDs(numPlatforms, platformIds, NULL); if(Monitor>=0){ printf("Number of platforms: \t%d\n", numPlatforms); } // Iterate through the list of platforms displaying associated information for (i = 0; i < numPlatforms; i++) { if(Monitor>0){ printf("Platform ID - %d\n",i); // First we display information associated with the platform DisplayPlatformInfo( platformIds[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); DisplayPlatformInfo( platformIds[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); DisplayPlatformInfo( platformIds[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); DisplayPlatformInfo( platformIds[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); } } // For the first platform int iplat; printf("Select Platform ID: "); scanf("%d", &iplat); // Query the set of devices associated with the platform retval = clGetDeviceIDs( platformIds[iplat], CL_DEVICE_TYPE_ALL, 0, NULL, &number_of_devices); list_of_devices = (cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices); retval = clGetDeviceIDs( platformIds[iplat], CL_DEVICE_TYPE_ALL, number_of_devices, list_of_devices, NULL); if(Monitor>=0){ printf("Number of devices: \t%d\n", number_of_devices); } // Iterate through each device, displaying associated information for (j = 0; j < number_of_devices; j++) { clGetDeviceInfo(list_of_devices[j], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); if(Monitor>0){ DisplayDeviceInfo_char( list_of_devices[j], CL_DEVICE_NAME, "CL_DEVICE_NAME"); DisplayDeviceInfo_char( list_of_devices[j], CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR"); DisplayDeviceInfo_char( list_of_devices[j], CL_DEVICE_VERSION, "CL_DEVICE_VERSION"); DisplayDeviceInfo_ulong( list_of_devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE"); DisplayDeviceInfo_uint( list_of_devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, "CL_DEVICE_MAX_CLOCK_FREQUENCY"); printf("\n"); } } // Next, create OpenCL contexts on platforms cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIds[iplat], 0 }; if(Monitor>0){ printf("Creating CPU context %d on platform %d\n", 1, iplat); } list_of_contexts[1] = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU, NULL, NULL, &retval); if(Monitor>=0 && retval != CL_SUCCESS){ printf("Could not create CPU context on platform %d\n", i); } if(Monitor>0){ printf("Creating GPU context 0 on platform %d\n", iplat); } list_of_contexts[0] = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, NULL, NULL, &retval); if(Monitor>=0 && retval != CL_SUCCESS){ printf("Could not create GPU context on platform %d\n", i); } // in a loop over devices of the seleceted platform for(idev=0; idev<number_of_devices;idev++){ if(Monitor>0){ printf("\nFor context %d and device %d:\n", idev, idev); } device = list_of_devices[idev]; icon = idev; // choose OpenCL context on first available platform context = list_of_contexts[icon]; if(context !=0){ commandQueue = clCreateCommandQueue(context, device, 0, NULL); if (commandQueue == NULL) { printf("Failed to create commandQueue for device %d\n", idev); exit(0); } if(Monitor>0){ printf("Reading program from source\n"); } // read source code from file FILE *fp; char* source; long int size; fp = fopen("HelloWorld.cl", "rb"); if(!fp) { printf("Could not open kernel file\n"); exit(-1); } int status = fseek(fp, 0, SEEK_END); if(status != 0) { printf("Error seeking to end of file\n"); exit(-1); } size = ftell(fp); if(size < 0) { printf("Error getting file position\n"); exit(-1); } rewind(fp); source = (char *)malloc(size + 1); int i; for (i = 0; i < size+1; i++) { source[i]='\0'; } if(source == NULL) { printf("Error allocating space for the kernel source\n"); exit(-1); } fread(source, 1, size, fp); source[size] = '\0'; const char* source_const = source; cl_program program = clCreateProgramWithSource(context, 1, &source_const, NULL, NULL); if (program == NULL) { printf("Failed to create CL program from source.\n"); exit(-1); } if(Monitor>0){ printf("Creating program and kernel\n"); } // build program (passing options to compiler if necessary retval = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); char* buildLog; size_t size_of_buildLog; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size_of_buildLog); buildLog = malloc(size_of_buildLog+1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size_of_buildLog, buildLog, NULL); buildLog[size_of_buildLog]= '\0'; printf("Kernel buildLog: %s\n", buildLog); if (retval != CL_SUCCESS) { printf("Error in kernel\n"); clReleaseProgram(program); exit(-1); } // Create OpenCL kernel kernel = clCreateKernel(program, "hello_kernel", NULL); if (kernel == NULL) { printf("Failed to create kernel.\n"); exit(0); } if(Monitor>0){ printf("Creating memory objects\n"); } // Create memory objects that will be used as arguments to // kernel. First create host memory arrays that will be // used to store the arguments to the kernel float result[1]; float a[1]; float b[1]; a[0] = 2; b[0] = 2; memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float), NULL, NULL); memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float), NULL, NULL); memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL); if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){ printf("Error creating memory objects.\n"); return 0; } if(Monitor>0){ printf("Sending kernel arguments\n"); } retval = clEnqueueWriteBuffer( commandQueue, memObjects[0], CL_FALSE, 0, sizeof(float), a, 0, NULL, NULL); // Use clEnqueueWriteBuffer() to write input array B to // the device buffer bufferB retval = clEnqueueWriteBuffer( commandQueue, memObjects[1], CL_FALSE, 0, sizeof(float), b, 0, NULL, NULL); // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } if(Monitor>0){ printf("Running the kernel!\n"); } size_t globalWorkSize[1] = { 1 }; size_t localWorkSize[1] = { 1 }; // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } if(Monitor>0){ printf("Transfering back results\n"); } // Read the output buffer back to the Host retval = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, sizeof(float), result, 0, NULL, NULL); if (retval != CL_SUCCESS) { printf("Failed to read result buffer.\n"); //Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Verify the output if(result[0]==4) { printf("Output is correct: %lf + %lf = %lf\n", a[0], b[0], result[0]); } else { printf("Output is incorrect: %lf + %lf != %lf\n", a[0], b[0], result[0]); } for (i = 0; i < 3; i++) { if (memObjects[i] != 0) clReleaseMemObject(memObjects[i]); } if (commandQueue != 0) clReleaseCommandQueue(commandQueue); if (kernel != 0) clReleaseKernel(kernel); if (program != 0) clReleaseProgram(program); } } free(list_of_devices); free(platformIds); return 0; }
__kernel void hello_kernel(__global const float *a, __global const float *b, __global float *result) { int gid = get_global_id(0); if(gid<1) result[gid] = a[gid] + b[gid]; }
Pliki zadania: opencl_vecadd.tgz Działające 3 kernele: sala404.zip
#include<stdlib.h> #include<stdio.h> #include <math.h> #include<omp.h> #include <CL/cl.h> #include"uth_ocl_intf.h" #include"./vecadd_host.h" #include"./hardware_params.h" #define time_measurments #ifdef time_measurments #include"uth_time_intf.h" static double t_begin, t_end, t_total; #endif int execute_kernel_2_CPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for CPU utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, // kernel name: , file: "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. Restoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; int nr_cores_CPU = omp_get_num_procs(); int nr_threads = nr_cores_CPU; // total number of threads int work_group_size = 1; // number of threads in work-group int nr_work_groups = nr_threads / work_group_size; globalWorkSize[0] = nr_threads ; localWorkSize[0] = work_group_size ; printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", nr_cores_CPU, nr_cores_CPU, 1); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", work_group_size, nr_threads / work_group_size, nr_threads / work_group_size / nr_cores_CPU); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU, globalWorkSize[0]/nr_cores_CPU); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU); // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } int size = array_size; int size_per_thread = ceilf((float)size/nr_threads); // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); retval |= clSetKernelArg(kernel, 3, sizeof(int), &size); retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } if(monitor>UTC_BASIC_INFO){ printf("\t\t4. executing kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } int execute_kernel_3_CPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for CPU utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, // kernel name: , file: "vecadd_3_opt_blocks_kernel","vecadd_3_opt_blocks.cl",monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; int nr_cores_CPU = omp_get_num_procs(); int nr_threads = nr_cores_CPU; // total number of threads int work_group_size = 1; // number of threads in work-group int nr_work_groups = nr_threads / work_group_size; globalWorkSize[0] = nr_threads ; localWorkSize[0] = work_group_size ; printf("\t\tExecuting kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", nr_cores_CPU, nr_cores_CPU, 1); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", work_group_size, nr_threads / work_group_size, nr_threads / work_group_size / nr_cores_CPU); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU, globalWorkSize[0]/nr_cores_CPU); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU); // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } int size = array_size; // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); retval |= clSetKernelArg(kernel, 3, sizeof(int), &size); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); return 1; } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } // tutaj modyfikujemy int execute_kernel_0_GPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for GPU utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "vecadd_0_kernel", "vecadd_0.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; // globalna liczba wątków i lokalna liczba wątków int work_group_size = WORK_GROUP_SIZE; globalWorkSize[0] = array_size; localWorkSize[0] = work_group_size; if(monitor>UTC_BASIC_INFO){ printf("\t\t4. executing kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", localWorkSize[0], globalWorkSize[0]/localWorkSize[0], globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS, globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); } // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } // Set the kernel arguments (result, a, b) // ustawiamy liczbę argumentów retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); return 1; } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } int execute_kernel_1_GPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for GPU /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "vecadd_0_kernel", "vecadd_0.cl", monitor);*/ utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: */ "vecadd_1_kernel", "vecadd_1.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; int work_group_size = WORK_GROUP_SIZE; globalWorkSize[0] = array_size / 4; localWorkSize[0] = work_group_size; if(monitor>UTC_BASIC_INFO){ printf("\t\t4. executing kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", localWorkSize[0], globalWorkSize[0]/localWorkSize[0], globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS, globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); } // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); return 1; } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } int execute_kernel_2_GPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for GPU /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "vecadd_0_kernel", "vecadd_0.cl", monitor);*/ utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: */ "vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; int work_group_size = WORK_GROUP_SIZE; globalWorkSize[0] = array_size / work_group_size; localWorkSize[0] = work_group_size; int size_per_thread = work_group_size; int size = array_size; if(monitor>UTC_BASIC_INFO){ printf("\t\t4. executing kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", localWorkSize[0], globalWorkSize[0]/localWorkSize[0], globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS, globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); } // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); retval |= clSetKernelArg(kernel, 3, sizeof(int), &size); retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); return 1; } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } int execute_kernel_3_GPU( int platform_index, int device_index, int kernel_index, int array_size, cl_mem* memObjects, int monitor ) { cl_kernel kernel; cl_event kernel_execution; cl_ulong startTime; cl_ulong endTime; double exec_time; cl_int retval; size_t globalWorkSize[1] = { 0 }; size_t localWorkSize[1] = { 0 }; int i,j,k; int size = array_size; /*----------------KERNEL CREATION PHASE----------------------*/ #ifdef time_measurments t_begin = time_clock(); #endif // create the kernel for GPU /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "vecadd_0_kernel", "vecadd_0.cl", monitor);*/ utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "vecadd_3_opt_blocks_kernel", "vecadd_3_opt_blocks.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin); #endif // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } int NDRange_dim = 1; int work_group_size = WORK_GROUP_SIZE; globalWorkSize[0] = array_size; localWorkSize[0] = work_group_size; if(monitor>UTC_BASIC_INFO){ printf("\t\t4. executing kernel %d, on platform %d and device %d\n", kernel_index, platform_index, device_index); printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n", NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU); printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n", localWorkSize[0], globalWorkSize[0]/localWorkSize[0], globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS); printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS, globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n", globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0], globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS)); } // choose the kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } // Set the kernel arguments (result, a, b) retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); retval |= clSetKernelArg(kernel, 3, sizeof(int), &size); if (retval != CL_SUCCESS) { printf("Failed to Set the kernel arguments.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); return 1; } #ifdef time_measurments clFinish(command_queue); t_begin = time_clock(); #endif // Queue the kernel up for execution across the array retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_execution); clWaitForEvents(1, &kernel_execution); #ifdef time_measurments clFinish(command_queue); t_end = time_clock(); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(kernel_execution, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); exec_time = ((double)endTime - (double)startTime)*1.0e-9; printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n", t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9); printf("\tNumber of operations %d, performance %lf GFlops\n", array_size, array_size / exec_time * 1e-9); printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n", 3*array_size*sizeof(float)*1e-9, 3*array_size*sizeof(float)/exec_time*1e-9); #endif if (retval != CL_SUCCESS) { printf("Failed to queue kernel for execution.\n"); //Cleanup(context, command_queue, program, kernel, memObjects); exit(-1); } return(0); } /* // create the fourth kernel for GPU */ /* kernel_index = 4; */ /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, */ /* // kernel name: , file: */ /* "vecadd_4_opt_blocks_vect_kernel", "vecadd_4_opt_blocks_vect.cl", monitor); */
__kernel void vecadd_2_blocks_kernel(__global const float *a, __global const float *b, __global float *result, const int size, const int size_per_thread) { int gid = get_global_id(0); int index_start = gid * size_per_thread; int index_end = (gid+1) * size_per_thread; if (gid == 0) { //printf("size: %d, %d", size, size_per_thread); } for (int i=index_start; i < index_end && i < size; i++) { //for (int i=index_start; i < index_end ; i++) { result[i] = a[i]+b[i]; } }
#pragma OPENCL EXTENSION cl_khr_fp64 : enable __kernel void mat_transp_1_kernel( __global float *A , __global float *B , __global float *C , int N) { int id = get_global_id(0) * get_global_size(0) + get_global_id(1); float temp = 0.0; for(int i = 0; i<N;i++){ temp += A[id*N + i] *B[i]; } C[id] = temp; }
#include<stdlib.h> #include<stdio.h> #include <math.h> #include <CL/cl.h> #include"uth_ocl_intf.h" #define time_measurments #ifdef time_measurments #include"uth_time_intf.h" static double t_begin, t_end, t_total; #endif #define BLOCK_SIZE 16 #define NR_GROUPS 16 #define MULT 16 #define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT) #define ROZMIAR (WYMIAR*WYMIAR) // Matrices are stored in row-major order: // M(row, col) = M( row * WYMIAR + col ) int verify_result( float* result, float* result_compare ) { // Verify the result int result_OK = 1; int i,j; for(i = 0; i < WYMIAR; i++) { if(fabs(result[i] - result_compare[i])>1.e-6) { result_OK = 0; break; } } printf("\t\t6. verifying results: "); if(result_OK) { printf("Output is correct\n"); } else { printf("Output is incorrect\n"); j = 0; for(i = 0; i < WYMIAR, j<100; i++) { if(fabs(result[i] - result_compare[i])>1.e-9) { j++; printf("%d %16.8f %16.8f\n", i, result[i], result_compare[i]); } } exit(0); } /* for(i = 0; i < length; i++) { */ /* printf("%16.8f %16.8f\n", result[i], result_compare[i]); */ /* } */ return(result_OK); } /*----------------KERNEL CREATION PHASE----------------------*/ void create_kernels() { // for all operations indicate explicit info messages int monitor = UTC_BASIC_INFO + 1; int kernel_index; int platform_index = utv_ocl_struct.current_platform_index; if(utr_ocl_CPU_context_exists(platform_index)){ #ifdef time_measurments t_begin = time_clock(); #endif // create the first kernel for CPU /* kernel_index = 0; */ /* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, */ /* // kernel name: , file: */ /* "mat_transp_1_kernel", "mat_transp_1.cl", monitor); */ #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin); #endif } if(utr_ocl_GPU_context_exists(platform_index)){ #ifdef time_measurments t_begin = time_clock(); #endif // create the first kernel for GPU kernel_index = 0; utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "mat_transp_1_kernel", "mat_transp_1.cl", monitor); #ifdef time_measurments t_end = time_clock(); printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin); #endif // create the second kernel for GPU kernel_index = 1; utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, // kernel name: , file: "mat_transp_2_kernel", "mat_transp_2.cl", monitor); } } /*----------------EXECUTION PHASE----------------------*/ int execute_kernels() { // for all operations indicate explicit info messages int monitor = UTC_BASIC_INFO + 1; int platform_index = utv_ocl_struct.current_platform_index; utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index]; int kernel_index; int i,j,n; double nr_access; double t1,t2; if(monitor>UTC_BASIC_INFO){ printf("\n------------Starting execution phase----------------\n"); } // create matrices n=WYMIAR; float* A = (float *) malloc(ROZMIAR*sizeof(float)); float* B = (float *) malloc(WYMIAR*sizeof(float)); float* C = (float *) malloc(WYMIAR*sizeof(float)); float* D = (float *) malloc(WYMIAR*sizeof(float)); for(i=0;i<ROZMIAR;i++) A[i]=1.0*i/10000000.0; for(i=0;i<WYMIAR;i++) B[i]=1.0*i/10000000.0; // mnozenie macierz wektor for(i=0; i<WYMIAR; i++){ for(j=0; j<WYMIAR; j++){ D[i] += A[i*WYMIAR + j] * B[j]; } } nr_access= ROZMIAR + 2.0*WYMIAR; // read + write printf("mat_transp: nr_access %lf\n", nr_access); // get hardware characteristics to select good matrix shape // the set of device characteristics stored in data structure int device_index = 0; utt_ocl_device_struct device_struct = utv_ocl_struct.list_of_platforms[platform_index].list_of_devices[device_index]; double global_mem_bytes = device_struct.global_mem_bytes; double global_max_alloc = device_struct.global_max_alloc; double local_mem_bytes = device_struct.local_mem_bytes; double constant_mem_bytes = device_struct.constant_mem_bytes; int max_num_comp_units = device_struct.max_num_comp_units; int max_work_group_size = device_struct.max_work_group_size; // in a loop over devices (or for a selected device) int idev=0; for(idev=0; idev<platform_struct.number_of_devices; idev++){ // int device_type = ..... // choose device_index // int device_index = utr_ocl_select_device(platform_index, device_type); int device_index = idev; int device_type = utr_ocl_device_type(platform_index, device_index); if(device_index>0 && device_type==utr_ocl_device_type(platform_index, device_index-1)) break; if(device_type == UTC_OCL_DEVICE_CPU) break; // choose the context cl_context context = utr_ocl_select_context(platform_index, device_index); // choose the command queue cl_command_queue command_queue = utr_ocl_select_command_queue(platform_index, device_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t0. restoring context and command queue for platform %d and device %d\n", platform_index, device_index); } if(context == NULL || command_queue == NULL){ printf("failed to restore context and command queue for platform %d, device %d\n", platform_index, device_index); printf("%lu %lu\n", context, command_queue); } // choose the kernel kernel_index = 0; cl_kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index); if(monitor>UTC_BASIC_INFO){ printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n", kernel_index, platform_index, device_index); } if(context == NULL || command_queue == NULL || kernel == NULL){ printf("failed to restore kernel for platform %d, device %d, kernel %d\n", platform_index, device_index, kernel_index); printf("context %lu, command queue %lu, kernel %lu\n", context, command_queue, kernel); } for(i=0;i<WYMIAR;i++) C[i]=0.0; time_init(); t1 = time_clock(); // call routine to perform matrix transposition mat_transp_host(kernel_index, A, B, C, n, context, kernel, command_queue); t2 = time_clock(); time_print(); printf("GB/s = %lf\n\n", nr_access*sizeof(float)/(t2-t1)/1024/1024/1024); // verify result verify_result(C, D); } // end loop over devices return(0); } // Matrix transposition - Host code // Matrix dimensions are assumed to be multiples of BLOCK_SIZE int mat_transp_host( int kernel_index, float* A, float* B, float* C, int N, const cl_context context, const cl_kernel mat_transp_kernel, const cl_command_queue queue ) { // Load A to device memory size_t size_bytesm = N*N*sizeof(float); size_t size_bytesw = N*sizeof(float); cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, size_bytesm, NULL, NULL); // Write A to device memory clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, size_bytesm, A, 0, 0, 0); cl_mem d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, size_bytesw, NULL, NULL); // Write A to device memory clEnqueueWriteBuffer(queue, d_B, CL_TRUE, 0, size_bytesw, B, 0, 0, 0); // Allocate B in device memory cl_mem d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size_bytesw, NULL, NULL); // Invoke kernel clSetKernelArg(mat_transp_kernel, 0, sizeof(cl_mem), (void*)&d_A); clSetKernelArg(mat_transp_kernel, 1, sizeof(cl_mem), (void*)&d_B); clSetKernelArg(mat_transp_kernel, 2, sizeof(cl_mem), (void*)&d_C); clSetKernelArg(mat_transp_kernel, 3, sizeof(int), (void*)&N); size_t localWorkSize[3]; size_t globalWorkSize[3]; cl_uint work_dim; if(kernel_index==3){ work_dim = 2; localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = N/MULT; localWorkSize[1] = BLOCK_SIZE; globalWorkSize[1] = N/MULT; localWorkSize[2] = 0; globalWorkSize[2] = 0; } else{ work_dim = 2; localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = sqrt(N); localWorkSize[1] = BLOCK_SIZE; globalWorkSize[1] = sqrt(N); localWorkSize[2] = 0; globalWorkSize[2] = 0; } clFinish(queue); double t1 = time_clock(); // Enqueue a kernel run call cl_event ndrEvt; clEnqueueNDRangeKernel(queue, mat_transp_kernel, work_dim, 0, globalWorkSize, localWorkSize, 0, 0, &ndrEvt); clWaitForEvents(1, &ndrEvt); clFinish(queue); double t2 = time_clock(); // Calculate performance cl_ulong startTime; cl_ulong endTime; // Get kernel profiling info clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); double time = (double)endTime - (double)startTime; printf("\nKernel execution internal: time %lf, GB/s = %lf\n", time*1.0e-9, 2.0*N*N*sizeof(float)/(time*1.0e-9)/1024/1024/1024); printf("Kernel execution external: time %lf, GB/s = %lf\n\n", t2-t1, 2.0*N*N*sizeof(float)/(t2-t1)/1024/1024/1024); // Read B from device memory clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, size_bytesw, C, 0, 0, 0); // Free device memory clReleaseMemObject(d_A); clReleaseMemObject(d_B); clReleaseMemObject(d_C); return(0); }
/* C socket server example, handles multiple clients using threads Compile gcc server.c -lpthread -o server */ #include<stdio.h> #include<string.h> //strlen #include<stdlib.h> //strlen #include<sys/socket.h> #include<arpa/inet.h> //inet_addr #include<unistd.h> //write #include<pthread.h> //for threading , link with lpthread #include <unistd.h> //the thread function void *connection_handler(void *); int main(int argc , char *argv[]) { int counter = 0; pid_t pid = fork(); if (pid == 0) { int sock; struct sockaddr_in server; char message[1000] , server_reply[2000]; int counter; unsigned int ip[] = { 192, 168, 102, 1 }; char* ip_final; char *buf; size_t sz; //Create socket for ( counter = 0; counter < 255; ++counter ) { sock = socket(AF_INET , SOCK_STREAM , 0); if (sock == -1) { printf("Could not create socket"); } puts("Socket created"); //printf("192.168.102.%d", counter); sz = snprintf(NULL, 0, "192.168.102.%d", counter); buf = (char *)malloc(sz + 1); /* make sure you check for != NULL in real code */ server.sin_addr.s_addr = inet_addr(buf); server.sin_family = AF_INET; server.sin_port = htons( 65001 ); //Connect to remote server if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0) { perror("connect failed. Error"); continue; } puts("Connected\n"); //keep communicating with server while(1) { //Send some data if( send(sock , "Werset 6-client" , 15 , 0) < 0) { puts("Send failed"); return 1; } //Receive a reply from the server if( recv(sock , server_reply , 2000 , 0) < 0) { puts("recv failed"); break; } puts(server_reply); break; } close(sock); } } else if (pid > 0) { int socket_desc , client_sock , c; struct sockaddr_in server , client; //Create socket socket_desc = socket(AF_INET , SOCK_STREAM , 0); if (socket_desc == -1) { printf("Could not create socket"); } puts("Socket created"); //Prepare the sockaddr_in structure server.sin_family = AF_INET; server.sin_addr.s_addr = INADDR_ANY; server.sin_port = htons( 65001 ); //Bind if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0) { //print the error message perror("bind failed. Error"); return 1; } puts("bind done"); //Listen listen(socket_desc , 3); //Accept and incoming connection puts("Waiting for incoming connections..."); c = sizeof(struct sockaddr_in); //Accept and incoming connection puts("Waiting for incoming connections..."); c = sizeof(struct sockaddr_in); pthread_t thread_id; while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) ) { puts("Connection accepted"); if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0) { perror("could not create thread"); return 1; } //Now join the thread , so that we dont terminate before the thread //pthread_join( thread_id , NULL); puts("Handler assigned"); } if (client_sock < 0) { perror("accept failed"); return 1; } } else { // fork failed printf("fork() failed!\n"); return 1; } return 0; } /* * This will handle connection for each client * */ void *connection_handler(void *socket_desc) { //Get the socket descriptor int sock = *(int*)socket_desc; int read_size; char *message , client_message[2000]; //Receive a message from client while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 ) { printf("%s\n", client_message); //end of string marker client_message[read_size] = '\0'; //Send the message back to client write(sock , "Werset 6" , 8); //clear the message buffer memset(client_message, 0, 2000); } if(read_size == 0) { puts("Client disconnected"); fflush(stdout); } else if(read_size == -1) { perror("recv failed"); } return 0; }
# kompilator c CC = gcc # konsolidator CFLAGS=-g all: server client server: server.o $(CC) $(LFLAGS) server.o -lpthread -o server server.o: server.c $(CC) -c server.c -o server.o client: client.o $(CC) client.o -o client client.o: client.c $(CC) -c client.c -o client.o clean: rm -f *.o
/* C socket server example, handles multiple clients using threads Compile gcc server.c -lpthread -o server */ #include<stdio.h> #include<string.h> //strlen #include<stdlib.h> //strlen #include<sys/socket.h> #include<arpa/inet.h> //inet_addr #include<unistd.h> //write #include<pthread.h> //for threading , link with lpthread //the thread function void *connection_handler(void *); int main(int argc , char *argv[]) { int socket_desc , client_sock , c; struct sockaddr_in server , client; //Create socket socket_desc = socket(AF_INET , SOCK_STREAM , 0); if (socket_desc == -1) { printf("Could not create socket"); } puts("Socket created"); //Prepare the sockaddr_in structure server.sin_family = AF_INET; server.sin_addr.s_addr = INADDR_ANY; server.sin_port = htons( 8888 ); //Bind if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0) { //print the error message perror("bind failed. Error"); return 1; } puts("bind done"); //Listen listen(socket_desc , 3); //Accept and incoming connection puts("Waiting for incoming connections..."); c = sizeof(struct sockaddr_in); //Accept and incoming connection puts("Waiting for incoming connections..."); c = sizeof(struct sockaddr_in); pthread_t thread_id; while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) ) { puts("Connection accepted"); if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0) { perror("could not create thread"); return 1; } //Now join the thread , so that we dont terminate before the thread //pthread_join( thread_id , NULL); puts("Handler assigned"); } if (client_sock < 0) { perror("accept failed"); return 1; } return 0; } /* * This will handle connection for each client * */ void *connection_handler(void *socket_desc) { //Get the socket descriptor int sock = *(int*)socket_desc; int read_size; char *message , client_message[2000]; //Receive a message from client while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 ) { //end of string marker client_message[read_size] = '\0'; //Send the message back to client write(sock , "Werset 6" , strlen(client_message)); //clear the message buffer memset(client_message, 0, 2000); } if(read_size == 0) { puts("Client disconnected"); fflush(stdout); } else if(read_size == -1) { perror("recv failed"); } return 0; }
#include<stdio.h> //printf #include<string.h> //strlen #include<sys/socket.h> //socket #include<arpa/inet.h> //inet_addr int main(int argc , char *argv[]) { int sock; struct sockaddr_in server; char message[1000] , server_reply[2000]; //Create socket sock = socket(AF_INET , SOCK_STREAM , 0); if (sock == -1) { printf("Could not create socket"); } puts("Socket created"); server.sin_addr.s_addr = inet_addr("127.0.0.1"); server.sin_family = AF_INET; server.sin_port = htons( 8888 ); //Connect to remote server if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0) { perror("connect failed. Error"); return 1; } puts("Connected\n"); //keep communicating with server while(1) { printf("Enter message : "); scanf("%s" , message); //Send some data if( send(sock , message , strlen(message) , 0) < 0) { puts("Send failed"); return 1; } //Receive a reply from the server if( recv(sock , server_reply , 2000 , 0) < 0) { puts("recv failed"); break; } puts("Server reply :"); puts(server_reply); } close(sock); return 0; }
#include<stdio.h> #include<string.h> //strlen #include<stdlib.h> //strlen #include<sys/socket.h> #include<arpa/inet.h> //inet_addr #include<unistd.h> //write #include<pthread.h> //for threading , link with lpthread //the thread function void *connection_handler(void *); void *client_function(void *socket_desc){ int sock; struct sockaddr_in server_id; char message[1000] , server_reply[2000]; //Create socket sock = socket(AF_INET , SOCK_STREAM , 0); if (sock == -1) { printf("Could not create socket"); } puts("Socket created"); // 192.168.102.63 wogu server_id.sin_addr.s_addr = inet_addr("localhost"); server_id.sin_family = AF_INET; server_id.sin_port = htons( 65000 ); //Connect to remote server if (connect(sock , (struct sockaddr *)&server_id , sizeof(server_id)) < 0) { perror("connect failed. Error"); }else{ puts("Connected\n"); //keep communicating with server while(1) { printf("Enter message : "); scanf("%s" , message); //Send some data if( send(sock , message , strlen(message) , 0) < 0) { puts("Send failed"); } //Receive a reply from the server if( recv(sock , server_reply , 2000 , 0) < 0) { puts("recv failed"); break; } puts("Server reply :"); puts(server_reply); } close(sock); } }; void *server_function(void * sock){ printf("SERVER Start"); fflush(stdout); int socket_desc , client_sock , c; pthread_t thread_id; struct sockaddr_in server , client; //Create socket socket_desc = socket(AF_INET , SOCK_STREAM , 0); if (socket_desc == -1) { printf("Could not create socket"); } puts("Socket created"); //Prepare the sockaddr_in structure server.sin_family = AF_INET; server.sin_addr.s_addr = INADDR_ANY; server.sin_port = htons( 65000); //Bind if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0) { //print the error message perror("bind failed. Error"); return; } puts("bind done"); //Listen listen(socket_desc , 3); //Accept and incoming connection puts("Waiting for incoming connections..."); c = sizeof(struct sockaddr_in); fflush(stdout); while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) ) { puts("Connection accepted"); if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0) { perror("could not create thread"); return; } //Now join the thread , so that we dont terminate before the thread //pthread_join( thread_id , NULL); puts("Handler assigned"); } if (client_sock < 0) { perror("accept failed"); return; } } int main(int argc , char *argv[]) { pthread_t thread_id; //Create server if( pthread_create( &thread_id , NULL , server_function , NULL) != 0) { perror("could not create server thread"); fflush(stdout); return 1; } pthread_t thread_id2; // Create client if( pthread_create( &thread_id2 , NULL , client_function , NULL) != 0) { perror("could not create client thread"); fflush(stdout); return 1; } return 0; } /* * This will handle connection for each client * */ void *connection_handler(void *socket_desc) { //Get the socket descriptor int sock = *(int*)socket_desc; int read_size; char *message , client_message[2000]; //Receive a message from client while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 ) { //end of string marker client_message[read_size] = '\0'; //Send the message back to client write(sock , "Werset 5" , strlen(client_message)); //clear the message buffer memset(client_message, 0, 2000); } if(read_size == 0) { puts("Client disconnected"); fflush(stdout); } else if(read_size == -1) { perror("recv failed"); } return 0; }