====== Zajęcia 1 ======
==== Makefile ====
# kompilator c
CCOMP = mpicc
# konsolidator
LINK = mpicc
MPIRUN = /usr/lib64/openmpi/bin/mpiexec
# opcje optymalizacji:
# wersja do debugowania
# OPT = -g -DDEBUG -p
# wersja zoptymalizowana do mierzenia czasu
# OPT = -O3 -fopenmp -p
# pliki naglowkowe
#INC = -I../pomiar_czasu
# biblioteki
#LIB = -L../pomiar_czasu -lm
LIB = -lm
# zaleznosci i komendy
heat: heat.o
$(LINK) $(OPT) heat.o -o heat $(LIB)
heat.o: heat.c
$(CCOMP) -c $(OPT) heat.c $(INC)
run:
$(MPIRUN) -np 8 ./heat
clean:
rm -f *.o
==== heat.c ====
# include
# include
# include
# include "mpi.h"
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
/******************************************************************************/
int main ( int argc, char *argv[] )
{
double a = 0.0; // lewy brzeg przedzialu
double b = 1.0; // prawy brzeg przedzialu
int i;
int id; // rank
int n; // liczba punktow dla kazdego wezla
int p; // size
double x_max;
double x_min;
MPI_Init ( &argc, &argv );
MPI_Comm_rank ( MPI_COMM_WORLD, &id );
MPI_Comm_size ( MPI_COMM_WORLD, &p );
n = 12; // liczba punktow dla kazdego wezla
i = 0; // poczatkowa chwila czasu
// wspolrzedna lewego punktu dla wezla id
x_min = ( ( double )( p * n + 1 - id * n - i ) * a
+ ( double )( id * n + i ) * b )
/ ( double ) ( p * n + 1 );
i = n + 1;
// wspolrzedna prawego punktu dla wezla id
x_max = ( ( double )( p * n + 1 - id * n - i ) * a
+ ( double )( id * n + i ) * b )
/ ( double )( p * n + 1 );
heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
MPI_Finalize ( );
}
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
double cfl;
double *h;
double *h_new;
int i;
int ierr;
int j;
int j_max;
int j_min;
double k;
MPI_Status status;
double t;
double t_del;
double t_max;
double t_min;
int tag;
double wtime;
double *x;
double x_del;
h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
k = 0.002 / ( double ) p; // przewodniosc cieplna
j_min = 0; // indeksy krokow czasowych - min i max
j_max = 100;
t_min = 0.0; // chwile czasu - min i max
t_max = 10.0;
t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
for ( i = 0; i <= n + 1; i++ )
{
x[i] = ( ( double ) ( i ) * x_max
+ ( double ) ( n + 1 - i ) * x_min )
/ ( double ) ( n + 1 );
}
// ustawienie warunku poczatkowego
for ( i = 0; i <= n + 1; i++ )
{
h[i] = 95.0;
}
// sprawdzenie stabilnosci schematu
cfl = k * t_del / x_del / x_del;
if ( 0.5 <= cfl )
{
printf ( " CFL condition failed.\n" );
exit ( 1 );
}
wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
for ( j = 1; j <= j_max; j++ )
{
// wymiana informacji z wezlami sasiednimi
tag = 1;
if ( id < p - 1 )
{
MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
}
if ( 0 < id )
{
MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
}
tag = 2;
if ( 0 < id )
{
// DO UZUPELNIENIA
MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
}
if ( id < p - 1 )
{
// DO UZUPELNIENIA
MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
}
// implementacja wzoru roznicowego
for ( i = 1; i <= n; i++ )
{
h_new[i] = h[i] + t_del * (
k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
+ 2.0 * sin ( x[i] * t ) );
}
// nowa chwila czasu
t = ( ( double ) ( j - j_min ) * t_max
+ ( double ) ( j_max - j ) * t_min )
/ ( double ) ( j_max - j_min );
// przygotowanie do nastepnego kroku czasowego
for ( i = 1; i < n + 1; i++ )
{
h[i] = h_new[i];
}
if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
if ( id == p - 1 ) h[n+1] = 75;
}
// koncowa wymiana informacji z wezlami sasiednimi
tag = 11;
if ( id < p - 1 ) {
MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
}
if ( 0 < id ) {
MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
}
tag = 12;
if ( 0 < id ) {
// DO UZUPELNIENIA
MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
}
if ( id < p - 1 ) {
// DO UZUPELNIENIA
MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
}
wtime = MPI_Wtime ( ) - wtime;
if ( id == 0 )
{
printf ( "\n" );
printf ( " Wall clock elapsed seconds = %f\n", wtime );
}
// wydruk wyniku
printf ( "%2d T= %f\n", id, t );
printf ( "%2d X= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", x[i] );
}
printf ( "\n" );
printf ( "%2d H= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", h[i] );
}
printf ( "\n" );
free ( h );
free ( h_new );
free ( x );
return;
}
====== Zajęcia 2 ======
==== heat.c ====
# include
# include
# include
# include "mpi.h"
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
/******************************************************************************/
int main ( int argc, char *argv[] )
{
double a = 0.0; // lewy brzeg przedzialu
double b = 1000000.0; // prawy brzeg przedzialu
int i;
int id; // rank
int n; // liczba punktow dla kazdego wezla
int p; // size
double x_max;
double x_min;
MPI_Init ( &argc, &argv );
MPI_Comm_rank ( MPI_COMM_WORLD, &id );
MPI_Comm_size ( MPI_COMM_WORLD, &p );
n = 1000000; // liczba punktow dla kazdego wezla
i = 0; // poczatkowa chwila czasu
// wspolrzedna lewego punktu dla wezla id
x_min = ( ( double )( p * n + 1 - id * n - i ) * a
+ ( double )( id * n + i ) * b )
/ ( double ) ( p * n + 1 );
i = n + 1;
// wspolrzedna prawego punktu dla wezla id
x_max = ( ( double )( p * n + 1 - id * n - i ) * a
+ ( double )( id * n + i ) * b )
/ ( double )( p * n + 1 );
heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
MPI_Finalize ( );
}
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
double cfl;
double *h;
double *h_new;
int i;
int ierr;
int j;
int j_max;
int j_min;
double k;
MPI_Status status;
double t;
double t_del;
double t_max;
double t_min;
int tag;
double wtime;
double *x;
double x_del;
MPI_Request req1, req2, req3, req4;
MPI_Status stat1, stat2, stat3, stat4;
h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
k = 0.002 / ( double ) p; // przewodniosc cieplna
j_min = 0; // indeksy krokow czasowych - min i max
j_max = 100;
t_min = 0.0; // chwile czasu - min i max
t_max = 10.0;
t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
for ( i = 0; i <= n + 1; i++ )
{
x[i] = ( ( double ) ( i ) * x_max
+ ( double ) ( n + 1 - i ) * x_min )
/ ( double ) ( n + 1 );
}
// ustawienie warunku poczatkowego
for ( i = 0; i <= n + 1; i++ )
{
h[i] = 95.0;
}
// sprawdzenie stabilnosci schematu
cfl = k * t_del / x_del / x_del;
if ( 0.5 <= cfl )
{
printf ( " CFL condition failed.\n" );
exit ( 1 );
}
wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
for ( j = 1; j <= j_max; j++ )
{
// wymiana informacji z wezlami sasiednimi
///NON BLOCK
if ( id > 0 )
{
MPI_Irecv ( &h[0], 1, MPI_DOUBLE, id-1, 1, MPI_COMM_WORLD, &req1 );
}
if ( id < p - 1 )
{
MPI_Irecv ( &h[n+1], 1, MPI_DOUBLE, id+1, 2, MPI_COMM_WORLD, &req2 );
}
///END NON BLOCK
if ( id > 0 )
{
MPI_Isend ( &h[1], 1, MPI_DOUBLE, id-1, 2, MPI_COMM_WORLD, &req3 );
}
if ( id < p - 1 )
{
MPI_Isend ( &h[n], 1, MPI_DOUBLE, id+1, 1, MPI_COMM_WORLD, &req4 );
}
// implementacja wzoru roznicowego
for ( i = 2; i <= n-1; i++ )
{
h_new[i] = h[i] + t_del * (
k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
+ 2.0 * sin ( x[i] * t ) );
}
///NON BLOCK
///WAIT FOR RECEIVE
if ( id > 0 )
{
MPI_Wait(&req1, &stat1);
}
if ( id < p - 1 )
{
MPI_Wait(&req2, &stat2);
}
int tmp = 1;
h_new[tmp] = h[tmp] + t_del * (
k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del
+ 2.0 * sin ( x[tmp] * t ) );
tmp = n;
h_new[tmp] = h[tmp] + t_del * (
k * ( h[tmp-1] - 2.0 * h[tmp] + h[tmp+1] ) / x_del / x_del
+ 2.0 * sin ( x[tmp] * t ) );
///WAIT FOR SEND
if ( id > 0 )
{
MPI_Wait(&req3, &stat3);
}
if ( id < p - 1 )
{
MPI_Wait(&req4, &stat4);
}
///END NON BLOCK
// nowa chwila czasu
t = ( ( double ) ( j - j_min ) * t_max
+ ( double ) ( j_max - j ) * t_min )
/ ( double ) ( j_max - j_min );
// przygotowanie do nastepnego kroku czasowego
for ( i = 1; i < n + 1; i++ )
{
h[i] = h_new[i];
}
if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
if ( id == p - 1 ) h[n+1] = 75;
}
// koncowa wymiana informacji z wezlami sasiednimi
tag = 11;
if ( id < p - 1 ) {
MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
}
if ( 0 < id ) {
MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
}
tag = 12;
if ( 0 < id ) {
// DO UZUPELNIENIA
MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
}
if ( id < p - 1 ) {
// DO UZUPELNIENIA
MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
}
wtime = MPI_Wtime ( ) - wtime;
if ( id == 0 )
{
printf ( "\n" );
printf ( " Wall clock elapsed seconds = %f\n", wtime );
}
// wydruk wyniku
/*printf ( "%2d T= %f\n", id, t );
printf ( "%2d X= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", x[i] );
}
printf ( "\n" );
printf ( "%2d H= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", h[i] );
}
printf ( "\n" ); */
free ( h );
free ( h_new );
free ( x );
return;
}
===== Zajęcia 3 =====
==== child.c ====
# include
# include
# include
# include "mpi.h"
int main ( int argc, char *argv[] );
void heat_part ( int n, int p, int id, double x_min, double x_max );
/******************************************************************************/
int main ( int argc, char *argv[] )
{
int id; // rank
int n; // liczba punktow dla kazdego wezla
int p; // size
double x_max;
double x_min;
MPI_Comm parentcomm;
MPI_Status status;
MPI_Init ( &argc, &argv );
MPI_Comm_rank ( MPI_COMM_WORLD, &id );
MPI_Comm_size ( MPI_COMM_WORLD, &p );
MPI_Comm_get_parent(&parentcomm);
MPI_Recv ( &x_min, 1, MPI_DOUBLE, 0, 1, parentcomm, &status );
MPI_Recv ( &x_max, 1, MPI_DOUBLE, 0, 2, parentcomm, &status );
MPI_Recv ( &n, 1, MPI_INT, 0, 3, parentcomm, &status );
//MPI_Barrier(MPI_COMM_WORLD);
heat_part ( n, p, id, x_min, x_max ); // obliczenia dla pojedynczego wezla
MPI_Finalize ( );
return 0;
}
/******************************************************************************/
// obliczenia dla pojedynczego wezla - pojedynczego podobszaru
/******************************************************************************/
void heat_part ( int n, int p, int id, double x_min, double x_max )
{
double cfl;
double *h;
double *h_new;
int i;
int ierr;
int j;
int j_max;
int j_min;
double k;
MPI_Status status;
double t;
double t_del;
double t_max;
double t_min;
int tag;
double wtime;
double *x;
double x_del;
h = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i
h_new = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // rozwiazanie dla t_i+1
x = ( double * ) malloc ( ( n + 2 ) * sizeof ( double ) ); // wspolrzedne punktow
k = 0.002 / ( double ) p; // przewodniosc cieplna
j_min = 0; // indeksy krokow czasowych - min i max
j_max = 100;
t_min = 0.0; // chwile czasu - min i max
t_max = 10.0;
t_del = ( t_max - t_min ) / ( double ) ( j_max - j_min ); // krok czasowy Delta t
x_del = ( x_max - x_min ) / ( double ) ( n + 1 ); // odstep miedzy punktami
for ( i = 0; i <= n + 1; i++ )
{
x[i] = ( ( double ) ( i ) * x_max
+ ( double ) ( n + 1 - i ) * x_min )
/ ( double ) ( n + 1 );
}
// ustawienie warunku poczatkowego
for ( i = 0; i <= n + 1; i++ )
{
h[i] = 95.0;
}
// sprawdzenie stabilnosci schematu
cfl = k * t_del / x_del / x_del;
if ( 0.5 <= cfl )
{
printf ( " CFL condition failed.\n" );
exit ( 1 );
}
wtime = MPI_Wtime ( ); //poczatek pomiaru czasu
for ( j = 1; j <= j_max; j++ )
{
// wymiana informacji z wezlami sasiednimi
tag = 1;
if ( id < p - 1 )
{
MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
}
if ( 0 < id )
{
MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
}
tag = 2;
if ( 0 < id ) {
MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
}
if ( id < p - 1 ) {
MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
}
// implementacja wzoru roznicowego
for ( i = 1; i <= n; i++ )
{
h_new[i] = h[i] + t_del * (
k * ( h[i-1] - 2.0 * h[i] + h[i+1] ) / x_del / x_del
+ 2.0 * sin ( x[i] * t ) );
}
// nowa chwila czasu
t = ( ( double ) ( j - j_min ) * t_max
+ ( double ) ( j_max - j ) * t_min )
/ ( double ) ( j_max - j_min );
// przygotowanie do nastepnego kroku czasowego
for ( i = 1; i < n + 1; i++ )
{
h[i] = h_new[i];
}
if ( 0 == id ) h[0] = 100.0 + 10.0 * sin ( t );
if ( id == p - 1 ) h[n+1] = 75;
}
// koncowa wymiana informacji z wezlami sasiednimi
tag = 11;
if ( id < p - 1 ) {
MPI_Send ( &h[n], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD );
}
if ( 0 < id ) {
MPI_Recv ( &h[0], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD, &status );
}
tag = 12;
if ( 0 < id ) {
// DO UZUPELNIENIA
MPI_Send ( &h[1], 1, MPI_DOUBLE, id-1, tag, MPI_COMM_WORLD );
}
if ( id < p - 1 ) {
// DO UZUPELNIENIA
MPI_Recv ( &h[n+1], 1, MPI_DOUBLE, id+1, tag, MPI_COMM_WORLD, &status );
}
wtime = MPI_Wtime ( ) - wtime;
if ( id == 0 )
{
printf ( "\n" );
printf ( " Wall clock elapsed seconds = %f\n", wtime );
}
// wydruk wyniku
printf ( "%2d T= %f\n", id, t );
printf ( "%2d X= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", x[i] );
}
printf ( "\n" );
printf ( "%2d H= ", id );
for ( i = 0; i <= n + 1; i++ )
{
printf ( "%7.2f", h[i] );
}
printf ( "\n" );
free ( h );
free ( h_new );
free ( x );
return;
}
==== parent.c ====
# include
# include
# include
# include "mpi.h"
int main ( int argc, char *argv[] );
/******************************************************************************/
int main ( int argc, char *argv[] )
{
double a = 0.0; // lewy brzeg przedzialu
double b = 1.0; // prawy brzeg przedzialu
int i;
int id; // rank
int n; // liczba punktow dla kazdego wezla
int p; // size
int np = 8; // liczba procesow
double x_max;
double x_min;
MPI_Comm komunikator;
int *errcodes;
MPI_Init ( &argc, &argv );
//MPI_Comm_rank ( MPI_COMM_WORLD, &id );
//MPI_Comm_size ( MPI_COMM_WORLD, &p );
MPI_Comm_spawn("dziecko", MPI_ARGV_NULL, np, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &komunikator, errcodes);
n = 12; // liczba punktow dla kazdego wezla
int j;
for(j=0;j
====== Zajęcia 4 ======
wkrótce
====== Zajęcia 5 ======
==== Makefile ====
# optimization and other system dependent options
#include make.$(SRR_ARCH)
# or directly
include make.lab_404_NVIDIA
NAME = Hello_GPU
program: main.o
$(CC) $(LDFL) main.o $(LIB) -o $(NAME)
main.o: main.c
$(CC) $(CFL) -c main.c $(INC) -o main.o
clean:
rm -f obj/*
rm -f $(NAME)
==== make.lab_404_NVIDIA ====
# C compiler
#CC = icc
CC = gcc
# C++ compiler
CPPC = icpc
#CPPC = g++
# Loader (to link C/C++ and Fortran libraries)
LD = icpc
#LD = g++
# Archiver
AR = ar r
# For removing files
RM = rm -f
# Include directories
INC = -I/opt/cuda7/include
# Standard and/or local libraries
LIB = -L/opt/cuda7/lib64 -lOpenCL
# C optimization and other flags
#CFL(icc) = -O3 -openmp
#CFL(gcc) = -O3 -fopenmp
#CFL(icc,debug) = -g
#CFL(gcc) = -g
CFL =
# Loader optimization and other flags
#LDFL(debug) = -g
#LDFL(profile) = -p
LDFL =
==== main.c ====
#include
#include
#include
// functions to display platform and device properties
void DisplayPlatformInfo(
cl_platform_id id,
cl_platform_info name,
char* str)
{
cl_int retval;
size_t paramValueSize;
retval = clGetPlatformInfo(
id,
name,
0,
NULL,
¶mValueSize);
if (retval != CL_SUCCESS){
printf("Failed to find OpenCL platform %s.\n", str);
return;
}
char * info = (char *)malloc(sizeof(char) * paramValueSize);
retval = clGetPlatformInfo(
id,
name,
paramValueSize,
info,
NULL);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL platform %s.\n", str);
return;
}
printf("\t%s:\t%s\n", str, info );
free(info);
}
void DisplayDeviceInfo_char(
cl_device_id id,
cl_device_info name,
char* str)
{
cl_int retval;
size_t paramValueSize;
retval = clGetDeviceInfo(
id,
name,
0,
NULL,
¶mValueSize);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
char * info = (char *)malloc(sizeof(char) * paramValueSize);
retval = clGetDeviceInfo(
id,
name,
paramValueSize,
info,
NULL);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
printf("\t\t%s:\t%s\n", str, info );
free(info);
};
void DisplayDeviceInfo_ulong(
cl_device_id id,
cl_device_info name,
char* str)
{
cl_int retval;
size_t paramValueSize;// = sizeof(cl_ulong);
retval = clGetDeviceInfo(
id,
name,
0,
NULL,
¶mValueSize);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
cl_ulong * info = (cl_ulong *)malloc(sizeof(cl_ulong) * paramValueSize);
retval = clGetDeviceInfo(
id,
name,
paramValueSize,
info,
NULL);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
printf("\t\t%s:\t%lu MB\n", str, info[0] / 1024 / 1024 );
free(info);
};
void DisplayDeviceInfo_uint(
cl_device_id id,
cl_device_info name,
char* str)
{
cl_int retval;
size_t paramValueSize;// = sizeof(cl_ulong);
retval = clGetDeviceInfo(
id,
name,
0,
NULL,
¶mValueSize);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
cl_uint * info = (cl_uint *)malloc(sizeof(cl_uint) * paramValueSize);
retval = clGetDeviceInfo(
id,
name,
paramValueSize,
info,
NULL);
if (retval != CL_SUCCESS) {
printf("Failed to find OpenCL device info %s.\n", str);
return;
}
printf("\t\t%s:\t%d MHz\n", str, info[0] );
free(info);
};
// main program controlling execution of CPU code and OpenCL kernels
int main(int argc, char** argv)
{
cl_uint number_of_contexts = 2;
cl_context context = NULL;
cl_context list_of_contexts[2] = {0,0};
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_uint number_of_devices;
cl_device_id device = 0;
cl_device_id *list_of_devices;
cl_device_type type;
cl_kernel kernel = 0;
cl_mem memObjects[3] = { 0, 0, 0 };
cl_int retval;
int icon, idev;
cl_uint numPlatforms;
cl_platform_id * platformIds;
cl_uint i,j;
// flag to control displaying
int Monitor = 1;
// Create OpenCL contexts
// First, query the total number of platforms
retval = clGetPlatformIDs(0, (cl_platform_id *) NULL, &numPlatforms);
// Next, allocate memory for the installed plaforms, and qeury
// to get the list.
platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
// Then, query the platform IDs
retval = clGetPlatformIDs(numPlatforms, platformIds, NULL);
if(Monitor>=0){
printf("Number of platforms: \t%d\n", numPlatforms);
}
// Iterate through the list of platforms displaying associated information
for (i = 0; i < numPlatforms; i++) {
if(Monitor>0){
printf("Platform ID - %d\n",i);
// First we display information associated with the platform
DisplayPlatformInfo(
platformIds[i],
CL_PLATFORM_NAME,
"CL_PLATFORM_NAME");
DisplayPlatformInfo(
platformIds[i],
CL_PLATFORM_PROFILE,
"CL_PLATFORM_PROFILE");
DisplayPlatformInfo(
platformIds[i],
CL_PLATFORM_VERSION,
"CL_PLATFORM_VERSION");
DisplayPlatformInfo(
platformIds[i],
CL_PLATFORM_VENDOR,
"CL_PLATFORM_VENDOR");
}
}
// For the first platform
int iplat;
printf("Select Platform ID: "); scanf("%d", &iplat);
// Query the set of devices associated with the platform
retval = clGetDeviceIDs(
platformIds[iplat],
CL_DEVICE_TYPE_ALL,
0,
NULL,
&number_of_devices);
list_of_devices =
(cl_device_id *) malloc (sizeof(cl_device_id) * number_of_devices);
retval = clGetDeviceIDs(
platformIds[iplat],
CL_DEVICE_TYPE_ALL,
number_of_devices,
list_of_devices,
NULL);
if(Monitor>=0){
printf("Number of devices: \t%d\n", number_of_devices);
}
// Iterate through each device, displaying associated information
for (j = 0; j < number_of_devices; j++) {
clGetDeviceInfo(list_of_devices[j], CL_DEVICE_TYPE,
sizeof(cl_device_type), &type, NULL);
if(Monitor>0){
DisplayDeviceInfo_char(
list_of_devices[j],
CL_DEVICE_NAME,
"CL_DEVICE_NAME");
DisplayDeviceInfo_char(
list_of_devices[j],
CL_DEVICE_VENDOR,
"CL_DEVICE_VENDOR");
DisplayDeviceInfo_char(
list_of_devices[j],
CL_DEVICE_VERSION,
"CL_DEVICE_VERSION");
DisplayDeviceInfo_ulong(
list_of_devices[j],
CL_DEVICE_GLOBAL_MEM_SIZE,
"CL_DEVICE_GLOBAL_MEM_SIZE");
DisplayDeviceInfo_uint(
list_of_devices[j],
CL_DEVICE_MAX_CLOCK_FREQUENCY,
"CL_DEVICE_MAX_CLOCK_FREQUENCY");
printf("\n");
}
}
// Next, create OpenCL contexts on platforms
cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)platformIds[iplat],
0
};
if(Monitor>0){
printf("Creating CPU context %d on platform %d\n", 1, iplat);
}
list_of_contexts[1] =
clCreateContextFromType(contextProperties,
CL_DEVICE_TYPE_CPU, NULL, NULL, &retval);
if(Monitor>=0 && retval != CL_SUCCESS){
printf("Could not create CPU context on platform %d\n", i);
}
if(Monitor>0){
printf("Creating GPU context 0 on platform %d\n", iplat);
}
list_of_contexts[0] =
clCreateContextFromType(contextProperties,
CL_DEVICE_TYPE_GPU, NULL, NULL, &retval);
if(Monitor>=0 && retval != CL_SUCCESS){
printf("Could not create GPU context on platform %d\n", i);
}
// in a loop over devices of the seleceted platform
for(idev=0; idev0){
printf("\nFor context %d and device %d:\n",
idev, idev);
}
device = list_of_devices[idev];
icon = idev;
// choose OpenCL context on first available platform
context = list_of_contexts[icon];
if(context !=0){
commandQueue = clCreateCommandQueue(context, device, 0, NULL);
if (commandQueue == NULL) {
printf("Failed to create commandQueue for device %d\n", idev);
exit(0);
}
if(Monitor>0){
printf("Reading program from source\n");
}
// read source code from file
FILE *fp;
char* source;
long int size;
fp = fopen("HelloWorld.cl", "rb");
if(!fp) {
printf("Could not open kernel file\n");
exit(-1);
}
int status = fseek(fp, 0, SEEK_END);
if(status != 0) {
printf("Error seeking to end of file\n");
exit(-1);
}
size = ftell(fp);
if(size < 0) {
printf("Error getting file position\n");
exit(-1);
}
rewind(fp);
source = (char *)malloc(size + 1);
int i;
for (i = 0; i < size+1; i++) {
source[i]='\0';
}
if(source == NULL) {
printf("Error allocating space for the kernel source\n");
exit(-1);
}
fread(source, 1, size, fp);
source[size] = '\0';
const char* source_const = source;
cl_program program = clCreateProgramWithSource(context, 1,
&source_const,
NULL, NULL);
if (program == NULL)
{
printf("Failed to create CL program from source.\n");
exit(-1);
}
if(Monitor>0){
printf("Creating program and kernel\n");
}
// build program (passing options to compiler if necessary
retval = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
char* buildLog; size_t size_of_buildLog;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
0, NULL, &size_of_buildLog);
buildLog = malloc(size_of_buildLog+1);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
size_of_buildLog, buildLog, NULL);
buildLog[size_of_buildLog]= '\0';
printf("Kernel buildLog: %s\n", buildLog);
if (retval != CL_SUCCESS)
{
printf("Error in kernel\n");
clReleaseProgram(program);
exit(-1);
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "hello_kernel", NULL);
if (kernel == NULL)
{
printf("Failed to create kernel.\n");
exit(0);
}
if(Monitor>0){
printf("Creating memory objects\n");
}
// Create memory objects that will be used as arguments to
// kernel. First create host memory arrays that will be
// used to store the arguments to the kernel
float result[1];
float a[1];
float b[1];
a[0] = 2;
b[0] = 2;
memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
sizeof(float), NULL, NULL);
memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY ,
sizeof(float), NULL, NULL);
memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float), NULL, NULL);
if (memObjects[0]==NULL || memObjects[1]==NULL || memObjects[2]==NULL){
printf("Error creating memory objects.\n");
return 0;
}
if(Monitor>0){
printf("Sending kernel arguments\n");
}
retval = clEnqueueWriteBuffer(
commandQueue,
memObjects[0],
CL_FALSE,
0,
sizeof(float),
a,
0,
NULL,
NULL);
// Use clEnqueueWriteBuffer() to write input array B to
// the device buffer bufferB
retval = clEnqueueWriteBuffer(
commandQueue,
memObjects[1],
CL_FALSE,
0,
sizeof(float),
b,
0,
NULL,
NULL);
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (retval != CL_SUCCESS)
{
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
if(Monitor>0){
printf("Running the kernel!\n");
}
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (retval != CL_SUCCESS)
{
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
if(Monitor>0){
printf("Transfering back results\n");
}
// Read the output buffer back to the Host
retval = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
0, sizeof(float), result,
0, NULL, NULL);
if (retval != CL_SUCCESS)
{
printf("Failed to read result buffer.\n");
//Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Verify the output
if(result[0]==4) {
printf("Output is correct: %lf + %lf = %lf\n",
a[0], b[0], result[0]);
} else {
printf("Output is incorrect: %lf + %lf != %lf\n",
a[0], b[0], result[0]);
}
for (i = 0; i < 3; i++)
{
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
}
}
free(list_of_devices);
free(platformIds);
return 0;
}
==== HelloWorld.cl ====
__kernel void hello_kernel(__global const float *a,
__global const float *b,
__global float *result)
{
int gid = get_global_id(0);
if(gid<1) result[gid] = a[gid] + b[gid];
}
===== Zajęcia 6 =====
Pliki zadania: {{:studia:magisterskie:1sem:opencl_vecadd.tgz|}}
Działające 3 kernele: {{:studia:magisterskie:1sem:sala404.zip|}}
==== opencl_vecadd/vecadd_host/execute_kernels.c ====
#include
#include
#include
#include
#include
#include"uth_ocl_intf.h"
#include"./vecadd_host.h"
#include"./hardware_params.h"
#define time_measurments
#ifdef time_measurments
#include"uth_time_intf.h"
static double t_begin, t_end, t_total;
#endif
int execute_kernel_2_CPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for CPU
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
// kernel name: , file:
"vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. Restoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
int nr_cores_CPU = omp_get_num_procs();
int nr_threads = nr_cores_CPU; // total number of threads
int work_group_size = 1; // number of threads in work-group
int nr_work_groups = nr_threads / work_group_size;
globalWorkSize[0] = nr_threads ;
localWorkSize[0] = work_group_size ;
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
nr_cores_CPU, nr_cores_CPU, 1);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
work_group_size, nr_threads / work_group_size,
nr_threads / work_group_size / nr_cores_CPU);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
globalWorkSize[0]/nr_cores_CPU);
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
int size = array_size;
int size_per_thread = ceilf((float)size/nr_threads);
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
if(monitor>UTC_BASIC_INFO){
printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
int execute_kernel_3_CPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for CPU
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index,
// kernel name: , file:
"vecadd_3_opt_blocks_kernel","vecadd_3_opt_blocks.cl",monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
int nr_cores_CPU = omp_get_num_procs();
int nr_threads = nr_cores_CPU; // total number of threads
int work_group_size = 1; // number of threads in work-group
int nr_work_groups = nr_threads / work_group_size;
globalWorkSize[0] = nr_threads ;
localWorkSize[0] = work_group_size ;
printf("\t\tExecuting kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
nr_cores_CPU, nr_cores_CPU, 1);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
work_group_size, nr_threads / work_group_size,
nr_threads / work_group_size / nr_cores_CPU);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/nr_cores_CPU,
globalWorkSize[0]/nr_cores_CPU);
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
array_size, array_size/globalWorkSize[0], array_size/nr_cores_CPU);
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
int size = array_size;
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
return 1;
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
// tutaj modyfikujemy
int execute_kernel_0_GPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for GPU
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"vecadd_0_kernel", "vecadd_0.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
// globalna liczba wątków i lokalna liczba wątków
int work_group_size = WORK_GROUP_SIZE;
globalWorkSize[0] = array_size;
localWorkSize[0] = work_group_size;
if(monitor>UTC_BASIC_INFO){
printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
}
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
// Set the kernel arguments (result, a, b)
// ustawiamy liczbę argumentów
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
return 1;
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
int execute_kernel_1_GPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for GPU
/*
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"vecadd_0_kernel", "vecadd_0.cl", monitor);*/
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file: */
"vecadd_1_kernel", "vecadd_1.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
int work_group_size = WORK_GROUP_SIZE;
globalWorkSize[0] = array_size / 4;
localWorkSize[0] = work_group_size;
if(monitor>UTC_BASIC_INFO){
printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
}
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
return 1;
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
int execute_kernel_2_GPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for GPU
/* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"vecadd_0_kernel", "vecadd_0.cl", monitor);*/
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file: */
"vecadd_2_blocks_kernel", "vecadd_2_blocks.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
int work_group_size = WORK_GROUP_SIZE;
globalWorkSize[0] = array_size / work_group_size;
localWorkSize[0] = work_group_size;
int size_per_thread = work_group_size;
int size = array_size;
if(monitor>UTC_BASIC_INFO){
printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
}
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
retval |= clSetKernelArg(kernel, 4, sizeof(int), &size_per_thread);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
return 1;
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
int execute_kernel_3_GPU(
int platform_index,
int device_index,
int kernel_index,
int array_size,
cl_mem* memObjects,
int monitor
)
{
cl_kernel kernel;
cl_event kernel_execution;
cl_ulong startTime;
cl_ulong endTime;
double exec_time;
cl_int retval;
size_t globalWorkSize[1] = { 0 };
size_t localWorkSize[1] = { 0 };
int i,j,k;
int size = array_size;
/*----------------KERNEL CREATION PHASE----------------------*/
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the kernel for GPU
/* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"vecadd_0_kernel", "vecadd_0.cl", monitor);*/
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"vecadd_3_opt_blocks_kernel", "vecadd_3_opt_blocks.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel: %lf\n", t_end-t_begin);
#endif
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \tRestoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
int NDRange_dim = 1;
int work_group_size = WORK_GROUP_SIZE;
globalWorkSize[0] = array_size;
localWorkSize[0] = work_group_size;
if(monitor>UTC_BASIC_INFO){
printf("\t\t4. executing kernel %d, on platform %d and device %d\n",
kernel_index, platform_index, device_index);
printf("PARAMETERS: \tnr_CU %d, nr_cores %d, nr_cores_per_CU %d\n",
NR_COMP_UNITS, NR_CORES_PER_CU*NR_COMP_UNITS, NR_CORES_PER_CU);
printf("\t\tworkgroup size %d, nr_workgroups %d ,nr_workgroups_per_CU %d\n",
localWorkSize[0], globalWorkSize[0]/localWorkSize[0],
globalWorkSize[0]/localWorkSize[0]/NR_COMP_UNITS);
printf("\t\tnr_threads %d, nr_threads_per_CU %d, nr_threads_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/NR_COMP_UNITS,
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
printf("\t\tarray size %d, nr_entries_per_thread %d, nr_entries_per_core %d\n",
globalWorkSize[0], globalWorkSize[0]/globalWorkSize[0],
globalWorkSize[0]/(NR_CORES_PER_CU*NR_COMP_UNITS));
}
// choose the kernel
kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
// Set the kernel arguments (result, a, b)
retval = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
retval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
retval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
retval |= clSetKernelArg(kernel, 3, sizeof(int), &size);
if (retval != CL_SUCCESS) {
printf("Failed to Set the kernel arguments.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
return 1;
}
#ifdef time_measurments
clFinish(command_queue);
t_begin = time_clock();
#endif
// Queue the kernel up for execution across the array
retval = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &kernel_execution);
clWaitForEvents(1, &kernel_execution);
#ifdef time_measurments
clFinish(command_queue);
t_end = time_clock();
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
0);
clGetEventProfilingInfo(kernel_execution,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
0);
exec_time = ((double)endTime - (double)startTime)*1.0e-9;
printf("EXECUTION TIME: executing kernel: %lf (profiler: %lf)\n",
t_end-t_begin, ((double)endTime - (double)startTime)*1.0e-9);
printf("\tNumber of operations %d, performance %lf GFlops\n",
array_size, array_size / exec_time * 1e-9);
printf("\tGBytes transferred to processor %lf, speed %lf GB/s\n",
3*array_size*sizeof(float)*1e-9,
3*array_size*sizeof(float)/exec_time*1e-9);
#endif
if (retval != CL_SUCCESS) {
printf("Failed to queue kernel for execution.\n");
//Cleanup(context, command_queue, program, kernel, memObjects);
exit(-1);
}
return(0);
}
/* // create the fourth kernel for GPU */
/* kernel_index = 4; */
/* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index, */
/* // kernel name: , file: */
/* "vecadd_4_opt_blocks_vect_kernel", "vecadd_4_opt_blocks_vect.cl", monitor); */
==== opencl_vecadd/vecadd_2_blocks.cl ====
__kernel void vecadd_2_blocks_kernel(__global const float *a,
__global const float *b,
__global float *result,
const int size,
const int size_per_thread)
{
int gid = get_global_id(0);
int index_start = gid * size_per_thread;
int index_end = (gid+1) * size_per_thread;
if (gid == 0) {
//printf("size: %d, %d", size, size_per_thread);
}
for (int i=index_start; i < index_end && i < size; i++) {
//for (int i=index_start; i < index_end ; i++) {
result[i] = a[i]+b[i];
}
}
===== Zajęcia 7 =====
{{:studia:magisterskie:1sem:zajecia_7.zip|}}
==== tmp/opencl_mat_transp/mat_transp_1.cl ====
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void mat_transp_1_kernel(
__global float *A ,
__global float *B ,
__global float *C ,
int N)
{
int id = get_global_id(0) * get_global_size(0) + get_global_id(1);
float temp = 0.0;
for(int i = 0; i
==== tmp/opencl_mat_transp/mat_transp/mat_transp.c ====
#include
#include
#include
#include
#include"uth_ocl_intf.h"
#define time_measurments
#ifdef time_measurments
#include"uth_time_intf.h"
static double t_begin, t_end, t_total;
#endif
#define BLOCK_SIZE 16
#define NR_GROUPS 16
#define MULT 16
#define WYMIAR (BLOCK_SIZE*NR_GROUPS*MULT)
#define ROZMIAR (WYMIAR*WYMIAR)
// Matrices are stored in row-major order:
// M(row, col) = M( row * WYMIAR + col )
int verify_result(
float* result,
float* result_compare
)
{
// Verify the result
int result_OK = 1;
int i,j;
for(i = 0; i < WYMIAR; i++) {
if(fabs(result[i] - result_compare[i])>1.e-6) {
result_OK = 0;
break;
}
}
printf("\t\t6. verifying results: ");
if(result_OK) {
printf("Output is correct\n");
} else {
printf("Output is incorrect\n");
j = 0;
for(i = 0; i < WYMIAR, j<100; i++) {
if(fabs(result[i] - result_compare[i])>1.e-9) {
j++;
printf("%d %16.8f %16.8f\n",
i, result[i], result_compare[i]);
}
}
exit(0);
}
/* for(i = 0; i < length; i++) { */
/* printf("%16.8f %16.8f\n", result[i], result_compare[i]); */
/* } */
return(result_OK);
}
/*----------------KERNEL CREATION PHASE----------------------*/
void create_kernels()
{
// for all operations indicate explicit info messages
int monitor = UTC_BASIC_INFO + 1;
int kernel_index;
int platform_index = utv_ocl_struct.current_platform_index;
if(utr_ocl_CPU_context_exists(platform_index)){
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the first kernel for CPU
/* kernel_index = 0; */
/* utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_CPU, kernel_index, */
/* // kernel name: , file: */
/* "mat_transp_1_kernel", "mat_transp_1.cl", monitor); */
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif
}
if(utr_ocl_GPU_context_exists(platform_index)){
#ifdef time_measurments
t_begin = time_clock();
#endif
// create the first kernel for GPU
kernel_index = 0;
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"mat_transp_1_kernel", "mat_transp_1.cl", monitor);
#ifdef time_measurments
t_end = time_clock();
printf("EXECUTION TIME: creating CPU kernel %d: %lf\n", kernel_index, t_end-t_begin);
#endif
// create the second kernel for GPU
kernel_index = 1;
utr_ocl_create_kernel_dev_type( platform_index, UTC_OCL_DEVICE_GPU, kernel_index,
// kernel name: , file:
"mat_transp_2_kernel", "mat_transp_2.cl", monitor);
}
}
/*----------------EXECUTION PHASE----------------------*/
int execute_kernels()
{
// for all operations indicate explicit info messages
int monitor = UTC_BASIC_INFO + 1;
int platform_index = utv_ocl_struct.current_platform_index;
utt_ocl_platform_struct platform_struct = utv_ocl_struct.list_of_platforms[platform_index];
int kernel_index;
int i,j,n;
double nr_access;
double t1,t2;
if(monitor>UTC_BASIC_INFO){
printf("\n------------Starting execution phase----------------\n");
}
// create matrices
n=WYMIAR;
float* A = (float *) malloc(ROZMIAR*sizeof(float));
float* B = (float *) malloc(WYMIAR*sizeof(float));
float* C = (float *) malloc(WYMIAR*sizeof(float));
float* D = (float *) malloc(WYMIAR*sizeof(float));
for(i=0;i0 && device_type==utr_ocl_device_type(platform_index, device_index-1)) break;
if(device_type == UTC_OCL_DEVICE_CPU) break;
// choose the context
cl_context context = utr_ocl_select_context(platform_index, device_index);
// choose the command queue
cl_command_queue command_queue =
utr_ocl_select_command_queue(platform_index, device_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t0. restoring context and command queue for platform %d and device %d\n",
platform_index, device_index);
}
if(context == NULL || command_queue == NULL){
printf("failed to restore context and command queue for platform %d, device %d\n",
platform_index, device_index);
printf("%lu %lu\n", context, command_queue);
}
// choose the kernel
kernel_index = 0;
cl_kernel kernel = utr_ocl_select_kernel(platform_index, device_index, kernel_index);
if(monitor>UTC_BASIC_INFO){
printf("\nExecution: \t3. restoring kernel %d for platform %d and device %d\n",
kernel_index, platform_index, device_index);
}
if(context == NULL || command_queue == NULL || kernel == NULL){
printf("failed to restore kernel for platform %d, device %d, kernel %d\n",
platform_index, device_index, kernel_index);
printf("context %lu, command queue %lu, kernel %lu\n",
context, command_queue, kernel);
}
for(i=0;i
==== Kod wogu ====
=== server.c ====
/*
C socket server example, handles multiple clients using threads
Compile
gcc server.c -lpthread -o server
*/
#include
#include //strlen
#include //strlen
#include
#include //inet_addr
#include //write
#include //for threading , link with lpthread
#include
//the thread function
void *connection_handler(void *);
int main(int argc , char *argv[])
{
int counter = 0;
pid_t pid = fork();
if (pid == 0)
{
int sock;
struct sockaddr_in server;
char message[1000] , server_reply[2000];
int counter;
unsigned int ip[] = { 192, 168, 102, 1 };
char* ip_final;
char *buf;
size_t sz;
//Create socket
for ( counter = 0; counter < 255; ++counter ) {
sock = socket(AF_INET , SOCK_STREAM , 0);
if (sock == -1)
{
printf("Could not create socket");
}
puts("Socket created");
//printf("192.168.102.%d", counter);
sz = snprintf(NULL, 0, "192.168.102.%d", counter);
buf = (char *)malloc(sz + 1); /* make sure you check for != NULL in real code */
server.sin_addr.s_addr = inet_addr(buf);
server.sin_family = AF_INET;
server.sin_port = htons( 65001 );
//Connect to remote server
if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
{
perror("connect failed. Error");
continue;
}
puts("Connected\n");
//keep communicating with server
while(1)
{
//Send some data
if( send(sock , "Werset 6-client" , 15 , 0) < 0)
{
puts("Send failed");
return 1;
}
//Receive a reply from the server
if( recv(sock , server_reply , 2000 , 0) < 0)
{
puts("recv failed");
break;
}
puts(server_reply);
break;
}
close(sock);
}
}
else if (pid > 0)
{
int socket_desc , client_sock , c;
struct sockaddr_in server , client;
//Create socket
socket_desc = socket(AF_INET , SOCK_STREAM , 0);
if (socket_desc == -1)
{
printf("Could not create socket");
}
puts("Socket created");
//Prepare the sockaddr_in structure
server.sin_family = AF_INET;
server.sin_addr.s_addr = INADDR_ANY;
server.sin_port = htons( 65001 );
//Bind
if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
{
//print the error message
perror("bind failed. Error");
return 1;
}
puts("bind done");
//Listen
listen(socket_desc , 3);
//Accept and incoming connection
puts("Waiting for incoming connections...");
c = sizeof(struct sockaddr_in);
//Accept and incoming connection
puts("Waiting for incoming connections...");
c = sizeof(struct sockaddr_in);
pthread_t thread_id;
while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
{
puts("Connection accepted");
if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0)
{
perror("could not create thread");
return 1;
}
//Now join the thread , so that we dont terminate before the thread
//pthread_join( thread_id , NULL);
puts("Handler assigned");
}
if (client_sock < 0)
{
perror("accept failed");
return 1;
}
}
else
{
// fork failed
printf("fork() failed!\n");
return 1;
}
return 0;
}
/*
* This will handle connection for each client
* */
void *connection_handler(void *socket_desc)
{
//Get the socket descriptor
int sock = *(int*)socket_desc;
int read_size;
char *message , client_message[2000];
//Receive a message from client
while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
{
printf("%s\n", client_message);
//end of string marker
client_message[read_size] = '\0';
//Send the message back to client
write(sock , "Werset 6" , 8);
//clear the message buffer
memset(client_message, 0, 2000);
}
if(read_size == 0)
{
puts("Client disconnected");
fflush(stdout);
}
else if(read_size == -1)
{
perror("recv failed");
}
return 0;
}
=== Makefile ===
# kompilator c
CC = gcc
# konsolidator
CFLAGS=-g
all: server client
server: server.o
$(CC) $(LFLAGS) server.o -lpthread -o server
server.o: server.c
$(CC) -c server.c -o server.o
client: client.o
$(CC) client.o -o client
client.o: client.c
$(CC) -c client.c -o client.o
clean:
rm -f *.o
===== Zajęcia 9 =====
/*
C socket server example, handles multiple clients using threads
Compile
gcc server.c -lpthread -o server
*/
#include
#include //strlen
#include //strlen
#include
#include //inet_addr
#include //write
#include //for threading , link with lpthread
//the thread function
void *connection_handler(void *);
int main(int argc , char *argv[])
{
int socket_desc , client_sock , c;
struct sockaddr_in server , client;
//Create socket
socket_desc = socket(AF_INET , SOCK_STREAM , 0);
if (socket_desc == -1)
{
printf("Could not create socket");
}
puts("Socket created");
//Prepare the sockaddr_in structure
server.sin_family = AF_INET;
server.sin_addr.s_addr = INADDR_ANY;
server.sin_port = htons( 8888 );
//Bind
if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
{
//print the error message
perror("bind failed. Error");
return 1;
}
puts("bind done");
//Listen
listen(socket_desc , 3);
//Accept and incoming connection
puts("Waiting for incoming connections...");
c = sizeof(struct sockaddr_in);
//Accept and incoming connection
puts("Waiting for incoming connections...");
c = sizeof(struct sockaddr_in);
pthread_t thread_id;
while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
{
puts("Connection accepted");
if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0)
{
perror("could not create thread");
return 1;
}
//Now join the thread , so that we dont terminate before the thread
//pthread_join( thread_id , NULL);
puts("Handler assigned");
}
if (client_sock < 0)
{
perror("accept failed");
return 1;
}
return 0;
}
/*
* This will handle connection for each client
* */
void *connection_handler(void *socket_desc)
{
//Get the socket descriptor
int sock = *(int*)socket_desc;
int read_size;
char *message , client_message[2000];
//Receive a message from client
while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
{
//end of string marker
client_message[read_size] = '\0';
//Send the message back to client
write(sock , "Werset 6" , strlen(client_message));
//clear the message buffer
memset(client_message, 0, 2000);
}
if(read_size == 0)
{
puts("Client disconnected");
fflush(stdout);
}
else if(read_size == -1)
{
perror("recv failed");
}
return 0;
}
==== Client C ====
#include //printf
#include //strlen
#include //socket
#include //inet_addr
int main(int argc , char *argv[])
{
int sock;
struct sockaddr_in server;
char message[1000] , server_reply[2000];
//Create socket
sock = socket(AF_INET , SOCK_STREAM , 0);
if (sock == -1)
{
printf("Could not create socket");
}
puts("Socket created");
server.sin_addr.s_addr = inet_addr("127.0.0.1");
server.sin_family = AF_INET;
server.sin_port = htons( 8888 );
//Connect to remote server
if (connect(sock , (struct sockaddr *)&server , sizeof(server)) < 0)
{
perror("connect failed. Error");
return 1;
}
puts("Connected\n");
//keep communicating with server
while(1)
{
printf("Enter message : ");
scanf("%s" , message);
//Send some data
if( send(sock , message , strlen(message) , 0) < 0)
{
puts("Send failed");
return 1;
}
//Receive a reply from the server
if( recv(sock , server_reply , 2000 , 0) < 0)
{
puts("recv failed");
break;
}
puts("Server reply :");
puts(server_reply);
}
close(sock);
return 0;
}
==== Kod Maciek Gonzo itp ====
#include
#include //strlen
#include //strlen
#include
#include //inet_addr
#include //write
#include //for threading , link with lpthread
//the thread function
void *connection_handler(void *);
void *client_function(void *socket_desc){
int sock;
struct sockaddr_in server_id;
char message[1000] , server_reply[2000];
//Create socket
sock = socket(AF_INET , SOCK_STREAM , 0);
if (sock == -1)
{
printf("Could not create socket");
}
puts("Socket created");
// 192.168.102.63 wogu
server_id.sin_addr.s_addr = inet_addr("localhost");
server_id.sin_family = AF_INET;
server_id.sin_port = htons( 65000 );
//Connect to remote server
if (connect(sock , (struct sockaddr *)&server_id , sizeof(server_id)) < 0)
{
perror("connect failed. Error");
}else{
puts("Connected\n");
//keep communicating with server
while(1)
{
printf("Enter message : ");
scanf("%s" , message);
//Send some data
if( send(sock , message , strlen(message) , 0) < 0)
{
puts("Send failed");
}
//Receive a reply from the server
if( recv(sock , server_reply , 2000 , 0) < 0)
{
puts("recv failed");
break;
}
puts("Server reply :");
puts(server_reply);
}
close(sock);
}
};
void *server_function(void * sock){
printf("SERVER Start");
fflush(stdout);
int socket_desc , client_sock , c;
pthread_t thread_id;
struct sockaddr_in server , client;
//Create socket
socket_desc = socket(AF_INET , SOCK_STREAM , 0);
if (socket_desc == -1)
{
printf("Could not create socket");
}
puts("Socket created");
//Prepare the sockaddr_in structure
server.sin_family = AF_INET;
server.sin_addr.s_addr = INADDR_ANY;
server.sin_port = htons( 65000);
//Bind
if( bind(socket_desc,(struct sockaddr *)&server , sizeof(server)) < 0)
{
//print the error message
perror("bind failed. Error");
return;
}
puts("bind done");
//Listen
listen(socket_desc , 3);
//Accept and incoming connection
puts("Waiting for incoming connections...");
c = sizeof(struct sockaddr_in);
fflush(stdout);
while( (client_sock = accept(socket_desc, (struct sockaddr *)&client, (socklen_t*)&c)) )
{
puts("Connection accepted");
if( pthread_create( &thread_id , NULL , connection_handler , (void*) &client_sock) < 0)
{
perror("could not create thread");
return;
}
//Now join the thread , so that we dont terminate before the thread
//pthread_join( thread_id , NULL);
puts("Handler assigned");
}
if (client_sock < 0)
{
perror("accept failed");
return;
}
}
int main(int argc , char *argv[])
{
pthread_t thread_id;
//Create server
if( pthread_create( &thread_id , NULL , server_function , NULL) != 0)
{
perror("could not create server thread");
fflush(stdout);
return 1;
}
pthread_t thread_id2;
// Create client
if( pthread_create( &thread_id2 , NULL , client_function , NULL) != 0)
{
perror("could not create client thread");
fflush(stdout);
return 1;
}
return 0;
}
/*
* This will handle connection for each client
* */
void *connection_handler(void *socket_desc)
{
//Get the socket descriptor
int sock = *(int*)socket_desc;
int read_size;
char *message , client_message[2000];
//Receive a message from client
while( (read_size = recv(sock , client_message , 2000 , 0)) > 0 )
{
//end of string marker
client_message[read_size] = '\0';
//Send the message back to client
write(sock , "Werset 5" , strlen(client_message));
//clear the message buffer
memset(client_message, 0, 2000);
}
if(read_size == 0)
{
puts("Client disconnected");
fflush(stdout);
}
else if(read_size == -1)
{
perror("recv failed");
}
return 0;
}