Parallel Algorithms and Architectures 2018
Sheet 3
Task 1 (Normalizing Vectors)
Assignment
Scaffold Head
#include <iostream>   // cout, endl
#include <algorithm>  // iota
#include <cmath>      // sqrt
#include <omp.h>      // benchmark below (multi-threading with OpenMP pragmas)

///////////////////////////////////////////////////////////////////////////////
// IGNORE THESE HELPERS (taken from https://github.com/gravitino/cudahelpers)
///////////////////////////////////////////////////////////////////////////////

// safe division
#define SDIV(x,y)(((x)+(y)-1)/(y))

// error macro
#define CUERR { \
    cudaError_t err; \
    if ((err = cudaGetLastError()) != cudaSuccess) { \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : " \
                  << __FILE__ << ", line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}

// convenient timers
#define TIMERSTART(label) \
    cudaEvent_t start##label, stop##label; \
    float time##label; \
    cudaEventCreate(&start##label); \
    cudaEventCreate(&stop##label); \
    cudaEventRecord(start##label, 0);

#define TIMERSTOP(label) \
    cudaEventRecord(stop##label, 0); \
    cudaEventSynchronize(stop##label); \
    cudaEventElapsedTime(&time##label, start##label, stop##label); \
    std::cout << "#" << time##label \
              << " ms (" << #label << ")" << std::endl;
Scaffold Foot
    ///////////////////////////////////////////////////////////////////////////
    // BENCHMARKS AND CHECKS (you may ignore this, especially the OpenMP part)
    ///////////////////////////////////////////////////////////////////////////

    // check for correct result computed by CUDA
    for (size_t index = 0; index < N; index++) {
        const float x = v[4*index],   y = v[4*index+1],
                    z = v[4*index+2], w = v[4*index+3];
        const float residue = x*x+y*y+z*z+w*w-1;
        if (residue*residue > 1E-6) {
            std::cout << "error at position " << index << std::endl;
            break;
        }
    }

    // measure time on single-threaded host
    TIMERSTART(overallSingleCore)
    for (size_t index = 0; index < N; index++) {
        const float x = v[4*index],   y = v[4*index+1],
                    z = v[4*index+2], w = v[4*index+3];
        const float rev_sqrt = 1.0/std::sqrt(x*x+y*y+z*z+w*w);
        v[4*index]   = x*rev_sqrt;
        v[4*index+1] = y*rev_sqrt;
        v[4*index+2] = z*rev_sqrt;
        v[4*index+3] = w*rev_sqrt;
    }
    TIMERSTOP(overallSingleCore)

    // measure time on multi-threaded host
    TIMERSTART(overallMultiCore)
    #pragma omp parallel for
    for (size_t index = 0; index < N; index++) {
        const float x = v[4*index],   y = v[4*index+1],
                    z = v[4*index+2], w = v[4*index+3];
        const float rev_sqrt = 1.0/std::sqrt(x*x+y*y+z*z+w*w);
        v[4*index]   = x*rev_sqrt;
        v[4*index+1] = y*rev_sqrt;
        v[4*index+2] = z*rev_sqrt;
        v[4*index+3] = w*rev_sqrt;
    }
    TIMERSTOP(overallMultiCore)

    // get rid of the memory
    cudaFree(V);
    cudaFreeHost(v);

    // print status
    float usedMem = 4.0*N*sizeof(float)/(1L<<30);
    std::cout << "#processed " << usedMem << " gigabytes." << std::endl;
    std::cout << "CUDA programming is fun!" << std::endl;
}
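The student's solution goes between the scaffold head and foot: a device kernel plus the host-side copy and launch code. A minimal sketch of one possible solution follows; the kernel name `normalize_kernel`, the block size of 1024, and the timer label `overallGPU` are assumptions, while `v`, `V`, `N`, and the `SDIV`/`CUERR`/`TIMERSTART` macros come from the scaffold:

```cuda
// One thread normalizes one 4-component vector.
// V is the device copy of the interleaved float array (x, y, z, w per vector).
__global__ void normalize_kernel(float *V, size_t N) {
    const size_t index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index < N) {
        const float x = V[4*index],   y = V[4*index+1],
                    z = V[4*index+2], w = V[4*index+3];
        // rsqrtf computes 1/sqrt in a single fast device instruction
        const float rev_sqrt = rsqrtf(x*x+y*y+z*z+w*w);
        V[4*index]   = x*rev_sqrt;
        V[4*index+1] = y*rev_sqrt;
        V[4*index+2] = z*rev_sqrt;
        V[4*index+3] = w*rev_sqrt;
    }
}

// host side: copy to device, launch one thread per vector, copy back
TIMERSTART(overallGPU)
cudaMemcpy(V, v, 4*N*sizeof(float), cudaMemcpyHostToDevice);        CUERR
normalize_kernel<<<SDIV(N, 1024), 1024>>>(V, N);                    CUERR
cudaMemcpy(v, V, 4*N*sizeof(float), cudaMemcpyDeviceToHost);        CUERR
TIMERSTOP(overallGPU)
```

Note that the scaffold's correctness check runs on the host array `v` after the device-to-host copy, so the kernel must write its results back to `V` in place; the guard `if (index < N)` is required because `SDIV` rounds the grid size up.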
Start time:
Thu 12 Apr 2018 16:11:00
End time:
Fri 01 Mar 2019 12:00:00
General test timeout:
10.0 seconds
Tests
Comment prefix
#
Given input
Expected output
CUDA programming is fun!