Parallel Algorithms and Architectures 2018
Interactive Programming Exercises in the Lecture
Sheet 3
Sheet 4 (Array Reversal, Determinants)
Sheet 5 (Knapsack)
Sheet 6 (Warp Intrinsics)
Sheet 7 (Sparse Matrices, Page Rank)
Sheet 8 (Streams, Multi-GPU)
Sheet 9 (Jacobi Iteration)
Lecture 3: Vector Addition
Lecture 4: Covariance Matrix
Lecture 5: Dynamic Time Warping
Lecture 6: Prefix Scan
Lecture 7: SpMV/ELL
Lecture 8: Streamed Covariance Matrix Computation
Lecture 9: MPI Matrix Multiplication
Lecture 10: Parallel Merge
Lecture 8: Streamed Covariance Matrix Computation
Assignment
Scaffold Head
#include <iostream>
#include <fstream>
#include <string>
#include <cstdint>
#include <algorithm>

#ifndef __CUDACC__
#include <chrono>
#endif

#ifndef __CUDACC__
#define TIMERSTART(label)                                                     \
    std::chrono::time_point<std::chrono::system_clock> a##label, b##label;    \
    a##label = std::chrono::system_clock::now();
#else
#define TIMERSTART(label)                                                     \
    cudaSetDevice(0);                                                         \
    cudaEvent_t start##label, stop##label;                                    \
    float time##label;                                                        \
    cudaEventCreate(&start##label);                                           \
    cudaEventCreate(&stop##label);                                            \
    cudaEventRecord(start##label, 0);
#endif

#ifndef __CUDACC__
#define TIMERSTOP(label)                                                      \
    b##label = std::chrono::system_clock::now();                              \
    std::chrono::duration<double> delta##label = b##label - a##label;         \
    std::cout << "# elapsed time (" << #label << "): "                        \
              << delta##label.count() << "s" << std::endl;
#else
#define TIMERSTOP(label)                                                      \
    cudaSetDevice(0);                                                         \
    cudaEventRecord(stop##label, 0);                                          \
    cudaEventSynchronize(stop##label);                                        \
    cudaEventElapsedTime(&time##label, start##label, stop##label);            \
    std::cout << "#TIMING: " << time##label << " ms (" << #label << ")"       \
              << std::endl;
#endif

#ifdef __CUDACC__
#define CUERR {                                                               \
    cudaError_t err;                                                          \
    if ((err = cudaGetLastError()) != cudaSuccess) {                          \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "       \
                  << __FILE__ << ", line " << __LINE__ << std::endl;          \
        exit(1);                                                              \
    }                                                                         \
}
#endif

// safe division (ceiling of x/y)
#define SDIV(x,y) (((x)+(y)-1)/(y))

template <
    typename index_t,
    typename value_t>
void load_binary(
    value_t * data,
    const index_t length,
    std::string filename) {

    std::ifstream ifile(filename.c_str(), std::ios::binary);
    ifile.read((char*) data, sizeof(value_t)*length);
    ifile.close();
}

template <
    typename index_t,
    typename value_t,
    uint32_t chunk_size=32 >
__global__ void shared_covariance_kernel(
    value_t * Data,
    value_t * Cov,
    index_t num_entries,
    index_t num_features) {

    // convenience variables
    const index_t base_x = blockIdx.x*chunk_size;
    const index_t base_y = blockIdx.y*chunk_size;
    const index_t thid_y = threadIdx.y;
    const index_t thid_x = threadIdx.x;
    const index_t x = base_x + thid_x;
    const index_t y = base_y + thid_y;

    // optional early exit: -500ms
    if (base_x > base_y)
        return;

    // allocate shared memory
    __shared__ value_t cache_x[chunk_size][chunk_size];
    __shared__ value_t cache_y[chunk_size][chunk_size];

    // compute the number of chunks to be computed
    const index_t num_chunks = SDIV(num_entries, chunk_size);

    // accumulated value of scalar product
    value_t accum = 0;

    // for each chunk
    for (index_t chunk = 0; chunk < num_chunks; chunk++) {

        // assign thread IDs to rows and columns
        const index_t row   = thid_y + chunk*chunk_size;
        const index_t col_x = thid_x + base_x;
        const index_t col_y = thid_x + base_y;

        // check if valid row or column indices
        const bool valid_row   = row   < num_entries;
        const bool valid_col_x = col_x < num_features;
        const bool valid_col_y = col_y < num_features;

        // fill shared memory with tiles where thid_y enumerates
        // image identifiers (entries) and thid_x denotes feature
        // coordinates (pixels). cache_x corresponds to x and
        // cache_y to y where Cov[x,y] is the pairwise covariance
        cache_x[thid_y][thid_x] = valid_row*valid_col_x ?
                                  Data[row*num_features+col_x] : 0;
        cache_y[thid_y][thid_x] = valid_row*valid_col_y ?
                                  Data[row*num_features+col_y] : 0;

        // this is needed to ensure that all threads finished writing
        // shared memory
        __syncthreads();

        // optional early exit: -100ms
        if (x <= y)
            // here we actually evaluate the scalar product
            for (index_t entry = 0; entry < chunk_size; entry++)
                accum += cache_y[entry][thid_y]*cache_x[entry][thid_x];

        // this is needed to ensure that shared memory can be over-
        // written again in the next iteration
        __syncthreads();
    }

    // since Cov[x,y] = Cov[y,x] we only compute one entry
    if (y < num_features && x <= y)
        Cov[y*num_features+x] = Cov[x*num_features+y] = accum; //num_entries;
}

template <uint64_t num_gpus, uint64_t num_streams>
struct partition_t {

    static constexpr uint64_t num_slots = num_gpus*num_streams;

    uint64_t displs[num_slots];
    uint64_t counts[num_slots];

    partition_t(uint64_t length) {

        uint64_t batch_size = (length+num_slots-1)/num_slots;

        for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
            for (uint64_t stream = 0; stream < num_streams; stream++) {
                const uint64_t slot  = gpu*num_streams+stream;
                const uint64_t lower = slot*batch_size;
                const uint64_t upper = std::min(lower+batch_size, length);
                const uint64_t count = upper-lower;

                displs[slot] = lower;
                counts[slot] = count;
            }
        }
    }

    uint64_t get_count(uint64_t gpu, uint64_t stream) {
        return counts[gpu*num_streams+stream];
    }

    uint64_t get_displ(uint64_t gpu, uint64_t stream) {
        return displs[gpu*num_streams+stream];
    }
};

int main (int argc, char * argv[]) {

    TIMERSTART(init)

    // 202599 grayscale images each of shape 55 x 45
    constexpr uint64_t imgs = 202599, rows = 55, cols = 45;

    // pointer for data matrix and mean vector
    float * data = nullptr;
    cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);                   CUERR
    load_binary(data, imgs*rows*cols, "./celebA_centered.bin");

    constexpr uint64_t num_gpus = 2, num_streams = 4;
    partition_t<num_gpus, num_streams> part(imgs);

    float * Data[num_gpus][num_streams],
          * Cov [num_gpus][num_streams],
          * cov [num_gpus][num_streams];
    cudaStream_t streams[num_gpus][num_streams];

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            const uint64_t entries = part.get_count(gpu, stream);
            cudaStreamCreate(&streams[gpu][stream]);
            cudaMalloc    (&Data[gpu][stream], sizeof(float)*entries*rows*cols);
            cudaMalloc    (&Cov [gpu][stream], sizeof(float)*rows*cols*rows*cols);
            cudaMallocHost(&cov [gpu][stream], sizeof(float)*rows*cols*rows*cols);
            cudaMemset    (Data[gpu][stream], 0, sizeof(float)*entries*rows*cols);
            cudaMemset    (Cov [gpu][stream], 0, sizeof(float)*rows*cols*rows*cols);
            cudaMemset    (cov [gpu][stream], 0, sizeof(float)*rows*cols*rows*cols);
        }
    } CUERR

    TIMERSTOP(init)
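The part between scaffold head and scaffold foot is left to the submission: the chunked transfers and kernel launches that overlap copies and compute across the two GPUs and the four streams per GPU. A minimal sketch of such a middle section is given below. It only reuses names defined in the scaffold (data, Data, Cov, cov, part, streams, shared_covariance_kernel, SDIV, rows, cols); the timer label and the 32x32 launch configuration are assumptions for illustration, not the reference solution.

    // --- sketch of a possible middle section (not the reference solution) ---
    TIMERSTART(compute)

    const uint64_t num_features = rows*cols;

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {

            const uint64_t entries = part.get_count(gpu, stream);
            const uint64_t offset  = part.get_displ(gpu, stream);
            const cudaStream_t queue = streams[gpu][stream];

            // asynchronously stage this slot's slice of the centered images
            cudaMemcpyAsync(Data[gpu][stream],
                            data + offset*num_features,
                            sizeof(float)*entries*num_features,
                            cudaMemcpyHostToDevice, queue);

            // one 32x32 thread block per covariance tile (chunk_size = 32)
            const dim3 blocks(SDIV(num_features, 32), SDIV(num_features, 32));
            const dim3 threads(32, 32);
            shared_covariance_kernel<<<blocks, threads, 0, queue>>>(
                Data[gpu][stream], Cov[gpu][stream], entries, num_features);

            // copy the unnormalized partial covariance back to pinned memory
            cudaMemcpyAsync(cov[gpu][stream], Cov[gpu][stream],
                            sizeof(float)*num_features*num_features,
                            cudaMemcpyDeviceToHost, queue);
        }
    } CUERR

    // wait for all streams on all GPUs before the host-side reduction
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        cudaDeviceSynchronize();
    } CUERR

    TIMERSTOP(compute)

Each slot copies its slice of images to the device, computes an unnormalized partial covariance matrix, and copies it back to pinned memory; the scaffold foot then accumulates the partial matrices and divides by the number of images before comparing against the reference covariance.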
Scaffold Foot
    TIMERSTART(postprocessing)

    float * result = nullptr, * truth = nullptr;
    cudaMallocHost(&result, sizeof(float)*rows*cols*rows*cols);            CUERR
    cudaMallocHost(&truth,  sizeof(float)*rows*cols*rows*cols);            CUERR
    cudaMemset(result, 0, sizeof(float)*rows*cols*rows*cols);              CUERR
    load_binary(truth, rows*cols*rows*cols, "./celebA_covariance.bin");

    # pragma omp parallel
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++)
        for (uint64_t stream = 0; stream < num_streams; stream++)
            # pragma omp for
            for (uint64_t i = 0; i < rows*cols*rows*cols; i++)
                result[i] += cov[gpu][stream][i];

    for (uint64_t i = 0; i < rows*cols*rows*cols; i++) {
        const auto res = result[i]/imgs - truth[i];
        if (res*res > 1000) {
            std::cout << "ERROR: " << result[i]/imgs << " " << truth[i] << " "
                      << (i % (cols*rows)) << " "
                      << (i / (cols*rows)) << std::endl;
            break;
        }
    }

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            cudaFree    (Data[gpu][stream]);
            cudaFree    (Cov [gpu][stream]);
            cudaFreeHost(cov [gpu][stream]);
            cudaStreamSynchronize(streams[gpu][stream]);
            cudaStreamDestroy(streams[gpu][stream]);
        }
    } CUERR

    cudaFreeHost(result);                                                  CUERR
    cudaFreeHost(truth);
    cudaFreeHost(data);                                                    CUERR

    TIMERSTOP(postprocessing)

    std::cout << "CUDA programming is fun!" << std::endl;
}
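Scaffold head, submitted middle section, and scaffold foot together form one CUDA C++ translation unit. Assuming an nvcc/GCC toolchain, an invocation along the lines of nvcc -O3 -std=c++14 -Xcompiler -fopenmp main.cu -o covariance (the source file name is assumed here) should build it, since the foot uses OpenMP for the host-side accumulation; the files ./celebA_centered.bin and ./celebA_covariance.bin must be present in the working directory. A correct run prints no ERROR line and ends with the expected output checked by the test below.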
Start time: Thu 12 Apr 2018 16:11:00
End time: Fri 01 Mar 2019 12:00:00
General test timeout: 10.0 seconds
Tests
Comment prefix: #
Given input:
Expected output: CUDA programming is fun!