Parallel Algorithms and Architectures 2018
Interactive Programming Exercises in the Lecture
Sheet 3
Sheet 4 (Array Reversal, Determinants)
Sheet 5 (Knapsack)
Sheet 6 (Warp Intrinsics)
Sheet 7 (Sparse Matrices, Page Rank)
Sheet 8 (Streams, Multi-GPU)
Sheet 9 (Jacobi Iteration)
Lecture 3: Vector Addition
Lecture 4: Covariance Matrix
Lecture 5: Dynamic Time Warping
Lecture 6: Prefix Scan
Lecture 7: SpMV/ELL
Lecture 8: Streamed Covariance Matrix Computation
Lecture 9: MPI Matrix Multiplication
Lecture 10: Parallel Merge
Lecture 8: Streamed Covariance Matrix Computation
Assignment
Scaffold Head
#include <iostream>
#include <fstream>
#include <string>
#include <cstdint>
#include <algorithm>

#ifndef __CUDACC__
#include <chrono>
#endif

#ifndef __CUDACC__
#define TIMERSTART(label)                                                     \
    std::chrono::time_point<std::chrono::system_clock> a##label, b##label;    \
    a##label = std::chrono::system_clock::now();
#else
#define TIMERSTART(label)                                                     \
    cudaSetDevice(0);                                                         \
    cudaEvent_t start##label, stop##label;                                    \
    float time##label;                                                        \
    cudaEventCreate(&start##label);                                           \
    cudaEventCreate(&stop##label);                                            \
    cudaEventRecord(start##label, 0);
#endif

#ifndef __CUDACC__
#define TIMERSTOP(label)                                                      \
    b##label = std::chrono::system_clock::now();                              \
    std::chrono::duration<double> delta##label = b##label - a##label;         \
    std::cout << "# elapsed time (" << #label << "): "                        \
              << delta##label.count() << "s" << std::endl;
#else
#define TIMERSTOP(label)                                                      \
    cudaSetDevice(0);                                                         \
    cudaEventRecord(stop##label, 0);                                          \
    cudaEventSynchronize(stop##label);                                        \
    cudaEventElapsedTime(&time##label, start##label, stop##label);            \
    std::cout << "#TIMING: " << time##label << " ms (" << #label << ")"       \
              << std::endl;
#endif

#ifdef __CUDACC__
#define CUERR {                                                               \
    cudaError_t err;                                                          \
    if ((err = cudaGetLastError()) != cudaSuccess) {                          \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "       \
                  << __FILE__ << ", line " << __LINE__ << std::endl;          \
        exit(1);                                                              \
    }                                                                         \
}
#endif

// safe division (ceiling of x/y)
#define SDIV(x,y) (((x)+(y)-1)/(y))

template <
    typename index_t,
    typename value_t>
void load_binary(
    value_t * data,
    const index_t length,
    std::string filename) {

    std::ifstream ifile(filename.c_str(), std::ios::binary);
    ifile.read((char*) data, sizeof(value_t)*length);
    ifile.close();
}

template <
    typename index_t,
    typename value_t,
    uint32_t chunk_size=32 >
__global__ void shared_covariance_kernel(
    value_t * Data,
    value_t * Cov,
    index_t num_entries,
    index_t num_features) {

    // convenience variables
    const index_t base_x = blockIdx.x*chunk_size;
    const index_t base_y = blockIdx.y*chunk_size;
    const index_t thid_y = threadIdx.y;
    const index_t thid_x = threadIdx.x;
    const index_t x = base_x + thid_x;
    const index_t y = base_y + thid_y;

    // optional early exit: -500ms
    if (base_x > base_y)
        return;

    // allocate shared memory
    __shared__ value_t cache_x[chunk_size][chunk_size];
    __shared__ value_t cache_y[chunk_size][chunk_size];

    // compute the number of chunks to be computed
    const index_t num_chunks = SDIV(num_entries, chunk_size);

    // accumulated value of scalar product
    value_t accum = 0;

    // for each chunk
    for (index_t chunk = 0; chunk < num_chunks; chunk++) {

        // assign thread IDs to rows and columns
        const index_t row   = thid_y + chunk*chunk_size;
        const index_t col_x = thid_x + base_x;
        const index_t col_y = thid_x + base_y;

        // check if valid row or column indices
        const bool valid_row   = row   < num_entries;
        const bool valid_col_x = col_x < num_features;
        const bool valid_col_y = col_y < num_features;

        // fill shared memory with tiles where thid_y enumerates
        // image identifiers (entries) and thid_x denotes feature
        // coordinates (pixels). cache_x corresponds to x and
        // cache_y to y where Cov[x,y] is the pairwise covariance
        cache_x[thid_y][thid_x] = valid_row*valid_col_x ?
                                  Data[row*num_features+col_x] : 0;
        cache_y[thid_y][thid_x] = valid_row*valid_col_y ?
                                  Data[row*num_features+col_y] : 0;

        // this is needed to ensure that all threads finished writing
        // shared memory
        __syncthreads();

        // optional early exit: -100ms
        if (x <= y)
            // here we actually evaluate the scalar product
            for (index_t entry = 0; entry < chunk_size; entry++)
                accum += cache_y[entry][thid_y]*cache_x[entry][thid_x];

        // this is needed to ensure that shared memory can be over-
        // written again in the next iteration
        __syncthreads();
    }

    // since Cov[x,y] = Cov[y,x] we only compute one entry
    if (y < num_features && x <= y)
        Cov[y*num_features+x] = Cov[x*num_features+y] = accum; //num_entries;
}

template <uint64_t num_gpus, uint64_t num_streams>
struct partition_t {

    static constexpr uint64_t num_slots = num_gpus*num_streams;

    uint64_t displs[num_slots];
    uint64_t counts[num_slots];

    partition_t(uint64_t length) {

        uint64_t batch_size = (length+num_slots-1)/num_slots;

        for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
            for (uint64_t stream = 0; stream < num_streams; stream++) {
                const uint64_t slot  = gpu*num_streams+stream;
                const uint64_t lower = slot*batch_size;
                const uint64_t upper = std::min(lower+batch_size, length);
                const uint64_t count = upper-lower;

                displs[slot] = lower;
                counts[slot] = count;
            }
        }
    }

    uint64_t get_count(uint64_t gpu, uint64_t stream) {
        return counts[gpu*num_streams+stream];
    }

    uint64_t get_displ(uint64_t gpu, uint64_t stream) {
        return displs[gpu*num_streams+stream];
    }
};

int main (int argc, char * argv[]) {

    TIMERSTART(init)

    // 202599 grayscale images each of shape 55 x 45
    constexpr uint64_t imgs = 202599, rows = 55, cols = 45;

    // pointer for data matrix and mean vector
    float * data = nullptr;
    cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);                   CUERR
    load_binary(data, imgs*rows*cols, "./celebA_centered.bin");

    constexpr uint64_t num_gpus = 2, num_streams = 4;
    partition_t<num_gpus, num_streams> part(imgs);

    float * Data[num_gpus][num_streams],
          * Cov [num_gpus][num_streams],
          * cov [num_gpus][num_streams];
    cudaStream_t streams[num_gpus][num_streams];

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            const uint64_t entries = part.get_count(gpu, stream);
            cudaStreamCreate(&streams[gpu][stream]);
            cudaMalloc    (&Data[gpu][stream], sizeof(float)*entries*rows*cols);
            cudaMalloc    (&Cov [gpu][stream], sizeof(float)*rows*cols*rows*cols);
            cudaMallocHost(&cov [gpu][stream], sizeof(float)*rows*cols*rows*cols);
            cudaMemset    (Data[gpu][stream], 0, sizeof(float)*entries*rows*cols);
            cudaMemset    (Cov [gpu][stream], 0, sizeof(float)*rows*cols*rows*cols);
            cudaMemset    (cov [gpu][stream], 0, sizeof(float)*rows*cols*rows*cols);
        }
    } CUERR

    TIMERSTOP(init)
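The part between scaffold head and scaffold foot is left to the submission: the chunked transfers and kernel launches that overlap copies and compute across the two GPUs and the four streams per GPU. A minimal sketch of such a middle section is given below. It only reuses names defined in the scaffold (data, Data, Cov, cov, part, streams, shared_covariance_kernel, SDIV, rows, cols); the timer label and the 32x32 launch configuration are assumptions for illustration, not the reference solution.

    // --- sketch of a possible middle section (not the reference solution) ---
    TIMERSTART(compute)

    const uint64_t num_features = rows*cols;

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {

            const uint64_t entries = part.get_count(gpu, stream);
            const uint64_t offset  = part.get_displ(gpu, stream);
            const cudaStream_t queue = streams[gpu][stream];

            // asynchronously stage this slot's slice of the centered images
            cudaMemcpyAsync(Data[gpu][stream],
                            data + offset*num_features,
                            sizeof(float)*entries*num_features,
                            cudaMemcpyHostToDevice, queue);

            // one 32x32 thread block per covariance tile (chunk_size = 32)
            const dim3 blocks(SDIV(num_features, 32), SDIV(num_features, 32));
            const dim3 threads(32, 32);
            shared_covariance_kernel<<<blocks, threads, 0, queue>>>(
                Data[gpu][stream], Cov[gpu][stream], entries, num_features);

            // copy the unnormalized partial covariance back to pinned memory
            cudaMemcpyAsync(cov[gpu][stream], Cov[gpu][stream],
                            sizeof(float)*num_features*num_features,
                            cudaMemcpyDeviceToHost, queue);
        }
    } CUERR

    // wait for all streams on all GPUs before the host-side reduction
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        cudaDeviceSynchronize();
    } CUERR

    TIMERSTOP(compute)

Each slot copies its slice of images to the device, computes an unnormalized partial covariance matrix, and copies it back to pinned memory; the scaffold foot then accumulates the partial matrices and divides by the number of images before comparing against the reference covariance.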
Scaffold Foot
    TIMERSTART(postprocessing)

    float * result = nullptr, * truth = nullptr;
    cudaMallocHost(&result, sizeof(float)*rows*cols*rows*cols);            CUERR
    cudaMallocHost(&truth,  sizeof(float)*rows*cols*rows*cols);            CUERR
    cudaMemset(result, 0, sizeof(float)*rows*cols*rows*cols);              CUERR
    load_binary(truth, rows*cols*rows*cols, "./celebA_covariance.bin");

    # pragma omp parallel
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++)
        for (uint64_t stream = 0; stream < num_streams; stream++)
            # pragma omp for
            for (uint64_t i = 0; i < rows*cols*rows*cols; i++)
                result[i] += cov[gpu][stream][i];

    for (uint64_t i = 0; i < rows*cols*rows*cols; i++) {
        const auto res = result[i]/imgs - truth[i];
        if (res*res > 1000) {
            std::cout << "ERROR: " << result[i]/imgs << " " << truth[i] << " "
                      << (i % (cols*rows)) << " "
                      << (i / (cols*rows)) << std::endl;
            break;
        }
    }

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            cudaFree    (Data[gpu][stream]);
            cudaFree    (Cov [gpu][stream]);
            cudaFreeHost(cov [gpu][stream]);
            cudaStreamSynchronize(streams[gpu][stream]);
            cudaStreamDestroy(streams[gpu][stream]);
        }
    } CUERR

    cudaFreeHost(result);                                                  CUERR
    cudaFreeHost(truth);
    cudaFreeHost(data);                                                    CUERR

    TIMERSTOP(postprocessing)

    std::cout << "CUDA programming is fun!" << std::endl;
}
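Scaffold head, submitted middle section, and scaffold foot together form one CUDA C++ translation unit. Assuming an nvcc/GCC toolchain, an invocation along the lines of nvcc -O3 -std=c++14 -Xcompiler -fopenmp main.cu -o covariance (the source file name is assumed here) should build it, since the foot uses OpenMP for the host-side accumulation; the files ./celebA_centered.bin and ./celebA_covariance.bin must be present in the working directory. A correct run prints no ERROR line and ends with the expected output checked by the test below.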
Start time: Thu 12 Apr 2018 16:11:00
End time: Fri 01 Mar 2019 12:00:00
General test timeout: 10.0 seconds
Tests
Comment prefix: #
Given input:
Expected output: CUDA programming is fun!