Lab 6
Approach I
Code
// Upload all of B once; every stream's kernel reads the whole B matrix.
cudaMemcpy(gpu_b, b, sizeof(int) * size * size, cudaMemcpyHostToDevice);
for (i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    // Depth-first issue order: each stream enqueues its H2D copy of A's
    // chunk, its kernel launch, and its D2H copy of C's chunk in turn.
    cudaMemcpyAsync(&gpu_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
    matrix_multiplication<<<dimGrid, dimBlock, 0, stream[i]>>>(gpu_a, gpu_b, gpu_c, i);
    cudaMemcpyAsync(&c[offset], &gpu_c[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
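For reference, here is a minimal sketch of the setup these snippets rely on. The allocation code is not shown in the lab, so the concrete values (size, nStreams) and the use of pinned host memory are assumptions; pinned (page-locked) host buffers are what allow cudaMemcpyAsync to actually overlap with kernel execution.

// Hypothetical setup (assumed, not from the lab code).
const int size = 512;                               // assumed matrix dimension
const int nStreams = 4;                             // number of CUDA streams
const int streamSize  = size * size / nStreams;     // elements per stream chunk
const int streamBytes = streamSize * sizeof(int);   // bytes per stream chunk

int *a, *b, *c;                  // host matrices (flattened)
int *gpu_a, *gpu_b, *gpu_c;      // device matrices
cudaMallocHost((void **)&a, sizeof(int) * size * size);  // pinned host memory
cudaMallocHost((void **)&b, sizeof(int) * size * size);
cudaMallocHost((void **)&c, sizeof(int) * size * size);
cudaMalloc((void **)&gpu_a, sizeof(int) * size * size);
cudaMalloc((void **)&gpu_b, sizeof(int) * size * size);
cudaMalloc((void **)&gpu_c, sizeof(int) * size * size);

cudaStream_t stream[nStreams];
for (int i = 0; i < nStreams; ++i)
    cudaStreamCreate(&stream[i]);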
Result
nStreams = 1
:time is 92.022301 ms
c[451][451]=208282624
nStreams = 4
:time is 91.575935 ms
c[451][451]=208282624
nStreams = 16
:time is 91.250397 ms
c[451][451]=208282624
Approach II
Code
// Upload all of B once, as in Approach I.
cudaMemcpy(gpu_b, b, sizeof(int) * size * size, cudaMemcpyHostToDevice);
// Breadth-first issue order: first enqueue every stream's H2D copy of A...
for (i = 0; i < nStreams; ++i)
{
    int offset = i * streamSize;
    cudaMemcpyAsync(&gpu_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
}
// ...then every kernel launch...
for (i = 0; i < nStreams; ++i)
{
    matrix_multiplication<<<dimGrid, dimBlock, 0, stream[i]>>>(gpu_a, gpu_b, gpu_c, i);
}
// ...and finally every stream's D2H copy of its chunk of C.
for (i = 0; i < nStreams; ++i)
{
    int offset = i * streamSize;
    cudaMemcpyAsync(&c[offset], &gpu_c[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
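The reported times were presumably measured around the whole issue-and-wait sequence. One common way to do that is with CUDA events; a sketch under that assumption follows (the lab's actual timing code is not shown):

// Hypothetical timing harness (assumed).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
// ... issue the copies and kernel launches as above ...
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // block until all preceding GPU work is done

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf(":time is %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);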
Result
nStreams = 1
:time is 92.028160 ms
c[451][451]=208282624
nStreams = 4
:time is 91.512543 ms
c[451][451]=208282624
nStreams = 16
:time is 90.706047 ms
c[451][451]=208282624
Observation
- More streams shorten the execution time. This is because more operations (memcopies and kernels) can be overlapped when we have more streams, saving some time on the overall execution.
- Approach II is faster than Approach I for nStreams = 4 and nStreams = 16 because we no longer queue each stream's device-to-host copy behind its kernel before the next stream gets any work; we can dispatch all the host-to-device memcopy calls up front, so one stream's upload can overlap another stream's kernel.
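One caveat that applies to both approaches: cudaMemcpyAsync returns before the data has actually arrived, so the host must synchronize before reading c. A minimal sketch, assuming the stream array from the setup above and a flat layout for c:

// Wait for every stream's D2H copy to finish before touching c on the host.
for (int i = 0; i < nStreams; ++i)
    cudaStreamSynchronize(stream[i]);   // or a single cudaDeviceSynchronize()

printf("c[451][451]=%d\n", c[451 * size + 451]);   // assumed flat indexing

// Release the streams once done.
for (int i = 0; i < nStreams; ++i)
    cudaStreamDestroy(stream[i]);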