Why is the second pass over a vector larger than the cache faster?
I am experimenting with "memory bandwidth". Let's set that term aside — it is not well defined — and concentrate on the following code:
#include <chrono>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
// Micro-benchmark: time a "cold" (first) and a "warm" (second) sequential
// pass over heap buffers of increasing size, for both reads and writes, and
// print the apparent bandwidth of each pass.
//
// Bandwidth is reported as bytes / nanoseconds, which numerically equals
// GB/s with GB = 10^9 bytes.
int main() {
    // Element counts for each buffer; must stay in sync with the label list
    // below.  Computed in std::size_t so the multi-GB entries cannot
    // overflow int arithmetic (1024 * 1024 * 2047 is within ~0.05% of
    // INT_MAX when evaluated as int).
    const auto size = std::vector<std::size_t>{
        std::size_t{1024} / sizeof(std::size_t),                  // 1 kB
        (std::size_t{1024} * 1024) / sizeof(std::size_t),         // 1 MB
        (std::size_t{1024} * 1024 * 10) / sizeof(std::size_t),    // 10 MB
        (std::size_t{1024} * 1024 * 100) / sizeof(std::size_t),   // 100 MB
        (std::size_t{1024} * 1024 * 1024) / sizeof(std::size_t),  // 1 GB
        (std::size_t{1024} * 1024 * 2047) / sizeof(std::size_t)}; // 2 GB
    const auto size_name = std::vector<std::string>{
        "1 KB", "1 MB", "10 MB", "100 MB", "1 GB", "2 GB"};
    // Accumulator printed at the end so the compiler cannot elide the loops.
    auto check = std::size_t{0};
    // Runs `pass` once and returns the elapsed wall time in nanoseconds.
    const auto time_ns = [](const auto& pass) {
        const auto start = std::chrono::high_resolution_clock::now();
        pass();
        const auto end = std::chrono::high_resolution_clock::now();
        return std::chrono::duration_cast<std::chrono::nanoseconds>(
                   end - start).count();
    };
    // Prints one result line; bytes / ns == GB/s (decimal GB).
    const auto report = [&](const char* op, std::size_t i,
                            long long time_cold, long long time_warm) {
        std::cout << "Bandwidth for " << op << " n = " << size_name[i] << ": "
                  << (size[i] * sizeof(std::size_t)) /
                         static_cast<double>(time_cold)
                  << " GB/s (cold), "
                  << (size[i] * sizeof(std::size_t)) /
                         static_cast<double>(time_warm)
                  << " GB/s (warm)" << std::endl;
    };
    // --- Read benchmark ----------------------------------------------------
    for (auto i = std::size_t{0}; i < size.size(); ++i) {
        // Deliberately NOT value-initialized: zeroing the buffer up front
        // would touch every page and hide the demand-paging cost that the
        // cold pass is meant to expose.  NOTE(review): reading indeterminate
        // values is formally UB — acceptable only in a throwaway benchmark.
        const auto v =
            std::unique_ptr<std::size_t[]>(new std::size_t[size[i]]);
        const auto read_pass = [&] {
            for (auto k = std::size_t{0}; k < size[i]; ++k) {
                check += v[k];
            }
        };
        // Cold pass: includes the cost of mapping freshly allocated pages.
        const auto time_cold = time_ns(read_pass);
        // Warm pass: pages are resident; caches may also help at small sizes.
        const auto time_warm = time_ns(read_pass);
        report("reading", i, time_cold, time_warm);
    }
    std::cout << "Check: " << check << std::endl;
    std::cout << std::endl;
    // --- Write benchmark ---------------------------------------------------
    for (auto i = std::size_t{0}; i < size.size(); ++i) {
        const auto v =
            std::unique_ptr<std::size_t[]>(new std::size_t[size[i]]);
        const auto write_pass = [&] {
            for (auto k = std::size_t{0}; k < size[i]; ++k) {
                v[k] = k;
            }
        };
        const auto time_cold = time_ns(write_pass);
        const auto time_warm = time_ns(write_pass);
        report("writing", i, time_cold, time_warm);
        // Fold one written value into `check` so the stores are observable.
        check += v[size[i] - 1];
    }
    std::cout << "Check: " << check << std::endl;
    return 0;
}
My L3 cache is only 6 MB, so I didn't expect a second pass over a 1 GB vector to be faster. My mental model of the cache is: by the time the last element of a 1 GB vector has been read, the first elements have long since been evicted, so the cache should be "cold" again when the first element is re-read. Yet here are the results I get on my MacBook Air:
Bandwith for reading n = 1 KB: 2.37037 GB/s (cold), 6.2439 GB/s (warm)
Bandwith for reading n = 1 MB: 1.53677 GB/s (cold), 16.1215 GB/s (warm)
Bandwith for reading n = 10 MB: 1.9973 GB/s (cold), 8.19585 GB/s (warm)
Bandwith for reading n = 100 MB: 2.7617 GB/s (cold), 13.675 GB/s (warm)
Bandwith for reading n = 1 GB: 2.86934 GB/s (cold), 16.1397 GB/s (warm)
Bandwith for reading n = 2 GB: 2.79094 GB/s (cold), 14.2438 GB/s (warm)
Bandwith for writing n = 1 KB: 0.392789 GB/s (cold), 9.94175 GB/s (warm)
Bandwith for writing n = 1 MB: 5.74285 GB/s (cold), 16.2936 GB/s (warm)
Bandwith for writing n = 10 MB: 5.94669 GB/s (cold), 9.37043 GB/s (warm)
Bandwith for writing n = 100 MB: 7.02979 GB/s (cold), 9.08043 GB/s (warm)
Bandwith for writing n = 1 GB: 2.5859 GB/s (cold), 8.64279 GB/s (warm)
Bandwith for writing n = 2 GB: 2.34798 GB/s (cold), 8.99719 GB/s (warm)
Does anyone have an explanation for this behavior?
+3
source to share
No one has answered this question yet
See similar questions:
or similar: