Why is the second pass over a vector larger than the cache faster?

Question

Why is the second pass over a vector larger than the cache faster?

I am playing around with "memory bandwidth". Let's forget about this term, which is not well defined, and concentrate on the following code:

#include <iostream>
#include <chrono>
#include <vector>

int main() {
  auto size = std::vector<std::size_t>{
      1024 / sizeof(std::size_t),                  // 1kB
      (1024 * 1024) / sizeof(std::size_t),         // 1MB
      (1024 * 1024 * 10) / sizeof(std::size_t),    // 10 MB
      (1024 * 1024 * 100) / sizeof(std::size_t),   // 100 MB
      (1024 * 1024 * 1024) / sizeof(std::size_t),  // 1GB
      (1024 * 1024 * 2047) / sizeof(std::size_t)}; // 2GB
  auto size_name = std::vector<std::string>{
      "1 KB", "1 MB", "10 MB", "100 MB", "1 GB", "2 GB"};
  auto check = std::size_t{0};

  for (auto i = std::size_t{0}; i < size.size(); ++i) {
    auto v = new std::size_t[size[i]];

    auto date_start_cold = std::chrono::high_resolution_clock::now();
    for (auto k = std::size_t{0}; k < size[i]; ++k) {
      check += v[k];
    }
    auto date_end_cold = std::chrono::high_resolution_clock::now();
    auto time_cold = std::chrono::duration_cast<std::chrono::nanoseconds>(
        date_end_cold - date_start_cold).count();

    auto date_start_warm = std::chrono::high_resolution_clock::now();
    for (auto k = std::size_t{0}; k < size[i]; ++k) {
      check += v[k];
    }
    auto date_end_warm = std::chrono::high_resolution_clock::now();
    auto time_warm = std::chrono::duration_cast<std::chrono::nanoseconds>(
        date_end_warm - date_start_warm).count();
    std::cout << "Bandwith for reading n = " << size_name[i] << ": "
        << (size[i] * sizeof(std::size_t)) / static_cast<double>(time_cold)
        << " GB/s (cold), "
        << (size[i] * sizeof(std::size_t)) / static_cast<double>(time_warm)
        << " GB/s (warm)" << std::endl;

    delete[] v;
  }

  std::cout << "Check: " << check << std::endl;
  std::cout << std::endl;

  for (auto i = std::size_t{0}; i < size.size(); ++i) {
    auto v = new std::size_t[size[i]];

    auto date_start_cold = std::chrono::high_resolution_clock::now();
    for (auto k = std::size_t{0}; k < size[i]; ++k) {
      v[k] = k;
    }
    auto date_end_cold = std::chrono::high_resolution_clock::now();
    auto time_cold = std::chrono::duration_cast<std::chrono::nanoseconds>(
        date_end_cold - date_start_cold).count();

    auto date_start_warm = std::chrono::high_resolution_clock::now();
    for (auto k = std::size_t{0}; k < size[i]; ++k) {
      v[k] = k;
    }
    auto date_end_warm = std::chrono::high_resolution_clock::now();
    auto time_warm = std::chrono::duration_cast<std::chrono::nanoseconds>(
        date_end_warm - date_start_warm).count();
    std::cout << "Bandwith for writing n = " << size_name[i] << ": "
        << (size[i] * sizeof(std::size_t)) / static_cast<double>(time_cold)
        << " GB/s (cold), "
        << (size[i] * sizeof(std::size_t)) / static_cast<double>(time_warm)
        << " GB/s (warm)" << std::endl;

    check += v[size[i] - 1];
    delete[] v;
  }

  std::cout << "Check: " << check << std::endl;

  return 0;
}

My L3 cache is only 6MB. So I didn't expect a second pass over a 1GB vector to be faster. In the idea that I have a cache, when the last element of a 1GB vector is read, the first element of the vector is no longer in the cache. So I expect the cache to be "cold" when reading the first element again. But here are the results I get on my MacBook Air:

Bandwith for reading n = 1 KB: 2.37037 GB/s (cold), 6.2439 GB/s (warm)
Bandwith for reading n = 1 MB: 1.53677 GB/s (cold), 16.1215 GB/s (warm)
Bandwith for reading n = 10 MB: 1.9973 GB/s (cold), 8.19585 GB/s (warm)
Bandwith for reading n = 100 MB: 2.7617 GB/s (cold), 13.675 GB/s (warm)
Bandwith for reading n = 1 GB: 2.86934 GB/s (cold), 16.1397 GB/s (warm)
Bandwith for reading n = 2 GB: 2.79094 GB/s (cold), 14.2438 GB/s (warm)

Bandwith for writing n = 1 KB: 0.392789 GB/s (cold), 9.94175 GB/s (warm)
Bandwith for writing n = 1 MB: 5.74285 GB/s (cold), 16.2936 GB/s (warm)
Bandwith for writing n = 10 MB: 5.94669 GB/s (cold), 9.37043 GB/s (warm)
Bandwith for writing n = 100 MB: 7.02979 GB/s (cold), 9.08043 GB/s (warm)
Bandwith for writing n = 1 GB: 2.5859 GB/s (cold), 8.64279 GB/s (warm)
Bandwith for writing n = 2 GB: 2.34798 GB/s (cold), 8.99719 GB/s (warm)