Improve codegen for deque.

This patch rewrites a few loops in deque and split_buffer to better
optimize the codegen. For constructors like
`deque<unsigned char> d(500000, 0);` this patch results in a 2x speedup.

The patch improves the codegen  in roughly three ways:

1. Changes do { ... } while (...) loops into more typical for loops.
  The optimizer can reason about normal looking loops better.

2. Split the iteration over a range into (A) iteration over the blocks,
then (B) iteration within the block. This nested structure helps LLVM
lower the inner loop to `memset`.

3. Do fewer things each iteration. Some of these loops were incrementing
  or changing 4-5 variables every loop (in addition to the
  construction). Previously most loops would increment the end pointer,
  the size, and decrement the count of remaining items to construct.
  Now we only increment a single pointer for most iterations.

git-svn-id: https://llvm.org/svn/llvm-project/libcxx/trunk@368547 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/benchmarks/ContainerBenchmarks.hpp b/benchmarks/ContainerBenchmarks.hpp
index 648617a..581d3c5 100644
--- a/benchmarks/ContainerBenchmarks.hpp
+++ b/benchmarks/ContainerBenchmarks.hpp
@@ -1,8 +1,18 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #ifndef BENCHMARK_CONTAINER_BENCHMARKS_HPP
 #define BENCHMARK_CONTAINER_BENCHMARKS_HPP
 
 #include <cassert>
 
+#include "Utilities.hpp"
 #include "benchmark/benchmark.h"
 
 namespace ContainerBenchmarks {
@@ -12,7 +22,7 @@
   auto size = st.range(0);
   for (auto _ : st) {
     Container c(size);
-    benchmark::DoNotOptimize(c.data());
+    DoNotOptimizeData(c);
   }
 }
 
@@ -21,7 +31,7 @@
   const auto size = st.range(0);
   for (auto _ : st) {
     Container c(size, val);
-    benchmark::DoNotOptimize(c.data());
+    DoNotOptimizeData(c);
   }
 }
 
@@ -33,7 +43,7 @@
     benchmark::DoNotOptimize(&in);
     while (st.KeepRunning()) {
         Container c(begin, end);
-        benchmark::DoNotOptimize(c.data());
+        DoNotOptimizeData(c);
     }
 }
 
diff --git a/benchmarks/Utilities.hpp b/benchmarks/Utilities.hpp
new file mode 100644
index 0000000..1b25f74
--- /dev/null
+++ b/benchmarks/Utilities.hpp
@@ -0,0 +1,33 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BENCHMARK_UTILITIES_HPP
+#define BENCHMARK_UTILITIES_HPP
+
+#include <cassert>
+#include <type_traits>
+
+#include "benchmark/benchmark.h"
+
+namespace UtilitiesInternal {
+  template <class Container>
+  auto HaveDataImpl(int) -> decltype((std::declval<Container&>().data(), std::true_type{}));
+  template <class Container>
+  auto HaveDataImpl(long) -> std::false_type;
+  template <class T>
+  using HasData = decltype(HaveDataImpl<T>(0));
+} // namespace UtilitiesInternal
+
+template <class Container, std::enable_if_t<UtilitiesInternal::HasData<Container>::value>* = nullptr>
+void DoNotOptimizeData(Container &c) { benchmark::DoNotOptimize(c.data()); }
+template <class Container, std::enable_if_t<!UtilitiesInternal::HasData<Container>::value>* = nullptr>
+void DoNotOptimizeData(Container &c) { benchmark::DoNotOptimize(&c); }
+
+
+#endif // BENCHMARK_UTILITIES_HPP
diff --git a/benchmarks/deque.bench.cpp b/benchmarks/deque.bench.cpp
new file mode 100644
index 0000000..44a492b
--- /dev/null
+++ b/benchmarks/deque.bench.cpp
@@ -0,0 +1,47 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <deque>
+
+#include "benchmark/benchmark.h"
+
+#include "ContainerBenchmarks.hpp"
+#include "GenerateInput.hpp"
+
+using namespace ContainerBenchmarks;
+
+constexpr std::size_t TestNumInputs = 1024;
+
+BENCHMARK_CAPTURE(BM_ConstructSize,
+    deque_byte,
+    std::deque<unsigned char>{})->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_ConstructSizeValue,
+    deque_byte,
+    std::deque<unsigned char>{}, 0)->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_ConstructIterIter,
+  deque_char,
+  std::deque<char>{},
+  getRandomIntegerInputs<char>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_ConstructIterIter,
+  deque_size_t,
+  std::deque<size_t>{},
+  getRandomIntegerInputs<size_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_ConstructIterIter,
+  deque_string,
+  std::deque<std::string>{},
+  getRandomStringInputs)->Arg(TestNumInputs);
+
+
+
+
+BENCHMARK_MAIN();
diff --git a/include/__split_buffer b/include/__split_buffer
index 1daa4e5..095fe89 100644
--- a/include/__split_buffer
+++ b/include/__split_buffer
@@ -161,6 +161,19 @@
     _LIBCPP_INLINE_VISIBILITY
     void __move_assign_alloc(__split_buffer&, false_type) _NOEXCEPT
         {}
+
+    struct _ConstructTransaction {
+      explicit _ConstructTransaction(pointer* __p, size_type __n) _NOEXCEPT
+      : __pos_(*__p), __end_(*__p + __n), __dest_(__p) {
+      }
+      ~_ConstructTransaction() {
+        *__dest_ = __pos_;
+      }
+      pointer __pos_;
+     const pointer __end_;
+    private:
+     pointer *__dest_;
+    };
 };
 
 template <class _Tp, class _Allocator>
@@ -197,13 +210,10 @@
 void
 __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n)
 {
-    __alloc_rr& __a = this->__alloc();
-    do
-    {
-        __alloc_traits::construct(__a, _VSTD::__to_raw_pointer(this->__end_));
-        ++this->__end_;
-        --__n;
-    } while (__n > 0);
+    _ConstructTransaction __tx(&this->__end_, __n);
+    for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
+        __alloc_traits::construct(this->__alloc(), _VSTD::__to_raw_pointer(__tx.__pos_));
+    }
 }
 
 //  Copy constructs __n objects starting at __end_ from __x
@@ -216,13 +226,11 @@
 void
 __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x)
 {
-    __alloc_rr& __a = this->__alloc();
-    do
-    {
-        __alloc_traits::construct(__a, _VSTD::__to_raw_pointer(this->__end_), __x);
-        ++this->__end_;
-        --__n;
-    } while (__n > 0);
+    _ConstructTransaction __tx(&this->__end_, __n);
+    for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
+        __alloc_traits::construct(this->__alloc(),
+            _VSTD::__to_raw_pointer(__tx.__pos_), __x);
+    }
 }
 
 template <class _Tp, class _Allocator>
@@ -262,11 +270,10 @@
 >::type
 __split_buffer<_Tp, _Allocator>::__construct_at_end(_ForwardIterator __first, _ForwardIterator __last)
 {
-    __alloc_rr& __a = this->__alloc();
-    for (; __first != __last; ++__first)
-    {
-        __alloc_traits::construct(__a, _VSTD::__to_raw_pointer(this->__end_), *__first);
-        ++this->__end_;
+    _ConstructTransaction __tx(&this->__end_, std::distance(__first, __last));
+    for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_, ++__first) {
+        __alloc_traits::construct(this->__alloc(),
+            _VSTD::__to_raw_pointer(__tx.__pos_), *__first);
     }
 }
 
diff --git a/include/deque b/include/deque
index fe988d0..cb7e4e5 100644
--- a/include/deque
+++ b/include/deque
@@ -956,6 +956,74 @@
     typedef __deque_iterator<value_type, const_pointer, const_reference, __map_const_pointer,
                              difference_type>    const_iterator;
 
+    struct __deque_block_range {
+      explicit __deque_block_range(pointer __b, pointer __e) _NOEXCEPT : __begin_(__b), __end_(__e) {}
+      const pointer __begin_;
+      const pointer __end_;
+    };
+
+    struct __deque_range {
+      iterator __pos_;
+      const iterator __end_;
+
+      __deque_range(iterator __pos, iterator __e) _NOEXCEPT
+        : __pos_(__pos), __end_(__e) {}
+
+      explicit operator bool() const _NOEXCEPT {
+        return __pos_ != __end_;
+      }
+
+      __deque_range begin() const {
+        return *this;
+      }
+
+      __deque_range end() const {
+        return __deque_range(__end_, __end_);
+      }
+      __deque_block_range operator*() const _NOEXCEPT {
+         if (__pos_.__m_iter_ == __end_.__m_iter_) {
+          return __deque_block_range(__pos_.__ptr_, __end_.__ptr_);
+        }
+        return __deque_block_range(__pos_.__ptr_, *__pos_.__m_iter_ + __block_size);
+      }
+
+      __deque_range& operator++() _NOEXCEPT {
+        if (__pos_.__m_iter_ == __end_.__m_iter_) {
+          __pos_ = __end_;
+        } else {
+          ++__pos_.__m_iter_;
+          __pos_.__ptr_ = *__pos_.__m_iter_;
+        }
+        return *this;
+      }
+
+
+      friend bool operator==(__deque_range const& __lhs, __deque_range const& __rhs) {
+        return __lhs.__pos_ == __rhs.__pos_;
+      }
+      friend bool operator!=(__deque_range const& __lhs, __deque_range const& __rhs) {
+        return !(__lhs == __rhs);
+      }
+    };
+
+
+
+    struct _ConstructTransaction {
+      _ConstructTransaction(__deque_base* __db, __deque_block_range& __r)
+        : __pos_(__r.__begin_), __end_(__r.__end_), __begin_(__r.__begin_), __base_(__db) {}
+
+
+      ~_ConstructTransaction() {
+        __base_->size() += (__pos_ - __begin_);
+      }
+
+      pointer __pos_;
+      const pointer __end_;
+    private:
+      const pointer __begin_;
+      __deque_base * const __base_;
+    };
+
 protected:
     __map __map_;
     size_type __start_;
@@ -1222,6 +1290,10 @@
     typedef _VSTD::reverse_iterator<iterator>       reverse_iterator;
     typedef _VSTD::reverse_iterator<const_iterator> const_reverse_iterator;
 
+    using typename __base::__deque_range;
+    using typename __base::__deque_block_range;
+    using typename __base::_ConstructTransaction;
+
     // construct/copy/destroy:
     _LIBCPP_INLINE_VISIBILITY
     deque()
@@ -2299,8 +2371,12 @@
     if (__n > __back_capacity)
         __add_back_capacity(__n - __back_capacity);
     // __n <= __back_capacity
-    for (iterator __i = __base::end(); __f != __l; ++__i, (void) ++__f, ++__base::size())
-        __alloc_traits::construct(__a, _VSTD::addressof(*__i), *__f);
+    for (__deque_block_range __br : __deque_range(__base::end(), __base::end() + __n)) {
+      _ConstructTransaction __tx(this, __br);
+      for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_, (void)++__f) {
+        __alloc_traits::construct(__a, std::__to_raw_pointer(__tx.__pos_), *__f);
+      }
+    }
 }
 
 template <class _Tp, class _Allocator>
@@ -2312,8 +2388,12 @@
     if (__n > __back_capacity)
         __add_back_capacity(__n - __back_capacity);
     // __n <= __back_capacity
-    for (iterator __i = __base::end(); __n; --__n, ++__i, ++__base::size())
-        __alloc_traits::construct(__a, _VSTD::addressof(*__i));
+    for (__deque_block_range __br : __deque_range(__base::end(), __base::end() + __n)) {
+      _ConstructTransaction __tx(this, __br);
+      for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
+        __alloc_traits::construct(__a, std::__to_raw_pointer(__tx.__pos_));
+      }
+    }
 }
 
 template <class _Tp, class _Allocator>
@@ -2325,8 +2405,13 @@
     if (__n > __back_capacity)
         __add_back_capacity(__n - __back_capacity);
     // __n <= __back_capacity
-    for (iterator __i = __base::end(); __n; --__n, ++__i, ++__base::size())
-        __alloc_traits::construct(__a, _VSTD::addressof(*__i), __v);
+    for (__deque_block_range __br : __deque_range(__base::end(), __base::end() + __n)) {
+      _ConstructTransaction __tx(this, __br);
+      for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
+        __alloc_traits::construct(__a, std::__to_raw_pointer(__tx.__pos_), __v);
+      }
+    }
+
 }
 
 // Create front capacity for one block of elements.