Skip to content

Commit c8ab083

Browse files
committed
1. Adding stream API for non temporal data transfers
2. Adding xsimd::fence as a wrapper around std atomic for cache coherence 3. Adding tests
1 parent c3a8d37 commit c8ab083

10 files changed

Lines changed: 350 additions & 0 deletions

File tree

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,12 @@ namespace xsimd
292292
return load_unaligned(mem, b, A {});
293293
}
294294

295+
template <class A, class T>
296+
XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
297+
{
298+
return load_aligned(mem, b, A {});
299+
}
300+
295301
// load_aligned
296302
namespace detail
297303
{
@@ -438,6 +444,12 @@ namespace xsimd
438444
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
439445
}
440446

447+
template <class A, class T_in, class T_out>
448+
XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
449+
{
450+
return load_aligned<A>(mem, cvt, A {});
451+
}
452+
441453
// rotate_right
442454
template <size_t N, class A, class T>
443455
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -679,6 +691,12 @@ namespace xsimd
679691
mem[i] = bool(buffer[i]);
680692
}
681693

694+
template <class A, class T>
695+
XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
696+
{
697+
store(self, mem, A {});
698+
}
699+
682700
// store_aligned
683701
template <class A, class T_in, class T_out>
684702
XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -697,6 +715,12 @@ namespace xsimd
697715
return store_aligned<A>(mem, self, common {});
698716
}
699717

718+
template <class A, class T_in, class T_out>
719+
XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
720+
{
721+
store_aligned<A>(mem, self, A {});
722+
}
723+
700724
// swizzle
701725
template <class A, class T, class ITy, ITy... Vs>
702726
XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -778,6 +802,12 @@ namespace xsimd
778802
return detail::load_complex(hi, lo, A {});
779803
}
780804

805+
template <class A, class T_out, class T_in>
806+
XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
807+
{
808+
return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
809+
}
810+
781811
// store_complex_aligned
782812
template <class A, class T_out, class T_in>
783813
XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -802,6 +832,12 @@ namespace xsimd
802832
hi.store_unaligned(buffer + real_batch::size);
803833
}
804834

835+
template <class A, class T_out, class T_in>
836+
XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
837+
{
838+
store_complex_aligned<A>(dst, src, A {});
839+
}
840+
805841
// transpose
806842
template <class A, class T>
807843
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,23 @@ namespace xsimd
15151515
return _mm256_storeu_pd(mem, self);
15161516
}
15171517

1518+
// store_stream
1519+
template <class A>
1520+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1521+
{
1522+
_mm256_stream_ps(mem, self);
1523+
}
1524+
template <class A>
1525+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1526+
{
1527+
_mm256_stream_pd(mem, self);
1528+
}
1529+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1530+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1531+
{
1532+
_mm256_stream_si256((__m256i*)mem, self);
1533+
}
1534+
15181535
// sub
15191536
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
15201537
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,23 @@ namespace xsimd
229229
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
230230
}
231231

232+
// load_stream
233+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
234+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
235+
{
236+
return _mm256_stream_load_si256((__m256i const*)mem);
237+
}
238+
template <class A>
239+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
240+
{
241+
return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
242+
}
243+
template <class A>
244+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
245+
{
246+
return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
247+
}
248+
232249
// bitwise_and
233250
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
234251
XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,6 +1513,23 @@ namespace xsimd
15131513
return _mm512_loadu_pd(mem);
15141514
}
15151515

1516+
// load_stream
1517+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1518+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1519+
{
1520+
return _mm512_stream_load_si512((__m512i*)mem);
1521+
}
1522+
template <class A>
1523+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1524+
{
1525+
return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
1526+
}
1527+
template <class A>
1528+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1529+
{
1530+
return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
1531+
}
1532+
15161533
// lt
15171534
template <class A>
15181535
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2285,6 +2302,23 @@ namespace xsimd
22852302
return _mm512_storeu_pd(mem, self);
22862303
}
22872304

2305+
// store_stream
2306+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2307+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
2308+
{
2309+
_mm512_stream_si512((__m512i*)mem, self);
2310+
}
2311+
template <class A>
2312+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
2313+
{
2314+
_mm512_stream_ps(mem, self);
2315+
}
2316+
template <class A>
2317+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
2318+
{
2319+
_mm512_stream_pd(mem, self);
2320+
}
2321+
22882322
// sub
22892323
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
22902324
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1927,6 +1927,23 @@ namespace xsimd
19271927
return _mm_storeu_pd(mem, self);
19281928
}
19291929

1930+
// store_stream
1931+
template <class A>
1932+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1933+
{
1934+
_mm_stream_ps(mem, self);
1935+
}
1936+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1937+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1938+
{
1939+
_mm_stream_si128((__m128i*)mem, self);
1940+
}
1941+
template <class A>
1942+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1943+
{
1944+
_mm_stream_pd(mem, self);
1945+
}
1946+
19301947
// sub
19311948
template <class A>
19321949
XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,23 @@ namespace xsimd
228228
}
229229
}
230230

231+
// load_stream
232+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
233+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
234+
{
235+
return _mm_stream_load_si128((__m128i*)mem);
236+
}
237+
template <class A>
238+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
239+
{
240+
return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
241+
}
242+
template <class A>
243+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
244+
{
245+
return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
246+
}
247+
231248
// min
232249
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
233250
XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept

include/xsimd/memory/xsimd_alignment.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@ namespace xsimd
3333
{
3434
};
3535

36+
/**
37+
* @struct stream_mode
38+
* @brief tag for load and store of aligned non-temporal memory.
39+
*
40+
* Streaming accesses expect aligned pointers. When no architecture-specific
41+
* implementation is available, they fall back to aligned semantics.
42+
*/
43+
struct stream_mode
44+
{
45+
};
46+
3647
/***********************
3748
* Allocator alignment *
3849
***********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15+
#include <atomic>
1516
#include <complex>
1617
#include <cstddef>
1718
#include <limits>
@@ -1334,6 +1335,30 @@ namespace xsimd
13341335
return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
13351336
}
13361337

1338+
template <class To, class A = default_arch, class From>
1339+
XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
1340+
{
1341+
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
1342+
detail::static_check_supported_config<From, A>();
1343+
detail::static_check_supported_config<To, A>();
1344+
return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1345+
}
1346+
1347+
template <class To, class A = default_arch>
1348+
XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
1349+
{
1350+
detail::static_check_supported_config<To, A>();
1351+
return simd_return_type<bool, To, A>::load_stream(ptr);
1352+
}
1353+
1354+
template <class To, class A = default_arch, class From>
1355+
XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
1356+
{
1357+
detail::static_check_supported_config<To, A>();
1358+
using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
1359+
return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1360+
}
1361+
13371362
#ifdef XSIMD_ENABLE_XTL_COMPLEX
13381363
template <class To, class A = default_arch, class From, bool i3ec>
13391364
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1367,14 @@ namespace xsimd
13421367
detail::static_check_supported_config<From, A>();
13431368
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
13441369
}
1370+
1371+
template <class To, class A = default_arch, class From, bool i3ec>
1372+
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
1373+
{
1374+
detail::static_check_supported_config<To, A>();
1375+
detail::static_check_supported_config<From, A>();
1376+
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
1377+
}
13451378
#endif
13461379

13471380
/**
@@ -1416,6 +1449,13 @@ namespace xsimd
14161449
return load_as<From, A>(ptr, unaligned_mode {});
14171450
}
14181451

1452+
template <class A = default_arch, class From>
1453+
XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
1454+
{
1455+
detail::static_check_supported_config<From, A>();
1456+
return load_as<From, A>(ptr, stream_mode {});
1457+
}
1458+
14191459
/**
14201460
* @ingroup batch_data_transfer
14211461
*
@@ -2420,12 +2460,40 @@ namespace xsimd
24202460
kernel::store_complex_aligned<A>(dst, src, A {});
24212461
}
24222462

2463+
template <class To, class A = default_arch, class From>
2464+
XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
2465+
{
2466+
detail::static_check_supported_config<From, A>();
2467+
kernel::store_stream<A>(dst, src, A {});
2468+
}
2469+
2470+
template <class A = default_arch, class From>
2471+
XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
2472+
{
2473+
detail::static_check_supported_config<From, A>();
2474+
kernel::store_stream<A>(src, dst, A {});
2475+
}
2476+
2477+
template <class To, class A = default_arch, class From>
2478+
XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2479+
{
2480+
detail::static_check_supported_config<std::complex<From>, A>();
2481+
kernel::store_complex_stream<A>(dst, src, A {});
2482+
}
2483+
24232484
#ifdef XSIMD_ENABLE_XTL_COMPLEX
24242485
template <class To, class A = default_arch, class From, bool i3ec>
24252486
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
24262487
{
24272488
store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
24282489
}
2490+
2491+
template <class To, class A = default_arch, class From, bool i3ec>
2492+
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2493+
{
2494+
detail::static_check_supported_config<std::complex<From>, A>();
2495+
store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
2496+
}
24292497
#endif
24302498

24312499
/**
@@ -2494,6 +2562,22 @@ namespace xsimd
24942562
store_as<T, A>(mem, val, unaligned_mode {});
24952563
}
24962564

2565+
template <class A, class T>
2566+
XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
2567+
{
2568+
store_as<T, A>(mem, val, stream_mode {});
2569+
}
2570+
2571+
/**
2572+
* @ingroup batch_data_transfer
2573+
*
2574+
* Issues a sequentially consistent memory fence.
2575+
*/
2576+
XSIMD_INLINE void fence() noexcept
2577+
{
2578+
std::atomic_thread_fence(std::memory_order_seq_cst);
2579+
}
2580+
24972581
/**
24982582
* @ingroup batch_data_transfer
24992583
*

0 commit comments

Comments
 (0)