api6_1/c++/vectorize__neon_8h_source.html

 #ifndef OPENMM_VECTORIZE_NEON_H_

 #define OPENMM_VECTORIZE_NEON_H_


 /* -------------------------------------------------------------------------- *

  *                                   OpenMM                                   *

  * -------------------------------------------------------------------------- *

  * This is part of the OpenMM molecular simulation toolkit originating from   *

  * Simbios, the NIH National Center for Physics-Based Simulation of           *

  * Biological Structures at Stanford, funded under the NIH Roadmap for        *

  * Medical Research, grant U54 GM072970. See https://simtk.org.               *

  *                                                                            *

  * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *

  * Authors: Mateus Lima, Peter Eastman                                        *

  * Contributors:                                                              *

  *                                                                            *

  * Permission is hereby granted, free of charge, to any person obtaining a    *

  * copy of this software and associated documentation files (the "Software"), *

  * to deal in the Software without restriction, including without limitation  *

  * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *

  * and/or sell copies of the Software, and to permit persons to whom the      *

  * Software is furnished to do so, subject to the following conditions:       *

  *                                                                            *

  * The above copyright notice and this permission notice shall be included in *

  * all copies or substantial portions of the Software.                        *

  *                                                                            *

  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *

  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *

  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *

  * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *

  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *

  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *

  * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *

  * -------------------------------------------------------------------------- */


 #include <cpu-features.h>

 #include <arm_neon.h>

 #include <cmath>


 // Declare int32_t as an alias for int.
 // NOTE(review): presumably a fallback for toolchains that do not already
 // provide int32_t -- confirm it agrees with the platform's own definition.
 typedef int int32_t;


 // This file defines classes and functions to simplify vectorizing code with NEON.


 /**
  * Report whether the CPU this code is running on supports NEON, which the
  * fvec4 and ivec4 classes in this file are built on.
  *
  * Declared "static inline" like the other free functions in this header so
  * that a translation unit which includes the header without calling this
  * function does not get an unused-function warning.
  *
  * @return true if android_getCpuFeatures() reports ANDROID_CPU_ARM_FEATURE_NEON
  */
 static inline bool isVec4Supported() {
     const uint64_t cpuFeatures = android_getCpuFeatures();
     return (cpuFeatures & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
 }


 class ivec4;


 /**
  * A four element vector of floats, wrapping a NEON float32x4_t.
  *
  * Comparison operators return an fvec4 whose elements are -1.0f where the
  * comparison holds and 0.0f where it does not (the all-ones/all-zeros NEON
  * mask is interpreted as a signed integer and converted to float).
  */
 class fvec4 {
 public:
     float32x4_t val;

     /** Create a vector whose elements are left uninitialized. */
     fvec4() {}
     /** Create a vector with all four elements equal to v. */
     fvec4(float v) : val(vdupq_n_f32(v)) {}
     /** Create a vector from four individual element values. */
     fvec4(float v1, float v2, float v3, float v4) {
         const float lanes[4] = {v1, v2, v3, v4};
         val = vld1q_f32(lanes);
     }
     /** Wrap an existing NEON vector. */
     fvec4(float32x4_t v) : val(v) {}
     /** Load four consecutive floats starting at v. */
     fvec4(const float* v) : val(vld1q_f32(v)) {}
     /** Allow an fvec4 to be passed directly to NEON intrinsics. */
     operator float32x4_t() const {
         return val;
     }
     /**
      * Get one element of the vector. An index outside 0..3 yields 0.0f.
      */
     float operator[](int i) const {
         // The lane argument of vgetq_lane_f32 must be a compile-time
         // constant, so each index is handled by its own call.
         if (i == 0)
             return vgetq_lane_f32(val, 0);
         if (i == 1)
             return vgetq_lane_f32(val, 1);
         if (i == 2)
             return vgetq_lane_f32(val, 2);
         if (i == 3)
             return vgetq_lane_f32(val, 3);
         return 0.0f;
     }
     /** Write the four elements to consecutive floats starting at v. */
     void store(float* v) const {
         vst1q_f32(v, val);
     }
     fvec4 operator+(const fvec4& other) const {
         return fvec4(vaddq_f32(val, other.val));
     }
     fvec4 operator-(const fvec4& other) const {
         return fvec4(vsubq_f32(val, other.val));
     }
     fvec4 operator*(const fvec4& other) const {
         return fvec4(vmulq_f32(val, other.val));
     }
     fvec4 operator/(const fvec4& other) const {
         // NEON does not have a floating-point divide instruction, so we
         // compute the reciprocal and multiply. Start from the hardware
         // estimate and refine it with two reciprocal-step iterations.
         const float32x4_t divisor = other.val;
         float32x4_t inverse = vrecpeq_f32(divisor);
         for (int step = 0; step < 2; ++step)
             inverse = vmulq_f32(vrecpsq_f32(divisor, inverse), inverse);
         return fvec4(vmulq_f32(val, inverse));
     }
     void operator+=(const fvec4& other) {
         val = vaddq_f32(val, other.val);
     }
     void operator-=(const fvec4& other) {
         val = vsubq_f32(val, other.val);
     }
     void operator*=(const fvec4& other) {
         val = vmulq_f32(val, other.val);
     }
     void operator/=(const fvec4& other) {
         val = (*this / other).val;
     }
     fvec4 operator-() const {
         return fvec4(vnegq_f32(val));
     }
     /** Bitwise AND of the raw bit patterns of the two vectors. */
     fvec4 operator&(const fvec4& other) const {
         const uint32x4_t lhsBits = vreinterpretq_u32_f32(val);
         const uint32x4_t rhsBits = vreinterpretq_u32_f32(other.val);
         return fvec4(vreinterpretq_f32_u32(vandq_u32(lhsBits, rhsBits)));
     }
     /** Bitwise OR of the raw bit patterns of the two vectors. */
     fvec4 operator|(const fvec4& other) const {
         const uint32x4_t lhsBits = vreinterpretq_u32_f32(val);
         const uint32x4_t rhsBits = vreinterpretq_u32_f32(other.val);
         return fvec4(vreinterpretq_f32_u32(vorrq_u32(lhsBits, rhsBits)));
     }
     fvec4 operator==(const fvec4& other) const {
         const uint32x4_t mask = vceqq_f32(val, other.val);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     fvec4 operator!=(const fvec4& other) const {
         // There is no "not equal" comparison here, so invert the "equal" mask.
         const uint32x4_t equalMask = vceqq_f32(val, other.val);
         const uint32x4_t mask = vmvnq_u32(equalMask);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     fvec4 operator>(const fvec4& other) const {
         const uint32x4_t mask = vcgtq_f32(val, other.val);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     fvec4 operator<(const fvec4& other) const {
         const uint32x4_t mask = vcltq_f32(val, other.val);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     fvec4 operator>=(const fvec4& other) const {
         const uint32x4_t mask = vcgeq_f32(val, other.val);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     fvec4 operator<=(const fvec4& other) const {
         const uint32x4_t mask = vcleq_f32(val, other.val);
         return fvec4(vcvtq_f32_s32(vreinterpretq_s32_u32(mask)));
     }
     /** Convert each element to an int (defined after ivec4). */
     operator ivec4() const;
 };


 /**
  * A four element vector of ints, wrapping a NEON int32x4_t.
  *
  * Comparison operators return an ivec4 whose elements have all bits set
  * (-1) where the comparison holds and are 0 where it does not.
  */
 class ivec4 {
 public:

     int32x4_t val;

     /** Create a vector whose elements are left uninitialized. */
     ivec4() {}
     /** Create a vector with all four elements equal to v. */
     ivec4(int v) : val(vdupq_n_s32(v)) {}
     /** Create a vector from four individual element values. */
     ivec4(int v1, int v2, int v3, int v4) {
         const int lanes[4] = {v1, v2, v3, v4};
         val = vld1q_s32(lanes);
     }
     /** Wrap an existing NEON vector. */
     ivec4(int32x4_t v) : val(v) {}
     /** Load four consecutive ints starting at v. */
     ivec4(const int* v) : val(vld1q_s32(v)) {}
     /** Allow an ivec4 to be passed directly to NEON intrinsics. */
     operator int32x4_t() const {
         return val;
     }
     /**
      * Get one element of the vector. An index outside 0..3 yields 0.
      */
     int operator[](int i) const {
         // The lane argument of vgetq_lane_s32 must be a compile-time
         // constant, so each index is handled by its own call.
         if (i == 0)
             return vgetq_lane_s32(val, 0);
         if (i == 1)
             return vgetq_lane_s32(val, 1);
         if (i == 2)
             return vgetq_lane_s32(val, 2);
         if (i == 3)
             return vgetq_lane_s32(val, 3);
         return 0;
     }
     /** Write the four elements to consecutive ints starting at v. */
     void store(int* v) const {
         vst1q_s32(v, val);
     }
     ivec4 operator+(const ivec4& other) const {
         return ivec4(vaddq_s32(val, other.val));
     }
     ivec4 operator-(const ivec4& other) const {
         return ivec4(vsubq_s32(val, other.val));
     }
     ivec4 operator*(const ivec4& other) const {
         return ivec4(vmulq_s32(val, other.val));
     }
     void operator+=(const ivec4& other) {
         val = vaddq_s32(val, other.val);
     }
     void operator-=(const ivec4& other) {
         val = vsubq_s32(val, other.val);
     }
     void operator*=(const ivec4& other) {
         val = vmulq_s32(val, other.val);
     }
     ivec4 operator-() const {
         return ivec4(vnegq_s32(val));
     }
     ivec4 operator&(const ivec4& other) const {
         return ivec4(vandq_s32(val, other.val));
     }
     ivec4 operator|(const ivec4& other) const {
         return ivec4(vorrq_s32(val, other.val));
     }
     ivec4 operator==(const ivec4& other) const {
         const uint32x4_t mask = vceqq_s32(val, other.val);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     ivec4 operator!=(const ivec4& other) const {
         // There is no "not equal" comparison here, so invert the "equal" mask.
         const uint32x4_t equalMask = vceqq_s32(val, other.val);
         const uint32x4_t mask = vmvnq_u32(equalMask);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     ivec4 operator>(const ivec4& other) const {
         const uint32x4_t mask = vcgtq_s32(val, other.val);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     ivec4 operator<(const ivec4& other) const {
         const uint32x4_t mask = vcltq_s32(val, other.val);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     ivec4 operator>=(const ivec4& other) const {
         const uint32x4_t mask = vcgeq_s32(val, other.val);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     ivec4 operator<=(const ivec4& other) const {
         const uint32x4_t mask = vcleq_s32(val, other.val);
         return ivec4(vreinterpretq_s32_u32(mask));
     }
     /** Convert each element to a float (defined after this class). */
     operator fvec4() const;
 };


 // Conversion operators.


 /**
  * Convert each float element to a signed 32-bit integer with vcvtq_s32_f32.
  */
 inline fvec4::operator ivec4() const {
     const int32x4_t asInts = vcvtq_s32_f32(val);
     return ivec4(asInts);
 }


 /**
  * Convert each signed 32-bit integer element to a float with vcvtq_f32_s32.
  */
 inline ivec4::operator fvec4() const {
     const float32x4_t asFloats = vcvtq_f32_s32(val);
     return fvec4(asFloats);
 }


 // Functions that operate on fvec4s.


 /**
  * Element-wise minimum of two float vectors.
  */
 static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
     const float32x4_t smaller = vminq_f32(v1.val, v2.val);
     return fvec4(smaller);
 }


 /**
  * Element-wise maximum of two float vectors.
  */
 static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
     const float32x4_t larger = vmaxq_f32(v1.val, v2.val);
     return fvec4(larger);
 }


 /**
  * Element-wise absolute value of a float vector.
  */
 static inline fvec4 abs(const fvec4& v) {
     const float32x4_t magnitude = vabsq_f32(v.val);
     return fvec4(magnitude);
 }


 /**
  * Element-wise square root of a float vector.
  *
  * The root is computed as v * (1/sqrt(v)), where the reciprocal square root
  * starts from the hardware estimate and is refined with two
  * reciprocal-square-root step iterations.
  *
  * Elements equal to zero or +infinity are returned unchanged. Without that
  * special case the estimate for them is +infinity or zero respectively, and
  * the subsequent 0*infinity products would turn the result into NaN.
  *
  * @param v  the vector to take the square root of
  * @return the square root of each element
  */
 static inline fvec4 sqrt(const fvec4& v) {
     const float32x4_t x = v.val;
     float32x4_t recipSqrt = vrsqrteq_f32(x);
     recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, x), recipSqrt));
     recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, x), recipSqrt));
     const float32x4_t root = vmulq_f32(x, recipSqrt);

     // For 0 and +infinity the square root equals the input, so pass those
     // lanes straight through instead of using the (NaN) computed value.
     const uint32x4_t isZero = vceqq_f32(x, vdupq_n_f32(0.0f));
     const uint32x4_t isInfinite = vceqq_f32(x, vdupq_n_f32(INFINITY));
     const uint32x4_t passThrough = vorrq_u32(isZero, isInfinite);
     return fvec4(vbslq_f32(passThrough, x, root));
 }


 /**
  * Dot product of the first three elements of two vectors; the fourth
  * element of each is ignored.
  */
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
     const float32x4_t products = vmulq_f32(v1.val, v2.val);
     const float p0 = vgetq_lane_f32(products, 0);
     const float p1 = vgetq_lane_f32(products, 1);
     const float p2 = vgetq_lane_f32(products, 2);
     // Summed left to right.
     return p0 + p1 + p2;
 }


 /**
  * Dot product of all four elements of two vectors.
  */
 static inline float dot4(const fvec4& v1, const fvec4& v2) {
     const float32x4_t products = vmulq_f32(v1.val, v2.val);
     const float p0 = vgetq_lane_f32(products, 0);
     const float p1 = vgetq_lane_f32(products, 1);
     const float p2 = vgetq_lane_f32(products, 2);
     const float p3 = vgetq_lane_f32(products, 3);
     // Summed left to right.
     return p0 + p1 + p2 + p3;
 }


 /**
  * Transpose, in place, the 4x4 matrix whose rows are v1, v2, v3 and v4.
  * On return v1 holds the first element of each input, v2 the second, and
  * so on.
  */
 static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
     // De-interleave rows 1/3 and rows 2/4: val[0] collects the even-indexed
     // elements of the pair, val[1] the odd-indexed ones.
     const float32x4x2_t rows13 = vuzpq_f32(v1.val, v3.val);
     const float32x4x2_t rows24 = vuzpq_f32(v2.val, v4.val);
     // Interleave the two halves so every output holds one column.
     const float32x4x2_t evenColumns = vtrnq_f32(rows13.val[0], rows24.val[0]);
     const float32x4x2_t oddColumns = vtrnq_f32(rows13.val[1], rows24.val[1]);
     v1 = evenColumns.val[0];
     v2 = oddColumns.val[0];
     v3 = evenColumns.val[1];
     v4 = oddColumns.val[1];
 }


 // Functions that operate on ivec4s.


 /**
  * Element-wise minimum of two int vectors.
  */
 static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
     const int32x4_t smaller = vminq_s32(v1.val, v2.val);
     return ivec4(smaller);
 }


 /**
  * Element-wise maximum of two int vectors.
  */
 static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
     const int32x4_t larger = vmaxq_s32(v1.val, v2.val);
     return ivec4(larger);
 }


 /**
  * Element-wise absolute value of an int vector, computed as the absolute
  * difference between each element and zero.
  */
 static inline ivec4 abs(const ivec4& v) {
     const int32x4_t zero = vdupq_n_s32(0);
     return ivec4(vabdq_s32(v.val, zero));
 }


 /**
  * Report whether at least one element of the vector is nonzero.
  */
 static inline bool any(const ivec4& v) {
     const int lane0 = vgetq_lane_s32(v.val, 0);
     const int lane1 = vgetq_lane_s32(v.val, 1);
     const int lane2 = vgetq_lane_s32(v.val, 2);
     const int lane3 = vgetq_lane_s32(v.val, 3);
     // The OR of all lanes is nonzero exactly when some lane is nonzero.
     return (lane0 | lane1 | lane2 | lane3) != 0;
 }


 // Mathematical operators involving a scalar and a vector.


 /**
  * Add a scalar to every element of a vector.
  */
 static inline fvec4 operator+(float v1, const fvec4& v2) {
     const fvec4 splat(v1);
     return splat + v2;
 }


 /**
  * Subtract every element of a vector from a scalar.
  */
 static inline fvec4 operator-(float v1, const fvec4& v2) {
     const fvec4 splat(v1);
     return splat - v2;
 }


 /**
  * Multiply every element of a vector by a scalar.
  */
 static inline fvec4 operator*(float v1, const fvec4& v2) {
     const fvec4 splat(v1);
     return splat * v2;
 }


 /**
  * Divide a scalar by every element of a vector.
  */
 static inline fvec4 operator/(float v1, const fvec4& v2) {
     const fvec4 splat(v1);
     return splat / v2;
 }


 // Operations for blending fvec4s based on an ivec4.


 /**
  * Combine two vectors bit by bit: wherever a bit of mask is set the result
  * takes the corresponding bit of v2, otherwise the bit of v1.
  */
 static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
     const uint32x4_t selector = vreinterpretq_u32_s32(mask.val);
     return fvec4(vbslq_f32(selector, v2.val, v1.val));
 }


 // These are at the end since they involve other functions defined above.


 /**
  * Round each element of a vector to an integral value.
  *
  * For |v| < 2^23 the magnitude is rounded by adding and then subtracting
  * 2^23 (at that size a float has no fractional bits left, so the addition
  * performs the rounding), and the sign bit of v is copied back onto the
  * result.
  *
  * Elements with |v| >= 2^23 are already integral and are returned
  * unchanged. Applying the add/subtract trick to them would lose low-order
  * bits in the intermediate sum and change the value (for example
  * 8388609.0f would become 8388608.0f).
  *
  * @param v  the vector to round
  * @return the rounded value of each element
  */
 static inline fvec4 round(const fvec4& v) {
     const float32x4_t shift = vdupq_n_f32(0x1.0p23f);
     const float32x4_t magnitude = vabsq_f32(v.val);
     const float32x4_t roundedMagnitude = vsubq_f32(vaddq_f32(magnitude, shift), shift);

     // Use the rounded magnitude bits (everything except the sign bit) only in
     // lanes that are small enough for the trick to be exact; all other bits,
     // including the sign of every lane, come from v itself.
     const uint32x4_t isSmall = vcltq_f32(magnitude, shift);
     const uint32x4_t takeRounded = vandq_u32(isSmall, vdupq_n_u32(0x7FFFFFFF));
     return fvec4(vbslq_f32(takeRounded, roundedMagnitude, v.val));
 }


 /**
  * Round each element down: take round(v) and subtract 1 from every element
  * where rounding produced a value greater than the input.
  */
 static inline fvec4 floor(const fvec4& v) {
     const fvec4 nearest = round(v);
     // The comparison result is converted to the ivec4 mask expected by blend().
     const fvec4 correction = blend(fvec4(0.0f), fvec4(-1.0f), nearest > v);
     return nearest + correction;
 }


 /**
  * Round each element up: take round(v) and add 1 to every element where
  * rounding produced a value less than the input.
  */
 static inline fvec4 ceil(const fvec4& v) {
     const fvec4 nearest = round(v);
     // The comparison result is converted to the ivec4 mask expected by blend().
     const fvec4 correction = blend(fvec4(0.0f), fvec4(1.0f), nearest < v);
     return nearest + correction;
 }


 #endif /*OPENMM_VECTORIZE_NEON_H_*/

fvec4::operator>=
fvec4 operator>=(const fvec4 &other) const
Definition: vectorize_neon.h:138

fvec4::operator+
fvec4 operator+(const fvec4 &other) const
Definition: vectorize_neon.h:87

ivec4
A four element vector of ints.
Definition: vectorize_neon.h:150

ivec4::operator==
ivec4 operator==(const ivec4 &other) const
Definition: vectorize_neon.h:209

ivec4::operator[]
int operator[](int i) const
Definition: vectorize_neon.h:166

fvec4::operator-
fvec4 operator-(const fvec4 &other) const
Definition: vectorize_neon.h:90

fvec4::val
float32x4_t val
Definition: vectorize_neon.h:58

fvec4::operator[]
float operator[](int i) const
Definition: vectorize_neon.h:71

ivec4::operator<
ivec4 operator<(const ivec4 &other) const
Definition: vectorize_neon.h:218

fvec4::operator<
fvec4 operator<(const fvec4 &other) const
Definition: vectorize_neon.h:135

fvec4::operator/=
void operator/=(const fvec4 &other)
Definition: vectorize_neon.h:114

fvec4::operator*=
void operator*=(const fvec4 &other)
Definition: vectorize_neon.h:111

fvec4::fvec4
fvec4(const float *v)
Definition: vectorize_neon.h:67

ivec4::operator-
ivec4 operator-(const ivec4 &other) const
Definition: vectorize_neon.h:185

ivec4::store
void store(int *v) const
Definition: vectorize_neon.h:179

ivec4::operator>=
ivec4 operator>=(const ivec4 &other) const
Definition: vectorize_neon.h:221

fvec4::fvec4
fvec4(float32x4_t v)
Definition: vectorize_neon.h:66

fvec4
A four element vector of floats.
Definition: vectorize_neon.h:56

ivec4::operator!=
ivec4 operator!=(const ivec4 &other) const
Definition: vectorize_neon.h:212

ivec4::ivec4
ivec4(int v1, int v2, int v3, int v4)
Definition: vectorize_neon.h:157

ivec4::ivec4
ivec4(const int *v)
Definition: vectorize_neon.h:162

fvec4::operator==
fvec4 operator==(const fvec4 &other) const
Definition: vectorize_neon.h:126

fvec4::operator<=
fvec4 operator<=(const fvec4 &other) const
Definition: vectorize_neon.h:141

fvec4::store
void store(float *v) const
Definition: vectorize_neon.h:84

ivec4::operator-=
void operator-=(const ivec4 &other)
Definition: vectorize_neon.h:194

ivec4::operator+=
void operator+=(const ivec4 &other)
Definition: vectorize_neon.h:191

ivec4::operator+
ivec4 operator+(const ivec4 &other) const
Definition: vectorize_neon.h:182

fvec4::fvec4
fvec4()
Definition: vectorize_neon.h:60

ivec4::operator-
ivec4 operator-() const
Definition: vectorize_neon.h:200

fvec4::operator/
fvec4 operator/(const fvec4 &other) const
Definition: vectorize_neon.h:96

ivec4::operator|
ivec4 operator|(const ivec4 &other) const
Definition: vectorize_neon.h:206

fvec4::operator!=
fvec4 operator!=(const fvec4 &other) const
Definition: vectorize_neon.h:129

ivec4::operator>
ivec4 operator>(const ivec4 &other) const
Definition: vectorize_neon.h:215

ivec4::ivec4
ivec4()
Definition: vectorize_neon.h:155

fvec4::operator-=
void operator-=(const fvec4 &other)
Definition: vectorize_neon.h:108

fvec4::operator&
fvec4 operator&(const fvec4 &other) const
Definition: vectorize_neon.h:120

fvec4::operator*
fvec4 operator*(const fvec4 &other) const
Definition: vectorize_neon.h:93

fvec4::fvec4
fvec4(float v1, float v2, float v3, float v4)
Definition: vectorize_neon.h:62

fvec4::operator|
fvec4 operator|(const fvec4 &other) const
Definition: vectorize_neon.h:123

fvec4::fvec4
fvec4(float v)
Definition: vectorize_neon.h:61

fvec4::operator>
fvec4 operator>(const fvec4 &other) const
Definition: vectorize_neon.h:132

fvec4::operator-
fvec4 operator-() const
Definition: vectorize_neon.h:117

fvec4::operator+=
void operator+=(const fvec4 &other)
Definition: vectorize_neon.h:105

ivec4::ivec4
ivec4(int v)
Definition: vectorize_neon.h:156

ivec4::operator*
ivec4 operator*(const ivec4 &other) const
Definition: vectorize_neon.h:188

ivec4::operator<=
ivec4 operator<=(const ivec4 &other) const
Definition: vectorize_neon.h:224

ivec4::ivec4
ivec4(int32x4_t v)
Definition: vectorize_neon.h:161

ivec4::operator&
ivec4 operator&(const ivec4 &other) const
Definition: vectorize_neon.h:203

ivec4::operator*=
void operator*=(const ivec4 &other)
Definition: vectorize_neon.h:197

ivec4::val
int32x4_t val
Definition: vectorize_neon.h:153