        -:    0:Source:/usr/lib/gcc/x86_64-linux-gnu/14/include/xmmintrin.h
        -:    0:Graph:tspi4.gcno
        -:    0:Data:tspi4.gcda
        -:    0:Runs:1
        -:    1:/* Copyright (C) 2002-2024 Free Software Foundation, Inc.
        -:    2:
        -:    3:   This file is part of GCC.
        -:    4:
        -:    5:   GCC is free software; you can redistribute it and/or modify
        -:    6:   it under the terms of the GNU General Public License as published by
        -:    7:   the Free Software Foundation; either version 3, or (at your option)
        -:    8:   any later version.
        -:    9:
        -:   10:   GCC is distributed in the hope that it will be useful,
        -:   11:   but WITHOUT ANY WARRANTY; without even the implied warranty of
        -:   12:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        -:   13:   GNU General Public License for more details.
        -:   14:
        -:   15:   Under Section 7 of GPL version 3, you are granted additional
        -:   16:   permissions described in the GCC Runtime Library Exception, version
        -:   17:   3.1, as published by the Free Software Foundation.
        -:   18:
        -:   19:   You should have received a copy of the GNU General Public License and
        -:   20:   a copy of the GCC Runtime Library Exception along with this program;
        -:   21:   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
        -:   22:   <http://www.gnu.org/licenses/>.  */
        -:   23:
        -:   24:/* Implemented from the specification included in the Intel C++ Compiler
        -:   25:   User Guide and Reference, version 9.0.  */
        -:   26:
        -:   27:#ifndef _XMMINTRIN_H_INCLUDED
        -:   28:#define _XMMINTRIN_H_INCLUDED
        -:   29:
        -:   30:/* We need type definitions from the MMX header file.  */
        -:   31:#include <mmintrin.h>
        -:   32:
        -:   33:/* Get _mm_malloc () and _mm_free ().  */
        -:   34:#include <mm_malloc.h>
        -:   35:
        -:   36:/* Constants for use with _mm_prefetch.  */
        -:   37:enum _mm_hint
        -:   38:{
        -:   39:  _MM_HINT_IT0 = 19,
        -:   40:  _MM_HINT_IT1 = 18,
        -:   41:  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
        -:   42:  _MM_HINT_ET0 = 7,
        -:   43:  _MM_HINT_ET1 = 6,
        -:   44:  _MM_HINT_T0 = 3,
        -:   45:  _MM_HINT_T1 = 2,
        -:   46:  _MM_HINT_T2 = 1,
        -:   47:  _MM_HINT_NTA = 0
        -:   48:};
        -:   49:
        -:   50:/* Loads one cache line from address P to a location "closer" to the
        -:   51:   processor.  The selector I specifies the type of prefetch operation.  */
        -:   52:#ifdef __OPTIMIZE__
        -:   53:extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:   54:_mm_prefetch (const void *__P, enum _mm_hint __I)
        -:   55:{
 12504999:   56:  __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
        -:   57:			   __I & 0x3, (__I & 0x10) >> 4);
        -:   58:}
        -:   59:#else
        -:   60:#define _mm_prefetch(P, I) \
        -:   61:  __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
        -:   62:#endif
        -:   63:
        -:   64:#ifndef __SSE__
        -:   65:#pragma GCC push_options
        -:   66:#pragma GCC target("sse")
        -:   67:#define __DISABLE_SSE__
        -:   68:#endif /* __SSE__ */
        -:   69:
        -:   70:/* The Intel API is flexible enough that we must allow aliasing with other
        -:   71:   vector types, and their scalar components.  */
        -:   72:typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
        -:   73:
        -:   74:/* Unaligned version of the same type.  */
        -:   75:typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
        -:   76:
        -:   77:/* Internal data types for implementing the intrinsics.  */
        -:   78:typedef float __v4sf __attribute__ ((__vector_size__ (16)));
        -:   79:
        -:   80:/* Create a selector for use with the SHUFPS instruction.  */
        -:   81:#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
        -:   82: (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
        -:   83:
        -:   84:/* Bits in the MXCSR.  */
        -:   85:#define _MM_EXCEPT_MASK       0x003f
        -:   86:#define _MM_EXCEPT_INVALID    0x0001
        -:   87:#define _MM_EXCEPT_DENORM     0x0002
        -:   88:#define _MM_EXCEPT_DIV_ZERO   0x0004
        -:   89:#define _MM_EXCEPT_OVERFLOW   0x0008
        -:   90:#define _MM_EXCEPT_UNDERFLOW  0x0010
        -:   91:#define _MM_EXCEPT_INEXACT    0x0020
        -:   92:
        -:   93:#define _MM_MASK_MASK         0x1f80
        -:   94:#define _MM_MASK_INVALID      0x0080
        -:   95:#define _MM_MASK_DENORM       0x0100
        -:   96:#define _MM_MASK_DIV_ZERO     0x0200
        -:   97:#define _MM_MASK_OVERFLOW     0x0400
        -:   98:#define _MM_MASK_UNDERFLOW    0x0800
        -:   99:#define _MM_MASK_INEXACT      0x1000
        -:  100:
        -:  101:#define _MM_ROUND_MASK        0x6000
        -:  102:#define _MM_ROUND_NEAREST     0x0000
        -:  103:#define _MM_ROUND_DOWN        0x2000
        -:  104:#define _MM_ROUND_UP          0x4000
        -:  105:#define _MM_ROUND_TOWARD_ZERO 0x6000
        -:  106:
        -:  107:#define _MM_FLUSH_ZERO_MASK   0x8000
        -:  108:#define _MM_FLUSH_ZERO_ON     0x8000
        -:  109:#define _MM_FLUSH_ZERO_OFF    0x0000
        -:  110:
        -:  111:/* Create an undefined vector.  */
        -:  112:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  113:_mm_undefined_ps (void)
        -:  114:{
        -:  115:#pragma GCC diagnostic push
        -:  116:#pragma GCC diagnostic ignored "-Winit-self"
        -:  117:  __m128 __Y = __Y;
        -:  118:#pragma GCC diagnostic pop
        -:  119:  return __Y;
        -:  120:}
        -:  121:
        -:  122:/* Create a vector of zeros.  */
        -:  123:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  124:_mm_setzero_ps (void)
        -:  125:{
        -:  126:  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
        -:  127:}
        -:  128:
        -:  129:/* Perform the respective operation on the lower SPFP (single-precision
        -:  130:   floating-point) values of A and B; the upper three SPFP values are
        -:  131:   passed through from A.  */
        -:  132:
        -:  133:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  134:_mm_add_ss (__m128 __A, __m128 __B)
        -:  135:{
        -:  136:  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
        -:  137:}
        -:  138:
        -:  139:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  140:_mm_sub_ss (__m128 __A, __m128 __B)
        -:  141:{
        -:  142:  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
        -:  143:}
        -:  144:
        -:  145:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  146:_mm_mul_ss (__m128 __A, __m128 __B)
        -:  147:{
        -:  148:  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
        -:  149:}
        -:  150:
        -:  151:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  152:_mm_div_ss (__m128 __A, __m128 __B)
        -:  153:{
        -:  154:  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
        -:  155:}
        -:  156:
        -:  157:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  158:_mm_sqrt_ss (__m128 __A)
        -:  159:{
        -:  160:  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
        -:  161:}
        -:  162:
        -:  163:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  164:_mm_rcp_ss (__m128 __A)
        -:  165:{
        -:  166:  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
        -:  167:}
        -:  168:
        -:  169:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  170:_mm_rsqrt_ss (__m128 __A)
        -:  171:{
        -:  172:  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
        -:  173:}
        -:  174:
        -:  175:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  176:_mm_min_ss (__m128 __A, __m128 __B)
        -:  177:{
        -:  178:  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
        -:  179:}
        -:  180:
        -:  181:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  182:_mm_max_ss (__m128 __A, __m128 __B)
        -:  183:{
        -:  184:  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
        -:  185:}
        -:  186:
        -:  187:/* Perform the respective operation on the four SPFP values in A and B.  */
        -:  188:
        -:  189:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  190:_mm_add_ps (__m128 __A, __m128 __B)
        -:  191:{
        -:  192:  return (__m128) ((__v4sf)__A + (__v4sf)__B);
        -:  193:}
        -:  194:
        -:  195:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  196:_mm_sub_ps (__m128 __A, __m128 __B)
        -:  197:{
        -:  198:  return (__m128) ((__v4sf)__A - (__v4sf)__B);
        -:  199:}
        -:  200:
        -:  201:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  202:_mm_mul_ps (__m128 __A, __m128 __B)
        -:  203:{
        -:  204:  return (__m128) ((__v4sf)__A * (__v4sf)__B);
        -:  205:}
        -:  206:
        -:  207:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  208:_mm_div_ps (__m128 __A, __m128 __B)
        -:  209:{
        -:  210:  return (__m128) ((__v4sf)__A / (__v4sf)__B);
        -:  211:}
        -:  212:
        -:  213:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  214:_mm_sqrt_ps (__m128 __A)
        -:  215:{
        -:  216:  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
        -:  217:}
        -:  218:
        -:  219:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  220:_mm_rcp_ps (__m128 __A)
        -:  221:{
        -:  222:  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
        -:  223:}
        -:  224:
        -:  225:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  226:_mm_rsqrt_ps (__m128 __A)
        -:  227:{
        -:  228:  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
        -:  229:}
        -:  230:
        -:  231:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  232:_mm_min_ps (__m128 __A, __m128 __B)
        -:  233:{
        -:  234:  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
        -:  235:}
        -:  236:
        -:  237:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  238:_mm_max_ps (__m128 __A, __m128 __B)
        -:  239:{
        -:  240:  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
        -:  241:}
        -:  242:
        -:  243:/* Perform logical bit-wise operations on 128-bit values.  */
        -:  244:
        -:  245:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  246:_mm_and_ps (__m128 __A, __m128 __B)
        -:  247:{
        -:  248:  return __builtin_ia32_andps (__A, __B);
        -:  249:}
        -:  250:
        -:  251:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  252:_mm_andnot_ps (__m128 __A, __m128 __B)
        -:  253:{
        -:  254:  return __builtin_ia32_andnps (__A, __B);
        -:  255:}
        -:  256:
        -:  257:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  258:_mm_or_ps (__m128 __A, __m128 __B)
        -:  259:{
        -:  260:  return __builtin_ia32_orps (__A, __B);
        -:  261:}
        -:  262:
        -:  263:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  264:_mm_xor_ps (__m128 __A, __m128 __B)
        -:  265:{
        -:  266:  return __builtin_ia32_xorps (__A, __B);
        -:  267:}
        -:  268:
        -:  269:/* Perform a comparison on the lower SPFP values of A and B.  If the
        -:  270:   comparison is true, place a mask of all ones in the result, otherwise a
        -:  271:   mask of zeros.  The upper three SPFP values are passed through from A.  */
        -:  272:
        -:  273:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  274:_mm_cmpeq_ss (__m128 __A, __m128 __B)
        -:  275:{
        -:  276:  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
        -:  277:}
        -:  278:
        -:  279:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  280:_mm_cmplt_ss (__m128 __A, __m128 __B)
        -:  281:{
        -:  282:  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
        -:  283:}
        -:  284:
        -:  285:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  286:_mm_cmple_ss (__m128 __A, __m128 __B)
        -:  287:{
        -:  288:  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
        -:  289:}
        -:  290:
        -:  291:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  292:_mm_cmpgt_ss (__m128 __A, __m128 __B)
        -:  293:{
        -:  294:  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
        -:  295:					(__v4sf)
        -:  296:					__builtin_ia32_cmpltss ((__v4sf) __B,
        -:  297:								(__v4sf)
        -:  298:								__A));
        -:  299:}
        -:  300:
        -:  301:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  302:_mm_cmpge_ss (__m128 __A, __m128 __B)
        -:  303:{
        -:  304:  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
        -:  305:					(__v4sf)
        -:  306:					__builtin_ia32_cmpless ((__v4sf) __B,
        -:  307:								(__v4sf)
        -:  308:								__A));
        -:  309:}
        -:  310:
        -:  311:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  312:_mm_cmpneq_ss (__m128 __A, __m128 __B)
        -:  313:{
        -:  314:  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
        -:  315:}
        -:  316:
        -:  317:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  318:_mm_cmpnlt_ss (__m128 __A, __m128 __B)
        -:  319:{
        -:  320:  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
        -:  321:}
        -:  322:
        -:  323:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  324:_mm_cmpnle_ss (__m128 __A, __m128 __B)
        -:  325:{
        -:  326:  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
        -:  327:}
        -:  328:
        -:  329:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  330:_mm_cmpngt_ss (__m128 __A, __m128 __B)
        -:  331:{
        -:  332:  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
        -:  333:					(__v4sf)
        -:  334:					__builtin_ia32_cmpnltss ((__v4sf) __B,
        -:  335:								 (__v4sf)
        -:  336:								 __A));
        -:  337:}
        -:  338:
        -:  339:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  340:_mm_cmpnge_ss (__m128 __A, __m128 __B)
        -:  341:{
        -:  342:  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
        -:  343:					(__v4sf)
        -:  344:					__builtin_ia32_cmpnless ((__v4sf) __B,
        -:  345:								 (__v4sf)
        -:  346:								 __A));
        -:  347:}
        -:  348:
        -:  349:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  350:_mm_cmpord_ss (__m128 __A, __m128 __B)
        -:  351:{
        -:  352:  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
        -:  353:}
        -:  354:
        -:  355:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  356:_mm_cmpunord_ss (__m128 __A, __m128 __B)
        -:  357:{
        -:  358:  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
        -:  359:}
        -:  360:
        -:  361:/* Perform a comparison on the four SPFP values of A and B.  For each
        -:  362:   element, if the comparison is true, place a mask of all ones in the
        -:  363:   result, otherwise a mask of zeros.  */
        -:  364:
        -:  365:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  366:_mm_cmpeq_ps (__m128 __A, __m128 __B)
        -:  367:{
        -:  368:  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
        -:  369:}
        -:  370:
        -:  371:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  372:_mm_cmplt_ps (__m128 __A, __m128 __B)
        -:  373:{
        -:  374:  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
        -:  375:}
        -:  376:
        -:  377:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  378:_mm_cmple_ps (__m128 __A, __m128 __B)
        -:  379:{
        -:  380:  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
        -:  381:}
        -:  382:
        -:  383:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  384:_mm_cmpgt_ps (__m128 __A, __m128 __B)
        -:  385:{
        -:  386:  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
        -:  387:}
        -:  388:
        -:  389:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  390:_mm_cmpge_ps (__m128 __A, __m128 __B)
        -:  391:{
        -:  392:  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
        -:  393:}
        -:  394:
        -:  395:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  396:_mm_cmpneq_ps (__m128 __A, __m128 __B)
        -:  397:{
        -:  398:  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
        -:  399:}
        -:  400:
        -:  401:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  402:_mm_cmpnlt_ps (__m128 __A, __m128 __B)
        -:  403:{
        -:  404:  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
        -:  405:}
        -:  406:
        -:  407:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  408:_mm_cmpnle_ps (__m128 __A, __m128 __B)
        -:  409:{
        -:  410:  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
        -:  411:}
        -:  412:
        -:  413:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  414:_mm_cmpngt_ps (__m128 __A, __m128 __B)
        -:  415:{
        -:  416:  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
        -:  417:}
        -:  418:
        -:  419:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  420:_mm_cmpnge_ps (__m128 __A, __m128 __B)
        -:  421:{
        -:  422:  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
        -:  423:}
        -:  424:
        -:  425:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  426:_mm_cmpord_ps (__m128 __A, __m128 __B)
        -:  427:{
        -:  428:  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
        -:  429:}
        -:  430:
        -:  431:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  432:_mm_cmpunord_ps (__m128 __A, __m128 __B)
        -:  433:{
        -:  434:  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
        -:  435:}
        -:  436:
        -:  437:/* Compare the lower SPFP values of A and B and return 1 if true
        -:  438:   and 0 if false.  */
        -:  439:
        -:  440:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  441:_mm_comieq_ss (__m128 __A, __m128 __B)
        -:  442:{
        -:  443:  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
        -:  444:}
        -:  445:
        -:  446:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  447:_mm_comilt_ss (__m128 __A, __m128 __B)
        -:  448:{
        -:  449:  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
        -:  450:}
        -:  451:
        -:  452:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  453:_mm_comile_ss (__m128 __A, __m128 __B)
        -:  454:{
        -:  455:  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
        -:  456:}
        -:  457:
        -:  458:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  459:_mm_comigt_ss (__m128 __A, __m128 __B)
        -:  460:{
        -:  461:  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
        -:  462:}
        -:  463:
        -:  464:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  465:_mm_comige_ss (__m128 __A, __m128 __B)
        -:  466:{
        -:  467:  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
        -:  468:}
        -:  469:
        -:  470:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  471:_mm_comineq_ss (__m128 __A, __m128 __B)
        -:  472:{
        -:  473:  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
        -:  474:}
        -:  475:
        -:  476:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  477:_mm_ucomieq_ss (__m128 __A, __m128 __B)
        -:  478:{
        -:  479:  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
        -:  480:}
        -:  481:
        -:  482:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  483:_mm_ucomilt_ss (__m128 __A, __m128 __B)
        -:  484:{
        -:  485:  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
        -:  486:}
        -:  487:
        -:  488:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  489:_mm_ucomile_ss (__m128 __A, __m128 __B)
        -:  490:{
        -:  491:  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
        -:  492:}
        -:  493:
        -:  494:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  495:_mm_ucomigt_ss (__m128 __A, __m128 __B)
        -:  496:{
        -:  497:  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
        -:  498:}
        -:  499:
        -:  500:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  501:_mm_ucomige_ss (__m128 __A, __m128 __B)
        -:  502:{
        -:  503:  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
        -:  504:}
        -:  505:
        -:  506:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  507:_mm_ucomineq_ss (__m128 __A, __m128 __B)
        -:  508:{
        -:  509:  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
        -:  510:}
        -:  511:
        -:  512:/* Convert the lower SPFP value to a 32-bit integer according to the current
        -:  513:   rounding mode.  */
        -:  514:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  515:_mm_cvtss_si32 (__m128 __A)
        -:  516:{
        -:  517:  return __builtin_ia32_cvtss2si ((__v4sf) __A);
        -:  518:}
        -:  519:
        -:  520:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  521:_mm_cvt_ss2si (__m128 __A)
        -:  522:{
        -:  523:  return _mm_cvtss_si32 (__A);
        -:  524:}
        -:  525:
        -:  526:#ifdef __x86_64__
        -:  527:/* Convert the lower SPFP value to a 32-bit integer according to the
        -:  528:   current rounding mode.  */
        -:  529:
        -:  530:/* Intel intrinsic.  */
        -:  531:extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  532:_mm_cvtss_si64 (__m128 __A)
        -:  533:{
        -:  534:  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
        -:  535:}
        -:  536:
        -:  537:/* Microsoft intrinsic.  */
        -:  538:extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  539:_mm_cvtss_si64x (__m128 __A)
        -:  540:{
        -:  541:  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
        -:  542:}
        -:  543:#endif
        -:  544:
        -:  545:/* Convert the two lower SPFP values to 32-bit integers according to the
        -:  546:   current rounding mode.  Return the integers in packed form.  */
        -:  547:extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  548:_mm_cvtps_pi32 (__m128 __A)
        -:  549:{
        -:  550:  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
        -:  551:}
        -:  552:
        -:  553:extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  554:_mm_cvt_ps2pi (__m128 __A)
        -:  555:{
        -:  556:  return _mm_cvtps_pi32 (__A);
        -:  557:}
        -:  558:
        -:  559:/* Truncate the lower SPFP value to a 32-bit integer.  */
        -:  560:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  561:_mm_cvttss_si32 (__m128 __A)
        -:  562:{
        -:  563:  return __builtin_ia32_cvttss2si ((__v4sf) __A);
        -:  564:}
        -:  565:
        -:  566:extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  567:_mm_cvtt_ss2si (__m128 __A)
        -:  568:{
        -:  569:  return _mm_cvttss_si32 (__A);
        -:  570:}
        -:  571:
        -:  572:#ifdef __x86_64__
        -:  573:/* Truncate the lower SPFP value to a 32-bit integer.  */
        -:  574:
        -:  575:/* Intel intrinsic.  */
        -:  576:extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  577:_mm_cvttss_si64 (__m128 __A)
        -:  578:{
        -:  579:  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
        -:  580:}
        -:  581:
        -:  582:/* Microsoft intrinsic.  */
        -:  583:extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  584:_mm_cvttss_si64x (__m128 __A)
        -:  585:{
        -:  586:  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
        -:  587:}
        -:  588:#endif
        -:  589:
        -:  590:/* Truncate the two lower SPFP values to 32-bit integers.  Return the
        -:  591:   integers in packed form.  */
        -:  592:extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  593:_mm_cvttps_pi32 (__m128 __A)
        -:  594:{
        -:  595:  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
        -:  596:}
        -:  597:
        -:  598:extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  599:_mm_cvtt_ps2pi (__m128 __A)
        -:  600:{
        -:  601:  return _mm_cvttps_pi32 (__A);
        -:  602:}
        -:  603:
        -:  604:/* Convert B to a SPFP value and insert it as element zero in A.  */
        -:  605:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  606:_mm_cvtsi32_ss (__m128 __A, int __B)
        -:  607:{
        -:  608:  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
        -:  609:}
        -:  610:
        -:  611:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  612:_mm_cvt_si2ss (__m128 __A, int __B)
        -:  613:{
        -:  614:  return _mm_cvtsi32_ss (__A, __B);
        -:  615:}
        -:  616:
        -:  617:#ifdef __x86_64__
        -:  618:/* Convert B to a SPFP value and insert it as element zero in A.  */
        -:  619:
        -:  620:/* Intel intrinsic.  */
        -:  621:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  622:_mm_cvtsi64_ss (__m128 __A, long long __B)
        -:  623:{
        -:  624:  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
        -:  625:}
        -:  626:
        -:  627:/* Microsoft intrinsic.  */
        -:  628:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  629:_mm_cvtsi64x_ss (__m128 __A, long long __B)
        -:  630:{
        -:  631:  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
        -:  632:}
        -:  633:#endif
        -:  634:
        -:  635:/* Convert the two 32-bit values in B to SPFP form and insert them
        -:  636:   as the two lower elements in A.  */
        -:  637:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  638:_mm_cvtpi32_ps (__m128 __A, __m64 __B)
        -:  639:{
        -:  640:  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
        -:  641:}
        -:  642:
        -:  643:extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -:  644:_mm_cvt_pi2ps (__m128 __A, __m64 __B)
        -:  645:{
        -:  646:  return _mm_cvtpi32_ps (__A, __B);
        -:  647:}
        -:  648:
/* Convert the four signed 16-bit values in A to SPFP form.  The words
   are sign-extended to doublewords, then converted two at a time.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  /* __ra holds the low pair, __rb the high pair; movlhps merges them
     into the final [lo0 lo1 hi0 hi1] vector.  */
  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}
        -:  673:
/* Convert the four unsigned 16-bit values in A to SPFP form.  Like
   _mm_cvtpi16_ps but zero-extends instead of sign-extending.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords; unpacking with zero
     performs the zero extension.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  /* Merge the low pair (__ra) and high pair (__rb) into one vector.  */
  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}
        -:  692:
/* Convert the low four signed 8-bit values in A to SPFP form.  The
   bytes are sign-extended to words, then handed to _mm_cvtpi16_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}
        -:  709:
/* Convert the low four unsigned 8-bit values in A to SPFP form.
   Zero-extend the low bytes to words, then reuse _mm_cvtpu16_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}
        -:  717:
/* Convert the four signed 32-bit values in A and B to SPFP form:
   A supplies the two low elements, B the two high elements.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  /* Combine the two converted pairs into one vector.  */
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}
        -:  727:
/* Convert the four SPFP values in A to four signed 16-bit integers.
   Each half is converted with cvtps2pi, then the two doubleword pairs
   are narrowed with signed saturation (packssdw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  /* movhlps copies the upper two elements into the lower half.  */
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}
        -:  738:
/* Convert the four SPFP values in A to four signed 8-bit integers in
   the low half of the result; the upper four bytes come from packing
   zero and are therefore zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}
        -:  746:
/* Selects four specific SPFP values from A and B based on MASK.
   MASK must be an immediate (build it with _MM_SHUFFLE).  When not
   optimizing, inlining is not guaranteed, so a macro form is used to
   keep MASK a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
				   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
        -:  759:
/* Selects and interleaves the upper two SPFP values from A and B
   (unpckhps).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B
   (unpcklps).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}
        -:  773:
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}
        -:  788:
/* Moves the upper two values of B into the lower two values of A;
   the upper two values of A are unchanged (movhlps).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A;
   the lower two values of A are unchanged (movlhps).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}
        -:  802:
/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}
        -:  817:
/* Creates a 4-bit mask from the most significant bits of the four SPFP
   values in A (movmskps); the mask is returned in the low bits of an
   int.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
        -:  824:
/* Return the contents of the MXCSR control/status register
   (stmxcsr).  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read the exception-state bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/* Read the exception-mask bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

/* Read the rounding-mode bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

/* Read the flush-to-zero bit from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}
        -:  856:
/* Set the MXCSR control/status register to I (ldmxcsr).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Replace the exception-state bits of the control register with
   __mask; all other bits are preserved.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

/* Replace the exception-mask bits of the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

/* Replace the rounding-mode bits of the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

/* Replace the flush-to-zero bit of the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
        -:  888:
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

/* Alternate (Intel-compatibility) name for _mm_set1_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}
        -:  908:
/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

/* Alternate (Intel-compatibility) name for _mm_load1_ps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}
        -:  928:
/* Load four SPFP values from P.  The address must be 16-byte aligned;
   the plain __m128 dereference carries that alignment requirement.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return *(__m128 *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte
   aligned; __m128_u is the unaligned variant of __m128.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return *(__m128_u *)__P;
}

/* Load four SPFP values in reverse order (a shufps with the 3,2,1,0
   element order reversed).  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}
        -:  950:
/* Create the vector [Z Y X W]; W becomes element 0 (the low
   element).  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z]; arguments are placed in memory order,
   so Z becomes element 0.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
        -:  964:
/* Stores the lower SPFP value (element 0) of A to *P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

/* Return the lower SPFP value (element 0) of A as a float.  */
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}
        -:  977:
/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned;
   __m128_u is the unaligned variant of __m128.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}
        -:  991:
/* Store the lower SPFP value across four words: broadcast element 0
   with a shufps, then store unaligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

/* Alternate (Intel-compatibility) name for _mm_store1_ps.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be
   aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}
        -: 1015:
/* Sets the low SPFP value of A from the low value of B; elements 1-3
   come from A.  Shuffle index 4 selects element 0 of the second
   operand.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
                                     __extension__
                                     (__attribute__((__vector_size__ (16))) int)
                                     {4,1,2,3});
}
        -: 1025:
/* Extracts one of the four words of A, zero-extended to int.  The
   selector N must be an immediate.  A macro form is used when not
   optimizing so N stays a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

/* MMX-style alias for _mm_extract_pi16 (pextrw).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	\
  ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif
        -: 1045:
/* Inserts word D into one of four words of A.  The selector N must be
   an immediate; a macro form is used when not optimizing so N stays a
   compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

/* MMX-style alias for _mm_insert_pi16 (pinsrw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
					(int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif
        -: 1067:
/* Compute the element-wise maximum of signed 16-bit values
   (pmaxsw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style alias for _mm_max_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}
        -: 1080:
/* Compute the element-wise maximum of unsigned 8-bit values
   (pmaxub).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style alias for _mm_max_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
        -: 1093:
/* Compute the element-wise minimum of signed 16-bit values
   (pminsw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style alias for _mm_min_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}
        -: 1106:
/* Compute the element-wise minimum of unsigned 8-bit values
   (pminub).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style alias for _mm_min_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
        -: 1119:
/* Create an 8-bit mask from the sign bits of the eight 8-bit values
   in A (pmovmskb).  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

/* MMX-style alias for _mm_movemask_pi8.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
        -: 1132:
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit
   values in B and produce the high 16 bits of the 32-bit results
   (pmulhuw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style alias for _mm_mulhi_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
        -: 1146:
/* Return a combination of the four 16-bit values in A (pshufw).  The
   selector N must be an immediate; a macro form is used when not
   optimizing so N stays a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

/* MMX-style alias for _mm_shuffle_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif
        -: 1167:
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
#ifdef __MMX_WITH_SSE__
  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
     64:127 at address __P.  */
  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
  /* Zero-extend __A and __N to 128 bits.  */
  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };

  /* Check the alignment of __P.  */
  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
  if (offset)
    {
      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
	 Otherwise, subtract __P by the misalignment.  */
      if (offset > 8)
	offset = 8;
      __P = (char *) (((__SIZE_TYPE__) __P) - offset);

      /* Shift __A128 and __N128 to the left by the adjustment.  The
	 shift count must be an immediate for pslldqi128, hence the
	 switch rather than a variable shift; counts are in bits
	 (offset bytes * 8).  Shifting the mask left by the same amount
	 keeps the disabled low bytes from being written.  */
      switch (offset)
	{
	case 1:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
	  break;
	case 2:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
	  break;
	case 3:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
	  break;
	case 4:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
	  break;
	case 5:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
	  break;
	case 6:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
	  break;
	case 7:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
	  break;
	case 8:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
	  break;
	default:
	  break;
	}
    }
  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
#else
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
#endif
}
        -: 1237:
/* MMX-style alias for _mm_maskmove_si64 (maskmovq).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
        -: 1243:
/* Compute the rounded averages of the unsigned 8-bit values in A and B
   (pavgb).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style alias for _mm_avg_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}
        -: 1256:
/* Compute the rounded averages of the unsigned 16-bit values in A and
   B (pavgw).  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style alias for _mm_avg_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
        -: 1269:
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B (psadbw).  Return the value in the lower 16-bit
   word; the upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style alias for _mm_sad_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
        -: 1284:
/* Stores the data in A to the address P without polluting the caches
   (non-temporal movntq).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}
        -: 1291:
/* Likewise for four SPFP values (non-temporal movntps).  The address
   must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}
        -: 1298:
/* Guarantees that every preceding store is globally visible before
   any subsequent store (sfence store barrier).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
        -: 1306:
        -: 1307:/* Transpose the 4x4 matrix composed of row[0-3].  */
        -: 1308:#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
        -: 1309:do {									\
        -: 1310:  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
        -: 1311:  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
        -: 1312:  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
        -: 1313:  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
        -: 1314:  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
        -: 1315:  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
        -: 1316:  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
        -: 1317:  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
        -: 1318:  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
        -: 1319:} while (0)
        -: 1320:
        -: 1321:/* For backward source compatibility.  */
        -: 1322:# include <emmintrin.h>
        -: 1323:
        -: 1324:#ifdef __DISABLE_SSE__
        -: 1325:#undef __DISABLE_SSE__
        -: 1326:#pragma GCC pop_options
        -: 1327:#endif /* __DISABLE_SSE__ */
        -: 1328:
        -: 1329:/* The execution of the next instruction is delayed by an implementation
        -: 1330:   specific amount of time.  The instruction does not modify the
        -: 1331:   architectural state.  This is after the pop_options pragma because
        -: 1332:   it does not require SSE support in the processor--the encoding is a
        -: 1333:   nop on processors that do not support it.  */
        -: 1334:extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
        -: 1335:_mm_pause (void)
        -: 1336:{
        -: 1337:  __builtin_ia32_pause ();
        -: 1338:}
        -: 1339:
        -: 1340:#endif /* _XMMINTRIN_H_INCLUDED */
