#include #include #include #include #include #include #include #include typedef struct { double x; double y; } point; double sqr(double x) { return x*x; } double dist(double x[], double y[], int i, int j) { return sqrt(sqr(x[i]-x[j])+ sqr(y[i]-y[j])); } double DistSqrd(point cities[], int i, int j) { return (sqr(cities[i].x-cities[j].x)+ sqr(cities[i].y-cities[j].y)); } void swap(double *p, double *q) { double tmp=*p; *p = *q; *q = tmp; } static __inline double _mm256_castpd256_sd(__m256d x) { double y; _mm_store_sd(&y, _mm256_castpd256_pd128(x)); return y; } // tourx and toury aligned on 32-byte boundary. // Alignment is not necessary for correctness, but very important for execution speed // Both tourx[] and toury[] should have space for 4 guard values at the end void tsp(point cities[], double tourx[], double toury[], int ncities) { int i; tourx[0] = cities[ncities-1].x; toury[0] = cities[ncities-1].y; for (i=1; i= ncities) break; min_k = k; } } dist_va[0] = minDist; } double *px = &tourx[k]; double *py = &toury[k]; __m256d minDist_v = _mm256_broadcast_sd(&dist_va[0]); while (_mm256_castpd256_sd(minDist_v) != 0) // finish when found two cities in the same place { int le_msk; do { __m256d x = _mm256_loadu_pd(px); __m256d y = _mm256_loadu_pd(py); __m256d dx_v = _mm256_sub_pd(x, thisx_v); __m256d dy_v = _mm256_sub_pd(y, thisy_v); _mm_prefetch(&px[40], 0); _mm_prefetch(&py[40], 0); px += 4; py += 4; dist_v = _mm256_add_pd(_mm256_mul_pd(dx_v, dx_v),_mm256_mul_pd(dy_v, dy_v)); __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ); le_msk = _mm256_movemask_pd(le_v); } while (le_msk == 0); _mm256_storeu_pd(dist_va, dist_v); int prev_k = px - tourx - 4; do { #ifdef MSVC long unsigned ki; _BitScanForward(&ki, le_msk); #else int ki = __builtin_ctz(le_msk); #endif minDist_v = _mm256_broadcast_sd(&dist_va[ki]); int nxt_k = prev_k + ki; if (nxt_k < ncities) min_k = nxt_k; __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LT_OQ); // LT rather than LE, to guarantee a forward progress le_msk = _mm256_movemask_pd(le_v); } while (le_msk != 0); } swap(&tourx[i],&tourx[min_k]); swap(&toury[i],&toury[min_k]); } } int main(int argc, char *argv[]) { int i, ncities; point *cities; double *tourx; double *toury; FILE *psfile; double sumdist = 0.0; if (argc!=2) { fprintf(stderr, "usage: %s ", argv[0]); exit(1); } ncities = atoi(argv[1]); cities = alloca(ncities*sizeof(point)); tourx=alloca((ncities+4)*sizeof(double))+4*sizeof(double); toury=alloca(ncities*sizeof(double)); for (i=0; i