spstabilize4.c (1965B)
1 #include <rcx/all.h> 2 #include <rcx/simd.h> 3 #include <stdio.h> 4 5 #include "common.h" 6 7 #ifndef R_HAVE_AVX2 8 #error "AVX2 support required" 9 #endif 10 11 #define USAGE "usage: %s\n" 12 13 Sandpile 14 stabilize(Sandpile sp) { 15 usize w = sp.w; 16 usize h = sp.h; 17 18 u32 *sand[2]; 19 sand[0] = sp.sand; 20 sand[1] = r_eallocz((w + 2) * (h + 2) * sizeof(u32)); 21 22 isize nxv = (isize)w / 8; // Number of x vectors that fit in w 23 v8u32 v3 = v8u32_fill(3); 24 for (usize i = 0;; i = !i) { 25 usize unstable = 0; 26 27 for (isize y = 1; y <= h; y++) { 28 isize j = y * ((isize)w + 2) + 1; 29 30 for (isize xv = 0; xv < nxv; xv++, j += 8) { 31 v8u32 a = v8u32_loadu((v8u32a1 *)&sand[i][j]); 32 a = v8u32_and(a, v3); 33 34 #define ADD(dx, dy) \ 35 do { \ 36 isize dj = (dy) * ((isize)w + 2) + (dx); \ 37 v8u32 b = v8u32_loadu((v8u32a1 *)&sand[i][j + dj]); \ 38 b = v8u32_sri(b, 2); \ 39 a = v8u32_add(a, b); \ 40 } while (0) 41 ADD(+1, +0); 42 ADD(+0, +1); 43 ADD(-1, +0); 44 ADD(+0, -1); 45 #undef ADD 46 47 v8u32 g = v8u32_cmpgt(a, v3); 48 unstable += !v8u32_testz(g, g); 49 50 v8u32_storeu((v8u32a1 *)&sand[!i][j], a); 51 } 52 53 // TODO: Try dealing with tail with masked vector instead? Note 54 // that this would require a minimum width/height of 3. 55 for (isize x = 8*nxv; x < (isize)w; x++, j++) { 56 u32 a = sand[i][j]; 57 a = a & 3; 58 59 #define ADD(dx, dy) \ 60 do { \ 61 isize dj = (dy) * ((isize)w + 2) + (dx); \ 62 u32 b = sand[i][j + dj]; \ 63 b = b >> 2; \ 64 a = a + b; \ 65 } while (0) 66 ADD(+1, +0); 67 ADD(+0, +1); 68 ADD(-1, +0); 69 ADD(+0, -1); 70 #undef ADD 71 72 unstable += a > 3; 73 74 sand[!i][j] = a; 75 } 76 } 77 78 if (!unstable) { 79 free(sand[i]); 80 return (Sandpile){.w = w, .h = h, .sand = sand[!i]}; 81 } 82 } 83 } 84 85 int 86 main(int argc, char **argv) { 87 CHECK_FOR_HELP_OPTION(USAGE, argc, argv); 88 89 if (argc != 1) { 90 fprintf(stderr, USAGE, argv[0]); 91 return 1; 92 } 93 94 Sandpile sp = sp_fread(stdin); 95 sp = stabilize(sp); 96 sp_fwrite(stdout, sp); 97 }