I've had a look at your rather marvellous code and I've come up with the following implementation.

Code: Select all

```
section .data
;start: dw 0xEEEE, 0xEEEE,0xEEEE, 0xEEEE,0xEEEE, 0xEEEE,0xEEEE, 0xEEEE
start:dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x3333,0x3333,0x3333,0x3333,0x3333,0x3333,0x3333,0x3333
nw: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff
n: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555
w: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 1,1,1,1,1,1,1,1
se: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff
s: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555
e: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFfe;dw 1,1,1,1,1,1,1,1
result:dw 0,0,0,0,0,0,0,0
section .text
global CMAIN
CMAIN:
mov rbp, rsp; for correct debugging
; A cell looks like this:
; DCB 876
; AxA 504
; DCB 321
; we're using half-adder logic to store the 1, 2 and 4's count in 3 bitplanes.
movdqu xmm15,[start] ;***** xmm15 8-D'
movdqu xmm1,[s]
movdqu xmm2,[e]
movdqu xmm3,[se]
;now create 9 planes, each one slightly shifted.
;xmm0 is the center plane.
vpsrldq xmm3,xmm3,16-4 ;keep the bottom 2 rows of NW and shift them to the top
vpsrldq xmm6,xmm1,16-2 ;N5 keep the bottom 1 rows of N and shift them to the top.
vpsrldq xmm1,xmm1,16-4 ;N3 keep the bottom 2 rows of N and shift them to the top.
vpsrlw xmm2,xmm2,14 ;W6 keep the 2 rightmost rows of W
vpsrlw xmm3,xmm3,14 ;NW1 keep the 2 rightmost rows of NW
vpslldq xmm5,xmm15,4 ;main3
vpslldq xmm4,xmm15,2 ;main5
vpxor xmm14,xmm1,xmm5 ;***** xmm14 3 - D
vpxor xmm13,xmm4,xmm6 ;***** xmm13 5 - A'
;we are done with N, xmm1
vpsrlw xmm1,xmm2,1 ;W7
vpsllw xmm7,xmm15,1 ;main7
vpsllw xmm8,xmm13,1 ;main0+N0
vpsllw xmm9,xmm14,1 ;main2+N2
vpxor xmm12,xmm7,xmm1 ;***** xmm12 7 - C
vpsllw xmm7,xmm7,1 ;main6
vpxor xmm11,xmm7,xmm2 ;***** xmm11 6 - B'
vpslldq xmm10,xmm11,2 ;main4+w4
vpsrldq xmm7,xmm3,2 ;NW4
vpsllw xmm6,xmm6,2 ;N4
vpxor xmm10,xmm10,xmm7 ;main4+w4+N4
vpxor xmm10,xmm10,xmm6 ;***** xmm10 4 - A
vpslldq xmm1,xmm1,2 ;W0
vpsrlw xmm7,xmm7,1 ;NW0
vpxor xmm0,xmm8,xmm1 ;main0+N0+W0
vpxor xmm0,xmm0,xmm7 ;***** xmm0 0 - x
vpslldq xmm1,xmm2,4 ;W1
vpsllw xmm8,xmm9,1 ;main1+N1
vpxor xmm8,xmm8,xmm1 ;main1+N1+W1
vpxor xmm8,xmm8,xmm3 ;**** xmm8 1 - B
vpsrlw xmm4,xmm1,1 ;W2
vpsrlw xmm5,xmm3,1 ;NW2
vpxor xmm1,xmm4,xmm5 ;W2+NW2
vpxor xmm9,xmm9,xmm1 ;main2+N2+W2+NW2
;First get all the counts
vpxor xmm1,xmm8,xmm9 ;1's count of c
vpand xmm2,xmm8,xmm9 ;2's count of c
vpxor xmm3,xmm10,xmm11 ;1's count of a
vpand xmm4,xmm10,xmm11 ;2's count of a
vpxor xmm5,xmm12,xmm13 ;1's count of b
vpand xmm6,xmm12,xmm13 ;2's count of b
vpxor xmm7,xmm14,xmm15 ;1's count of d
vpand xmm8,xmm14,xmm15 ;2's count of d
;Now add the 1's together
vpand xmm10,xmm1,xmm3 ;2's count of CA
vpxor xmm1,xmm1,xmm3 ;combined ones of CA
vpand xmm12,xmm5,xmm7 ;2's count of BD
vpxor xmm5,xmm5,xmm7 ;combined ones of BD
vpand xmm14,xmm1,xmm5 ;2's count of CABD
vpxor xmm1,xmm1,xmm5 ;final count of the 1's
;now we need to add all the 2's together.
vpand xmm3,xmm2,xmm4 ;4's count of ca
vpxor xmm2,xmm2,xmm4 ;2's count of ca
vpand xmm5,xmm6,xmm8 ;4's count of bd
vpxor xmm6,xmm6,xmm8 ;2's count of bd
vpand xmm7,xmm10,xmm12 ;4's count of CABD
vpxor xmm8,xmm10,xmm12 ;2's count of CABD
vpand xmm9,xmm2,xmm6 ;4's count of cabd
vpxor xmm4,xmm2,xmm6 ;2's count of cabd
vpand xmm11,xmm8,xmm14 ;4's count of CABD+abcd
vpxor xmm12,xmm8,xmm14 ;2's count of CABD+abcd
;add all 4's
vpor xmm15,xmm3,xmm5
vpor xmm13,xmm7,xmm9
vpor xmm14,xmm11,xmm15
;add the 2's
vpxor xmm2,xmm12,xmm4
;final add
vpor xmm4,xmm14,xmm13
;now we have all the counts.
;register usage:
;xmm0 - original pattern
;xmm1 - count of the ones
;xmm2 - count of the twos
;xmm15 - count of the fours
vpand xmm0,xmm0,xmm2 ;anything with a 2 stays the same
vpand xmm3,xmm2,xmm1 ;anything with a 3 is alive
vpor xmm0,xmm0,xmm3 ;add the alive cells
vpandn xmm0,xmm4,xmm0 ;subtract the cells with 4 or more neighbors
;ret
; A cell looks like this:
; BCD 123
; AxA 405
; BCD 678
; we're using half-adder logic to store the 1, 2 and 4's count in 3 bitplanes.
movdqu xmm15,[start] ;***** xmm15 8-D'
movdqu xmm1,[n]
movdqu xmm2,[w]
movdqu xmm3,[nw]
;now create 9 planes, each one slightly shifted.
;xmm0 is the center plane.
vpslldq xmm3,xmm3,16-4 ;keep the bottom 2 rows of NW and shift them to the top
vpslldq xmm6,xmm1,16-2 ;N5 keep the bottom 1 rows of N and shift them to the top.
vpslldq xmm1,xmm1,16-4 ;N3 keep the bottom 2 rows of N and shift them to the top.
vpsllw xmm2,xmm2,14 ;W6 keep the 2 rightmost rows of W
vpsllw xmm3,xmm3,14 ;NW1 keep the 2 rightmost rows of NW
vpsrldq xmm5,xmm15,4 ;main3
vpsrldq xmm4,xmm15,2 ;main5
vpxor xmm14,xmm1,xmm5 ;***** xmm14 3 - D
vpxor xmm13,xmm4,xmm6 ;***** xmm13 5 - A'
;we are done with N, xmm1
vpsllw xmm1,xmm2,1 ;W7
vpsrlw xmm7,xmm15,1 ;main7
vpsrlw xmm8,xmm13,1 ;main0+N0
vpsrlw xmm9,xmm14,1 ;main2+N2
vpxor xmm12,xmm7,xmm1 ;***** xmm12 7 - C
vpsrlw xmm7,xmm7,1 ;main6
vpxor xmm11,xmm7,xmm2 ;***** xmm11 6 - B'
vpsrldq xmm10,xmm11,2 ;main4+w4
vpslldq xmm7,xmm3,2 ;NW4
vpsrlw xmm6,xmm6,2 ;N4
vpxor xmm10,xmm10,xmm7 ;main4+w4+N4
vpxor xmm10,xmm10,xmm6 ;***** xmm10 4 - A
vpsrldq xmm1,xmm1,2 ;W0
vpsllw xmm7,xmm7,1 ;NW0
vpxor xmm0,xmm8,xmm1 ;main0+N0+W0
vpxor xmm0,xmm0,xmm7 ;***** xmm0 0 - x
vpsrldq xmm1,xmm2,4 ;W1
vpsrlw xmm8,xmm9,1 ;main1+N1
vpxor xmm8,xmm8,xmm1 ;main1+N1+W1
vpxor xmm8,xmm8,xmm3 ;**** xmm8 1 - B
vpsllw xmm4,xmm1,1 ;W2
vpsllw xmm5,xmm3,1 ;NW2
vpxor xmm1,xmm4,xmm5 ;W2+NW2
vpxor xmm9,xmm9,xmm1 ;main2+N2+W2+NW2
;First get all the counts
vpxor xmm1,xmm8,xmm9 ;1's count of c
vpand xmm2,xmm8,xmm9 ;2's count of c
vpxor xmm3,xmm10,xmm11 ;1's count of a
vpand xmm4,xmm10,xmm11 ;2's count of a
vpxor xmm5,xmm12,xmm13 ;1's count of b
vpand xmm6,xmm12,xmm13 ;2's count of b
vpxor xmm7,xmm14,xmm15 ;1's count of d
vpand xmm8,xmm14,xmm15 ;2's count of d
;Now add the 1's together
vpand xmm10,xmm1,xmm3 ;2's count of CA
vpxor xmm1,xmm1,xmm3 ;combined ones of CA
vpand xmm12,xmm5,xmm7 ;2's count of BD
vpxor xmm5,xmm5,xmm7 ;combined ones of BD
vpand xmm14,xmm1,xmm5 ;2's count of CABD
vpxor xmm1,xmm1,xmm5 ;final count of the 1's
;now we need to add all the 2's together.
vpand xmm3,xmm2,xmm4 ;4's count of ca
vpxor xmm2,xmm2,xmm4 ;2's count of ca
vpand xmm5,xmm6,xmm8 ;4's count of bd
vpxor xmm6,xmm6,xmm8 ;2's count of bd
vpand xmm7,xmm10,xmm12 ;4's count of CABD
vpxor xmm8,xmm10,xmm12 ;2's count of CABD
vpand xmm9,xmm2,xmm6 ;4's count of cabd
vpxor xmm4,xmm2,xmm6 ;2's count of cabd
vpand xmm11,xmm8,xmm14 ;4's count of CABD+abcd
vpxor xmm12,xmm8,xmm14 ;2's count of CABD+abcd
;add all 4's
vpor xmm15,xmm3,xmm5
vpor xmm13,xmm7,xmm9
vpor xmm14,xmm11,xmm15
;add the 2's
vpxor xmm2,xmm12,xmm4
;final add
vpor xmm4,xmm14,xmm13
;now we have all the counts.
;register usage:
;xmm0 - original pattern
;xmm1 - count of the ones
;xmm2 - count of the twos
;xmm15 - count of the fours
vpand xmm7,xmm0,xmm2 ;anything with a 2 stays the same
vpand xmm3,xmm2,xmm1 ;anything with a 3 is alive
pxor xmm15,xmm15 ;zero xmm15 for the comparison against 0
vpor xmm7,xmm7,xmm3 ;add the alive cells
xor eax,eax
;bit 7 of al will be shifted into bit 16 of eax at a later stage and so on....
;The status bit work as follows
;7----6----5----4----3----2----1----0
;+ changed | alive +
;NW W N all NW W N all
vpandn xmm4,xmm4,xmm7 ;subtract the cells with 4 or more neighbors
vpxor xmm0,xmm7,xmm0 ;see which cells have changed
vpsrldq xmm5,xmm4,16-4 ;new N
vpsrldq xmm1,xmm0,16-4 ;changes in N
vpsrlw xmm6,xmm4,16-2 ;new W
vpsrlw xmm2,xmm0,16-2 ;changes in W
vpsrlw xmm7,xmm5,16-2 ;new NW
vpsrlw xmm3,xmm0,16-2 ;changes in NW
; test the 2 qwords in each vector against zero
vpcmpeqq xmm11, xmm1, xmm15
vpcmpeqq xmm12, xmm3, xmm15
vpcmpeqq xmm13, xmm5, xmm15
vpcmpeqq xmm14, xmm7, xmm15
; blend the results down into xmm10 word origin
vpblendw xmm10, xmm11, xmm12, 0xAA ; 3131 3131
vpblendw xmm13, xmm13, xmm14, 0xAA ; 7575 7575
vpblendw xmm10, xmm10, xmm13, 0xCC ; 7531 7531
; test the 2 qwords in each vector against zero
vpcmpeqq xmm11, xmm2, xmm15
vpcmpeqq xmm12, xmm4, xmm15
vpcmpeqq xmm13, xmm6, xmm15
vpcmpeqq xmm14, xmm8, xmm15
; blend the results down into xmm11 word origin
vpblendw xmm11, xmm11, xmm12, 0xAA ; 4242 4242
vpblendw xmm13, xmm13, xmm14, 0xAA ; 8686 8686
vpblendw xmm11, xmm11, xmm13, 0xCC ; 8642 8642
;combine bytes of xmm10 and xmm11 together into xmm10, byte wise
;
; xmm10 77553311 77553311
; xmm11 88664422 88664422 before shift
; xmm10 07050301 07050301
; xmm11 80604020 80604020 after shift
;result 87654321 87654321 result, note that HL don't have to be the same
vpsrlw xmm10,xmm10,8
vpsllw xmm11,xmm11,8
vpor xmm10,xmm10,xmm11
;combine the low and high dqword to make sure both are zero.
vpsrldq xmm11,xmm10,8
vpand xmm10,xmm11
vpmovmskb eax,xmm10
not eax
;we now have a bitmap of area's that are alive and changing.
;count the number of live and changed cells in the result.
;count the live cells
vpsrldq xmm8,xmm7,8 ;xmm7 = result, xmm8-high result
movq rdx,xmm7 ;low part
movq rcx,xmm8 ;high part
popcnt rdx,rdx ;count the first half
popcnt rcx,rcx ;ecx = high bytes
add ecx,edx ;ecx = live cell count 0..128
vpsrldq xmm9,xmm0,8 ;xmm0 = changes, xmm9-high changes
movq rdx,xmm0 ;low part
movq r8,xmm9 ;high part
popcnt rdx,rdx ;count the low cells
popcnt r8,r8 ;count the high cells
add rdx,r8 ;get the changed bit count 0..128
```

This is the same trick that qlife/life32 and henselLife use.

With some shifting I can then create 8 different bitmaps for the neighborhoods and lookup 128 bits in one go.

AVX2 will allow 256 bits in one go, but my CPU runs ymm code twice as slow as xmm code, so I skipped the AVX2 for now.

If you add the bitplane for the count of 8 neighborhood cells, you can cover all the totalistic variants of life.

On the laptop it runs 128 bits in 38 cycles which is impressive.

With a bit more manipulation I can do the bookkeeping for the 128-bit units as well and even throw in a cell counting, but that will not be in the main running code.

I plan to use the code for a display program and pattern searcher, but am already very happy with the result.

It will also run on a GPU with some adjustments which should speed things up quite a bit more.

Anyway, thanks for your suggestion of using bitplanes to keep track of the 1,2,4,8 neighborhood bitcounts.

You can fix the problems with memory access time by doing more bookkeeping inside the calculations and use these to eliminate calculating stable/dead/p2 units.

Also staggering your even and odd generations cuts the memory load more than half (you only have to access 3 neighbor-units, not

.

Finally by calculating all 128(256) bits and not 84(196) you'll spend more time calculating but save double on memory access piecing patterns together.

Finally by accessing the odd generation units in the opposite order of the even units you'll optimize the cache coherency.

(This is what qlife (golly) does).