I've had a look at your rather marvellous code and I've come up with the following implementation.

`section .data`

;start: dw 0xEEEE, 0xEEEE,0xEEEE, 0xEEEE,0xEEEE, 0xEEEE,0xEEEE, 0xEEEE

start:dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x3333,0x3333,0x3333,0x3333,0x3333,0x3333,0x3333,0x3333

nw: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff

n: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555

w: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 1,1,1,1,1,1,1,1

se: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff

s: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff;dw 0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555,0x5555

e: dw 0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFff,0xFFfe;dw 1,1,1,1,1,1,1,1

result:dw 0,0,0,0,0,0,0,0

section .text

global CMAIN

CMAIN:

mov rbp, rsp; for correct debugging

; A cell looks like this:

; DCB 876

; AxA 504

; DCB 321

; we're using half-adder logic to store the 1, 2 and 4's count in 3 bitplanes.

movdqu xmm15,[start] ;***** xmm15 8-D'

movdqu xmm1,[s]

movdqu xmm2,[e]

movdqu xmm3,[se]

;now create 9 planes, each one slightly shifted.

;xmm0 is the center plane.

vpsrldq xmm3,xmm3,16-4 ;keep the bottom 2 rows of NW and shift them to the top

vpsrldq xmm6,xmm1,16-2 ;N5 keep the bottom 1 rows of N and shift them to the top.

vpsrldq xmm1,xmm1,16-4 ;N3 keep the bottom 2 rows of N and shift them to the top.

vpsrlw xmm2,xmm2,14 ;W6 keep the 2 rightmost rows of W

vpsrlw xmm3,xmm3,14 ;NW1 keep the 2 rightmost rows of NW

vpslldq xmm5,xmm15,4 ;main3

vpslldq xmm4,xmm15,2 ;main5

vpxor xmm14,xmm1,xmm5 ;***** xmm14 3 - D

vpxor xmm13,xmm4,xmm6 ;***** xmm13 5 - A'

;we are done with N, xmm1

vpsrlw xmm1,xmm2,1 ;W7

vpsllw xmm7,xmm15,1 ;main7

vpsllw xmm8,xmm13,1 ;main0+N0

vpsllw xmm9,xmm14,1 ;main2+N2

vpxor xmm12,xmm7,xmm1 ;***** xmm12 7 - C

vpsllw xmm7,xmm7,1 ;main6

vpxor xmm11,xmm7,xmm2 ;***** xmm11 6 - B'

vpslldq xmm10,xmm11,2 ;main4+w4

vpsrldq xmm7,xmm3,2 ;NW4

vpsllw xmm6,xmm6,2 ;N4

vpxor xmm10,xmm10,xmm7 ;main4+w4+N4

vpxor xmm10,xmm10,xmm6 ;***** xmm10 4 - A

vpslldq xmm1,xmm1,2 ;W0

vpsrlw xmm7,xmm7,1 ;NW0

vpxor xmm0,xmm8,xmm1 ;main0+N0+W0

vpxor xmm0,xmm0,xmm7 ;***** xmm0 0 - x

vpslldq xmm1,xmm2,4 ;W1

vpsllw xmm8,xmm9,1 ;main1+N1

vpxor xmm8,xmm8,xmm1 ;main1+N1+W1

vpxor xmm8,xmm8,xmm3 ;**** xmm8 1 - B

vpsrlw xmm4,xmm1,1 ;W2

vpsrlw xmm5,xmm3,1 ;NW2

vpxor xmm1,xmm4,xmm5 ;W2+NW2

vpxor xmm9,xmm9,xmm1 ;main2+N2+W2+NW2

;First get all the counts

vpxor xmm1,xmm8,xmm9 ;1's count of c

vpand xmm2,xmm8,xmm9 ;2's count of c

vpxor xmm3,xmm10,xmm11 ;1's count of a

vpand xmm4,xmm10,xmm11 ;2's count of a

vpxor xmm5,xmm12,xmm13 ;1's count of b

vpand xmm6,xmm12,xmm13 ;2's count of b

vpxor xmm7,xmm14,xmm15 ;1's count of d

vpand xmm8,xmm14,xmm15 ;2's count of d

;Now add the 1's together

vpand xmm10,xmm1,xmm3 ;2's count of CA

vpxor xmm1,xmm1,xmm3 ;combined ones of CA

vpand xmm12,xmm5,xmm7 ;2's count of BD

vpxor xmm5,xmm5,xmm7 ;combined ones of BD

vpand xmm14,xmm1,xmm5 ;2's count of CABD

vpxor xmm1,xmm1,xmm5 ;final count of the 1's

;now we need to add all the 2's together.

vpand xmm3,xmm2,xmm4 ;4's count of ca

vpxor xmm2,xmm2,xmm4 ;2's count of ca

vpand xmm5,xmm6,xmm8 ;4's count of bd

vpxor xmm6,xmm6,xmm8 ;2's count of bd

vpand xmm7,xmm10,xmm12 ;4's count of CABD

vpxor xmm8,xmm10,xmm12 ;2's count of CABD

vpand xmm9,xmm2,xmm6 ;4's count of cabd

vpxor xmm4,xmm2,xmm6 ;2's count of cabd

vpand xmm11,xmm8,xmm14 ;4's count of CABD+abcd

vpxor xmm12,xmm8,xmm14 ;2's count of CABD+abcd

;add all 4's

vpor xmm15,xmm3,xmm5

vpor xmm13,xmm7,xmm9

vpor xmm14,xmm11,xmm15

;add the 2's

vpxor xmm2,xmm12,xmm4

;final add

vpor xmm4,xmm14,xmm13

;now we have all the counts.

;register usage:

;xmm0 - original pattern

;xmm1 - count of the ones

;xmm2 - count of the twos

;xmm15 - count of the fours

vpand xmm0,xmm0,xmm2 ;anything with a 2 stays the same

vpand xmm3,xmm2,xmm1 ;anything with a 3 is alive

vpor xmm0,xmm0,xmm3 ;add the alive cells

vpandn xmm0,xmm4,xmm0 ;subtract the cells with 4 or more neighbors

;ret

; A cell looks like this:

; BCD 123

; AxA 405

; BCD 678

; we're using half-adder logic to store the 1, 2 and 4's count in 3 bitplanes.

movdqu xmm15,[start] ;***** xmm15 8-D'

movdqu xmm1,[n]

movdqu xmm2,[w]

movdqu xmm3,[nw]

;now create 9 planes, each one slightly shifted.

;xmm0 is the center plane.

vpslldq xmm3,xmm3,16-4 ;keep the bottom 2 rows of NW and shift them to the top

vpslldq xmm6,xmm1,16-2 ;N5 keep the bottom 1 rows of N and shift them to the top.

vpslldq xmm1,xmm1,16-4 ;N3 keep the bottom 2 rows of N and shift them to the top.

vpsllw xmm2,xmm2,14 ;W6 keep the 2 rightmost rows of W

vpsllw xmm3,xmm3,14 ;NW1 keep the 2 rightmost rows of NW

vpsrldq xmm5,xmm15,4 ;main3

vpsrldq xmm4,xmm15,2 ;main5

vpxor xmm14,xmm1,xmm5 ;***** xmm14 3 - D

vpxor xmm13,xmm4,xmm6 ;***** xmm13 5 - A'

;we are done with N, xmm1

vpsllw xmm1,xmm2,1 ;W7

vpsrlw xmm7,xmm15,1 ;main7

vpsrlw xmm8,xmm13,1 ;main0+N0

vpsrlw xmm9,xmm14,1 ;main2+N2

vpxor xmm12,xmm7,xmm1 ;***** xmm12 7 - C

vpsrlw xmm7,xmm7,1 ;main6

vpxor xmm11,xmm7,xmm2 ;***** xmm11 6 - B'

vpsrldq xmm10,xmm11,2 ;main4+w4

vpslldq xmm7,xmm3,2 ;NW4

vpsrlw xmm6,xmm6,2 ;N4

vpxor xmm10,xmm10,xmm7 ;main4+w4+N4

vpxor xmm10,xmm10,xmm6 ;***** xmm10 4 - A

vpsrldq xmm1,xmm1,2 ;W0

vpsllw xmm7,xmm7,1 ;NW0

vpxor xmm0,xmm8,xmm1 ;main0+N0+W0

vpxor xmm0,xmm0,xmm7 ;***** xmm0 0 - x

vpsrldq xmm1,xmm2,4 ;W1

vpsrlw xmm8,xmm9,1 ;main1+N1

vpxor xmm8,xmm8,xmm1 ;main1+N1+W1

vpxor xmm8,xmm8,xmm3 ;**** xmm8 1 - B

vpsllw xmm4,xmm1,1 ;W2

vpsllw xmm5,xmm3,1 ;NW2

vpxor xmm1,xmm4,xmm5 ;W2+NW2

vpxor xmm9,xmm9,xmm1 ;main2+N2+W2+NW2

;First get all the counts

vpxor xmm1,xmm8,xmm9 ;1's count of c

vpand xmm2,xmm8,xmm9 ;2's count of c

vpxor xmm3,xmm10,xmm11 ;1's count of a

vpand xmm4,xmm10,xmm11 ;2's count of a

vpxor xmm5,xmm12,xmm13 ;1's count of b

vpand xmm6,xmm12,xmm13 ;2's count of b

vpxor xmm7,xmm14,xmm15 ;1's count of d

vpand xmm8,xmm14,xmm15 ;2's count of d

;Now add the 1's together

vpand xmm10,xmm1,xmm3 ;2's count of CA

vpxor xmm1,xmm1,xmm3 ;combined ones of CA

vpand xmm12,xmm5,xmm7 ;2's count of BD

vpxor xmm5,xmm5,xmm7 ;combined ones of BD

vpand xmm14,xmm1,xmm5 ;2's count of CABD

vpxor xmm1,xmm1,xmm5 ;final count of the 1's

;now we need to add all the 2's together.

vpand xmm3,xmm2,xmm4 ;4's count of ca

vpxor xmm2,xmm2,xmm4 ;2's count of ca

vpand xmm5,xmm6,xmm8 ;4's count of bd

vpxor xmm6,xmm6,xmm8 ;2's count of bd

vpand xmm7,xmm10,xmm12 ;4's count of CABD

vpxor xmm8,xmm10,xmm12 ;2's count of CABD

vpand xmm9,xmm2,xmm6 ;4's count of cabd

vpxor xmm4,xmm2,xmm6 ;2's count of cabd

vpand xmm11,xmm8,xmm14 ;4's count of CABD+abcd

vpxor xmm12,xmm8,xmm14 ;2's count of CABD+abcd

;add all 4's

vpor xmm15,xmm3,xmm5

vpor xmm13,xmm7,xmm9

vpor xmm14,xmm11,xmm15

;add the 2's

vpxor xmm2,xmm12,xmm4

;final add

vpor xmm4,xmm14,xmm13

;now we have all the counts.

;register usage:

;xmm0 - original pattern

;xmm1 - count of the ones

;xmm2 - count of the twos

;xmm15 - count of the fours

vpand xmm7,xmm0,xmm2 ;anything with a 2 stays the same

vpand xmm3,xmm2,xmm1 ;anything with a 3 is alive

pxor xmm15,xmm15 ;zero xmm15 for the comparison against 0

vpor xmm7,xmm7,xmm3 ;add the alive cells

xor eax,eax

;bit 7 of al will be shifted into bit 16 of eax at a later stage and so on....

;The status bit work as follows

;7----6----5----4----3----2----1----0

;+ changed | alive +

;NW W N all NW W N all

vpandn xmm4,xmm4,xmm7 ;subtract the cells with 4 or more neighbors

vpxor xmm0,xmm7,xmm0 ;see which cells have changed

vpsrldq xmm5,xmm4,16-4 ;new N

vpsrldq xmm1,xmm0,16-4 ;changes in N

vpsrlw xmm6,xmm4,16-2 ;new W

vpsrlw xmm2,xmm0,16-2 ;changes in W

vpsrlw xmm7,xmm5,16-2 ;new NW

vpsrlw xmm3,xmm0,16-2 ;changes in NW

; test the 2 qwords in each vector against zero

vpcmpeqq xmm11, xmm1, xmm15

vpcmpeqq xmm12, xmm3, xmm15

vpcmpeqq xmm13, xmm5, xmm15

vpcmpeqq xmm14, xmm7, xmm15

; blend the results down into xmm10 word origin

vpblendw xmm10, xmm11, xmm12, 0xAA ; 3131 3131

vpblendw xmm13, xmm13, xmm14, 0xAA ; 7575 7575

vpblendw xmm10, xmm10, xmm13, 0xCC ; 7531 7531

; test the 2 qwords in each vector against zero

vpcmpeqq xmm11, xmm2, xmm15

vpcmpeqq xmm12, xmm4, xmm15

vpcmpeqq xmm13, xmm6, xmm15

vpcmpeqq xmm14, xmm8, xmm15

; blend the results down into xmm11 word origin

vpblendw xmm11, xmm11, xmm12, 0xAA ; 4242 4242

vpblendw xmm13, xmm13, xmm14, 0xAA ; 8686 8686

vpblendw xmm11, xmm11, xmm13, 0xCC ; 8642 8642

;combine bytes of xmm10 and xmm11 together into xmm10, byte wise

;

; xmm10 77553311 77553311

; xmm11 88664422 88664422 before shift

; xmm10 07050301 07050301

; xmm11 80604020 80604020 after shift

;result 87654321 87654321 result, note that HL don't have to be the same

vpsrlw xmm10,xmm10,8

vpsllw xmm11,xmm11,8

vpor xmm10,xmm10,xmm11

;combine the low and high dqword to make sure both are zero.

vpsrldq xmm11,xmm10,8

vpand xmm10,xmm11

vpmovmskb eax,xmm10

not eax

;we now have a bitmap of area's that are alive and changing.

;count the number of live and changed cells in the result.

;count the live cells

vpsrldq xmm8,xmm7,8 ;xmm7 = result, xmm8-high result

movq rdx,xmm7 ;low part

movq rcx,xmm8 ;high part

popcnt rdx,rdx ;count the first half

popcnt rcx,rcx ;ecx = high bytes

add ecx,edx ;ecx = live cell count 0..128

vpsrldq xmm9,xmm0,8 ;xmm0 = changes, xmm9-high changes

movq rdx,xmm0 ;low part

movq r8,xmm9 ;high part

popcnt rdx,rdx ;count the low cells

popcnt r8,r8 ;count the high cells

add rdx,r8 ;get the changed bit count 0..128

This is the same trick that qlife/life32 and henselLife use.

With some shifting I can then create 8 different bitmaps for the neighborhoods and lookup 128 bits in one go.

AVX2 will allow 256 bits in one go, but my CPU runs ymm code twice as slow as xmm code, so I skipped the AVX2 for now.

If you add the bitplane for the count of 8 neighborhood cells, you can cover all the totalistic variants of life.

On the laptop it runs 128 bits in 38 cycles which is impressive.

With a bit more manipulation I can do the bookkeeping for the 128-bit units as well and even throw in a cell counting, but that will not be in the main running code.

I plan to use the code for a display program and pattern searcher, but am already very happy with the result.

It will also run on a GPU with some adjustments which should speed things up quite a bit more.

Anyway, thanks for your suggestion of using bitplanes to keep track of the 1,2,4,8 neighborhood bitcounts.

You can fix the problems with memory access time by doing more bookkeeping inside the calculations and use these to eliminate calculating stable/dead/p2 units.

Also staggering your even and odd generations cuts the memory load more than half (you only have to access 3 neighbor-units, not

.

Finally by calculating all 128(256) bits and not 84(196) you'll spend more time calculating but save double on memory access piecing patterns together.

Finally by accessing the odd generation units in the opposite order of the even units you'll optimize the cache coherency.

(This is what qlife (golly) does).