pop count preko pshufb instrukcije

bmaxa

Legenda
Poruka
70.815
Ovo je korisno za starije Intel procesore koji nemaju popcnt instrukciju (2006+)
Amd je ovu instrukciju uveo otprilike kad i popcnt tako da kod njih to i nije toliko bitno. Uporedno sam dao i avx2 implementaciju i popcnt instrukciju da se uporedi.

Kod:
format elf64
section '.text' executable align 16

public init_lut
public pop_count_native
public pop_count_pshufb
public pop_count_pshufb_avx2
public init_random
align 16
pop_count_pshufb:
        xor eax,eax
    mov rcx,mask
        movdqa xmm6,dqword[rcx]
    mov rcx,lut
        movdqa xmm7,dqword[rcx]
.L0:
        mov ecx,8
        pxor xmm4,xmm4
.L1:
        movdqu xmm0,[rdi]
        movdqa xmm1,xmm0
        psrlw xmm1,4 ; shift high nimble to low
        pand xmm0,xmm6 ; 
        pand xmm1,xmm6 ; mask nibbles
        movdqa xmm2,xmm7
        movdqa xmm3,xmm7
        pshufb xmm2,xmm0 ; 
        pshufb xmm3,xmm1 ; place pop counts
        paddb xmm2,xmm3 ;
        paddb xmm4,xmm2 ; sum them
        add rdi,16
        sub rsi,2
        jz .L2
        dec ecx
        jnz .L1 ; maximum that fits into single byte is 8*15*2
.L2:
        pxor xmm0,xmm0
        psadbw xmm4,xmm0 ; sum them 
        movhlps xmm0,xmm4 ; place high word into low
        paddd xmm0,xmm4 ; sum them
        movd ecx,xmm0 ; we got pop count into low dword finally
        add rax,rcx
        test rsi,rsi
        jnz .L0
        ret
align 16
pop_count_pshufb_avx2:
        xor eax,eax
    mov rcx,mask
        vbroadcasti128 ymm6,dqword[rcx]
    mov rcx,lut
        vbroadcasti128 ymm7,dqword[rcx]
.L0:
        vpxor ymm4,ymm4,ymm4
        mov ecx,8
.L1:
        vmovdqu ymm0,[rdi]
        vmovdqa ymm1,ymm0
        vpsrlw ymm1,ymm1,4
        vpand ymm0,ymm0,ymm6
        vpand ymm1,ymm1,ymm6
        vpshufb ymm2,ymm7,ymm0
        vpshufb ymm3,ymm7,ymm1
        vpaddb ymm2,ymm2,ymm3
        vpaddb ymm4,ymm4,ymm2
        add rdi,32
        sub rsi,4
        jz .L2
        dec ecx
        jnz .L1
.L2:
        vpxor ymm0,ymm0,ymm0
        vpsadbw ymm4,ymm4,ymm0
        vperm2i128 ymm0,ymm4,ymm4,1
        vpaddd ymm0,ymm0,ymm4
        vpermq ymm4,ymm0,1
        vpaddd ymm0,ymm0,ymm4
        vmovd ecx,xmm0
        add rax,rcx
        test rsi,rsi
        jnz .L0
        vzeroupper
        ret
align 16
pop_count_native:
        xor eax,eax
.L0:
        popcnt rdx,[rdi]
        popcnt rcx,[rdi+8]
        popcnt r8,[rdi+16]
        popcnt r9,[rdi+24]
        add rax,rdx
        add rax,rcx
        add rax,r8
        add rax,r9
        add rdi,32
        sub rsi,4
        jg .L0
        ret

init_lut:
        mov ecx,16
        mov r8d,15
.L0:
        lea eax,[ecx-1]
        popcnt dx,ax
    mov r11,lut
        mov [r11+r8],dl
        dec r8d
        loop .L0
        ret

init_random:
.L0:
        rdrand rax
        mov [rdi],rax
        add rdi,8
        dec rsi
        jnz .L0
        ret

section '.data'  writeable align 16

mask db 16 dup(0xf)

section '.bss' writeable align 16

lut rb 16

primer koriscenja:
Kod:
import random,times
{.link: "popcnt.o".}
proc init_lut() {.importc,cdecl.}
proc init_random(a:ptr uint64,size:uint64) {.importc,cdecl.}
proc pop_count_native(a:ptr uint64,size:uint64):uint64 {.importc,cdecl.}
proc pop_count_pshufb(a:ptr uint64,size:uint64):uint64 {.importc,cdecl.}
proc pop_count_pshufb_avx2(a:ptr uint64,size:uint64):uint64 {.importc,cdecl.}
init_lut()
var a = newSeq[uint64](4096*4096)
init_random(addr a[0],uint64 a.len)
var t = cpuTime()
var s = pop_count_native(addr a[0],uint64 a.len)
echo s,' ',cpuTime()-t
t = cpuTime()
s = pop_count_pshufb(addr a[0],uint64 a.len)
echo s,' ',cpuTime()-t
t = cpuTime()
s = pop_count_pshufb_avx2(addr a[0],uint64 a.len)
echo s,' ',cpuTime()-t

kod mene to ovako izgleda:
Kod:
~/.../nim/popcnt >>> ./popcnt                                                                                                                                                                            
536857538 0.008074000000000137
536857538 0.01167200000000013
536857538 0.008515999999999968

znaci avx2 i native popcnt imaju iste performanse, dok je SSSE3 varijanta nesto sporija, razumljivo, ali ako bi ste naivno brojali bitove....
 

Back
Top