align 16
; rdi array,rsi size
radix_sort_avx2:
push rbx
push rbp
mov rbp,rsp
sub rsp,16+16*8+16*8
mov [rbp-8],rdi
mov [rbp-16],rsi
xor rbx,rbx
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8)],ymm0
vmovdqu [rbp-(16+12*8)],ymm0
vmovdqu [rbp-(16+8*8)],ymm0
vmovdqu [rbp-(16+4*8)],ymm0
.L00: ; allocate 16 buckets
lea rdi,[rbp-(16+16*8)+rbx*8] ; destination
mov rdx,[rbp-16]
mov rsi,16
mov rax,16*16384
cmp rdx,rax ; if array is bigger then L2 cache
cmovg rsi,rax ; alignment, L2 cache size
lea rdx,[rdx*4] ; size
call [_posix_memalign] ; huge malloc is actually
; never allocated
; untill page is touched
test rax,rax
jnz .fexit
inc rbx
cmp rbx,16
jnz .L00
vpxor ymm0,ymm0,ymm0
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
xor rcx,rcx
.L0:
xor rdx,rdx
vmovd xmm2,ecx
; vpbroadcastd ymm2,xmm1
mov rdi,[rbp-8]
vpslld xmm2,xmm2,2
.L1:; calculate index with SIMD, based on [(num[rdx]>> (ecx << 2))&0xf]
vmovdqu ymm1,[rdi+rdx*4]
;vpsrlvd ymm3,ymm1,ymm2 ; heh, finally got why vector shift
; is usefull
vpsrld ymm3,ymm1,xmm2
vpand ymm3,ymm3,yword[mask]
mov r10,8
vmovdqa ymm15,ymm3
vmovdqa ymm14,ymm1
.L22: ; this is scatter to buckets, according to dword index
; this can be unrolled, but
; I don;t know by how much performance would be increased;
vmovd esi, xmm3
vmovd r11d,xmm1
mov r8,[rbp-(16+16*8)+rsi*8] ; get bucket
vpsrldq xmm3,xmm3,4
mov r9,[rbp-(16+16*8+16*8)+rsi*8] ; get index into backet
inc qword[rbp-(16+16*8+16*8)+rsi*8] ; update index
vpsrldq xmm1,xmm1,4
mov [r8+r9*4],r11d
dec r10
cmp r10,4 ; unfortunatelly vpsrldq will not shift
; upper 128 bits into lower
je .adjust
.L44:
inc rdx
cmp rdx,[rbp-16]
jge .L33
test r10,r10
jnz .L22
jmp .L1
.adjust:
vextracti128 xmm3,ymm15,1
vextracti128 xmm1,ymm14,1
jmp .L44
.L33: ; gather buckets (one after another) into input array
xor rax,rax
lea r8,[rbp-(16+16*8+16*8)]
lea r9,[rbp-(16+16*8)]
.L2:
mov rsi,[r9]
mov r10,[r8]
test r10,r10
jz .L5
.L3:
.L4:
cmp r10,4
jl .one
vmovdqa xmm15,[rsi]
vmovdqu [rdi],xmm15
add rsi,16
add rdi,16
sub r10,4
jnz .L4
jmp .L5
.one:
mov r11d,[rsi]
mov [rdi],r11d
add rsi,4
add rdi,4
dec r10
jnz .one
.L5:
add r8,8
add r9,8
inc rax
cmp rax,16
jnz .L2
vmovdqu [rbp-(16+16*8+16*8)],ymm0
vmovdqu [rbp-(16+16*8+12*8)],ymm0
vmovdqu [rbp-(16+16*8+8*8)],ymm0
vmovdqu [rbp-(16+16*8+4*8)],ymm0
inc rcx
cmp rcx,8 ; 2*(sizeof(int))
jl .L0
.exit:
xor rbx,rbx
.L11:
mov rdi,[rbp-(16+16*8)+rbx*8]
call [_free]
.next:
inc rbx
cmp rbx,16
jnz .L11
.fexit:
mov rsp,rbp
pop rbp
pop rbx
ret