reciprocitet float64 broja uz pomoc magicne vrednosti i Newton-Raphson metoda (asm x86-64)

bmaxa

Legenda
Poruka
70.815
Prilicno efektno izracunavanje reciprociteta na procesorima koji podrzavaju fma3 instrukcije.

Kod:
; latency test
; Timing kernels for three ways of computing the reciprocal of four packed
; float64 values; each kernel repeats its computation N times.
; NOTE(review): every kernel reloads its input from memory on each loop
; iteration, so successive iterations do not depend on each other; the
; measured time may therefore reflect throughput rather than latency -- confirm.
format elf64
; Symbols exported to the linker.
public recip
public recip1
public recip2
public recip3
public _rdtsc
section '.text' executable
; Number of loop iterations executed by each recipN routine.
N = 1000000
recip:
recip1:
; recip / recip1: in-place reciprocal of the four float64 values at [rdi].
; The initial guess comes from an integer subtraction from a magic constant
; and is then refined with multiply/FMA steps. Uses 256-bit integer (AVX2)
; and FMA3 instructions.
;
; In:  rdi = pointer to 4 x float64 (read and overwritten; unaligned access is used)
; Out: [rdi] = approximate 1/x for each of the four lanes
; Clobbers: eax, ymm0, ymm1, ymm2, ymm4, flags
;
; The computation is repeated N times for timing purposes; every iteration
; reloads the same input and only the last result is stored.
; Load constants and input
   vbroadcastsd ymm1, [one]          ; ymm1 = 1.0 in every lane
   vpbroadcastq ymm4, [magic]        ; ymm4 = magic bit pattern in every lane
   mov eax, N                        ; loop counter
   .loop:
   vmovdqu ymm0, [rdi]               ; ymm0 = x
      vpsubq ymm2, ymm4, ymm0        ; y = magic - bits(x)  (integer subtract: initial guess)
      vfnmadd213pd ymm0, ymm2, ymm1  ; e = 1 - x*y
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e + y      = y*(1 + e)
      vmulpd ymm0, ymm0, ymm0        ; e = e^2
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e^2 + y    = y*(1 + e^2)
      vmulpd ymm0, ymm0, ymm0        ; e = e^4
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e^4 + y    = y*(1 + e^4)
      vmulpd ymm0, ymm0, ymm0        ; e = e^8
      vfmadd132pd ymm0, ymm2, ymm2   ; result = e^8*y + y, left in ymm0
      dec eax
      jnz .loop
   vmovups [rdi], ymm0               ; store the four reciprocals over the input
   ; Clear the upper halves of the YMM registers before returning, so that
   ; non-VEX SSE code in the caller does not pay an AVX/SSE transition penalty.
   vzeroupper
   ret

recip2:
; recip2: in-place reciprocal of the four float64 values at [rdi].
; The initial guess is produced by the single-precision VRCPPS approximation
; (input narrowed to float32, result widened back to float64) and is then
; refined with the same multiply/FMA sequence as recip1.
;
; In:  rdi = pointer to 4 x float64 (read and overwritten; unaligned access is used)
; Out: [rdi] = approximate 1/x for each of the four lanes
; Clobbers: eax, ymm0, ymm1, ymm2, flags
;
; The computation is repeated N times for timing purposes; every iteration
; reloads the same input and only the last result is stored.
; Load constants and input
   vbroadcastsd ymm1, [one]          ; ymm1 = 1.0 in every lane
   mov eax, N                        ; loop counter
   .loop:
   vmovdqu ymm0, [rdi]               ; ymm0 = x
      vcvtpd2ps xmm2,ymm0            ; narrow 4 x float64 -> 4 x float32
      vrcpps xmm2,xmm2               ; low-precision hardware reciprocal estimate
      vcvtps2pd ymm2,xmm2            ; widen estimate back to float64: y
      vfnmadd213pd ymm0, ymm2, ymm1  ; e = 1 - x*y
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e + y      = y*(1 + e)
      vmulpd ymm0, ymm0, ymm0        ; e = e^2
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e^2 + y    = y*(1 + e^2)
      vmulpd ymm0, ymm0, ymm0        ; e = e^4
      vfmadd132pd ymm2, ymm2, ymm0   ; y = y*e^4 + y    = y*(1 + e^4)
      vmulpd ymm0, ymm0, ymm0        ; e = e^8
      vfmadd132pd ymm0, ymm2, ymm2   ; result = e^8*y + y, left in ymm0
      dec eax
      jnz .loop
   vmovups [rdi], ymm0               ; store the four reciprocals over the input
   ; Clear the upper halves of the YMM registers before returning, so that
   ; non-VEX SSE code in the caller does not pay an AVX/SSE transition penalty.
   vzeroupper
   ret

recip3:
; recip3: in-place reciprocal of the four float64 values at [rdi] using a
; plain packed division (1.0 / x); serves as the baseline for recip1/recip2.
;
; In:  rdi = pointer to 4 x float64 (read and overwritten; unaligned access is used)
; Out: [rdi] = 1/x for each of the four lanes
; Clobbers: eax, ymm0, ymm1, flags
;
; The division is repeated N times for timing purposes; every iteration
; reloads the same input and only the last result is stored.
; Load constants and input
   vbroadcastsd ymm1, [one]          ; ymm1 = 1.0 in every lane
   mov eax, N                        ; loop counter
   .loop:
   vmovdqu ymm0, [rdi]               ; ymm0 = x
   vdivpd ymm0,ymm1,ymm0             ; ymm0 = 1.0 / x
      dec eax
      jnz .loop
   vmovups [rdi], ymm0               ; store the four reciprocals over the input
   ; Clear the upper halves of the YMM registers before returning, so that
   ; non-VEX SSE code in the caller does not pay an AVX/SSE transition penalty.
   vzeroupper
   ret

; _rdtsc: returns the 64-bit time-stamp counter in rax.
; Clobbers rdx and rcx (RDTSCP writes IA32_TSC_AUX into ecx) and flags.
_rdtsc:
   rdtscp            ; edx:eax = time-stamp counter, ecx = IA32_TSC_AUX
   shl rdx, 32       ; move the high 32 bits into position
   or rax, rdx       ; rax = (high << 32) | low
   ret

; Constants shared by the recip routines; each is broadcast to all four lanes.
section '.data' writeable align 16
   align 16
   ; IEEE-754 binary64 bit pattern of 1.0
   one dq 3FF0000000000000h
   ; 64-bit constant from which recip1 subtracts the raw bits of x to form
   ; its initial reciprocal guess
   magic dq 7FDE6238502484BAh

Program koji koristi rutinu:

Kod:
import strfmt,random
randomize()
# Link against the object file assembled by fasm from the assembly listing.
{.link:"latencya.o".}
# Reciprocal kernels implemented in assembly. Each one reads four float64
# values starting at x and overwrites them with their reciprocals.
proc recip1(x:ptr float64){.importc,cdecl.}
proc recip2(x:ptr float64){.importc,cdecl.}
proc recip3(x:ptr float64){.importc,cdecl.}

proc rdtsc():uint64 =
  ## Returns the 64-bit time-stamp counter read with the RDTSCP instruction.
  # we have to use emit here, nim does not have volatile qualifier for asm statement
  # RDTSCP writes EDX:EAX (the counter) and also ECX (IA32_TSC_AUX), so both
  # rdx and rcx must be declared as clobbered; otherwise the C compiler may
  # keep a live value in rcx across the asm statement.
  {.emit:
    """asm volatile(
      ".intel_syntax noprefix\n"
      "rdtscp\n"
      "shl rdx,32\n"
      "or rax,rdx\n"
      ".att_syntax\n"
      :"=a"(`result`)
      :
      :"rdx", "rcx");
    """.}

# x holds the four random test inputs; y is the working buffer that the
# reciprocal kernels overwrite in place.
var x,y : array[4,float64]
# `random` is deprecated in favour of `rand`; draw each input from 0.0..1000.0.
for i in x.mitems :
  i = rand(1000.0)
proc f[F](ff:F,title:string) =
  ## Benchmarks one reciprocal kernel.
  ##
  ## Copies the inputs `x` into `y`, prints `title`, then runs `ff` on `y`
  ## eleven times, taking a time-stamp before and after each call (plus one
  ## extra back-to-back reading to show the cost of the timer itself).
  ## Afterwards it prints, for each input, the value, the kernel's result and
  ## `1/x` computed by Nim, followed by one line per run with the timer
  ## overhead in ticks and the ticks per kernel loop iteration.
  const
    runs = 11                 # number of timed calls
    itersPerCall = 1000000.0  # loop count inside each kernel (N in the assembly)
  var startTicks, midTicks, endTicks: array[runs, uint64]

  y = x
  echo title

  for run in 0 ..< runs:
    startTicks[run] = rdtsc()
    midTicks[run] = rdtsc()
    ff(addr y[0])
    endTicks[run] = rdtsc()

  for lane in 0 .. high(x):
    echo "{0:24.18f} {1:24.18f} {2:24.18f}".fmt(x[lane], y[lane], 1/x[lane])

  for run in 0 ..< runs:
    let timerOverhead = float64(midTicks[run] - startTicks[run])
    let ticksPerIter = float64(endTicks[run] - midTicks[run]) / itersPerCall
    echo "{0:f}\t{1:f}".fmt(timerOverhead, ticksPerIter)
# Benchmark each kernel in turn on the same inputs: magic-constant guess,
# VRCPPS-based guess, and plain division.
f(recip1,"recip1")
f(recip2,"recip2")
f(recip3,"recip3")

Output: samo 4 takta za magicnu vrednost, 8 taktova za low-prec aproksimaciju preko x86 instrukcije
i preko 25 taktova za obicno deljenje na mom Haswell-u:

Kod:
~/.../examples/assembler >>> fasm latency.asm latencya.o                                                                                                                                                  
flat assembler  version 1.72  (16384 kilobytes memory)
3 passes, 1024 bytes.
~/.../examples/assembler >>> nim c -d:release latency.nim                                                                                                                                                 
Hint: used config file '/home/bmaxa/projects/Nim/config/nim.cfg' [Conf]
Hint: system [Processing]                                                                                                                                                                                  
Hint: latency [Processing]                                                                                                                                                                                 
Hint: strfmt [Processing]                                                                                                                                                                                  
Hint: macros [Processing]                                                                                                                                                                                  
Hint: strutils [Processing]                                                                                                                                                                                
Hint: parseutils [Processing]                                                                                                                                                                              
Hint: math [Processing]                                                                                                                                                                                    
Hint: algorithm [Processing]                                                                                                                                                                               
Hint: unicode [Processing]                                                                                                                                                                                 
Hint: streams [Processing]                                                                                                                                                                                 
Hint: random [Processing]                                                                                                                                                                                  
Hint: times [Processing]                                                                                                                                                                                   
Hint: posix [Processing]                                                                                                                                                                                   
latency.nim(24, 7) Warning: random is deprecated [Deprecated]                                                                                                                                              
Hint:  [Link]                                                                                                                                                                                              
Hint: operation successful (25509 lines compiled; 0.381 sec total; 54.746MiB peakmem; Release Build) [SuccessX]                                                                                            
~/.../examples/assembler >>> ./latency                                                                                                                                                                    
recip1
   39.990172843022754944     0.025006143482434936     0.025006143482434932
  721.311186985165591040     0.001386364190717266     0.001386364190717266
  244.350428125735362496     0.004092483109894247     0.004092483109894246
  842.939881444014304128     0.001186324223130754     0.001186324223130753
32.000000       4.646356
28.000000       4.463024
32.000000       4.508955
24.000000       4.122156
51.000000       4.114704
24.000000       4.170711
24.000000       4.202169
24.000000       4.176183
51.000000       4.642827
28.000000       4.353924
24.000000       4.510225
recip2
   39.990172843022754944     0.025006143482434932     0.025006143482434932
  721.311186985165591040     0.001386364190717266     0.001386364190717266
  244.350428125735362496     0.004092483109894246     0.004092483109894246
  842.939881444014304128     0.001186324223130753     0.001186324223130753
32.000000       9.208760
24.000000       8.964789
51.000000       8.889720
24.000000       8.588568
21.000000       9.174782
32.000000       9.300807
24.000000       9.128892
32.000000       9.291208
32.000000       8.720814
21.000000       8.674830
21.000000       8.936355
recip3
   39.990172843022754944     0.025006143482434932     0.025006143482434932
  721.311186985165591040     0.001386364190717266     0.001386364190717266
  244.350428125735362496     0.004092483109894246     0.004092483109894246
  842.939881444014304128     0.001186324223130753     0.001186324223130753
40.000000       27.474120
28.000000       26.526451
48.000000       28.575053
44.000000       27.247942
21.000000       27.438378
32.000000       27.561200
24.000000       28.185877
32.000000       27.932992
24.000000       26.356293
24.000000       27.323739
24.000000       28.063525
 

Back
Top