NASM - Using Labels as Array Offsets

I am trying to write a small assembly language program that takes three char arrays as input, calculates the average of each pair of corresponding elements in the first two arrays, and stores the result in the third array, as shown below.

; prologue: function-entry macro (no macro parameters).
; Sets up an rbp-based stack frame, then pushes rbx and r12-r15, the
; callee-saved general-purpose registers of the System V AMD64 ABI.
; The `epilogue` macro pops them in the reverse order.
%macro prologue 0
    push    rbp             ; save the caller's frame pointer
    mov     rbp,rsp         ; rbp = base of this function's frame
    push    rbx             ; callee-saved registers, pushed in a fixed order
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro
; epilogue: function-exit macro (no macro parameters); counterpart of `prologue`.
; Pops r15..rbx in the reverse of the order `prologue` pushed them, tears
; down the rbp frame and returns to the caller.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave                   ; equivalent to: mov rsp, rbp / pop rbp
    ret
%endmacro

; Static storage used by avgArray.
segment .data
    offset  db  1           ; ONE byte holding 1; avgArray adds it to each pointer per iteration
segment .bss
    a1      resq    1       ; 8-byte slot: avgArray stores rdi here
    a2      resq    1       ; 8-byte slot: avgArray stores rsi here
    avg     resq    1       ; 8-byte slot: avgArray stores rdx here
    avgL    resd    1       ; 4-byte slot: avgArray stores rcx (an 8-byte register) here
segment .text
    global  avgArray 
; avgArray: walks two byte arrays in step and writes (x + y) >> 1 for each
; pair of bytes into a third array.
; Register arguments, as consumed below:
;   rdi -> first input array   (saved in [a1])
;   rsi -> second input array  (saved in [a2])
;   rdx -> output array        (saved in [avg])
;   rcx =  length              (saved in [avgL])
; Presumably called from C as avgArray(a1, a2, avg, len) -- the caller is not shown.
; The loop tests its bound at the bottom, so the body runs at least once.
avgArray:
    prologue

    ; Spill the four arguments to static storage.
    mov [a1], rdi
    mov [a2], rsi
    mov [avg], rdx
    mov [avgL], rcx         ; 8-byte store; avgL reserves only 4 bytes (resd 1)

    ; Reload the pointers into the registers the loop uses.
    mov rsi, [a1]           ; rsi = read cursor for the first input
    mov r9, [a2]            ; r9  = read cursor for the second input
    mov rdi, [avg]          ; rdi = write cursor for the output

    mov rcx, rsi
    add rcx, [avgL]    ; array length
                            ; rcx = first input + length = end-of-input sentinel;
                            ; this is an 8-byte load from the 4-byte avgL slot

    xor rdx, rdx            ; clear rdx so dx == dl after the byte load below
    xor rax, rax            ; clear rax so ax == al after the byte load below
    xor rbx, rbx            ; rbx is not referenced again in this routine
avgArray_loop:
    mov al, [rsi]           ; al = byte from the first input
    mov dl, [r9]            ; dl = byte from the second input
    add ax, dx              ; 16-bit sum, so the carry out of 8 bits is kept
    shr ax, 1               ; halve (truncating); result fits in al again
    mov [rdi], al           ; store the average

    ; Advance all three cursors. The destinations are 64-bit registers, so each
    ; add loads 8 bytes starting at `offset`, which is declared as a single db.
    add rsi, [offset]
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx            ; unsigned compare of the read cursor against the sentinel
    jb  avgArray_loop
    epilogue

      

When I replace [offset] with an immediate 1, it works great. However, when I use [offset] to step to the next element of each array, it appears that its value is not added to rsi, rdi and r9. I checked all of this with gdb: after executing add rsi, [offset], the address stored in rsi remains the same.

Can someone tell me why using [offset] doesn't work, but adding a plain 1 does?

BTW: Linux machine x86_64

+3


source to share


2 answers


Nice job of debugging your problem yourself. Since I already started looking at the code, I'll give you the efficiency / style critique as added comments:

; prologue (review copy): frame setup plus saves of rbx and r12-r15.
; Review note: only registers a function actually modifies need saving; of the
; callee-saved set, avgArray as shown below writes only rbx.
%macro prologue 0
    push    rbp
    mov     rbp,rsp   ; You can drop the frame-pointer setup (push rbp / mov rbp,rsp) and the matching LEAVE.
;  Stack frames were useful before debuggers could keep track of things without them, and as a convenience
;  so that local variables were always at the same offset from the base pointer, even while you were pushing/popping on the stack.
; With the SysV ABI, you can use the red zone for locals without even
; adjusting RSP at all, as long as you don't push/pop or call anything.
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro
; epilogue (review copy): undoes `prologue` -- pops r15..rbx in reverse push
; order, then restores rsp/rbp and returns.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave             ; mov rsp, rbp / pop rbp; only needed because prologue set up an rbp frame
    ret
%endmacro

segment .data
    offset  db  1   ; a single byte, but the loop reads it with 64-bit adds
segment .bss    ; These should really be locals on the stack (or in regs!), not globals
    a1      resq    1
    a2      resq    1
    avg     resq    1
    avgL    resd    1   ; only 4 bytes reserved, yet it is written and read through the 64-bit rcx

segment .text
; A comment with the C function prototype and a short description is usually a good idea for functions.
; As written, the arguments arrive as: rdi = a1, rsi = a2, rdx = avg (output), rcx = length.
    global  avgArray
avgArray:
    prologue

    mov [a1], rdi     ; What is this silliness?  You have 16 registers for a reason.
    mov [a2], rsi     ; Shuffling the values you want into the regs you want them in
    mov [avg], rdx    ; is best done with reg-reg moves.
    mov [avgL], rcx   ; I like to just put a comment at the top of a block of code
                      ; to document what goes in what reg.
                      ; (Also: this is an 8-byte store into avgL, which was reserved with resd 1 = 4 bytes.)

    mov rsi, [a1]
    mov r9, [a2]
    mov rdi, [avg]

    mov rcx, rsi
    add rcx, [avgL]    ; This could be lea rcx, [rsi+rcx]
              ;  (since the length is still in rcx anyway as a function arg).
              ;  Like the store above, this add accesses 8 bytes of the 4-byte avgL.

    xor rdx, rdx
    xor rax, rax
    xor rbx, rbx
avgArray_loop:   ; You can use a local label here, starting with a dot (e.g. .loop).
 ; You don't need a different name for each loop: NASM scopes a .label to the preceding non-local label.
    mov al, [rsi]        ; There is a data dependency on the old value of ax,
    mov dl, [r9]         ; since the CPU doesn't "know" that shr ax, 1 will always leave ah zeroed in this algorithm.

    add ax, dx           ; Avoid ALU ops on 16-bit regs whenever possible (8-bit is fine: those use different opcodes instead of a prefix),
                         ; to avoid decode stalls on Intel.
    shr ax, 1            ; Better to use 32-bit regs (load with movsx/movzx).
    mov [rdi], al

    add rsi, [offset]    ; These are 64-bit adds, so you're also reading the 7 bytes after the 1 you set with db.
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx
    jb  avgArray_loop
    epilogue

      

You have a lot of free registers, so why are you keeping the loop increment in memory? I hope that was just left over from debugging / experimenting.

Also note that one-register addressing modes are more efficient when used as memory operands for ALU operations. When you have a lot of pointers, just increment a single counter and use base + index*scale addressing (unless you are unrolling the loop), especially if you load through them with mov.

This is how I would go about it (with primary analysis for Intel SnB and later):

scalar

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i]) >> 1   for 0 <= i < len   (truncating average)
; if you can choose your prototype, do it so args go where you want them anyway.
; ABI:   System V AMD64
; In:    rdi = avg (destination)
;        rsi = a1
;        rdx = a2
;        rcx = len (0 is allowed: nothing is read or written)
; Out:   nothing
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (caller-saved registers only,
;        so no prologue/epilogue and nothing to push/pop)
;-----------------------------------------------------------------------
avgArray:
    ; mov    [rsp-8], rcx    ; if I wanted to spill  len  to memory (red zone)

    ; Point all three registers one past the END of their arrays, then use a
    ; negative index that counts up towards zero.
    add    rdi, rcx
    add    rsi, rcx
    add    rdx, rcx
    neg    rcx       ; now [rdi+rcx] is the start of dest, and we can count rcx upwards towards zero.
    jz     .done     ; len == 0: the bottom-tested loop below would otherwise run once
    ; We could also have just counted down towards zero
    ; but HW memory prefetchers have more stream slots for forward patterns than reverse.

ALIGN 16
.loop:
    ;  use movsx for signed char
    movzx  eax, byte [rsi+rcx]     ; dependency-breaker: writes the full register
    movzx  r8d, byte [rdx+rcx]     ; Using r8d to save push/pop of rbx
           ; on pre-Nehalem where insn decode can be a bottleneck even in tight loops
           ; using ebx or ebp would save a REX prefix (1 insn byte).
    add    eax, r8d                ; 32-bit sum cannot overflow (max 510)
    shr    eax, 1
    mov    [rdi+rcx], al

    inc    rcx     ; No cmp needed: this is the point of counting up towards zero
    jl     .loop   ; inc/jl can Macro-fuse into one uop

.done:
    ; nothing to pop, we only used caller-saved regs.
    ret

      

On Intel, the loop is 7 uops (the store is 2 uops, store-address and store-data, and cannot micro-fuse), so a CPU that can issue 4 uops per cycle will run it at 2 cycles per byte. movzx (to a 32- or 64-bit register) is 1 uop regardless, because there is no port 0/1/5 uop for it to micro-fuse with. (It is a pure load, not a read-modify.)

7 uops take two issue groups of up to 4 uops each, so the loop can issue in 2 cycles. There are no other bottlenecks that should prevent the execution units from keeping up with that, so it should run at one iteration per 2 cycles.



vector

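There is a vector instruction that does almost exactly this operation: PAVGB is a packed average of unsigned bytes (with a 9-bit temporary to avoid overflow, like your add / shr). Note that PAVGB rounds up, computing (a + b + 1) >> 1, whereas add / shr truncates, so the two differ by one whenever a + b is odd.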

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i] + 1) >> 1   for 0 <= i < len
;   This is PAVGB's rounding (round half up); the scalar tail uses the same
;   formula so the result does not depend on where an element sits.
; ABI:   System V AMD64; requires SSE2
; In:    rdi = avg (destination)
;        rsi = a1
;        rdx = a2
;        rcx = len (any value, including 0; no alignment requirement)
; Out:   nothing
; Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags
;-----------------------------------------------------------------------
avgArray:
    ; Bias the index by 15 before folding it into the pointers: afterwards
    ; [reg+rcx] addresses the next unprocessed byte, and rcx < 0 holds exactly
    ; while at least 16 bytes remain. That lets the vector loop end on a plain
    ; add/jl with no separate cmp.
    sub     rcx, 15
    add     rdi, rcx
    add     rsi, rcx
    add     rdx, rcx
    neg     rcx                  ; rcx = 15 - len
    jns     .tail_check          ; len <= 15: no full vector to process

ALIGN 16
.loop:
    ; Unaligned loads for both sources: legacy-SSE pavgb with a memory operand
    ; would fault unless that operand were 16-byte aligned.
    movdqu  xmm0, [rsi+rcx]
    movdqu  xmm1, [rdx+rcx]
    pavgb   xmm0, xmm1           ; 16 rounded byte averages at once
    movdqu  [rdi+rcx], xmm0

    add     rcx, 16
    jl      .loop                ; taken while another full 16 bytes remain

.tail_check:
    ; Here 0 <= rcx <= 15 and (15 - rcx) bytes are left over.
    cmp     rcx, 15
    jae     .done
.tail:
    movzx   eax, byte [rsi+rcx]
    movzx   r8d, byte [rdx+rcx]
    lea     eax, [rax+r8+1]      ; a + b + 1 (at most 511, no overflow)
    shr     eax, 1               ; same rounding as pavgb
    mov     [rdi+rcx], al
    inc     rcx
    cmp     rcx, 15
    jb      .tail
.done:
    ret

      

Getting the loop-exit condition right is tricky, because you need to end the vector loop if the next 16 bytes would run past the end of the array. Probably the best way to deal with this is to decrease rcx by 15 or so before adding it to the pointers.

So that is again 6 uops / 2 cycles per iteration, but each iteration handles 16 bytes. It is ideal to unroll so that your loop is a multiple of 4 uops, so you don't lose issue throughput to a group of fewer than 4 uops at the end of the loop. 2 loads / 1 store per cycle is our bottleneck here, since PAVGB has a throughput of 2 per cycle.

16B / cycle shouldn't be difficult on Haswell and later. With AVX2 and ymm registers you get 32B / cycle. (SnB / IvB can only execute two memory operations per cycle, at most one of them a store, unless you are using 256b loads / stores.) Anyway, at this point you have a massive 16x speedup from vectorization, and that is usually good enough. I just enjoy tuning for the theoretical maximum throughput by counting uops and unrolling. :)

If you are going to unroll the loop at all, then it is probably worth incrementing pointers instead of just an index. (So there would be two uses of [rdx] and one add, versus two uses of [rdx + rcx].)

Either way, cleaning up the loop setup and keeping everything in registers saves a decent amount of instruction bytes, and of overhead for short arrays.

+1


source


So I found a solution to this problem.

The addresses of avgL and offset were stored directly after one another. When the value of rcx was stored to avgL, the store also overwrote the value of offset. Declaring avgL as a QWORD instead of a DWORD prevents that mov from overwriting the data of offset.



The new data and bss segments look like this:

segment .data
    offset  db  1           ; unchanged: still one byte, although avgArray reads it with 64-bit adds
segment .bss
    a1      resq    1
    a2      resq    1
    avg     resq    1
    avgL    resq    1       ; was resd 1; now 8 bytes, so `mov [avgL], rcx` stays inside the slot

      

+3


source







All Articles