Assembly Optimizer

Exercise: Assembly Optimizer

Difficulty - Advanced

Learning Objectives

  • Understand Go assembly basics
  • Learn CPU architecture and registers
  • Optimize hot paths with assembly
  • Practice performance benchmarking
  • Master calling conventions

Problem Statement

Optimize performance-critical functions using Go assembly language.

Go Implementation

 1package optimizer
 2
 3// Sum adds all numbers in a slice
 4func SumGo(numbers []int64) int64 {
 5	var sum int64
 6	for _, n := range numbers {
 7		sum += n
 8	}
 9	return sum
10}
11
12// DotProduct computes dot product of two vectors
13func DotProductGo(a, b []float64) float64 {
14	if len(a) != len(b) {
15		panic("vectors must have same length")
16	}
17
18	var sum float64
19	for i := range a {
20		sum += a[i] * b[i]
21	}
22	return sum
23}
24
25// Memset fills a byte slice with a value
26func MemsetGo(s []byte, val byte) {
27	for i := range s {
28		s[i] = val
29	}
30}
31
32// FindByte finds first occurrence of byte in slice
33func FindByteGo(s []byte, b byte) int {
34	for i, v := range s {
35		if v == b {
36			return i
37		}
38	}
39	return -1
40}
41
42// ReverseBytes reverses a byte slice in place
43func ReverseBytesGo(s []byte) {
44	for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
45		s[i], s[j] = s[j], s[i]
46	}
47}

Assembly Implementation

  1// sum_amd64.s
  2// func SumAsm(numbers []int64) int64
  3TEXT ·SumAsm(SB), NOSPLIT, $0-32
  4    MOVQ numbers+0(FP), SI    // SI = pointer to slice data
  5    MOVQ numbers+8(FP), CX    // CX = slice length
  6    XORQ AX, AX               // AX = 0
  7
  8    // Handle empty slice
  9    TESTQ CX, CX
 10    JZ done
 11
 12loop:
 13    ADDQ, AX             // AX += *SI
 14    ADDQ $8, SI               // SI += 8
 15    DECQ CX                   // CX--
 16    JNZ loop                  // if CX != 0, continue loop
 17
 18done:
 19    MOVQ AX, ret+24(FP)       // return AX
 20    RET
 21
 22
 23// func DotProductAsm(a, b []float64) float64
 24TEXT ·DotProductAsm(SB), NOSPLIT, $0-56
 25    MOVQ a+0(FP), SI          // SI = pointer to a
 26    MOVQ a+8(FP), CX          // CX = len(a)
 27    MOVQ b+24(FP), DI         // DI = pointer to b
 28
 29    // Initialize sum to 0.0
 30    XORPS X0, X0              // X0 = 0.0
 31
 32    // Handle empty slice
 33    TESTQ CX, CX
 34    JZ done
 35
 36loop:
 37    MOVSD, X1            // X1 = a[i]
 38    MULSD, X1            // X1 *= b[i]
 39    ADDSD X1, X0              // X0 += X1
 40
 41    ADDQ $8, SI               // SI += 8
 42    ADDQ $8, DI               // DI += 8
 43    DECQ CX                   // CX--
 44    JNZ loop
 45
 46done:
 47    MOVSD X0, ret+48(FP)      // return X0
 48    RET
 49
 50
 51// func MemsetAsm(s []byte, val byte)
 52TEXT ·MemsetAsm(SB), NOSPLIT, $0-25
 53    MOVQ s+0(FP), DI          // DI = pointer to slice
 54    MOVQ s+8(FP), CX          // CX = slice length
 55    MOVB val+24(FP), AX       // AL = value to set
 56
 57    // Handle empty slice
 58    TESTQ CX, CX
 59    JZ done
 60
 61    // Use REP STOSB for efficient memset
 62    REP; STOSB                // Repeat store AL at [DI], CX times
 63
 64done:
 65    RET
 66
 67
 68// func FindByteAsm(s []byte, b byte) int
 69TEXT ·FindByteAsm(SB), NOSPLIT, $0-33
 70    MOVQ s+0(FP), DI          // DI = pointer to slice
 71    MOVQ s+8(FP), CX          // CX = slice length
 72    MOVB b+24(FP), AL         // AL = byte to find
 73
 74    // Handle empty slice
 75    TESTQ CX, CX
 76    JZ notfound
 77
 78    // Use REP SCASB for efficient search
 79    REPNE; SCASB              // Scan for AL in [DI], CX times
 80
 81    // Check if found
 82    JNE notfound
 83
 84    // Calculate index: original_len - remaining_len - 1
 85    MOVQ s+8(FP), AX          // AX = original length
 86    SUBQ CX, AX               // AX -= remaining length
 87    DECQ AX                   // AX--
 88    MOVQ AX, ret+32(FP)
 89    RET
 90
 91notfound:
 92    MOVQ $-1, ret+32(FP)      // return -1
 93    RET
 94
 95
 96// func ReverseBytesAsm(s []byte)
 97TEXT ·ReverseBytesAsm(SB), NOSPLIT, $0-24
 98    MOVQ s+0(FP), SI          // SI = pointer to slice start
 99    MOVQ s+8(FP), CX          // CX = slice length
100
101    // Handle empty or single element
102    CMPQ CX, $2
103    JL done
104
105    // DI = pointer to slice end - 1
106    MOVQ SI, DI
107    ADDQ CX, DI
108    DECQ DI
109
110    // CX = number of swaps
111    SHRQ $1, CX
112
113loop:
114    // Swap bytes
115    MOVB, AX             // AL = *SI
116    MOVB, BX             // BL = *DI
117    MOVB BX,             // *SI = BL
118    MOVB AX,             // *DI = AL
119
120    INCQ SI                   // SI++
121    DECQ DI                   // DI--
122    DECQ CX                   // CX--
123    JNZ loop
124
125done:
126    RET

Go Wrapper

1package optimizer
2
3// Assembly function declarations
4func SumAsm(numbers []int64) int64
5func DotProductAsm(a, b []float64) float64
6func MemsetAsm(s []byte, val byte)
7func FindByteAsm(s []byte, b byte) int
8func ReverseBytesAsm(s []byte)

Benchmarks

 1package optimizer
 2
 3import (
 4	"testing"
 5)
 6
 7func BenchmarkSumGo(b *testing.B) {
 8	numbers := make([]int64, 1000)
 9	for i := range numbers {
10		numbers[i] = int64(i)
11	}
12
13	b.ResetTimer()
14	for i := 0; i < b.N; i++ {
15		SumGo(numbers)
16	}
17}
18
19func BenchmarkSumAsm(b *testing.B) {
20	numbers := make([]int64, 1000)
21	for i := range numbers {
22		numbers[i] = int64(i)
23	}
24
25	b.ResetTimer()
26	for i := 0; i < b.N; i++ {
27		SumAsm(numbers)
28	}
29}
30
31func BenchmarkDotProductGo(b *testing.B) {
32	a := make([]float64, 1000)
33	b_vec := make([]float64, 1000)
34	for i := range a {
35		a[i] = float64(i)
36		b_vec[i] = float64(i)
37	}
38
39	b.ResetTimer()
40	for i := 0; i < b.N; i++ {
41		DotProductGo(a, b_vec)
42	}
43}
44
45func BenchmarkDotProductAsm(b *testing.B) {
46	a := make([]float64, 1000)
47	b_vec := make([]float64, 1000)
48	for i := range a {
49		a[i] = float64(i)
50		b_vec[i] = float64(i)
51	}
52
53	b.ResetTimer()
54	for i := 0; i < b.N; i++ {
55		DotProductAsm(a, b_vec)
56	}
57}
58
59func BenchmarkMemsetGo(b *testing.B) {
60	s := make([]byte, 10000)
61
62	b.ResetTimer()
63	for i := 0; i < b.N; i++ {
64		MemsetGo(s, 0xFF)
65	}
66}
67
68func BenchmarkMemsetAsm(b *testing.B) {
69	s := make([]byte, 10000)
70
71	b.ResetTimer()
72	for i := 0; i < b.N; i++ {
73		MemsetAsm(s, 0xFF)
74	}
75}

Key Assembly Instructions

Data Movement

  • MOVQ: Move quadword
  • MOVSD: Move scalar double-precision float
  • MOVB: Move byte

Arithmetic

  • ADDQ: Add quadword
  • SUBQ: Subtract quadword
  • MULSD: Multiply scalar double
  • ADDSD: Add scalar double

Logic

  • XORQ: XOR quadword
  • XORPS: XOR packed single-precision floats

Control Flow

  • JZ: Jump if zero
  • JNZ: Jump if not zero
  • JNE: Jump if not equal

String Operations

  • REP STOSB: Repeat store byte
  • REPNE SCASB: Repeat scan byte while not equal

Registers

  • AX, BX, CX, DX: General purpose
  • SI, DI: Source/Destination index
  • X0-X15: SSE registers for floating point
  • FP: Frame pointer
  • SB: Static base

Key Takeaways

  1. Measure First: Always benchmark before optimizing
  2. Hot Paths Only: Only optimize performance-critical code
  3. Maintainability: Assembly is harder to maintain
  4. Platform Specific: Assembly is architecture-specific
  5. Compiler Intelligence: Modern Go compiler is very good

Performance Tips

  1. Use SIMD: Leverage SSE/AVX instructions
  2. Minimize Branches: Branch prediction matters
  3. Cache Locality: Access memory sequentially
  4. Register Usage: Keep hot data in registers
  5. Loop Unrolling: Reduce loop overhead