Exercise: Assembly Optimizer
Difficulty - Advanced
Learning Objectives
- Understand Go assembly basics
- Learn CPU architecture and registers
- Optimize hot paths with assembly
- Practice performance benchmarking
- Master calling conventions
Problem Statement
Optimize performance-critical functions using Go assembly language.
Go Implementation
1package optimizer
2
3// Sum adds all numbers in a slice
4func SumGo(numbers []int64) int64 {
5 var sum int64
6 for _, n := range numbers {
7 sum += n
8 }
9 return sum
10}
11
12// DotProduct computes dot product of two vectors
13func DotProductGo(a, b []float64) float64 {
14 if len(a) != len(b) {
15 panic("vectors must have same length")
16 }
17
18 var sum float64
19 for i := range a {
20 sum += a[i] * b[i]
21 }
22 return sum
23}
24
25// Memset fills a byte slice with a value
26func MemsetGo(s []byte, val byte) {
27 for i := range s {
28 s[i] = val
29 }
30}
31
32// FindByte finds first occurrence of byte in slice
33func FindByteGo(s []byte, b byte) int {
34 for i, v := range s {
35 if v == b {
36 return i
37 }
38 }
39 return -1
40}
41
42// ReverseBytes reverses a byte slice in place
43func ReverseBytesGo(s []byte) {
44 for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
45 s[i], s[j] = s[j], s[i]
46 }
47}
Assembly Implementation
1// sum_amd64.s
2// func SumAsm(numbers []int64) int64
3TEXT ·SumAsm(SB), NOSPLIT, $0-32
4 MOVQ numbers+0(FP), SI // SI = pointer to slice data
5 MOVQ numbers+8(FP), CX // CX = slice length
6 XORQ AX, AX // AX = 0
7
8 // Handle empty slice
9 TESTQ CX, CX
10 JZ done
11
12loop:
13 ADDQ, AX // AX += *SI
14 ADDQ $8, SI // SI += 8
15 DECQ CX // CX--
16 JNZ loop // if CX != 0, continue loop
17
18done:
19 MOVQ AX, ret+24(FP) // return AX
20 RET
21
22
23// func DotProductAsm(a, b []float64) float64
24TEXT ·DotProductAsm(SB), NOSPLIT, $0-56
25 MOVQ a+0(FP), SI // SI = pointer to a
26 MOVQ a+8(FP), CX // CX = len(a)
27 MOVQ b+24(FP), DI // DI = pointer to b
28
29 // Initialize sum to 0.0
30 XORPS X0, X0 // X0 = 0.0
31
32 // Handle empty slice
33 TESTQ CX, CX
34 JZ done
35
36loop:
37 MOVSD, X1 // X1 = a[i]
38 MULSD, X1 // X1 *= b[i]
39 ADDSD X1, X0 // X0 += X1
40
41 ADDQ $8, SI // SI += 8
42 ADDQ $8, DI // DI += 8
43 DECQ CX // CX--
44 JNZ loop
45
46done:
47 MOVSD X0, ret+48(FP) // return X0
48 RET
49
50
51// func MemsetAsm(s []byte, val byte)
52TEXT ·MemsetAsm(SB), NOSPLIT, $0-25
53 MOVQ s+0(FP), DI // DI = pointer to slice
54 MOVQ s+8(FP), CX // CX = slice length
55 MOVB val+24(FP), AX // AL = value to set
56
57 // Handle empty slice
58 TESTQ CX, CX
59 JZ done
60
61 // Use REP STOSB for efficient memset
62 REP; STOSB // Repeat store AL at [DI], CX times
63
64done:
65 RET
66
67
68// func FindByteAsm(s []byte, b byte) int
69TEXT ·FindByteAsm(SB), NOSPLIT, $0-33
70 MOVQ s+0(FP), DI // DI = pointer to slice
71 MOVQ s+8(FP), CX // CX = slice length
72 MOVB b+24(FP), AL // AL = byte to find
73
74 // Handle empty slice
75 TESTQ CX, CX
76 JZ notfound
77
78 // Use REP SCASB for efficient search
79 REPNE; SCASB // Scan for AL in [DI], CX times
80
81 // Check if found
82 JNE notfound
83
84 // Calculate index: original_len - remaining_len - 1
85 MOVQ s+8(FP), AX // AX = original length
86 SUBQ CX, AX // AX -= remaining length
87 DECQ AX // AX--
88 MOVQ AX, ret+32(FP)
89 RET
90
91notfound:
92 MOVQ $-1, ret+32(FP) // return -1
93 RET
94
95
96// func ReverseBytesAsm(s []byte)
97TEXT ·ReverseBytesAsm(SB), NOSPLIT, $0-24
98 MOVQ s+0(FP), SI // SI = pointer to slice start
99 MOVQ s+8(FP), CX // CX = slice length
100
101 // Handle empty or single element
102 CMPQ CX, $2
103 JL done
104
105 // DI = pointer to slice end - 1
106 MOVQ SI, DI
107 ADDQ CX, DI
108 DECQ DI
109
110 // CX = number of swaps
111 SHRQ $1, CX
112
113loop:
114 // Swap bytes
115 MOVB, AX // AL = *SI
116 MOVB, BX // BL = *DI
117 MOVB BX, // *SI = BL
118 MOVB AX, // *DI = AL
119
120 INCQ SI // SI++
121 DECQ DI // DI--
122 DECQ CX // CX--
123 JNZ loop
124
125done:
126 RET
Go Wrapper
1package optimizer
2
3// Assembly function declarations
4func SumAsm(numbers []int64) int64
5func DotProductAsm(a, b []float64) float64
6func MemsetAsm(s []byte, val byte)
7func FindByteAsm(s []byte, b byte) int
8func ReverseBytesAsm(s []byte)
Benchmarks
1package optimizer
2
3import (
4 "testing"
5)
6
7func BenchmarkSumGo(b *testing.B) {
8 numbers := make([]int64, 1000)
9 for i := range numbers {
10 numbers[i] = int64(i)
11 }
12
13 b.ResetTimer()
14 for i := 0; i < b.N; i++ {
15 SumGo(numbers)
16 }
17}
18
19func BenchmarkSumAsm(b *testing.B) {
20 numbers := make([]int64, 1000)
21 for i := range numbers {
22 numbers[i] = int64(i)
23 }
24
25 b.ResetTimer()
26 for i := 0; i < b.N; i++ {
27 SumAsm(numbers)
28 }
29}
30
31func BenchmarkDotProductGo(b *testing.B) {
32 a := make([]float64, 1000)
33 b_vec := make([]float64, 1000)
34 for i := range a {
35 a[i] = float64(i)
36 b_vec[i] = float64(i)
37 }
38
39 b.ResetTimer()
40 for i := 0; i < b.N; i++ {
41 DotProductGo(a, b_vec)
42 }
43}
44
45func BenchmarkDotProductAsm(b *testing.B) {
46 a := make([]float64, 1000)
47 b_vec := make([]float64, 1000)
48 for i := range a {
49 a[i] = float64(i)
50 b_vec[i] = float64(i)
51 }
52
53 b.ResetTimer()
54 for i := 0; i < b.N; i++ {
55 DotProductAsm(a, b_vec)
56 }
57}
58
59func BenchmarkMemsetGo(b *testing.B) {
60 s := make([]byte, 10000)
61
62 b.ResetTimer()
63 for i := 0; i < b.N; i++ {
64 MemsetGo(s, 0xFF)
65 }
66}
67
68func BenchmarkMemsetAsm(b *testing.B) {
69 s := make([]byte, 10000)
70
71 b.ResetTimer()
72 for i := 0; i < b.N; i++ {
73 MemsetAsm(s, 0xFF)
74 }
75}
Key Assembly Instructions
Data Movement
MOVQ: Move quadwordMOVSD: Move scalar double-precision floatMOVB: Move byte
Arithmetic
ADDQ: Add quadwordSUBQ: Subtract quadwordMULSD: Multiply scalar doubleADDSD: Add scalar double
Logic
XORQ: XOR quadwordXORPS: XOR packed single-precision floats
Control Flow
JZ: Jump if zeroJNZ: Jump if not zeroJNE: Jump if not equal
String Operations
REP STOSB: Repeat store byteREPNE SCASB: Repeat scan byte while not equal
Registers
AX, BX, CX, DX: General purposeSI, DI: Source/Destination indexX0-X15: SSE registers for floating pointFP: Frame pointerSB: Static base
Key Takeaways
- Measure First: Always benchmark before optimizing
- Hot Paths Only: Only optimize performance-critical code
- Maintainability: Assembly is harder to maintain
- Platform Specific: Assembly is architecture-specific
- Compiler Intelligence: Modern Go compiler is very good
Performance Tips
- Use SIMD: Leverage SSE/AVX instructions
- Minimize Branches: Branch prediction matters
- Cache Locality: Access memory sequentially
- Register Usage: Keep hot data in registers
- Loop Unrolling: Reduce loop overhead
Related Topics
- Assembly Integration - Main assembly tutorial
- Performance - Performance optimization
- Compiler Optimizations - Compiler tricks