Spiga

Go学习笔记(十一):编写高性能的Go程序

2020-05-13 13:11:55

别让性能被“锁”住

我们来看一段代码

// cache is the package-level map that the reader goroutines look up.
// It is filled once by init.
var cache map[string]string

const (
	// NUM_OF_READER is the number of reader goroutines started per run.
	NUM_OF_READER int = 40
	// READ_TIMES is the number of lookups each reader goroutine performs.
	READ_TIMES = 100000
)

// init populates cache with its two fixed entries.
func init() {
	cache = map[string]string{
		"a": "aa",
		"b": "bb",
	}
}

// lockFreeAccess starts NUM_OF_READER goroutines that each look up key "a"
// in cache READ_TIMES times without any synchronization, and returns once
// all of them have finished. "Nothing" is printed for every missed lookup.
func lockFreeAccess() {
	var readers sync.WaitGroup
	readers.Add(NUM_OF_READER)
	for r := 0; r < NUM_OF_READER; r++ {
		go func() {
			defer readers.Done()
			for n := 0; n < READ_TIMES; n++ {
				// The second result of a map lookup is a presence flag, not an error.
				if _, ok := cache["a"]; !ok {
					fmt.Println("Nothing")
				}
			}
		}()
	}
	readers.Wait()
}

// lockAccess does the same work as the unsynchronized variant, but every
// lookup of key "a" in cache is wrapped in RLock/RUnlock on an RWMutex
// shared by all NUM_OF_READER goroutines. It returns once every goroutine
// has completed its READ_TIMES lookups.
func lockAccess() {
	var (
		readers sync.WaitGroup
		mu      sync.RWMutex
	)
	readers.Add(NUM_OF_READER)
	for r := 0; r < NUM_OF_READER; r++ {
		go func() {
			defer readers.Done()
			for n := 0; n < READ_TIMES; n++ {
				mu.RLock()
				// The second result of a map lookup is a presence flag, not an error.
				if _, ok := cache["a"]; !ok {
					fmt.Println("Nothing")
				}
				mu.RUnlock()
			}
		}()
	}
	readers.Wait()
}

这段程序一个没有锁,一个有锁。我们看一下测试结果

// BenchmarkLockFree times lockFreeAccess; each benchmark iteration runs one
// complete round of unsynchronized concurrent reads.
func BenchmarkLockFree(b *testing.B) {
	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		lockFreeAccess()
	}
}
//169	   6618595 ns/op	      77 B/op	       1 allocs/op

// BenchmarkLock times lockAccess; each benchmark iteration runs one complete
// round of concurrent reads guarded by a read lock.
func BenchmarkLock(b *testing.B) {
	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		lockAccess()
	}
}
//8	 160448100 ns/op	   10033 B/op	      27 allocs/op

从结果我们可以看到,LockFree比Lock快了一个数量级以上(约24倍)。
上面例子之所以使用sync.RWMutex,是因为Go语言内置的map不是协程安全的,需要加锁来保证协程安全。但对map加锁对性能影响很大,为此Go语言提供了sync.Map,它是一个协程安全的map。

sync.Map

  • 适合读多写少,且 Key 相对稳定的环境
  • 采用了空间换时间的方案,并且采用指针的方式间接实现值的映射,所以存储空间会较 built-in map 大

https://my.oschina.net/qiangmzsx/blog/1827059

Concurrent Map

  • 适用于读写都很频繁的情况

https://github.com/easierway/concurrent_map

各种map的测试

1.定义Map接口

// Map is the minimal key/value contract that each map implementation under
// comparison is adapted to, so one benchmark can drive all of them.
type Map interface {
	// Get looks up key; ok reports whether a value was found.
	Get(key interface{}) (value interface{}, ok bool)
	// Set stores value under key.
	Set(key, value interface{})
	// Del removes key.
	Del(key interface{})
}

2.rw_map实现

// RWLockMap is a built-in map guarded by a sync.RWMutex: reads take the
// read lock, writes and deletes take the write lock.
//
// The zero value is an empty map ready to use. Because it embeds a mutex,
// an RWLockMap must be used through a pointer and not copied.
type RWLockMap struct {
	m    map[interface{}]interface{}
	lock sync.RWMutex
}

// Get returns the value stored under key and whether the key was present.
func (m *RWLockMap) Get(key interface{}) (interface{}, bool) {
	m.lock.RLock()
	// Deferred so the lock is released even if the lookup panics
	// (for example on an unhashable key type).
	defer m.lock.RUnlock()
	v, ok := m.m[key]
	return v, ok
}

// Set stores value under key, replacing any value already stored there.
func (m *RWLockMap) Set(key interface{}, value interface{}) {
	m.lock.Lock()
	defer m.lock.Unlock()
	if m.m == nil {
		// Allocate lazily so a zero-value RWLockMap does not panic on its
		// first write.
		m.m = make(map[interface{}]interface{})
	}
	m.m[key] = value
}

// Del removes key; it is a no-op when the key is absent.
func (m *RWLockMap) Del(key interface{}) {
	m.lock.Lock()
	defer m.lock.Unlock()
	delete(m.m, key)
}

// CreateRWLockMap returns an empty RWLockMap.
func CreateRWLockMap() *RWLockMap {
	return &RWLockMap{m: make(map[interface{}]interface{})}
}

3.sync_map实现

// SyncMapBenchmarkAdapter exposes a sync.Map through Set/Get/Del methods so
// it can be driven by the same benchmark as the other map implementations.
type SyncMapBenchmarkAdapter struct {
	m sync.Map
}

// CreateSyncMapBenchmarkAdapter returns a new adapter around an empty sync.Map.
func CreateSyncMapBenchmarkAdapter() *SyncMapBenchmarkAdapter {
	return new(SyncMapBenchmarkAdapter)
}

// Set stores val under key via sync.Map.Store.
func (a *SyncMapBenchmarkAdapter) Set(key interface{}, val interface{}) {
	a.m.Store(key, val)
}

// Get returns the result of sync.Map.Load for key.
func (a *SyncMapBenchmarkAdapter) Get(key interface{}) (interface{}, bool) {
	value, found := a.m.Load(key)
	return value, found
}

// Del removes key via sync.Map.Delete.
func (a *SyncMapBenchmarkAdapter) Del(key interface{}) {
	a.m.Delete(key)
}

4.concurrent_map实现

import "github.com/easierway/concurrent_map"

// ConcurrentMapBenchmarkAdapter exposes a concurrent_map.ConcurrentMap
// through Set/Get/Del methods taking interface{} keys.
//
// Every method converts the key with a string type assertion, so passing a
// non-string key panics.
type ConcurrentMapBenchmarkAdapter struct {
	cm *concurrent_map.ConcurrentMap
}

// CreateConcurrentMapBenchmarkAdapter returns an adapter around a new
// ConcurrentMap created with numOfPartitions.
func CreateConcurrentMapBenchmarkAdapter(numOfPartitions int) *ConcurrentMapBenchmarkAdapter {
	return &ConcurrentMapBenchmarkAdapter{
		cm: concurrent_map.CreateConcurrentMap(numOfPartitions),
	}
}

// Set stores value under the string key.
func (a *ConcurrentMapBenchmarkAdapter) Set(key interface{}, value interface{}) {
	k := concurrent_map.StrKey(key.(string))
	a.cm.Set(k, value)
}

// Get returns whatever the underlying map's Get returns for the string key.
func (a *ConcurrentMapBenchmarkAdapter) Get(key interface{}) (interface{}, bool) {
	k := concurrent_map.StrKey(key.(string))
	value, found := a.cm.Get(k)
	return value, found
}

// Del removes the string key from the underlying map.
func (a *ConcurrentMapBenchmarkAdapter) Del(key interface{}) {
	k := concurrent_map.StrKey(key.(string))
	a.cm.Del(k)
}

5.测试

// Goroutine counts used by benchmarkMap in every benchmark iteration.
const (
	// NumOfReader is the number of goroutines that only call Get.
	NumOfReader = 100
	// NumOfWriter is the number of goroutines that call Set and Del.
	NumOfWriter = 100
)

// benchmarkMap drives hm with a mixed concurrent workload. In each of the
// b.N rounds it starts NumOfWriter goroutines, each of which sets keys
// "0".."99" twice and then deletes them, plus NumOfReader goroutines that
// each read keys "0".."99", and waits for all of them before the next round.
func benchmarkMap(b *testing.B, hm Map) {
	for round := 0; round < b.N; round++ {
		var wg sync.WaitGroup

		wg.Add(NumOfWriter)
		for w := 0; w < NumOfWriter; w++ {
			go func() {
				defer wg.Done()
				for k := 0; k < 100; k++ {
					hm.Set(strconv.Itoa(k), k*k)
					hm.Set(strconv.Itoa(k), k*k)
					hm.Del(strconv.Itoa(k))
				}
			}()
		}

		wg.Add(NumOfReader)
		for r := 0; r < NumOfReader; r++ {
			go func() {
				defer wg.Done()
				for k := 0; k < 100; k++ {
					hm.Get(strconv.Itoa(k))
				}
			}()
		}

		wg.Wait()
	}
}

func BenchmarkSyncmap(b *testing.B) {
	b.Run("map with RWLock", func(b *testing.B) {
		hm := CreateRWLockMap()
		benchmarkMap(b, hm)
	})

	b.Run("sync.map", func(b *testing.B) {
		hm := CreateSyncMapBenchmarkAdapter()
		benchmarkMap(b, hm)
	})

	b.Run("concurrent map", func(b *testing.B) {
		superman := CreateConcurrentMapBenchmarkAdapter(199)
		benchmarkMap(b, superman)
	})
}

结果分析:

1.当NumOfReader=100,NumOfWriter=100时

2.当NumOfReader=100,NumOfWriter=200时

3.当NumOfReader=200,NumOfWriter=100时

4.当NumOfReader=100,NumOfWriter=10时
从测试结果可以得出:写多读少时使用concurrent_map,读多写少时使用sync.Map。

总结

编写GC友好的代码

复杂对象尽量传递引用

  • 数组的传递
  • 结构体的传递

如下测试程序可以看到数组传递和指针传递的结果:

// NumOfElems is the length of the arrays used by the array-passing test and
// benchmarks.
const NumOfElems = 1000

// Content is a large element type: each value holds an array of 10000 ints.
type Content struct {
	Detail [10000]int
}

// withValue takes the whole array by value and returns 0 without reading it;
// it exists so that the cost of passing the argument can be measured.
func withValue(arr [NumOfElems]Content) int {
	//	fmt.Println(&arr[2])
	return 0
}

// withReference takes a pointer to the array and returns 0 without
// dereferencing it; it is the pointer-passing counterpart of withValue.
func withReference(arr *[NumOfElems]Content) int {
	//b := *arr
	//	fmt.Println(&arr[2])
	return 0
}

// TestFn calls withValue and withReference once each on the same
// zero-valued array; it makes no assertions about the results.
func TestFn(t *testing.T) {
	arr := new([NumOfElems]Content)
	withValue(*arr)
	withReference(arr)
}

// BenchmarkPassingArrayWithValue times calling withValue, which receives the
// [NumOfElems]Content array by value on every iteration. The array is
// declared before the timer is reset.
func BenchmarkPassingArrayWithValue(b *testing.B) {
	var arr [NumOfElems]Content

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		withValue(arr)
	}
	b.StopTimer()
}
//76	  14751696 ns/op	80003074 B/op	       1 allocs/op

// BenchmarkPassingArrayWithRef times calling withReference, which receives
// only a pointer to the [NumOfElems]Content array on every iteration. The
// array is declared before the timer is reset.
func BenchmarkPassingArrayWithRef(b *testing.B) {
	var arr [NumOfElems]Content

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		withReference(&arr)
	}
	b.StopTimer()
}
//1000000000	         0.305 ns/op	       0 B/op	       0 allocs/op

可以看到性能差距是巨大的,平常编程中应该避免对大对象进行值复制。

打开GC日志

只要在程序执行之前加上环境变量GODEBUG=gctrace=1

如:GODEBUG=gctrace=1 go test -bench=.
GODEBUG=gctrace=1 go run main.go

日志详细信息参考:https://godoc.org/runtime

go tool trace

普通程序输出 trace 信息

import (
	"os"
	"runtime/trace"
)

// main creates trace.out, records an execution trace into it for the
// lifetime of the program, and panics if the file cannot be created or
// tracing cannot be started.
func main() {
	out, err := os.Create("trace.out")
	if err != nil {
		panic(err)
	}
	defer out.Close()

	if err := trace.Start(out); err != nil {
		panic(err)
	}
	// Deferred calls run last-in-first-out: the trace is stopped before the
	// file is closed.
	defer trace.Stop()
	// Your program here
}

测试程序输出 trace 信息: go test -trace trace.out

可视化 trace 信息: go tool trace trace.out

初始化至合适的大小

自动扩容是有代价的,如下测试程序

// numOfElems is the number of ints appended to the slice in each iteration
// of the slice-initialization benchmarks.
const numOfElems = 100000

// times is not referenced by the benchmarks shown here.
// NOTE(review): possibly left over from an earlier version — confirm before removing.
const times = 1000

// BenchmarkAutoGrow appends numOfElems ints to a slice that starts with no
// capacity, so the slice has to grow repeatedly as it fills.
func BenchmarkAutoGrow(b *testing.B) {
	for run := 0; run < b.N; run++ {
		var s []int
		for v := 0; v < numOfElems; v++ {
			s = append(s, v)
		}
	}
}
//2022	    590903 ns/op	 4654344 B/op	      30 allocs/op

// BenchmarkProperInit appends numOfElems ints to a slice whose capacity was
// preallocated to exactly numOfElems.
func BenchmarkProperInit(b *testing.B) {
	for run := 0; run < b.N; run++ {
		s := make([]int, 0, numOfElems)
		for v := 0; v < numOfElems; v++ {
			s = append(s, v)
		}
	}
}
//7077	    163259 ns/op	  802820 B/op	       1 allocs/op

// BenchmarkOverSizeInit appends numOfElems ints to a slice whose capacity
// was preallocated to eight times what is needed.
func BenchmarkOverSizeInit(b *testing.B) {
	const oversizeFactor = 8
	for run := 0; run < b.N; run++ {
		s := make([]int, 0, numOfElems*oversizeFactor)
		for v := 0; v < numOfElems; v++ {
			s = append(s, v)
		}
	}
}
//1582	    704184 ns/op	 6406155 B/op	       1 allocs/op

从结果我们可以看到初始化过小会有自动扩容的代价,而初始化过大也会对性能有影响。我们编写程序时,如果能知道切片的大小,初始化合理的值是最佳的。

高效的字符串连接

字符串的连接在开发中是很常见的,下面我们比较一下各种连接方式的性能:

// numbers is how many integers (0 through numbers-1) each string
// concatenation benchmark joins per iteration.
const numbers = 100

// BenchmarkSprintf builds the concatenation of 0..numbers-1 by repeatedly
// formatting the accumulated string together with the next integer.
func BenchmarkSprintf(b *testing.B) {
	b.ResetTimer()
	for run := 0; run < b.N; run++ {
		var acc string
		for n := 0; n < numbers; n++ {
			acc = fmt.Sprintf("%v%v", acc, n)
		}
	}
	b.StopTimer()
}
//64848	     23398 ns/op	   11363 B/op	     198 allocs/op

// BenchmarkStringBuilder builds the concatenation of 0..numbers-1 by writing
// each integer's decimal form into a strings.Builder and then taking the
// resulting string.
func BenchmarkStringBuilder(b *testing.B) {
	b.ResetTimer()
	for run := 0; run < b.N; run++ {
		var sb strings.Builder
		for n := 0; n < numbers; n++ {
			sb.WriteString(strconv.Itoa(n))
		}
		_ = sb.String()
	}
	b.StopTimer()
}
//668382	      1651 ns/op	     504 B/op	       6 allocs/op

// BenchmarkBytesBuf builds the concatenation of 0..numbers-1 by writing each
// integer's decimal form into a bytes.Buffer and then taking the resulting
// string.
func BenchmarkBytesBuf(b *testing.B) {
	b.ResetTimer()
	for run := 0; run < b.N; run++ {
		var bb bytes.Buffer
		for n := 0; n < numbers; n++ {
			bb.WriteString(strconv.Itoa(n))
		}
		_ = bb.String()
	}
	b.StopTimer()
}
//812496	      1824 ns/op	     688 B/op	       4 allocs/op

// BenchmarkStringAdd builds the concatenation of 0..numbers-1 with plain
// string addition, producing a new string on every step.
func BenchmarkStringAdd(b *testing.B) {
	b.ResetTimer()
	for run := 0; run < b.N; run++ {
		var acc string
		for n := 0; n < numbers; n++ {
			acc = acc + strconv.Itoa(n)
		}
	}
	b.StopTimer()
}
//171823	      7720 ns/op	    9776 B/op	      99 allocs/op

可以看到性能最好的是StringBuilder,BytesBuf与它差距不大,其他两种方式性能较差。平常开发中建议使用strings.Builder。