caching: Optimize memory allocations. (#3405)
This change brings in changes at multiple places:
- Reuse buffers at almost all locations, ranging from rpc, fs, xl, checksum etc.
- Change caching behavior to disable itself under low memory conditions, i.e. < 8GB of RAM.
- Only objects up to 1/10th the size of the cache are cached; for example, if 4GB is the cache size, the maximum object size which will be cached is 400MB. This change is an optimization to cache more objects rather than a few larger objects.
- If object cache is enabled, the default GC percent is reduced to 20%, in line with the newly found behavior of GC. If the cache utilization reaches 75% of the maximum value, GC percent is reduced to 10% to make GC more aggressive.
- Do not use *bytes.Buffer* due to its growth requirements. For every allocation *bytes.Buffer* allocates an additional buffer for its internal purposes. This is undesirable for us, so a new cappedWriter is implemented which is capped to a desired size; beyond this all writes are rejected.
Possible fix for #3403.
This commit is contained in:
@@ -29,7 +29,7 @@ import (
|
||||
// all the disks, writes also calculate individual block's checksum
|
||||
// for future bit-rot protection.
|
||||
func erasureCreateFile(disks []StorageAPI, volume, path string, reader io.Reader, blockSize int64, dataBlocks int, parityBlocks int, algo string, writeQuorum int) (bytesWritten int64, checkSums []string, err error) {
|
||||
// Allocated blockSized buffer for reading.
|
||||
// Allocated blockSized buffer for reading from incoming stream.
|
||||
buf := make([]byte, blockSize)
|
||||
|
||||
hashWriters := newHashWriters(len(disks), algo)
|
||||
|
||||
+14
-3
@@ -21,6 +21,7 @@ import (
|
||||
"errors"
|
||||
"hash"
|
||||
"io"
|
||||
"sync"
|
||||
|
||||
"github.com/klauspost/reedsolomon"
|
||||
"github.com/minio/blake2b-simd"
|
||||
@@ -47,13 +48,23 @@ func newHash(algo string) hash.Hash {
|
||||
}
|
||||
}
|
||||
|
||||
// Hash buffer pool is a pool of reusable
|
||||
// buffers used while checksumming a stream.
|
||||
var hashBufferPool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
b := make([]byte, readSizeV1)
|
||||
return &b
|
||||
},
|
||||
}
|
||||
|
||||
// hashSum calculates the hash of the entire path and returns.
|
||||
func hashSum(disk StorageAPI, volume, path string, writer hash.Hash) ([]byte, error) {
|
||||
// Allocate staging buffer of 128KiB for copyBuffer.
|
||||
buf := make([]byte, readSizeV1)
|
||||
// Fetch a new staging buffer from the pool.
|
||||
bufp := hashBufferPool.Get().(*[]byte)
|
||||
defer hashBufferPool.Put(bufp)
|
||||
|
||||
// Copy entire buffer to writer.
|
||||
if err := copyBuffer(writer, disk, volume, path, buf); err != nil {
|
||||
if err := copyBuffer(writer, disk, volume, path, *bufp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
@@ -129,6 +129,8 @@ func (b *backgroundAppend) abort(uploadID string) {
|
||||
func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID string, info bgAppendPartsInfo) {
|
||||
// Holds the list of parts that is already appended to the "append" file.
|
||||
appendMeta := fsMetaV1{}
|
||||
// Allocate staging read buffer.
|
||||
buf := make([]byte, readSizeV1)
|
||||
for {
|
||||
select {
|
||||
case input := <-info.inputCh:
|
||||
@@ -151,7 +153,7 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
|
||||
}
|
||||
break
|
||||
}
|
||||
if err := appendPart(disk, bucket, object, uploadID, part); err != nil {
|
||||
if err := appendPart(disk, bucket, object, uploadID, part, buf); err != nil {
|
||||
disk.DeleteFile(minioMetaTmpBucket, uploadID)
|
||||
appendMeta.Parts = nil
|
||||
input.errCh <- err
|
||||
@@ -183,12 +185,11 @@ func (b *backgroundAppend) appendParts(disk StorageAPI, bucket, object, uploadID
|
||||
|
||||
// Appends the "part" to the append-file inside "tmp/" that finally gets moved to the actual location
|
||||
// upon complete-multipart-upload.
|
||||
func appendPart(disk StorageAPI, bucket, object, uploadID string, part objectPartInfo) error {
|
||||
func appendPart(disk StorageAPI, bucket, object, uploadID string, part objectPartInfo, buf []byte) error {
|
||||
partPath := pathJoin(bucket, object, uploadID, part.Name)
|
||||
|
||||
offset := int64(0)
|
||||
totalLeft := part.Size
|
||||
buf := make([]byte, readSizeV1)
|
||||
for totalLeft > 0 {
|
||||
curLeft := int64(readSizeV1)
|
||||
if totalLeft < readSizeV1 {
|
||||
|
||||
+4
-2
@@ -61,8 +61,10 @@ var (
|
||||
|
||||
globalIsDistXL = false // "Is Distributed?" flag.
|
||||
|
||||
// Maximum cache size.
|
||||
globalMaxCacheSize = uint64(maxCacheSize)
|
||||
// Maximum cache size. Defaults to disabled.
|
||||
// Caching is enabled only for RAM size > 8GiB.
|
||||
globalMaxCacheSize = uint64(0)
|
||||
|
||||
// Cache expiry.
|
||||
globalCacheExpiry = objcache.DefaultExpiry
|
||||
// Minio local server address (in `host:port` format)
|
||||
|
||||
@@ -253,11 +253,9 @@ func retryFormattingDisks(firstDisk bool, endpoints []*url.URL, storageDisks []S
|
||||
// Print configuration errors.
|
||||
printConfigErrMsg(storageDisks, sErrs, printOnceFn())
|
||||
case WaitForAll:
|
||||
console.Printf("Initializing data volume for first time. Waiting for other servers to come online (elapsed %s)\n",
|
||||
getElapsedTime())
|
||||
console.Printf("Initializing data volume for first time. Waiting for other servers to come online (elapsed %s)\n", getElapsedTime())
|
||||
case WaitForFormatting:
|
||||
console.Println("Initializing data volume for first time. Waiting for first server to come online (elapsed %s)\n",
|
||||
getElapsedTime())
|
||||
console.Printf("Initializing data volume for first time. Waiting for first server to come online (elapsed %s)\n", getElapsedTime())
|
||||
}
|
||||
continue
|
||||
} // else We have FS backend now. Check fs format as well now.
|
||||
|
||||
@@ -66,18 +66,18 @@ func setMaxMemory() error {
|
||||
// Validate if rlimit memory is set to lower
|
||||
// than max cache size. Then we should use such value.
|
||||
if uint64(rLimit.Cur) < globalMaxCacheSize {
|
||||
globalMaxCacheSize = (80 / 100) * uint64(rLimit.Cur)
|
||||
globalMaxCacheSize = uint64(float64(50*rLimit.Cur) / 100)
|
||||
}
|
||||
|
||||
// Make sure globalMaxCacheSize is less than RAM size.
|
||||
stats, err := sys.GetStats()
|
||||
if err != nil && err != sys.ErrNotImplemented {
|
||||
// sys.GetStats() is implemented only on linux. Ignore errors
|
||||
// from other OSes.
|
||||
return err
|
||||
}
|
||||
if err == nil && stats.TotalRAM < globalMaxCacheSize {
|
||||
globalMaxCacheSize = uint64(float64(80*stats.TotalRAM) / 100)
|
||||
// If TotalRAM is >= minRAMSize we proceed to enable cache.
|
||||
// cache is always 50% of the totalRAM.
|
||||
if err == nil && stats.TotalRAM >= minRAMSize {
|
||||
globalMaxCacheSize = uint64(float64(50*stats.TotalRAM) / 100)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
package cmd
|
||||
|
||||
import "github.com/minio/minio/pkg/sys"
|
||||
|
||||
func setMaxOpenFiles() error {
|
||||
// Golang uses Win32 file API (CreateFile, WriteFile, ReadFile,
|
||||
// CloseHandle, etc.), then you don't have a limit on open files
|
||||
@@ -26,6 +28,15 @@ func setMaxOpenFiles() error {
|
||||
}
|
||||
|
||||
func setMaxMemory() error {
|
||||
// TODO: explore if Win32 API's provide anything special here.
|
||||
// Make sure globalMaxCacheSize is less than RAM size.
|
||||
stats, err := sys.GetStats()
|
||||
if err != nil && err != sys.ErrNotImplemented {
|
||||
return err
|
||||
}
|
||||
// If TotalRAM is >= minRAMSize we proceed to enable cache.
|
||||
// cache is always 50% of the totalRAM.
|
||||
if err == nil && stats.TotalRAM >= minRAMSize {
|
||||
globalMaxCacheSize = uint64(float64(50*stats.TotalRAM) / 100)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"net"
|
||||
"net/rpc"
|
||||
@@ -366,6 +367,13 @@ func (n *networkStorage) ReadFile(volume string, path string, offset int64, buff
|
||||
}
|
||||
}()
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
// Recover any panic from allocation, and return error.
|
||||
err = bytes.ErrTooLarge
|
||||
}
|
||||
}() // Do not crash the server.
|
||||
|
||||
// Take remote disk offline if the total network errors
|
||||
// are more than maximum allowable IO error limit.
|
||||
if n.networkIOErrCount > maxAllowedNetworkIOError {
|
||||
@@ -377,10 +385,12 @@ func (n *networkStorage) ReadFile(volume string, path string, offset int64, buff
|
||||
Vol: volume,
|
||||
Path: path,
|
||||
Offset: offset,
|
||||
Size: len(buffer),
|
||||
Buffer: buffer,
|
||||
}, &result)
|
||||
|
||||
// Copy results to buffer.
|
||||
copy(buffer, result)
|
||||
|
||||
// Return length of result, err if any.
|
||||
return int64(len(result)), toStorageErr(err)
|
||||
}
|
||||
|
||||
@@ -57,8 +57,8 @@ type ReadFileArgs struct {
|
||||
// Starting offset to start reading into Buffer.
|
||||
Offset int64
|
||||
|
||||
// Data size read from the path at offset.
|
||||
Size int
|
||||
// Data buffer read from the path at offset.
|
||||
Buffer []byte
|
||||
}
|
||||
|
||||
// PrepareFileArgs represents append file RPC arguments.
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"net/rpc"
|
||||
"path"
|
||||
@@ -156,19 +155,12 @@ func (s *storageServer) ReadAllHandler(args *ReadFileArgs, reply *[]byte) error
|
||||
|
||||
// ReadFileHandler - read file handler is rpc wrapper to read file.
|
||||
func (s *storageServer) ReadFileHandler(args *ReadFileArgs, reply *[]byte) (err error) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
// Recover any panic and return ErrCacheFull.
|
||||
err = bytes.ErrTooLarge
|
||||
}
|
||||
}() // Do not crash the server.
|
||||
if !isRPCTokenValid(args.Token) {
|
||||
return errInvalidToken
|
||||
}
|
||||
// Allocate the requested buffer from the client.
|
||||
*reply = make([]byte, args.Size)
|
||||
|
||||
var n int64
|
||||
n, err = s.storage.ReadFile(args.Vol, args.Path, args.Offset, *reply)
|
||||
n, err = s.storage.ReadFile(args.Vol, args.Path, args.Offset, args.Buffer)
|
||||
// Sending an error over the rpc layer, would cause unmarshalling to fail. In situations
|
||||
// when we have short read i.e `io.ErrUnexpectedEOF` treat it as good condition and copy
|
||||
// the buffer properly.
|
||||
@@ -176,7 +168,7 @@ func (s *storageServer) ReadFileHandler(args *ReadFileArgs, reply *[]byte) (err
|
||||
// Reset to nil as good condition.
|
||||
err = nil
|
||||
}
|
||||
*reply = (*reply)[0:n]
|
||||
*reply = args.Buffer[0:n]
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
@@ -62,6 +62,9 @@ func init() {
|
||||
|
||||
// Disable printing console messages during tests.
|
||||
color.Output = ioutil.Discard
|
||||
|
||||
// Enable caching.
|
||||
setMaxMemory()
|
||||
}
|
||||
|
||||
func prepareFS() (ObjectLayer, string, error) {
|
||||
|
||||
+11
-7
@@ -708,17 +708,21 @@ func (xl xlObjects) CompleteMultipartUpload(bucket string, object string, upload
|
||||
destLock := nsMutex.NewNSLock(bucket, object)
|
||||
destLock.Lock()
|
||||
defer func() {
|
||||
// A new complete multipart upload invalidates any
|
||||
// previously cached object in memory.
|
||||
xl.objCache.Delete(path.Join(bucket, object))
|
||||
if xl.objCacheEnabled {
|
||||
// A new complete multipart upload invalidates any
|
||||
// previously cached object in memory.
|
||||
xl.objCache.Delete(path.Join(bucket, object))
|
||||
}
|
||||
|
||||
// This lock also protects the cache namespace.
|
||||
destLock.Unlock()
|
||||
|
||||
// Prefetch the object from disk by triggering a fake GetObject call
|
||||
// Unlike a regular single PutObject, multipart PutObject comes in
|
||||
// stages and it is harder to cache.
|
||||
go xl.GetObject(bucket, object, 0, objectSize, ioutil.Discard)
|
||||
if xl.objCacheEnabled {
|
||||
// Prefetch the object from disk by triggering a fake GetObject call
|
||||
// Unlike a regular single PutObject, multipart PutObject comes in
|
||||
// stages and it is harder to cache.
|
||||
go xl.GetObject(bucket, object, 0, objectSize, ioutil.Discard)
|
||||
}
|
||||
}()
|
||||
|
||||
// Rename if an object already exists to temporary location.
|
||||
|
||||
+4
-2
@@ -621,8 +621,10 @@ func (xl xlObjects) DeleteObject(bucket, object string) (err error) {
|
||||
return toObjectErr(err, bucket, object)
|
||||
}
|
||||
|
||||
// Delete from the cache.
|
||||
xl.objCache.Delete(pathJoin(bucket, object))
|
||||
if xl.objCacheEnabled {
|
||||
// Delete from the cache.
|
||||
xl.objCache.Delete(pathJoin(bucket, object))
|
||||
}
|
||||
|
||||
// Success.
|
||||
return nil
|
||||
|
||||
+23
-12
@@ -19,6 +19,7 @@ package cmd
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -42,8 +43,9 @@ const (
|
||||
// Uploads metadata file carries per multipart object metadata.
|
||||
uploadsJSONFile = "uploads.json"
|
||||
|
||||
// 8GiB cache by default.
|
||||
maxCacheSize = 8 * humanize.GiByte
|
||||
// Represents the minimum required RAM size before
|
||||
// we enable caching.
|
||||
minRAMSize = 8 * humanize.GiByte
|
||||
|
||||
// Maximum erasure blocks.
|
||||
maxErasureBlocks = 16
|
||||
@@ -92,9 +94,6 @@ func newXLObjects(storageDisks []StorageAPI) (ObjectLayer, error) {
|
||||
// Calculate data and parity blocks.
|
||||
dataBlocks, parityBlocks := len(newStorageDisks)/2, len(newStorageDisks)/2
|
||||
|
||||
// Initialize object cache.
|
||||
objCache := objcache.New(globalMaxCacheSize, globalCacheExpiry)
|
||||
|
||||
// Initialize list pool.
|
||||
listPool := newTreeWalkPool(globalLookupTimeout)
|
||||
|
||||
@@ -103,13 +102,25 @@ func newXLObjects(storageDisks []StorageAPI) (ObjectLayer, error) {
|
||||
|
||||
// Initialize xl objects.
|
||||
xl := &xlObjects{
|
||||
mutex: &sync.Mutex{},
|
||||
storageDisks: newStorageDisks,
|
||||
dataBlocks: dataBlocks,
|
||||
parityBlocks: parityBlocks,
|
||||
listPool: listPool,
|
||||
objCache: objCache,
|
||||
objCacheEnabled: !objCacheDisabled,
|
||||
mutex: &sync.Mutex{},
|
||||
storageDisks: newStorageDisks,
|
||||
dataBlocks: dataBlocks,
|
||||
parityBlocks: parityBlocks,
|
||||
listPool: listPool,
|
||||
}
|
||||
|
||||
// Object cache is enabled when _MINIO_CACHE env is missing
|
||||
// and cache size is > 0.
|
||||
xl.objCacheEnabled = !objCacheDisabled && globalMaxCacheSize > 0
|
||||
|
||||
// Check if object cache is enabled.
|
||||
if xl.objCacheEnabled {
|
||||
// Initialize object cache.
|
||||
objCache := objcache.New(globalMaxCacheSize, globalCacheExpiry)
|
||||
objCache.OnEviction = func(key string) {
|
||||
debug.FreeOSMemory()
|
||||
}
|
||||
xl.objCache = objCache
|
||||
}
|
||||
|
||||
// Initialize meta volume, if volume already exists ignores it.
|
||||
|
||||
Reference in New Issue
Block a user