| 
									
										
										
										
											2021-04-19 03:41:13 +08:00
										 |  |  | // Copyright (c) 2015-2021 MinIO, Inc.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This file is part of MinIO Object Storage stack
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify
 | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by
 | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or
 | 
					
						
							|  |  |  | // (at your option) any later version.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful
 | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
					
						
							|  |  |  | // GNU Affero General Public License for more details.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License
 | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-08-19 07:23:42 +08:00
										 |  |  | package cmd | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							|  |  |  | 	"bytes" | 
					
						
							| 
									
										
										
										
											2018-04-06 06:04:40 +08:00
										 |  |  | 	"context" | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 	"crypto/rand" | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	"io" | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	"os" | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 	"testing" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | var erasureHealTests = []struct { | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 	dataBlocks, disks int | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// number of offline disks is also number of staleDisks for
 | 
					
						
							|  |  |  | 	// erasure reconstruction in this test
 | 
					
						
							|  |  |  | 	offDisks int | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// bad disks are online disks which return errors
 | 
					
						
							|  |  |  | 	badDisks, badStaleDisks int | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	blocksize, size int64 | 
					
						
							|  |  |  | 	algorithm       BitrotAlgorithm | 
					
						
							|  |  |  | 	shouldFail      bool | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | }{ | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: SHA256, shouldFail: false},                   // 0
 | 
					
						
							|  |  |  | 	{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 1
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 2
 | 
					
						
							|  |  |  | 	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 3
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: SHA256, shouldFail: false},                  // 4
 | 
					
						
							|  |  |  | 	{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 5
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badStaleDisks: 1, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 6
 | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 	{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},            // 7
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 8
 | 
					
						
							|  |  |  | 	{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badStaleDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true},                 // 9
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},   // 10
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},    // 11
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},   // 12
 | 
					
						
							|  |  |  | 	{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},              // 13
 | 
					
						
							|  |  |  | 	{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},  // 14
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true},   // 15
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false},   // 16
 | 
					
						
							|  |  |  | 	{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 17
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false},               // 18
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte * 64, algorithm: SHA256, shouldFail: false},              // 19
 | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func TestErasureHeal(t *testing.T) { | 
					
						
							|  |  |  | 	for i, test := range erasureHealTests { | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 		if test.offDisks < test.badStaleDisks { | 
					
						
							|  |  |  | 			// test case sanity check
 | 
					
						
							| 
									
										
										
										
											2022-08-05 07:10:08 +08:00
										 |  |  | 			t.Fatalf("Test %d: Bad test case - number of stale drives cannot be less than number of badstale drives", i) | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		// create some test data
 | 
					
						
							| 
									
										
										
										
											2022-07-26 03:37:26 +08:00
										 |  |  | 		setup, err := newErasureTestSetup(t, test.dataBlocks, test.disks-test.dataBlocks, test.blocksize) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		if err != nil { | 
					
						
							| 
									
										
										
										
											2020-06-13 11:04:01 +08:00
										 |  |  | 			t.Fatalf("Test %d: failed to setup Erasure environment: %v", i, err) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		disks := setup.disks | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 		erasure, err := NewErasure(context.Background(), test.dataBlocks, test.disks-test.dataBlocks, test.blocksize) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err != nil { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		data := make([]byte, test.size) | 
					
						
							|  |  |  | 		if _, err = io.ReadFull(rand.Reader, data); err != nil { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to create random test data: %v", i, err) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		buffer := make([]byte, test.blocksize, 2*test.blocksize) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		writers := make([]io.Writer, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for i, disk := range disks { | 
					
						
							| 
									
										
										
										
											2021-05-17 23:32:28 +08:00
										 |  |  | 			writers[i] = newBitrotWriter(disk, "testbucket", "testobject", erasure.ShardFileSize(test.size), test.algorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 		_, err = erasure.Encode(context.Background(), bytes.NewReader(data), writers, buffer, erasure.dataBlocks+1) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotWriters(writers) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		if err != nil { | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			t.Fatalf("Test %d: failed to create random test data: %v", i, err) | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		readers := make([]io.ReaderAt, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for i, disk := range disks { | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 			shardFilesize := erasure.ShardFileSize(test.size) | 
					
						
							| 
									
										
										
										
											2021-01-08 11:27:31 +08:00
										 |  |  | 			readers[i] = newBitrotReader(disk, nil, "testbucket", "testobject", shardFilesize, test.algorithm, bitrotWriterSum(writers[i]), erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 		// setup stale disks for the test case
 | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		staleDisks := make([]StorageAPI, len(disks)) | 
					
						
							|  |  |  | 		copy(staleDisks, disks) | 
					
						
							|  |  |  | 		for j := 0; j < len(staleDisks); j++ { | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 			if j < test.offDisks { | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 				readers[j] = nil | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 			} else { | 
					
						
							|  |  |  | 				staleDisks[j] = nil | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		for j := 0; j < test.badDisks; j++ { | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 			switch r := readers[test.offDisks+j].(type) { | 
					
						
							|  |  |  | 			case *streamingBitrotReader: | 
					
						
							|  |  |  | 				r.disk = badDisk{nil} | 
					
						
							|  |  |  | 			case *wholeBitrotReader: | 
					
						
							|  |  |  | 				r.disk = badDisk{nil} | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 		for j := 0; j < test.badStaleDisks; j++ { | 
					
						
							|  |  |  | 			staleDisks[j] = badDisk{nil} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		staleWriters := make([]io.Writer, len(staleDisks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for i, disk := range staleDisks { | 
					
						
							|  |  |  | 			if disk == nil { | 
					
						
							|  |  |  | 				continue | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 			os.Remove(pathJoin(disk.String(), "testbucket", "testobject")) | 
					
						
							| 
									
										
										
										
											2021-05-17 23:32:28 +08:00
										 |  |  | 			staleWriters[i] = newBitrotWriter(disk, "testbucket", "testobject", erasure.ShardFileSize(test.size), test.algorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		// test case setup is complete - now call Heal()
 | 
					
						
							| 
									
										
										
										
											2023-08-03 17:18:18 +08:00
										 |  |  | 		err = erasure.Heal(context.Background(), staleWriters, readers, test.size, nil) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotReaders(readers) | 
					
						
							|  |  |  | 		closeBitrotWriters(staleWriters) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err != nil && !test.shouldFail { | 
					
						
							|  |  |  | 			t.Errorf("Test %d: should pass but it failed with: %v", i, err) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if err == nil && test.shouldFail { | 
					
						
							|  |  |  | 			t.Errorf("Test %d: should fail but it passed", i) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if err == nil { | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 			// Verify that checksums of staleDisks
 | 
					
						
							|  |  |  | 			// match expected values
 | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 			for i := range staleWriters { | 
					
						
							|  |  |  | 				if staleWriters[i] == nil { | 
					
						
							| 
									
										
										
										
											2017-09-21 00:50:27 +08:00
										 |  |  | 					continue | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 				} | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 				if !bytes.Equal(bitrotWriterSum(staleWriters[i]), bitrotWriterSum(writers[i])) { | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 					t.Errorf("Test %d: heal returned different bitrot checksums", i) | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2016-08-18 02:36:33 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | } |