| 
									
										
										
										
											2021-04-19 03:41:13 +08:00
										 |  |  | // Copyright (c) 2015-2021 MinIO, Inc.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This file is part of MinIO Object Storage stack
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This program is free software: you can redistribute it and/or modify
 | 
					
						
							|  |  |  | // it under the terms of the GNU Affero General Public License as published by
 | 
					
						
							|  |  |  | // the Free Software Foundation, either version 3 of the License, or
 | 
					
						
							|  |  |  | // (at your option) any later version.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // This program is distributed in the hope that it will be useful
 | 
					
						
							|  |  |  | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
					
						
							|  |  |  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
					
						
							|  |  |  | // GNU Affero General Public License for more details.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // You should have received a copy of the GNU Affero General Public License
 | 
					
						
							|  |  |  | // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | 
					
						
							| 
									
										
										
										
											2016-07-06 02:41:25 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-08-19 07:23:42 +08:00
										 |  |  | package cmd | 
					
						
							| 
									
										
										
										
											2016-07-06 02:41:25 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | import ( | 
					
						
							|  |  |  | 	"bytes" | 
					
						
							| 
									
										
										
										
											2018-04-06 06:04:40 +08:00
										 |  |  | 	"context" | 
					
						
							| 
									
										
										
										
											2021-05-17 23:32:28 +08:00
										 |  |  | 	crand "crypto/rand" | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	"io" | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	"math/rand" | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | 	"testing" | 
					
						
							| 
									
										
										
										
											2016-07-26 05:17:01 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-17 23:32:28 +08:00
										 |  |  | 	"github.com/dustin/go-humanize" | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2016-07-06 02:41:25 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-05 00:45:06 +08:00
										 |  |  | func (a badDisk) ReadFile(ctx context.Context, volume string, path string, offset int64, buf []byte, verifier *BitrotVerifier) (n int64, err error) { | 
					
						
							| 
									
										
										
										
											2017-05-17 05:21:52 +08:00
										 |  |  | 	return 0, errFaultyDisk | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | var erasureDecodeTests = []struct { | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	dataBlocks                   int | 
					
						
							|  |  |  | 	onDisks, offDisks            int | 
					
						
							|  |  |  | 	blocksize, data              int64 | 
					
						
							|  |  |  | 	offset                       int64 | 
					
						
							|  |  |  | 	length                       int64 | 
					
						
							|  |  |  | 	algorithm                    BitrotAlgorithm | 
					
						
							|  |  |  | 	shouldFail, shouldFailQuorum bool | 
					
						
							|  |  |  | }{ | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},             // 0
 | 
					
						
							|  |  |  | 	{dataBlocks: 3, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false},                 // 1
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 8, offDisks: 0, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 2
 | 
					
						
							|  |  |  | 	{dataBlocks: 5, onDisks: 10, offDisks: 0, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 1, length: oneMiByte - 1, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},        // 3
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	{dataBlocks: 6, onDisks: 12, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: oneMiByte, length: 0, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, | 
					
						
							|  |  |  | 	// 4
 | 
					
						
							|  |  |  | 	{dataBlocks: 7, onDisks: 14, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: 3, length: 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                    // 5
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, onDisks: 16, offDisks: 0, blocksize: int64(oneMiByte), data: oneMiByte, offset: 4, length: 8 * 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                // 6
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 7, onDisks: 14, offDisks: 7, blocksize: int64(blockSizeV2), data: oneMiByte, offset: oneMiByte, length: 1, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false},              // 7
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, onDisks: 12, offDisks: 6, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},             // 8
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	{dataBlocks: 5, onDisks: 10, offDisks: 5, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                           // 9
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 4, onDisks: 8, offDisks: 4, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false},                              // 10
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	{dataBlocks: 3, onDisks: 6, offDisks: 3, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                // 11
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 2, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},              // 12
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 1, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                // 13
 | 
					
						
							|  |  |  | 	{dataBlocks: 3, onDisks: 6, offDisks: 2, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                // 14
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 8, offDisks: 3, blocksize: int64(2 * oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},            // 15
 | 
					
						
							|  |  |  | 	{dataBlocks: 5, onDisks: 10, offDisks: 6, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},                // 16
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 5, onDisks: 10, offDisks: 2, blocksize: int64(blockSizeV2), data: 2 * oneMiByte, offset: oneMiByte, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 17
 | 
					
						
							|  |  |  | 	{dataBlocks: 5, onDisks: 10, offDisks: 1, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                         // 18
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, onDisks: 12, offDisks: 3, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	// 19
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 6, onDisks: 12, offDisks: 7, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},                                             // 20
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, onDisks: 16, offDisks: 8, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                                            // 21
 | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	{dataBlocks: 8, onDisks: 16, offDisks: 9, blocksize: int64(oneMiByte), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},                                               // 22
 | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	{dataBlocks: 8, onDisks: 16, offDisks: 7, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                                            // 23
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 1, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                                             // 24
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: oneMiByte, offset: 0, length: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},                                             // 25
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(blockSizeV2) + 1, offset: 0, length: int64(blockSizeV2) + 1, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                               // 26
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 12, length: int64(blockSizeV2) + 17, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                             // 27
 | 
					
						
							|  |  |  | 	{dataBlocks: 3, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 1023, length: int64(blockSizeV2) + 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},             // 28
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 8, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 11, length: int64(blockSizeV2) + 2*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},             // 29
 | 
					
						
							|  |  |  | 	{dataBlocks: 6, onDisks: 12, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 512, length: int64(blockSizeV2) + 8*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},           // 30
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, onDisks: 16, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: int64(blockSizeV2), length: int64(blockSizeV2) - 1, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 31
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(oneMiByte), offset: -1, length: 3, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false},                                              // 32
 | 
					
						
							|  |  |  | 	{dataBlocks: 2, onDisks: 4, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(oneMiByte), offset: 1024, length: -1, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false},                                           // 33
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 6, offDisks: 0, blocksize: int64(blockSizeV2), data: int64(blockSizeV2), offset: 0, length: int64(blockSizeV2), algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                                       // 34
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 6, offDisks: 1, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 12, length: int64(blockSizeV2) + 17, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false},                             // 35
 | 
					
						
							|  |  |  | 	{dataBlocks: 4, onDisks: 6, offDisks: 3, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 1023, length: int64(blockSizeV2) + 1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true},              // 36
 | 
					
						
							|  |  |  | 	{dataBlocks: 8, onDisks: 12, offDisks: 4, blocksize: int64(blockSizeV2), data: int64(2 * blockSizeV2), offset: 11, length: int64(blockSizeV2) + 2*1024, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false},            // 37
 | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func TestErasureDecode(t *testing.T) { | 
					
						
							|  |  |  | 	for i, test := range erasureDecodeTests { | 
					
						
							| 
									
										
										
										
											2022-07-26 03:37:26 +08:00
										 |  |  | 		setup, err := newErasureTestSetup(t, test.dataBlocks, test.onDisks-test.dataBlocks, test.blocksize) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err != nil { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to create test setup: %v", i, err) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 		erasure, err := NewErasure(t.Context(), test.dataBlocks, test.onDisks-test.dataBlocks, test.blocksize) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err != nil { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		disks := setup.disks | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		data := make([]byte, test.data) | 
					
						
							|  |  |  | 		if _, err = io.ReadFull(crand.Reader, data); err != nil { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to generate random test data: %v", i, err) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		writeAlgorithm := test.algorithm | 
					
						
							|  |  |  | 		if !test.algorithm.Available() { | 
					
						
							|  |  |  | 			writeAlgorithm = DefaultBitrotAlgorithm | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		buffer := make([]byte, test.blocksize, 2*test.blocksize) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		writers := make([]io.Writer, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for i, disk := range disks { | 
					
						
							| 
									
										
										
										
											2024-01-31 04:43:25 +08:00
										 |  |  | 			writers[i] = newBitrotWriter(disk, "", "testbucket", "object", erasure.ShardFileSize(test.data), writeAlgorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 		n, err := erasure.Encode(t.Context(), bytes.NewReader(data), writers, buffer, erasure.dataBlocks+1) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotWriters(writers) | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | 		if err != nil { | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			t.Fatalf("Test %d: failed to create erasure test file: %v", i, err) | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		if n != test.data { | 
					
						
							|  |  |  | 			t.Fatalf("Test %d: failed to create erasure test file", i) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		for i, w := range writers { | 
					
						
							|  |  |  | 			if w == nil { | 
					
						
							|  |  |  | 				disks[i] = nil | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		// Get the checksums of the current part.
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		bitrotReaders := make([]io.ReaderAt, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for index, disk := range disks { | 
					
						
							|  |  |  | 			if disk == OfflineDisk { | 
					
						
							|  |  |  | 				continue | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2020-06-13 11:04:01 +08:00
										 |  |  | 			tillOffset := erasure.ShardFileOffset(test.offset, test.length, test.data) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-08 11:27:31 +08:00
										 |  |  | 			bitrotReaders[index] = newBitrotReader(disk, nil, "testbucket", "object", tillOffset, writeAlgorithm, bitrotWriterSum(writers[index]), erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		writer := bytes.NewBuffer(nil) | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 		_, err = erasure.Decode(t.Context(), writer, bitrotReaders, test.offset, test.length, test.data, nil) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotReaders(bitrotReaders) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err != nil && !test.shouldFail { | 
					
						
							|  |  |  | 			t.Errorf("Test %d: should pass but failed with: %v", i, err) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if err == nil && test.shouldFail { | 
					
						
							|  |  |  | 			t.Errorf("Test %d: should fail but it passed", i) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if err == nil { | 
					
						
							|  |  |  | 			if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) { | 
					
						
							| 
									
										
										
										
											2024-01-18 15:03:17 +08:00
										 |  |  | 				t.Errorf("Test %d: read returns wrong file content.", i) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for i, r := range bitrotReaders { | 
					
						
							|  |  |  | 			if r == nil { | 
					
						
							|  |  |  | 				disks[i] = OfflineDisk | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 		if err == nil && !test.shouldFail { | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 			bitrotReaders = make([]io.ReaderAt, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 			for index, disk := range disks { | 
					
						
							|  |  |  | 				if disk == OfflineDisk { | 
					
						
							|  |  |  | 					continue | 
					
						
							|  |  |  | 				} | 
					
						
							| 
									
										
										
										
											2020-06-13 11:04:01 +08:00
										 |  |  | 				tillOffset := erasure.ShardFileOffset(test.offset, test.length, test.data) | 
					
						
							| 
									
										
										
										
											2021-01-08 11:27:31 +08:00
										 |  |  | 				bitrotReaders[index] = newBitrotReader(disk, nil, "testbucket", "object", tillOffset, writeAlgorithm, bitrotWriterSum(writers[index]), erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 			} | 
					
						
							|  |  |  | 			for j := range disks[:test.offDisks] { | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 				if bitrotReaders[j] == nil { | 
					
						
							|  |  |  | 					continue | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				switch r := bitrotReaders[j].(type) { | 
					
						
							|  |  |  | 				case *wholeBitrotReader: | 
					
						
							|  |  |  | 					r.disk = badDisk{nil} | 
					
						
							|  |  |  | 				case *streamingBitrotReader: | 
					
						
							|  |  |  | 					r.disk = badDisk{nil} | 
					
						
							|  |  |  | 				} | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			} | 
					
						
							|  |  |  | 			if test.offDisks > 0 { | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 				bitrotReaders[0] = nil | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 			writer.Reset() | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 			_, err = erasure.Decode(t.Context(), writer, bitrotReaders, test.offset, test.length, test.data, nil) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 			closeBitrotReaders(bitrotReaders) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 			if err != nil && !test.shouldFailQuorum { | 
					
						
							|  |  |  | 				t.Errorf("Test %d: should pass but failed with: %v", i, err) | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			if err == nil && test.shouldFailQuorum { | 
					
						
							|  |  |  | 				t.Errorf("Test %d: should fail but it passed", i) | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			if !test.shouldFailQuorum { | 
					
						
							|  |  |  | 				if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) { | 
					
						
							| 
									
										
											  
											
												Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read 
from the local disk, even if the disk contains a parity shard. 
In default setup there is a 50% chance that at least 
one shard that otherwise would have been fetched remotely 
will be read locally instead.
It basically trades RPC call overhead for reed-solomon. 
On distributed localhost this seems to be fairly break-even, 
with a very small gain in throughput and latency. 
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
 * Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
 * First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
 * Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
 * First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
											
										 
											2020-05-27 07:47:23 +08:00
										 |  |  | 					t.Errorf("Test %d: read returns wrong file content", i) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 				} | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2016-07-19 14:56:16 +08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | // Test erasureDecode with random offset and lengths.
 | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | // This test is t.Skip()ed as it a long time to run, hence should be run
 | 
					
						
							|  |  |  | // explicitly after commenting out t.Skip()
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func TestErasureDecodeRandomOffsetLength(t *testing.T) { | 
					
						
							| 
									
										
										
										
											2020-10-27 01:29:29 +08:00
										 |  |  | 	if testing.Short() { | 
					
						
							|  |  |  | 		t.Skip() | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	// Initialize environment needed for the test.
 | 
					
						
							|  |  |  | 	dataBlocks := 7 | 
					
						
							|  |  |  | 	parityBlocks := 7 | 
					
						
							| 
									
										
										
										
											2016-11-23 10:18:22 +08:00
										 |  |  | 	blockSize := int64(1 * humanize.MiByte) | 
					
						
							| 
									
										
										
										
											2022-07-26 03:37:26 +08:00
										 |  |  | 	setup, err := newErasureTestSetup(t, dataBlocks, parityBlocks, blockSize) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		t.Error(err) | 
					
						
							|  |  |  | 		return | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	disks := setup.disks | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 	erasure, err := NewErasure(t.Context(), dataBlocks, parityBlocks, blockSize) | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		t.Fatalf("failed to create ErasureStorage: %v", err) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-11-23 10:18:22 +08:00
										 |  |  | 	// Prepare a slice of 5MiB with random data.
 | 
					
						
							|  |  |  | 	data := make([]byte, 5*humanize.MiByte) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	length := int64(len(data)) | 
					
						
							|  |  |  | 	_, err = rand.Read(data) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		t.Fatal(err) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	writers := make([]io.Writer, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	for i, disk := range disks { | 
					
						
							|  |  |  | 		if disk == nil { | 
					
						
							|  |  |  | 			continue | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2024-01-31 04:43:25 +08:00
										 |  |  | 		writers[i] = newBitrotWriter(disk, "", "testbucket", "object", erasure.ShardFileSize(length), DefaultBitrotAlgorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	// 10000 iterations with random offsets and lengths.
 | 
					
						
							|  |  |  | 	iterations := 10000 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Create a test file to read from.
 | 
					
						
							| 
									
										
										
										
											2017-08-15 09:08:42 +08:00
										 |  |  | 	buffer := make([]byte, blockSize, 2*blockSize) | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 	n, err := erasure.Encode(t.Context(), bytes.NewReader(data), writers, buffer, erasure.dataBlocks+1) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	closeBitrotWriters(writers) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		t.Fatal(err) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	if n != length { | 
					
						
							|  |  |  | 		t.Errorf("erasureCreateFile returned %d, expected %d", n, length) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// To generate random offset/length.
 | 
					
						
							| 
									
										
										
										
											2017-03-19 02:28:41 +08:00
										 |  |  | 	r := rand.New(rand.NewSource(UTCNow().UnixNano())) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	buf := &bytes.Buffer{} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	// Verify erasure.Decode() for random offsets and lengths.
 | 
					
						
							| 
									
										
										
										
											2025-08-29 10:39:48 +08:00
										 |  |  | 	for range iterations { | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 		offset := r.Int63n(length) | 
					
						
							|  |  |  | 		readLen := r.Int63n(length - offset) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		expected := data[offset : offset+readLen] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		// Get the checksums of the current part.
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		bitrotReaders := make([]io.ReaderAt, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for index, disk := range disks { | 
					
						
							|  |  |  | 			if disk == OfflineDisk { | 
					
						
							|  |  |  | 				continue | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2020-06-13 11:04:01 +08:00
										 |  |  | 			tillOffset := erasure.ShardFileOffset(offset, readLen, length) | 
					
						
							| 
									
										
										
										
											2021-01-08 11:27:31 +08:00
										 |  |  | 			bitrotReaders[index] = newStreamingBitrotReader(disk, nil, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2025-04-09 22:28:39 +08:00
										 |  |  | 		_, err = erasure.Decode(t.Context(), buf, bitrotReaders, offset, readLen, length, nil) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotReaders(bitrotReaders) | 
					
						
							| 
									
										
										
										
											2016-07-20 16:30:30 +08:00
										 |  |  | 		if err != nil { | 
					
						
							|  |  |  | 			t.Fatal(err, offset, readLen) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		got := buf.Bytes() | 
					
						
							|  |  |  | 		if !bytes.Equal(expected, got) { | 
					
						
							|  |  |  | 			t.Fatalf("read data is different from what was expected, offset=%d length=%d", offset, readLen) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		buf.Reset() | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | // Benchmarks
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func benchmarkErasureDecode(data, parity, dataDown, parityDown int, size int64, b *testing.B) { | 
					
						
							| 
									
										
										
										
											2022-07-26 03:37:26 +08:00
										 |  |  | 	setup, err := newErasureTestSetup(b, data, parity, blockSizeV2) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		b.Fatalf("failed to create test setup: %v", err) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	disks := setup.disks | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	erasure, err := NewErasure(context.Background(), data, parity, blockSizeV2) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		b.Fatalf("failed to create ErasureStorage: %v", err) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	writers := make([]io.Writer, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	for i, disk := range disks { | 
					
						
							|  |  |  | 		if disk == nil { | 
					
						
							|  |  |  | 			continue | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2024-01-31 04:43:25 +08:00
										 |  |  | 		writers[i] = newBitrotWriter(disk, "", "testbucket", "object", erasure.ShardFileSize(size), DefaultBitrotAlgorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	content := make([]byte, size) | 
					
						
							| 
									
										
											  
											
												[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
											
										 
											2021-03-07 06:09:34 +08:00
										 |  |  | 	buffer := make([]byte, blockSizeV2, 2*blockSizeV2) | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	_, err = erasure.Encode(context.Background(), bytes.NewReader(content), writers, buffer, erasure.dataBlocks+1) | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 	closeBitrotWriters(writers) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		b.Fatalf("failed to create erasure test file: %v", err) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-08-29 10:39:48 +08:00
										 |  |  | 	for i := range dataDown { | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		writers[i] = nil | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 	for i := data; i < data+parityDown; i++ { | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		writers[i] = nil | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	b.SetBytes(size) | 
					
						
							|  |  |  | 	b.ReportAllocs() | 
					
						
							| 
									
										
										
										
											2025-08-29 10:39:48 +08:00
										 |  |  | 	for b.Loop() { | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		bitrotReaders := make([]io.ReaderAt, len(disks)) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		for index, disk := range disks { | 
					
						
							|  |  |  | 			if writers[index] == nil { | 
					
						
							|  |  |  | 				continue | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2020-06-13 11:04:01 +08:00
										 |  |  | 			tillOffset := erasure.ShardFileOffset(0, size, size) | 
					
						
							| 
									
										
										
										
											2021-01-08 11:27:31 +08:00
										 |  |  | 			bitrotReaders[index] = newStreamingBitrotReader(disk, nil, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize()) | 
					
						
							| 
									
										
										
										
											2018-08-07 06:14:08 +08:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2021-01-28 02:21:14 +08:00
										 |  |  | 		if _, err = erasure.Decode(context.Background(), bytes.NewBuffer(content[:0]), bitrotReaders, 0, size, size, nil); err != nil { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 			panic(err) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2019-01-17 20:58:18 +08:00
										 |  |  | 		closeBitrotReaders(bitrotReaders) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func BenchmarkErasureDecodeQuick(b *testing.B) { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	const size = 12 * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	b.Run(" 00|00 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 0, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 00|X0 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 0, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0|00 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 1, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0|X0 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 1, 1, size, b) }) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func BenchmarkErasureDecode_4_64KB(b *testing.B) { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	const size = 64 * 1024 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	b.Run(" 00|00 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 0, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 00|X0 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 0, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0|00 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 1, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0|X0 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 1, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 00|XX ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 0, 2, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XX|00 ", func(b *testing.B) { benchmarkErasureDecode(2, 2, 2, 0, size, b) }) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func BenchmarkErasureDecode_8_20MB(b *testing.B) { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	const size = 20 * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	b.Run(" 0000|0000 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 0, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 0000|X000 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 0, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X000|0000 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 1, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X000|X000 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 1, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 0000|XXXX ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 0, 4, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XX00|XX00 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 2, 2, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XXXX|0000 ", func(b *testing.B) { benchmarkErasureDecode(4, 4, 4, 0, size, b) }) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func BenchmarkErasureDecode_12_30MB(b *testing.B) { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	const size = 30 * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	b.Run(" 000000|000000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 0, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 000000|X00000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 0, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X00000|000000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 1, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X00000|X00000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 1, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 000000|XXXXXX ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 0, 6, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XXX000|XXX000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 3, 3, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XXXXXX|000000 ", func(b *testing.B) { benchmarkErasureDecode(6, 6, 6, 0, size, b) }) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | func BenchmarkErasureDecode_16_40MB(b *testing.B) { | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | 	const size = 40 * 1024 * 1024 | 
					
						
							| 
									
										
										
										
											2018-08-24 14:35:37 +08:00
										 |  |  | 	b.Run(" 00000000|00000000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 0, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 00000000|X0000000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 0, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0000000|00000000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 1, 0, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" X0000000|X0000000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 1, 1, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" 00000000|XXXXXXXX ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 0, 8, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XXXX0000|XXXX0000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 4, 4, size, b) }) | 
					
						
							|  |  |  | 	b.Run(" XXXXXXXX|00000000 ", func(b *testing.B) { benchmarkErasureDecode(8, 8, 8, 0, size, b) }) | 
					
						
							| 
									
										
										
										
											2017-11-18 06:57:04 +08:00
										 |  |  | } |