mirror of https://github.com/minio/minio.git
				
				
				
s3select should honour custom record delimiter (#6419)

Allow custom delimiters like `\r\n`, `a`, `\r`, etc. in the input CSV and replace them with `\n`. Fixes #6403
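A minimal sketch of the intended behaviour, assuming the DelimitedReader helper that this commit adds under pkg/ioutil (see the diff below); it is illustrative only, not part of the change. A body whose records are separated by a custom two-character delimiter such as `\r\n` is read back as `\n`-delimited. The sample data and the trailing-NUL stripping mirror the new unit test.

package main

import (
	"fmt"
	"strings"

	"github.com/minio/minio/pkg/ioutil"
)

func main() {
	// Records separated by the custom delimiter `\r\n`.
	body := "username,age\r\nbanana,12\r\ncarrot,23"

	// Reduce the two-character delimiter to `\n` while reading.
	r := ioutil.NewDelimitedReader(strings.NewReader(body), []rune("\r\n"))

	var out []byte
	for {
		chunk := make([]byte, 10)
		n, err := r.Read(chunk)
		out = append(out, chunk[:n]...)
		if err != nil {
			break
		}
	}

	// Reducing a two-byte delimiter to a single byte leaves a NUL at the end
	// of the affected chunk; the reader's unit test strips these the same way.
	cleaned := make([]byte, 0, len(out))
	for _, b := range out {
		if b != 0 {
			cleaned = append(cleaned, b)
		}
	}
	fmt.Printf("%q\n", cleaned) // "username,age\nbanana,12\ncarrot,23"
}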
parent 92bc7caf7a
commit 30d4a2cf53
@@ -178,6 +178,10 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
 		writeErrorResponse(w, ErrInvalidQuoteFields, r.URL)
 		return
 	}
+	if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 {
+		writeErrorResponse(w, ErrInvalidRequestParameter, r.URL)
+		return
+	}
 
 	getObject := objectAPI.GetObject
 	if api.CacheAPI() != nil && !crypto.SSEC.IsRequested(r.Header) {
@@ -222,9 +226,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
 	if selectReq.InputSerialization.CSV.FileHeaderInfo == "" {
 		selectReq.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
 	}
+	if selectReq.InputSerialization.CSV.RecordDelimiter == "" {
+		selectReq.InputSerialization.CSV.RecordDelimiter = "\n"
+	}
 	if selectReq.InputSerialization.CSV != nil {
 		options := &s3select.Options{
 			HasHeader:            selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone,
+			RecordDelimiter:      selectReq.InputSerialization.CSV.RecordDelimiter,
 			FieldDelimiter:       selectReq.InputSerialization.CSV.FieldDelimiter,
 			Comments:             selectReq.InputSerialization.CSV.Comments,
 			Name:                 "S3Object", // Default table name for all objects

@@ -0,0 +1,87 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ioutil
+
+import (
+	"bufio"
+	"io"
+)
+
+var (
+	nByte byte = 10 // the byte that corresponds to the '\n' rune.
+	rByte byte = 13 // the byte that corresponds to the '\r' rune.
+)
+
+// DelimitedReader reduces a custom record delimiter to `\n`.
+type DelimitedReader struct {
+	r           *bufio.Reader
+	delimiter   []rune // Select can have up to 2 characters as the delimiter.
+	assignEmpty bool   // Decides whether the next read byte should be discarded.
+}
+
+// NewDelimitedReader detects the custom delimiter and replaces it with `\n`.
+func NewDelimitedReader(r io.Reader, delimiter []rune) *DelimitedReader {
+	return &DelimitedReader{r: bufio.NewReader(r), delimiter: delimiter, assignEmpty: false}
+}
+
+// Read reads and replaces the custom delimiter with `\n`.
+func (r *DelimitedReader) Read(p []byte) (n int, err error) {
+	n, err = r.r.Read(p)
+	if err != nil {
+		return
+	}
+	for i, b := range p {
+		if r.assignEmpty {
+			swapAndNullify(p, i)
+			r.assignEmpty = false
+			continue
+		}
+		if b == rByte && rune(b) != r.delimiter[0] {
+			// Replace the carriage return with `\n`.
+			// Mac-style CSVs use `\r` as their record delimiter.
+			p[i] = nByte
+		} else if rune(b) == r.delimiter[0] { // E.g. `\r\n`, `ab`, `a` are valid delimiters.
+			if i+1 == len(p) && len(r.delimiter) > 1 {
+				// The first delimiter byte falls on the chunk boundary:
+				// peek at the next byte and, if it matches, discard it in the next read.
+				if nextByte, nerr := r.r.Peek(1); nerr == nil {
+					if rune(nextByte[0]) == r.delimiter[1] {
+						p[i] = nByte
+						// Mark it to be discarded in the next read.
+						r.assignEmpty = true
+					}
+				}
+			} else if len(r.delimiter) > 1 && rune(p[i+1]) == r.delimiter[1] {
+				// The second delimiter byte falls in the same chunk.
+				p[i] = nByte
+				r.assignEmpty = true
+			} else if len(r.delimiter) == 1 {
+				// Replace with `\n` in case of a single-character delimiter match.
+				p[i] = nByte
+			}
+		}
+	}
+	return
+}
+
+// swapAndNullify shifts the bytes from index n onward left by one
+// (overwriting p[n]) and nullifies the last byte.
+func swapAndNullify(p []byte, n int) {
+	for i := n; i < len(p)-1; i++ {
+		p[i] = p[i+1]
+	}
+	p[len(p)-1] = 0
+}
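
For reference, a standalone usage sketch (not part of the commit): the reader above can sit in front of encoding/csv, which is what the s3select change further down does via csv.NewReader(normalizedReader). Here a Mac-style body that uses a lone `\r` as its record delimiter is parsed after normalization; the single-character path replaces bytes in place, so no NUL padding is introduced.

package main

import (
	"encoding/csv"
	"fmt"
	"strings"

	"github.com/minio/minio/pkg/ioutil"
)

func main() {
	// Mac-style CSV: records are separated by a lone `\r`.
	body := "username,age\rbanana,12\rcarrot,23"

	// Reduce the custom record delimiter to `\n` so encoding/csv can split records.
	normalized := ioutil.NewDelimitedReader(strings.NewReader(body), []rune("\r"))

	records, err := csv.NewReader(normalized).ReadAll()
	if err != nil {
		fmt.Println("csv read error:", err)
		return
	}
	for _, record := range records {
		fmt.Println(record)
	}
	// Output:
	// [username age]
	// [banana 12]
	// [carrot 23]
}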

@@ -0,0 +1,83 @@
+/*
+ * Minio Cloud Storage, (C) 2016, 2017, 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ioutil
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+// TestDelimitedReader tests reading with custom record delimiters.
+func TestDelimitedReader(t *testing.T) {
+	expected := "username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45"
+
+	inputs := []struct {
+		inputcsv  string
+		delimiter string
+		chunkSize int
+	}{
+		// case 1 - with the default `\n` delimiter.
+		{"username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45", "\n", 10},
+		// case 2 - with carriage return `\r`, which should be replaced with `\n` by default.
+		{"username,age\rbanana,12\rcarrot,23\rapple,34\rbrinjal,90\rraddish,45", "\n", 10},
+		// case 3 - with a two-character delimiter (`\r\n`).
+		{"username,age\r\nbanana,12\r\ncarrot,23\r\napple,34\r\nbrinjal,90\r\nraddish,45", "\r\n", 10},
+		// case 4 - with a two-character delimiter.
+		{"username,agexvbanana,12xvcarrot,23xvapple,34xvbrinjal,90xvraddish,45", "xv", 10},
+		// case 5 - with a two-character delimiter `\t `.
+		{"username,age\t banana,12\t carrot,23\t apple,34\t brinjal,90\t raddish,45", "\t ", 10},
+		// case 6 - a special case where the first delimiter byte falls on the 13th byte,
+		// i.e. the last byte of the read chunk. The reader should peek at the next byte
+		// and replace the match with `\n`.
+		{"username,agexxbanana,12xxcarrot,23xxapple,34xxbrinjal,90xxraddish,45", "xx", 13},
+	}
+
+	for c, input := range inputs {
+		var readcsv []byte
+		var err error
+		delimitedReader := NewDelimitedReader(strings.NewReader(input.inputcsv), []rune(input.delimiter))
+		for err == nil {
+			chunk := make([]byte, input.chunkSize)
+			_, err = delimitedReader.Read(chunk)
+			readcsv = append(readcsv, chunk...)
+		}
+		if err != io.EOF {
+			t.Fatalf("Case %d: Error in delimited read", c+1)
+		}
+		expected := []byte(expected)
+		cleanCsv := removeNulls(readcsv)
+		if !bytes.Equal(cleanCsv, expected) {
+			t.Fatalf("Case %d: Expected the delimited csv to be `%s`, but instead found `%s`", c+1, string(expected), string(cleanCsv))
+		}
+	}
+}
+
+// removeNulls removes all the trailing nulls in the chunks.
+// Null bytes are introduced when a two-character delimiter is reduced,
+// e.g. when `xv` is reduced to `\n`, the last byte of the chunk is nullified.
+func removeNulls(csv []byte) []byte {
+	cleanCsv := []byte{}
+	for _, p := range csv {
+		if p != 0 {
+			cleanCsv = append(cleanCsv, p)
+		}
+	}
+	return cleanCsv
+}

@@ -29,6 +29,7 @@ import (
 	"net/http"
 
 	gzip "github.com/klauspost/pgzip"
+	"github.com/minio/minio/pkg/ioutil"
 )
 
 const (
@@ -79,6 +80,9 @@ type Options struct {
 	// HasHeader when true, will treat the first row as a header row.
 	HasHeader bool
 
+	// RecordDelimiter is the string that records are delimited by.
+	RecordDelimiter string
+
 	// FieldDelimiter is the string that fields are delimited by.
 	FieldDelimiter string
 
@@ -127,7 +131,8 @@ func NewInput(opts *Options) (*Input, error) {
 		tempBytesScanned = opts.StreamSize
 		myReader = bzip2.NewReader(opts.ReadFrom)
 	}
-
+	// DelimitedReader treats custom record delimiters like `\r\n`, `\r`, `ab` etc. and replaces them with `\n`.
+	normalizedReader := ioutil.NewDelimitedReader(myReader, []rune(opts.RecordDelimiter))
 	progress := &statInfo{
 		BytesScanned:   tempBytesScanned,
 		BytesProcessed: 0,
@@ -135,7 +140,7 @@ func NewInput(opts *Options) (*Input, error) {
 	}
 	reader := &Input{
 		options: opts,
-		reader:  csv.NewReader(myReader),
+		reader:  csv.NewReader(normalizedReader),
 		stats:   progress,
 	}
 	reader.firstRow = nil

@@ -48,6 +48,7 @@ func TestCheckForDuplicates(t *testing.T) {
 func TestMyProcessing(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -96,6 +97,7 @@ func TestMyProcessing(t *testing.T) {
 func TestMyRowIndexResults(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -271,6 +273,7 @@ func TestMyParser(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:            false,
+			RecordDelimiter:      "\n",
 			FieldDelimiter:       ",",
 			Comments:             "",
 			Name:                 "S3Object", // Default table name for all objects
@@ -346,6 +349,7 @@ func TestMyAggregationFunc(t *testing.T) {
 func TestToStringAgg(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -380,6 +384,7 @@ func TestToStringAgg(t *testing.T) {
 func TestMyRowColLiteralResults(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -459,6 +464,7 @@ func TestMyWhereEval(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:            false,
+			RecordDelimiter:      "\n",
 			FieldDelimiter:       ",",
 			Comments:             "",
 			Name:                 "S3Object", // Default table name for all objects
@@ -610,6 +616,7 @@ func TestInterpreter(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:            false,
+			RecordDelimiter:      "\n",
 			FieldDelimiter:       ",",
 			Comments:             "",
 			Name:                 "S3Object", // Default table name for all objects
@@ -651,6 +658,7 @@ func TestInterpreter(t *testing.T) {
 func TestMyXMLFunction(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -689,6 +697,7 @@ func TestMyXMLFunction(t *testing.T) {
 func TestMyProtocolFunction(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -732,6 +741,7 @@ func TestMyProtocolFunction(t *testing.T) {
 func TestMyInfoProtocolFunctions(t *testing.T) {
 	options := &Options{
 		HasHeader:            true,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -773,6 +783,7 @@ func TestMyInfoProtocolFunctions(t *testing.T) {
 func TestMyErrorProtocolFunctions(t *testing.T) {
 	options := &Options{
 		HasHeader:            false,
+		RecordDelimiter:      "\n",
 		FieldDelimiter:       ",",
 		Comments:             "",
 		Name:                 "S3Object", // Default table name for all objects
@@ -1007,6 +1018,7 @@ func TestMyValids(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:            false,
+			RecordDelimiter:      "\n",
 			FieldDelimiter:       ",",
 			Comments:             "",
 			Name:                 "S3Object", // Default table name for all objects