Skip to content

Commit

Permalink
[enhance](hive) support reading hive table with OpenCSVSerde (apache#…
Browse files Browse the repository at this point in the history
…42257)

## Proposed changes
OpenCSVSerde Properties:
| **Property** | **Description** | **Default Value** | **Supported in
Doris** |

|---------------------------------------|---------------------------------------------------------------------------------------------------|-------------------|--------------------------|
| `separatorChar` | Defines the character used to separate fields
(columns) in a CSV file. | `,` | Yes |
| `quoteChar` | Defines the character used to quote fields that contain
special characters, like the separator. | `"` | Yes |
| `escapeChar` | Specifies the escape character used for escaping
special characters, including quotes and delimiters. | `\` | Yes |

### Explanation:
- **`separatorChar`**: This property defines the character that
separates columns in the CSV file. Typically, a comma (`,`) is used as
the default separator.
- **`quoteChar`**: This character is used to enclose fields that contain
special characters (like the separator). For example, if a field
contains a comma, it is enclosed in quotes (`"`).
- **`escapeChar`**: Specifies the character used to escape special
characters, such as quotes or the separator. In many cases, a backslash
(`\\`) is used as the escape character.
  • Loading branch information
suxiaogang223 authored Oct 24, 2024
1 parent ef48470 commit 5f10b21
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
insert into serde_test5 values(1, "abc"),(2, "def");
insert into serde_test6 values(1, "abc"),(2, "def");
insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");

-- Table exercising OpenCSVSerde with no serde properties set, so the serde
-- defaults apply (separatorChar=',', quoteChar='"', escapeChar='\' per the
-- DEFAULT_* constants added to HiveProperties in this change).
CREATE TABLE test_open_csv_default_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE;

-- Same schema as test_open_csv_default_prop, but with the serde's
-- default-equivalent properties spelled out explicitly: comma separator,
-- double-quote quote char, backslash escape char. Results should match the
-- defaults table exactly.
CREATE TABLE test_open_csv_standard_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
STORED AS TEXTFILE;
-- Same schema again, with deliberately non-default properties (tab separator,
-- single-quote quote char, pipe escape char) to verify Doris reads each
-- property from the serde instead of assuming the defaults.
CREATE TABLE test_open_csv_custom_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "\t",
"quoteChar" = "\'",
"escapeChar" = "|"
)
STORED AS TEXTFILE;
-- Identical two rows are loaded into all three tables: row 1 is fully
-- populated; row 2 exercises NULL columns, an embedded separator in
-- 'Jane,Smith', and escaped double quotes in the description.
INSERT INTO TABLE test_open_csv_default_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
INSERT INTO TABLE test_open_csv_standard_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
INSERT INTO TABLE test_open_csv_custom_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.OpenCSVSerde;

import java.util.HashMap;
import java.util.Map;
Expand All @@ -27,15 +28,12 @@

public class HiveProperties {
public static final String PROP_FIELD_DELIMITER = "field.delim";
public static final String PROP_SEPARATOR_CHAR = "separatorChar";
public static final String PROP_SERIALIZATION_FORMAT = "serialization.format";
public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"

public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";

public static final String PROP_QUOTE_CHAR = "quoteChar";

public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
Expand All @@ -49,6 +47,14 @@ public class HiveProperties {
public static final String PROP_NULL_FORMAT = "serialization.null.format";
public static final String DEFAULT_NULL_FORMAT = "\\N";

// The following properties are used for OpenCsvSerde.
public static final String PROP_SEPARATOR_CHAR = OpenCSVSerde.SEPARATORCHAR;
public static final String DEFAULT_SEPARATOR_CHAR = ",";
public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
public static final String DEFAULT_QUOTE_CHAR = "\"";
public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
public static final String DEFAULT_ESCAPE_CHAR = "\\";

public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
PROP_FIELD_DELIMITER,
PROP_COLLECTION_DELIMITER_HIVE2,
Expand All @@ -59,37 +65,33 @@ public class HiveProperties {
PROP_QUOTE_CHAR,
PROP_MAP_KV_DELIMITER,
PROP_ESCAPE_DELIMITER,
PROP_NULL_FORMAT
);
PROP_ESCAPE_CHAR,
PROP_NULL_FORMAT);

/**
 * Resolves the column delimiter of a Hive text-format (LazySimpleSerDe) table.
 * For csv tables backed by OpenCSVSerde use {@code getSeparatorChar} instead.
 * Prefers "field.delim", then "serialization.format", then the default
 * \x01 delimiter, and normalizes the chosen value to a single byte.
 */
public static String getFieldDelimiter(Table table) {
    Optional<String> fromFieldDelim =
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
    Optional<String> fromSerFormat =
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
    String resolved = HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_FIELD_DELIMITER, fromFieldDelim, fromSerFormat);
    return HiveMetaStoreClientHelper.getByte(resolved);
}

public static String getColumnSeparator(Table table) {
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
/**
 * Resolves the OpenCSVSerde column separator ("separatorChar").
 * Falls back to the serde default "," when the property is absent.
 */
public static String getSeparatorChar(Table table) {
    Optional<String> sep =
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
    return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_SEPARATOR_CHAR, sep);
}


public static String getLineDelimiter(Table table) {
Optional<String> lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_LINE_DELIMITER, lineDelim));
DEFAULT_LINE_DELIMITER, lineDelim));
}

public static String getMapKvDelimiter(Table table) {
Optional<String> mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
}

public static String getCollectionDelimiter(Table table) {
Expand All @@ -101,14 +103,6 @@ public static String getCollectionDelimiter(Table table) {
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3));
}

public static Optional<String> getQuoteChar(Table table) {
Map<String, String> serdeParams = table.getSd().getSerdeInfo().getParameters();
if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
}
return Optional.empty();
}

public static Optional<String> getEscapeDelimiter(Table table) {
Optional<String> escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
if (escapeDelim.isPresent()) {
Expand All @@ -127,6 +121,16 @@ public static String getNullFormat(Table table) {
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, nullFormat);
}

/**
 * Resolves the OpenCSVSerde quote character ("quoteChar").
 * Falls back to the serde default double quote when the property is absent.
 */
public static String getQuoteChar(Table table) {
    return HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_QUOTE_CHAR,
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR));
}

/**
 * Resolves the OpenCSVSerde escape character ("escapeChar").
 * Falls back to the serde default backslash when the property is absent.
 */
public static String getEscapeChar(Table table) {
    return HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_ESCAPE_CHAR,
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR));
}

// Set properties to table
public static void setTableProperties(Table table, Map<String, String> properties) {
HashMap<String, String> serdeProps = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -385,20 +385,36 @@ protected Map<String, String> getLocationProperties() throws UserException {
protected TFileAttributes getFileAttributes() throws UserException {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
Table table = hmsTable.getRemoteTable();
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set quote char
HiveProperties.getQuoteChar(table).ifPresent(d -> textParams.setEnclose(d.getBytes()[0]));
// 6. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 7. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
// TODO: separate hive text table and OpenCsv table
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
// set properties of LazySimpleSerDe
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 6. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
// set set properties of OpenCSVSerde
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set enclose char
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
// 4. set escape char
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
}

TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ b 2.2
3 aaa
4 "null"

-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""

-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !1 --
a 1.1
b 2.2
Expand Down Expand Up @@ -79,3 +91,14 @@ b 2.2
3 aaa
4 "null"

-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""

-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
hive_docker """truncate table regression.serde_test8;"""
sql """insert into ${catalog_name}.regression.serde_test8 select * from ${catalog_name}.regression.serde_test7;"""
qt_9 """select * from ${catalog_name}.regression.serde_test8 order by id;"""

qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;"""
qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
}
}

0 comments on commit 5f10b21

Please sign in to comment.