Allow setting `none` for avroCodec and parquetCodec
shibd committed Nov 28, 2023
1 parent 6437d1e commit f4e67a1
Showing 5 changed files with 45 additions and 8 deletions.
4 changes: 2 additions & 2 deletions docs/aws-s3-sink.md
@@ -129,8 +129,8 @@ Before using the AWS S3 sink connector, you need to configure it. This table out
| `useHumanReadableSchemaVersion` | Boolean | False | false | Use a human-readable format string for the schema version in the message metadata. If it is set to `true`, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format. |
| `skipFailedMessages` | Boolean | False | false | Configure whether to skip a message which it fails to be processed. If it is set to `true`, the connector will skip the failed messages by `ack` it. Otherwise, the connector will `fail` the message. |
| `pathPrefix` | String | False | false | If it is set, the output files are stored in a folder under the given bucket path. The `pathPrefix` must be in the format of `xx/xxx/`. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: none (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: none (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `jsonAllowNaN` | Boolean | False | false | Recognize 'NaN', 'INF', '-INF' as legal floating number values when formatType=`json`. Since JSON specification does not allow such values this is a non-standard feature and disabled by default. |

## Advanced features
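As a usage sketch of the new option (the bucket name, provider string, and prefix below are illustrative placeholders, not values taken from this commit), a sink configuration that disables output compression might look like:

```yaml
# Hypothetical cloud storage sink config fragment.
# Setting avroCodec to "none" (or leaving it empty) disables compression;
# the defaults remain snappy for Avro and gzip for Parquet.
configs:
  provider: "aws-s3"        # illustrative provider value
  bucket: "my-bucket"       # placeholder bucket name
  formatType: "avro"
  avroCodec: "none"         # no compression for the Avro output files
  pathPrefix: "data/raw/"   # must match the xx/xxx/ format
```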
4 changes: 2 additions & 2 deletions docs/azure-blob-storage-sink.md
@@ -120,8 +120,8 @@ Before using the Azure Blob Storage sink connector, you need to configure it. Th
| `useHumanReadableSchemaVersion` | Boolean | False | false | Use a human-readable format string for the schema version in the message metadata. If it is set to `true`, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format. |
| `skipFailedMessages` | Boolean | False | false | Configure whether to skip a message which it fails to be processed. If it is set to `true`, the connector will skip the failed messages by `ack` it. Otherwise, the connector will `fail` the message. |
| `pathPrefix` | String | False | false | If it is set, the output files are stored in a folder under the given bucket path. The `pathPrefix` must be in the format of `xx/xxx/`. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: none (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: none (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `jsonAllowNaN` | Boolean | False | false | Recognize 'NaN', 'INF', '-INF' as legal floating number values when formatType=`json`. Since JSON specification does not allow such values this is a non-standard feature and disabled by default. |

There are three methods to authenticate with Azure Blob Storage:
4 changes: 2 additions & 2 deletions docs/google-cloud-storage-sink.md
@@ -124,8 +124,8 @@ Before using the Google Cloud Storage sink connector, you need to configure it.
| `useHumanReadableSchemaVersion` | Boolean | False | false | Use a human-readable format string for the schema version in the message metadata. If it is set to `true`, the schema version is in plain string format. Otherwise, the schema version is in hex-encoded string format. |
| `skipFailedMessages` | Boolean | False | false | Configure whether to skip a message which it fails to be processed. If it is set to `true`, the connector will skip the failed messages by `ack` it. Otherwise, the connector will `fail` the message. |
| `pathPrefix` | String | False | false | If it is set, the output files are stored in a folder under the given bucket path. The `pathPrefix` must be in the format of `xx/xxx/`. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: null (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: null (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `avroCodec` | String | False | snappy | Compression codec used when formatType=`avro`. Available compression types are: none (no compression), deflate, bzip2, xz, zstandard, snappy. |
| `parquetCodec` | String | False | gzip | Compression codec used when formatType=`parquet`. Available compression types are: none (no compression), snappy, gzip, lzo, brotli, lz4, zstd. |
| `jsonAllowNaN` | Boolean | False | false | Recognize 'NaN', 'INF', '-INF' as legal floating number values when formatType=`json`. Since JSON specification does not allow such values this is a non-standard feature and disabled by default. |

## Advanced features
@@ -86,11 +86,11 @@ public class BlobStoreAbstractConfig implements Serializable {
private boolean partitionerUseIndexAsOffset;

// The AVRO codec.
// Options: null, deflate, bzip2, xz, zstandard, snappy
// Options: none, deflate, bzip2, xz, zstandard, snappy
private String avroCodec = "snappy";

// The Parquet codec.
// Options: null, snappy, gzip, lzo, brotli, lz4, zstd
// Options: none, snappy, gzip, lzo, brotli, lz4, zstd
private String parquetCodec = "gzip";

private String timePartitionPattern;
@@ -183,6 +183,13 @@ public void validate() {
checkArgument(pendingQueueSize > 0, "pendingQueueSize must be a positive integer.");
checkArgument(pendingQueueSize >= batchSize, "pendingQueueSize must be larger than or "
+ "equal to batchSize");

if (avroCodec != null && (avroCodec.isEmpty() || avroCodec.equals("none"))) {
avroCodec = null;
}
if (parquetCodec != null && (parquetCodec.isEmpty() || parquetCodec.equals("none"))) {
parquetCodec = null;
}
}

private static boolean hasURIScheme(String endpoint) {
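The change to `validate()` can be sketched as a standalone helper (the class and method names below are illustrative, not part of the connector): an empty string or the literal `none` is canonicalized to `null`, which the format writers already treat as "no compression", while real codec names pass through unchanged.

```java
// Sketch of the codec normalization introduced in validate().
// CodecNormalizationSketch and normalizeCodec are illustrative names only.
public class CodecNormalizationSketch {

    static String normalizeCodec(String codec) {
        if (codec != null && (codec.isEmpty() || codec.equals("none"))) {
            return null; // null signals "no compression" downstream
        }
        return codec;    // e.g. "snappy", "gzip" pass through unchanged
    }

    public static void main(String[] args) {
        System.out.println(normalizeCodec("snappy")); // snappy
        System.out.println(normalizeCodec("none"));   // null
        System.out.println(normalizeCodec(""));       // null
    }
}
```

Normalizing once during validation keeps the rest of the writer code free of special cases: every later consumer only has to check for `null` rather than three different "disabled" spellings.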
30 changes: 30 additions & 0 deletions src/test/java/org/apache/pulsar/io/jcloud/ConnectorConfigTest.java
@@ -329,4 +329,34 @@ public void testNotAllowEndpointEmptyWithAzure() throws IOException {
}
}

@Test
public void testCodec() throws IOException {
Map<String, Object> config = new HashMap<>();
config.put("provider", PROVIDER_AZURE);
config.put("azureStorageAccountConnectionString", "test-connection-string");
config.put("bucket", "test-container-name");
config.put("formatType", "bytes");
config.put("partitionerType", "PARTITION");
config.put("avroCodec", "snappy");
config.put("parquetCodec", "snappy");
CloudStorageSinkConfig cloudStorageSinkConfig = CloudStorageSinkConfig.load(config);
cloudStorageSinkConfig.validate();
Assert.assertEquals("snappy", cloudStorageSinkConfig.getAvroCodec());
Assert.assertEquals("snappy", cloudStorageSinkConfig.getParquetCodec());

config.put("avroCodec", "");
config.put("parquetCodec", "");
cloudStorageSinkConfig = CloudStorageSinkConfig.load(config);
cloudStorageSinkConfig.validate();
Assert.assertNull(cloudStorageSinkConfig.getAvroCodec());
Assert.assertNull(cloudStorageSinkConfig.getParquetCodec());

config.put("avroCodec", "none");
config.put("parquetCodec", "none");
cloudStorageSinkConfig = CloudStorageSinkConfig.load(config);
cloudStorageSinkConfig.validate();
Assert.assertNull(cloudStorageSinkConfig.getAvroCodec());
Assert.assertNull(cloudStorageSinkConfig.getParquetCodec());
}

}
