From bd8357f9fdc429d062df7a769ca4eac0bbd419d4 Mon Sep 17 00:00:00 2001 From: Fletcher Gornick Date: Fri, 16 Aug 2024 18:35:42 -0500 Subject: [PATCH] feat: uri decode function --- Makefile.am | 5 ++-- docs/content/manual/dev/manual.yml | 13 ++++++++- jq.1.prebuilt | 17 ++++++++++-- src/builtin.c | 42 ++++++++++++++++++++++++++++++ src/builtin.jq | 3 ++- tests/jq.test | 20 +++++++++++++- tests/man.test | 4 +++ tests/uri.test | 38 +++++++++++++++++++++++++++ tests/uritest | 5 ++++ 9 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 tests/uri.test create mode 100755 tests/uritest diff --git a/Makefile.am b/Makefile.am index 0b4b81e78e..a183477fde 100644 --- a/Makefile.am +++ b/Makefile.am @@ -141,7 +141,7 @@ endif ### Tests (make check) -TESTS = tests/mantest tests/jqtest tests/shtest tests/utf8test tests/base64test +TESTS = tests/mantest tests/jqtest tests/shtest tests/utf8test tests/base64test tests/uritest if !WIN32 TESTS += tests/optionaltest endif @@ -218,7 +218,6 @@ EXTRA_DIST = $(DOC_FILES) $(man_MANS) $(TESTS) $(TEST_LOG_COMPILER) \ jq.1.prebuilt jq.spec src/lexer.c src/lexer.h src/parser.c \ src/parser.h src/version.h src/builtin.jq scripts/version \ libjq.pc \ - tests/base64.test tests/jq-f-test.sh tests/jq.test \ tests/modules/a.jq tests/modules/b/b.jq tests/modules/c/c.jq \ tests/modules/c/d.jq tests/modules/data.json \ tests/modules/home1/.jq tests/modules/home2/.jq/g.jq \ @@ -232,7 +231,7 @@ EXTRA_DIST = $(DOC_FILES) $(man_MANS) $(TESTS) $(TEST_LOG_COMPILER) \ tests/onig.supp tests/local.supp \ tests/setup tests/torture/input0.json \ tests/optional.test tests/man.test tests/manonig.test \ - tests/jq.test tests/onig.test tests/base64.test \ + tests/jq.test tests/onig.test tests/base64.test tests/uri.test \ tests/utf8-truncate.jq tests/jq-f-test.sh \ tests/no-main-program.jq tests/yes-main-program.jq diff --git a/docs/content/manual/dev/manual.yml b/docs/content/manual/dev/manual.yml index 1eb7d9b867..90bd033064 100644 --- a/docs/content/manual/dev/manual.yml +++ b/docs/content/manual/dev/manual.yml @@ -1300,7 +1300,7 @@ sections: input: '[1,[[],{"a":2}]]' output: ['[[0],[1,1,"a"]]'] - - title: "`add`" + - title: "`add`, `add(generator)`" body: | The filter `add` takes as input an array, and produces as @@ -1311,6 +1311,9 @@ sections: If the input is an empty array, `add` returns `null`. + `add(generator)` operates on the given generator rather than + the input. + examples: - program: add input: '["a","b","c"]' @@ -1321,6 +1324,9 @@ sections: - program: add input: '[]' output: ["null"] + - program: add(.[].a) + input: '[{"a":3}, {"a":5}, {"b":6}]' + output: ['8'] - title: "`any`, `any(condition)`, `any(generator; condition)`" body: | @@ -2135,6 +2141,11 @@ sections: Applies percent-encoding, by mapping all reserved URI characters to a `%XX` sequence. + * `@urid`: + + The inverse of `@uri`, applies percent-decoding, by mapping + all `%XX` sequences to their corresponding URI characters. + * `@csv`: The input must be an array, and it is rendered as CSV diff --git a/jq.1.prebuilt b/jq.1.prebuilt index 7239e87d16..553b63fc15 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -1,5 +1,5 @@ . -.TH "JQ" "1" "May 2024" "" "" +.TH "JQ" "1" "August 2024" "" "" . .SH "NAME" \fBjq\fR \- Command\-line JSON processor @@ -1340,12 +1340,15 @@ jq \'[paths(type == "number")]\' . .IP "" 0 . -.SS "add" +.SS "add, add(generator)" The filter \fBadd\fR takes as input an array, and produces as output the elements of the array added together\. This might mean summed, concatenated or merged depending on the types of the elements of the input array \- the rules are the same as those for the \fB+\fR operator (described above)\. . .P If the input is an empty array, \fBadd\fR returns \fBnull\fR\. . +.P +\fBadd(generator)\fR operates on the given generator rather than the input\. +. .IP "" 4 . .nf @@ -1361,6 +1364,10 @@ jq \'add\' jq \'add\' [] => null + +jq \'add(\.[]\.a)\' + [{"a":3}, {"a":5}, {"b":6}] +=> 8 . .fi . @@ -2323,6 +2330,12 @@ Applies HTML/XML escaping, by mapping the characters \fB<>&\'"\fR to their entit Applies percent\-encoding, by mapping all reserved URI characters to a \fB%XX\fR sequence\. . .TP +\fB@urid\fR: +. +.IP +The inverse of \fB@uri\fR, applies percent\-decoding, by mapping all \fB%XX\fR sequences to their corresponding URI characters\. +. +.TP \fB@csv\fR: . .IP diff --git a/src/builtin.c b/src/builtin.c index e39975b0a0..69e9b07214 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -657,6 +657,48 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { } jv_free(input); return line; + } else if (!strcmp(fmt_s, "urid")) { + jv_free(fmt); + input = f_tostring(jq, input); + + jv line = jv_string(""); + const char *errmsg = "is not a valid uri encoding"; + const char *s = jv_string_value(input); + while (*s) { + if (*s != '%') { + line = jv_string_append_buf(line, s++, 1); + } else { + unsigned char unicode[4] = {0}; + int b = 0; + // check leading bits of first octet to determine length of unicode character + // (https://datatracker.ietf.org/doc/html/rfc3629#section-3) + while (b == 0 || (b < 4 && unicode[0] >> 7 & 1 && unicode[0] >> (7-b) & 1)) { + if (*(s++) != '%') { + jv_free(line); + return type_error(input, errmsg); + } + for (int i=0; i<2; i++) { + unicode[b] <<= 4; + char c = *(s++); + if ('0' <= c && c <= '9') unicode[b] |= c - '0'; + else if ('a' <= c && c <= 'f') unicode[b] |= c - 'a' + 10; + else if ('A' <= c && c <= 'F') unicode[b] |= c - 'A' + 10; + else { + jv_free(line); + return type_error(input, errmsg); + } + } + b++; + } + if (!jvp_utf8_is_valid((const char *)unicode, (const char *)unicode+b)) { + jv_free(line); + return type_error(input, errmsg); + } + line = jv_string_append_buf(line, (const char *)unicode, b); + } + } + jv_free(input); + return line; } else if (!strcmp(fmt_s, "sh")) { jv_free(fmt); if (jv_get_kind(input) != JV_KIND_ARRAY) diff --git a/src/builtin.jq b/src/builtin.jq index 802595bafd..aa33cd4b75 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -8,7 +8,8 @@ def unique: group_by(.) | map(.[0]); def unique_by(f): group_by(f) | map(.[0]); def max_by(f): _max_by_impl(map([f])); def min_by(f): _min_by_impl(map([f])); -def add: reduce .[] as $x (null; . + $x); +def add(f): reduce f as $x (null; . + $x); +def add: add(.[]); def del(f): delpaths([path(f)]); def abs: if . < 0 then - . else . end; def _assign(paths; $value): reduce path(paths) as $p (.; setpath($p; $value)); diff --git a/tests/jq.test b/tests/jq.test index 1502fbe058..88cd5d8b9f 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -61,7 +61,7 @@ null null "interpolation" -@text,@json,([1,.]|@csv,@tsv),@html,@uri,@sh,(@base64|.,@base64d) +@text,@json,([1,.]|@csv,@tsv),@html,(@uri|.,@urid),@sh,(@base64|.,@base64d) "!()<>&'\"\t" "!()<>&'\"\t" "\"!()<>&'\\\"\\t\"" @@ -69,6 +69,7 @@ null "1\t!()<>&'\"\\t" "!()<>&'"\t" "%21%28%29%3C%3E%26%27%22%09" +"!()<>&'\"\t" "'!()<>&'\\''\"\t'" "ISgpPD4mJyIJ" "!()<>&'\"\t" @@ -86,6 +87,10 @@ null "\u03bc" "%CE%BC" +@urid +"%CE%BC" +"\u03bc" + @html "\(.)" "" "<script>hax</script>" @@ -642,6 +647,19 @@ map_values(.+1) [0,1,2] [1,2,3] +[add(null), add(range(range(10))), add(empty), add(10,range(10))] +null +[null,120,null,55] + +# Real-world use case for add(empty) +.sum = add(.arr[]) +{"arr":[]} +{"arr":[],"sum":null} + +add({(.[]):1}) | keys +["a","a","b","a","d","b","d","a","d"] +["a","b","d"] + # # User-defined functions # Oh god. diff --git a/tests/man.test b/tests/man.test index 7a9cf6798c..6c5eba390a 100644 --- a/tests/man.test +++ b/tests/man.test @@ -358,6 +358,10 @@ add [] null +add(.[].a) +[{"a":3}, {"a":5}, {"b":6}] +8 + any [true, false] true diff --git a/tests/uri.test b/tests/uri.test new file mode 100644 index 0000000000..de10244463 --- /dev/null +++ b/tests/uri.test @@ -0,0 +1,38 @@ +# Tests are groups of three lines: program, input, expected output +# Blank lines and lines starting with # are ignored + +@uri +"<>&'\"\t" +"%3C%3E%26%27%22%09" + +# decoding encoded output results in same text +(@uri|@urid) +"<>&'\"\t" +"<>&'\"\t" + +# testing variable length unicode characters +@uri +"a \u03bc \u2230 \ud83d\ude0e" +"a%20%CE%BC%20%E2%88%B0%20%F0%9F%98%8E" + +@urid +"a%20%CE%BC%20%E2%88%B0%20%F0%9F%98%8E" +"a \u03bc \u2230 \ud83d\ude0e" + +### invalid uri strings + +# unicode character should be length 4 (not 3) +. | try @urid catch . +"%F0%93%81" +"string (\"%F0%93%81\") is not a valid uri encoding" + +# invalid hex value ('FX') +. | try @urid catch . +"%FX%9F%98%8E" +"string (\"%FX%9F%98%8E\") is not a valid uri encoding" + +# trailing utf-8 octets must be formatted like 10xxxxxx +# 'C0' = 11000000 invalid +. | try @urid catch . +"%F0%C0%81%8E" +"string (\"%F0%C0%81%8E\") is not a valid uri encoding" diff --git a/tests/uritest b/tests/uritest new file mode 100755 index 0000000000..1d2642c510 --- /dev/null +++ b/tests/uritest @@ -0,0 +1,5 @@ +#!/bin/sh + +. "${0%/*}/setup" "$@" + +$VALGRIND $Q $JQ -L "$mods" --run-tests $JQTESTDIR/uri.test