diff --git a/devkit.opam b/devkit.opam index c0bbfa8..10ef50a 100644 --- a/devkit.opam +++ b/devkit.opam @@ -18,7 +18,6 @@ depends: [ "camlzip" "libevent" {>= "0.8.0"} "ocurl" {>= "0.7.2"} - "ocamlnet" "pcre" "trace" {>= "0.4"} "extunix" {>= "0.1.4"} diff --git a/dune b/dune index 059f32c..00e05f1 100644 --- a/dune +++ b/dune @@ -16,10 +16,12 @@ libevent lwt lwt.unix - netstring + ocamlnet_lite pcre stdlib-shims + str trace.core + unix yojson (select possibly_otel.ml @@ -65,7 +67,7 @@ (executable (name test) - (libraries devkit extlib extunix libevent netstring ounit2 yojson) + (libraries devkit extlib extunix libevent ocamlnet_lite ounit2 unix yojson) (modules test test_httpev)) (rule diff --git a/httpev.ml b/httpev.ml index 0fc6db4..948cd92 100644 --- a/httpev.ml +++ b/httpev.ml @@ -184,7 +184,7 @@ let get_content_length headers = | Some s -> try Some (int_of_string s) with _ -> failed Header (sprintf "content-length %S" s) let decode_args s = - try Netencoding.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s + try Ocamlnet_lite.Netencoding.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s let acceptable_encoding headers = let split s c = List.map (String.strip ~chars:" \t\r\n") @@ Stre.nsplitc s c in diff --git a/ocamlnet_lite/dune b/ocamlnet_lite/dune new file mode 100644 index 0000000..e32763d --- /dev/null +++ b/ocamlnet_lite/dune @@ -0,0 +1,7 @@ +(library + (name ocamlnet_lite) + (public_name devkit.ocamlnet_lite) + (libraries + extlib ; just for Array.create + pcre + str)) diff --git a/ocamlnet_lite/netaux.ml b/ocamlnet_lite/netaux.ml new file mode 100644 index 0000000..3af5e6e --- /dev/null +++ b/ocamlnet_lite/netaux.ml @@ -0,0 +1,34 @@ +module ArrayAux = struct + let int_blit_ref = + ref + (fun (src:int array) srcpos dest destpos len -> + (* A specialised version of Array.blit for int arrays. + * Faster than the polymorphic Array.blit for + * various reasons. + *) + if (len < 0 || srcpos < 0 || + srcpos+len > Array.length src || + destpos < 0 || + destpos+len > Array.length dest) then + invalid_arg "Netaux.ArrayAux.int_blit"; + if src != dest || destpos <= srcpos then ( + for i = 0 to len-1 do + Array.unsafe_set + dest + (destpos+i) + (Array.unsafe_get src (srcpos+i)) + done + ) else ( + for i = len-1 downto 0 do + Array.unsafe_set + dest + (destpos+i) + (Array.unsafe_get src (srcpos+i)) + done + ) + ) + + let int_blit src srcpos dest destpos len = + !int_blit_ref src srcpos dest destpos len + +end diff --git a/ocamlnet_lite/netaux.mli b/ocamlnet_lite/netaux.mli new file mode 100644 index 0000000..e04dac5 --- /dev/null +++ b/ocamlnet_lite/netaux.mli @@ -0,0 +1,13 @@ +(** Internal auxiliary functions + * + * This is an internal module. + *) + +(* Auxiliary stuff *) + +module ArrayAux : sig + val int_blit : int array -> int -> int array -> int -> int -> unit + (** A specialisation of [Array.blit] for int arrays. + * (Performance reasons.) + *) +end diff --git a/ocamlnet_lite/netbuffer.ml b/ocamlnet_lite/netbuffer.ml new file mode 100644 index 0000000..a5539e9 --- /dev/null +++ b/ocamlnet_lite/netbuffer.ml @@ -0,0 +1,88 @@ +type t = { + mutable buffer : Bytes.t; + mutable buffer_length : int; (* = String.length buffer *) + mutable length : int; + create_length : int; +} + +(* To help the garbage collector: + * The 'buffer' has a minimum length of 31 bytes. This minimum can still + * be stored in the minor heap. + * The 'buffer' has a length which is always near a multiple of two. This + * limits the number of different bucket sizes, and simplifies reallocation + * of freed memory. + *) + +(* Optimal string length: + * Every string takes: 1 word for the header, enough words for the + * contents + 1 Null byte (for C compatibility). + * If the buffer grows, it is best to use a new string length such + * that the number of words is exactly twice as large as for the previous + * string. + * n: length of the previous string in bytes + * w: storage size of the previous string in words + * n': length of the new string in bytes + * w' = 2*w: storage size of the new string in words + * + * w = (n+1) / word_length + 1 + * [it is assumed that (n+1) is always a multiple of word_length] + * + * n' = (2*w - 1) * word_length - 1 + * + * n' = [2 * ( [n+1] / word_length + 1) - 1] * word_length - 1 + * = ... + * = (2*n + 2) + word_length - 1 + * = 2 * n + word_length + 1 + * + * n'+1 is again a multiple of word_length: + * n'+1 = 2*n + 2 + word_length + * = 2*(n+1) + word_length + * = a multiple of word_length because n+1 is a multiple of word_length + *) + +let word_length = Sys.word_size / 8 (* in bytes *) + +let create n = + let bl = max n 31 in + { + buffer = Bytes.create bl; + buffer_length = bl; + length = 0; + create_length = n; + } + +let contents b = Bytes.sub_string b.buffer 0 b.length +let to_bytes b = Bytes.sub b.buffer 0 b.length + +let to_tstring_poly : type s. t -> s Netstring_tstring.tstring_kind -> s = + fun b kind -> + match kind with + | Netstring_tstring.String_kind -> contents b + | Netstring_tstring.Bytes_kind -> to_bytes b + +let alloc_space b n = + let rec new_size s = + if s >= n then s else new_size ((2 * s) + word_length + 1) + in + let size = min (new_size b.buffer_length) Sys.max_string_length in + if size < n then failwith "Netbuffer: string too large"; + let buffer' = Bytes.create size in + Bytes.blit b.buffer 0 buffer' 0 b.length; + b.buffer <- buffer'; + b.buffer_length <- size + +let ensure_space b n = + (* Ensure that there are n bytes space in b *) + if n > b.buffer_length then alloc_space b n + +let add_internal blit b s k l = + ensure_space b (l + b.length); + blit s k b.buffer b.length l; + b.length <- b.length + l + +let add_substring b s k l = + if k < 0 || l < 0 || k > String.length s - l then + invalid_arg "Netbuffer.add_substring"; + add_internal Bytes.blit_string b s k l + +let add_string b s = add_substring b s 0 (String.length s) diff --git a/ocamlnet_lite/netbuffer.mli b/ocamlnet_lite/netbuffer.mli new file mode 100644 index 0000000..9a9afc5 --- /dev/null +++ b/ocamlnet_lite/netbuffer.mli @@ -0,0 +1,19 @@ +(** A Netbuffer.t is a buffer that can grow and shrink dynamically. *) + +type t + +val create : int -> t + (** Creates a netbuffer which allocates initially this number of bytes. + * The logical length is zero. + *) + +val to_tstring_poly : t -> 's Netstring_tstring.tstring_kind -> 's + (** Return the buffer in the format as selected by the arg *) + +(** {2 Appending strings} *) + +val add_string : t -> string -> unit + (** [add_string nb s]: Adds a copy of the string [s] to the logical end of + * the netbuffer [nb]. If necessary, [nb] grows. + *) + diff --git a/ocamlnet_lite/netconversion.ml b/ocamlnet_lite/netconversion.ml new file mode 100644 index 0000000..fe42963 --- /dev/null +++ b/ocamlnet_lite/netconversion.ml @@ -0,0 +1,741 @@ +open Netaux.ArrayAux + +exception Malformed_code +exception Cannot_represent of int + +let multibyte_limit = (* 6 *) 50 + +(* The longest multibyte character of all supported encodings, + * and the longest substitution string. + *) + +let big_slice = (* 3 *) 250 + +(* The typical length of slices *) + +(* Seems to be a good source: ftp://dkuug.dk/i18n/charmaps + *) + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | (* Encoding subset: *) + `Enc_subset of encoding * (int -> bool) ] + +(* Internal conversion interface: + * + * let (n_char, n_byte, enc') = read_XXX slice_char slice_blen s_in p_in l_in: + * + * - Scans the bytes from position p_in until the slice is decoded, but at + * most until the last position p_in+l_in-1 of the input string s_in, and + * decodes the character for the selected encoding. + * - "slice_char" is a preallocated array of ints storing the code points + * of the characters. It is allowed that "slice_char" is only partially + * filled with characters. In this case, there must be a -1 after the + * last valid code point. + * - "slice_blen" is another "int array" with the same size as "slice_char". + * It contains the byte length of every character. It is initialized with + * a sequence of ones, so single-byte readers don't have to worry about + * this array. + * - Returns: + * * n_char: the number of decoded characters + * * n_byte: the number of scanned bytes ( <= l_in ) + * * enc': the new encoding + * - In the case of multi-byte encodings it is possible that + * the last byte to read at position p_in+l_in-1 is the beginning of + * a character. This character is excluded from being decoded. + * - Errors: If an invalid byte sequence is found, the exception + * Malformed_code_read(_,_,_) is raised. The exception returns the + * triple (n_char, n_byte, enc') describing how much could be read + * before the reader ran into the bad sequence. slice_char and slice_blen + * are only partially initialized, with a (-1) at the end of slice_char. + * + * let (n_char, n_byte) = + * write_XXX slice_char slice_pos slice_length s_out p_out l_out subst + * + * - Writes the characters found in slice_char to s_out. Only the elements + * from slice_pos to slice_pos + slice_length -1 are written. The resulting + * bytes are written to s_out from byte position p_out to p_out+l_out-1. + * - There must not be a -1 (EOF mark) in the first slice_length characters + * of slice_char. + * - Only whole characters must be written. + * - For code points p that cannot be represented in the output + * encoding, the function subst is called. The function must return + * the (already encoded) string to substitute. This must be a small string. + * - Of course, p >= 0. As special case, p >= 0x110000 may be used to force + * that subst is called (it is assumed that max_int can be never + * represented). + * - Returns: + * * n_char: the number of processed characters + * * n_byte: the number of written bytes ( <= l_in ) + * + * let (n_char, n_byte) = + * back_XXX s_in range_in range_in_len p_in n_char: + * + * - The substring of s_in beginning at range_in and with length + * range_in_len is considered as the valid range + * - The cursor is at byte position p_in and goes n_char characters back + * - The routine returns: + * * n_char: the characters the cursor was actually moved backwards + * * n_byte: the bytes the cursor was actually moved backwards + * - The validity of the input encoding needs not to be checked + *) + +exception Malformed_code_read of (int * int * encoding) + +(* UNSAFE_OPT: A number of functions have been optimized by using + * unsafe features of O'Caml (unsafe_get, unsafe_set, unsafe_chr). + * These functions have been checked very carefully, and there are + * a lot of comments arguing about the correctness of the array + * and string accesses. + *) + +type poly_reader = { + read : + 's. + 's Netstring_tstring.tstring_ops -> + int array -> + int array -> + 's -> + int -> + int -> + int * int * encoding; +} + +let read_utf8 is_java = + (* UNSAFE_OPT *) + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + (* k: counts the bytes + * n: counts the characters + *) + let p = ref p_in in + let p_max = p_in + l_in in + let n = ref 0 in + let n_ret = ref (-1) in + + let malformed_code () = + slice_char.(!n) <- -1; + raise (Malformed_code_read (!n, !p - p_in, `Enc_utf8)) + in + + let slice_length = Array.length slice_char in + + while !p < p_max && !n < slice_length do + let k_inc = + (* length of the character in bytes; 0 means: stop *) + + (* We know: + * (1) p_in >= 0 ==> !p >= 0 + * (2) !p < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + (* match s_in.[k_in + k] with *) + match ops.unsafe_get s_in !p with + | '\000' -> + if is_java then malformed_code (); + (* slice_char.(n) <- 0; *) + Array.unsafe_set slice_char !n 0; + (* ok *) + 1 + | '\001' .. '\127' as x -> + (* slice_char.(n) <- Char.code x; *) + Array.unsafe_set slice_char !n (Char.code x); + (* ok *) + 1 + | '\128' .. '\223' as x -> + if !p + 1 >= p_max then 0 + else + (* ==> !p+1 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let n2 = + (* Char.code (s_in.[!p + 1]) *) + Char.code (ops.unsafe_get s_in (!p + 1)) + in + if is_java && n1 = 0x80 && n2 = 0xc0 then ( + (* slice_char.(n) <- 0; *) + Array.unsafe_set slice_char !n 0; + (* ok *) + 2) + else ( + if n2 < 128 || n2 > 191 then malformed_code (); + let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in + if p < 128 then malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 2) + | '\224' .. '\239' as x -> + if !p + 2 >= p_max then 0 + else + (* ==> !p+2 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let n2 = + (* Char.code (s_in.[!p + 1]) *) + Char.code (ops.unsafe_get s_in (!p + 1)) + in + let n3 = + (* Char.code (s_in.[!p + 2]) *) + Char.code (ops.unsafe_get s_in (!p + 2)) + in + if n2 < 128 || n2 > 191 then malformed_code (); + if n3 < 128 || n3 > 191 then malformed_code (); + let p = + ((n1 land 0b1111) lsl 12) + lor ((n2 land 0b111111) lsl 6) + lor (n3 land 0b111111) + in + if p < 0x800 then malformed_code (); + if p >= 0xd800 && p < 0xe000 then + (* Surrogate pairs are not supported in UTF-8 *) + malformed_code (); + if p >= 0xfffe && p <= 0xffff then malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 3 + | '\240' .. '\247' as x -> + if !p + 3 >= p_max then 0 + else + (* ==> !p+3 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let chars = ops.unsafe_get3 s_in (!p + 1) in + let n2 = chars lsr 16 in + let n3 = (chars lsr 8) land 0xff in + let n4 = chars land 0xff in + if n2 < 128 || n2 > 191 then malformed_code (); + if n3 < 128 || n3 > 191 then malformed_code (); + if n4 < 128 || n4 > 191 then malformed_code (); + let p = + ((n1 land 0b111) lsl 18) + lor ((n2 land 0b111111) lsl 12) + lor ((n3 land 0b111111) lsl 6) + lor (n4 land 0b111111) + in + if p < 0x10000 then malformed_code (); + if p >= 0x110000 then + (* These code points are not supported. *) + malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 4 + | _ -> + (* Outside the valid range of XML characters *) + malformed_code () + in + + (* If k_inc = 0, the character was partially outside the processed + * range of the string, and could not be decoded. + *) + if k_inc > 0 then ( + (* We know: + * (1) n >= 0, because n starts with 0 and is only increased + * (2) n < Array.length slice_char = Array.length slice_blen + * ==> unsafe set ok + *) + (* slice_blen.(n) <- k_inc; *) + Array.unsafe_set slice_blen !n k_inc; + (* next iteration: *) + p := !p + k_inc; + incr n) + else ( + (* Stop loop: *) + n_ret := !n; + n := slice_length) + done; + + if !n_ret = -1 then n_ret := !n; + if !n_ret < slice_length then (* EOF marker *) + slice_char.(!n_ret) <- -1; + (!n_ret, !p - p_in, `Enc_utf8) + in + { read } + +let read_utf8_ref = ref read_utf8 + +let read_subset inner_read def = + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + let n, k, enc' = inner_read.read ops slice_char slice_blen s_in p_in l_in in + + (* check codepoints: *) + for j = 0 to n - 1 do + if not (def slice_char.(j)) then ( + (* raise Malformed_code_read... *) + (* to get enc'' read again: *) + let slice_char' = Array.make j (-1) in + let slice_blen' = Array.make j 1 in + let n', k', enc'' = + try inner_read.read ops slice_char' slice_blen' s_in p_in l_in + with Malformed_code_read (_, _, _) -> assert false + in + assert (n' = j); + int_blit slice_char' 0 slice_char 0 j; + int_blit slice_blen' 0 slice_blen 0 j; + slice_char.(j) <- -1; + raise (Malformed_code_read (j, k', enc''))) + done; + + (n, k, enc') + in + { read } + +(* + * let (n_char, b_byte) = + * write_XXX slice_char slice_length s_out p_out l_out subst + *) + +let write_utf8 is_java slice_char slice_pos slice_length s_out p_out l_out subst + = + (* UNSAFE_OPT *) + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + while !n < n_max do + (* We know: + * (1) !n >= 0, because it starts with 0 and is only increased + * (2) !n < n_max = slice_pos + slice_length <= Array.length slice + * ==> unsafe get ok + *) + let p = + (* slice.( !n ) *) + Array.unsafe_get slice_char !n + in + + let index = p_out + !k in + + let k_inc = + (* k_inc: how many bytes are written. (-1) means: stop *) + if p <= 127 && ((not is_java) || p <> 0) then ( + if p < 0 then assert false; + (* EOF mark *) + if !k < l_out then ( + (* (1) index = p_out + !k < p_out + l_out <= + * String.length s_out + * (2) p_out, !n >= 0 + * ==> unsafe set ok + * + * 0 <= p <= 127 ==> unsafe_chr ok + *) + (* s_out.[index] <- Char.chr p; *) + Bytes.unsafe_set s_out index (Char.unsafe_chr p); + 1) + else -1) + else if p <= 0x7ff then + if !k + 1 < l_out then ( + (* (1) index+1 = p_out + !k + 1 < p_out + l_out <= + * String.length s_out + * (2) p_out, !k >= 0 + * ==> unsafe set ok + * + * p <= 0x7ff ==> p lsr 6 <= 0x1f + * ==> 0xc0 lor (p lsr 6) <= df + * p land 0x3f <= 0x3f ==> 0x80 lor (p land 0x3f) <= 0xbf + * ==> unsafe_chr ok + *) + (* s_out.[index] <- Char.chr (0xc0 lor (p lsr 6)); *) + (* s_out.[index + 1] <- Char.chr (0x80 lor (p land 0x3f)); *) + Bytes.unsafe_set s_out index (Char.unsafe_chr (0xc0 lor (p lsr 6))); + Bytes.unsafe_set s_out (index + 1) + (Char.unsafe_chr (0x80 lor (p land 0x3f))); + 2) + else -1 + else if p <= 0xffff then ( + (* Refuse writing surrogate pairs, and fffe, ffff *) + if (p >= 0xd800 && p < 0xe000) || p >= 0xfffe then + failwith "Netconversion.write_utf8"; + if !k + 2 < l_out then ( + (* (1) index+2 = p_out + !k + 2 < p_out + l_out <= + * String.length s_out + * (2) p_out, !k >= 0 + * ==> unsafe set ok + * + * Well, and it can be proven that unsafe_chr is ok, too... + *) + (* s_out.[index] <- Char.chr (0xe0 lor (p lsr 12)); *) + (* s_out.[index + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); *) + (* s_out.[index + 2] <- Char.chr (0x80 lor (p land 0x3f)); *) + Bytes.unsafe_set s_out index (Char.unsafe_chr (0xe0 lor (p lsr 12))); + Bytes.unsafe_set s_out (index + 1) + (Char.unsafe_chr (0x80 lor ((p lsr 6) land 0x3f))); + Bytes.unsafe_set s_out (index + 2) + (Char.unsafe_chr (0x80 lor (p land 0x3f))); + 3) + else -1) + else if p <= 0x10ffff then + if !k + 3 < l_out then ( + (* No such characters are defined... *) + Bytes.set s_out index (Char.chr (0xf0 lor (p lsr 18))); + Bytes.set s_out (index + 1) + (Char.chr (0x80 lor ((p lsr 12) land 0x3f))); + Bytes.set s_out (index + 2) + (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); + Bytes.set s_out (index + 3) (Char.chr (0x80 lor (p land 0x3f))); + 4) + else -1 + else + (* Higher code points are not possible in XML; call subst *) + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_utf8: Substitution string too long"; + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + l_repl (* may be 0! *)) + else -1 (* Exit whole conversion *) + in + + if k_inc >= 0 then ( + k := !k + k_inc; + incr n) + else ( + n_ret := !n; + n := n_max) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let special_cpoint = 0x110000 + +let write_subset inner_writer def slice_char slice_pos slice_length s_out p_out + l_out subst = + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + (* Force that the subst' function is called for all undefined code + * points + *) + let slice_char' = Array.sub slice_char slice_pos slice_length in + for n = 0 to slice_length - 1 do + let ch = slice_char'.(n) in + if ch >= special_cpoint || not (def ch) then + slice_char'.(n) <- special_cpoint + n + done; + + let subst' ch = + if ch >= special_cpoint then + subst slice_char.(slice_pos + ch - special_cpoint) + else subst ch + in + + inner_writer slice_char' 0 slice_length s_out p_out l_out subst' + +(* `Enc_*_bom considers the BOM as a character with code point -3. + * This encoding is only internally used. + *) + +let rec get_reader1 (enc : encoding) = + (* get_reader1 supports the additional internal encodings of + * encoding1. get_reader (below) only supports the exported + * encodings. + *) + match enc with + | `Enc_utf8 -> !read_utf8_ref false + | `Enc_subset (e, def) -> + let reader' = get_reader1 e in + read_subset reader' def + +let get_reader = (get_reader1 : encoding -> 'a :> encoding -> 'a) + +let rec get_writer enc = + match enc with + | `Enc_utf8 -> write_utf8 false + | `Enc_subset (e, def) -> + let writer' = get_writer e in + write_subset writer' def + +let recode_poly ~in_ops ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf + ~out_pos ~out_len ~max_chars ~subst = + let open Netstring_tstring in + if + in_pos < 0 || in_len < 0 + || in_pos + in_len > in_ops.length in_buf + || out_pos < 0 || out_len < 0 + || out_pos + out_len > Bytes.length out_buf + then invalid_arg "Netconversion.recode"; + + (* An array with 250 elements can be allocated in the minor heap. *) + let slice_length = big_slice in + let slice_char = Array.make slice_length (-1) in + let slice_blen = Array.make slice_length 1 in + + let in_k = ref 0 in + (* read bytes *) + let in_n = ref 0 in + (* read characters *) + let in_eof = ref (!in_k >= in_len) in + let out_k = ref 0 in + (* written bytes *) + let out_n = ref 0 in + (* written characters *) + let out_eof = ref (!out_k >= out_len || !out_n >= max_chars) in + + let rd_enc = ref in_enc in + let reader = ref (get_reader in_enc) in + let writer = get_writer out_enc in + + while (not !in_eof) && not !out_eof do + let in_n_inc, in_k_inc, rd_enc' = + try + !reader.read in_ops slice_char slice_blen in_buf (in_pos + !in_k) + (in_len - !in_k) + with Malformed_code_read (in_n_inc, in_k_inc, rd_enc') -> + if in_n_inc = 0 then raise Malformed_code; + (in_n_inc, in_k_inc, rd_enc') + in + + let out_n_inc_max = min in_n_inc (max_chars - !out_n) in + (* do not write more than max_chars *) + let out_n_inc, out_k_inc = + if out_n_inc_max > 0 then + writer slice_char 0 out_n_inc_max out_buf (out_pos + !out_k) + (out_len - !out_k) subst + else (0, 0) + in + + let in_n_inc', in_k_inc' = + if in_n_inc > out_n_inc then ( + (* Not all read characters could be written *) + let sum = ref 0 in + for j = 0 to out_n_inc - 1 do + sum := !sum + slice_blen.(j) + done; + (out_n_inc, !sum)) + else (in_n_inc, in_k_inc) + in + + in_k := !in_k + in_k_inc'; + in_n := !in_n + in_n_inc'; + out_k := !out_k + out_k_inc; + out_n := !out_n + out_n_inc; + + (* Detect change of input encoding: *) + if rd_enc' <> !rd_enc then ( + rd_enc := rd_enc'; + reader := get_reader rd_enc'; + Array.fill slice_blen 0 slice_length 1); + + (* EOF criteria: + * - It is possible that !in_k never reaches in_len because there is a + * multibyte character at the end that is partially outside the input + * range + * - For the same reason it is possible that !out_k never reaches out_len + * - It is accepted as reader EOF if not even one character can be + * scanned + * - It is accepted as writer EOF if fewer than in_n_inc characters + * could be written + *) + in_eof := !in_k >= in_len || in_n_inc = 0; + + out_eof := !out_k >= out_len || !out_n >= max_chars || out_n_inc < in_n_inc + done; + + (!in_k, !out_k, !rd_enc) + +let rec ustring_of_uchar enc = + let multi_byte writer n p = + let s = Bytes.create n in + let _, n_act = + writer [| p |] 0 1 s 0 n (fun _ -> raise (Cannot_represent p)) + in + Bytes.sub_string s 0 n_act + in + match enc with + | `Enc_utf8 -> multi_byte (write_utf8 false) 4 + | `Enc_subset (e, def) -> + fun p -> + if def p then ustring_of_uchar e p else raise (Cannot_represent p) + +let makechar enc = + let us = ustring_of_uchar enc in + fun p -> try us p with Cannot_represent _ -> raise Not_found + +(* The following algorithms assume that there is an upper limit of the length + * of a multibyte character. Currently, UTF8 is the encoding with the longest + * multibyte characters (6 bytes). + * Because of this limit, it is allowed to allocate a buffer that is "large + * enough" in order to ensure that at least one character is recoded in every + * loop cycle. If the buffer was not large enough, no character would be + * processed in a cycle, and the algorithm would hang. + *) + +let convert_poly : + type s t. + in_ops:s Netstring_tstring.tstring_ops -> + out_kind:t Netstring_tstring.tstring_kind -> + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + s -> + t = + fun ~in_ops ~out_kind ?(subst = fun p -> raise (Cannot_represent p)) ~in_enc + ~out_enc ?(range_pos = 0) ?range_len s -> + let open Netstring_tstring in + let range_len = + match range_len with Some l -> l | None -> in_ops.length s - range_pos + in + + if range_pos < 0 || range_len < 0 || range_pos + range_len > in_ops.length s + then invalid_arg "Netconversion.convert"; + + (* Estimate the size of the output string: + * length * 2 is just guessed. It is assumed that this number is usually + * too large, and to avoid that too much memory is wasted, the buffer is + * limited by 10000. + *) + let size = ref (max multibyte_limit (min 10000 (range_len * 2))) in + let out_buf = ref (Bytes.create !size) in + + let k_in = ref 0 in + let k_out = ref 0 in + + while !k_in < range_len do + let in_len = range_len - !k_in in + let out_len = !size - !k_out in + assert (out_len >= multibyte_limit); + (* space for at least one char *) + let k_in_inc, k_out_inc, _in_enc' = + recode_poly ~in_ops ~in_enc ~in_buf:s ~in_pos:(range_pos + !k_in) ~in_len + ~out_enc ~out_buf:!out_buf ~out_pos:!k_out ~out_len ~max_chars:max_int + ~subst + in + if k_in_inc = 0 then raise Malformed_code; + (* Reasons for k_in_inc = 0: + * (1) There is not enough space in out_buf to add a single character + * (2) in_buf ends with a prefix of a multi-byte character + * Because there is always space for at least one character + * ( = multibyte_limit ), reason (1) can be excluded. So it must + * be (2), and we can raise Malformed_code. + *) + k_in := !k_in + k_in_inc; + k_out := !k_out + k_out_inc; + (* double the size of out_buf: *) + let size' = min Sys.max_string_length (!size + !size) in + if size' < !size + multibyte_limit then + failwith "Netconversion.convert: string too long"; + let out_buf' = Bytes.create size' in + Bytes.blit !out_buf 0 out_buf' 0 !k_out; + out_buf := out_buf'; + size := size' + done; + match out_kind with + | Netstring_tstring.String_kind -> Bytes.sub_string !out_buf 0 !k_out + | Netstring_tstring.Bytes_kind -> Bytes.sub !out_buf 0 !k_out + +let convert ?subst ~in_enc ~out_enc ?range_pos ?range_len s = + convert_poly ?subst ~in_ops:Netstring_tstring.string_ops + ~out_kind:Netstring_tstring.String_kind ~in_enc ~out_enc ?range_pos + ?range_len s + +let uarray_of_ustring_poly ops enc ?(range_pos = 0) ?range_len s = + let open Netstring_tstring in + let range_len = + match range_len with Some l -> l | None -> ops.length s - range_pos + in + + if range_pos < 0 || range_len < 0 || range_pos + range_len > ops.length s then + invalid_arg "Netconversion.uarray_of_ustring"; + + let slice_length = big_slice in + let slice_char = Array.make slice_length (-1) in + let slice_blen = Array.make slice_length 1 in + + let k = ref 0 in + let e = ref enc in + let reader = ref (get_reader enc) in + let buf = ref [] in + + while !k < range_len do + let n_inc, k_inc, enc' = + try + !reader.read ops slice_char slice_blen s (range_pos + !k) + (range_len - !k) + with Malformed_code_read (_, _, _) -> raise Malformed_code + in + + k := !k + k_inc; + buf := Array.sub slice_char 0 n_inc :: !buf; + + if enc' <> !e then ( + e := enc'; + reader := get_reader enc'; + Array.fill slice_blen 0 slice_length 1); + + if n_inc < slice_length then ( + (* EOF *) + if !k < range_len then raise Malformed_code; + (* s ends with multi-byte prefix*) + k := range_len) + done; + + Array.concat (List.rev !buf) + +let uarray_of_ustring enc = + uarray_of_ustring_poly Netstring_tstring.string_ops enc + +let ustring_of_uarray_poly out_kind + ?(subst = fun code -> raise (Cannot_represent code)) enc ?(pos = 0) ?len ua + = + let len = match len with Some l -> l | None -> Array.length ua - pos in + + if pos < 0 || len < 0 || pos + len > Array.length ua then + invalid_arg "Netconversion.ustring_of_uarray"; + + (* Estimate the size of the output string: + * length * 2 is just guessed. It is assumed that this number is usually + * too large, and to avoid that too much memory is wasted, the buffer is + * limited by 10000. + *) + let size = ref (max multibyte_limit (min 10000 (len * 2))) in + let out_buf = ref (Bytes.create !size) in + + let writer = get_writer enc in + + let k_in = ref 0 in + let k_out = ref 0 in + + while !k_in < len do + let k_in_inc, k_out_inc = + writer ua (pos + !k_in) (len - !k_in) !out_buf !k_out (!size - !k_out) + subst + in + k_in := !k_in + k_in_inc; + k_out := !k_out + k_out_inc; + + (* double the size of out_buf: *) + let size' = min Sys.max_string_length (!size + !size) in + if size' < !size + multibyte_limit then + failwith "Netconversion.ustring_of_uarray: string too long"; + let out_buf' = Bytes.create size' in + Bytes.blit !out_buf 0 out_buf' 0 !k_out; + out_buf := out_buf'; + size := size' + done; + + Netstring_tstring.bytes_subpoly out_kind !out_buf 0 !k_out + +let ustring_of_uarray ?subst = + ustring_of_uarray_poly Netstring_tstring.String_kind ?subst diff --git a/ocamlnet_lite/netconversion.mli b/ocamlnet_lite/netconversion.mli new file mode 100644 index 0000000..615e4b7 --- /dev/null +++ b/ocamlnet_lite/netconversion.mli @@ -0,0 +1,375 @@ +(** Conversion between character encodings + * + * {b Contents} + * {ul + * {- {!Netconversion.preliminaries} + * {ul + * {- {!Netconversion.unicode}} + * {- {!Netconversion.subsets}} + * {- {!Netconversion.linking}} + * {- {!Netconversion.domain}} + * {- {!Netconversion.problems}}}} + * {- {!Netconversion.interface} + * {ul + * {- {!Netconversion.direct_conv}} + * {- {!Netconversion.cursors} + * {ul {- {!Netconversion.bom}}}} + * {- {!Netconversion.unicode_functions}} + * } + * } + * } + *) + +(** {1:preliminaries Preliminaries} + * + * A {b character set} is a set of characters where every character is + * identified by a {b code point}. An {b encoding} is a way of + * representing characters from a set in byte strings. For example, + * the Unicode character set has more than 96000 characters, and + * the code points have values from 0 to 0x10ffff (not all code points + * are assigned yet). The UTF-8 encoding represents the code points + * by sequences of 1 to 4 bytes. There are also encodings that + * represent code points from several sets, e.g EUC-JP covers four + * sets. + * + * Encodings are enumerated by the type [encoding], and names follow + * the convention [`Enc_*], e.g. [`Enc_utf8]. + * Character sets are enumerated by the type + * [charset], and names follow the convention [`Set_*], e.g. + * [`Set_unicode]. + * + * This module deals mainly with encodings. It is important to know + * that the same character set may have several encodings. For example, + * the Unicode character set can be encoded as UTF-8 or UTF-16. + * For the 8 bit character sets, however, there is usually only one + * encoding, e.g [`Set_iso88591] is always encoded as [`Enc_iso88591]. + * + * In a {b single-byte encoding} every code point is represented by + * one byte. This is what many programmers are accustomed at, and + * what the OCaml language specially supports: A [string] is + * a sequence of [char]s, where [char] means an 8 bit quantity + * interpreted as character. For example, the following piece of code allocates + * a [string] of four [char]s, and assigns them individually: + * + * {[ + * let s = String.create 4 in + * s.[0] <- 'G'; + * s.[1] <- 'e'; + * s.[2] <- 'r'; + * s.[3] <- 'd'; + * ]} + * + * In a {b multi-byte encoding} there are code points that are represented + * by several bytes. As we still represent such text as [string], the + * problem arises that a single [char], actually a byte, often represents + * only a fraction of a full multi-byte character. There are two solutions: + * - Give up the principle that text is represented by [string]. + * This is, for example, the approach chosen by [Camomile], another OCaml + * library dealing with Unicode. Instead, text is represented as + * [int array]. This way, the algorithms processing the text can + * remain the same. + * - Give up the principle that individual characters can be directly + * accessed in a text. This is the primary way chosen by Ocamlnet. + * This means that there is not any longer the possibility to read + * or write the [n]th character of a text. One can, however, still + * compose texts by just concatenating the strings representing + * individual characters. Furthermore, it is possible to define + * a cursor for a text that moves sequentially along the text. + * The consequence is that programmers are restricted to sequential + * algorithms. Note that the majority of text processing falls into + * this class. + * + * The corresponding piece of code for Ocamlnet's Unicode implementation + * is: + * {[ + * let b = Buffer.create 80 in + * Buffer.add b (ustring_of_uchar `Enc_utf8 71); (* 71 = code point of 'G' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 101); (* 101 = code point of 'e' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 114); (* 114 = code point of 'r' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 100); (* 100 = code point of 'd' *) + * let s = Buffer.contents b + * ]} + * + * It is important to always remember that a [char] is no longer + * a character but simply a byte. In many of the following explanations, + * we strictly distinguish between {b byte positions} or {b byte counts}, + * and {b character positions} or {b character counts}. + * + * There a number of special effects that usually only occur in + * multi-byte encodings: + * + * - Bad encodings: Not every byte sequence is legal. When scanning + * such text, the functions will raise the exception [Malformed_code] + * when they find illegal bytes. + * - Unassigned code points: It may happen that a byte sequence is + * a correct representation for a code point, but that the code point + * is unassigned in the character set. When scanning, this is also + * covered by the exception [Malformed_code]. When converting from + * one encoding to another, it is also possible that the code point + * is only unassigned in the target character set. This case is + * usually handled by a substitution function [subst], and if no such + * function is defined, by the exception [Cannot_represent]. + * - Incomplete characters: The trailing bytes of a string may be the + * correct beginning of a byte sequence for a character, but not a + * complete sequence. Of course, if that string is the end of a + * text, this is just illegal, and also a case for [Malformed_code]. + * However, when text is processed chunk by chunk, this phenomenon + * may happen legally for all chunks but the last. For this reason, + * some of the functions below handle this case specially. + * - Byte order marks: Some encodings have both big and little endian + * variants. A byte order mark at the beginning of the text declares + * which variant is actually used. This byte order mark is a + * declaration written like a character, but actually not a + * character. + * + * There is a special class of encodings known as {b ASCII-compatible}. + * They are important because there are lots of programs and protocols + * that only interpret bytes from 0 to 127, and treat the bytes from + * 128 to 255 as data. These programs can process texts as long as + * the bytes from 0 to 127 are used as in ASCII. Fortunately, many + * encodings are ASCII-compatible, including UTF-8. + * + * {2:unicode Unicode} + * + * [Netconversion] is centred around Unicode. + * The conversion from one encoding to another works by finding the + * Unicode code point of the character + * to convert, and by representing the code point in the target encoding, + * even if neither encodings have to do with Unicode. + * Of course, this approach requires that all character sets handled + * by [Netconversion] are subsets of Unicode. + * + * The supported range of Unicode code points: 0 to 0xd7ff, 0xe000 to 0xfffd, + * 0x10000 to 0x10ffff. All these code points can be represented in + * UTF-8 and UTF-16. [Netconversion] does not know which of the code + * points are assigned and which not, and because of this, it simply + * allows all code points of the mentioned ranges (but for other character + * sets, the necessary lookup tables exist). + * + * {b UTF-8:} The UTF-8 representation can have one to four bytes. Malformed + * byte sequences are always rejected, even those that want to cheat the + * reader like "0xc0 0x80" for the code point 0. There is special support + * for the Java variant of UTF-8 ([`Enc_java]). [`Enc_utf8] strings must not + * have a byte order mark (it would be interpreted as "zero-width space" + * character). However, the Unicode standard allows byte order marks + * at the very beginning of texts; use [`Enc_utf8_opt_bom] in this case. + * + * {b UTF-16:} When reading from a string encoded as [`Enc_utf16], a byte + * order mark is expected at the beginning. The detected variant + * ([`Enc_utf16_le] or [`Enc_utf16_be]) is usually returned by the parsing + * function. The byte order mark is not included into the output string. - + * Some functions of this + * module cannot cope with [`Enc_utf16] (i.e. UTF-16 without endianess + * annotation), and will fail. + * + * Once the endianess is determined, the code point 0xfeff is no longer + * interpreted as byte order mark, but as "zero-width non-breakable space". + * + * Some code points are represented by pairs of 16 bit values, these + * are the so-called "surrogate pairs". They can only occur in UTF-16. + * + * {b UTF-32:} This is very much the same as for UTF-16. There is a little + * endian version [`Enc_utf32_le] and a big endian version [`Enc_utf32_be]. + * + * {2:subsets Subsets of Unicode} + * + * The non-Unicode character sets are subsets of Unicode. Here, it may + * happen that a Unicode code point does not have a corresponding + * code point. In this case, certain rules are applied to handle + * this (see below). It is, however, ensured that every non-Unicode + * code point has a corresponding Unicode code point. (In other words, + * character sets cannot be supported for which this property does + * not hold.) + * + * It is even possible to create further subsets artificially. The + * encoding [`Enc_subset(e,def)] means to derive a new encoding from + * the existing one [e], but to only accept the code points for which + * the definition function [def] yields the value [true]. For example, + * the encoding + * {[ `Enc_subset(`Enc_usascii, + * fun i -> i <> 34 && i <> 38 && i <> 60 && i <> 62) ]} + * is ASCII without the bracket angles, the quotation mark, and the + * ampersand character, i.e. the subset of ASCII that can be included + * in HTML text without escaping. + * + * If a code point is not defined by the encoding but found in a text, + * the reader will raise the exception [Malformed_code]. When text is + * output, however, the [subst] function will be called for undefined code + * points (which raises [Cannot_represent] by default). The [subst] + * function is an optional argument of many conversion functions that + * allows it to insert a substitution text for undefined code points. + * Note, however, that the substitution text is restricted to at most + * 50 characters (because unlimited length would lead to difficult + * problems we would like to avoid). + * + * {2:linking Linking this module} + * + * Many encodings require lookup tables. The following encodings + * are built-in and always supported: + * + * - Unicode: [`Enc_utf8], [`Enc_java], [`Enc_utf16], [`Enc_utf16_le], + [`Enc_utf16_be], [`Enc_utf32], [`Enc_utf32_le], [`Enc_utf32_be] + * - Other: [`Enc_usascii], [`Enc_iso88591], [`Enc_empty] + * + * The lookup tables for the other encodings are usually loaded at + * runtime, but it is also possible to embed them in the generated + * binary executable. See {!Netunidata} for details. The functions + * [available_input_encodings] and [available_output_encodings] can + * be invoked to find out which encodings can be loaded, or are available + * otherwise. + * + * {2:domain Supported Encodings, Restrictions} + * + * I took the mappings from [www.unicode.org], and the standard names of + * the character sets from IANA. Obviously, many character sets are missing + * that can be supported; especially ISO646 character sets, and many EBCDIC + * code pages. Stateful encodings like generic ISO-2022 have been omitted + * (stateless subsets of ISO-2022 like EUC can be supported, however; + * currently we support EUC-JP and EUC-KR). + * + * Because of the copyright statement from Unicode, I cannot put the + * source tables that describe the mappings into the distribution. They + * are publicly available from [www.unicode.org]. + * + * {2:problems Known Problems} + * + * - The following charsets do not have a bijective mapping to Unicode: + * adobe_standard_encoding, adobe_symbol_encoding, + * adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation + * simply removes one of the conflicting code point pairs - this might + * not what you want. + * - Japanese encodings: + * JIS X 0208: The character 1/32 is mapped to 0xFF3C, and not + * to 0x005C. + *) + +(** {1:interface Interface} + * + * {b Naming conventions:} + * + * As it is possible to refer to substrings by either giving a byte + * offset or by counting whole characters, these naming conventions + * are helpful: + * + * - Labels called [range_pos] and [range_len] refer to byte positions of + * characters, or substrings + * - Labels called [count] refer to positions given as the number of characters + * relative to an origin + * + * Furthermore: + * + * - A [uchar] is a single Unicode code point represented as int + * - A [ustring] is a string of encoded characters + * - A [uarray] is an [array of int] representing a string + *) + +exception Malformed_code +(** Raised when an illegal byte sequence is found *) + +exception Cannot_represent of int +(** Raised when a certain Unicode code point cannot be represented in + * the selected output encoding + *) + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | (* Encoding subset: *) + `Enc_subset of encoding * (int -> bool) ] +(** The polymorphic variant enumerating the supported encodings. We have: + * - [`Enc_utf8]: UTF-8 + * - [`Enc_subset(e,def)]: The subset of [e] by applying the definition + * function [def] + *) + +(**********************************************************************) +(* String functions *) +(**********************************************************************) + +(** {2:unicode_functions Unicode String Functions} *) + +val uarray_of_ustring : + encoding -> ?range_pos:int -> ?range_len:int -> string -> int array +(** Returns the characters of the string as array of Unicode code points. + * + * @param range_pos The byte position of the substring to extract + * (default: 0) + * @param range_len The byte length of the substring to extract + * (default: byte length of the input string minus [range_pos]) + *) + +val ustring_of_uarray : + ?subst:(int -> string) -> + encoding -> + ?pos:int -> + ?len:int -> + int array -> + string +(** Returns the array of Unicode code points as encoded string. + * + * @param pos Selects a subarray: [pos] is the first array position + * to encode (default: 0) + * @param len Selects a subarray: [len] is the length of the subarray + * to encode (default: array length minus [pos]) + * @param subst This function is called when a code point cannot be represented + * in the chosen character encoding. It must returns the (already encoded) + * string to substitute for this code point. By default + * (if ~subst is not passed), the exception [Cannot_represent] + * will be raised in this case. + *) + +val convert_poly : + in_ops:'s1 Netstring_tstring.tstring_ops -> + out_kind:'s2 Netstring_tstring.tstring_kind -> + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + 's1 -> + 's2 + +val convert : + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + string -> + string +(** Converts the string from [in_enc] to [out_enc], and returns it. +* The string must consist of a whole number of characters. If it +* ends with an incomplete multi-byte character, however, this is +* detected, and the exception [Malformed_code] will be raised. +* This exception is also raised for other encoding errors in the +* input string. +* +* @param subst This function is invoked for code points of [in_enc] that +* cannot be represented in [out_enc], and the result of the function +* invocation is substituted (directly, without any further conversion). +* Restriction: The string returned by [subst] must not be longer than 50 +* bytes. +* If [subst] is missing, [Cannot_represent] is raised in this case. +* +* @param range_pos Selects a substring for conversion. [range_pos] +* is the byte position of the first character of the substring. +* (Default: 0) +* +* @param range_len Selects a substring for conversion. [range_len] +* is the length of the substring in bytes (Default: Length +* of the input string minus [range_pos]) +*) + +val makechar : encoding -> int -> string +(** [makechar enc i:] + * Creates the string representing the Unicode code point [i] in encoding + * [enc]. Raises [Not_found] if the character is legal but cannot be + * represented in [enc]. + * + * Possible encodings: everything but [`Enc_utf16] and [`Enc_utf32] + * + * Evaluation hints: + * - PRE_EVAL(encoding) + * + * @deprecated This function is deprecated since ocamlnet-0.96. Use + * [ustring_of_uchar] instead. + *) diff --git a/ocamlnet_lite/netencoding.ml b/ocamlnet_lite/netencoding.ml new file mode 100644 index 0000000..b238bbd --- /dev/null +++ b/ocamlnet_lite/netencoding.ml @@ -0,0 +1,559 @@ +module Url = struct + (* adapted from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) + let hex_digits = + [| + '0'; + '1'; + '2'; + '3'; + '4'; + '5'; + '6'; + '7'; + '8'; + '9'; + 'A'; + 'B'; + 'C'; + 'D'; + 'E'; + 'F'; + |] + + let to_hex2 k = + (* Converts k to a 2-digit hex string *) + let s = Bytes.create 2 in + Bytes.set s 0 hex_digits.((k lsr 4) land 15); + Bytes.set s 1 hex_digits.(k land 15); + Bytes.unsafe_to_string s + + let of_hex1 c = + match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | _ -> raise Not_found + + let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" + let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" + + let encode ?(plus = true) s = + Netstring_str.global_substitute url_encoding_re + (fun r _ -> + match Netstring_str.matched_string r s with + | " " when plus -> "+" + | x -> + let k = Char.code x.[0] in + "%" ^ to_hex2 k) + s + + let decode ?(plus = true) ?(pos = 0) ?len s = + let s_l = String.length s in + let s1 = + if pos = 0 && len = None then s + else + let len = match len with Some n -> n | None -> s_l in + String.sub s pos len + in + let l = String.length s1 in + Netstring_str.global_substitute url_decoding_re + (fun r _ -> + match Netstring_str.matched_string r s1 with + | "+" -> if plus then " " else "+" + | _ -> ( + let i = Netstring_str.match_beginning r in + (* Assertion: s1.[i] = '%' *) + if i + 2 >= l then failwith "Web.Url.decode"; + let c1 = s1.[i + 1] in + let c2 = s1.[i + 2] in + try + let k1 = of_hex1 c1 in + let k2 = of_hex1 c2 in + String.make 1 (Char.chr ((k1 lsl 4) lor k2)) + with Not_found -> failwith "Web.Url.decode")) + s1 + + let url_split_re = Netstring_str.regexp "[&=]" + + let dest_url_encoded_parameters parstr = + let rec parse_after_amp tl = + match tl with + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Text value + :: tl' -> + (decode name, decode value) :: parse_next tl' + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Delim "&" + :: tl' -> + (decode name, "") :: parse_after_amp tl' + | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> + [ (decode name, "") ] + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + and parse_next tl = + match tl with + | [] -> [] + | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + in + let toklist = Netstring_str.full_split url_split_re parstr in + match toklist with [] -> [] | _ -> parse_after_amp toklist +end + +module Html = struct + let etable = + [ + ("lt", 60); + ("gt", 62); + ("amp", 38); + ("quot", 34); + (* Note: " is new in HTML-4.0, but it has been widely used + * much earlier. + *) + ("apos", 39); + (* Only used if contained in unsafe_chars *) + (* ISO-8859-1: *) + ("nbsp", 160); + ("iexcl", 161); + ("cent", 162); + ("pound", 163); + ("curren", 164); + ("yen", 165); + ("brvbar", 166); + ("sect", 167); + ("uml", 168); + ("copy", 169); + ("ordf", 170); + ("laquo", 171); + ("not", 172); + ("shy", 173); + ("reg", 174); + ("macr", 175); + ("deg", 176); + ("plusmn", 177); + ("sup2", 178); + ("sup3", 179); + ("acute", 180); + ("micro", 181); + ("para", 182); + ("middot", 183); + ("cedil", 184); + ("sup1", 185); + ("ordm", 186); + ("raquo", 187); + ("frac14", 188); + ("frac12", 189); + ("frac34", 190); + ("iquest", 191); + ("Agrave", 192); + ("Aacute", 193); + ("Acirc", 194); + ("Atilde", 195); + ("Auml", 196); + ("Aring", 197); + ("AElig", 198); + ("Ccedil", 199); + ("Egrave", 200); + ("Eacute", 201); + ("Ecirc", 202); + ("Euml", 203); + ("Igrave", 204); + ("Iacute", 205); + ("Icirc", 206); + ("Iuml", 207); + ("ETH", 208); + ("Ntilde", 209); + ("Ograve", 210); + ("Oacute", 211); + ("Ocirc", 212); + ("Otilde", 213); + ("Ouml", 214); + ("times", 215); + ("Oslash", 216); + ("Ugrave", 217); + ("Uacute", 218); + ("Ucirc", 219); + ("Uuml", 220); + ("Yacute", 221); + ("THORN", 222); + ("szlig", 223); + ("agrave", 224); + ("aacute", 225); + ("acirc", 226); + ("atilde", 227); + ("auml", 228); + ("aring", 229); + ("aelig", 230); + ("ccedil", 231); + ("egrave", 232); + ("eacute", 233); + ("ecirc", 234); + ("euml", 235); + ("igrave", 236); + ("iacute", 237); + ("icirc", 238); + ("iuml", 239); + ("eth", 240); + ("ntilde", 241); + ("ograve", 242); + ("oacute", 243); + ("ocirc", 244); + ("otilde", 245); + ("ouml", 246); + ("divide", 247); + ("oslash", 248); + ("ugrave", 249); + ("uacute", 250); + ("ucirc", 251); + ("uuml", 252); + ("yacute", 253); + ("thorn", 254); + ("yuml", 255); + (* Other: *) + ("fnof", 402); + ("Alpha", 913); + ("Beta", 914); + ("Gamma", 915); + ("Delta", 916); + ("Epsilon", 917); + ("Zeta", 918); + ("Eta", 919); + ("Theta", 920); + ("Iota", 921); + ("Kappa", 922); + ("Lambda", 923); + ("Mu", 924); + ("Nu", 925); + ("Xi", 926); + ("Omicron", 927); + ("Pi", 928); + ("Rho", 929); + ("Sigma", 931); + ("Tau", 932); + ("Upsilon", 933); + ("Phi", 934); + ("Chi", 935); + ("Psi", 936); + ("Omega", 937); + ("alpha", 945); + ("beta", 946); + ("gamma", 947); + ("delta", 948); + ("epsilon", 949); + ("zeta", 950); + ("eta", 951); + ("theta", 952); + ("iota", 953); + ("kappa", 954); + ("lambda", 955); + ("mu", 956); + ("nu", 957); + ("xi", 958); + ("omicron", 959); + ("pi", 960); + ("rho", 961); + ("sigmaf", 962); + ("sigma", 963); + ("tau", 964); + ("upsilon", 965); + ("phi", 966); + ("chi", 967); + ("psi", 968); + ("omega", 969); + ("thetasym", 977); + ("upsih", 978); + ("piv", 982); + ("bull", 8226); + ("hellip", 8230); + ("prime", 8242); + ("Prime", 8243); + ("oline", 8254); + ("frasl", 8260); + ("weierp", 8472); + ("image", 8465); + ("real", 8476); + ("trade", 8482); + ("alefsym", 8501); + ("larr", 8592); + ("uarr", 8593); + ("rarr", 8594); + ("darr", 8595); + ("harr", 8596); + ("crarr", 8629); + ("lArr", 8656); + ("uArr", 8657); + ("rArr", 8658); + ("dArr", 8659); + ("hArr", 8660); + ("forall", 8704); + ("part", 8706); + ("exist", 8707); + ("empty", 8709); + ("nabla", 8711); + ("isin", 8712); + ("notin", 8713); + ("ni", 8715); + ("prod", 8719); + ("sum", 8721); + ("minus", 8722); + ("lowast", 8727); + ("radic", 8730); + ("prop", 8733); + ("infin", 8734); + ("ang", 8736); + ("and", 8743); + ("or", 8744); + ("cap", 8745); + ("cup", 8746); + ("int", 8747); + ("there4", 8756); + ("sim", 8764); + ("cong", 8773); + ("asymp", 8776); + ("ne", 8800); + ("equiv", 8801); + ("le", 8804); + ("ge", 8805); + ("sub", 8834); + ("sup", 8835); + ("nsub", 8836); + ("sube", 8838); + ("supe", 8839); + ("oplus", 8853); + ("otimes", 8855); + ("perp", 8869); + ("sdot", 8901); + ("lceil", 8968); + ("rceil", 8969); + ("lfloor", 8970); + ("rfloor", 8971); + ("lang", 9001); + ("rang", 9002); + ("loz", 9674); + ("spades", 9824); + ("clubs", 9827); + ("hearts", 9829); + ("diams", 9830); + ("OElig", 338); + ("oelig", 339); + ("Scaron", 352); + ("scaron", 353); + ("Yuml", 376); + ("circ", 710); + ("tilde", 732); + ("ensp", 8194); + ("emsp", 8195); + ("thinsp", 8201); + ("zwnj", 8204); + ("zwj", 8205); + ("lrm", 8206); + ("rlm", 8207); + ("ndash", 8211); + ("mdash", 8212); + ("lsquo", 8216); + ("rsquo", 8217); + ("sbquo", 8218); + ("ldquo", 8220); + ("rdquo", 8221); + ("bdquo", 8222); + ("dagger", 8224); + ("Dagger", 8225); + ("permil", 8240); + ("lsaquo", 8249); + ("rsaquo", 8250); + ("euro", 8364); + ] + + let quick_etable_html = + let ht = Hashtbl.create 50 in + List.iter (fun (name, value) -> Hashtbl.add ht name value) etable; + ht + + let quick_etable_xml = + let ht = Hashtbl.create 5 in + List.iter + (fun name -> + let value = List.assoc name etable in + Hashtbl.add ht name value) + [ "lt"; "gt"; "amp"; "quot"; "apos" ]; + ht + + let rev_etable = + (* Only code points 0 to 255: *) + let a = Array.make 256 "" in + List.iter + (fun (name, value) -> if value <= 255 then a.(value) <- "&" ^ name ^ ";") + etable; + a + + let rev_etable_rest = + (* Only code points >= 256: *) + let ht = Hashtbl.create 150 in + List.iter + (fun (name, value) -> + if value >= 256 then Hashtbl.add ht value ("&" ^ name ^ ";")) + etable; + ht + + let unsafe_chars_html4 = + "<>\"&\000\001\002\003\004\005\006\007\008\011\012\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031\127" + + let encode_poly ~in_enc ~in_ops ~out_kind ?(out_enc = `Enc_utf8) + ?(prefer_name = true) ?(unsafe_chars = unsafe_chars_html4) () = + (* This function implements the general case *) + for i = 0 to String.length unsafe_chars - 1 do + if Char.code unsafe_chars.[i] >= 128 then + invalid_arg + "Netencoding.Html.encode: non-ASCII character in unsafe_chars" + done; + (* Are there better implementations than the general one? *) + (* Create the domain function: *) + let dom_array = Array.make 128 true in + let dom p = p >= 128 || dom_array.(p) in + (* Set dom_array from unsafe_chars: *) + for i = 0 to String.length unsafe_chars - 1 do + let c = Char.code unsafe_chars.[i] in + dom_array.(c) <- false + done; + (* Create the substitution function: *) + let subst p = + let name = + if prefer_name then + if p <= 255 then rev_etable.(p) + else try Hashtbl.find rev_etable_rest p with Not_found -> "" + else "" + in + if name = "" then "" ^ string_of_int p ^ ";" else name + in + (* Recode: *) + fun s -> + Netconversion.convert_poly ~in_ops ~out_kind ~subst ~in_enc + ~out_enc:(`Enc_subset (out_enc, dom)) + s + + let encode ~in_enc ?out_enc ?prefer_name ?unsafe_chars () = + let in_ops = Netstring_tstring.string_ops in + let out_kind = Netstring_tstring.String_kind in + encode_poly ~in_enc ~in_ops ~out_kind ?out_enc ?prefer_name ?unsafe_chars () + + type entity_set = [ `Html | `Xml | `Empty ] + + let eref_re = + Netstring_str.regexp + "&\\(#\\([0-9]+\\);\\|#[xX]\\([0-9a-fA-F]+\\);\\|\\([a-zA-Z]+\\);\\)" + + let total_enc = + (* every byte must have a corresponding Unicode code point, i.e. the + * encoding must be "byte-total" + *) + function + | _ -> false + + let hex_digit_of_char c = + match c with + | '0' .. '9' -> Char.code c - 48 + | 'A' .. 'F' -> Char.code c - 55 + | 'a' .. 'f' -> Char.code c - 87 + | _ -> assert false + + let hex_of_string s = + let n = ref 0 in + for i = 0 to String.length s - 1 do + let d = hex_digit_of_char s.[i] in + n := (!n lsl 4) lor d + done; + !n + + let search_all re s pos = + let rec search p acc = + match + try Some (Netstring_str.search_forward re s p) with Not_found -> None + with + | Some (k, r) -> search (k + 1) ((k, r) :: acc) + | None -> List.rev acc + in + search pos [] + + let decode_half_poly ~in_enc ~out_kind ~out_enc + ?(lookup = + fun name -> + failwith ("Netencoding.Html.decode: Unknown entity `" ^ name ^ "'")) + ?(subst = + fun p -> + failwith + ("Netencoding.Html.decode: Character cannot be represented: " + ^ string_of_int p)) ?(entity_base = (`Html : entity_set)) () = + (* makechar: *) + let raw_makechar = Netconversion.makechar out_enc in + let makechar p = try raw_makechar p with Not_found -> subst p in + (* Entity lookup: *) + let lookup_entity = + match entity_base with + | `Html | `Xml -> ( + let ht = + if entity_base = `Html then quick_etable_html else quick_etable_xml + in + fun name -> + try makechar (Hashtbl.find ht name) with Not_found -> lookup name) + | `Empty -> lookup + in + (* Recode strings: *) + let recode_str = + if total_enc in_enc && in_enc = out_enc then fun s pos len -> + if pos = 0 && len = String.length s then s else String.sub s pos len + else fun s range_pos range_len -> + Netconversion.convert ~in_enc ~out_enc ~subst ~range_pos ~range_len s + in + fun s -> + (* Find all occurrences of &name; or num; or num; *) + let occurrences = search_all eref_re s 0 in + (* Collect the resulting string in a buffer *) + let buf = Netbuffer.create 250 in + let n = ref 0 in + List.iter + (fun (n0, r) -> + let n1 = Netstring_str.match_end r in + if n0 > !n then Netbuffer.add_string buf (recode_str s !n (n0 - !n)); + let replacement = + let num = + try Netstring_str.matched_group r 2 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + if num <> "" then + let n = int_of_string num in + makechar n + else + let xnum = + try Netstring_str.matched_group r 3 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + if xnum <> "" then + let n = hex_of_string xnum in + makechar n + else + let name = + try Netstring_str.matched_group r 4 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + assert (name <> ""); + lookup_entity name + in + Netbuffer.add_string buf replacement; + n := n1) + occurrences; + let n0 = String.length s in + if n0 > !n then Netbuffer.add_string buf (recode_str s !n (n0 - !n)); + (* Return *) + Netbuffer.to_tstring_poly buf out_kind + + let decode ~in_enc ~out_enc ?lookup ?subst ?entity_base () = + let out_kind = Netstring_tstring.String_kind in + decode_half_poly ~in_enc ~out_kind ~out_enc ?lookup ?subst ?entity_base () +end diff --git a/ocamlnet_lite/netencoding.mli b/ocamlnet_lite/netencoding.mli new file mode 100644 index 0000000..0969e7d --- /dev/null +++ b/ocamlnet_lite/netencoding.mli @@ -0,0 +1,137 @@ +(* *********************************************************************) +(* HTMLization *) +(* *********************************************************************) + +(* THREAD-SAFETY: + * The Html functions are thread-safe. + *) + +module Url : sig +(** Encoding/Decoding within URLs: + * + * The following two functions perform the '%'-substitution for + * characters that may otherwise be interpreted as metacharacters. + * + * According to: RFC 1738, RFC 1630 + * + * Option [plus]: This option has been added because there are some + * implementations that do not map ' ' to '+', for example Javascript's + * [escape] function. The default is [true] because this is the RFC- + * compliant definition. + *) + +(** There are no tstring and polymorphic versions of the encode and + decode functions, as URLs are comparatively short, and it is + considered as acceptable for the user to convert types as needed, + even if strings need to be copied for that. +*) + + val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string + (** Option [plus]: Whether '+' is converted to space. The default + * is true. If false, '+' is returned as it is. + * + * The optional arguments [pos] and [len] may restrict the string + * to process to this substring. + *) + + val encode : ?plus:bool -> string -> string + (** Option [plus]: Whether spaces are converted to '+'. The default + * is true. If false, spaces are converted to "%20", and + * only %xx sequences are produced. + *) + + val dest_url_encoded_parameters : string -> (string * string) list + (** The argument is the URL-encoded parameter string. The result is + * the corresponding list of (name,value) pairs. + * Note: Whitespace within the parameter string is ignored. + * If there is a format error, the function fails. + *) +end + +module Html : sig + (** Encodes characters that need protection by converting them to + * entity references. E.g. ["<"] is converted to ["<"]. + * As the entities may be named, there is a dependency on the character + * set. + *) + + val encode : + in_enc:Netconversion.encoding -> + ?out_enc:Netconversion.encoding -> + (* default: `Enc_usascii *) + ?prefer_name:bool -> + (* default: true *) + ?unsafe_chars:string -> + (* default: unsafe_chars_html4 *) + unit -> + string -> + string + (** The input string that is encoded as [in_enc] is recoded to + * [out_enc], and the following characters are encoded as HTML + * entity ([&name;] or [num;]): + * - The ASCII characters contained in [unsafe_chars] + * - The characters that cannot be represented in [out_enc]. By + * default ([out_enc=`Enc_usascii]), only ASCII characters can be + * represented, and thus all code points >= 128 are encoded as + * HTML entities. If you pass [out_enc=`Enc_utf8], all characters + * can be represented. + * + * For example, the string ["(ad)"] is encoded as + * ["(a<b) & (c>d)"]. + * + * It is required that [out_enc] is an ASCII-compatible encoding. + * + * The option [prefer_name] selects whether named entities (e.g. [<]) + * or numeric entities (e.g. [<]) are prefered. + * + * The efficiency of the function can be improved when the same encoding + * is applied to several strings. Create a specialized encoding function + * by passing all arguments up to the unit argument, and apply this + * function several times. For example: + * {[ + * let my_enc = encode ~in_enc:`Enc_utf8 () in + * let s1' = my_enc s1 in + * let s2' = my_enc s2 in ... + * ]} + *) + + type entity_set = [ `Html | `Xml | `Empty ] + + val decode : + in_enc:Netconversion.encoding -> + out_enc:Netconversion.encoding -> + ?lookup:(string -> string) -> + (* default: see below *) + ?subst:(int -> string) -> + (* default: see below *) + ?entity_base:entity_set -> + (* default: `Html *) + unit -> + string -> + string + (** The input string is recoded from [in_enc] to [out_enc], and HTML + * entities ([&name;] or [num;]) are resolved. The input encoding + * [in_enc] must be ASCII-compatible. + * + * By default, the function knows all entities defined for HTML 4 (this + * can be changed using [entity_base], see below). If other + * entities occur, the function [lookup] is called and the name of + * the entity is passed as input string to the function. It is + * expected that [lookup] returns the value of the entity, and that this + * value is already encoded as [out_enc]. + * By default, [lookup] raises a [Failure] exception. + * + * If a character cannot be represented in the output encoding, + * the function [subst] is called. [subst] must return a substitute + * string for the character. + * By default, [subst] raises a [Failure] exception. + * + * The option [entity_base] determines which set of entities are + * considered as the known entities that can be decoded without + * help by the [lookup] function: [`Html] selects all entities defined + * for HTML 4, [`Xml] selects only [<], [>], [&], ["], + * and ['], + * and [`Empty] selects the empty set (i.e. [lookup] is always called). + *) + +end diff --git a/ocamlnet_lite/netstring_str.ml b/ocamlnet_lite/netstring_str.ml new file mode 100644 index 0000000..79b275f --- /dev/null +++ b/ocamlnet_lite/netstring_str.ml @@ -0,0 +1,356 @@ +open ExtLib + +let explode s = + let l = String.length s in + let rec loop k = if k < l then s.[k] :: loop (k + 1) else [] in + loop 0 + +let implode l = + let n = List.length l in + let s = Bytes.create n in + let k = ref 0 in + List.iter + (fun c -> + Bytes.set s !k c; + incr k) + l; + Bytes.to_string s + +let quote_set s = + let l = explode s in + let have_circum = List.mem '^' l in + let have_minus = List.mem '-' l in + let have_rbracket = List.mem ']' l in + let l1 = List.filter (fun c -> c <> '^' && c <> '-' && c <> ']') l in + let l2 = if have_rbracket then ']' :: l1 else l1 in + let l3 = if have_circum then l2 @ [ '^' ] else l2 in + let l4 = if have_minus then l3 @ [ '-' ] else l3 in + let s4 = implode l4 in + match s4 with + | "" -> failwith "Netstring_str.quote_set: empty" + | "^" -> "^" + | "^-" -> "[-^]" + | _ -> "[" ^ s4 ^ "]" + +type setatom = Schar of char | Srange of (char * char) +and set = setatom list + +type re_term = + | Texact of string (* literal characters (except NUL) *) + | Tnullchar (* NUL characer *) + | Tany (* . but no newline *) + | Tnull (* emptiness *) + | Tconcat of re_term list + | Tstar of re_term (* x* *) + | Tplus of re_term (* x+ *) + | Toption of re_term (* x? *) + | Tset of set (* [...] *) + | Tnegset of set (* [^...] *) + | Tbegline (* ^ *) + | Tendline (* $ *) + | Talt of re_term list (* x\|y *) + | Tgroup of (int * re_term) (* \(...\) *) + | Trefer of int (* \i *) + | Twordbound (* \b *) + +(**********************************************************************) +(* Final types *) + +type regexp = Pcre.regexp +type split_result = Str.split_result = Text of string | Delim of string +type result = Pcre.substrings + +(**********************************************************************) +(* Parse Str-style regexps, and convert to Pcre-style regexps *) + +let scan_str_regexp re_string = + let l = String.length re_string in + let k = ref (-1) in + let c = ref ' ' in + let esc = ref false in + let group = ref 1 in + let n_open_groups = ref 0 in + let closed_groups = Array.create 10 false in + + let next () = + incr k; + if !k < l then + let c1 = re_string.[!k] in + if c1 = '\\' then + if !k < l then ( + incr k; + c := re_string.[!k]; + esc := true) + else failwith "Web.Url.Netstring_str regexp: bad backslash" + else ( + esc := false; + c := c1) + in + + let next_noesc () = + incr k; + if !k < l then ( + c := re_string.[!k]; + esc := false) + in + + let rec scan_alternative () = + let t1 = scan_concatenation () in + if !k < l then + if !esc && !c = '|' then ( + next (); + match scan_alternative () with + | Talt alist -> Talt (t1 :: alist) + | t -> Talt [ t1; t ]) + else t1 + else t1 + and scan_concatenation () = + let t1 = scan_repetition () in + if t1 = Tnull then t1 + else + let t2 = scan_concatenation () in + match t2 with + | Tnull -> t1 + | Texact s2 -> ( + match t1 with + | Texact s1 -> Texact (s1 ^ s2) + | _ -> Tconcat [ t1; t2 ]) + | Tconcat clist -> Tconcat (t1 :: clist) + | _ -> Tconcat [ t1; t2 ] + and scan_repetition () = + let t1 = ref (scan_literal_or_group ()) in + let continue = ref true in + while !continue do + if !k < l && not !esc then + match !c with + | '*' -> + next (); + t1 := Tstar !t1 + | '+' -> + next (); + t1 := Tplus !t1 + | '?' -> + next (); + t1 := Toption !t1 + (* {...} is not implemented in Str *) + | _ -> continue := false + else continue := false + done; + !t1 + and scan_literal_or_group () = + if !k >= l then Tnull + else if !esc then ( + match !c with + | '(' -> + next (); + let n = !group in + incr group; + incr n_open_groups; + let t = scan_alternative () in + decr n_open_groups; + if !k < l && !esc && !c = ')' then ( + next (); + closed_groups.(n) <- true; + Tgroup (n, t)) + else failwith "regexp: closing paranthesis \\) not found" + | '1' .. '9' -> + let n = Char.code !c - Char.code '0' in + if closed_groups.(n) then ( + next (); + Trefer n) + else failwith "regexp: bad reference to group" + (* + | 'w' -> next(); Twordchar + | 'W' -> next(); Tnowordchar + *) + | 'b' -> + next (); + Twordbound + (* + | 'B' -> next(); Tnowordbound + | '<' -> next(); Twordbeg + | '>' -> next(); Twordend + | '`' -> next(); Tbegbuf + | '\'' -> next(); Tendbuf + *) + | '\\' -> + next (); + Texact (String.make 1 '\\') + | '|' -> Tnull + | ')' -> + if !n_open_groups > 0 then Tnull + else failwith "regexp: unmatched closing parenthesis" + | ch -> + next (); + Texact (String.make 1 ch)) + else + match !c with + | '*' -> Tnull + | '+' -> Tnull + | '?' -> Tnull + | '{' -> Tnull + | '^' -> + next (); + Tbegline + | '$' -> + next (); + Tendline + | '.' -> + next (); + Tany + | '\000' -> + next (); + Tnullchar + | '[' -> + next_noesc (); + if !k < l then ( + let negated = ref false in + let set = ref [] in + + let add_char c = set := Schar c :: !set in + + let add_range c1 c2 = set := Srange (c1, c2) :: !set in + + if !c = '^' then ( + next_noesc (); + negated := true); + + let continue = ref true in + let first = ref true in + + (* the character after [ or [^ ? *) + while !continue && !k < l do + match () with + | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' -> + failwith + "regexp: Character classes such as [[:digit:]] not \ + implemented" + (* TODO: check for predefined sets *) + | () when !c = ']' && not !first -> + next (); + continue := false + | () + when !k + 2 < l + && re_string.[!k + 1] = '-' + && re_string.[!k + 2] <> ']' -> + (* range *) + add_range !c re_string.[!k + 2]; + next_noesc (); + next_noesc (); + next_noesc (); + first := false + | () -> + add_char !c; + next_noesc (); + first := false + done; + + if !continue then failwith "regexp: closing bracket ] not found"; + + if !negated then Tnegset !set else Tset !set) + else failwith "regexp: closing bracket ] not found" + | ch -> + next (); + Texact (String.make 1 ch) + in + + try + next (); + scan_alternative () + with Failure msg -> failwith (msg ^ " - regexp: " ^ re_string) + +let pcre_safe_quote c = + (* for print_set *) + match c with + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> String.make 1 c + | '\000' -> "\\000" + | _ -> "\\" ^ String.make 1 c + +let rec print_pcre_regexp ret = + match ret with + | Texact s -> Pcre.quote s + | Tnullchar -> + (* Pcre.quote "\000" returns nonsense *) + "[\\000]" + | Tany -> "." + | Tnull -> "(?:)" + | Tconcat l -> String.concat "" (List.map print_pcre_regexp l) + | Tstar ret' -> print_pcre_subregexp ret' ^ "*" + | Tplus ret' -> print_pcre_subregexp ret' ^ "+" + | Toption ret' -> print_pcre_subregexp ret' ^ "?" + | Tset s -> "[" ^ print_set s ^ "]" + | Tnegset s -> "[^" ^ print_set s ^ "]" + | Talt l -> String.concat "|" (List.map print_pcre_subregexp l) + | Tgroup (_, ret') -> "(" ^ print_pcre_regexp ret' ^ ")" + | Trefer n -> + (* Put parentheses around \n to disambiguate from \nn *) + "(?:\\" ^ string_of_int n ^ ")" + | Tbegline -> "^" + | Tendline -> "(?:$)" + | Twordbound -> "\\b" + +and print_pcre_subregexp ret = + (* Print ret, but put parentheses around ret *) + match ret with + | Tset _ | Tnegset _ | Tgroup (_, _) -> + (* No additional parentheses needed *) + print_pcre_regexp ret + | _ -> + (* Print (?:ret). This is the "neutral" form of grouping that only + * changes precedence + *) + "(?:" ^ print_pcre_regexp ret ^ ")" + +and print_set s = + String.concat "" + (List.map + (function + | Schar c -> pcre_safe_quote c + | Srange (c1, c2) -> pcre_safe_quote c1 ^ "-" ^ pcre_safe_quote c2) + s) + +(**********************************************************************) +(* Emulation *) + +let regexp s = + let ret = scan_str_regexp s in + let s' = print_pcre_regexp ret in + Pcre.regexp ~flags:[ `MULTILINE ] s' + +let search_forward pat s pos = + let result = Pcre.exec ~rex:pat ~pos s in + (fst (Pcre.get_substring_ofs result 0), result) + +let matched_string result _ = + (* Unfortunately, Pcre.get_substring will not raise Not_found if there is + * no matched string. Instead, it returns "", but this value cannot be + * distinguished from an empty match. + * The workaround is to call Pcre.get_substring_ofs first. This function + * will raise Not_found if there is not any matched string. + * + * NOTE: Current versions of Pcre do return Not_found! + *) + ignore (Pcre.get_substring_ofs result 0); + Pcre.get_substring result 0 + +let match_beginning result = fst (Pcre.get_substring_ofs result 0) +let match_end result = snd (Pcre.get_substring_ofs result 0) + +let matched_group result n _ = + (* See also the comment for [matched_string] *) + if n < 0 || n >= Pcre.num_of_subs result then raise Not_found; + ignore (Pcre.get_substring_ofs result n); + Pcre.get_substring result n + +let global_substitute pat subst s = + Pcre.substitute_substrings ~rex:pat ~subst:(fun r -> subst r s) s + +let tr_split_result r = + List.map + (function + | Pcre.Text t -> Text t | Pcre.Delim d -> Delim d | _ -> assert false) + (List.filter + (function Pcre.Group (_, _) | Pcre.NoGroup -> false | _ -> true) + r) + +let full_split sep s = tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) diff --git a/ocamlnet_lite/netstring_str.mli b/ocamlnet_lite/netstring_str.mli new file mode 100644 index 0000000..19762ce --- /dev/null +++ b/ocamlnet_lite/netstring_str.mli @@ -0,0 +1,57 @@ +type regexp +(** The type of regular expressions *) + +type split_result = Str.split_result = + | Text of string + | Delim of string (** Here we keep compatibility with [Str] *) + +type result +(** The type of matching results *) + +val regexp : string -> regexp +(** Parses a regexp *) + +val search_forward : regexp -> string -> int -> int * result +(** Searches a match of the string with the regexp, starting at + * the position and in forward direction. + * Raises [Not_found] if no match could be found. + * Returns [(p,r)] when a match at position [p] is found, + * described by [r]. + *) + +val quote_set : string -> string +(** Returns a regexp (as string) that matches any of the characters in + the argument. The argument must be non-empty + *) + +val matched_string : result -> string -> string +(** Extracts the matched part from the string. The string argument + * must be the same string passed to [string_match] or the search + * functions, and the result argument must be the corresponding + * result. + *) + +val match_beginning : result -> int +(** Returns the position where the matched part begins *) + +val match_end : result -> int +(** Returns the position where the matched part ends *) + +val matched_group : result -> int -> string -> string +(** Extracts the substring the nth group matches from the whole + * string. The string argument + * must be the same string passed to [string_match] or the search + * functions, and the result argument must be the corresponding + * result. + *) + +val full_split : regexp -> string -> split_result list +(** Like [split_delim], but returns the delimiters in the result *) + +val global_substitute : + regexp -> (result -> string -> string) -> string -> string +(** [global_substitute re subst s]: Applies the substitution function + * [subst] to all matchings of [re] in [s], and returns the + * transformed string. [subst] is called with the current [result] + * of the match and the whole string [s]. + *) diff --git a/ocamlnet_lite/netstring_tstring.ml b/ocamlnet_lite/netstring_tstring.ml new file mode 100644 index 0000000..be4c8d9 --- /dev/null +++ b/ocamlnet_lite/netstring_tstring.ml @@ -0,0 +1,171 @@ +open Netsys_types + +type _ tstring_kind = + | String_kind : string tstring_kind + | Bytes_kind : Bytes.t tstring_kind + +type 't tstring_ops = { + kind : 't tstring_kind option; + length : 't -> int; + get : 't -> int -> char; + unsafe_get : 't -> int -> char; + unsafe_get3 : 't -> int -> int; (* get 3 chars packed into one int *) + copy : 't -> 't; + string : 't -> string; + bytes : 't -> Bytes.t; + sub : 't -> int -> int -> 't; + substring : 't -> int -> int -> string; + subbytes : 't -> int -> int -> Bytes.t; + subpoly : 'u. 'u tstring_kind -> 't -> int -> int -> 'u; + blit_to_bytes : 't -> int -> Bytes.t -> int -> int -> unit; + index_from : 't -> int -> char -> int; + index_from3 : 't -> int -> int -> char -> char -> char -> int; + rindex_from : 't -> int -> char -> int; + rindex_from3 : 't -> int -> int -> char -> char -> char -> int; +} + +type tstring_ops_box = + | Tstring_ops_box : 't tstring_kind * 't tstring_ops -> tstring_ops_box + +type tstring_box = + | Tstring_box : 't tstring_kind * 't tstring_ops * 't -> tstring_box + +type tstring_polybox = + | Tstring_polybox : 't tstring_ops * 't -> tstring_polybox +(* Warning: you cannot match on the type 't here *) + +let str_subpoly : type u. u tstring_kind -> string -> int -> int -> u = function + | String_kind -> String.sub + | Bytes_kind -> + fun s pos len -> + let b = Bytes.create len in + Bytes.blit_string s pos b 0 len; + b + +let str_index_from3 s p n c1 c2 c3 = + (* FIXME: implement in C *) + let sn = String.length s in + if n < 0 || p < 0 || p > sn - n then invalid_arg "index_from3"; + let lim = p + n in + let p = ref p in + while + !p < lim + && + let c = String.unsafe_get s !p in + c <> c1 && c <> c2 && c <> c3 + do + incr p + done; + if !p >= lim then raise Not_found; + !p + +let str_rindex_from3 s p n c1 c2 c3 = + (* FIXME: implement in C *) + let sn = String.length s in + if n < 0 || p < -1 || p >= sn || n - 1 > p then invalid_arg "rindex_from"; + let lim = p - n + 1 in + let p = ref p in + while + !p >= lim + && + let c = String.unsafe_get s !p in + c <> c1 && c <> c2 && c <> c3 + do + decr p + done; + if !p < lim then raise Not_found; + !p + +let string_ops = + { + kind = Some String_kind; + length = String.length; + get = String.get; + unsafe_get = String.unsafe_get; + unsafe_get3 = + (fun s k -> + let c0 = Char.code (String.unsafe_get s k) in + let c1 = Char.code (String.unsafe_get s (k + 1)) in + let c2 = Char.code (String.unsafe_get s (k + 2)) in + (c0 lsl 16) lor (c1 lsl 8) lor c2); + copy = (fun s -> s); + (* ... for the time being ... *) + string = (fun s -> s); + bytes = Bytes.of_string; + sub = String.sub; + substring = String.sub; + subbytes = + (fun s p l -> + let b = Bytes.create l in + Bytes.blit_string s p b 0 l; + b); + subpoly = str_subpoly; + blit_to_bytes = Bytes.blit_string; + index_from = String.index_from; + index_from3 = str_index_from3; + rindex_from = String.rindex_from; + rindex_from3 = str_rindex_from3; + } + +let bytes_index_from3 s p n c1 c2 c3 = + str_index_from3 (Bytes.unsafe_to_string s) p n c1 c2 c3 + +let bytes_rindex_from3 s p n c1 c2 c3 = + str_rindex_from3 (Bytes.unsafe_to_string s) p n c1 c2 c3 + +let bytes_subpoly : type u. u tstring_kind -> Bytes.t -> int -> int -> u = + function + | String_kind -> Bytes.sub_string + | Bytes_kind -> Bytes.sub + +let bytes_ops = + { + kind = Some Bytes_kind; + length = Bytes.length; + get = Bytes.get; + unsafe_get = Bytes.unsafe_get; + unsafe_get3 = + (fun s k -> + let c0 = Char.code (Bytes.unsafe_get s k) in + let c1 = Char.code (Bytes.unsafe_get s (k + 1)) in + let c2 = Char.code (Bytes.unsafe_get s (k + 2)) in + (c0 lsl 16) lor (c1 lsl 8) lor c2); + copy = Bytes.copy; + string = Bytes.to_string; + bytes = (fun s -> s); + sub = Bytes.sub; + substring = Bytes.sub_string; + subbytes = Bytes.sub; + subpoly = bytes_subpoly; + blit_to_bytes = Bytes.blit; + index_from = Bytes.index_from; + index_from3 = bytes_index_from3; + rindex_from = Bytes.rindex_from; + rindex_from3 = bytes_rindex_from3; + } + +let ops_of_tstring = function + | `String _ -> Tstring_ops_box (String_kind, string_ops) + | `Bytes _ -> Tstring_ops_box (Bytes_kind, bytes_ops) + +type 'a with_fun = { with_fun : 's. 's tstring_ops -> 's -> 'a } + +let with_tstring : 'a with_fun -> tstring -> 'a = + fun f -> function + | `String s -> f.with_fun string_ops s + | `Bytes s -> f.with_fun bytes_ops s + +let length_tstring ts = + with_tstring { with_fun = (fun ops s -> ops.length s) } ts + +let polymorph_string_transformation : + type s t. (string -> string) -> s tstring_ops -> t tstring_kind -> s -> t = + fun f ops out_kind s -> + let s' = f (ops.string s) in + match out_kind with + | String_kind -> s' + | Bytes_kind -> Bytes.of_string s' + +let tstring_of_tbuffer = function + | `Bytes s -> `Bytes s + | `String s -> `Bytes s diff --git a/ocamlnet_lite/netstring_tstring.mli b/ocamlnet_lite/netstring_tstring.mli new file mode 100644 index 0000000..0e4075d --- /dev/null +++ b/ocamlnet_lite/netstring_tstring.mli @@ -0,0 +1,82 @@ +(** Support module for tagged strings *) + +open Netsys_types + +(** GADT for encoding the string type (string/bytes/bigarray) *) +type _ tstring_kind = + | String_kind : string tstring_kind + | Bytes_kind : Bytes.t tstring_kind + +type 't tstring_ops = { + kind : 't tstring_kind option; + length : 't -> int; + get : 't -> int -> char; + unsafe_get : 't -> int -> char; + unsafe_get3 : 't -> int -> int; + (** get 3 chars packed into one int + (first char shifted by 16 bits, second char shifted by 8 bits, + third char unshifted) *) + copy : 't -> 't; + string : 't -> string; (** if possible this function does not make a copy *) + bytes : 't -> Bytes.t; (** if possible this function does not make a copy *) + sub : 't -> int -> int -> 't; + substring : 't -> int -> int -> string; + subbytes : 't -> int -> int -> Bytes.t; + subpoly : 'u. 'u tstring_kind -> 't -> int -> int -> 'u; + blit_to_bytes : 't -> int -> Bytes.t -> int -> int -> unit; + index_from : 't -> int -> char -> int; + index_from3 : 't -> int -> int -> char -> char -> char -> int; + (** finds any of three chars. The second int is the search radius *) + rindex_from : 't -> int -> char -> int; + rindex_from3 : 't -> int -> int -> char -> char -> char -> int; + (** finds any of three chars. The second int is the search radius *) +} +(** Operations to call on strings *) + +(** GADT for hiding the type parameter *) +type tstring_ops_box = + | Tstring_ops_box : 't tstring_kind * 't tstring_ops -> tstring_ops_box + +(** GADT for hiding the type parameter *) +type tstring_box = + | Tstring_box : 't tstring_kind * 't tstring_ops * 't -> tstring_box + +(** GADT for hiding the type parameter. Warning: This GADT does not permit you + to recover the kind of string + *) +type tstring_polybox = + | Tstring_polybox : 't tstring_ops * 't -> tstring_polybox + +val string_ops : string tstring_ops +(** Implementation of the operations for [string] *) + +val bytes_ops : Bytes.t tstring_ops +(** Implementation of the operations for [bytes] *) + +val ops_of_tstring : tstring -> tstring_ops_box +(** Create a [Tstring_ops_box] *) + +type 'a with_fun = { with_fun : 's. 's tstring_ops -> 's -> 'a } +(** A polymorphic function for strings *) + +val with_tstring : 'a with_fun -> tstring -> 'a +(** [with_tstring f ts]: Calls [f.with_fun] with the right implementation of + the [tstring_ops] argument + *) + +val length_tstring : tstring -> int +(** Get the length of a tagged string *) + +val tstring_of_tbuffer : tbuffer -> tstring +(** Get the tagged string of a tagged buffer *) + +val polymorph_string_transformation : + (string -> string) -> 's tstring_ops -> 't tstring_kind -> 's -> 't +(** [polymorph_string_transformation f ops kind s]: Converts [s] to a + string, runs [f] on this string, and converts the result to the + type demanded by [kind] + *) + +(**/**) + +val bytes_subpoly : 'u tstring_kind -> Bytes.t -> int -> int -> 'u diff --git a/ocamlnet_lite/netsys_types.ml b/ocamlnet_lite/netsys_types.ml new file mode 100644 index 0000000..19c76b3 --- /dev/null +++ b/ocamlnet_lite/netsys_types.ml @@ -0,0 +1,32 @@ +(* WARNING! THIS IS A COPY OF NETSYS_TYPES.MLI! *) + +(** Types for all Netsys modules *) + +(** {2 Bytes and characters} *) + +(** Remember that up to OCaml-4.01 there was only the [string] type, + and strings were mutable (although frequently used as if there were + immutable). Since OCaml-4.02 there is the immutable [string] and + the mutable [bytes] type. + + The general strategy for switching to the string/bytes scheme is + to replace [string] everywhere with [bytes], and to provide + additional functions taking strings as input or output where it + makes sense. There are exceptions, though, e.g. when the string + acts as a key in a data structure. + + The type name "string" also occurs in function names (e.g. + "get_string") and in variant names (e.g. [String_case]). As we + want to be backward compatible, we keep the old names for functions + on [bytes], and mark them as deprecated. + *) + +type tbuffer = [ `Bytes of Bytes.t | `String of Bytes.t ] + (** A tagged buffer. Note that the [`String] case is deprecated, and only + provided for backward compatibility. + *) + +type tstring = [ `Bytes of Bytes.t | `String of string ] + (** A tagged string which is considered as immutable. See also the + support module {!Netstring_tstring}. + *) diff --git a/ocamlnet_lite/netsys_types.mli b/ocamlnet_lite/netsys_types.mli new file mode 100644 index 0000000..d3faf1a --- /dev/null +++ b/ocamlnet_lite/netsys_types.mli @@ -0,0 +1,32 @@ +(* $Id$ *) + +(** Types for all Netsys modules *) + +(** {2 Bytes and characters} *) + +(** Remember that up to OCaml-4.01 there was only the [string] type, + and strings were mutable (although frequently used as if there were + immutable). Since OCaml-4.02 there is the immutable [string] and + the mutable [bytes] type. + + The general strategy for switching to the string/bytes scheme is + to replace [string] everywhere with [bytes], and to provide + additional functions taking strings as input or output where it + makes sense. There are exceptions, though, e.g. when the string + acts as a key in a data structure. + + The type name "string" also occurs in function names (e.g. + "get_string") and in variant names (e.g. [String_case]). As we + want to be backward compatible, we keep the old names for functions + on [bytes], and mark them as deprecated. + *) + +type tbuffer = [ `Bytes of Bytes.t | `String of Bytes.t ] + (** A tagged buffer. Note that the [`String] case is deprecated, and only + provided for backward compatibility. + *) + +type tstring = [ `Bytes of Bytes.t | `String of string ] + (** A tagged string which is considered as immutable. See also the + support module {!Netstring_tstring}. + *) diff --git a/test.ml b/test.ml index 615415d..6518423 100644 --- a/test.ml +++ b/test.ml @@ -1,6 +1,7 @@ open OUnit open Printf open ExtLib +open Ocamlnet_lite module U = ExtUnix.Specific @@ -514,6 +515,22 @@ let () = test "bit_struct_list" @@ fun () -> t [ 0; 1; 2; 3; 4; 5; 6; 7 ]; () +let () = test "Web.htmldecode" @@ fun () -> + assert_equal (Web.htmldecode "A <p> tag & a <div> tag.") "A
tag & a
tag & a